import anndata
adata = anndata.read_h5ad("./bioFound/Geneformer/pbmc.h5ad")
adata.var

adata.var.index.values

array(['MIR1302-10', 'FAM138A', 'OR4F5', ..., 'CU459201.1', 'AC002321.2',
       'AC002321.1'], dtype=object)

# from pyensembl import EnsemblRelease
import pandas as pd
file = pd.read_csv("gencode_cut.txt", sep ='\t', header=None)
file1 = file.set_index(0)
file1[1].index
values_to_keep=[]
for item in adata.var.index.values:
    if item in file1[1].index.values:
        values_to_keep.append(item)
# values_to_keep  20059
var_indices = [i for i, var_name in enumerate(adata.var_names) if var_name in values_to_keep]
# save corresponding data
adata_cut = adata[:, var_indices]
ensemble_list=[]
for item in adata.var.index.values:
    if item in file1[1].index.values:
        idx = list(np.where(file1[1].index.values==item))[0][0]  # find the index of item
        ensemble_list.append(file1[1].values[idx])
change_list=[]
for item in ensemble_list:
    change_list.append(item.split('.')[0])
adata_final=ad.AnnData(adata_cut.X,obs=adata_cut.obs,var=pd.DataFrame(index=pd.Index(change_list)))

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[1], line 2
      1 # from pyensembl import EnsemblRelease
----> 2 file = pd.read_csv("gencode_cut.txt", sep ='\t', header=None)
      3 file1 = file.set_index(0)
      4 file1[1].index

NameError: name 'pd' is not defined

adata_final.var['ensembl_id']=adata_final.var.index.values
adata_final.var

adata_final.obs

adata_final.obs["n_counts"]=adata_final.obs.iloc[:,1]

adata_final.write_loom("./Geneformer/loomdata/result.loom")

/scratch/PI/jgwang/mwang/anaconda3/lib/python3.11/site-packages/anndata/_io/write.py:84: UserWarning: DataFrame columns are not unique, some columns will be omitted.
  col_attrs = {k: np.array(v) for k, v in adata.obs.to_dict("list").items()}

"ENSG00000272272" in unserialized_data

False

"7SK" in adata_merge.var.index.values

True

import pandas as pd
file = pd.read_csv("gencode.txt",header=None,sep='\t')
import pickle
with open('/scratch/PI/jgwang/zhihuang/deep_learning/Generative_Model/geneformer/Geneformer/Dataset/token_dictionary.pkl', 'rb') as handle:
    unserialized_data = pickle.load(handle)
result = []
for idx in range(len(file[1])):
    value=file[1][idx].split('.')[0]
    if value in unserialized_data:
        result.append([file[0][idx], file[1][idx]])
import numpy as np       
with open("gencode_cut.txt", 'w') as file:
    for item in result:
        file.writelines(item[0]+'\t'+item[1]+'\n')

import loompy
from collections import Counter
with loompy.connect('./bioFound/Geneformer/loomdata/result.loom') as ds:
    # Add 'cell_type' and 'organ_major' to the loom file
    ds.ca['n_counts'] = adata_merge_final.obs.iloc[:,1].values
    ds.ca['organ_major']=['A']*247048
    # print(ds.ca['organ_major'])
    # print(Counter(ds.ca["organ_major"]).keys())

['A' 'A' 'A' ... 'A' 'A' 'A']
dict_keys(['A'])

adata_merge_final.obs

import anndata
adata = anndata.read_h5ad("./bioFound/Geneformer/pbmc.h5ad")
adata.var

# adata.obs["tumor_anno"].unique()  # array(['Normal', 'Tumor', 'NA'], dtype=object)
# adata.obs.columns
# adata.obs["condition"].unique() # array(['Primary', 'Recurrent'], dtype=object)
import pandas as pd, numpy as np, anndata as ad
adata = ad.read_h5ad("./Geneformer/pbmc.h5ad")
file = pd.read_csv("gencode_cut.txt", sep ='\t', header=None)
file1 = file.set_index(0)
file1[1].index
values_to_keep=[]
for item in adata.var.index.values:
    if item in file1[1].index.values:
        values_to_keep.append(item)
# np.array(values_to_keep).shape # 17341
var_indices = [i for i, var_name in enumerate(adata.var_names) if var_name in values_to_keep]
# 保留对应的数据
adata_cut = adata[:, var_indices]
ensemble_list=[]
for item in adata.var.index.values:
    if item in file1[1].index.values:
        idx = list(np.where(file1[1].index.values==item))[0][0]  # find the index of item
        ensemble_list.append(file1[1].values[idx])
change_list=[]
for item in ensemble_list:
    change_list.append(item.split('.')[0])
adata_final=ad.AnnData(adata_cut.X,obs=adata_cut.obs,var=pd.DataFrame(index=pd.Index(change_list)))
adata_final.var['ensembl_id']=adata_final.var.index.values
adata_final.obs["n_counts"]=adata_final.obs.iloc[:,1].values
adata_final.write_loom("./Geneformer/loomdata/pbmc.loom")

/mnt/disk/mwang/conda3/lib/python3.11/site-packages/anndata/_core/anndata.py:1113: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
  if not is_categorical_dtype(df_full[k]):
/mnt/disk/mwang/conda3/lib/python3.11/site-packages/loompy/bus_file.py:67: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  @jit
/mnt/disk/mwang/conda3/lib/python3.11/site-packages/loompy/bus_file.py:84: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  @jit
/mnt/disk/mwang/conda3/lib/python3.11/site-packages/loompy/bus_file.py:101: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  @jit

print(adata_final.obs["celltype_0.1"].unique())
print(adata_final.obs["celltype_0.5"].unique())

print(adata_final.obs["celltype_1"].unique())
print(adata_final.obs["celltype_2"].unique())
print(adata_final.obs["celltype_3"].unique())

['Myeloid' 'Tumor' 'Oligodendrocyte' 'Vascular' 'Neuron' 'Endothelial'
 'lymphocyte']
['Myeloid' 'Tumor' 'Oligodendrocyte' 'Vascular' 'Neuron' 'Endothelial'
 'lymphocyte']
['Myeloid' 'Tumor' 'Oligodendrocyte' 'Vascular' 'Neuron' 'Endothelial'
 'lymphocyte']
['Myeloid' 'Tumor' 'Oligodendrocyte' 'Vascular' 'Neuron' 'Endothelial'
 'lymphocyte']
['Myeloid' 'Tumor' 'Oligodendrocyte' 'Mural' 'Neuron' 'Endothelial'
 'lymphocyte']

import pandas as pd, numpy as np, anndata as ad
adata = ad.read_h5ad("./Geneformer/pbmc.h5ad")
adata.obs["orig.ident"].unique()

(58,)

import pandas as pd, numpy as np, anndata as ad
adata = ad.read_h5ad("./pbmc_minghao.h5ad")
adata

AnnData object with n_obs × n_vars = 149341 × 28223
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'Prefix', 'condition', 'percent.mt', 'orig.ident2', 'pair', 'nCount_integrated', 'nFeature_integrated', 'integrated_snn_res.1', 'seurat_clusters', 'integrated_snn_res.0.1', 'integrated_snn_res.0.5', 'celltype_0.1', 'celltype_0.5', 'celltype_1', 'MES2.like', 'MES1.like', 'AC.like', 'OPC.like', 'NPC1.like', 'NPC2.like', 'G1.S', 'G2.M', 'Astrocyte', 'Neuron', 'Oligodendrocyte', 'OPC', 'Endothelial', 'Perivascular_fibroblast', 'Pericyte', 'SMC', 'T_cell', 'TAM.MG', 'TAM.BDM', 'integrated_snn_res.2', 'celltype_2', 'integrated_snn_res.3', 'celltype_3', 'celltype_3_subtype', 'celltype_3_subtype2', 'integrated_snn_res.0.05', 'integrated_snn_res.0.08', 'celltype_3_factor', 'sum_sf', 'tumor_anno', 'HighSF_stem_enriched1', 'Tumor_stem_enriched1', 'Tumor_diff_enriched1', 'Tumor_prolif_enriched1', 'sig_diff_sf_enriched1', 'celltype_3_subtype3', 'MES.like', 'NPC.like'
    var: 'features'

	features
RP11-34P13.3	RP11-34P13.3
RP11-34P13.7	RP11-34P13.7
RP11-34P13.14	RP11-34P13.14
FO538757.3	FO538757.3
FO538757.2	FO538757.2
...	...
AC007325.4	AC007325.4
AC007325.2	AC007325.2
AL354822.1	AL354822.1
AC004556.1	AC004556.1
AC240274.1	AC240274.1

	ensembl_id
ENSG00000186092	ENSG00000186092
ENSG00000237683	ENSG00000237683
ENSG00000235249	ENSG00000235249
ENSG00000269831	ENSG00000269831
ENSG00000269308	ENSG00000269308
...	...
ENSG00000212907	ENSG00000212907
ENSG00000198886	ENSG00000198886
ENSG00000198786	ENSG00000198786
ENSG00000198695	ENSG00000198695
ENSG00000198727	ENSG00000198727

	orig.ident	nCount_RNA	nFeature_RNA	Prefix	condition	percent.mt	orig.ident2	pair	nCount_integrated	nFeature_integrated	...	sum_sf	tumor_anno	HighSF_stem_enriched1	Tumor_stem_enriched1	Tumor_diff_enriched1	Tumor_prolif_enriched1	sig_diff_sf_enriched1	celltype_3_subtype3	MES.like	NPC.like
I10_AAACCCACATGGGTTT-1	SF2501	662.0	461	GSM5319517_SF2501	Primary	0.151057	I10	10	185.0	85	...	-2.657172	Normal	-0.043302	-0.095862	-0.064256	0.044756	-0.143947	8	NaN	NaN
I10_AAACGAACAATCGTCA-1	SF2501	568.0	448	GSM5319517_SF2501	Primary	0.000000	I10	10	90.0	76	...	-2.657172	Tumor	-0.037989	-0.098335	0.063711	0.070403	-0.147895	6	NaN	NaN
I10_AAAGGTAGTCGAGTGA-1	SF2501	886.0	705	GSM5319517_SF2501	Primary	0.677201	I10	10	191.0	130	...	0.338127	Normal	-0.067565	-0.134437	-0.153833	0.031265	0.044746	8	NaN	NaN
I10_AAAGTCCCAGGGAATC-1	SF2501	952.0	710	GSM5319517_SF2501	Primary	0.210084	I10	10	233.0	128	...	-2.657172	Normal	-0.079482	-0.059993	-0.147041	-0.059460	-0.194016	8	NaN	NaN
I10_AAAGTGACAAGGCCTC-1	SF2501	459.0	376	GSM5319517_SF2501	Primary	0.000000	I10	10	53.0	45	...	1.075636	Tumor	-0.043786	-0.083700	0.127065	-0.038782	0.230097	6	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
R9_TTTGGTTGTTGGAGAC-1	SF12243	5227.0	2467	GSM5319565_SF12243	Recurrent	0.000000	R9	9	1442.0	351	...	0.080708	Normal	-0.040156	-0.151086	-0.344036	-0.044960	-0.018616	7	NaN	NaN
R9_TTTGGTTTCATGGTAC-1	SF12243	938.0	748	GSM5319565_SF12243	Recurrent	0.000000	R9	9	259.0	156	...	-0.685367	Tumor	-0.059343	0.404530	0.197543	0.011436	0.097027	2	NaN	NaN
R9_TTTGGTTTCGGTCATA-1	SF12243	3821.0	1933	GSM5319565_SF12243	Recurrent	0.026171	R9	9	1139.0	307	...	2.284892	Normal	-0.055897	-0.029113	-0.290330	-0.080831	0.039368	7	NaN	NaN
R9_TTTGTTGCAGAGTTGG-1	SF12243	4204.0	2281	GSM5319565_SF12243	Recurrent	0.023787	R9	9	1191.0	407	...	-1.679808	Tumor	-0.034292	0.050586	0.097786	0.041328	-0.195999	3	NaN	NaN
R9_TTTGTTGCATGGACAG-1	SF12243	3835.0	1926	GSM5319565_SF12243	Recurrent	0.000000	R9	9	1086.0	300	...	-1.627191	Normal	-0.041317	0.017252	-0.302003	-0.027876	-0.189377	7	NaN	NaN

	orig.ident	nCount_RNA	nFeature_RNA	percent.mt	rename_sample	type	major.type	subcluster	nCount_RNA	nFeature_RNA	id	tumor_non_tumor	infercnv	nCounts
P1194.F_AAAGAACCAAGTTTGC-1	P1194.F	10076.0	3081	7.423581	P1194.F	WT	myeloid	MG_ISG15	1000000.0	3076	CGGA_sc064	non-malignant	nonmalignant	10076.0
P1194.F_AAAGAACCACGAGGTA-1	P1194.F	1772.0	805	1.523702	P1194.F	WT	myeloid	MG_RPL31	1000000.0	805	CGGA_sc064	non-malignant	nonmalignant	1772.0
P1194.F_AAAGGATAGTTACGTC-1	P1194.F	4984.0	2143	4.133226	P1194.F	WT	myeloid	MG_IER2	1000000.0	2141	CGGA_sc064	non-malignant	nonmalignant	4984.0
P1194.F_AAAGGATGTCATCGCG-1	P1194.F	3292.0	1312	9.538275	P1194.F	WT	Tcell	CD8_IFI44L	1000000.0	1312	CGGA_sc064	non-malignant	nonmalignant	3292.0
P1194.F_AAAGGGCAGCTCCGAC-1	P1194.F	6060.0	2278	3.036304	P1194.F	WT	myeloid	MG_CCL2	1000000.0	2275	CGGA_sc064	non-malignant	nonmalignant	6060.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
P969_TTTGTTGGTATGCGTT-1	P969	9172.0	2609	7.064980	P969	WT	myeloid	MDM_SEPP1	1000000.0	2608	CGGA_sc051	non-malignant	nonmalignant	9172.0
P969_TTTGTTGGTGCATACT-1	P969	29911.0	5913	5.613320	P969	WT	tumor	tmr_CHI3L1	1000000.0	5913	CGGA_sc051	malignant	malignant	29911.0
P969_TTTGTTGGTTTAAGGA-1	P969	16849.0	4393	8.837320	P969	WT	tumor	tmr_CHI3L1	1000000.0	4393	CGGA_sc051	malignant	malignant	16849.0
P969_TTTGTTGTCAAGCTGT-1	P969	10355.0	3377	9.570256	P969	WT	tumor	tmr_DLL3	1000000.0	3377	CGGA_sc051	malignant	malignant	10355.0
P969_TTTGTTGTCCACAGGC-1	P969	4188.0	2064	7.139446	P969	WT	tumor	tmr_CHI3L1	1000000.0	2064	CGGA_sc051	malignant	malignant	4188.0

	features
RP11-34P13.3	RP11-34P13.3
RP11-34P13.7	RP11-34P13.7
RP11-34P13.14	RP11-34P13.14
FO538757.3	FO538757.3
FO538757.2	FO538757.2
...	...
AC007325.4	AC007325.4
AC007325.2	AC007325.2
AL354822.1	AL354822.1
AC004556.1	AC004556.1
AC240274.1	AC240274.1

convert data from Anndata to Loom via Anndata.write_loom ¶

See the features of our anndata¶

How to get the names of these genes?¶

load the gene book, and then compare it with our used genes to get the gene id.¶

For pbmc dataset¶

convert data from Anndata to Loom via Anndata.write_loom¶

See the features of our anndata¶

How to get the names of these genes?¶

load the gene book, and then compare it with our used genes to get the gene id.¶

For pbmc dataset¶

convert data from Anndata to Loom via Anndata.write_loom ¶