import loompy
from collections import Counter
# Inspect the loom file before tokenizing. Nothing is written here, so the
# file is opened read-only instead of loompy's default read/write mode.
with loompy.connect('../loomdata/result.loom', mode='r') as ds:
    # Matrix dimensions: rows are genes, columns are cells.
    print(ds.shape)
    # Row (gene) attributes; the tokenizer requires "ensembl_id".
    print(ds.ra.keys())
    # Column (cell) attributes; the tokenizer requires "n_counts".
    print(ds.ca.keys())
    # Values of every gene for the first cell (column 0).
    print(ds[:, 0])
(20059, 247048) ['ensembl_id', 'var_names'] ['id', 'infercnv', 'major.type', 'nCount_RNA', 'nCounts', 'nFeature_RNA', 'n_counts', 'obs_names', 'organ_major', 'orig.ident', 'percent.mt', 'rename_sample', 'subcluster', 'tumor_non_tumor', 'type'] [ 0. 0. 0. ... 14. 1. 57.]
from geneformer import TranscriptomeTokenizer
Pass every cell metadata attribute that you want to carry over into the tokenized dataset
to the TranscriptomeTokenizer constructor.
tokenize_data(input_loom_data_dir, output_data_dir, output_file_prefix)
Note: the output_data_dir folder must already exist before tokenize_data
is called.
Input data:
Required format: raw counts scRNAseq data without feature selection as .loom file
Required row (gene) attribute: "ensembl_id"; Ensembl ID for each gene
Required col (cell) attribute: "n_counts"; total read counts in that cell
Optional col (cell) attribute: "filter_pass"; binary indicator of whether cell should be tokenized based on user-defined filtering criteria
Optional col (cell) attributes: any other cell metadata can be passed on to the tokenized dataset as a custom attribute dictionary as shown below
# Cell metadata columns to carry into the tokenized dataset; each one is
# mapped to itself so it keeps its original name in the output.
tk = TranscriptomeTokenizer(
    {
        attr: attr
        for attr in ("major.type", "subcluster", "type", "orig.ident", "organ_major")
    },
    nproc=20,
)
# Arguments are (input loom directory, output directory, output file prefix);
# the output directory must already exist.
tk.tokenize_data(
    "../loomdata",
    "../output",
    "scRNA_247048_20230503",
)
Tokenizing ../loomdata/result.loom ../loomdata/result.loom has no column attribute 'filter_pass'; tokenizing all cells.
Map (num_proc=20): 0%| | 0/247048 [00:00<?, ? examples/s]
Map (num_proc=20): 0%| | 0/247048 [00:00<?, ? examples/s]
Saving the dataset (0/2 shards): 0%| | 0/247048 [00:00<?, ? examples/s]
from datasets import load_from_disk
# Reload the dataset that the tokenizer saved and confirm what type of
# object comes back.
tokenized_dir = "../output/scRNA_247048_20230503.dataset/"
token_dataset = load_from_disk(tokenized_dir)
print(type(token_dataset))
<class 'datasets.arrow_dataset.Dataset'>
# Overall summary: feature names and number of rows.
print(token_dataset)

# (rows, columns) of the tokenized dataset.
dataset_shape = token_dataset.shape
print(dataset_shape)

# First tokenized cell: its input_ids plus the metadata carried over.
first_cell = token_dataset[0]
print(first_cell)
Dataset({ features: ['input_ids', 'major.type', 'subcluster', 'type', 'orig.ident', 'organ_major', 'length'], num_rows: 247048 }) (247048, 7) {'input_ids': [6196, 6198, 10068, 16018, 3784, 19629, 17269, 13341, 6572, 9794, 13012, 9567, 7532, 17585, 5815, 2050, 2975, 19899, 16436, 1720, 9101, 1480, 5965, 14087, 488, 7201, 4919, 16909, 17536, 13771, 2447, 10744, 3242, 396, 15627, 457, 2951, 695, 589, 7188, 10862, 4214, 6213, 12727, 17337, 4761, 7535, 4416, 15254, 13380, 15113, 8308, 7642, 466, 9244, 16634, 4698, 10256, 255, 1666, 14408, 11358, 6238, 11904, 14014, 10494, 13826, 11467, 3439, 9470, 11082, 18953, 13267, 17297, 10054, 3070, 6179, 6764, 12920, 4883, 11355, 1685, 3735, 12400, 5951, 7029, 13468, 14918, 6000, 1821, 2348, 690, 7768, 449, 11087, 5562, 3348, 4961, 16292, 555, 1584, 7754, 10413, 20393, 12779, 19968, 17865, 13059, 1142, 7461, 1557, 7975, 526, 5855, 11826, 33, 2991, 10989, 4282, 2992, 14291, 5361, 6317, 9584, 5324, 5492, 9235, 18574, 7437, 3876, 969, 23, 10600, 14289, 7184, 17303, 11139, 12172, 15177, 10509, 3560, 10689, 10877, 76, 4968, 438, 9288, 6090, 20317, 12051, 13785, 11185, 5118, 4469, 1146, 6654, 7058, 13964, 13248, 15790, 10245, 13129, 7315, 4754, 1489, 5736, 2383, 9702, 7203, 17886, 15572, 3669, 10947, 12415, 6487, 9225, 6971, 13381, 15363, 9732, 17025, 229, 13644, 17576, 1747, 3852, 11296, 5896, 13510, 20501, 10268, 1203, 5394, 5485, 7874, 20572, 5569, 17678, 1456, 9599, 8454, 14844, 19925, 3588, 6809, 16154, 14225, 8160, 17205, 5826, 12527, 10819, 12681, 11022, 17012, 8590, 5285, 4386, 1003, 5448, 6525, 9317, 9816, 6847, 955, 11842, 4401, 9300, 280, 7457, 6370, 17333, 11808, 14496, 1627, 454, 20590, 10384, 9275, 915, 7622, 7459, 12216, 1394, 8959, 16165, 6060, 12572, 14703, 5411, 4541, 2189, 9500, 2909, 12255, 7000, 2522, 7819, 3350, 4402, 1684, 19745, 17000, 7129, 3758, 5074, 7286, 4576, 16227, 2908, 7626, 3332, 428, 1036, 7556, 10402, 8867, 8289, 8495, 9504, 8754, 6221, 9646, 6391, 4240, 111, 5538, 8085, 9, 2546, 10674, 3153, 3528, 
8413, 12403, 648, 6382, 11124, 20505, 14542, 7207, 10273, 17701, 4340, 8098, 9955, 8752, 2074, 14274, 7380, 5081, 9822, 919, 2364, 16181, 1837, 6677, 4703, 13272, 4211, 800, 12751, 7688, 6955, 2353, 3772, 2825, 7299, 17110, 11441, 1722, 1049, 12419, 14972, 750, 20269, 10651, 7061, 10867, 3101, 18821, 1071, 3728, 11496, 2368, 15507, 16617, 2287, 3605, 12900, 15434, 272, 3957, 20436, 2387, 8792, 4874, 5666, 3324, 3216, 12576, 10837, 17208, 15635, 2249, 10949, 10575, 13177, 425, 10459, 92, 7363, 4326, 12699, 19999, 8975, 3267, 15640, 5356, 443, 2250, 3573, 7500, 6479, 746, 3453, 10243, 20080, 658, 3330, 17575, 1137, 2068, 9558, 20858, 6134, 3915, 13483, 8004, 13900, 6129, 13814, 548, 7125, 17888, 13227, 4847, 16211, 14787, 8175, 11670, 13443, 5939, 710, 5053, 3954, 7641, 7827, 10225, 16411, 16844, 11986, 9913, 2953, 23941, 3369, 6961, 861, 6044, 13296, 5246, 10495, 14667, 7248, 4563, 1883, 17154, 3736, 16239, 13878, 18505, 6927, 2821, 12760, 20799, 148, 309, 2993, 13614, 7619, 14769, 17578, 1459, 4494, 17295, 8646, 3864, 14832, 2784, 14341, 12678, 2275, 833, 10281, 13076, 13238, 18369, 13801, 2770, 4794, 801, 4966, 3026, 2104, 10201, 2818, 2238, 6909, 350, 13983, 5354, 20031, 14348, 825, 19926, 1533, 9008, 20340, 4565, 7046, 5157, 17228, 15619, 20515, 12138, 12016, 5625, 5133, 3583, 703, 4305, 13352, 1870, 8673, 5863, 12262, 9789, 16470, 10107, 12409, 4518, 13487, 4127, 3875, 9936, 1455, 3937, 9531, 4811, 15602, 6252, 776, 13781, 14328, 4891, 4201, 17654, 748, 12268, 11860, 2244, 2509, 16441, 787, 6848, 379, 10247, 15300, 14047, 23100, 3866, 12688, 6390, 6634, 9129, 749, 1319, 14716, 3612, 7962, 3323, 11878, 1060, 4354, 868, 16646, 14131, 14436, 1228, 1087, 4841, 10546, 4619, 14468, 6652, 2758, 1032, 7235, 4744, 11495, 4269, 4159, 6532, 1751, 14782, 4567, 4658, 13982, 992, 5993, 13292, 13744, 2675, 17463, 10390, 9119, 2350, 1935, 14020, 6481, 3082, 15369, 12969, 13817, 10103, 17326, 3527, 1952, 6484, 7958, 8604, 4655, 512, 7548, 14838, 14137, 4789, 5866, 18591, 1443, 
2100, 4503, 13855, 3950, 8907, 6787, 12318, 8674, 4377, 1833, 9896, 17905, 9808, 10449, 11436, 5414, 11406, 106, 15243, 13758, 13016, 3991, 5704, 15069, 13615, 3315, 2418, 7699, 8008, 6415, 4475, 5138, 12759, 3336, 3705, 3185, 7695, 1026, 3111, 8462, 11785, 11196, 9298, 5873, 1965, 2870, 4609, 6319, 30, 3516, 11177, 13477, 4702, 8125, 6729, 11434, 269, 1997, 4824, 3451, 13257, 11114, 651, 1967, 17314, 11902, 10010, 17236, 20350, 8274, 10804, 10312, 21070, 19468, 10401, 9057, 12217, 1532, 4065, 11587, 137, 253, 2170, 8295, 10569, 7662, 7647, 1311, 46, 12672, 7279, 7477, 4499, 17238, 116, 6753, 8094, 10886, 11844, 847, 1563, 19475, 11539, 3048, 1922, 6681, 16348, 11676, 5525, 8117, 11780, 4360, 1007, 9280, 2061, 4539, 12529, 7646, 4716, 12091, 12601, 9215, 7863, 10976, 1256, 4252, 2669, 19848, 4547, 9073, 4406, 20422, 4914, 4954, 16970, 14199, 10601, 2856, 1925, 4044, 1708, 15970, 4392, 4479, 7259, 3819, 11924, 1148, 5613, 10512, 7143, 2336, 17911, 4038, 219, 869, 5304, 13406, 4189, 1606, 4732, 7606, 8256, 6519, 14210, 7587, 19823, 321, 936, 14569, 7826, 6001, 13007, 6206, 1903, 4344, 1237, 13170, 12410, 10336, 6309, 17260, 5015, 7262, 6138, 15868, 4880, 16708, 5037, 16481, 5813, 5575, 8306, 11551, 9423, 2367, 1513, 3854, 8288, 10754, 10687, 16913, 12175, 3487, 5844, 8312, 5839, 523, 2560, 13466, 8205, 263, 20885, 5785, 3795, 8939, 11270, 4488, 4104, 7009, 698, 9512, 11048, 1806, 15863, 9945, 8225, 14037, 8066, 17715, 12282, 13173, 9352, 721, 7157, 12170, 12814, 8090, 2416, 6564, 15452, 8883, 2411, 43, 14205, 15415, 17200, 7041, 16692, 477, 13025, 6427, 5600, 14694, 16926, 7762, 4244, 8231, 13394, 8828, 5670, 17231, 7908, 6930, 18949, 13062, 10199, 13620, 7150, 4491, 21200, 14053, 11032, 3130, 16602, 11791, 1386, 3946, 2848, 20395, 11038, 4839, 3325, 8659, 11228, 2952, 2928, 3139, 13927, 2636, 6299, 9609, 13278, 4562, 11606, 13486, 8118, 16734, 17304, 14718, 8803, 15164, 9116, 17201, 13058, 997, 6150, 9971, 2113, 470, 8697, 14180, 8355, 3236, 13193, 16881, 8408, 
11781, 5048, 17632, 14860, 10476, 7555, 6207, 3919, 1379, 5243, 16737, 3245, 8737, 4594, 431, 13061, 11408, 7063, 5689, 1893, 3012, 17660, 8860, 3155, 15641, 3824, 6439, 14679, 934, 7668, 17141, 8009, 7088, 666, 9642, 8666, 11948, 12080, 11457, 15949, 14552, 7111, 21358, 10526, 18653, 4291, 7117, 9094, 8943, 6621, 9198, 8236, 12078, 1293, 3039, 15651, 4902, 6331, 1415, 1079, 11708, 9916, 10755, 13933, 12768, 2799, 1742, 11636, 16005, 9842, 722, 8192, 5069, 12219, 1717, 13770, 16630, 1948, 3564, 10367, 1129, 3217, 13216, 3072, 271, 3810, 3331, 11851, 2114, 6058, 5326, 3196, 13852, 3911, 11318, 11428, 15567, 5400, 9024, 3270, 581, 18579, 20613, 13503, 40, 15699, 6268, 4343, 22562, 7495, 4253, 14327, 16527, 4393, 10018, 920, 7521, 8260, 1167, 7502, 2882, 13690, 7292, 6130, 17557, 6431, 10210, 10282, 12243, 17933, 1803, 20760, 18630, 4888, 9809, 16406, 17310, 9150, 12035, 11854, 15118, 4901, 8801, 9626, 933, 5822, 1911, 2091, 12515, 13064, 17247, 2258, 1346, 6016, 17161, 2867, 11828, 17984, 2436, 18034, 2257, 17147, 6730, 11376, 1120, 13527, 4151, 2797, 4333, 15425, 1828, 13570, 12026, 3013, 21050, 3709, 7330, 12044, 11640, 9572, 1626, 4734, 1878, 9089, 13024, 9028, 4170, 5892, 6703, 14357, 3552, 11745, 3428, 18581, 48, 1273, 2183, 14179, 5885, 9667, 13042, 12812, 9734, 2171, 14372, 4372, 6844, 12813, 9975, 16264, 5719, 190, 20316, 12671, 5319, 12017, 18789, 12164, 15890, 11088, 3288, 15728, 11543, 16621, 5184, 711, 9417, 12949, 2402, 16727, 16578, 8140, 9029, 4637, 590, 4749, 10456, 2489, 5267, 13909, 4096, 9210, 4907, 4592, 12750, 2502, 20618, 3882, 6459, 362, 4827, 3773, 4329, 5150, 17279, 15334, 12710, 1930, 1353, 13976, 3483, 13331, 9025, 2608, 5030, 14632, 8446, 12615, 4944, 17282, 9388, 20357, 2549, 5340, 10657, 9343, 3958, 19884, 1785, 8842, 12884, 4684, 4738, 538, 115, 725, 1769, 4276, 20281, 6776, 3654, 17571, 7317, 2783, 2780, 9538, 8432, 10634, 17567, 3741, 342, 10525, 5801, 10691, 9020, 12453, 13774, 4724, 5096, 7784, 3135, 6447, 13787, 11983, 10286, 9299, 
13583, 9629, 5019, 970, 14163, 12565, 6938, 3589, 2359, 7329, 15211, 2289, 4929, 14132, 15005, 2707, 9081, 9979, 708, 701, 7889, 2796, 12557, 3539, 9976, 291, 12159, 2027, 16287, 5531, 588, 17940, 14192, 9575, 12166, 12828, 1457, 3594, 8104, 8428, 2920, 1334, 6794, 12375, 13157, 14491, 10130, 8010, 14577, 9968, 14732, 1845, 2500, 6176, 6217, 14155, 15707, 12094, 13139, 6791, 4612, 2822, 39, 9004, 14299, 10776, 18643, 9375, 7992, 8276, 4691, 3165, 11090, 10528, 16075, 8919, 6466, 5493, 7163, 9940, 18589, 4088, 9852, 16319, 3370, 7953, 2861, 11754, 2197, 12610, 7844, 8357, 423, 6419, 8354, 7706, 3704, 7984, 5779, 13322, 3607, 8934, 12742, 2337, 2527, 11221, 17769, 22, 14428, 15102, 1021, 11113, 3076, 12537, 10677, 11047, 3073, 2019, 16379, 11650, 12652, 9387, 2103, 6981, 6580, 5856, 10428, 3482, 856, 7917, 12015, 7055, 1649, 7020, 8880, 7930, 8961, 8798, 7886, 8482, 16255, 4667, 3193, 17544, 3050, 1580, 17097, 7280, 1863, 12357, 1568, 3010, 1131, 13711, 15446, 15395, 7440, 15426, 3074, 11992, 3507, 14373, 1212, 2233, 1734, 16719, 3720, 12530, 10072, 8152, 16035, 12943, 8373, 14785, 17079, 9369, 8345, 1018, 8285, 977, 5651, 100, 938, 13869, 2812, 6501, 3179, 12388, 9117, 1994, 10729, 2219, 12333, 10353, 14343, 17842, 12723, 12799, 2837, 17692, 5959, 18658, 8363, 3559, 1250, 1166, 13647, 11395, 12886, 13165, 2426, 679, 313, 8059, 2937, 3174, 9811, 12076, 9452, 12697, 11, 4886, 4086, 6549, 7618, 17053, 7110, 5290, 7171, 3502, 170, 346, 820, 7528, 11234, 5113, 7743, 13506, 7151, 4076, 11169, 3296, 5661, 5429, 3899, 7149, 4049, 7373, 14833, 7781, 12245, 5659, 19675, 2763, 13574, 3533, 1162, 5793, 17266, 2470, 4203, 8001, 15588, 1880, 1869, 7818, 12561, 1790, 91, 2749, 8146, 2152, 177, 14618, 1620, 16619, 14302, 5806, 17916, 4458, 6113, 6832, 6271, 2281, 15629, 11470, 1388, 13212, 8825, 7570, 5093, 9043, 15937, 15685, 1613, 9368, 13127, 2501, 8249, 10966, 5615, 18573, 2254, 1701, 7857, 4119, 2240, 7147, 16741, 3930, 6131, 15837, 5731, 10953, 3282, 8283, 5587, 2850, 6162, 
13926, 3424, 10440, 15564, 12583, 18558, 16419, 12499, 1668, 5212, 4517, 11895, 8696, 6219, 12250, 12820, 3163, 3687, 13760, 9296, 4825, 3251, 20337, 9131, 3009, 1854, 4774, 8005, 2896, 9938, 8425, 10246, 1598, 2239, 14494, 5665, 4741, 9790, 971, 8656, 5932, 10460, 8374, 13825, 10593, 3699, 7489, 8743, 6069, 11326, 1835, 910, 12469, 3000, 7993, 5946, 3549, 1389, 3462, 4607, 11118, 10077, 5583, 998, 5180, 7939, 5496, 12046, 16352, 7552, 8456, 327, 7340, 4288, 9193, 1073, 7090, 2296, 14993, 14452, 929, 16567, 14342, 3149, 5387, 16313, 1383, 11706, 14140, 8328, 5368, 13756, 9490, 6708, 8391, 4275, 1136, 1054, 5282, 9746, 12025, 3800, 17112, 9950, 4903, 13311, 10029, 12382, 13738, 6528, 4845, 7896, 2629, 9901, 5073, 3047, 9321, 12632, 8386, 10965, 20331, 6032, 5348, 6758, 6805, 1889, 10362, 12277, 17211, 11062, 16203, 2331, 7326, 17616, 4711, 14968, 8940, 4019, 12500, 10801, 18510, 12775, 10545, 1630, 2184, 4357, 628, 4473, 3652, 2343, 17762, 2668, 4904, 17651, 2261, 10223, 8965, 18685, 17181, 14809, 12215, 6581, 8320, 1609, 11522, 2745, 6943, 6675, 7072, 8097, 2349, 1551, 5313, 2761, 6346, 20819, 16015, 12863, 1968, 3989, 18371, 9151, 15597, 8129, 1805, 2388, 6139, 1156, 6072, 15614, 4573, 9472, 1999, 8423, 1881, 5213, 11548, 8543, 5470, 5116, 16793, 7907, 2556, 104, 15492, 4237, 3746, 5716, 6295, 5163, 11947, 7685, 13818, 15180, 1909, 1192, 14517, 16707, 8689, 12244, 417, 10320, 3001, 5915, 13395, 1678, 8378, 6141, 2179, 16219, 3062, 6907, 13088, 4133, 8410, 18575, 16474, 672, 361, 10341, 5954, 7661, 10262, 13384, 5065, 1979, 11549, 11451, 2325, 16288, 549, 13681, 13294, 14670, 10752, 5261, 13849, 8664, 5307, 793, 14185, 3525, 5333, 11897, 10704, 6560, 4444, 5961, 11284, 10, 6600, 6863, 17071, 1478, 16611, 4337, 6205, 12437, 8332, 12113, 10357, 8487, 10001, 3992, 15405, 7445, 332, 20469, 5900, 10894, 12253, 15777, 4398, 13472, 9260, 1667, 4388, 6877, 15845, 8953, 3781, 7964, 4270, 7216, 1251, 2741, 12605, 3893, 3947, 5332, 2518, 6189, 7410, 14150, 7596, 2738, 5119, 
11001, 5392, 10711, 10670, 2144, 17268, 1124, 1159, 4047, 6383, 8091, 11740, 5930, 233, 7298, 4942, 1836, 5352, 10957, 14275, 6332, 13611, 836, 5036, 6510, 2314, 7183, 7026, 2248, 7238, 8165, 2942, 11908, 11965, 7173, 9332, 16431, 3478, 2587, 5120, 7588, 18372, 6489, 20488, 2137, 2089, 12398, 11646, 13915, 16944, 9108, 8572, 10474, 8952, 9167, 9912, 12838, 2035, 2498, 752, 9793, 3292, 1933, 262, 1654, 16850, 3505, 4478, 4311, 2876, 19437, 9243, 13628, 11203, 2583, 4067, 17693, 11755, 7112, 3209, 6124, 8284, 3262, 7370, 2878, 4350, 957, 421, 6070, 896, 10022, 8914, 3093, 7589, 7413, 543, 7263, 9395, 3377, 7544, 6746, 14260, 11794, 12772, 12167, 6074, 3726, 7693, 14646, 10032, 11096, 9724, 1072, 15122, 7089, 4875, 10327, 12030, 6761, 2017, 20320, 11175, 6828, 4146, 12042, 2146, 16587, 20322, 7027, 11498, 9948, 12201, 7425, 3085, 2158, 16743, 5220, 14009, 18568, 1274, 8719, 12200, 5698, 5487, 16693, 12055, 5738, 4921, 7910, 890, 79, 10498, 14290, 976, 4229, 17075, 1051, 3081, 6303, 2941, 14421, 7628, 4588, 8724, 16572, 3762, 8601, 8161, 75], 'major.type': 'myeloid', 'subcluster': 'MG_ISG15', 'type': 'WT', 'orig.ident': 'P1194.F', 'organ_major': 'A', 'length': 2048}