In [17]:
from geneformer import EmbExtractor
In [14]:
# initiate EmbExtractor
embex = EmbExtractor(model_type="CellClassifier",
                     num_classes=2,
                     # filter_data={"major.type": ["myeloid", "Tcell", "tumor", "oligodendrocyte"]},
                     # filter_data={"major.type": ["myeloid", "Tcell", "tumor"]},
                     max_ncells=300000,
                     emb_layer=-1,  # -1 or 0; -1 is preferred
                     emb_label=["condition", "tumor_anno", "celltype_1", "orig.ident"],
                     labels_to_plot=["condition"],
                     forward_batch_size=20,
                     nproc=20)
print("initialization ok")
initialization ok
When extracting embeddings, you can use either your fine-tuned model or the pretrained model. Note that if you want to plot a different label, change the labels_to_plot argument in the code block above, as in the sketch below.
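For example, a minimal sketch of re-initializing the extractor to color the plot by cell type instead of condition (the variable name embex_ct is ours; the label passed to labels_to_plot must also be listed in emb_label):

# sketch: same settings as above, but plot the celltype_1 label instead
embex_ct = EmbExtractor(model_type="CellClassifier",
                        num_classes=2,
                        max_ncells=300000,
                        emb_layer=-1,
                        emb_label=["condition", "tumor_anno", "celltype_1", "orig.ident"],
                        labels_to_plot=["celltype_1"],  # changed: must appear in emb_label
                        forward_batch_size=20,
                        nproc=20)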
In [15]:
# extract embeddings from input data
# example dataset: https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files/cell_classification/disease_classification/human_dcm_hcm_nf.dataset
embs = embex.extract_embs("../geneformer-12L-30M/",       # model dir
                          "./test_dataset.dataset",       # dataset dir
                          "../PBMC_result/",              # output dir
                          "PBMC_pretrain_condition")      # output prefix
print("extract embeddings ok")
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../geneformer-12L-30M/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
File /mnt/disk/mwang/conda3/lib/python3.11/site-packages/torch/cuda/__init__.py:260, in _lazy_init()
    259 try:
--> 260     queued_call()
    261 except Exception as e:

File /mnt/disk/mwang/conda3/lib/python3.11/site-packages/torch/cuda/__init__.py:145, in _check_capability()
    144 for d in range(device_count()):
--> 145     capability = get_device_capability(d)

File /mnt/disk/mwang/conda3/lib/python3.11/site-packages/torch/cuda/__init__.py:399, in get_device_properties(device)
    398     raise AssertionError("Invalid device id")
--> 399 return _get_device_properties(device)

RuntimeError: device >= 0 && device < num_gpus INTERNAL ASSERT FAILED at "/opt/conda/conda-bld/pytorch_1682343970094/work/aten/src/ATen/cuda/CUDAContext.cpp":50, please report a bug to PyTorch.

The above exception was the direct cause of the following exception:

DeferredCudaCallError                     Traceback (most recent call last)
Cell In[15], line 3
----> 3 embs = embex.extract_embs("../geneformer-12L-30M/",

File /mnt/disk/mwang/conda3/lib/python3.11/site-packages/geneformer/emb_extractor.py:388, in EmbExtractor.extract_embs(self, model_directory, input_data_file, output_directory, output_prefix)
--> 388 model = load_model(self.model_type, self.num_classes, model_directory)

File /mnt/disk/mwang/conda3/lib/python3.11/site-packages/geneformer/in_silico_perturber.py:78, in load_model(model_type, num_classes, model_directory)
     77 model.eval()
---> 78 model = model.to("cuda:0")

File /mnt/disk/mwang/conda3/lib/python3.11/site-packages/transformers/modeling_utils.py:2053, in PreTrainedModel.to(self, *args, **kwargs)
-> 2053 return super().to(*args, **kwargs)

    [... repeated torch.nn.Module._apply frames elided ...]

File /mnt/disk/mwang/conda3/lib/python3.11/site-packages/torch/cuda/__init__.py:264, in _lazy_init()
--> 264 raise DeferredCudaCallError(msg) from e

DeferredCudaCallError: CUDA call failed lazily at initialization with error: device >= 0 && device < num_gpus INTERNAL ASSERT FAILED at "/opt/conda/conda-bld/pytorch_1682343970094/work/aten/src/ATen/cuda/CUDAContext.cpp":50, please report a bug to PyTorch.

CUDA call was originally invoked at import time: `from geneformer import EmbExtractor` imports torch, and torch/cuda/__init__.py queues _check_capability via _lazy_call during `import torch`.
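The internal assert (device >= 0 && device < num_gpus) typically means PyTorch was pointed at a GPU index that does not exist on this node, e.g. an empty or stale CUDA_VISIBLE_DEVICES setting, or a driver/runtime mismatch. A minimal diagnostic sketch, to be run in a fresh kernel since the failing check is queued the moment torch is first imported (the device index "0" is an assumption about this machine):

import os
# pin a GPU that actually exists *before* torch/geneformer are imported;
# "0" is an assumption about this machine, adjust to a valid index
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
print(torch.cuda.is_available())   # should be True before calling extract_embs
print(torch.cuda.device_count())   # must be >= 1 for model.to("cuda:0")
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

If device_count() is still 0 here, the problem is in the environment (driver, allocated GPUs), not in the Geneformer code.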
In [ ]:
# plot UMAP of cell embeddings
# note: scanpy necessarily saves UMAP figures to the ./figures directory
embex.plot_embs(embs=embs,
                plot_style="umap",
                max_ncells_to_plot=300000,
                output_directory="../PBMC_result/",
                output_prefix="emb_plot_pretrain_condition")
print("umap OK")
/scratch/PI/jgwang/mwang/anaconda3/envs/py310/lib/python3.10/site-packages/anndata/utils.py:252: UserWarning: X converted to numpy array with dtype float64
  warnings.warn(f"{name} converted to numpy array with dtype {arr.dtype}")
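The anndata UserWarning above is benign: it only reports that the embedding matrix was cast to float64 when building the AnnData object for scanpy. Separately, assuming extract_embs also wrote the embeddings table to <output_directory>/<output_prefix>.csv (a filename inferred from the arguments above, not confirmed here), you can inspect it directly:

import pandas as pd

# hypothetical path, assuming extract_embs saved <output_prefix>.csv
embs_df = pd.read_csv("../PBMC_result/PBMC_pretrain_condition.csv", index_col=0)
print(embs_df.shape)                         # cells x (embedding dims + label columns)
print(embs_df["condition"].value_counts())   # per-condition cell counts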
In [ ]:
# plot heatmap of cell embeddings
# embex.plot_embs(embs=embs,
#                 plot_style="heatmap",
#                 max_ncells_to_plot=300000,
#                 output_directory="../scRNA_result/",
#                 output_prefix="emb_plot_heatmap")