Remove stale path: examples/legacy_from_gene_mamba/mamba2_classification_finetune_without_label_zero_shot.py
Browse files
examples/legacy_from_gene_mamba/mamba2_classification_finetune_without_label_zero_shot.py
DELETED
|
@@ -1,197 +0,0 @@
|
|
| 1 |
-
# %%
# Imports for zero-shot cell-embedding extraction with a fine-tuned GeneMamba2.
import torch
from transformers import Trainer
import os

import pyarrow as pa
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

from torch.utils.data import Dataset
from transformers import AutoTokenizer, TrainingArguments

import argparse

from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
from transformers import AutoTokenizer, TrainingArguments, MambaForCausalLM

from dotmap import DotMap

import sys
import os
import torch

# Make the project package importable from this notebook-style script.
sys.path.append("/project/zhiwei/cq5/PythonWorkSpace/gene_mamba")

from models import Classifier, GeneMamba, GeneMambaForCellAnnotation, GeneMambaForGeneClassification, GeneMamba2, GeneMamba2ForCellClassification
from utils import permute_genes_by_expression, build_downstream_dataset, get_last_checkpoint


# Reload project modules so in-session edits to models/utils take effect
# without restarting the kernel.
import importlib
importlib.reload(sys.modules['models'])
importlib.reload(sys.modules['utils'])
|
| 36 |
-
# %%
import scanpy as sc

# import argparse

# parser = argparse.ArgumentParser()
# parser.add_argument("--dataset_name", type=str)

# args2 = parser.parse_args()

# dataset_name = args2.dataset_name

# Downstream dataset to embed; must be one of the preprocessed h5ad files below.
dataset_name = "pbmc12k"

assert dataset_name in ["pbmc12k", "perirhinal_cortex", "covid19"]

# Load the preprocessed AnnData object for the chosen dataset.
adata = sc.read_h5ad(f'/project/zhiwei/cq5/PythonWorkSpace/gene_mamba/dataset/downstream/processed/{dataset_name}_processed.h5ad')

# Cell-type labels are required for downstream annotation evaluation.
assert "celltype" in adata.obs

print(adata)
|
| 58 |
-
# %%
from transformers import PretrainedConfig

# Backbone hyperparameters matching the pretrained GeneMamba2 checkpoint
# (24 Mamba layers, 512-d hidden size).
config = PretrainedConfig.from_dict({
    "d_model": 512,
    "mamba_layer": 24,
})


# %%
# NOTE(review): an identical model instance is constructed again further down
# in this script; this first construction appears redundant — confirm.
model = GeneMamba2(config, model_path="/project/zhiwei/cq5/LLM_checkpoints/GeneMamba/GeneMamba2_24l_512d/1/16m/checkpoint-31250", tokenizer_path="/project/zhiwei/cq5/PythonWorkSpace/gene_mamba/gene_tokenizer.json", args=None)
|
| 70 |
-
# %%
# Rank each cell's genes by expression and map them to token ids.
permuted_gene_ids = permute_genes_by_expression(adata, dataset_name, model.tokenizer, model.symbol2id)
permuted_gene_ids

# %%
num_samples = permuted_gene_ids.shape[0]
num_avaliable_gpu = torch.cuda.device_count()  # NOTE(review): "avaliable" is a typo (available); kept for compatibility

# %%
from dotmap import DotMap

# Run configuration with attribute-style access (args.seq_len etc.).
# Only seq_len and batch_size are actually used below in this script.
args = DotMap(
    {
        # "model": "state-spaces/mamba-130m-hf",
        # "tokenizer": "state-spaces/mamba-130m-hf",
        "learning_rate": 5e-5,
        "batch_size": 16,
        "gradient_accumulation_steps": 1,
        "optim": "adamw_torch",
        # "data_path": "/home/cong/study/codeSpace/VSCodeSpace/PythonWorkSpace/TCRPrediction/mamba_transformer/smiles_data.txt",
        # "num_epochs": args2.num_epochs,
        "seq_len": 2048,  # maximum number of gene tokens kept per cell
        "num_samples": num_samples,
        "num_gpus": num_avaliable_gpu,
        "output_dir": "/project/zhiwei/cq5/PythonWorkSpace/gene_mamba/analysis/cell_type_annotation/fine-tuned",
    }
)


#%%
model = GeneMamba2(config, model_path="/project/zhiwei/cq5/LLM_checkpoints/GeneMamba/GeneMamba2_24l_512d/1/16m/checkpoint-31250", tokenizer_path="/project/zhiwei/cq5/PythonWorkSpace/gene_mamba/gene_tokenizer.json", args=None)

# Grow the embedding table to match the tokenizer vocabulary (e.g. added [CLS]).
model.resize_token_embeddings()
|
| 104 |
-
#%%
def get_last_checkpoint(output_dir):
    """Return the path of the newest "checkpoint-<step>" entry in *output_dir*.

    Args:
        output_dir: directory containing Trainer-style checkpoint folders.

    Returns:
        Joined path of the checkpoint with the highest step number.

    Raises:
        FileNotFoundError: if *output_dir* holds no "checkpoint-<int>" entries
            (the original raised an opaque IndexError here, and a ValueError
            on non-numeric suffixes such as "checkpoint-best").

    NOTE(review): this shadows the ``get_last_checkpoint`` imported from
    ``utils`` at the top of the file — confirm the local override is intended.
    """
    steps = []
    for name in os.listdir(output_dir):
        prefix, sep, suffix = name.partition("-")
        # Accept only canonical "checkpoint-<int>" names; the original crashed
        # on entries like "checkpoint-best" or a bare "checkpoint".
        if prefix == "checkpoint" and sep and suffix.isdigit():
            steps.append(int(suffix))
    if not steps:
        raise FileNotFoundError(f"no checkpoint-<step> entries found in {output_dir}")
    return os.path.join(output_dir, f"checkpoint-{max(steps)}")
|
| 114 |
-
# Directory holding the fine-tuned checkpoints for this dataset.
ckpt_pth = f"/project/zhiwei/cq5/PythonWorkSpace/gene_mamba/analysis/cell_type_annotation/fine-tuned/{dataset_name}"

last_checkpoint = get_last_checkpoint(ckpt_pth)
# Fine-tuned weights live in safetensors format inside the checkpoint folder.
state_dict_pth = os.path.join(last_checkpoint, "model.safetensors")

print(state_dict_pth)
|
| 121 |
-
#%%
from safetensors.torch import load_file

# Load the fine-tuned weights and push them into the wrapped backbone
# (model.model is the inner module the state dict was saved from).
state_dict = load_file(state_dict_pth)

model.model.load_state_dict(state_dict)
|
| 129 |
-
# %%
# Truncate each cell's gene-token sequence to the model context length.
input_data = permuted_gene_ids[:, :args.seq_len]

# %%
input_data.shape

#%%
# check if cls_token in the tokenizer:
if model.tokenizer.cls_token_id is None:
    model.tokenizer.add_special_tokens({'cls_token': '[CLS]'})

#%%
# Prepend a [CLS] token id to every row; its hidden state is later used as
# the whole-cell representation.
input_data = np.hstack([np.array([model.tokenizer.cls_token_id for _ in range(input_data.shape[0])]).reshape(-1, 1), input_data])

#%%
input_data.shape
|
| 146 |
-
|
| 147 |
-
#%%
|
| 148 |
-
from torch.utils.data import DataLoader, Dataset
|
| 149 |
-
|
| 150 |
-
class GeneDataset(Dataset):
    """Minimal map-style dataset serving rows of a pre-tokenized id matrix."""

    def __init__(self, data):
        # Keep a reference only; rows are sliced lazily in __getitem__.
        self.data = data

    def __len__(self):
        # One sample per row (cell).
        return len(self.data)

    def __getitem__(self, idx):
        # A single row of token ids.
        return self.data[idx]
|
| 160 |
-
|
| 161 |
-
#%%
# No shuffling: output embedding order must match adata row order.
all_dataset = GeneDataset(input_data)
all_loader = DataLoader(all_dataset, batch_size = args.batch_size, shuffle=False)
| 165 |
-
# %%
def cell_embeddings(data_loader, model):
    """Extract per-cell [CLS] embeddings from *model* over *data_loader*.

    Args:
        data_loader: iterable yielding LongTensor batches of token ids,
            each row starting with the [CLS] token id.
        model: callable whose output exposes a ``hidden_states`` tensor of
            shape (batch, seq_len, d_model); must expose ``.device``.

    Returns:
        np.ndarray of shape (num_cells, d_model) holding the [CLS]
        hidden state of every cell, in loader order.
    """
    cell_repr = []

    # Inference only: disable autograd so no graph is built during the
    # forward pass. The original relied on .detach() after the fact, which
    # still paid the graph-construction cost on every batch.
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            batch = batch.to(model.device)
            outputs = model(batch)

            # Position 0 is the prepended [CLS] token; its hidden state is
            # the whole-cell representation.
            cls_representation = outputs.hidden_states[:, 0, :]
            cell_repr.append(cls_representation.detach().cpu().numpy())

            if i % 10 == 0:
                print(f"Processed {i} batches")

    cell_repr = np.concatenate(cell_repr)
    return cell_repr
|
| 184 |
-
# %%
# Move to GPU and switch to inference mode before extracting embeddings.
model = model.to("cuda")
model.eval()

# %%
cell_repr = cell_embeddings(all_loader, model)
cell_repr.shape


# cell_repr = np.concatenate(cell_repr)
# %%
# Persist the per-cell [CLS] embeddings for downstream annotation analysis.
np.save(f"/project/zhiwei/cq5/PythonWorkSpace/gene_mamba/analysis/cell_type_annotation/embeddings/fine-tuned/{dataset_name}_cell_repr.npy", cell_repr)

# %%
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|