Upload tokenization_nicheformer.py

tokenization_nicheformer.py — CHANGED (+277 additions, −346 deletions)

@@ -1,399 +1,330 @@
Previous version (removed):

from transformers import PreTrainedTokenizer
import numpy as np
from typing import List, Dict, Optional, Union, Tuple
import os
import json


class NicheformerTokenizer(PreTrainedTokenizer):
    """
    Tokenizer for Nicheformer models.

    This tokenizer converts gene expression data from AnnData objects into token IDs
    for the Nicheformer model. It also handles special tokens for modality, species,
    and assay information extracted from the observation columns.
    """

    vocab_files_names = {"vocab_file": "vocab.json"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        ...
        aux_tokens=30,
        **kwargs
    ):
        ...
        self.aux_tokens = aux_tokens
        ...
        self.technology_dict = {
            "merfish": 7,
            "MERFISH": 7,
            "cosmx": 8,
            "visium": 9,
            "10x 5' v2": 10,
            "10x 3' v3": 11,
            "10x 3' v2": 12,
            "10x 5' v1": 13,
            "10x 3' v1": 14,
            "10x 3' transcription profiling": 15,
            "10x transcription profiling": 15,
            "10x 5' transcription profiling": 16,
            "CITE-seq": 17,
            "Smart-seq v4": 18,
        }

    def get_vocab(self) -> Dict[str, int]:
        """Return the vocabulary as a dictionary of token to index."""
        if not self.vocab:
            # If vocab is empty, create a minimal vocab with special tokens
            vocab = {}
            # Add special tokens
            vocab["<pad>"] = 0
            vocab["<eos>"] = 1
            vocab[""] = 2
            # Add modality tokens
            for token, idx in self.modality_dict.items():
                vocab[token] = idx
            # Add species tokens
            for token, idx in self.specie_dict.items():
                vocab[token] = idx
            # Add technology tokens
            for token, idx in self.technology_dict.items():
                vocab[token] = idx
            return vocab
        return self.vocab

    def _tokenize(self, text):
        """Not used for gene expression data, but required by the interface."""
        return [text]

    def _convert_token_to_id(self, token):
        """Convert a token to an ID using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Convert an ID to a token using the vocab."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Not used for gene expression data, but required by the interface."""
        return " ".join(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Save the vocabulary to a file."""
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f)

        return (vocab_file,)

    def _sf_normalize(self, X):
        """Size factor normalize to 10k counts."""
        X = X.copy()
        counts = np.array(X.sum(axis=1))
        # avoid zero division error
        counts += counts == 0.
        # normalize to 10000 counts
        scaling_factor = 10000. / counts

        from scipy.sparse import issparse
        if issparse(X):
            from sklearn.utils import sparsefuncs
            sparsefuncs.inplace_row_scale(X, scaling_factor)
        else:
            np.multiply(X, scaling_factor.reshape((-1, 1)), out=X)

        return X

    def _sub_tokenize_data(self, x):
        """Tokenize the input gene vector"""
        from scipy.sparse import issparse

        if issparse(x):
            x = x.toarray()

        n_cells, n_genes = x.shape
        scores_final = np.zeros((n_cells, self.max_seq_len), dtype=np.int32)

        for i, cell in enumerate(x):
            nonzero_mask = np.nonzero(cell)[0]
            sorted_indices = nonzero_mask[np.argsort(-cell[nonzero_mask])][:self.max_seq_len]
            sorted_indices = sorted_indices + self.aux_tokens  # reserve tokens for padding etc

            scores = np.zeros(self.max_seq_len, dtype=np.int32)
            scores[:len(sorted_indices)] = sorted_indices.astype(np.int32)

            scores_final[i, :] = scores

        return scores_final

    def tokenize_anndata(self, adata, median_counts_per_gene=None, subset_obs=None):
        """
        Tokenize gene expression data from an AnnData object.

        Args:
            ...
            median_counts_per_gene: Median counts per gene for normalization
            subset_obs: Indices or boolean mask to subset observations

        Returns:
            ...
        """
        # Subset observations if requested
        if subset_obs is not None:
            adata = adata[subset_obs].copy()
        else:
            adata = adata.copy()

        # Extract expression matrix
        X = adata.X

        # Normalize data
        X = np.nan_to_num(X) if not isinstance(X, np.ndarray) or not np.issubdtype(X.dtype, np.integer) else X
        X = self._sf_normalize(X)

        if median_counts_per_gene is not None:
            median_counts_per_gene = median_counts_per_gene.copy()
            median_counts_per_gene += median_counts_per_gene == 0
            ...

        # Tokenize
        tokens = self._sub_tokenize_data(X)

        # Create attention mask (1 for real tokens, 0 for padding)
        attention_mask = (tokens != self._pad_token_id).astype(np.int32)

        # Extract metadata from obs
        result = {
            "input_ids": tokens,
            "attention_mask": attention_mask
        }

        ...
        if adata.obs['modality'].dtype == 'object':
            # Convert string values to token IDs
            modality_ids = np.array([self.modality_dict.get(m, 0) for m in adata.obs['modality']])
        else:
            # Assume already tokenized
            modality_ids = adata.obs['modality'].values
        result["modality"] = modality_ids

        if 'specie' in adata.obs.columns:
            if adata.obs['specie'].dtype == 'object':
                specie_ids = np.array([self.specie_dict.get(s, 0) for s in adata.obs['specie']])
            else:
                specie_ids = adata.obs['specie'].values
            result["specie"] = specie_ids

        if 'assay' in adata.obs.columns:
            if adata.obs['assay'].dtype == 'object':
                assay_ids = np.array([self.technology_dict.get(a, 0) for a in adata.obs['assay']])
            else:
                assay_ids = adata.obs['assay'].values
            result["assay"] = assay_ids

        return result

    def ...(
        self,
        adata=None,
        ...
        assay=None,
        subset_obs=None,
        return_tensors=None,
        **kwargs
    ):
        """
        Encode a batch of gene expression data.

        Args:
            adata: AnnData object
            ...
            assay: List or array of assay/technology values
            subset_obs: Indices or boolean mask to subset observations in adata
            return_tensors: Format of the returned tensors ("pt" for PyTorch, "tf" for TensorFlow, None for numpy)

        Returns:
            Dictionary with ...
        """
        if adata is not None:
            ...
            if isinstance(modality[0], str):
                modality_ids = np.array([self.modality_dict.get(m, 0) for m in modality])
            else:
                modality_ids = np.array(modality)
            result["modality"] = modality_ids

            ...
            expression_matrix = expression_matrix.toarray()

            # Normalize data
            expression_matrix = np.nan_to_num(expression_matrix)
            expression_matrix = self._sf_normalize(expression_matrix)

            if median_counts_per_gene is not None:
                median_counts_per_gene = median_counts_per_gene.copy()
                median_counts_per_gene += median_counts_per_gene == 0
                expression_matrix = expression_matrix / median_counts_per_gene.reshape((1, -1))

            ...
            result = {
                ...
                "attention_mask": attention_mask
            }

            if modality is not None:
                if isinstance(modality[0], str):
                    modality_ids = np.array([self.modality_dict.get(m, 0) for m in modality])
                else:
                    modality_ids = np.array(modality)
                result["modality"] = modality_ids

            if specie is not None:
                if isinstance(specie[0], str):
                    specie_ids = np.array([self.specie_dict.get(s, 0) for s in specie])
                else:
                    specie_ids = np.array(specie)
                result["specie"] = specie_ids

            if assay is not None:
                if isinstance(assay[0], str):
                    assay_ids = np.array([self.technology_dict.get(a, 0) for a in assay])
                else:
                    assay_ids = np.array(assay)
                result["assay"] = assay_ids

        ...
        # Convert to tensors if requested
        if return_tensors == "pt":
            import torch
            result = {k: torch.tensor(v) for k, v in result.items()}
        # Otherwise keep as numpy arrays

        return result

    def ...(
        ...
    ):
        """..."""
        ...
        return ...(
            ...
            assay=assay,
            return_tensors=return_tensors,
            subset_obs=subset_obs,
            **kwargs
        )
New version (added):

from typing import List, Dict, Optional, Union, Tuple
import numpy as np
from transformers import PreTrainedTokenizer
from dataclasses import dataclass
import torch
import anndata as ad
from scipy.sparse import issparse
import numba
import os
import json

# Token IDs must match exactly with the original implementation
PAD_TOKEN = 0
MASK_TOKEN = 1
CLS_TOKEN = 2

# These mappings preserve the exact token IDs from the original implementation
MODALITY_DICT = {
    'dissociated': 3,
    'spatial': 4,
}

SPECIES_DICT = {
    'human': 5,
    'Homo sapiens': 5,
    'Mus musculus': 6,
    'mouse': 6,
}

TECHNOLOGY_DICT = {
    "merfish": 7,
    "MERFISH": 7,
    "cosmx": 8,
    "NanoString digital spatial profiling": 8,
    "Xenium": 9,
    "10x 5' v2": 10,
    "10x 3' v3": 11,
    "10x 3' v2": 12,
    "10x 5' v1": 13,
    "10x 3' v1": 14,
    "10x 3' transcription profiling": 15,
    "10x transcription profiling": 15,
    "10x 5' transcription profiling": 16,
    "CITE-seq": 17,
    "Smart-seq v4": 18,
}


def sf_normalize(X: np.ndarray) -> np.ndarray:
    """Size factor normalize to 10k counts."""
    X = X.copy()
    counts = np.array(X.sum(axis=1))
    # avoid zero division error
    counts += counts == 0.
    # normalize to 10000 counts
    scaling_factor = 10000. / counts

    if issparse(X):
        from sklearn.utils import sparsefuncs  # sparsefuncs lives in sklearn, not scipy.sparse
        sparsefuncs.inplace_row_scale(X, scaling_factor)
    else:
        np.multiply(X, scaling_factor.reshape((-1, 1)), out=X)

    return X


@numba.jit(nopython=True, nogil=True)
def _sub_tokenize_data(x: np.ndarray, max_seq_len: int = -1, aux_tokens: int = 30) -> np.ndarray:
    """Tokenize the input gene vector: rank each cell's genes by descending
    expression and emit their (offset) gene indices as token IDs."""
    scores_final = np.empty((x.shape[0], max_seq_len if max_seq_len > 0 else x.shape[1]))
    for i, cell in enumerate(x):
        nonzero_mask = np.nonzero(cell)[0]
        sorted_indices = nonzero_mask[np.argsort(-cell[nonzero_mask])][:max_seq_len]
        # shift by aux_tokens to reserve low IDs for padding and other special tokens
        sorted_indices = sorted_indices + aux_tokens
        if max_seq_len > 0:
            scores = np.zeros(max_seq_len, dtype=np.int32)
        else:
            scores = np.zeros_like(cell, dtype=np.int32)
        scores[:len(sorted_indices)] = sorted_indices.astype(np.int32)
        scores_final[i, :] = scores
    return scores_final
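# A toy illustration of the ranking scheme above (hypothetical numbers): for a
# cell with counts [0., 5., 1., 3.] and aux_tokens=30, the nonzero genes sorted
# by descending expression are indices [1, 3, 2], so the emitted sequence
# starts [31, 33, 32] and is zero-padded (PAD_TOKEN) up to max_seq_len.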
class NicheformerTokenizer(PreTrainedTokenizer):
    """Tokenizer for Nicheformer that handles single-cell data."""

    model_input_names = ["input_ids", "attention_mask"]
    vocab_files_names = {"vocab_file": "vocab.json"}

    modality_dict = MODALITY_DICT
    species_dict = SPECIES_DICT
    technology_dict = TECHNOLOGY_DICT

    def __init__(
        self,
        vocab_file=None,
        max_length: int = 1500,
        aux_tokens: int = 30,
        median_counts_per_gene: Optional[np.ndarray] = None,
        gene_names: Optional[List[str]] = None,
        **kwargs
    ):
        # Initialize base vocabulary
        self._vocabulary = {
            "[PAD]": PAD_TOKEN,
            "[MASK]": MASK_TOKEN,
            "[CLS]": CLS_TOKEN,
        }

        if vocab_file is not None:
            with open(vocab_file, 'r') as f:
                self._vocabulary.update(json.load(f))
        else:
            # Add modality tokens
            for name, idx in self.modality_dict.items():
                self._vocabulary[f"[MODALITY_{name}]"] = idx
            # Add species tokens
            for name, idx in self.species_dict.items():
                if name in ["Homo sapiens", "Mus musculus"]:
                    continue  # Skip redundant names
                self._vocabulary[f"[SPECIES_{name}]"] = idx
            # Add technology tokens
            for name, idx in self.technology_dict.items():
                if name in ["MERFISH", "10x transcription profiling"]:
                    continue  # Skip redundant names
                clean_name = name.lower().replace(" ", "_").replace("'", "_")
                self._vocabulary[f"[TECH_{clean_name}]"] = idx

            # Add gene tokens if provided
            if gene_names is not None:
                for i, gene in enumerate(gene_names):
                    self._vocabulary[gene] = i + aux_tokens
            # Save vocabulary
            os.makedirs('to_hf', exist_ok=True)
            with open('to_hf/vocab.json', 'w') as f:
                json.dump(self._vocabulary, f, indent=4)

        super().__init__(**kwargs)

        self.max_length = max_length
        self.aux_tokens = aux_tokens
        self.median_counts_per_gene = median_counts_per_gene
        self.gene_names = gene_names

        # Set up special token mappings
        self._pad_token = "[PAD]"
        self._mask_token = "[MASK]"
        self._cls_token = "[CLS]"

    def get_vocab(self) -> Dict[str, int]:
        """Returns the vocabulary mapping."""
        return self._vocabulary.copy()
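    # With the default dictionaries, get_vocab() therefore contains entries such
    # as {"[PAD]": 0, "[MASK]": 1, "[CLS]": 2, "[MODALITY_dissociated]": 3,
    # "[SPECIES_human]": 5, "[TECH_merfish]": 7, "[TECH_10x_5__v2]": 10, ...},
    # and gene tokens (when gene_names is given) start at ID aux_tokens (30).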
    def _tokenize(self, text: str) -> List[str]:
        """Tokenize text input."""
        # This tokenizer doesn't handle text input directly
        raise NotImplementedError("This tokenizer only works with gene expression data")

    def _convert_token_to_id(self, token: str) -> int:
        """Convert token to ID."""
        # First check special token mappings
        if token in self.modality_dict:
            return self.modality_dict[token]
        if token in self.species_dict:
            return self.species_dict[token]
        if token in self.technology_dict:
            return self.technology_dict[token]
        # Then check vocabulary
        return self._vocabulary.get(token, self._vocabulary["[PAD]"])

    def _convert_id_to_token(self, index: int) -> str:
        """Convert ID to token."""
        # First check special token mappings
        for token, idx in self.modality_dict.items():
            if idx == index:
                return token
        for token, idx in self.species_dict.items():
            if idx == index:
                return token
        for token, idx in self.technology_dict.items():
            if idx == index:
                return token
        # Then check vocabulary
        for token, idx in self._vocabulary.items():
            if idx == index:
                return token
        return "[PAD]"

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Save the vocabulary to a file."""
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocabulary, f, ensure_ascii=False)

        return (vocab_file,)

    def _tokenize_gene_expression(self, x: np.ndarray) -> np.ndarray:
        """Tokenize gene expression matrix.

        Args:
            x: Gene expression matrix (cells x genes)

        Returns:
            Tokenized matrix
        """
        # Handle sparse input
        if issparse(x):
            x = x.toarray()

        # Normalize and scale
        x = np.nan_to_num(x)
        x = sf_normalize(x)
        if self.median_counts_per_gene is not None:
            median_counts = self.median_counts_per_gene.copy()
            median_counts += median_counts == 0
            x = x / median_counts.reshape((1, -1))

        # Convert to tokens
        tokens = _sub_tokenize_data(x, self.max_length, self.aux_tokens)

        return tokens.astype(np.int32)
    def __call__(
        self,
        adata: Optional[ad.AnnData] = None,
        gene_expression: Optional[Union[np.ndarray, List[float]]] = None,
        modality: Optional[str] = None,
        species: Optional[str] = None,
        technology: Optional[str] = None,
        **kwargs
    ) -> Dict[str, torch.Tensor]:
        """Convert inputs to model inputs.

        Args:
            adata: AnnData object
            gene_expression: Gene expression matrix if not using AnnData
            modality: Modality type
            species: Species type
            technology: Technology/assay type

        Returns:
            Dictionary with model inputs
        """
        if adata is not None:
            # Get expression matrix
            if issparse(adata.X):
                x = adata.X.toarray()
            else:
                x = adata.X

            # Get metadata for each cell if not provided
            if modality is None and 'modality' in adata.obs:
                modality = adata.obs['modality'].values
            if species is None and 'specie' in adata.obs:
                species = adata.obs['specie'].values
            if technology is None and 'assay' in adata.obs:
                technology = adata.obs['assay'].values

        elif gene_expression is not None:
            x = np.array(gene_expression)
            if len(x.shape) == 1:
                x = x.reshape(1, -1)
            # For single gene expression input, convert scalar metadata to arrays
            if modality is not None:
                modality = np.array([modality])
            if species is not None:
                species = np.array([species])
            if technology is not None:
                technology = np.array([technology])
        else:
            raise ValueError("Either adata or gene_expression must be provided")

        # Tokenize gene expression
        token_ids = self._tokenize_gene_expression(x)
        n_cells = token_ids.shape[0]

        # Add special tokens for each cell
        special_tokens = np.zeros((n_cells, 3), dtype=np.int32)  # 3 for modality, species, technology
        special_token_mask = np.zeros((n_cells, 3), dtype=bool)  # Track which tokens are actually present

        if modality is not None:
            special_tokens[:, 0] = [self.modality_dict.get(m, self._vocabulary["[PAD]"]) for m in modality]
            special_token_mask[:, 0] = True

        if species is not None:
            special_tokens[:, 1] = [self.species_dict.get(s, self._vocabulary["[PAD]"]) for s in species]
            special_token_mask[:, 1] = True

        if technology is not None:
            special_tokens[:, 2] = [self.technology_dict.get(t, self._vocabulary["[PAD]"]) for t in technology]
            special_token_mask[:, 2] = True

        # Only keep the special tokens that are present (have True in mask)
        special_tokens = special_tokens[:, special_token_mask[0]]

        if special_tokens.size > 0:
            # Prepend metadata tokens and truncate so the total length stays max_length
            token_ids = np.concatenate([special_tokens, token_ids[:, :(self.max_length - special_tokens.shape[1])]], axis=1)

        # Create attention mask
        attention_mask = (token_ids != self._vocabulary["[PAD]"])

        return {
            "input_ids": torch.tensor(token_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask)
        }
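    # Resulting per-cell layout when all three annotations are supplied:
    # [modality_id, species_id, technology_id, gene_token_1, ..., gene_token_k, 0, ...];
    # e.g. a dissociated human 10x 3' v3 cell starts [3, 5, 11, ...].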
    def get_vocab_size(self) -> int:
        """Get vocabulary size."""
        if self.gene_names is not None:
            return len(self.gene_names) + self.aux_tokens
        return max(
            max(self.modality_dict.values()),
            max(self.species_dict.values()),
            max(self.technology_dict.values())
        ) + 1

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Convert a sequence of tokens to a string. Not used for gene expression."""
        raise NotImplementedError("This tokenizer only works with gene expression data")

    def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]:
        """Build model inputs from a sequence by adding special tokens."""
        # For gene expression data, special tokens are handled in __call__
        return token_ids_0

    def get_special_tokens_mask(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False) -> List[int]:
        """Return a list with 1 for special tokens and 0 otherwise."""
        # Consider tokens < aux_tokens as special
        return [1 if token_id < self.aux_tokens else 0 for token_id in token_ids_0]
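For reference, a minimal usage sketch (assuming this file is importable as tokenization_nicheformer; the gene panel, counts, and metadata values below are made up for illustration, not from the upload):

    import numpy as np
    from tokenization_nicheformer import NicheformerTokenizer

    # hypothetical 100-gene panel; real checkpoints ship their own vocab/gene order
    tokenizer = NicheformerTokenizer(gene_names=[f"GENE{i}" for i in range(100)], max_length=64)

    counts = np.random.poisson(1.0, size=(2, 100)).astype(float)  # 2 cells x 100 genes
    batch = tokenizer(
        gene_expression=counts,
        modality="dissociated",
        species="human",
        technology="10x 3' v3",
    )
    print(batch["input_ids"].shape)       # torch.Size([2, 64])
    print(batch["attention_mask"].shape)  # torch.Size([2, 64])

The three metadata tokens occupy the first positions, so with max_length=64 at most 61 ranked gene tokens remain per cell.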