theislab
/

Nicheformer

@@ -9,6 +9,7 @@ import numba
 import os
 import json
 from huggingface_hub import hf_hub_download
 # Token IDs must match exactly with the original implementation
 PAD_TOKEN = 0
@@ -236,89 +237,80 @@ class NicheformerTokenizer(PreTrainedTokenizer):
         return tokens.astype(np.int32)
-    def __call__(
-        self,
-        adata: Optional[ad.AnnData] = None,
-        gene_expression: Optional[Union[np.ndarray, List[float]]] = None,
-        modality: Optional[str] = None,
-        species: Optional[str] = None,
-        technology: Optional[str] = None,
-        **kwargs
-    ) -> Dict[str, torch.Tensor]:
-        """Convert inputs to model inputs.
         Args:
-            adata: AnnData object
-            gene_expression: Gene expression matrix if not using AnnData
-            modality: Modality type
-            species: Species type
-            technology: Technology/assay type
         Returns:
-            Dictionary with model inputs
         """
-        if adata is not None:
-            # Align with reference model if needed
-            reference_model = self._load_reference_model()
-            if reference_model is not None:
-                # Concatenate and then remove the reference
-                adata = ad.concat([reference_model, adata], join='outer', axis=0)
-                adata = adata[1:]
-            # Get expression matrix
-            if issparse(adata.X):
-                x = adata.X.toarray()
             else:
-                x = adata.X
-            # Get metadata for each cell if not provided
-            if modality is None and 'modality' in adata.obs:
-                modality = adata.obs['modality'].values
-            if species is None and 'specie' in adata.obs:
-                species = adata.obs['specie'].values
-            if technology is None and 'assay' in adata.obs:
-                technology = adata.obs['assay'].values
-        elif gene_expression is not None:
-            x = np.array(gene_expression)
-            if len(x.shape) == 1:
-                x = x.reshape(1, -1)
-            # For single gene expression input, convert scalar metadata to arrays
-            if modality is not None:
-                modality = np.array([modality])
-            if species is not None:
-                species = np.array([species])
-            if technology is not None:
-                technology = np.array([technology])
-        else:
-            raise ValueError("Either adata or gene_expression must be provided")
-        # Tokenize gene expression
-        token_ids = self._tokenize_gene_expression(x)
-        n_cells = token_ids.shape[0]
-        # Add special tokens for each cell
-        special_tokens = np.zeros((n_cells, 3), dtype=np.int32)  # 3 for modality, species, technology
-        special_token_mask = np.zeros((n_cells, 3), dtype=bool)  # Track which tokens are actually present
-        if modality is not None:
-            special_tokens[:, 0] = [self.modality_dict.get(m, self._vocabulary["[PAD]"]) for m in modality]
             special_token_mask[:, 0] = True
-        if species is not None:
-            special_tokens[:, 1] = [self.species_dict.get(s, self._vocabulary["[PAD]"]) for s in species]
             special_token_mask[:, 1] = True
-        if technology is not None:
-            special_tokens[:, 2] = [self.technology_dict.get(t, self._vocabulary["[PAD]"]) for t in technology]
             special_token_mask[:, 2] = True
         # Only keep the special tokens that are present (have True in mask)
         special_tokens = special_tokens[:, special_token_mask[0]]
         if special_tokens.size > 0:
             token_ids = np.concatenate([special_tokens, token_ids[:, :(self.max_length - special_tokens.shape[1])]], axis=1)
         # Create attention mask
         attention_mask = (token_ids != self._vocabulary["[PAD]"])

 import os
 import json
 from huggingface_hub import hf_hub_download
+import pandas as pd
 # Token IDs must match exactly with the original implementation
 PAD_TOKEN = 0
         return tokens.astype(np.int32)
+    def __call__(self, data: Union[ad.AnnData, np.ndarray], **kwargs) -> Dict[str, torch.Tensor]:
+        """Tokenize gene expression data.
         Args:
+            data: AnnData object or numpy array of gene expression data
         Returns:
+            Dictionary with input_ids and attention_mask tensors
         """
+        if isinstance(data, ad.AnnData):
+            adata = data.copy()
+            # Align with reference model if available
+            if hasattr(self, '_load_reference_model'):
+                reference_model = self._load_reference_model()
+                if reference_model is not None:
+                    # Concatenate and then remove the reference
+                    adata = ad.concat([reference_model, adata], join='outer', axis=0)
+                    adata = adata[1:]
+            # Get gene expression data
+            X = adata.X
+            # Get metadata for special tokens
+            modality = adata.obs.get('modality', None)
+            species = adata.obs.get('specie', None)  # Note: using 'specie' as in the notebook
+            technology = adata.obs.get('assay', None)  # Note: using 'assay' as in the notebook
+            # Use integer values directly if available
+            if modality is not None and pd.api.types.is_numeric_dtype(modality):
+                modality_tokens = modality.astype(int).tolist()
             else:
+                modality_tokens = [self.modality_dict.get(m, self._vocabulary["[PAD]"]) for m in modality] if modality is not None else None
+            if species is not None and pd.api.types.is_numeric_dtype(species):
+                species_tokens = species.astype(int).tolist()
+            else:
+                species_tokens = [self.species_dict.get(s, self._vocabulary["[PAD]"]) for s in species] if species is not None else None
+            if technology is not None and pd.api.types.is_numeric_dtype(technology):
+                technology_tokens = technology.astype(int).tolist()
+            else:
+                technology_tokens = [self.technology_dict.get(t, self._vocabulary["[PAD]"]) for t in technology] if technology is not None else None
+        else:
+            X = data
+            modality_tokens = None
+            species_tokens = None
+            technology_tokens = None
+        # Tokenize gene expression data
+        token_ids = self._tokenize_gene_expression(X)
+        # Add special tokens if available
+        special_tokens = np.zeros((token_ids.shape[0], 3), dtype=np.int64)
+        special_token_mask = np.zeros((token_ids.shape[0], 3), dtype=bool)
+        if modality_tokens is not None:
+            special_tokens[:, 0] = modality_tokens
             special_token_mask[:, 0] = True
+        if species_tokens is not None:
+            special_tokens[:, 1] = species_tokens
             special_token_mask[:, 1] = True
+        if technology_tokens is not None:
+            special_tokens[:, 2] = technology_tokens
             special_token_mask[:, 2] = True
         # Only keep the special tokens that are present (have True in mask)
         special_tokens = special_tokens[:, special_token_mask[0]]
         if special_tokens.size > 0:
             token_ids = np.concatenate([special_tokens, token_ids[:, :(self.max_length - special_tokens.shape[1])]], axis=1)
         # Create attention mask
         attention_mask = (token_ids != self._vocabulary["[PAD]"])