from itertools import islice
from typing import Tuple, List

import numpy as np
from anndata import AnnData


class AnnDataValidator:
    """Validate AnnData objects for spatial visualization requirements.

    All methods are static; the class only groups the helpers together with
    the size limits they enforce.
    """

    MAX_OBS = 500_000  # Max number of observations (cells/spots)
    MAX_VARS = 50_000  # Max number of variables (genes)

    @staticmethod
    def validate(adata: AnnData) -> Tuple[bool, List[str]]:
        """
        Validate AnnData object for spatial visualization

        Every check is run and every failure is collected, so the caller
        receives the full list of problems rather than only the first one.

        Args:
            adata: AnnData object to validate

        Returns:
            Tuple of (is_valid, error_messages); ``is_valid`` is True exactly
            when ``error_messages`` is empty.
        """
        errors: List[str] = []

        # Spatial coordinates must exist and be an (n, 2) array.
        if "spatial" not in adata.obsm:
            errors.append(
                "Missing spatial coordinates. adata.obsm['spatial'] is required."
            )
        else:
            spatial = adata.obsm["spatial"]
            # Check the rank before indexing shape[1]: a 1-D (or shapeless)
            # value must yield a validation error, not an IndexError.
            shape = getattr(spatial, "shape", ())
            if len(shape) != 2 or shape[1] != 2:
                errors.append(
                    f"Spatial coordinates must be 2D (x, y). Got shape: {shape}"
                )

        # Check number of observations
        if adata.n_obs > AnnDataValidator.MAX_OBS:
            errors.append(
                f"Too many observations: {adata.n_obs:,} (max: {AnnDataValidator.MAX_OBS:,})"
            )

        # Check number of variables
        if adata.n_vars > AnnDataValidator.MAX_VARS:
            errors.append(
                f"Too many variables: {adata.n_vars:,} (max: {AnnDataValidator.MAX_VARS:,})"
            )

        # Check if data is accessible. Any failure while reading var_names is
        # reported as a validation error instead of propagating.
        try:
            _ = adata.var_names
        except Exception as e:
            errors.append(f"Cannot access variable names: {e}")

        return (not errors, errors)

    @staticmethod
    def validate_gene(adata: AnnData, gene_name: str) -> Tuple[bool, str]:
        """
        Validate if a gene exists in the dataset

        When the gene is missing, up to five names that contain ``gene_name``
        as a case-insensitive substring are suggested in the message.

        Args:
            adata: AnnData object
            gene_name: Gene name to check

        Returns:
            Tuple of (exists, message)
        """
        if gene_name in adata.var_names:
            return (True, f"Gene '{gene_name}' found.")

        # Lowercase the query once, and stop scanning as soon as five
        # suggestions have been collected.
        needle = gene_name.lower()
        matches = (g for g in adata.var_names if needle in g.lower())
        similar = list(islice(matches, 5))

        if similar:
            return (
                False,
                f"Gene '{gene_name}' not found. Similar genes: {', '.join(similar)}",
            )
        return (False, f"Gene '{gene_name}' not found in dataset.")

    @staticmethod
    def get_gene_expression(adata: AnnData, gene_name: str) -> np.ndarray:
        """
        Extract gene expression for a specific gene

        Args:
            adata: AnnData object
            gene_name: Gene name to extract

        Returns:
            Expression vector as numpy array

        Raises:
            ValueError: If gene not found (the message comes from
                ``validate_gene`` and may include similar gene names)
        """
        is_valid, message = AnnDataValidator.validate_gene(adata, gene_name)
        if not is_valid:
            raise ValueError(message)

        # Extract gene expression (works with backed mode)
        gene_data = adata[:, gene_name].X

        # Convert to dense array if sparse
        if hasattr(gene_data, "toarray"):
            gene_data = gene_data.toarray()

        # Flatten if needed
        if gene_data.ndim > 1:
            gene_data = gene_data.flatten()

        return gene_data

    @staticmethod
    def get_gene_list(adata: AnnData, limit: int = 1000) -> List[str]:
        """
        Get list of available genes (limited for performance)

        Args:
            adata: AnnData object
            limit: Maximum number of genes to return

        Returns:
            List of gene names
        """
        # Slice before converting so only the requested names are
        # materialised into a Python list.
        return list(adata.var_names[:limit])