# h5ad_viewer - revision 05fdb87 (verified), author: anway
from typing import Tuple, List
import numpy as np
from anndata import AnnData
class AnnDataValidator:
    """Validate AnnData objects for spatial visualization requirements.

    Every method is a ``@staticmethod``; the class only serves as a namespace
    and as the holder of the size limits below.
    """

    MAX_OBS = 500_000  # Max number of observations (cells/spots)
    MAX_VARS = 50_000  # Max number of variables (genes)

    @staticmethod
    def validate(adata: "AnnData") -> Tuple[bool, List[str]]:
        """Validate an AnnData object for spatial visualization.

        Checks performed, in order: presence and shape of
        ``adata.obsm['spatial']``, observation count against ``MAX_OBS``,
        variable count against ``MAX_VARS``, and accessibility of
        ``adata.var_names``. All problems are collected; validation does not
        stop at the first failure.

        Args:
            adata: AnnData object to validate.

        Returns:
            Tuple of ``(is_valid, error_messages)``. ``is_valid`` is True
            exactly when ``error_messages`` is empty.
        """
        errors: List[str] = []

        if "spatial" not in adata.obsm:
            errors.append(
                "Missing spatial coordinates. adata.obsm['spatial'] is required."
            )
        else:
            # np.shape() returns the full shape tuple, so a 1-D (or 0-D) array
            # is reported as a validation error instead of raising IndexError
            # on shape[1].
            shape = np.shape(adata.obsm["spatial"])
            if len(shape) != 2 or shape[1] != 2:
                errors.append(
                    f"Spatial coordinates must be 2D (x, y). Got shape: {shape}"
                )

        if adata.n_obs > AnnDataValidator.MAX_OBS:
            errors.append(
                f"Too many observations: {adata.n_obs:,} (max: {AnnDataValidator.MAX_OBS:,})"
            )

        if adata.n_vars > AnnDataValidator.MAX_VARS:
            errors.append(
                f"Too many variables: {adata.n_vars:,} (max: {AnnDataValidator.MAX_VARS:,})"
            )

        # Touch var_names so that any failure while reading it is reported as
        # a validation error rather than propagating to the caller.
        try:
            _ = adata.var_names
        except Exception as e:
            errors.append(f"Cannot access variable names: {e}")

        return (not errors, errors)

    @staticmethod
    def validate_gene(adata: "AnnData", gene_name: str) -> Tuple[bool, str]:
        """Check whether a gene exists in the dataset.

        When the gene is missing, up to five gene names that contain
        ``gene_name`` as a case-insensitive substring are suggested.

        Args:
            adata: AnnData object.
            gene_name: Gene name to check (exact, case-sensitive match).

        Returns:
            Tuple of ``(exists, message)``.
        """
        if gene_name in adata.var_names:
            return (True, f"Gene '{gene_name}' found.")

        # Lower-case the query once and stop scanning as soon as five
        # suggestions have been collected.
        needle = gene_name.lower()
        similar: List[str] = []
        for candidate in adata.var_names:
            if needle in candidate.lower():
                similar.append(candidate)
                if len(similar) == 5:
                    break

        if similar:
            return (
                False,
                f"Gene '{gene_name}' not found. Similar genes: {', '.join(similar)}",
            )
        return (False, f"Gene '{gene_name}' not found in dataset.")

    @staticmethod
    def get_gene_expression(adata: "AnnData", gene_name: str) -> np.ndarray:
        """Extract the expression vector of a single gene.

        Args:
            adata: AnnData object.
            gene_name: Gene name to extract.

        Returns:
            Expression values as a one-dimensional numpy array.

        Raises:
            ValueError: If the gene is not found, or if the object carries no
                expression matrix (``X`` is None).
        """
        is_valid, message = AnnDataValidator.validate_gene(adata, gene_name)
        if not is_valid:
            raise ValueError(message)

        # NOTE(review): the original comment states this slicing also works in
        # backed mode - not verifiable from this file alone.
        gene_data = adata[:, gene_name].X
        if gene_data is None:
            raise ValueError(
                f"No expression matrix available for gene '{gene_name}' (X is None)."
            )

        # Sparse containers expose toarray(); densify them first.
        if hasattr(gene_data, "toarray"):
            gene_data = gene_data.toarray()

        # Coerce to a plain ndarray: np.matrix.flatten() would stay 2-D, and
        # other array-likes may not offer ndim/flatten at all.
        gene_data = np.asarray(gene_data)
        if gene_data.ndim > 1:
            gene_data = gene_data.flatten()
        return gene_data

    @staticmethod
    def get_gene_list(adata: "AnnData", limit: int = 1000) -> List[str]:
        """Return the first ``limit`` gene names (limited for performance).

        Args:
            adata: AnnData object.
            limit: Maximum number of genes to return. ``None`` returns every
                gene; negative values are treated as 0.

        Returns:
            List of gene names.
        """
        if limit is not None:
            # A negative slice bound would mean "all but the last k", which
            # contradicts the meaning of a maximum count.
            limit = max(limit, 0)
        # Slice before converting so that only the requested names are copied.
        return list(adata.var_names[:limit])