Spaces:
Sleeping
Sleeping
File size: 3,951 Bytes
05fdb87 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
from typing import Tuple, List
import numpy as np
from anndata import AnnData
class AnnDataValidator:
    """Validate AnnData objects for spatial visualization requirements.

    The class is a namespace of static helpers plus the size limits they
    enforce; it is never instantiated by the code in this block.
    """

    MAX_OBS = 500_000  # Max number of observations (cells/spots)
    MAX_VARS = 50_000  # Max number of variables (genes)
    _MAX_SUGGESTIONS = 5  # Cap on "similar gene" hints in validate_gene

    @staticmethod
    def validate(adata: AnnData) -> Tuple[bool, List[str]]:
        """
        Validate AnnData object for spatial visualization

        Every check is run and every failure is collected, so the caller
        gets the full list of problems in a single pass.

        Args:
            adata: AnnData object to validate

        Returns:
            Tuple of (is_valid, error_messages); is_valid is True exactly
            when error_messages is empty.
        """
        errors: List[str] = []

        # Spatial coordinates must exist and be an (n, 2) array.
        if "spatial" not in adata.obsm:
            errors.append(
                "Missing spatial coordinates. adata.obsm['spatial'] is required."
            )
        else:
            # Read the shape defensively: a 1-D array (or an object without
            # .shape) must produce a validation error, not an IndexError.
            shape = tuple(getattr(adata.obsm["spatial"], "shape", ()))
            if len(shape) != 2 or shape[1] != 2:
                errors.append(
                    f"Spatial coordinates must be 2D (x, y). Got shape: {shape}"
                )

        # Check number of observations
        if adata.n_obs > AnnDataValidator.MAX_OBS:
            errors.append(
                f"Too many observations: {adata.n_obs:,} (max: {AnnDataValidator.MAX_OBS:,})"
            )

        # Check number of variables
        if adata.n_vars > AnnDataValidator.MAX_VARS:
            errors.append(
                f"Too many variables: {adata.n_vars:,} (max: {AnnDataValidator.MAX_VARS:,})"
            )

        # Check if data is accessible. The broad except is deliberate: any
        # failure while touching var_names is reported as a validation error
        # instead of propagating out of the validator.
        try:
            _ = adata.var_names
        except Exception as e:
            errors.append(f"Cannot access variable names: {e}")

        return (not errors, errors)

    @staticmethod
    def validate_gene(adata: AnnData, gene_name: str) -> Tuple[bool, str]:
        """
        Validate if a gene exists in the dataset

        When the exact name is missing, up to five names that contain the
        query (case-insensitive, surrounding whitespace ignored) are
        suggested in the message.

        Args:
            adata: AnnData object
            gene_name: Gene name to check

        Returns:
            Tuple of (exists, message)
        """
        if gene_name in adata.var_names:
            return (True, f"Gene '{gene_name}' found.")

        # Lower-case the query once rather than once per candidate.
        needle = gene_name.strip().lower()
        similar: List[str] = []
        # An empty query is a substring of every name, so it would "match"
        # the first few genes; skip the search in that case.
        if needle:
            for candidate in adata.var_names:
                if needle in candidate.lower():
                    similar.append(candidate)
                    # Stop scanning as soon as enough hints are collected.
                    if len(similar) >= AnnDataValidator._MAX_SUGGESTIONS:
                        break

        if similar:
            return (
                False,
                f"Gene '{gene_name}' not found. Similar genes: {', '.join(similar)}",
            )
        return (False, f"Gene '{gene_name}' not found in dataset.")

    @staticmethod
    def get_gene_expression(adata: AnnData, gene_name: str) -> np.ndarray:
        """
        Extract gene expression for a specific gene

        Args:
            adata: AnnData object
            gene_name: Gene name to extract

        Returns:
            Expression vector as a 1-D numpy array (always a fresh copy)

        Raises:
            ValueError: If gene not found, or if the dataset has no
                expression matrix
        """
        is_valid, message = AnnDataValidator.validate_gene(adata, gene_name)
        if not is_valid:
            raise ValueError(message)

        # Column selection by gene name. NOTE(review): the original author's
        # comment says this access pattern also works in backed mode; that
        # is not verifiable from this file.
        gene_data = adata[:, gene_name].X
        if gene_data is None:
            raise ValueError(
                f"Dataset has no expression matrix (X); cannot extract gene '{gene_name}'."
            )

        # Convert to dense array if sparse
        if hasattr(gene_data, "toarray"):
            gene_data = gene_data.toarray()

        # np.asarray strips ndarray subclasses (e.g. np.matrix, whose own
        # flatten() stays 2-D), and flatten() always returns a 1-D copy, so
        # callers never receive a view into the dataset.
        return np.asarray(gene_data).flatten()

    @staticmethod
    def get_gene_list(adata: AnnData, limit: int = 1000) -> List[str]:
        """
        Get list of available genes (limited for performance)

        Args:
            adata: AnnData object
            limit: Maximum number of genes to return; zero or a negative
                value yields an empty list

        Returns:
            List of gene names
        """
        # None was implicitly accepted by the original slice as "no limit";
        # keep that working for existing callers.
        if limit is None:
            return list(adata.var_names)
        # Slice before materializing so only `limit` names are copied, and
        # clamp negatives so they cannot mean "all but the last k".
        return list(adata.var_names[: max(limit, 0)])
|