File size: 3,951 Bytes
05fdb87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from typing import Tuple, List
import numpy as np
from anndata import AnnData


class AnnDataValidator:
    """Validate AnnData objects for spatial visualization requirements.

    The class is a namespace of static helpers; it holds no instance state.
    ``MAX_OBS`` and ``MAX_VARS`` are the size limits enforced by ``validate``.
    """

    MAX_OBS = 500_000  # Max number of observations (cells/spots)
    MAX_VARS = 50_000  # Max number of variables (genes)
    _MAX_SUGGESTIONS = 5  # Max number of similar gene names put in a message

    @staticmethod
    def validate(adata: AnnData) -> Tuple[bool, List[str]]:
        """
        Validate AnnData object for spatial visualization

        Checks, in order: presence and shape of ``adata.obsm['spatial']``,
        the observation and variable counts against ``MAX_OBS`` / ``MAX_VARS``,
        and that ``adata.var_names`` can be read. Every failed check adds one
        message; the checks do not short-circuit.

        Args:
            adata: AnnData object to validate

        Returns:
            Tuple of (is_valid, error_messages); ``is_valid`` is True exactly
            when ``error_messages`` is empty.
        """
        errors: List[str] = []

        if "spatial" not in adata.obsm:
            errors.append(
                "Missing spatial coordinates. adata.obsm['spatial'] is required."
            )
        else:
            # Check the rank before the column count so that a 1-D array is
            # reported as a validation error instead of raising IndexError.
            shape = tuple(np.shape(adata.obsm["spatial"]))
            if len(shape) != 2 or shape[1] != 2:
                errors.append(
                    f"Spatial coordinates must be 2D (x, y). Got shape: {shape}"
                )

        # Check number of observations
        if adata.n_obs > AnnDataValidator.MAX_OBS:
            errors.append(
                f"Too many observations: {adata.n_obs:,} (max: {AnnDataValidator.MAX_OBS:,})"
            )

        # Check number of variables
        if adata.n_vars > AnnDataValidator.MAX_VARS:
            errors.append(
                f"Too many variables: {adata.n_vars:,} (max: {AnnDataValidator.MAX_VARS:,})"
            )

        # Check if data is accessible. The broad except is deliberate: any
        # failure here is turned into a validation message rather than raised.
        try:
            _ = adata.var_names
        except Exception as e:
            errors.append(f"Cannot access variable names: {str(e)}")

        return (not errors, errors)

    @staticmethod
    def validate_gene(adata: AnnData, gene_name: str) -> Tuple[bool, str]:
        """
        Validate if a gene exists in the dataset

        The lookup itself is an exact match against ``adata.var_names``. When
        it fails, up to five names that contain ``gene_name`` as a
        case-insensitive substring are listed as suggestions. An empty
        ``gene_name`` produces no suggestions, since it would otherwise match
        every name.

        Args:
            adata: AnnData object
            gene_name: Gene name to check

        Returns:
            Tuple of (exists, message)
        """
        if gene_name in adata.var_names:
            return (True, f"Gene '{gene_name}' found.")

        needle = gene_name.lower()
        similar: List[str] = []
        if needle:
            # Stop scanning as soon as enough suggestions have been collected.
            for candidate in adata.var_names:
                candidate = str(candidate)
                if needle in candidate.lower():
                    similar.append(candidate)
                    if len(similar) >= AnnDataValidator._MAX_SUGGESTIONS:
                        break

        if similar:
            return (
                False,
                f"Gene '{gene_name}' not found. Similar genes: {', '.join(similar)}",
            )
        return (False, f"Gene '{gene_name}' not found in dataset.")

    @staticmethod
    def get_gene_expression(adata: AnnData, gene_name: str) -> np.ndarray:
        """
        Extract gene expression for a specific gene

        Args:
            adata: AnnData object
            gene_name: Gene name to extract

        Returns:
            Expression vector as a 1-D numpy array (always a fresh copy)

        Raises:
            ValueError: If gene not found, or if the object has no expression
                matrix (``X`` is None)
        """
        is_valid, message = AnnDataValidator.validate_gene(adata, gene_name)
        if not is_valid:
            raise ValueError(message)

        # Subset to the requested gene before touching X (the original author
        # notes that this access pattern also works in backed mode).
        gene_data = adata[:, gene_name].X
        if gene_data is None:
            raise ValueError(
                f"No expression matrix available for gene '{gene_name}'."
            )

        # Convert to dense array if sparse
        if hasattr(gene_data, "toarray"):
            gene_data = gene_data.toarray()

        # Going through np.asarray first guarantees a plain ndarray, so that
        # flatten() really yields 1-D output (np.matrix.flatten() stays 2-D).
        return np.asarray(gene_data).flatten()

    @staticmethod
    def get_gene_list(adata: AnnData, limit: int = 1000) -> List[str]:
        """
        Get list of available genes (limited for performance)

        Args:
            adata: AnnData object
            limit: Maximum number of genes to return. ``None`` returns every
                gene; a negative value is treated as 0.

        Returns:
            List of gene names, in ``adata.var_names`` order
        """
        if limit is not None and limit < 0:
            # A negative slice bound would mean "all but the last N", which is
            # not what a maximum count should do.
            limit = 0
        # Slice first so only the requested names are copied into the list.
        return list(adata.var_names[:limit])