File size: 10,649 Bytes
155c1a6
 
 
 
 
 
 
5774c2d
155c1a6
 
 
 
 
 
 
 
 
 
 
 
5774c2d
155c1a6
 
 
 
 
 
 
 
5774c2d
155c1a6
b101933
155c1a6
 
5774c2d
155c1a6
 
 
 
5774c2d
 
 
 
 
 
 
 
 
155c1a6
 
 
 
 
 
 
5774c2d
155c1a6
 
 
 
 
5774c2d
155c1a6
 
5774c2d
 
 
 
155c1a6
 
5774c2d
155c1a6
 
5774c2d
155c1a6
 
 
 
 
 
 
 
 
5774c2d
 
 
155c1a6
 
5774c2d
155c1a6
5774c2d
155c1a6
 
 
5774c2d
155c1a6
 
5774c2d
155c1a6
 
5774c2d
155c1a6
 
 
5774c2d
155c1a6
5774c2d
155c1a6
 
 
5774c2d
155c1a6
 
5774c2d
155c1a6
 
5774c2d
155c1a6
 
5774c2d
155c1a6
 
5774c2d
 
 
155c1a6
 
 
5774c2d
 
 
 
155c1a6
 
5774c2d
155c1a6
5774c2d
155c1a6
 
 
 
 
5774c2d
 
155c1a6
 
 
5774c2d
155c1a6
 
 
 
 
 
 
 
5774c2d
155c1a6
 
5774c2d
155c1a6
5774c2d
155c1a6
 
 
5774c2d
155c1a6
 
 
 
 
5774c2d
155c1a6
 
5774c2d
155c1a6
 
 
b101933
 
 
 
 
 
5774c2d
155c1a6
 
 
 
 
 
 
 
 
 
 
5774c2d
155c1a6
 
 
5774c2d
98661c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b101933
155c1a6
5774c2d
5a3a131
 
 
 
155c1a6
 
5774c2d
155c1a6
 
5774c2d
155c1a6
 
5774c2d
155c1a6
 
5774c2d
155c1a6
 
5774c2d
 
 
 
 
 
155c1a6
 
5774c2d
155c1a6
 
5774c2d
155c1a6
5774c2d
 
155c1a6
5774c2d
155c1a6
 
5774c2d
155c1a6
 
5774c2d
155c1a6
 
5774c2d
155c1a6
 
5774c2d
155c1a6
 
5774c2d
98661c2
 
 
5774c2d
 
 
 
 
 
 
 
 
 
 
 
155c1a6
5774c2d
155c1a6
 
 
5774c2d
155c1a6
 
5774c2d
155c1a6
5774c2d
 
b101933
 
 
 
 
 
 
 
5774c2d
 
 
 
b101933
 
 
 
5774c2d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
"""
Data processing utilities for HeartMAP
"""

import os
import hashlib
from pathlib import Path
from typing import Tuple, List, Union
import warnings

import scanpy as sc
import numpy as np
import anndata as ad
from scipy.sparse import issparse

from ..config import Config


class DataValidator:
    """Validate data integrity and format.

    All checks are static methods; the class holds no state.
    """

    @staticmethod
    def verify_checksum(file_path: str, expected_checksum: str) -> bool:
        """Check a file's SHA-256 digest against an expected hex string.

        Args:
            file_path: Path of the file to hash.
            expected_checksum: Expected SHA-256 digest in hexadecimal.
                Surrounding whitespace and letter case are ignored.

        Returns:
            True when the computed digest equals the expected one.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        sha256_hash = hashlib.sha256()
        with open(file_path, "rb") as f:
            # Stream the file in chunks so large files are never fully in memory.
            for byte_block in iter(lambda: f.read(65536), b""):
                sha256_hash.update(byte_block)
        # hexdigest() is always lowercase; normalise the caller's value so an
        # uppercase or padded digest does not fail spuriously.
        return sha256_hash.hexdigest() == expected_checksum.strip().lower()

    @staticmethod
    def validate_anndata(adata: ad.AnnData, check_qc_metrics: bool = True) -> Tuple[bool, List[str]]:
        """Validate AnnData object structure.

        Args:
            adata: Object to inspect. Reads ``n_obs``, ``n_vars``, ``X`` and,
                when ``check_qc_metrics`` is set, ``obs.columns``.
            check_qc_metrics: When True, also require the QC columns
                ``n_genes_by_counts`` and ``total_counts`` in ``adata.obs``.

        Returns:
            ``(is_valid, issues)`` where ``issues`` lists every problem found
            and ``is_valid`` is True only when that list is empty.
        """
        issues = []

        if adata.n_obs == 0:
            issues.append("No cells in dataset")
        if adata.n_vars == 0:
            issues.append("No genes in dataset")

        # Check for QC metrics only if requested (after they should be calculated)
        if check_qc_metrics:
            # scanpy creates these standard QC metric columns
            required_obs = ['n_genes_by_counts', 'total_counts']
            issues.extend(
                f"Missing required obs column: {col}"
                for col in required_obs
                if col not in adata.obs.columns
            )

        # Check for NaN/inf values
        matrix = adata.X
        if matrix is None:
            # np.isfinite(None) raises TypeError; report the problem instead.
            issues.append("Missing X matrix")
        else:
            # For sparse input only the explicitly stored entries are checked.
            values = matrix.data if issparse(matrix) else matrix
            if not np.isfinite(values).all():
                issues.append("Non-finite values in X matrix")

        return len(issues) == 0, issues


class DataLoader:
    """Load and preprocess data.

    Every preprocessing method returns a new object (or the input unchanged);
    the AnnData passed in is never modified in place.
    """

    def __init__(self, config: Config):
        """Store the configuration whose ``data`` section drives each step."""
        self.config = config

    @staticmethod
    def _replace_non_finite(adata: ad.AnnData) -> None:
        """Replace NaN/+Inf/-Inf entries of ``adata.X`` with 0, in place."""
        if issparse(adata.X):
            # Only the stored entries of a sparse matrix can be non-finite.
            adata.X.data = np.nan_to_num(adata.X.data, nan=0, posinf=0, neginf=0)
        else:
            adata.X = np.nan_to_num(adata.X, nan=0, posinf=0, neginf=0)

    def load_raw_data(
        self, file_path: Union[str, Path], verify_integrity: bool = True
    ) -> ad.AnnData:
        """Load raw single-cell data.

        Supported formats, chosen by file suffix (case-insensitive):
        ``.h5ad``, ``.h5`` (read via ``sc.read_10x_h5``) and ``.csv``
        (transposed after reading).

        Args:
            file_path: Location of the data file.
            verify_integrity: Currently unused by this method; accepted for
                interface compatibility.

        Returns:
            The loaded AnnData object. Validation problems are reported as
            warnings, not errors.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file suffix is not one of the supported formats.
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"Data file not found: {file_path}")

        # Load data based on file format; compare lowercased so that e.g.
        # ".H5AD" is accepted too.
        suffix = file_path.suffix.lower()
        if suffix == '.h5ad':
            adata = sc.read_h5ad(file_path)
        elif suffix == '.h5':
            adata = sc.read_10x_h5(file_path, genome=None, gex_only=True)
        elif suffix == '.csv':
            adata = sc.read_csv(file_path).T  # Transpose to have genes as variables
        else:
            raise ValueError(f"Unsupported file format: {file_path.suffix}")

        # Validate data (skip QC metrics check for raw data)
        is_valid, issues = DataValidator.validate_anndata(adata, check_qc_metrics=False)
        if not is_valid:
            warnings.warn(f"Data validation issues: {'; '.join(issues)}")

        return adata

    def preprocess_basic(self, adata: ad.AnnData) -> ad.AnnData:
        """Basic preprocessing pipeline.

        Makes gene names unique, stores the unfiltered data in ``adata.raw``
        and filters cells/genes using ``config.data.min_genes`` and
        ``config.data.min_cells``.

        Returns:
            A filtered copy of ``adata``.
        """
        adata = adata.copy()

        # Make gene names unique
        adata.var_names_make_unique()

        # Store raw data (snapshot taken before any filtering)
        adata.raw = adata

        # Basic filtering
        sc.pp.filter_cells(adata, min_genes=self.config.data.min_genes)
        sc.pp.filter_genes(adata, min_cells=self.config.data.min_cells)

        return adata

    def calculate_qc_metrics(self, adata: ad.AnnData) -> ad.AnnData:
        """Calculate quality control metrics.

        Flags genes in ``adata.var`` as ``mt`` (names starting with ``MT-``),
        ``ribo`` (``RPS``/``RPL`` prefixes) and ``hb`` (regex below), then
        runs ``sc.pp.calculate_qc_metrics`` with those flags as ``qc_vars``.

        Returns:
            A copy of ``adata`` carrying the QC annotations.
        """
        adata = adata.copy()

        # Mitochondrial genes
        adata.var['mt'] = adata.var_names.str.startswith('MT-')

        # Ribosomal genes
        adata.var['ribo'] = adata.var_names.str.startswith(('RPS', 'RPL'))

        # Hemoglobin genes
        adata.var['hb'] = adata.var_names.str.contains('^HB[^(P)]')

        # One call is enough: with qc_vars set, scanpy computes the general
        # metrics (n_genes_by_counts, total_counts, ...) as well as the
        # per-category ones, so a separate call without qc_vars is redundant.
        sc.pp.calculate_qc_metrics(
            adata,
            qc_vars=['mt', 'ribo', 'hb'],
            percent_top=None,
            log1p=False,
            inplace=True
        )

        return adata

    def scale_for_memory(self, adata: ad.AnnData) -> ad.AnnData:
        """Scale dataset for memory constraints.

        When ``config.data.max_cells_subset`` is set and exceeded, a random
        subset of cells is drawn without replacement (seeded with
        ``config.data.random_seed``). When ``config.data.max_genes_subset`` is
        set and exceeded, the genes with the largest variance are kept.

        Returns:
            A subsetted copy, or ``adata`` itself when no limit applies.
        """
        data_cfg = self.config.data

        if data_cfg.max_cells_subset and adata.n_obs > data_cfg.max_cells_subset:
            # Seeds NumPy's global RNG so the draw is reproducible.
            np.random.seed(data_cfg.random_seed)
            cell_indices = np.random.choice(
                adata.n_obs,
                size=data_cfg.max_cells_subset,
                replace=False
            )
            adata = adata[cell_indices].copy()

        if data_cfg.max_genes_subset and adata.n_vars > data_cfg.max_genes_subset:
            # Select most variable genes
            if issparse(adata.X):
                # Densify at most the first 1000 rows to bound memory.
                # NOTE(review): variance is therefore estimated from those rows
                # only, unlike the dense branch which uses every cell.
                dense_subset = adata.X[:min(1000, adata.n_obs), :].toarray()
                gene_vars = np.var(dense_subset, axis=0)
            else:
                gene_vars = np.var(adata.X, axis=0)

            # argsort is ascending, so the tail holds the highest variances;
            # the kept genes end up ordered by increasing variance.
            top_gene_indices = np.argsort(gene_vars)[-data_cfg.max_genes_subset:]
            adata = adata[:, top_gene_indices].copy()

        return adata

    def normalize_and_scale(self, adata: ad.AnnData) -> ad.AnnData:
        """Normalize and log-transform data.

        Applies ``sc.pp.normalize_total`` with ``config.data.target_sum``
        followed by ``sc.pp.log1p``, zeroing non-finite values before and
        after. No separate feature-scaling call is made here.

        Returns:
            A normalized, log-transformed copy of ``adata``.
        """
        adata = adata.copy()

        # Clean data - remove infinite values
        self._replace_non_finite(adata)

        # Normalize to target sum
        sc.pp.normalize_total(adata, target_sum=self.config.data.target_sum)

        # Log transform
        sc.pp.log1p(adata)

        # Sanitize after log1p (can create NaNs/Inf from edge cases)
        self._replace_non_finite(adata)

        return adata

    def preprocess(self, adata: ad.AnnData) -> ad.AnnData:
        """Complete preprocessing pipeline (convenience method).

        Chains ``preprocess_basic``, ``scale_for_memory`` and
        ``normalize_and_scale``; QC metrics are not calculated here.
        """
        adata = self.preprocess_basic(adata)
        adata = self.scale_for_memory(adata)
        adata = self.normalize_and_scale(adata)
        return adata


class DataProcessor:
    """Main data processing class.

    Drives the end-to-end pipeline (load, preprocess, QC, normalize, PCA,
    neighbors) on top of a ``DataLoader`` built from the same config.
    """

    def __init__(self, config: Config):
        """Keep the config and create the loader used by the pipeline."""
        self.config = config
        self.loader = DataLoader(config)

    @staticmethod
    def _sanitize_before_pca(adata: ad.AnnData) -> ad.AnnData:
        """Ensure finite values and remove empty genes/cells before PCA.

        Returns:
            A cleaned copy of ``adata``; the input is not modified.
        """
        adata = adata.copy()
        # Replace NaN/Inf with zeros
        if issparse(adata.X):
            data = adata.X.data
            if data.size:
                adata.X.data = np.nan_to_num(data, nan=0, posinf=0, neginf=0)
        else:
            adata.X = np.nan_to_num(adata.X, nan=0, posinf=0, neginf=0)

        # Drop empty genes/cells to avoid zero-variance issues. min_counts=1
        # is passed straight to scanpy's filters, which apply it to the values
        # currently held in X.
        try:
            sc.pp.filter_genes(adata, min_counts=1)
            sc.pp.filter_cells(adata, min_counts=1)
        except Exception as exc:
            # Best effort: keep going with the unfiltered data, but say so
            # instead of hiding the failure.
            warnings.warn(f"Could not drop empty genes/cells before PCA: {exc}")
        return adata

    def _write_checkpoint(self, adata: ad.AnnData, filename: str) -> None:
        """Write ``adata`` to ``filename`` inside the processed-data directory."""
        adata.write(os.path.join(self.config.paths.processed_data_dir, filename))

    def process_from_raw(
        self, file_path: Union[str, Path], save_intermediate: bool = True
    ) -> ad.AnnData:
        """Complete processing pipeline from raw data.

        Steps: load, basic preprocessing, QC metrics, optional subsetting,
        normalization, sanitization, PCA and neighborhood graph.

        Args:
            file_path: Raw data file handed to ``DataLoader.load_raw_data``.
            save_intermediate: When True, write an ``.h5ad`` checkpoint after
                each stage into ``config.paths.processed_data_dir`` (created
                if missing).

        Returns:
            The processed AnnData with PCA and neighbors computed.
        """

        # Ensure processed data directory exists
        if save_intermediate:
            os.makedirs(self.config.paths.processed_data_dir, exist_ok=True)

        # Load raw data
        adata = self.loader.load_raw_data(file_path)

        # Basic preprocessing
        adata = self.loader.preprocess_basic(adata)
        if save_intermediate:
            self._write_checkpoint(adata, "preprocessed.h5ad")

        # Calculate QC metrics
        adata = self.loader.calculate_qc_metrics(adata)

        # Validate data with QC metrics
        is_valid, issues = DataValidator.validate_anndata(adata, check_qc_metrics=True)
        if not is_valid:
            warnings.warn(f"Data validation issues after QC calculation: {'; '.join(issues)}")

        if save_intermediate:
            self._write_checkpoint(adata, "qc_calculated.h5ad")

        # Scale for memory if needed
        if (self.config.data.max_cells_subset or
                self.config.data.max_genes_subset):
            adata = self.loader.scale_for_memory(adata)
            if save_intermediate:
                self._write_checkpoint(adata, "scaled.h5ad")

        # Normalize and scale
        adata = self.loader.normalize_and_scale(adata)
        if save_intermediate:
            self._write_checkpoint(adata, "normalized.h5ad")

        # Final sanitization before PCA (handles web deployment NaNs)
        adata = self._sanitize_before_pca(adata)

        # Compute PCA for dimensionality reduction
        sc.tl.pca(adata, svd_solver='arpack')

        # Compute neighborhood graph (required for clustering). Small
        # datasets can yield fewer than 40 components, and asking neighbors()
        # for more PCs than exist fails, so clamp to what PCA produced.
        n_pcs = min(40, adata.obsm['X_pca'].shape[1])
        sc.pp.neighbors(adata, n_neighbors=15, n_pcs=n_pcs)

        if save_intermediate:
            self._write_checkpoint(adata, "processed_with_neighbors.h5ad")

        return adata

    def create_test_dataset(self, adata: ad.AnnData, n_cells: int = 1000) -> ad.AnnData:
        """Create small test dataset.

        Draws ``min(n_cells, adata.n_obs)`` cells at random without
        replacement, seeded with ``config.data.random_seed``.

        Returns:
            A copy of ``adata`` restricted to the sampled cells.
        """
        # Seeds NumPy's global RNG so repeated calls pick the same cells.
        np.random.seed(self.config.data.random_seed)

        n_cells = min(n_cells, adata.n_obs)
        cell_indices = np.random.choice(adata.n_obs, size=n_cells, replace=False)

        return adata[cell_indices].copy()


# Export data processing classes. The ligand-receptor names are appended
# below only when their import succeeds: listing names that were never bound
# would make ``from <this module> import *`` raise AttributeError.
__all__ = [
    'DataValidator',
    'DataLoader',
    'DataProcessor',
    'LR_DATABASE_AVAILABLE'
]

# Import ligand-receptor database module (optional dependency)
try:
    from .lr_database import get_ligand_receptor_pairs, LigandReceptorDatabase
except ImportError:
    LR_DATABASE_AVAILABLE = False
    warnings.warn("Ligand-receptor database module not available. Install liana for full functionality.")
else:
    LR_DATABASE_AVAILABLE = True
    __all__ += [
        'get_ligand_receptor_pairs',
        'LigandReceptorDatabase',
    ]