# Spaces: Tumo505 / heartmap-cell-analysis (Sleeping)
# File size: 9,151 Bytes
# b101933

"""
Ligand-Receptor Database Loader for HeartMAP
Uses LIANA's curated resources (consensus, CellPhoneDB, Omnipath, etc.)
"""

import pandas as pd
from typing import List, Tuple, Optional

# LIANA is an optional dependency. LIANA_AVAILABLE records whether the import
# succeeded; a status line is printed either way when this module is imported.
try:
    import liana
    LIANA_AVAILABLE = True
    print(f"✓ LIANA v{liana.__version__} loaded successfully")
except ImportError:
    # Only a failed import is handled here; any other exception raised while
    # importing LIANA propagates to the caller.
    LIANA_AVAILABLE = False
    print("⚠ LIANA not available - will use fallback database")

class LigandReceptorDatabase:
    """
    Manage ligand-receptor (L-R) interaction databases

    Pairs are held in ``self.lr_pairs``, a DataFrame with the columns
    ``ligand``, ``receptor`` and ``confidence``. They are read from a LIANA
    resource when LIANA could be imported, and from a built-in table of
    cardiac-relevant pairs otherwise (or whenever the LIANA load fails).
    """

    # Confidence assigned to pairs whose source provides no usable score.
    _DEFAULT_CONFIDENCE = 1.0

    def __init__(self, resource: str = 'consensus'):
        """
        Initialize L-R database

        The database is loaded immediately, so ``lr_pairs`` is populated once
        construction returns.

        Parameters:
        -----------
        resource : str
            Database to use:
            - 'consensus': curated from multiple sources (recommended)
            - 'cellphonedb': CellPhoneDB database
            - 'omnipath': OmniPath database
            - 'connectome': Ramilowski 2015
            - 'cellinker': CellLinker database
        """
        self.resource = resource
        self.lr_pairs: Optional[pd.DataFrame] = None
        self.load_database()

    def load_database(self) -> None:
        """Load L-R database from LIANA or fallback"""
        if LIANA_AVAILABLE:
            self._load_from_liana()
        else:
            self._load_fallback_database()

    @staticmethod
    def _standardize_pairs(lr_df: pd.DataFrame) -> Optional[pd.DataFrame]:
        """
        Reduce a raw resource table to ligand / receptor / confidence columns

        Duplicate (ligand, receptor) rows are collapsed, keeping the first
        occurrence. Confidence is taken from a 'score' column, else from a
        'confidence' column, else set to the default of 1.0. Values that are
        missing or cannot be parsed as numbers are also replaced by the
        default, so that threshold filtering never raises on mixed data and a
        threshold of 0.0 keeps every pair.

        Returns:
        --------
        DataFrame, or None when 'ligand' or 'receptor' is not a column.
        """
        if 'ligand' not in lr_df.columns or 'receptor' not in lr_df.columns:
            return None

        default = LigandReceptorDatabase._DEFAULT_CONFIDENCE
        deduped = lr_df.drop_duplicates(subset=['ligand', 'receptor'])
        pairs = deduped[['ligand', 'receptor']].copy()

        score_col = next(
            (col for col in ('score', 'confidence') if col in lr_df.columns),
            None,
        )
        if score_col is None:
            pairs['confidence'] = default
        else:
            scores = pd.to_numeric(deduped[score_col], errors='coerce')
            # Assign positionally: both frames come from the same
            # de-duplicated rows, and this avoids index-alignment failures
            # when the source index is not unique.
            pairs['confidence'] = scores.fillna(default).to_numpy()

        return pairs

    def _load_from_liana(self) -> None:
        """
        Load curated L-R pairs from LIANA

        Falls back to the built-in table when the resource cannot be read or
        does not expose 'ligand' and 'receptor' columns.
        """
        try:
            from liana.resource import select_resource

            lr_df = select_resource(self.resource)
            pairs = self._standardize_pairs(lr_df)
        except Exception as e:
            # Deliberate best-effort: any LIANA failure (unknown resource
            # name, API change, unexpected data) degrades to the fallback
            # table instead of aborting construction.
            print(f" Error loading LIANA database: {e}")
            self._load_fallback_database()
            return

        if pairs is None:
            print(" Unexpected LIANA format, using fallback")
            self._load_fallback_database()
            return

        self.lr_pairs = pairs
        print(f"✓ Loaded {len(self.lr_pairs)} L-R pairs from LIANA {self.resource}")

    def _load_fallback_database(self) -> None:
        """Fallback: built-in table of cardiac-focused L-R pairs"""

        # Each entry is (ligand, receptor, confidence).
        # NOTE(review): 'CXCR7' appears under an older gene symbol (current
        # nomenclature is believed to be ACKR3) - confirm which symbol the
        # target datasets use before relying on that pair.
        cardiac_lr_data = [
            # Angiogenesis & Vascular
            ('VEGFA', 'FLT1', 0.95), ('VEGFA', 'KDR', 0.98), ('VEGFA', 'NRP1', 0.85),
            ('VEGFB', 'FLT1', 0.90), ('VEGFC', 'FLT4', 0.95),
            ('ANGPT1', 'TEK', 0.92), ('ANGPT2', 'TEK', 0.88),
            ('PGF', 'FLT1', 0.85),

            # TGF-beta superfamily
            ('TGFB1', 'TGFBR1', 0.98), ('TGFB1', 'TGFBR2', 0.98),
            ('TGFB2', 'TGFBR1', 0.95), ('TGFB3', 'TGFBR2', 0.95),
            ('BMP2', 'BMPR1A', 0.90), ('BMP2', 'BMPR2', 0.88),
            ('BMP4', 'BMPR1A', 0.92), ('BMP7', 'BMPR2', 0.85),
            ('INHBA', 'ACVR1B', 0.80),

            # FGF signaling
            ('FGF1', 'FGFR1', 0.90), ('FGF2', 'FGFR1', 0.95),
            ('FGF2', 'FGFR2', 0.92), ('FGF7', 'FGFR2', 0.88),
            ('FGF9', 'FGFR3', 0.85), ('FGF10', 'FGFR2', 0.87),

            # PDGF signaling
            ('PDGFA', 'PDGFRA', 0.98), ('PDGFB', 'PDGFRB', 0.98),
            ('PDGFC', 'PDGFRA', 0.90), ('PDGFD', 'PDGFRB', 0.88),

            # Inflammatory cytokines
            ('IL6', 'IL6R', 0.98), ('IL1B', 'IL1R1', 0.95),
            ('TNF', 'TNFRSF1A', 0.98), ('TNF', 'TNFRSF1B', 0.95),
            ('IFNG', 'IFNGR1', 0.92), ('IL10', 'IL10RA', 0.90),
            ('IL4', 'IL4R', 0.88), ('IL13', 'IL13RA1', 0.85),

            # Chemokines
            ('CXCL12', 'CXCR4', 0.98), ('CXCL12', 'CXCR7', 0.85),
            ('CCL2', 'CCR2', 0.95), ('CCL5', 'CCR5', 0.92),
            ('CXCL8', 'CXCR1', 0.90), ('CXCL8', 'CXCR2', 0.88),
            ('CCL3', 'CCR1', 0.85), ('CCL4', 'CCR5', 0.87),

            # Growth factors
            ('EGF', 'EGFR', 0.98), ('HBEGF', 'EGFR', 0.92),
            ('IGF1', 'IGF1R', 0.98), ('IGF2', 'IGF1R', 0.95),
            ('HGF', 'MET', 0.95), ('NGF', 'NTRK1', 0.92),

            # Notch signaling
            ('DLL1', 'NOTCH1', 0.90), ('DLL4', 'NOTCH1', 0.92),
            ('JAG1', 'NOTCH1', 0.88), ('JAG1', 'NOTCH2', 0.85),
            ('JAG2', 'NOTCH3', 0.82),

            # Wnt signaling
            ('WNT3A', 'FZD1', 0.85), ('WNT3A', 'FZD2', 0.83),
            ('WNT5A', 'FZD5', 0.88), ('WNT7A', 'FZD7', 0.85),

            # Extracellular matrix
            ('COL1A1', 'ITGA1', 0.90), ('COL1A1', 'ITGA2', 0.88),
            ('FN1', 'ITGA5', 0.95), ('FN1', 'ITGB1', 0.92),
            ('LAMB1', 'ITGA6', 0.88), ('THBS1', 'CD47', 0.85),

            # Cardiac specific
            ('NRG1', 'ERBB2', 0.95), ('NRG1', 'ERBB4', 0.92),
            ('EDN1', 'EDNRA', 0.95), ('EDN1', 'EDNRB', 0.90),
            ('NPPA', 'NPR1', 0.92), ('NPPB', 'NPR1', 0.90),

            # Semaphorins
            ('SEMA3A', 'NRP1', 0.88), ('SEMA3C', 'NRP2', 0.85),
            ('SEMA4D', 'PLXNB1', 0.82),

            # Ephrins
            ('EFNA1', 'EPHA2', 0.90), ('EFNB2', 'EPHB4', 0.92),

            # Complement
            ('C3', 'C3AR1', 0.88), ('C5', 'C5AR1', 0.90),

            # Adhesion
            ('ICAM1', 'ITGAL', 0.92), ('VCAM1', 'ITGA4', 0.90),
            ('CD34', 'SELP', 0.85), ('PECAM1', 'PECAM1', 0.88),

            # Apoptosis
            ('FASLG', 'FAS', 0.95), ('TNFSF10', 'TNFRSF10A', 0.90),

            # Neuropeptides
            ('BDNF', 'NTRK2', 0.88), ('NTF3', 'NTRK3', 0.85),

            # Metabolic
            ('LEP', 'LEPR', 0.92), ('ADIPOQ', 'ADIPOR1', 0.88),
            ('INS', 'INSR', 0.98), ('GCG', 'GCGR', 0.90),
        ]

        self.lr_pairs = pd.DataFrame(
            cardiac_lr_data,
            columns=['ligand', 'receptor', 'confidence']
        )

        print(f"✓ Loaded {len(self.lr_pairs)} L-R pairs from fallback cardiac database")

    def get_pairs(self, confidence_threshold: float = 0.0, present_in_data: Optional[List[str]] = None) -> List[Tuple[str, str]]:
        """
        Get L-R pairs as list of tuples

        Parameters:
        -----------
        confidence_threshold : float
            Minimum confidence score (0-1); the comparison is inclusive
        present_in_data : list of str, optional
            Gene names present in dataset (filters to only available pairs).
            A pair is kept only when both its ligand and its receptor match a
            name exactly.

        Returns:
        --------
        list of tuples: [(ligand, receptor), ...]
        """
        # Share the threshold filter with get_dataframe so both stay in sync.
        filtered = self.get_dataframe(confidence_threshold)

        if present_in_data is not None:
            # NOTE(review): matching is by exact string, so an entry that
            # packs several subunits into one name would never match single
            # gene symbols - confirm against the chosen resource.
            present_set = set(present_in_data)
            both_present = (
                filtered['ligand'].isin(present_set)
                & filtered['receptor'].isin(present_set)
            )
            filtered = filtered[both_present]

        return list(zip(filtered['ligand'], filtered['receptor']))

    def get_dataframe(self, confidence_threshold: float = 0.0) -> pd.DataFrame:
        """Get a copy of the L-R pairs with confidence >= confidence_threshold"""
        return self.lr_pairs[self.lr_pairs['confidence'] >= confidence_threshold].copy()

    def save_to_csv(self, filepath: str) -> None:
        """Save database to CSV (all pairs, without the index column)"""
        self.lr_pairs.to_csv(filepath, index=False)
        print(f"✓ Saved L-R database to {filepath}")


# Convenience function
def get_ligand_receptor_pairs(adata, resource: str = 'consensus', confidence_threshold: float = 0.7) -> List[Tuple[str, str]]:
    """
    Build an L-R database and return the pairs usable with ``adata``

    A pair is returned only when both its ligand and its receptor appear in
    ``adata.var_names`` and its confidence is at least
    ``confidence_threshold``. A one-line summary is printed.

    Parameters:
    -----------
    adata : AnnData
        Annotated data object; only ``var_names`` is read
    resource : str
        Database to use ('consensus', 'cellphonedb', etc.)
    confidence_threshold : float
        Minimum confidence (0-1)

    Returns:
    --------
    list of tuples: [(ligand, receptor), ...]
    """
    database = LigandReceptorDatabase(resource=resource)
    matched = database.get_pairs(
        confidence_threshold=confidence_threshold,
        present_in_data=adata.var_names.tolist(),
    )

    # Report how many of the loaded pairs survived both filters.
    total = len(database.lr_pairs)
    print(f" Found {len(matched)} L-R pairs present in dataset (from {total} total)")
    return matched


if __name__ == "__main__":
    # Manual smoke test: load the database, show a sample, filter it, and
    # export it to a CSV file in the current working directory.
    print("Testing L-R Database Loader...")
    print("=" * 60)

    # Uses LIANA when it imported successfully, otherwise the fallback table.
    db = LigandReceptorDatabase(resource='consensus')
    print(f"\nTotal pairs: {len(db.lr_pairs)}")
    print("\nSample pairs:")
    print(db.lr_pairs.head(10))

    # get_pairs keeps pairs with confidence >= threshold, so the label says
    # ">=" rather than ">".
    high_conf_pairs = db.get_pairs(confidence_threshold=0.9)
    print(f"\nHigh confidence pairs (>=0.9): {len(high_conf_pairs)}")

    # Save example
    db.save_to_csv("lr_database_export.csv")