vedatonuryilmaz
/

MuLGIT

Model card Files Files and versions

xet

Community

vedatonuryilmaz committed 11 days ago

Commit

a34e3f1

verified ·

1 Parent(s): c7ad40e

Upload mulgit/drug_target.py with huggingface_hub

Browse files

Files changed (1) hide show

mulgit/drug_target.py +438 -0

mulgit/drug_target.py ADDED Viewed

	@@ -0,0 +1,438 @@

+"""
+Chemical Genomics & Drug Target Identification Module
+Integrates multi-omics data with chemical genomics and perturbation
+genomics to identify molecular targets and pharmaceutical agents
+associated with exceptional longevity.
+Methods:
+  1. Drug-Target Affinity Prediction (SSM-DTA inspired)
+     - Cross-attention between drug (SMILES) and protein target representations
+     - Semi-supervised training with masked language modeling
+  2. Perturbation Response Prediction
+     - Predict gene expression changes after drug treatment
+     - Based on LINCS L1000 patterns + deep learning
+  3. Drug Repurposing for Longevity
+     - Match drug-induced expression changes to anti-aging signatures
+     - Identify existing drugs that mimic longevity-associated patterns
+Datasets:
+  - BALM/BALM-benchmark: Drug-target binding affinity
+  - LINCS L1000 (via pytdc): Perturbation gene expression signatures
+  - GDSC/CTRP (via pytdc): Drug sensitivity in cell lines
+References:
+  - SSM-DTA (arxiv:2206.09818): Drug-target affinity with semi-supervised training
+  - PaccMann (arxiv:1909.05114): Drug design from transcriptomic data
+  - MAMMAL (arxiv:2410.22367): Multi-modal drug discovery foundation model
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional, Dict, List, Tuple
+# ─── Molecular Encoders ──────────────────────────────────────────────────────
class DrugEncoder(nn.Module):
    """
    Encodes tokenized drug SMILES strings into fixed-size molecular embeddings.

    A stack of 1D convolutions runs over character-level SMILES tokens and the
    per-position features are averaged into one vector per molecule.
    Positions holding the padding id (0) are zeroed after every convolution
    and excluded from the average, so the embedding of a molecule does not
    depend on how far its token sequence was padded.

    For production: replace with ChemBERTa, MolFormer, or similar
    pretrained molecular transformer.

    Args:
      vocab_size: number of rows in the token embedding table. Must be larger
        than the largest token id passed to ``forward`` (id 0 is padding).
      embed_dim: size of each token embedding.
      hidden_dim: number of channels produced by every convolution.
      output_dim: size of the returned molecular embedding.
      num_layers: number of stacked convolutions.
      kernel_size: width of each convolution kernel.
    """

    def __init__(
        self,
        vocab_size: int = 64,  # SMILES character vocabulary
        embed_dim: int = 128,
        hidden_dim: int = 256,
        output_dim: int = 128,
        num_layers: int = 3,
        kernel_size: int = 5,
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Conv1d(
                embed_dim if i == 0 else hidden_dim,
                hidden_dim,
                kernel_size,
                padding=kernel_size // 2,
            )
            for i in range(num_layers)
        ])
        self.output = nn.Linear(hidden_dim, output_dim)
        self.activation = nn.SELU()

    def forward(self, smiles_tokens: torch.Tensor) -> torch.Tensor:
        """
        Args:
          smiles_tokens: (B, L) tokenized SMILES strings, 0 = padding
        Returns:
          drug_embedding: (B, output_dim)
        """
        seq_len = smiles_tokens.size(-1)
        x = self.embedding(smiles_tokens).transpose(1, 2)  # (B, E, L)

        # 1.0 at real tokens, 0.0 at padding; broadcast over channels.
        mask = (smiles_tokens != self.embedding.padding_idx).unsqueeze(1).to(x.dtype)  # (B, 1, L)

        for conv in self.convs:
            # An even kernel_size yields L + 1 outputs with padding=k//2, so
            # trim back to L. Re-masking stops bias/activation values at
            # padded positions from leaking into real ones in the next layer.
            x = self.activation(conv(x))[..., :seq_len] * mask  # (B, H, L)

        # Masked global average pooling; clamp keeps all-padding rows finite.
        token_count = mask.sum(dim=-1).clamp(min=1.0)  # (B, 1)
        pooled = x.sum(dim=-1) / token_count           # (B, H)
        return self.output(pooled)
class ProteinTargetEncoder(nn.Module):
    """
    Encodes tokenized protein sequences (amino acid ids) into embeddings.

    A stack of 1D convolutions runs over the residue tokens and the
    per-position features are averaged into one vector per protein.
    Positions holding the padding id (0) are zeroed after every convolution
    and excluded from the average, so the embedding of a protein does not
    depend on how far its token sequence was padded.

    For production: replace with ESM-2 or ProtBERT pretrained embeddings.

    Args:
      vocab_size: number of rows in the token embedding table. Must be larger
        than the largest token id passed to ``forward`` (id 0 is padding).
      embed_dim: size of each token embedding.
      hidden_dim: number of channels produced by every convolution.
      output_dim: size of the returned protein embedding.
      num_layers: number of stacked convolutions.
      kernel_size: width of each convolution kernel.
    """

    def __init__(
        self,
        vocab_size: int = 26,  # amino acid alphabet
        embed_dim: int = 128,
        hidden_dim: int = 256,
        output_dim: int = 128,
        num_layers: int = 3,
        kernel_size: int = 7,
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Conv1d(
                embed_dim if i == 0 else hidden_dim,
                hidden_dim,
                kernel_size,
                padding=kernel_size // 2,
            )
            for i in range(num_layers)
        ])
        self.output = nn.Linear(hidden_dim, output_dim)
        self.activation = nn.SELU()

    def forward(self, aa_tokens: torch.Tensor) -> torch.Tensor:
        """
        Args:
          aa_tokens: (B, L) tokenized amino acid sequences, 0 = padding
        Returns:
          target_embedding: (B, output_dim)
        """
        seq_len = aa_tokens.size(-1)
        x = self.embedding(aa_tokens).transpose(1, 2)  # (B, E, L)

        # 1.0 at real residues, 0.0 at padding; broadcast over channels.
        mask = (aa_tokens != self.embedding.padding_idx).unsqueeze(1).to(x.dtype)  # (B, 1, L)

        for conv in self.convs:
            # An even kernel_size yields L + 1 outputs with padding=k//2, so
            # trim back to L. Re-masking stops bias/activation values at
            # padded positions from leaking into real ones in the next layer.
            x = self.activation(conv(x))[..., :seq_len] * mask  # (B, H, L)

        # Masked global average pooling; clamp keeps all-padding rows finite.
        residue_count = mask.sum(dim=-1).clamp(min=1.0)  # (B, 1)
        pooled = x.sum(dim=-1) / residue_count           # (B, H)
        return self.output(pooled)
+# ─── Drug-Target Affinity (DTA) Predictor ────────────────────────────────────
class DrugTargetAffinityPredictor(nn.Module):
    """
    Predicts binding affinity between drugs and protein targets.

    Uses cross-attention between drug and target representations,
    inspired by SSM-DTA architecture. Each embedding is treated as a
    one-token sequence: the drug queries the target and the target queries
    the drug, and the two attended vectors are concatenated and passed
    through an MLP regression head.

    Args:
      drug_dim: size of the drug embedding (query dim of the drug branch).
      target_dim: size of the target embedding (query dim of the target
        branch). May differ from ``drug_dim``.
      hidden_dim: width of the first fusion layer.
      dropout: dropout probability for attention and the fusion MLP.
      num_heads: attention heads per branch; must divide both ``drug_dim``
        and ``target_dim``.
    """

    def __init__(
        self,
        drug_dim: int = 128,
        target_dim: int = 128,
        hidden_dim: int = 256,
        dropout: float = 0.1,
        num_heads: int = 4,
    ):
        super().__init__()
        # Cross-attention: drug attends to target, target attends to drug.
        # kdim/vdim name the *other* modality's width so the two embedding
        # sizes are allowed to differ. When they are equal the layer keeps
        # its packed in_proj_weight layout.
        self.drug_cross_attn = nn.MultiheadAttention(
            embed_dim=drug_dim,
            num_heads=num_heads,
            kdim=target_dim,
            vdim=target_dim,
            batch_first=True,
            dropout=dropout,
        )
        self.target_cross_attn = nn.MultiheadAttention(
            embed_dim=target_dim,
            num_heads=num_heads,
            kdim=drug_dim,
            vdim=drug_dim,
            batch_first=True,
            dropout=dropout,
        )
        # Fusion + prediction
        fusion_dim = drug_dim + target_dim
        self.fusion = nn.Sequential(
            nn.Linear(fusion_dim, hidden_dim),
            nn.SELU(),
            nn.AlphaDropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.SELU(),
            nn.AlphaDropout(dropout),
            nn.Linear(hidden_dim // 2, 1),
        )

    def forward(
        self,
        drug_embed: torch.Tensor,
        target_embed: torch.Tensor,
    ) -> torch.Tensor:
        """
        Args:
          drug_embed: (B, D_d) drug molecular embeddings
          target_embed: (B, D_t) protein target embeddings
        Returns:
          affinity: (B,) predicted binding affinity (pKd)
        """
        # Cross-attention (treat as single-token sequences): (B, D) -> (B, 1, D)
        drug_seq = drug_embed.unsqueeze(1)
        target_seq = target_embed.unsqueeze(1)

        drug_attended, _ = self.drug_cross_attn(drug_seq, target_seq, target_seq)
        target_attended, _ = self.target_cross_attn(target_seq, drug_seq, drug_seq)

        # Concatenate and predict
        fused = torch.cat([drug_attended.squeeze(1), target_attended.squeeze(1)], dim=-1)
        return self.fusion(fused).squeeze(-1)
+# ─── Perturbation Response Predictor ─────────────────────────────────────────
class PerturbationResponsePredictor(nn.Module):
    """
    Predicts how gene expression shifts in response to a drug.

    Flow: the drug embedding is turned into a conditioning vector, the
    conditioning vector is concatenated with the raw drug embedding, and a
    decoder maps that to one delta per output gene. The default output width
    (978) matches the LINCS L1000 landmark gene count.
    Reference: PaccMann, DeepProfile
    """

    @staticmethod
    def _selu_block(in_features: int, out_features: int, dropout: float) -> List[nn.Module]:
        """Linear -> SELU -> AlphaDropout, returned as a flat layer list."""
        return [
            nn.Linear(in_features, out_features),
            nn.SELU(),
            nn.AlphaDropout(dropout),
        ]

    def __init__(
        self,
        drug_dim: int = 128,
        num_output_genes: int = 978,  # LINCS L1000 landmark genes
        hidden_dim: int = 512,
        dropout: float = 0.1,
    ):
        super().__init__()
        half_hidden = hidden_dim // 2

        # Two SELU blocks that lift the drug embedding to hidden_dim.
        self.condition_net = nn.Sequential(
            *self._selu_block(drug_dim, hidden_dim, dropout),
            *self._selu_block(hidden_dim, hidden_dim, dropout),
        )

        # Decoder input is [conditioning vector, drug embedding].
        self.decoder = nn.Sequential(
            *self._selu_block(hidden_dim + drug_dim, hidden_dim, dropout),
            *self._selu_block(hidden_dim, half_hidden, dropout),
            nn.Linear(half_hidden, num_output_genes),
        )

    def forward(
        self,
        drug_embed: torch.Tensor,
        baseline_expression: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Args:
          drug_embed: (B, D_d) drug embeddings
          baseline_expression: (B, G) baseline gene expression (optional)
        Returns:
          (B, G) tensor: the predicted delta alone when no baseline is given,
          otherwise baseline plus the predicted delta.
        """
        conditioning = self.condition_net(drug_embed)
        decoder_input = torch.cat((conditioning, drug_embed), dim=-1)
        expression_delta = self.decoder(decoder_input)

        if baseline_expression is None:
            return expression_delta
        return baseline_expression + expression_delta
+# ─── Longevity Drug Repurposing ──────────────────────────────────────────────
class LongevityDrugScreener(nn.Module):
    """
    Screens drugs for longevity potential by comparing drug-induced
    expression changes to anti-aging gene expression signatures.

    Core idea: if a drug's perturbation signature reverses aging-associated
    expression changes, it's a candidate longevity therapeutic.

    Args:
      dta_predictor: module called as ``dta_predictor(drug_embed, target_embed)``
        to obtain a binding affinity.
      perturbation_predictor: module called as ``perturbation_predictor(drug_embed)``
        to obtain a (B, gene_dim) expression delta.
      gene_dim: length of the aging / longevity signature vectors.
    """

    def __init__(
        self,
        dta_predictor: "DrugTargetAffinityPredictor",
        perturbation_predictor: "PerturbationResponsePredictor",
        gene_dim: int = 978,
    ):
        super().__init__()
        self.dta_predictor = dta_predictor
        self.perturbation_predictor = perturbation_predictor
        # Aging signature: the gene expression pattern to move away from.
        # Registered as a trainable parameter, initialised near zero.
        self.aging_signature = nn.Parameter(torch.zeros(gene_dim))
        nn.init.normal_(self.aging_signature, std=0.01)
        # Longevity target signature: the pattern we want to reach.
        self.longevity_signature = nn.Parameter(torch.zeros(gene_dim))
        nn.init.normal_(self.longevity_signature, std=0.01)

    def compute_longevity_score(
        self,
        drug_embed: torch.Tensor,
        target_embed: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
        """
        Score a batch of drugs for longevity potential.

        Args:
          drug_embed: (B, D_d) drug embeddings
          target_embed: (B, D_t) target embeddings (optional); only used to
            report affinity, it does not change the returned score
        Returns:
          longevity_score: (B,) negative mean squared error between the
            predicted delta and (longevity_signature - aging_signature);
            higher (closer to 0) = better longevity drug
          details: dict with "predicted_delta", "reversal_score" and
            "affinity" (None when no target was given)
        """
        # Predict perturbation effect
        delta = self.perturbation_predictor(drug_embed)
        # How well does the perturbation reverse the aging signature?
        # We want: delta ≈ longevity_signature - aging_signature
        target_delta = (self.longevity_signature - self.aging_signature).unsqueeze(0)  # (1, G)
        reversal_score = -F.mse_loss(
            delta, target_delta.expand_as(delta), reduction='none'
        ).mean(dim=-1)
        # Drug-target affinity (if target provided)
        affinity = None
        if target_embed is not None:
            affinity = self.dta_predictor(drug_embed, target_embed)
        details = {
            "predicted_delta": delta,
            "reversal_score": reversal_score,
            "affinity": affinity,
        }
        return reversal_score, details

    def screen_drugs(
        self,
        drug_embeds: List[torch.Tensor],
        drug_names: List[str],
        top_k: int = 10,
    ) -> List[Tuple[str, float]]:
        """
        Screen a set of drugs and return the top-k longevity candidates.

        Scoring runs as a single batched forward pass with gradients disabled
        and every submodule temporarily in eval mode, so dropout cannot make
        the ranking stochastic. The previous train/eval state is restored
        before returning.

        Args:
          drug_embeds: one embedding per drug, all of the same shape
          drug_names: name for each embedding, same length as drug_embeds
          top_k: maximum number of candidates to return (values below 0 are
            treated as 0)
        Returns:
          list of (name, score) pairs, best score first
        Raises:
          ValueError: if drug_embeds and drug_names differ in length
        """
        embeds = list(drug_embeds)
        names = list(drug_names)
        if len(embeds) != len(names):
            raise ValueError(
                f"drug_embeds and drug_names must have the same length "
                f"(got {len(embeds)} and {len(names)})"
            )
        if not embeds:
            return []

        # Remember each submodule's mode so mixed train/eval setups survive.
        previous_modes = [(module, module.training) for module in self.modules()]
        self.eval()
        try:
            with torch.no_grad():
                batch_scores, _ = self.compute_longevity_score(torch.stack(embeds))
        finally:
            for module, was_training in previous_modes:
                module.training = was_training

        scored = list(zip(names, batch_scores.reshape(-1).tolist()))
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:max(top_k, 0)]
+# ─── End-to-End Drug Discovery Pipeline ──────────────────────────────────────
class DrugDiscoveryPipeline:
    """
    Complete pipeline: multi-omics → drug targets → drug screening → validation.

    Steps:
      1. Use MuLGIT causal module to identify longevity-associated genes
      2. Use DTA predictor to find drugs targeting those genes
      3. Use perturbation predictor to verify drug effects
      4. Rank drugs by longevity reversal potential

    Args:
      dta_predictor: stored as ``self.dta``
      perturbation_predictor: stored as ``self.perturbation``
      screener: object whose ``compute_longevity_score(drug, target)`` is
        used by ``run`` to score every drug/target pair
    """

    def __init__(
        self,
        dta_predictor: "DrugTargetAffinityPredictor",
        perturbation_predictor: "PerturbationResponsePredictor",
        screener: "LongevityDrugScreener",
    ):
        self.dta = dta_predictor
        self.perturbation = perturbation_predictor
        self.screener = screener

    def run(
        self,
        causal_gene_targets: List[str],
        drug_pool: Dict[str, torch.Tensor],  # drug_name → embedding
        target_pool: Dict[str, torch.Tensor],  # gene_name → embedding
        top_k: int = 20,
    ) -> Dict:
        """
        Full drug discovery run.

        Args:
          causal_gene_targets: genes identified as causal for longevity;
            genes missing from target_pool are skipped
          drug_pool: dictionary of candidate drug embeddings
          target_pool: dictionary of protein target embeddings
          top_k: number of top drugs to return
        Returns:
          dict with "top_drugs" (first top_k entries) and "all_results"
          (every drug that had at least one scored target), both sorted by
          "avg_score" descending. Each entry holds "drug", "avg_score" and
          "gene_details" (a list of {"gene", "score", "affinity"} dicts).
        """
        # Gene filtering and batching do not depend on the drug: do it once.
        scorable_targets = [
            (gene, target_pool[gene].unsqueeze(0))
            for gene in causal_gene_targets
            if gene in target_pool
        ]

        def gene_rank_key(entry):
            # As implemented in this file, the screener's score comes from
            # the drug embedding alone, so it ties across genes of one drug;
            # affinity breaks the tie so the gene order is informative.
            affinity = entry["affinity"]
            return (entry["score"], float("-inf") if affinity is None else affinity)

        results = []
        # Pure inference: no autograd graph is needed for ranking.
        with torch.no_grad():
            for drug_name, drug_embed in drug_pool.items():
                drug_batch = drug_embed.unsqueeze(0)
                drug_scores = []
                for gene, target_batch in scorable_targets:
                    score, details = self.screener.compute_longevity_score(
                        drug_batch,
                        target_batch,
                    )
                    affinity = details["affinity"]
                    drug_scores.append({
                        "gene": gene,
                        "score": score.item(),
                        "affinity": affinity.item() if affinity is not None else None,
                    })
                if drug_scores:
                    avg_score = sum(d["score"] for d in drug_scores) / len(drug_scores)
                    results.append({
                        "drug": drug_name,
                        "avg_score": avg_score,
                        "gene_details": sorted(drug_scores, key=gene_rank_key, reverse=True),
                    })
        results.sort(key=lambda x: x["avg_score"], reverse=True)
        return {"top_drugs": results[:top_k], "all_results": results}
+# ─── Molecular Tokenizers ────────────────────────────────────────────────────
+# Simple SMILES tokenizer (for MVP; use DeepChem/RDKit in production)
SMILES_CHARS = sorted(set("ABCDEFGHIKLMNOPQRSTUVWXYZ[\\]^_abcdefghilmnopqrstuv=0123456789+-.()#@/\\%"))
SMILES_TO_IDX = {c: i + 1 for i, c in enumerate(SMILES_CHARS)}  # 0 = padding
# Number of distinct token ids (alphabet + padding id 0). An embedding table
# fed by tokenize_smiles needs at least this many rows.
# NOTE(review): DrugEncoder in this module defaults to vocab_size=64, which is
# smaller than this value; pass vocab_size=SMILES_VOCAB_SIZE when encoding the
# output of tokenize_smiles, otherwise the highest ids index out of range.
SMILES_VOCAB_SIZE = len(SMILES_CHARS) + 1
# Amino acid tokenizer
AA_CHARS = sorted(set("ACDEFGHIKLMNPQRSTVWY"))
AA_TO_IDX = {c: i + 1 for i, c in enumerate(AA_CHARS)}
# Number of distinct amino acid token ids (alphabet + padding id 0).
AA_VOCAB_SIZE = len(AA_CHARS) + 1


def _encode_and_pad(text: str, char_to_idx: Dict[str, int], max_len: int) -> torch.Tensor:
    """Map characters to ids, truncate to max_len and right-pad with 0."""
    if max_len < 0:
        # A negative slice bound would silently drop characters from the end.
        raise ValueError(f"max_len must be non-negative, got {max_len}")
    # Characters outside the alphabet share id 0 with padding.
    token_ids = [char_to_idx.get(ch, 0) for ch in text[:max_len]]
    token_ids.extend([0] * (max_len - len(token_ids)))
    return torch.tensor(token_ids, dtype=torch.long)


def tokenize_smiles(smiles: str, max_len: int = 256) -> torch.Tensor:
    """
    Tokenize a SMILES string character by character.

    Args:
      smiles: SMILES string; case is preserved because it is significant
      max_len: output length; longer inputs are truncated, shorter ones
        are right-padded with 0
    Returns:
      (max_len,) long tensor of token ids
    Raises:
      ValueError: if max_len is negative
    """
    return _encode_and_pad(smiles, SMILES_TO_IDX, max_len)


def tokenize_protein(sequence: str, max_len: int = 1024) -> torch.Tensor:
    """
    Tokenize a protein amino acid sequence.

    Args:
      sequence: one-letter amino acid codes; lower-case letters are accepted
        and treated like their upper-case form
      max_len: output length; longer inputs are truncated, shorter ones
        are right-padded with 0
    Returns:
      (max_len,) long tensor of token ids
    Raises:
      ValueError: if max_len is negative
    """
    # AA_TO_IDX only has upper-case keys; without this, lower-case residues
    # would all collapse onto the padding id.
    return _encode_and_pad(sequence.upper(), AA_TO_IDX, max_len)
+# ─── Model Factory ───────────────────────────────────────────────────────────
def create_drug_discovery_modules() -> Tuple[
    DrugTargetAffinityPredictor,
    PerturbationResponsePredictor,
    LongevityDrugScreener,
]:
    """
    Build the affinity predictor, the perturbation predictor and a screener
    wired to both, using 128-dimensional drug/target embeddings.

    Returns:
      (dta, perturbation, screener) — the screener holds references to the
      same two predictor instances that are returned alongside it.
    """
    affinity_model = DrugTargetAffinityPredictor(drug_dim=128, target_dim=128)
    response_model = PerturbationResponsePredictor(drug_dim=128)
    longevity_screener = LongevityDrugScreener(affinity_model, response_model)
    return affinity_model, response_model, longevity_screener