liuliu2333 committed on
Commit
fc481db
Β·
1 Parent(s): 9201562

Deploy DeepMiRT Gradio demo with model code

Browse files
README.md CHANGED
@@ -1,13 +1,12 @@
1
  ---
2
- title: Deepmirt
3
- emoji: πŸš€
4
- colorFrom: indigo
5
- colorTo: pink
6
  sdk: gradio
7
- sdk_version: 6.9.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
 
11
  ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: DeepMiRT
3
+ emoji: 🧬
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 5.23.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
+ short_description: miRNA target prediction with RNA foundation models
12
  ---
 
 
app.py ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ DeepMiRT Web Demo β€” Gradio interface for miRNA-target interaction prediction.
4
+
5
+ Run locally:
6
+ python app.py
7
+
8
+ Deploy on Hugging Face Spaces:
9
+ Set sdk: gradio in the Space README.md metadata.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ import re
16
+ import tempfile
17
+ from pathlib import Path
18
+
19
+ import gradio as gr
20
+ import numpy as np
21
+ import pandas as pd
22
+ import torch
23
+
24
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
25
+ logger = logging.getLogger(__name__)
26
+
27
+ # ---------------------------------------------------------------------------
28
+ # Global model (loaded once at startup)
29
+ # ---------------------------------------------------------------------------
30
+ _model = None
31
+ _alphabet = None
32
+ _config = None
33
+ _device = "cuda" if torch.cuda.is_available() else "cpu"
34
+
35
+
36
def _load_model():
    """Load model from Hugging Face Hub (cached after first download).

    Populates the module-level globals ``_model``, ``_alphabet`` and ``_config``.
    Safe to call repeatedly: returns immediately once ``_model`` is set, so it
    can be invoked lazily from every Gradio callback.
    """
    global _model, _alphabet, _config

    if _model is not None:
        return

    # Heavy imports are deferred so the app module imports quickly and the
    # download/model cost is only paid on the first prediction request.
    import fm
    import torch
    from huggingface_hub import hf_hub_download

    from deepmirt.evaluation.predict import load_model_from_checkpoint

    repo_id = "liuliu2333/deepmirt"
    # hf_hub_download caches locally, so repeated process restarts reuse the files.
    ckpt_path = hf_hub_download(repo_id=repo_id, filename="epoch=27-val_auroc=0.9612.ckpt")
    config_path = hf_hub_download(repo_id=repo_id, filename="config.yaml")

    logger.info("Loading model...")
    _model, _config = load_model_from_checkpoint(ckpt_path, config_path, device=_device)
    # Only the alphabet (tokenizer) is needed here; the RNA-FM weights returned
    # alongside it are discarded by the caller of this tuple.
    _, _alphabet = fm.pretrained.rna_fm_t12()
    logger.info("Model loaded successfully.")
57
+
58
+
59
+ # ---------------------------------------------------------------------------
60
+ # Validation helpers
61
+ # ---------------------------------------------------------------------------
62
+ _VALID_BASES = set("AUGC")
63
+
64
+
65
def _validate_seq(seq: str, name: str, min_len: int = 1, max_len: int = 200) -> str:
    """Validate and clean an RNA/DNA sequence.

    Strips whitespace, upper-cases, and converts DNA notation (T) to RNA (U).

    Args:
        seq: raw user-supplied sequence.
        name: label used in error messages (e.g. "miRNA").
        min_len: minimum allowed length in nucleotides.
        max_len: maximum allowed length in nucleotides.

    Returns:
        The cleaned RNA sequence (A/U/G/C only).

    Raises:
        gr.Error: if the sequence is empty, out of the allowed length range,
            or contains characters other than A/U/G/C/T.
    """
    seq = seq.strip().upper().replace("T", "U")
    if not seq:
        raise gr.Error(f"{name} sequence is empty.")
    if len(seq) < min_len or len(seq) > max_len:
        # Bug fix: the message previously hardcoded "74,582" as the upper bound
        # instead of reporting the actual max_len constraint.
        raise gr.Error(f"{name} must be {min_len}-{max_len} nt, got {len(seq)} nt.")
    invalid = set(seq) - _VALID_BASES
    if invalid:
        raise gr.Error(f"{name} contains invalid characters: {invalid}. Only A/U/G/C/T allowed.")
    return seq
76
+
77
+
78
+ # ---------------------------------------------------------------------------
79
+ # Prediction logic
80
+ # ---------------------------------------------------------------------------
81
def _predict_pair(mirna_seq: str, target_seq: str) -> np.ndarray:
    """Run model inference on a single pair.

    Args:
        mirna_seq: cleaned miRNA sequence (RNA notation).
        target_seq: cleaned target sequence (RNA notation).

    Returns:
        np.ndarray of interaction probabilities; shape (1,) for a single pair.
    """
    import torch
    from torch.nn.utils.rnn import pad_sequence

    # Lazily ensures the global _model/_alphabet are initialized.
    _load_model()

    batch_converter = _alphabet.get_batch_converter()
    padding_idx = _alphabet.padding_idx

    # batch_converter returns (labels, strs, tokens); only the token tensor is
    # used. Tokens include BOS/EOS added by RNA-FM's converter.
    _, _, m_tok = batch_converter([("m", mirna_seq)])
    _, _, t_tok = batch_converter([("t", target_seq)])

    # With a single sequence, pad_sequence just adds the batch dimension; it is
    # used here for symmetry with the batched code path.
    mirna_padded = pad_sequence([m_tok[0]], batch_first=True, padding_value=padding_idx)
    target_stacked = torch.stack([t_tok[0]])

    # miRNA mask marks non-padding positions; the target mask is all ones
    # (single unpadded target sequence).
    attn_mask_mirna = (mirna_padded != padding_idx).long().to(_device)
    attn_mask_target = torch.ones_like(target_stacked, dtype=torch.long).to(_device)
    mirna_padded = mirna_padded.to(_device)
    target_stacked = target_stacked.to(_device)

    with torch.no_grad():
        logits = _model.model(mirna_padded, target_stacked, attn_mask_mirna, attn_mask_target)
        # Sigmoid converts the raw logit to an interaction probability.
        prob = torch.sigmoid(logits.squeeze(-1)).cpu().numpy()
    return prob
106
+
107
+
108
def predict_single(mirna_seq: str, target_seq: str):
    """Gradio callback for single prediction.

    Validates both sequences, scores the pair, and returns an HTML result
    card together with a JSON-serializable details dict.
    """
    mirna_rna = _validate_seq(mirna_seq, "miRNA", min_len=15, max_len=30)
    target_rna = _validate_seq(target_seq, "Target", min_len=20, max_len=50)

    p = float(_predict_pair(mirna_rna, target_rna)[0])
    is_hit = p >= 0.5
    label = "INTERACTION" if is_hit else "NO INTERACTION"
    color = "#2ecc71" if is_hit else "#e74c3c"

    details = {
        "probability": round(p, 6),
        "prediction": label,
        "threshold": 0.5,
        "mirna_length": len(mirna_rna),
        "target_length": len(target_rna),
    }
    html = (
        f"<div style='text-align:center;padding:20px;'>"
        f"<span style='font-size:48px;font-weight:bold;color:{color};'>{p:.4f}</span><br>"
        f"<span style='font-size:20px;color:{color};'>{label}</span></div>"
    )
    return html, details
129
+
130
+
131
def predict_batch(file):
    """Gradio callback for batch prediction.

    Reads a CSV whose column names contain 'mirna' and 'target', scores every
    pair in mini-batches, and returns (path to results CSV, 20-row preview).

    Args:
        file: uploaded file object from gr.File (uses ``file.name`` as path).

    Returns:
        Tuple of (results CSV path as str, pandas DataFrame of first 20 rows).

    Raises:
        gr.Error: if no file was uploaded, required columns are missing, the
            CSV has no rows, or a row contains an empty/invalid sequence.
    """
    if file is None:
        raise gr.Error("Please upload a CSV file.")

    _load_model()

    df = pd.read_csv(file.name)

    # Locate the miRNA / target columns by case-insensitive substring match.
    mirna_col = None
    target_col = None
    for col in df.columns:
        cl = col.lower().strip()
        if "mirna" in cl:
            mirna_col = col
        elif "target" in cl:
            target_col = col

    if mirna_col is None or target_col is None:
        raise gr.Error(
            "CSV must contain a column with 'mirna' and a column with 'target' in the name. "
            f"Found columns: {list(df.columns)}"
        )

    if df.empty:
        # Guard: np.concatenate below would fail on an empty list of batches.
        raise gr.Error("CSV contains no data rows.")

    mirna_seqs = df[mirna_col].astype(str).tolist()
    target_seqs = df[target_col].astype(str).tolist()

    # Validate and convert DNA (T) to RNA (U) notation.
    cleaned_mirna = []
    cleaned_target = []
    for i, (m, t) in enumerate(zip(mirna_seqs, target_seqs)):
        m = m.strip().upper().replace("T", "U")
        t = t.strip().upper().replace("T", "U")
        if not m or not t:
            raise gr.Error(f"Row {i}: empty sequence.")
        invalid_m = set(m) - _VALID_BASES
        invalid_t = set(t) - _VALID_BASES
        if invalid_m or invalid_t:
            raise gr.Error(f"Row {i}: invalid characters in sequences.")
        cleaned_mirna.append(m)
        cleaned_target.append(t)

    # Batch inference
    import torch
    from torch.nn.utils.rnn import pad_sequence

    batch_converter = _alphabet.get_batch_converter()
    padding_idx = _alphabet.padding_idx
    all_probs = []
    batch_size = 128

    with torch.no_grad():
        for start in range(0, len(cleaned_mirna), batch_size):
            batch_m = cleaned_mirna[start : start + batch_size]
            batch_t = cleaned_target[start : start + batch_size]

            m_toks = []
            t_toks = []
            for ms, ts in zip(batch_m, batch_t):
                _, _, mt = batch_converter([("m", ms)])
                _, _, tt = batch_converter([("t", ts)])
                m_toks.append(mt[0])
                t_toks.append(tt[0])

            mirna_padded = pad_sequence(m_toks, batch_first=True, padding_value=padding_idx)
            # Bug fix: user-supplied targets may have different lengths within a
            # batch, in which case torch.stack raised. Pad targets like the
            # miRNAs and derive the mask from padding positions. For the
            # original fixed-length case this mask is identical to all-ones,
            # since real RNA-FM tokens never equal padding_idx.
            target_padded = pad_sequence(t_toks, batch_first=True, padding_value=padding_idx)
            attn_mask_mirna = (mirna_padded != padding_idx).long().to(_device)
            attn_mask_target = (target_padded != padding_idx).long().to(_device)

            logits = _model.model(
                mirna_padded.to(_device),
                target_padded.to(_device),
                attn_mask_mirna,
                attn_mask_target,
            )
            probs = torch.sigmoid(logits.squeeze(-1)).cpu().numpy()
            all_probs.append(probs)

    all_probs = np.concatenate(all_probs)
    df["probability"] = all_probs
    df["prediction"] = (all_probs >= 0.5).astype(int)

    # Save to temp file for download
    out_path = Path(tempfile.mkdtemp()) / "deepmirt_predictions.csv"
    df.to_csv(str(out_path), index=False)
    return str(out_path), df.head(20)
215
+
216
+
217
+ # ---------------------------------------------------------------------------
218
+ # Examples
219
+ # ---------------------------------------------------------------------------
220
+ EXAMPLES = [
221
+ # [miRNA, target_40nt] - real miRNA-target pairs
222
+ ["UGAGGUAGUAGGUUGUAUAGUU", "ACUGCAGCAUAUCUACUAUUUGCUACUGUAACCAUUGAUCU"], # let-7a / lin-41
223
+ ["UAAAGUGCUUAUAGUGCAGGUAG", "GCAGCAUUGUACAGGGCUAUCAGAAACUAUUGACACUAAAA"], # miR-20a / E2F1
224
+ ["UAGCAGCACGUAAAUAUUGGCG", "GCAAUGUUUUCCACAGUGCUUACACAGAAAUAGCAACUUUA"], # miR-16 / BCL2
225
+ ["CAUCAAAGUGGAGGCCCUCUCU", "AAUGCUUCUAAAUUGAAUCCAAACUGCAGUUUAUUAGUGGU"], # miR-198 (negative)
226
+ ["UGGAAUGUAAAGAAGUAUGUAU", "UCGAAUCCAUGCAAAACAGCUUGAUUUGUUAGUACACGAAU"], # miR-1 / HAND2
227
+ ]
228
+
229
+
230
+ # ---------------------------------------------------------------------------
231
+ # Gradio UI
232
+ # ---------------------------------------------------------------------------
233
def build_demo():
    """Construct and return the Gradio Blocks UI (single, batch, and about tabs)."""
    with gr.Blocks(
        title="DeepMiRT: miRNA Target Prediction",
        theme=gr.themes.Soft(),
    ) as demo:
        gr.Markdown(
            """
            # DeepMiRT: miRNA Target Prediction with RNA Foundation Models

            Predict miRNA-target interactions using RNA-FM embeddings and cross-attention.
            Ranked **#1** on eCLIP benchmarks (AUROC 0.75) and achieves **AUROC 0.96** on our comprehensive test set.

            **Paper:** *coming soon* | **GitHub:** [DeepMiRT](https://github.com/zichengll/DeepMiRT) | **Model:** [Hugging Face](https://huggingface.co/liuliu2333/deepmirt)
            """
        )

        with gr.Tab("Single Prediction"):
            with gr.Row():
                with gr.Column():
                    mirna_input = gr.Textbox(
                        label="miRNA Sequence",
                        placeholder="e.g., UGAGGUAGUAGGUUGUAUAGUU",
                        # Fix: previously said "18-25 nt", contradicting the
                        # enforced validation range (15-30 nt in predict_single).
                        info="15-30 nt. DNA (T) or RNA (U) format accepted.",
                    )
                    target_input = gr.Textbox(
                        label="Target Sequence",
                        placeholder="e.g., ACUGCAGCAUAUCUACUAUUUGCUACUGUAACCAUUGAUCU",
                        # Fix: state the enforced 20-50 nt range, not only the
                        # recommended length.
                        info="20-50 nt (40 nt recommended). DNA (T) or RNA (U) format accepted.",
                    )
                    predict_btn = gr.Button("Predict", variant="primary")

                with gr.Column():
                    result_html = gr.HTML(label="Prediction Result")
                    result_json = gr.JSON(label="Details")

            predict_btn.click(
                predict_single,
                inputs=[mirna_input, target_input],
                outputs=[result_html, result_json],
            )

            gr.Examples(
                examples=EXAMPLES,
                inputs=[mirna_input, target_input],
                outputs=[result_html, result_json],
                fn=predict_single,
                cache_examples=False,
            )

        with gr.Tab("Batch Prediction"):
            gr.Markdown(
                """
                Upload a CSV file with columns containing **mirna** and **target** in the column names.

                Example format:
                | mirna_seq | target_seq |
                |-----------|------------|
                | UGAGGUAGUAGGUUGUAUAGUU | ACUGCAGCAUAUCUACUAUUUGCUACUGUAACCAUUGAUCU |
                """
            )
            csv_input = gr.File(label="Upload CSV", file_types=[".csv"])
            batch_btn = gr.Button("Run Batch Prediction", variant="primary")
            csv_output = gr.File(label="Download Results")
            preview = gr.Dataframe(label="Preview (first 20 rows)")

            batch_btn.click(
                predict_batch,
                inputs=[csv_input],
                outputs=[csv_output, preview],
            )

        with gr.Tab("About"):
            gr.Markdown(
                """
                ## Model Architecture

                DeepMiRT uses a **shared RNA-FM encoder** (12-layer Transformer, pre-trained on 23M non-coding RNAs)
                to embed both miRNA and target sequences into the same representation space.
                A **cross-attention module** (2 layers, 8 heads) allows the target to attend to the miRNA,
                capturing interaction patterns. The attended representations are pooled and classified
                by an **MLP head** (640 → 256 → 64 → 1).

                ```
                miRNA  → [RNA-FM Encoder] → miRNA embedding ─────────┐
                                                                     ↓
                Target → [RNA-FM Encoder] → target embedding → [Cross-Attention] → Pool → [MLP] → probability
                ```

                ## Training

                - **Data:** miRNA-target interactions from multiple databases and literature mining
                - **Two-phase training:** Phase 1 (frozen backbone) → Phase 2 (unfreeze top 3 RNA-FM layers)
                - **Hardware:** 2× NVIDIA L20 GPUs, mixed-precision (fp16)
                - **Best checkpoint:** epoch 27, val AUROC = 0.9612

                ## Performance

                | Benchmark | AUROC | Rank |
                |-----------|-------|------|
                | miRBench eCLIP (Klimentova 2022) | 0.7511 | #1/12 |
                | miRBench eCLIP (Manakov 2022) | 0.7543 | #1/12 |
                | miRBench CLASH (Hejret 2023) | 0.6952 | #5/12 |
                | Our test set (813K samples, 16 methods) | 0.9606 | #1/16 |

                ## Citation

                If you use DeepMiRT in your research, please cite:
                ```
                @software{liu2026deepmirt,
                  title={DeepMiRT: miRNA Target Prediction with RNA Foundation Models},
                  author={Liu, Zicheng},
                  year={2026},
                  url={https://github.com/zichengll/DeepMiRT}
                }
                ```

                ## License

                MIT License. See [LICENSE](https://github.com/zichengll/DeepMiRT/blob/main/LICENSE).
                """
            )

    return demo
356
+
357
+
358
+ if __name__ == "__main__":
359
+ demo = build_demo()
360
+ demo.launch()
deepmirt/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """DeepMiRT: miRNA target prediction using RNA foundation models and cross-attention."""
2
+
3
+ __version__ = "1.0.0"
4
+
5
+ from deepmirt.predict import predict as predict
deepmirt/data_module/__init__.py ADDED
File without changes
deepmirt/data_module/datamodule.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ miRNA-Target PyTorch Lightning DataModule
4
+
5
+ [Lightning DataModule Lifecycle]
6
+ Lightning DataModule encapsulates data loading logic into a reusable module.
7
+ Its lifecycle is as follows:
8
+
9
+ 1. prepare_data() β€” download data (runs only on main process; not needed in this project)
10
+ 2. setup(stage) β€” create Dataset instances (runs on every process)
11
+ - stage='fit' β†’ create train_dataset + val_dataset
12
+ - stage='test' β†’ create test_dataset
13
+ - stage='predict' β†’ create predict_dataset
14
+ 3. train_dataloader() β€” return training DataLoader
15
+ 4. val_dataloader() β€” return validation DataLoader
16
+ 5. test_dataloader() β€” return test DataLoader
17
+
18
+ [Why use DataModule instead of manually creating DataLoaders?]
19
+ - Centralizes all data-related logic (paths, batch size, tokenizer, data splits)
20
+ - Lightning Trainer automatically calls the correct methods, reducing boilerplate
21
+ - Makes it easy to reuse the same data configuration across different experiments
22
+
23
+ [collate_fn Explained β€” The Core Difficulty of This Module]
24
+ Since miRNA sequence lengths are variable (15-30nt β†’ 17-32 tokens),
25
+ samples in the same batch may have mirna_tokens of different lengths.
26
+ PyTorch's default collate cannot stack variable-length tensors,
27
+ so we need a custom collate_fn to:
28
+ 1. Find the longest miRNA sequence in the batch
29
+ 2. Pad all miRNA sequences to the same length
30
+ 3. Generate an attention mask indicating which positions are real tokens vs. padding
31
+
32
+ Target sequences are fixed at 40nt (β†’ 42 tokens) and do not require additional padding.
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ import os
38
+
39
+ import fm
40
+ import pytorch_lightning as pl
41
+ import torch
42
+ from torch.nn.utils.rnn import pad_sequence
43
+ from torch.utils.data import DataLoader
44
+
45
+ from deepmirt.data_module.dataset import MiRNATargetDataset
46
+
47
+
48
class MiRNATargetDataModule(pl.LightningDataModule):
    """
    Lightning DataModule for miRNA-target pairs.

    [Responsibilities]
    - Manage creation and DataLoader configuration for train / val / test datasets
    - Provide a custom collate_fn to handle variable-length miRNA sequence padding
    - Encapsulate RNA-FM alphabet loading to avoid redundant initialization in multiple places
    """

    def __init__(
        self,
        data_dir: str,
        batch_size: int = 128,
        num_workers: int = 8,
        pin_memory: bool = True,
    ):
        """
        Initialize the DataModule.

        Args:
            data_dir (str): path to the directory containing train.csv / val.csv / test.csv
            batch_size (int): number of samples per batch, default 128
            num_workers (int): number of DataLoader worker processes, default 8
                # Design decision: num_workers controls data prefetching parallelism
                # - 0 = load in main process (for debugging, slow but easy to troubleshoot)
                # - 8 = 8 subprocesses load in parallel (for training, fully utilize multi-core CPU)
                # - Rule of thumb: set to half of CPU cores or GPU count x 4
                # - Too many will cause memory overhead and process switching overhead
            pin_memory (bool): whether to pin data to page-locked memory, default True
                # Design decision: pin_memory accelerates CPU→GPU data transfer
                # - True: data is first copied to pinned memory, then transferred to GPU via DMA
                #   Eliminates one memory copy, improving throughput by ~2x
                # - False: data is in pageable memory and must be copied to pinned memory before transfer
                # - Only meaningful when using GPU; set to False for CPU training
        """
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.pin_memory = pin_memory

        # Dataset instances, created in setup()
        self.train_dataset: MiRNATargetDataset | None = None
        self.val_dataset: MiRNATargetDataset | None = None
        self.test_dataset: MiRNATargetDataset | None = None

        # Load RNA-FM alphabet in the main process (before DDP fork)
        # This way the alphabet is loaded only once, avoiding redundant full model loading on each DDP rank
        # NOTE(review): fm.pretrained.rna_fm_t12() still builds the full model
        # once here just to obtain the alphabet; presumably acceptable at
        # startup — confirm if __init__ latency matters.
        _model, alphabet = fm.pretrained.rna_fm_t12()
        del _model  # Free model weights, keep only the alphabet (tokenizer)
        self._alphabet = alphabet
        self._padding_idx = alphabet.padding_idx  # padding_idx = 1

    def setup(self, stage: str | None = None) -> None:
        """
        Create Dataset instances.

        Lightning automatically calls this method before training/validation/testing begins.
        Each process (including multi-GPU DDP scenarios) calls setup() independently.

        Args:
            stage: 'fit' (train+val), 'test', 'predict', or None (all)

        NOTE(review): stage == 'predict' is documented above but no predict
        dataset is created here — confirm whether predict support is intended.
        """
        # alphabet was already loaded in __init__() (before DDP fork, loaded only once)
        alphabet = self._alphabet

        if stage == "fit" or stage is None:
            self.train_dataset = MiRNATargetDataset(
                os.path.join(self.data_dir, "train.csv"), alphabet
            )
            self.val_dataset = MiRNATargetDataset(
                os.path.join(self.data_dir, "val.csv"), alphabet
            )

        if stage == "test" or stage is None:
            self.test_dataset = MiRNATargetDataset(
                os.path.join(self.data_dir, "test.csv"), alphabet
            )

    def train_dataloader(self) -> DataLoader:
        """Return the training DataLoader (shuffle=True to randomize data order)."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            collate_fn=self._collate_fn,
            # drop_last=True keeps every training batch full-sized.
            drop_last=True,
        )

    def val_dataloader(self) -> DataLoader:
        """Return the validation DataLoader (shuffle=False to preserve order for reproducible evaluation)."""
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            collate_fn=self._collate_fn,
        )

    def test_dataloader(self) -> DataLoader:
        """Return the test DataLoader."""
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            collate_fn=self._collate_fn,
        )

    def _collate_fn(self, batch: list[dict]) -> dict:
        """
        Custom batch collation function — handles padding of variable-length miRNA sequences.

        [Why is a custom collate_fn needed?]
        PyTorch's default collate_fn attempts to stack all sample tensors.
        But miRNA sequence lengths are variable (15-30nt → 17-32 tokens), and direct stacking fails:
            RuntimeError: stack expects each tensor to be equal size

        [Why does miRNA need padding but target does not?]
        - miRNA has variable length: 15-30 nucleotides → 17-32 tokens after adding BOS+EOS
          A single batch may contain lengths of both 17 and 32, which must be aligned
        - Target has fixed length: all samples are 40 nucleotides → 42 tokens
          Naturally aligned, no padding needed

        [Role of attention_mask]
        - Tells the model which positions are real tokens (1) and which are padding (0)
        - The Transformer's self-attention uses the mask to block padding positions
        - Prevents padding tokens from participating in attention computation, avoiding noise

        # Design decision: use pad_sequence instead of manual loop padding
        # pad_sequence is a PyTorch built-in utility, optimized in C++, faster than Python loops
        # It automatically finds the maximum length and pads shorter sequences with the specified value

        Args:
            batch: list of dicts, each dict from MiRNATargetDataset.__getitem__

        Returns:
            dict: containing the following key-value pairs:
                - 'mirna_tokens': (batch_size, max_mirna_len) LongTensor
                - 'target_tokens': (batch_size, 42) LongTensor
                - 'labels': (batch_size,) float32 Tensor
                - 'attention_mask_mirna': (batch_size, max_mirna_len) LongTensor
                - 'attention_mask_target': (batch_size, 42) LongTensor
        """
        # ── 1. Collect individual fields ──
        mirna_list = [sample["mirna_tokens"] for sample in batch]
        target_list = [sample["target_tokens"] for sample in batch]
        label_list = [sample["label"] for sample in batch]

        # ── 2. Pad miRNA sequences ──
        # pad_sequence converts list of 1D tensors → 2D tensor (batch, max_len)
        # batch_first=True ensures the batch dimension comes first
        # padding_value=1 is RNA-FM's <pad> token ID
        mirna_padded = pad_sequence(
            mirna_list, batch_first=True, padding_value=self._padding_idx
        )

        # ── 3. Stack target sequences (fixed 42 tokens, no padding needed) ──
        target_stacked = torch.stack(target_list)

        # ── 4. Stack labels ──
        labels = torch.stack(label_list)

        # ── 5. Generate attention masks ──
        # miRNA mask: non-padding positions = 1, padding positions = 0
        attention_mask_mirna = (mirna_padded != self._padding_idx).long()

        # target mask: all positions are real tokens, so all 1s
        # Because target is fixed at 40nt with no padding, every position is valid
        attention_mask_target = torch.ones_like(target_stacked, dtype=torch.long)

        # ── 6. Collect metadata (for stratified analysis during evaluation) ──
        # Each metadata field is collected as list[str], kept on CPU
        metadata_keys = batch[0].get("metadata", {}).keys()
        metadata = {
            key: [sample["metadata"][key] for sample in batch]
            for key in metadata_keys
        } if metadata_keys else {}

        return {
            "mirna_tokens": mirna_padded,                    # (B, max_mirna_len)
            "target_tokens": target_stacked,                 # (B, 42)
            "labels": labels,                                # (B,)
            "attention_mask_mirna": attention_mask_mirna,    # (B, max_mirna_len)
            "attention_mask_target": attention_mask_target,  # (B, 42)
            "metadata": metadata,                            # dict[str, list[str]]
        }
deepmirt/data_module/dataset.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ miRNA-Target Pair Dataset β€” PyTorch Dataset Implementation
4
+
5
+ [Data Flow ASCII Diagram]
6
+ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
7
+ β”‚ MiRNATargetDataset Data Flow β”‚
8
+ β”‚ β”‚
9
+ β”‚ CSV file (train.csv / val.csv / test.csv) β”‚
10
+ β”‚ β”‚ β”‚
11
+ β”‚ β–Ό β”‚
12
+ β”‚ pd.read_csv() ─→ DataFrame (loaded entirely into memory) β”‚
13
+ β”‚ β”‚ β”‚
14
+ β”‚ β–Ό β”‚
15
+ β”‚ __getitem__(idx) ─→ retrieve row idx β”‚
16
+ β”‚ β”‚ β”‚
17
+ β”‚ β”œβ”€β†’ mirna_seq: "ATCGATCG" β”‚
18
+ β”‚ β”‚ β”‚ β”‚
19
+ β”‚ β”‚ β–Ό β”‚
20
+ β”‚ β”‚ dna_to_rna() ─→ "AUCGAUCG" (Tβ†’U conversion) β”‚
21
+ β”‚ β”‚ β”‚ β”‚
22
+ β”‚ β”‚ β–Ό β”‚
23
+ β”‚ β”‚ batch_converter([("mirna", "AUCGAUCG")]) β”‚
24
+ β”‚ β”‚ β”‚ β”‚
25
+ β”‚ β”‚ β–Ό β”‚
26
+ β”‚ β”‚ tokens: tensor([0, 4, 7, 5, 6, ...., 2]) β”‚
27
+ β”‚ β”‚ ^^BOS ^^EOS β”‚
28
+ β”‚ β”‚ β”‚
29
+ β”‚ β”œβ”€β†’ target_fragment_40nt: "TAGCTAGC..." β”‚
30
+ β”‚ β”‚ β”‚ (same dna_to_rna + batch_converter pipeline) β”‚
31
+ β”‚ β”‚ β–Ό β”‚
32
+ β”‚ β”‚ tokens: tensor([0, ..., 2]) (fixed 42 tokens: BOS+40nt+EOS)β”‚
33
+ β”‚ β”‚ β”‚
34
+ β”‚ └─→ return dict: β”‚
35
+ β”‚ { β”‚
36
+ β”‚ 'mirna_tokens': 1D LongTensor (variable 17-32) β”‚
37
+ β”‚ 'target_tokens': 1D LongTensor (fixed 42) β”‚
38
+ β”‚ 'label': float32 scalar (0.0 or 1.0) β”‚
39
+ β”‚ 'metadata': dict (species, mirna_name, ...) β”‚
40
+ β”‚ } β”‚
41
+ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
42
+
43
+ [RNA-FM batch_converter Input/Output Format]
44
+ - Input: List[Tuple[str, str]] = [("label_name", "RNA_sequence")]
45
+ e.g.: [("mirna", "AUCGAUCG")]
46
+
47
+ - Output: Tuple[List[str], List[str], Tensor]
48
+ - labels: ["mirna"] β€” label list (not used by us)
49
+ - strs: ["AUCGAUCG"] β€” raw sequences (not used by us)
50
+ - tokens: tensor([[0, 4, 7, 5, 6, 4, 7, 5, 6, 2]])
51
+ shape = (batch=1, seq_len)
52
+ where 0=BOS(<cls>), 2=EOS(<eos>), 1=PAD(<pad>)
53
+ A=4, C=5, G=6, U=7
54
+
55
+ - Important: batch_converter already adds BOS and EOS for us!
56
+ So 22nt miRNA β†’ 24 tokens (BOS + 22nt + EOS)
57
+ 40nt target β†’ 42 tokens (BOS + 40nt + EOS)
58
+ """
59
+
60
+ from __future__ import annotations
61
+
62
+ import pandas as pd
63
+ import torch
64
+ from torch.utils.data import Dataset
65
+
66
+ from deepmirt.data_module.preprocessing import dna_to_rna
67
+
68
+
69
class MiRNATargetDataset(Dataset):
    """
    PyTorch Dataset for miRNA-target pairs.

    [Overview]
    Loads miRNA-target sequence pairs from a CSV file, tokenizes them using
    the RNA-FM alphabet, and returns token tensors and labels for training.

    [Usage]
    >>> import fm
    >>> _, alphabet = fm.pretrained.rna_fm_t12()
    >>> ds = MiRNATargetDataset('path/to/train.csv', alphabet)
    >>> sample = ds[0]
    >>> sample['mirna_tokens']  # tensor([0, 4, 7, 5, ..., 2])
    >>> sample['label']         # tensor(1.)

    [Why inherit from torch.utils.data.Dataset?]
    - It is the standard PyTorch interface for data loading
    - After defining __len__ and __getitem__, it can be used with DataLoader
    - DataLoader automatically handles batching, multi-process loading, shuffling, etc.
    """

    def __init__(
        self,
        csv_path: str,
        alphabet,
        max_mirna_len: int = 30,
        max_target_len: int = 40,
    ):
        """
        Initialize the dataset.

        Args:
            csv_path (str): Path to the CSV file, which must contain the following columns:
                - mirna_seq: miRNA sequence (DNA notation)
                - target_fragment_40nt: target fragment sequence (DNA notation)
                - label: binary label (0 or 1)
                - species, mirna_name, target_gene_name: metadata columns
            alphabet: RNA-FM alphabet object that provides tokenization capability
            max_mirna_len (int): maximum nucleotide length for miRNA, default 30
                (actual token count = max_mirna_len + 2, due to BOS and EOS)
            max_target_len (int): maximum nucleotide length for target, default 40
                (actual token count = max_target_len + 2 = 42)

        NOTE(review): max_mirna_len / max_target_len are stored but not
        enforced anywhere in this class (no truncation or validation in
        __getitem__) — presumably the CSV already guarantees lengths; confirm.

        [Design Decision: Memory Strategy]
        We use pd.read_csv() to load the entire CSV into a DataFrame at once.
        This is the simplest approach — for our data scale (~5.4 million training rows),
        the DataFrame occupies approximately 2-3 GB of memory.

        The current system has 1TB RAM, so this is not an issue at all.

        # Design decision: if memory is limited (e.g., 8GB), consider these alternatives:
        # 1. Byte-offset indexing: first pass records byte positions of each row in the file,
        #    __getitem__ uses file.seek(offset) to jump to and read that row
        # 2. Memory mapping (mmap): open the file with mmap, read on demand
        # 3. Chunked reading: load in chunks, combined with LRU cache
        # These methods sacrifice code simplicity for lower memory usage
        """
        super().__init__()

        # Save configuration parameters
        self.csv_path = csv_path
        self.alphabet = alphabet
        self.max_mirna_len = max_mirna_len
        self.max_target_len = max_target_len

        # Get batch_converter for tokenization
        # batch_converter is the tokenization tool provided by RNA-FM, converting RNA strings to token IDs
        self.batch_converter = alphabet.get_batch_converter()

        # Design decision: load entire CSV into memory (see docstring above for details)
        # On a 1TB RAM system, 5.4 million rows ≈ 2-3 GB, easily affordable
        self.df = pd.read_csv(
            csv_path,
            # Force string dtype so gene names/ids like "0123" are not parsed as numbers.
            dtype={"target_gene_name": str, "target_gene_id": str},
        )

    def __len__(self) -> int:
        """
        Return the number of samples in the dataset.

        DataLoader calls this method to determine how many steps per epoch.
        e.g.: len(dataset)=557521, batch_size=128 → ~4356 steps per epoch
        """
        return len(self.df)

    def __getitem__(self, idx: int) -> dict:
        """
        Retrieve the idx-th sample, returning a dict of tokenized tensors.

        [Processing Pipeline]
        1. Extract row idx from the DataFrame
        2. Get mirna_seq and target_fragment_40nt
        3. Apply dna_to_rna() for T→U conversion
        4. Tokenize with RNA-FM batch_converter
        5. Assemble and return the dict

        Args:
            idx (int): sample index, range [0, len(self)-1]

        Returns:
            dict: containing the following key-value pairs:
                - 'mirna_tokens': 1D LongTensor, miRNA token sequence
                  shape = (mirna_len+2,), including BOS and EOS
                - 'target_tokens': 1D LongTensor, target token sequence
                  shape = (42,), fixed length (BOS + 40nt + EOS)
                - 'label': float32 scalar tensor (0.0 or 1.0)
                - 'metadata': dict, containing species, mirna_name, target_gene_name
        """
        # ── Step 1: Extract one row from the DataFrame ──
        row = self.df.iloc[idx]

        # ── Step 2: Extract sequences and label ──
        mirna_seq_raw = row["mirna_seq"]
        target_seq_raw = row["target_fragment_40nt"]
        label = row["label"]

        # ── Step 3: DNA-to-RNA conversion (T → U) ──
        # Sequences in the dataset use DNA notation (T for thymine),
        # but the RNA-FM model expects RNA notation (U for uridine), so conversion is needed
        mirna_rna = dna_to_rna(mirna_seq_raw)
        target_rna = dna_to_rna(target_seq_raw)

        # ── Step 4: Tokenize using RNA-FM batch_converter ──
        # batch_converter input format: List[Tuple[label, sequence]]
        # It automatically adds BOS(<cls>=0) and EOS(<eos>=2) tokens around the sequence
        #
        # e.g.: [("mirna", "AUCG")]
        #       output tokens: tensor([[0, 4, 7, 5, 6, 2]])
        #                       BOS=0  A  U  C  G  EOS=2
        #
        # Here we process only 1 sequence at a time (batch_size=1),
        # so we use tokens[0] to extract the first one, yielding a 1D tensor

        # Tokenize miRNA
        _, _, mirna_tokens = self.batch_converter([("mirna", mirna_rna)])
        mirna_tokens = mirna_tokens[0]  # (1, seq_len) → (seq_len,)

        # Tokenize target
        _, _, target_tokens = self.batch_converter([("target", target_rna)])
        target_tokens = target_tokens[0]  # (1, 42) → (42,)

        # ── Step 5: Assemble the return dict ──
        # Why use float32 for label?
        # Because training uses BCEWithLogitsLoss (binary cross-entropy),
        # which requires both target and prediction to be float type.
        # If label is int/long, PyTorch will raise a type mismatch error.
        return {
            "mirna_tokens": mirna_tokens,    # 1D LongTensor, variable (17-32)
            "target_tokens": target_tokens,  # 1D LongTensor, fixed 42
            "label": torch.tensor(label, dtype=torch.float32),  # scalar float32
            "metadata": {
                "species": row["species"],
                "mirna_name": row["mirna_name"],
                "target_gene_name": row["target_gene_name"],
                # .get() tolerates CSVs that lack these optional columns.
                "evidence_type": row.get("evidence_type", ""),
                "source_database": row.get("source_database", ""),
            },
        }
deepmirt/data_module/preprocessing.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Data Preprocessing Utilities β€” RNA Sequence Format Conversion Module
4
+
5
+ This module converts DNA-notation sequences in the dataset to the RNA notation
6
+ format required by the RNA-FM model.
7
+
8
+ [Why is this conversion needed?]
9
+ - The RNA-FM model was trained on RNA sequences and expects input in RNA notation: A, U, G, C
10
+ - Our dataset stores sequences in DNA notation: A, T, G, C (where T replaces U)
11
+ - During training, DNA notation T must be converted to RNA notation U to match the model's expected input format
12
+
13
+ [Architecture Position]
14
+ - This module is called by Dataset.__getitem__() during training
15
+ - The conversion happens at the data loading stage without modifying the original CSV files
16
+ - Reference: finalize_dataset.py:86-93 performs the reverse operation (U→T) for data export
17
+
18
+ [Design Decisions]
19
+ - Conversion is performed online (in the Dataset) rather than preprocessing the CSV, to preserve original data integrity
20
+ - All sequences are converted to uppercase to ensure format consistency
21
+ - The character N (representing ambiguous bases) is allowed; RNA-FM can handle ambiguous bases
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+
27
def dna_to_rna(seq: str) -> str:
    """
    Convert a DNA-notation sequence to an RNA-notation sequence.

    [Description]
    - Converts T (thymine, DNA) to U (uridine, RNA)
    - Converts to uppercase
    - Removes all whitespace characters
    - Idempotent: sequences already in RNA format remain unchanged

    [Design Decisions]
    - Why convert online? To keep the original CSV data intact for auditing and reproducibility
    - Why uppercase? To ensure consistency with the RNA-FM model's expected input format
    - Why allow N? RNA-FM's tokenizer can handle ambiguous bases

    Args:
        seq (str): DNA-notation sequence string, may contain A, T, G, C, N and whitespace

    Returns:
        str: RNA-notation sequence string, containing A, U, G, C, N (uppercase, no whitespace)

    Example:
        >>> dna_to_rna('ATCGATCG')
        'AUCGAUCG'
        >>> dna_to_rna('atcg')  # mixed case
        'AUCG'
        >>> dna_to_rna('AUCGAUCG')  # already RNA format (idempotent)
        'AUCGAUCG'
        >>> dna_to_rna('ATC NGATCG')  # contains N and whitespace
        'AUCNGAUCG'
        >>> dna_to_rna(' ATC G ')  # leading/trailing whitespace
        'AUCG'
    """
    # Step 1: Convert to uppercase (so lowercase 't' is also converted below)
    seq = str(seq).upper()

    # Step 2: Remove ALL whitespace. str.split() with no argument splits on any
    # whitespace run (space, tab, \n, \r, \f, \v, ...), so joining the pieces
    # removes every whitespace character — the previous chained .replace()
    # calls missed form-feed and vertical-tab, contradicting the docstring.
    seq = "".join(seq.split())

    # Step 3: Convert T (DNA thymine) to U (RNA uridine)
    return seq.replace("T", "U")
70
+
71
+
72
def validate_rna_sequence(seq: str, min_len: int = 5, max_len: int = 100) -> bool:
    """
    Validate whether a sequence is in valid RNA format.

    [Description]
    - Checks that the sequence contains only valid RNA characters: A, U, G, C, N
    - Checks that the sequence length is within the specified range
    - If it contains T, the DNA-to-RNA conversion was not performed; returns False

    [Design Decisions]
    - Why reject T? It serves as an indicator of conversion failure, aiding data flow debugging.
      T is simply not in the valid character set, so no special-case branch is needed
      (the original code had a dead `if char == "T"` branch before an unconditional return).
    - Why allow N? RNA-FM's tokenizer supports ambiguous bases
    - Why impose length limits? To prevent abnormally long sequences from causing memory overflow

    Args:
        seq (str): the sequence string to validate
        min_len (int): minimum length (inclusive), default 5
        max_len (int): maximum length (inclusive), default 100

    Returns:
        bool: True if the sequence is valid, False otherwise

    Example:
        >>> validate_rna_sequence('AUCGAUCG', 5, 30)
        True
        >>> validate_rna_sequence('ATCGATCG', 5, 30)  # contains T (DNA notation)
        False
        >>> validate_rna_sequence('AU', 5, 30)  # too short
        False
        >>> validate_rna_sequence('A' * 31, 5, 30)  # too long
        False
        >>> validate_rna_sequence('AUCNGAUCG', 5, 30)  # contains N (valid)
        True
    """
    # Length must be within [min_len, max_len]
    if not min_len <= len(seq) <= max_len:
        return False

    # Every character must come from the valid RNA alphabet.
    # Set containment covers all invalid characters, including DNA-notation T.
    return set(seq) <= {"A", "U", "G", "C", "N"}
123
+
124
+
125
def prepare_rnafm_input(mirna_seq: str, target_seq: str) -> tuple[str, str]:
    """
    Prepare an input sequence pair for the RNA-FM model.

    [Description]
    - Converts both the miRNA and the target sequence to RNA notation
    - Returns two separate strings (not concatenated)
    - RNA-FM uses a shared encoder that processes each sequence independently

    [Design Decisions]
    - Why not concatenate? The dual-encoder runs each sequence through its own
      forward pass; joining them would break the architectural design
    - Returning a tuple makes unpacking in Dataset.__getitem__() straightforward

    Args:
        mirna_seq (str): miRNA sequence (DNA notation)
        target_seq (str): target sequence (DNA notation)

    Returns:
        tuple[str, str]: (mirna_rna, target_rna), both in RNA notation

    Example:
        >>> mirna_rna, target_rna = prepare_rnafm_input('ATCG', 'TAGC')
        >>> mirna_rna
        'AUCG'
        >>> target_rna
        'UAGC'
    """
    # Each sequence is converted independently; no concatenation happens here.
    return dna_to_rna(mirna_seq), dna_to_rna(target_seq)
158
+
159
+
160
def compute_sequence_stats(csv_path: str, sample_n: int = 10000) -> dict:
    """
    Compute statistics for sequences in a CSV file.

    [Description]
    - Samples a specified number of rows from the CSV file
    - Computes sequence length distributions, character frequencies, DNA notation detection, etc.
    - Used for data quality checks and analysis

    [Design Decisions]
    - Why lazy-import pandas? To avoid introducing a heavy dependency at module load time
    - Sampling instead of full processing speeds up statistics computation
    - All counts are cast to plain Python int so the result dict is JSON-serializable
      (pandas/numpy reductions otherwise return numpy scalar types)

    Args:
        csv_path (str): path to the CSV file; must contain 'mirna_seq' and
            'target_fragment_40nt' columns
        sample_n (int): number of rows to sample, default 10000. If the file has
            fewer rows, all rows are used

    Returns:
        dict: statistics dictionary containing the following keys:
            - 'total_rows': total number of rows in the file (excluding header)
            - 'sample_rows': actual number of sampled rows
            - 'mirna_length_min' / 'mirna_length_max' / 'mirna_length_mean'
            - 'target_length_min' / 'target_length_max' / 'target_length_mean'
            - 'mirna_char_freq': miRNA character frequency dictionary
            - 'target_char_freq': target sequence character frequency dictionary
            - 'mirna_with_t_count': number of miRNA sequences containing T
            - 'target_with_t_count': number of target sequences containing T

    Example:
        >>> stats = compute_sequence_stats('deepmirt/data/training/train.csv', sample_n=100)
        >>> print(f"Total rows: {stats['total_rows']}")
        >>> print(f"miRNA length range: {stats['mirna_length_min']}-{stats['mirna_length_max']}")
    """
    # Lazy imports: keep heavy/optional dependencies out of module import time.
    from collections import Counter

    import pandas as pd

    df = pd.read_csv(csv_path)
    total_rows = len(df)

    # Cap the sample size at the actual number of rows.
    actual_sample_n = min(sample_n, total_rows)
    if actual_sample_n < total_rows:
        # Fixed random_state keeps the statistics reproducible across runs.
        sample_df = df.sample(n=actual_sample_n, random_state=42)
    else:
        sample_df = df

    stats = {
        'total_rows': total_rows,
        'sample_rows': len(sample_df),
    }

    # miRNA sequence length statistics
    mirna_lengths = sample_df['mirna_seq'].str.len()
    stats['mirna_length_min'] = int(mirna_lengths.min())
    stats['mirna_length_max'] = int(mirna_lengths.max())
    stats['mirna_length_mean'] = float(mirna_lengths.mean())

    # Target sequence length statistics
    target_lengths = sample_df['target_fragment_40nt'].str.len()
    stats['target_length_min'] = int(target_lengths.min())
    stats['target_length_max'] = int(target_lengths.max())
    stats['target_length_mean'] = float(target_lengths.mean())

    def compute_char_freq(seq_series):
        """Count character occurrences across all (uppercased) sequences."""
        # Counter.update over a string counts each character in C code —
        # clearer and faster than a hand-rolled dict loop.
        freq = Counter()
        for seq in seq_series:
            freq.update(str(seq).upper())
        return dict(freq)

    stats['mirna_char_freq'] = compute_char_freq(sample_df['mirna_seq'])
    stats['target_char_freq'] = compute_char_freq(sample_df['target_fragment_40nt'])

    # Count sequences still containing T (DNA notation, conversion not yet applied).
    # int(...) converts the numpy int64 sum to a plain Python int.
    stats['mirna_with_t_count'] = int(
        sample_df['mirna_seq'].str.contains('T', case=False, na=False).sum()
    )
    stats['target_with_t_count'] = int(
        sample_df['target_fragment_40nt'].str.contains('T', case=False, na=False).sum()
    )

    return stats
deepmirt/evaluation/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """miRNA target prediction model β€” comprehensive evaluation framework."""
deepmirt/evaluation/predict.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Inference engine: load checkpoint and generate prediction DataFrame on the test set.
4
+
5
+ Independent of Lightning trainer.test(), performs batch inference directly and
6
+ retains all metadata. Prediction results are cached as parquet to avoid repeated inference.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ from pathlib import Path
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import torch
17
+ import yaml
18
+ from torch.utils.data import DataLoader
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
def load_model_from_checkpoint(
    ckpt_path: str,
    config_path: str,
    device: str = "cuda",
):
    """
    Load a trained model from checkpoint.

    Reads the training YAML config, restores the Lightning module from the
    checkpoint onto the requested device, and switches it to eval mode.

    Args:
        ckpt_path: path to the checkpoint file
        config_path: path to the training config YAML
        device: inference device

    Returns:
        (model, config) tuple
    """
    # Imported lazily to avoid a hard dependency at module import time.
    from deepmirt.training.lightning_module import MiRNATargetLitModule

    with open(config_path) as cfg_file:
        cfg = yaml.safe_load(cfg_file)

    model = MiRNATargetLitModule.load_from_checkpoint(
        ckpt_path, config=cfg, map_location=device
    )
    # Inference-only: disable dropout/batchnorm updates and move to device.
    model.eval()
    model.to(device)
    return model, cfg
50
+
51
+
52
def run_inference(
    ckpt_path: str,
    config_path: str,
    test_csv_path: str,
    batch_size: int = 256,
    num_workers: int = 8,
    device: str = "cuda",
    cache_path: str | None = None,
) -> pd.DataFrame:
    """
    Run model inference on the test set, returning a DataFrame with predictions and metadata.

    If cache_path exists, loads cached results directly (no inference is run).

    Args:
        ckpt_path: path to the checkpoint
        config_path: path to the config YAML
        test_csv_path: path to test.csv
        batch_size: inference batch size
        num_workers: number of DataLoader worker processes
        device: inference device
        cache_path: cache file path (parquet or csv, chosen by extension);
            None disables caching

    Returns:
        DataFrame with columns:
            mirna_seq, target_fragment_40nt, label, prob, pred, logit,
            species, mirna_name, target_gene_name, evidence_type, source_database
    """
    # Check cache (supports both parquet and csv formats, chosen by file extension)
    if cache_path and Path(cache_path).exists():
        logger.info(f"Loading cached predictions from {cache_path}")
        if cache_path.endswith(".parquet"):
            return pd.read_parquet(cache_path)
        else:
            return pd.read_csv(cache_path)

    logger.info(f"Loading model from {ckpt_path}")
    lit_model, config = load_model_from_checkpoint(ckpt_path, config_path, device)

    # Load data (using DataModule approach for consistency).
    # Imported lazily so this module can be imported without fm installed.
    import fm

    from deepmirt.data_module.datamodule import MiRNATargetDataModule
    from deepmirt.data_module.dataset import MiRNATargetDataset

    # Only the alphabet/tokenizer is needed here; discard the second RNA-FM
    # model instance immediately to free its memory.
    _, alphabet = fm.pretrained.rna_fm_t12()
    del _
    padding_idx = alphabet.padding_idx

    dataset = MiRNATargetDataset(test_csv_path, alphabet)

    # Use the DataModule's collate_fn logic without running its __init__
    # (which may set up training-only state). NOTE(review): this assumes
    # _collate_fn only reads _padding_idx from self — confirm if the
    # DataModule changes.
    dm = MiRNATargetDataModule.__new__(MiRNATargetDataModule)
    dm._padding_idx = padding_idx

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,  # preserve CSV row order so results align with raw_df below
        num_workers=num_workers,
        pin_memory=True,
        collate_fn=dm._collate_fn,
    )

    # Inference accumulators: logits/labels as tensors, metadata as string lists.
    all_logits = []
    all_labels = []
    all_metadata = {
        "species": [],
        "mirna_name": [],
        "target_gene_name": [],
        "evidence_type": [],
        "source_database": [],
    }

    logger.info(f"Running inference on {len(dataset)} samples...")
    with torch.no_grad():
        for batch_idx, batch in enumerate(dataloader):
            mirna_tokens = batch["mirna_tokens"].to(device)
            target_tokens = batch["target_tokens"].to(device)
            labels = batch["labels"]  # stays on CPU; only collected, not used on GPU
            attn_mask_mirna = batch["attention_mask_mirna"].to(device)
            attn_mask_target = batch["attention_mask_target"].to(device)

            logits = lit_model.model(
                mirna_tokens, target_tokens, attn_mask_mirna, attn_mask_target
            )
            logits = logits.squeeze(-1).cpu()

            all_logits.append(logits)
            all_labels.append(labels)

            # Pad metadata with empty strings when a key is absent from the batch
            # so all metadata lists stay the same length as the predictions.
            metadata = batch.get("metadata", {})
            for key in all_metadata:
                if key in metadata:
                    all_metadata[key].extend(metadata[key])
                else:
                    all_metadata[key].extend([""] * len(labels))

            if (batch_idx + 1) % 500 == 0:
                logger.info(
                    f"  Processed {(batch_idx + 1) * batch_size} / {len(dataset)}"
                )

    all_logits = torch.cat(all_logits).numpy()
    all_labels = torch.cat(all_labels).numpy()
    all_probs = 1.0 / (1.0 + np.exp(-all_logits))  # sigmoid
    all_preds = (all_probs >= 0.5).astype(int)  # fixed 0.5 decision threshold

    # Build raw sequence columns (read directly from CSV).
    # NOTE(review): assumes MiRNATargetDataset preserves CSV row order, so the
    # prediction arrays line up with raw_df — verify against the Dataset class.
    raw_df = pd.read_csv(
        test_csv_path,
        usecols=["mirna_seq", "target_fragment_40nt"],
        dtype=str,
    )

    result_df = pd.DataFrame(
        {
            "mirna_seq": raw_df["mirna_seq"].values,
            "target_fragment_40nt": raw_df["target_fragment_40nt"].values,
            "label": all_labels.astype(int),
            "prob": all_probs,
            "pred": all_preds,
            "logit": all_logits,
            "species": all_metadata["species"],
            "mirna_name": all_metadata["mirna_name"],
            "target_gene_name": all_metadata["target_gene_name"],
            "evidence_type": all_metadata["evidence_type"],
            "source_database": all_metadata["source_database"],
        }
    )

    # Cache results (prefer parquet, fallback to csv)
    if cache_path:
        Path(cache_path).parent.mkdir(parents=True, exist_ok=True)
        try:
            if cache_path.endswith(".parquet"):
                result_df.to_parquet(cache_path, index=False)
            else:
                result_df.to_csv(cache_path, index=False)
        except ImportError:
            # pandas raises ImportError from to_parquet when no parquet engine
            # (pyarrow/fastparquet) is installed — fall back to csv.
            csv_path = cache_path.replace(".parquet", ".csv")
            result_df.to_csv(csv_path, index=False)
            logger.info(f"pyarrow not available, saved as CSV: {csv_path}")
            cache_path = csv_path
        logger.info(f"Predictions cached to {cache_path}")

    logger.info(
        f"Inference complete: {len(result_df)} samples, "
        f"pos={result_df['label'].sum()}, neg={(result_df['label'] == 0).sum()}"
    )
    return result_df
205
+
206
+
207
def predict_on_sequences(
    ckpt_path: str,
    config_path: str,
    mirna_seqs: list[str],
    target_seqs: list[str],
    batch_size: int = 256,
    device: str = "cuda",
    _lit_model=None,
    _alphabet=None,
) -> np.ndarray:
    """
    Run inference on arbitrary miRNA + target sequence pairs.

    Used to run our model on external data such as miRBench standard benchmark datasets.
    Sequences are automatically converted to RNA format (T->U).

    Args:
        ckpt_path: path to the checkpoint (ignored when _lit_model is given)
        config_path: path to the config YAML (ignored when _lit_model is given)
        mirna_seqs: list of miRNA sequences (DNA or RNA format accepted)
        target_seqs: list of target sequences (DNA or RNA format, should be 40nt);
            must be the same length as mirna_seqs (pairs are zipped positionally)
        batch_size: inference batch size
        device: inference device
        _lit_model: pre-loaded model (internal use, for caching across calls)
        _alphabet: pre-loaded alphabet (internal use, for caching across calls)

    Returns:
        numpy array of predicted probabilities, shape (n_samples,)
    """
    # Lazy imports keep fm optional at module import time.
    import fm
    from torch.nn.utils.rnn import pad_sequence

    # Reuse a caller-supplied model if provided to avoid reloading the checkpoint.
    if _lit_model is not None:
        lit_model = _lit_model
    else:
        logger.info(f"Loading model from {ckpt_path}")
        lit_model, config = load_model_from_checkpoint(ckpt_path, config_path, device)

    if _alphabet is not None:
        alphabet = _alphabet
    else:
        # Only the alphabet is needed; drop the RNA-FM model instance right away.
        _, alphabet = fm.pretrained.rna_fm_t12()
        del _
    batch_converter = alphabet.get_batch_converter()
    padding_idx = alphabet.padding_idx

    def _to_rna(seq: str) -> str:
        # NOTE(review): unlike preprocessing.dna_to_rna, this does not strip
        # whitespace from the input — confirm callers pass clean sequences.
        return seq.upper().replace("T", "U")

    all_probs = []
    n_samples = len(mirna_seqs)
    logger.info(f"Running inference on {n_samples} sequences...")

    with torch.no_grad():
        for i in range(0, n_samples, batch_size):
            batch_mirna = mirna_seqs[i : i + batch_size]
            batch_target = target_seqs[i : i + batch_size]

            # Tokenize each pair; batch_converter wraps sequences with BOS/EOS,
            # and [0] extracts the single tokenized sequence as a 1D tensor.
            mirna_tokens_list = []
            target_tokens_list = []
            for m_seq, t_seq in zip(batch_mirna, batch_target):
                m_rna = _to_rna(str(m_seq))
                t_rna = _to_rna(str(t_seq))
                _, _, m_tok = batch_converter([("m", m_rna)])
                _, _, t_tok = batch_converter([("t", t_rna)])
                mirna_tokens_list.append(m_tok[0])
                target_tokens_list.append(t_tok[0])

            # miRNAs have variable length and must be padded; targets are
            # stacked directly (torch.stack requires them to be equal-length,
            # i.e. all 40nt as documented above).
            mirna_padded = pad_sequence(
                mirna_tokens_list, batch_first=True, padding_value=padding_idx
            )
            target_stacked = torch.stack(target_tokens_list)

            # 1 = real token, 0 = padding; targets are unpadded so all-ones.
            attn_mask_mirna = (mirna_padded != padding_idx).long()
            attn_mask_target = torch.ones_like(target_stacked, dtype=torch.long)

            mirna_padded = mirna_padded.to(device)
            target_stacked = target_stacked.to(device)
            attn_mask_mirna = attn_mask_mirna.to(device)
            attn_mask_target = attn_mask_target.to(device)

            logits = lit_model.model(
                mirna_padded, target_stacked, attn_mask_mirna, attn_mask_target
            )
            # Model emits raw logits; apply sigmoid here to get probabilities.
            probs = torch.sigmoid(logits.squeeze(-1)).cpu().numpy()
            all_probs.append(probs)

            if (i // batch_size + 1) % 100 == 0:
                logger.info(f"  Processed {min(i + batch_size, n_samples)} / {n_samples}")

    return np.concatenate(all_probs)
deepmirt/model/__init__.py ADDED
File without changes
deepmirt/model/classifier.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # pyright: basic, reportMissingImports=false
3
+ """
4
+ MLP classifier head (maps sequence representations to binary classification logits).
5
+
6
+ Architecture diagram:
7
+
8
+ pooled_feature (B, 640)
9
+ |
10
+ v
11
+ Linear(640 -> 256)
12
+ |
13
+ v
14
+ BatchNorm + ReLU + Dropout(0.3)
15
+ |
16
+ v
17
+ Linear(256 -> 64) + ReLU + Dropout(0.2)
18
+ |
19
+ v
20
+ Linear(64 -> 1)
21
+ |
22
+ v
23
+ logits (B, 1)
24
+
25
+ Note:
26
+ - The output is logits (raw scores); do not apply sigmoid inside the model.
27
+ - During training, use BCEWithLogitsLoss which applies sigmoid internally for numerical stability.
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ from collections.abc import Sequence
33
+
34
+ from torch import Tensor, nn
35
+
36
+
37
+ class MLPClassifier(nn.Module):
38
+ """MLP head for binary classification, outputting a single logit."""
39
+
40
+ def __init__(
41
+ self,
42
+ input_dim: int = 640,
43
+ hidden_dims: Sequence[int] | None = None,
44
+ dropout: float = 0.3,
45
+ ) -> None:
46
+ super().__init__()
47
+ dims = list(hidden_dims) if hidden_dims is not None else [256, 64]
48
+ if len(dims) != 2:
49
+ raise ValueError("hidden_dims must contain exactly two elements, e.g. [256, 64].")
50
+
51
+ hidden1, hidden2 = int(dims[0]), int(dims[1])
52
+ in_dim = int(input_dim)
53
+
54
+ # Design decision: [256, 64] balances expressiveness and overfitting risk,
55
+ # suitable for small-to-medium scale biological data.
56
+ # Design decision: first layer uses BatchNorm + Dropout; second layer retains
57
+ # a smaller Dropout for lightweight regularization.
58
+ self.layers = nn.Sequential(
59
+ nn.Linear(in_dim, hidden1),
60
+ nn.BatchNorm1d(hidden1),
61
+ nn.ReLU(),
62
+ nn.Dropout(dropout),
63
+ nn.Linear(hidden1, hidden2),
64
+ nn.ReLU(),
65
+ nn.Dropout(0.2),
66
+ nn.Linear(hidden2, 1),
67
+ )
68
+
69
+ def forward(self, x: Tensor) -> Tensor:
70
+ """
71
+ Args:
72
+ x: Pooled sequence representation, shape `(batch, input_dim)`.
73
+
74
+ Returns:
75
+ Logits, shape `(batch, 1)`.
76
+ """
77
+ return self.layers(x)
deepmirt/model/cross_attention.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # pyright: basic, reportMissingImports=false
3
+ """
4
+ Cross-Attention interaction module.
5
+
6
+ Data flow diagram (target as Query, miRNA as Key/Value)::
7
+
8
+ target_emb (B, T, D) -------------------------------> Q
9
+ |
10
+ | Multi-Head Cross Attention
11
+ | (batch_first=True)
12
+ |
13
+ miRNA_emb (B, M, D) ---> K, V -------------------->
14
+
15
+ Output: context_target (B, T, D)
16
+
17
+ Why target=Q and miRNA=K/V:
18
+ - Our task is to determine whether a target is regulated by a given miRNA.
19
+ - Having each target position query miRNA information aligns with the semantics
20
+ of locating potential binding sites on the target.
21
+
22
+ Mask convention:
23
+ - key_padding_mask=True indicates a padding position that should be ignored.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import torch
29
+ from torch import Tensor, nn
30
+
31
+
32
+ class CrossAttentionBlock(nn.Module):
33
+ """Interaction module composed of stacked Cross-Attention + FFN layers."""
34
+
35
+ def __init__(
36
+ self,
37
+ embed_dim: int = 640,
38
+ num_heads: int = 8,
39
+ dropout: float = 0.1,
40
+ num_layers: int = 2,
41
+ ) -> None:
42
+ super().__init__()
43
+ self.embed_dim = int(embed_dim)
44
+ self.num_heads = int(num_heads)
45
+ self.num_layers = int(num_layers)
46
+
47
+ self.layers = nn.ModuleList()
48
+ for _ in range(self.num_layers):
49
+ layer = nn.ModuleDict(
50
+ {
51
+ "cross_attn": nn.MultiheadAttention(
52
+ embed_dim=self.embed_dim,
53
+ num_heads=self.num_heads,
54
+ dropout=dropout,
55
+ batch_first=True,
56
+ ),
57
+ "dropout_attn": nn.Dropout(dropout),
58
+ "norm1": nn.LayerNorm(self.embed_dim),
59
+ "ffn": nn.Sequential(
60
+ nn.Linear(self.embed_dim, self.embed_dim * 4),
61
+ nn.ReLU(),
62
+ nn.Dropout(dropout),
63
+ nn.Linear(self.embed_dim * 4, self.embed_dim),
64
+ ),
65
+ "norm2": nn.LayerNorm(self.embed_dim),
66
+ }
67
+ )
68
+ self.layers.append(layer)
69
+
70
+ # Design decision: 2 layers by default is a lightweight yet effective trade-off;
71
+ # establish a trainable baseline first, then deepen based on data scale.
72
+ # Design decision: 8 attention heads by default improves interaction modeling across
73
+ # different subspaces while keeping GPU memory overhead manageable.
74
+
75
+ def forward(
76
+ self,
77
+ query: Tensor,
78
+ key_value: Tensor,
79
+ key_padding_mask: Tensor | None = None,
80
+ ) -> Tensor:
81
+ """
82
+ Args:
83
+ query: Target representation, shape `(batch, target_len, embed_dim)`.
84
+ key_value: miRNA representation, shape `(batch, mirna_len, embed_dim)`.
85
+ key_padding_mask: miRNA padding mask, shape `(batch, mirna_len)`,
86
+ where True indicates positions to ignore.
87
+
88
+ Returns:
89
+ Updated target representation, shape `(batch, target_len, embed_dim)`.
90
+ """
91
+ hidden = query
92
+ attn_mask = key_padding_mask
93
+ if attn_mask is not None and attn_mask.dtype is not torch.bool:
94
+ attn_mask = attn_mask.to(dtype=torch.bool)
95
+
96
+ for layer in self.layers:
97
+ # Step 1: Cross-Attention (target queries miRNA)
98
+ attn_out, _ = layer["cross_attn"](
99
+ query=hidden,
100
+ key=key_value,
101
+ value=key_value,
102
+ key_padding_mask=attn_mask,
103
+ need_weights=False,
104
+ )
105
+
106
+ # Step 2: Residual + LayerNorm to stabilize deep training and mitigate vanishing gradients
107
+ hidden = layer["norm1"](hidden + layer["dropout_attn"](attn_out))
108
+
109
+ # Step 3: Feed-forward network refines channel-wise features
110
+ ffn_out = layer["ffn"](hidden)
111
+
112
+ # Step 4: Residual + LayerNorm
113
+ hidden = layer["norm2"](hidden + ffn_out)
114
+
115
+ return hidden
deepmirt/model/mirna_target_model.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # pyright: basic, reportMissingImports=false
3
+ """
4
+ Full miRNA-target model: shared RNA-FM encoder + Cross-Attention + MLP classifier head.
5
+
6
+ Complete data flow (with tensor shapes):
7
+
8
+ miRNA tokens (B, M_tok) ---> [RNA-FM Encoder] ---> miRNA_emb (B, M, D) ---┐
9
+ |
10
+ v
11
+ target tokens (B, T_tok) ---> [RNA-FM Encoder] ---> target_emb (B, T, D) --> [Cross-Attention]
12
+ |
13
+ v
14
+ cross_out (B, T, D)
15
+ |
16
+ v
17
+ masked mean pool
18
+ |
19
+ v
20
+ (B, D)
21
+ |
22
+ v
23
+ [MLP Head]
24
+ |
25
+ v
26
+ logits
27
+ (B, 1)
28
+
29
+ Where D is automatically inferred from RNA-FM (typically 640) to avoid hard-coding.
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ from collections.abc import Sequence
35
+
36
+ import torch
37
+ from torch import Tensor, nn
38
+
39
+ from .classifier import MLPClassifier
40
+ from .cross_attention import CrossAttentionBlock
41
+ from .rnafm_encoder import RNAFMEncoder
42
+
43
+
44
class MiRNATargetModel(nn.Module):
    """End-to-end model for miRNA-target binary classification.

    Flow: shared RNA-FM encoder for both sequences -> cross-attention
    (target as Query, miRNA as Key/Value) -> masked mean pooling over the
    target axis -> MLP head -> logits of shape `(batch, 1)`.
    """

    def __init__(
        self,
        freeze_backbone: bool = True,
        cross_attn_heads: int = 8,
        cross_attn_layers: int = 2,
        classifier_hidden: Sequence[int] | None = None,
        dropout: float = 0.3,
    ) -> None:
        super().__init__()
        head_dims = [256, 64] if classifier_hidden is None else list(classifier_hidden)

        self.encoder = RNAFMEncoder(freeze_backbone=freeze_backbone)
        dim = self.encoder.embed_dim  # inferred from RNA-FM (typically 640)

        # The interaction stack uses a reduced dropout (~1/3 of the head's)
        # so attention signals are preserved while keeping basic regularization.
        self.cross_attention = CrossAttentionBlock(
            embed_dim=dim,
            num_heads=cross_attn_heads,
            dropout=dropout * 0.33,
            num_layers=cross_attn_layers,
        )
        self.classifier = MLPClassifier(
            input_dim=dim,
            hidden_dims=head_dims,
            dropout=dropout,
        )

    def forward(
        self,
        mirna_tokens: Tensor,
        target_tokens: Tensor,
        attention_mask_mirna: Tensor | None = None,
        attention_mask_target: Tensor | None = None,
    ) -> Tensor:
        """
        Forward pass:
            1) Encode miRNA and target with the shared RNA-FM encoder.
            2) Derive the key_padding_mask (attention_mask: 1=real, 0=padding).
            3) Cross-attention: target (Q) queries miRNA (K/V).
            4) Masked mean pooling over the target sequence axis.
            5) Classifier head emits raw logits of shape `(batch, 1)`.
        """
        # One shared encoder keeps both sequences in the same representation space.
        mirna_repr = self.encoder(mirna_tokens)
        target_repr = self.encoder(target_tokens)

        # nn.MultiheadAttention convention: True marks positions to ignore.
        pad_mask = None if attention_mask_mirna is None else attention_mask_mirna == 0

        interacted = self.cross_attention(
            query=target_repr,
            key_value=mirna_repr,
            key_padding_mask=pad_mask,
        )

        # Masked mean pooling over the target dimension: weight each position
        # by its mask (all-ones when no mask is supplied) and normalize by the
        # number of real tokens, clamped to avoid division by zero.
        if attention_mask_target is None:
            weights = interacted.new_ones(
                interacted.size(0), interacted.size(1), 1
            )
        else:
            weights = attention_mask_target.to(dtype=interacted.dtype).unsqueeze(-1)

        pooled = (interacted * weights).sum(dim=1) / weights.sum(dim=1).clamp_min(1e-6)

        # Raw logits — sigmoid is applied by the loss during training and
        # explicitly at inference time.
        return self.classifier(pooled)
deepmirt/model/rnafm_encoder.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # pyright: basic, reportMissingImports=false
3
+ """
4
+ RNA-FM encoder wrapper (Shared Encoder).
5
+
6
+ Architecture diagram (single-path encoding):
7
+
8
+ Input tokens (B, L)
9
+ |
10
+ v
11
+ [RNA-FM: 12-layer Transformer]
12
+ |
13
+ v
14
+ representations[12] (B, L, D)
15
+ D is typically 640
16
+
17
+ Training strategy diagram (freeze / staged unfreezing):
18
+
19
+ Frozen phase: [L1][L2][L3]...[L12] all requires_grad=False
20
+ Unfrozen phase: [L1]...[L9][L10][L11][L12]
21
+ ^^^^^^^^
22
+ only unfreeze top N layers (e.g., N=3)
23
+
24
+ Notes:
25
+ - Both miRNA and target are RNA sequences, so sharing a single RNA-FM encoder is the most natural approach.
26
+ - `repr_layers=[12]` extracts the 12th (final) layer output as the contextualized representation.
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ from collections.abc import Sequence
32
+
33
+ import fm
34
+ from torch import Tensor, nn
35
+
36
+
37
+ class RNAFMEncoder(nn.Module):
38
+ """Lightweight wrapper around RNA-FM providing forward encoding, freezing, and staged unfreezing."""
39
+
40
+ def __init__(self, freeze_backbone: bool = True) -> None:
41
+ super().__init__()
42
+ self.model, self.alphabet = fm.pretrained.rna_fm_t12()
43
+ self.num_layers = len(self.model.layers)
44
+ self.embed_dim = self._infer_embed_dim(default=640)
45
+
46
+ # Design decision: freeze backbone by default to first stabilize training of the
47
+ # upper interaction module and classifier head, avoiding catastrophic forgetting
48
+ # from full fine-tuning on small datasets.
49
+ if freeze_backbone:
50
+ self.freeze()
51
+
52
+ def _infer_embed_dim(self, default: int = 640) -> int:
53
+ """Try to infer the embedding dimension from the RNA-FM model; fall back to default on failure."""
54
+ model_embed_dim = getattr(self.model, "embed_dim", None)
55
+ if model_embed_dim is not None:
56
+ return int(model_embed_dim)
57
+
58
+ model_args = getattr(self.model, "args", None)
59
+ if model_args is not None and hasattr(model_args, "embed_dim"):
60
+ return int(model_args.embed_dim)
61
+
62
+ embed_tokens = getattr(self.model, "embed_tokens", None)
63
+ if embed_tokens is not None and hasattr(embed_tokens, "embedding_dim"):
64
+ return int(embed_tokens.embedding_dim)
65
+
66
+ return int(default)
67
+
68
+ def forward(self, tokens: Tensor, repr_layers: Sequence[int] | None = None) -> Tensor:
69
+ """
70
+ Encode an RNA token sequence.
71
+
72
+ Args:
73
+ tokens: Token tensor of shape `(batch, seq_len)`.
74
+ repr_layers: List of layer indices to extract. Defaults to `[12]` (final layer).
75
+
76
+ Returns:
77
+ Contextualized representations of shape `(batch, seq_len, embed_dim)`.
78
+ """
79
+ if repr_layers is None:
80
+ # Design decision: use the final layer representation by default (most semantically
81
+ # complete), consistent with common pre-trained model usage.
82
+ repr_layers = [self.num_layers]
83
+
84
+ layer_ids = list(repr_layers)
85
+ if not layer_ids:
86
+ raise ValueError("repr_layers must not be empty; provide at least one layer index.")
87
+
88
+ outputs = self.model(tokens, repr_layers=layer_ids)
89
+ # Note: typically repr_layers=[12] is passed, so this retrieves representations[12].
90
+ final_layer_id = max(layer_ids)
91
+ return outputs["representations"][final_layer_id]
92
+
93
+ def freeze(self) -> None:
94
+ """Freeze all RNA-FM backbone parameters (requires_grad=False)."""
95
+ for param in self.model.parameters():
96
+ param.requires_grad = False
97
+
98
+ def unfreeze(self, num_layers: int = 3) -> None:
99
+ """
100
+ Unfreeze only the per-layer parameters of the top N Transformer layers.
101
+
102
+ Example: when `num_layers=3`, unfreezes layer[9], layer[10], layer[11].
103
+
104
+ Note: global LayerNorm (e.g., emb_layer_norm_after) is NOT unfrozen,
105
+ because unfreezing it would shift the output distribution of all layers at once,
106
+ leading to training instability.
107
+ """
108
+ # Design decision: always freeze all first, then selectively unfreeze, ensuring the
109
+ # set of trainable parameters is controllable and reproducible.
110
+ self.freeze()
111
+
112
+ n = max(0, min(int(num_layers), self.num_layers))
113
+ if n > 0:
114
+ start = self.num_layers - n
115
+ for layer in self.model.layers[start:]:
116
+ for param in layer.parameters():
117
+ param.requires_grad = True
deepmirt/predict.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Public prediction API for DeepMiRT.
4
+
5
+ Provides simple interfaces for miRNA-target interaction prediction:
6
+ - predict(): Python API for sequence pairs
7
+ - predict_from_csv(): Batch prediction from CSV files
8
+ - cli_main(): Command-line entry point
9
+
10
+ Model weights are automatically downloaded from Hugging Face Hub on first use.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import logging
17
+ import re
18
+ import sys
19
+ import warnings
20
+ from pathlib import Path
21
+
22
+ import numpy as np
23
+ import pandas as pd
24
+
25
logger = logging.getLogger(__name__)

# Hugging Face Hub repository that hosts the trained DeepMiRT weights.
HF_REPO_ID = "liuliu2333/deepmirt"
# Checkpoint filename on the Hub (epoch with the best validation AUROC).
HF_CKPT_FILENAME = "epoch=27-val_auroc=0.9612.ckpt"
# Model/training configuration stored alongside the checkpoint.
HF_CONFIG_FILENAME = "config.yaml"

# Accepts DNA (T) or RNA (U) alphabets, upper or lower case; sequences are
# uppercased before matching, and any T->U conversion happens downstream.
_VALID_BASES = re.compile(r"^[AUGCTaugct]+$")

# Module-level cache keyed by device string ("cpu"/"cuda") so the ~495 MB
# model is loaded at most once per device per process.
_model_cache: dict[str, tuple] = {}
37
+
38
+
39
def _get_model_files() -> tuple[str, str]:
    """Fetch the checkpoint and config from the Hugging Face Hub.

    Returns:
        ``(ckpt_path, config_path)`` local file paths. ``hf_hub_download``
        caches on disk, so repeated calls do not re-download.
    """
    from huggingface_hub import hf_hub_download

    ckpt_path, config_path = (
        hf_hub_download(repo_id=HF_REPO_ID, filename=name)
        for name in (HF_CKPT_FILENAME, HF_CONFIG_FILENAME)
    )
    return ckpt_path, config_path
46
+
47
+
48
def _get_cached_model(device: str):
    """Return ``(lit_model, alphabet, ckpt_path, config_path)`` for ``device``.

    The first call per device downloads and loads everything (slow); the
    result is memoised in the module-level ``_model_cache`` so later calls
    are instant.
    """
    cached = _model_cache.get(device)
    if cached is None:
        import fm

        from deepmirt.evaluation.predict import load_model_from_checkpoint

        ckpt_path, config_path = _get_model_files()
        logger.info("Loading DeepMiRT model (first call, will be cached)...")
        lit_model, _config = load_model_from_checkpoint(ckpt_path, config_path, device)
        _, alphabet = fm.pretrained.rna_fm_t12()
        cached = (lit_model, alphabet, ckpt_path, config_path)
        _model_cache[device] = cached
        logger.info("Model loaded and cached.")

    return cached
63
+
64
+
65
def _validate_sequences(
    mirna_seqs: list[str], target_seqs: list[str]
) -> tuple[list[str], list[str]]:
    """Normalise (strip + uppercase) both sequence lists and reject bad input.

    Raises:
        ValueError: On empty sequences or characters outside A/U/G/C/T.

    Emits non-fatal warnings for unusual sequence lengths (miRNA far from
    the typical 18-25 nt, target not exactly 40 nt).
    """
    cleaned_mirna: list[str] = []
    cleaned_target: list[str] = []

    for i, (m_raw, t_raw) in enumerate(zip(mirna_seqs, target_seqs)):
        m = str(m_raw).strip().upper()
        t = str(t_raw).strip().upper()

        # Empty checks first (both sequences), then alphabet checks — this
        # ordering decides which error wins when a pair has multiple problems.
        if not m:
            raise ValueError(f"Empty miRNA sequence at index {i}")
        if not t:
            raise ValueError(f"Empty target sequence at index {i}")

        for label, seq in (("miRNA", m), ("Target", t)):
            if not _VALID_BASES.match(seq):
                bad = set(seq) - set("AUGCT")
                raise ValueError(
                    f"{label} at index {i} contains invalid characters: {bad}. "
                    f"Only A/U/G/C/T are allowed."
                )

        cleaned_mirna.append(m)
        cleaned_target.append(t)

    # Length checks are advisory only — predictions still run, just flagged.
    if any(not (15 <= len(s) <= 30) for s in cleaned_mirna):
        warnings.warn(
            "Some miRNA sequences have unusual length (expected 18-25 nt). "
            "Results may be less reliable.",
            stacklevel=3,
        )
    if any(len(s) != 40 for s in cleaned_target):
        warnings.warn(
            "Some target sequences are not 40 nt. The model was trained on 40-nt "
            "target fragments. Results may be less reliable for other lengths.",
            stacklevel=3,
        )

    return cleaned_mirna, cleaned_target
114
+
115
+
116
def predict(
    mirna_seqs: list[str],
    target_seqs: list[str],
    device: str = "cpu",
    batch_size: int = 256,
) -> np.ndarray:
    """
    Score miRNA-target pairs; returns interaction probabilities in [0, 1].

    Model weights are pulled from the Hugging Face Hub on the first call and
    cached in memory afterwards. Inputs may use DNA (T) or RNA (U) letters;
    conversion is handled internally.

    Args:
        mirna_seqs: miRNA sequences (typically 18-25 nt).
        target_seqs: Target site sequences (40 nt recommended); must be the
            same length as ``mirna_seqs``.
        device: Inference device ("cpu" or "cuda").
        batch_size: Batch size for inference.

    Returns:
        Numpy array of shape ``(n_samples,)`` with values from 0 (no
        interaction) to 1 (strong interaction).

    Raises:
        ValueError: On mismatched list lengths or invalid sequences.

    Example:
        >>> from deepmirt import predict
        >>> probs = predict(
        ...     mirna_seqs=["UGAGGUAGUAGGUUGUAUAGUU"],
        ...     target_seqs=["ACUGCAGCAUAUCUACUAUUUGCUACUGUAACCAUUGAUCU"],
        ... )
        >>> print(f"Interaction probability: {probs[0]:.4f}")
    """
    if len(mirna_seqs) != len(target_seqs):
        raise ValueError(
            f"mirna_seqs and target_seqs must have the same length, "
            f"got {len(mirna_seqs)} and {len(target_seqs)}"
        )
    if not mirna_seqs:
        # Nothing to score — avoid touching the (heavy) model at all.
        return np.array([])

    mirna_seqs, target_seqs = _validate_sequences(mirna_seqs, target_seqs)

    from deepmirt.evaluation.predict import predict_on_sequences

    lit_model, alphabet, ckpt_path, config_path = _get_cached_model(device)

    return predict_on_sequences(
        ckpt_path=ckpt_path,
        config_path=config_path,
        mirna_seqs=mirna_seqs,
        target_seqs=target_seqs,
        batch_size=batch_size,
        device=device,
        _lit_model=lit_model,
        _alphabet=alphabet,
    )
171
+
172
+
173
+ def predict_from_csv(
174
+ csv_path: str,
175
+ output_path: str | None = None,
176
+ device: str = "cpu",
177
+ batch_size: int = 256,
178
+ mirna_col: str = "mirna_seq",
179
+ target_col: str = "target_seq",
180
+ ) -> pd.DataFrame:
181
+ """
182
+ Batch prediction from a CSV file.
183
+
184
+ The CSV must contain columns for miRNA and target sequences.
185
+
186
+ Args:
187
+ csv_path: Path to input CSV file.
188
+ output_path: Path to save results CSV. If None, results are only returned.
189
+ device: Inference device ("cpu" or "cuda").
190
+ batch_size: Batch size for inference.
191
+ mirna_col: Column name for miRNA sequences.
192
+ target_col: Column name for target sequences.
193
+
194
+ Returns:
195
+ DataFrame with original columns plus 'probability' and 'prediction'.
196
+ """
197
+ df = pd.read_csv(csv_path)
198
+
199
+ if mirna_col not in df.columns or target_col not in df.columns:
200
+ raise ValueError(
201
+ f"CSV must contain columns '{mirna_col}' and '{target_col}'. "
202
+ f"Found columns: {list(df.columns)}"
203
+ )
204
+
205
+ mirna_seqs = df[mirna_col].astype(str).tolist()
206
+ target_seqs = df[target_col].astype(str).tolist()
207
+
208
+ probs = predict(mirna_seqs, target_seqs, device=device, batch_size=batch_size)
209
+
210
+ df["probability"] = probs
211
+ df["prediction"] = (probs >= 0.5).astype(int)
212
+
213
+ if output_path:
214
+ Path(output_path).parent.mkdir(parents=True, exist_ok=True)
215
+ df.to_csv(output_path, index=False)
216
+ logger.info(f"Results saved to {output_path}")
217
+
218
+ return df
219
+
220
+
221
def scan_targets(
    mirna_fasta: str | dict[str, str],
    target_fasta: str,
    output_prefix: str | None = None,
    device: str = "cpu",
    batch_size: int = 512,
    prob_threshold: float = 0.5,
    scan_mode: str = "hybrid",
    stride: int = 20,
    top_k: int | None = None,
) -> list:
    """
    Scan target sequences for miRNA binding sites genome-wide.

    Candidate positions are proposed by seed matching and/or sliding windows
    (depending on ``scan_mode``) and each window is scored by the DeepMiRT
    model.

    Args:
        mirna_fasta: miRNA FASTA path, or a ``{id: sequence}`` mapping.
        target_fasta: Target FASTA path (e.g. 3'UTRs or transcripts).
        output_prefix: When given, writes {prefix}_details.txt,
            {prefix}_hits.tsv, and {prefix}_summary.tsv.
        device: Inference device ("cpu" or "cuda").
        batch_size: Batch size for GPU inference.
        prob_threshold: Minimum probability to report a hit (default 0.5).
        scan_mode: "seed" (fastest), "hybrid" (default), or "exhaustive"
            (slowest, stride-1).
        stride: Window stride for hybrid/exhaustive modes (default 20).
        top_k: If set, keep only the top-K hits per miRNA-target pair.

    Returns:
        List of TargetScanResult objects, one per miRNA-target pair with
        hits; each holds ScanHit entries with position, probability, seed
        type, and the 40nt window sequence.

    Example:
        >>> from deepmirt import scan_targets
        >>> results = scan_targets(
        ...     mirna_fasta={"let-7": "UGAGGUAGUAGGUUGUAUAGUU"},
        ...     target_fasta="3utrs.fa",
        ...     output_prefix="results/scan",
        ...     device="cuda",
        ... )
        >>> for r in results:
        ...     for hit in r.hits:
        ...         print(f"{r.target_id} pos={hit.position} prob={hit.probability:.3f}")
    """
    from deepmirt.scanning.scanner import TargetScanner

    scanner_opts = dict(
        device=device,
        batch_size=batch_size,
        prob_threshold=prob_threshold,
        scan_mode=scan_mode,
        stride=stride,
        top_k=top_k,
    )
    return TargetScanner(**scanner_opts).scan(mirna_fasta, target_fasta, output_prefix)
279
+
280
+
281
def _build_parser() -> argparse.ArgumentParser:
    """Construct the `deepmirt-predict` argument parser (single/batch/scan)."""
    parser = argparse.ArgumentParser(
        prog="deepmirt-predict",
        description="DeepMiRT: Predict miRNA-target interactions",
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Single prediction
    single = subparsers.add_parser("single", help="Predict a single miRNA-target pair")
    single.add_argument("--mirna", required=True, help="miRNA sequence")
    single.add_argument("--target", required=True, help="Target sequence (40 nt)")
    single.add_argument("--device", default="cpu", help="Device (cpu or cuda)")

    # Batch prediction
    batch = subparsers.add_parser("batch", help="Batch prediction from CSV")
    batch.add_argument("--input", required=True, help="Input CSV path")
    batch.add_argument("--output", required=True, help="Output CSV path")
    batch.add_argument("--device", default="cpu", help="Device (cpu or cuda)")
    batch.add_argument("--batch-size", type=int, default=256, help="Batch size")
    batch.add_argument("--mirna-col", default="mirna_seq", help="miRNA column name")
    batch.add_argument("--target-col", default="target_seq", help="Target column name")

    # Genome-wide scanning
    scan = subparsers.add_parser(
        "scan", help="Scan target sequences for miRNA binding sites"
    )
    scan_input = scan.add_mutually_exclusive_group(required=True)
    scan_input.add_argument("--mirna-fasta", help="miRNA FASTA file")
    scan_input.add_argument("--mirna", help="Single miRNA sequence (use with --mirna-id)")
    scan.add_argument("--mirna-id", default="query_mirna", help="miRNA ID (with --mirna)")
    scan.add_argument("--target-fasta", required=True, help="Target FASTA file")
    scan.add_argument("--output", required=True, help="Output prefix")
    scan.add_argument("--device", default="cpu", help="Device (cpu or cuda)")
    scan.add_argument("--batch-size", type=int, default=512, help="Batch size")
    scan.add_argument("--threshold", type=float, default=0.5, help="Probability threshold")
    scan.add_argument(
        "--scan-mode", default="hybrid", choices=["seed", "hybrid", "exhaustive"],
        help="Scanning mode (default: hybrid)",
    )
    scan.add_argument("--stride", type=int, default=20, help="Window stride (default: 20)")
    scan.add_argument("--top-k", type=int, default=None, help="Keep top-K hits per target")

    return parser


def _run_single(args: argparse.Namespace) -> None:
    """Handle `single`: score one miRNA-target pair and print the verdict."""
    probs = predict([args.mirna], [args.target], device=args.device)
    prob = probs[0]
    label = "INTERACTION" if prob >= 0.5 else "NO INTERACTION"
    print(f"Probability: {prob:.4f}")
    print(f"Prediction: {label}")


def _run_batch(args: argparse.Namespace) -> None:
    """Handle `batch`: run CSV prediction and report where results went."""
    df = predict_from_csv(
        csv_path=args.input,
        output_path=args.output,
        device=args.device,
        batch_size=args.batch_size,
        mirna_col=args.mirna_col,
        target_col=args.target_col,
    )
    print(f"Processed {len(df)} samples. Results saved to {args.output}")


def _run_scan(args: argparse.Namespace) -> None:
    """Handle `scan`: run genome-wide scanning and summarise the hit counts."""
    if args.mirna:
        # Inline sequence mode: wrap the single sequence as a one-entry dict.
        mirna_input: str | dict[str, str] = {args.mirna_id: args.mirna}
    else:
        mirna_input = args.mirna_fasta

    results = scan_targets(
        mirna_fasta=mirna_input,
        target_fasta=args.target_fasta,
        output_prefix=args.output,
        device=args.device,
        batch_size=args.batch_size,
        prob_threshold=args.threshold,
        scan_mode=args.scan_mode,
        stride=args.stride,
        top_k=args.top_k,
    )
    total_hits = sum(len(r.hits) for r in results)
    print(
        f"Scan complete: {len(results)} miRNA-target pairs, "
        f"{total_hits} hits above threshold {args.threshold}"
    )
    print(f"Results: {args.output}_details.txt, _hits.tsv, _summary.tsv")


def cli_main(argv: list[str] | None = None) -> None:
    """Command-line entry point for deepmirt-predict.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``. Exposed
            as a parameter (backward-compatible) so the CLI can be driven
            programmatically and unit-tested.

    Raises:
        SystemExit: With code 1 when no subcommand is given (after printing
            help); argparse exits with code 2 on invalid arguments.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    logging.basicConfig(level=logging.INFO, format="%(message)s")

    if args.command == "single":
        _run_single(args)
    elif args.command == "batch":
        _run_batch(args)
    elif args.command == "scan":
        _run_scan(args)
    else:
        parser.print_help()
        sys.exit(1)
370
+
371
+
372
if __name__ == "__main__":
    # Allow direct execution (python deepmirt/predict.py) in addition to the
    # installed deepmirt-predict console-script entry point.
    cli_main()
deepmirt/training/__init__.py ADDED
File without changes
deepmirt/training/lightning_module.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # pyright: basic, reportMissingImports=false
3
+ """
4
+ PyTorch Lightning training module for miRNA-target prediction.
5
+
6
+ [Lightning Training Loop Overview -- Full Lifecycle of One Epoch]
7
+
8
+ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
9
+ β”‚ Lifecycle of One Epoch β”‚
10
+ β”‚ β”‚
11
+ β”‚ on_train_epoch_start() β”‚
12
+ β”‚ β”‚ β”‚
13
+ β”‚ v β”‚
14
+ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚
15
+ β”‚ β”‚ for batch in train_dataloader: β”‚ β”‚
16
+ β”‚ β”‚ training_step(batch) β”‚ ← forward + loss β”‚
17
+ β”‚ β”‚ backward() [automatic] β”‚ ← backpropagation β”‚
18
+ β”‚ β”‚ optimizer.step() [automatic] β”‚ ← update params β”‚
19
+ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚
20
+ β”‚ β”‚ β”‚
21
+ β”‚ v β”‚
22
+ β”‚ on_train_epoch_end() β”‚
23
+ β”‚ β”‚ β”‚
24
+ β”‚ v β”‚
25
+ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚
26
+ β”‚ β”‚ for batch in val_dataloader: β”‚ β”‚
27
+ β”‚ β”‚ validation_step(batch) β”‚ ← forward only, no β”‚
28
+ β”‚ β”‚ β”‚ param updates β”‚
29
+ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚
30
+ β”‚ β”‚ β”‚
31
+ β”‚ v β”‚
32
+ β”‚ on_validation_epoch_end() β”‚
33
+ β”‚ β”‚ β”‚
34
+ β”‚ v β”‚
35
+ β”‚ lr_scheduler.step() [automatic] β”‚
36
+ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
37
+
38
+ Things Lightning handles automatically (no manual code needed):
39
+ - loss.backward()
40
+ - optimizer.zero_grad()
41
+ - optimizer.step()
42
+ - Switching to model.eval() and torch.no_grad() during validation
43
+ - Gradient accumulation (if accumulate_grad_batches is configured)
44
+ - Multi-GPU distributed synchronization (if using DDP)
45
+
46
+ You only need to focus on:
47
+ - training_step(): return the loss
48
+ - validation_step(): compute validation metrics
49
+ - configure_optimizers(): define the optimizer and learning rate scheduler
50
+
51
+ [Key Design Decisions]
52
+
53
+ 1. BCEWithLogitsLoss vs BCELoss:
54
+ - BCEWithLogitsLoss = Sigmoid + BCELoss, using the log-sum-exp trick internally
55
+ - Numerical stability: directly computing log(sigmoid(x)) can produce log(0) at
56
+ extreme values. BCEWithLogitsLoss uses the equivalent formula
57
+ max(x,0) - x*y + log(1+exp(-|x|)) to avoid overflow
58
+ - Therefore the model outputs raw logits (no sigmoid); the loss function handles it
59
+
60
+ 2. Differential Learning Rate:
61
+ - Backbone (RNA-FM): base_lr x 0.01 -- pretrained weights encode rich RNA knowledge;
62
+ a large learning rate would cause catastrophic forgetting of this knowledge
63
+ - Cross-attention layers: base_lr x 0.1 -- new module but needs stable attention
64
+ pattern learning
65
+ - Classifier head: base_lr x 1.0 -- learning from scratch, needs the highest
66
+ learning rate for fast convergence
67
+
68
+ 3. Evaluation Metric Selection:
69
+ - AUROC (Area Under ROC Curve): measures the model's ranking ability, i.e., the
70
+ probability of ranking a positive sample above a negative one. Threshold-independent.
71
+ - AUPRC (Average Precision / PR-AUC): measures the precision-recall tradeoff;
72
+ more sensitive than AUROC on class-imbalanced data (biological data often has
73
+ positive:negative ratios of 1:10+)
74
+ - Accuracy: intuitive but can be misleading on imbalanced data (predicting all
75
+ negatives still yields 90% accuracy)
76
+ - F1: harmonic mean of precision and recall, balancing both
77
+
78
+ 4. Logging Strategy -- on_step=False, on_epoch=True:
79
+ - Training loss: fluctuates heavily per step; step-level logging aids debugging
80
+ - Evaluation metrics: require full epoch data to be statistically meaningful,
81
+ hence on_epoch=True
82
+ - prog_bar=True: displays key metrics in the training progress bar for real-time
83
+ monitoring
84
+ """
85
+
86
+ from __future__ import annotations
87
+
88
+ import pytorch_lightning as pl
89
+ import torch
90
+ import torchmetrics
91
+ from torch import nn
92
+ from torch.optim.lr_scheduler import CosineAnnealingLR
93
+
94
+ from deepmirt.model.mirna_target_model import MiRNATargetModel
95
+
96
+
97
+ class MiRNATargetLitModule(pl.LightningModule):
98
+ """
99
+ Lightning training module for miRNA-target binary classification prediction.
100
+
101
+ Responsibilities:
102
+ - Wraps MiRNATargetModel, managing forward pass / loss / metric computation
103
+ - Configures optimizer with differential learning rates and LR scheduler
104
+ - Provides training_step / validation_step / test_step
105
+
106
+ Args:
107
+ config: Nested dictionary with the following structure:
108
+ {
109
+ 'model': {
110
+ 'freeze_backbone': bool,
111
+ 'cross_attn_heads': int,
112
+ 'cross_attn_layers': int,
113
+ 'classifier_hidden': list[int],
114
+ 'dropout': float,
115
+ },
116
+ 'training': {
117
+ 'lr': float, # base learning rate (used by classifier head)
118
+ 'weight_decay': float, # L2 regularization coefficient
119
+ 'scheduler': str, # 'cosine' or 'onecycle'
120
+ 'max_epochs': int, # total training epochs (needed by scheduler)
121
+ }
122
+ }
123
+ """
124
+
125
+ def __init__(self, config: dict) -> None:
126
+ super().__init__()
127
+
128
+ # Save hyperparameters to the checkpoint for restoring the full config on reload
129
+ # Design decision: save_hyperparameters ensures reproducibility -- checkpoint carries the full config
130
+ self.save_hyperparameters(config)
131
+ self.config = config
132
+
133
+ # ── Extract model parameters from config and instantiate ──
134
+ model_cfg = config["model"]
135
+ self.model = MiRNATargetModel(
136
+ freeze_backbone=model_cfg.get("freeze_backbone", True),
137
+ cross_attn_heads=model_cfg.get("cross_attn_heads", 8),
138
+ cross_attn_layers=model_cfg.get("cross_attn_layers", 2),
139
+ classifier_hidden=model_cfg.get("classifier_hidden", [256, 64]),
140
+ dropout=model_cfg.get("dropout", 0.3),
141
+ )
142
+
143
+ # ── Loss function ──
144
+ # Design decision: BCEWithLogitsLoss is more numerically stable than sigmoid + BCELoss.
145
+ # Internal formula: loss = max(logit, 0) - logit * label + log(1 + exp(-|logit|))
146
+ # This formula avoids numerical overflow from log(sigmoid(x)) at extreme values of x.
147
+ self.loss_fn = nn.BCEWithLogitsLoss()
148
+
149
+ # ── Training metrics ──
150
+ # torchmetrics automatically handles metric aggregation in distributed settings (DDP sync)
151
+ self.train_auroc = torchmetrics.AUROC(task="binary")
152
+
153
+ # ── Validation metrics ──
154
+ self.val_auroc = torchmetrics.AUROC(task="binary")
155
+ self.val_auprc = torchmetrics.AveragePrecision(task="binary")
156
+ self.val_acc = torchmetrics.Accuracy(task="binary")
157
+ self.val_f1 = torchmetrics.F1Score(task="binary")
158
+
159
+ # ── Test metrics (same as validation, but separate instances to avoid state contamination) ──
160
+ self.test_auroc = torchmetrics.AUROC(task="binary")
161
+ self.test_auprc = torchmetrics.AveragePrecision(task="binary")
162
+ self.test_acc = torchmetrics.Accuracy(task="binary")
163
+ self.test_f1 = torchmetrics.F1Score(task="binary")
164
+
165
+ # ─────────────────────────────────────────────────────────────
166
+ # Training step
167
+ # ─────────────────────────────────────────────────────────────
168
+
169
+ def training_step(self, batch: dict, batch_idx: int) -> torch.Tensor:
170
+ """
171
+ Single training step: forward pass -> compute loss -> update metrics.
172
+
173
+ Lightning automatically calls backward() and optimizer.step() on the returned loss.
174
+ There is no need to manually call loss.backward() or optimizer.zero_grad().
175
+
176
+ Args:
177
+ batch: Dictionary output from the DataModule collate_fn, containing:
178
+ - mirna_tokens: (B, max_mirna_len)
179
+ - target_tokens: (B, 42)
180
+ - labels: (B,) float32
181
+ - attention_mask_mirna: (B, max_mirna_len)
182
+ - attention_mask_target: (B, 42)
183
+ batch_idx: Index of the current batch (automatically passed by Lightning)
184
+
185
+ Returns:
186
+ loss: Scalar tensor; Lightning automatically backpropagates through it
187
+ """
188
+ # Step 1: Extract inputs from the batch dictionary
189
+ mirna_tokens = batch["mirna_tokens"]
190
+ target_tokens = batch["target_tokens"]
191
+ labels = batch["labels"]
192
+ attention_mask_mirna = batch["attention_mask_mirna"]
193
+ attention_mask_target = batch["attention_mask_target"]
194
+
195
+ # Step 2: Forward pass -> logits shape (B, 1)
196
+ logits = self.model(
197
+ mirna_tokens, target_tokens, attention_mask_mirna, attention_mask_target
198
+ )
199
+
200
+ # Step 3: Compute loss
201
+ # squeeze(-1) reduces logits from (B, 1) to (B,), aligning with labels (B,)
202
+ loss = self.loss_fn(logits.squeeze(-1), labels)
203
+
204
+ # Step 4: Compute prediction probabilities and update metrics
205
+ # Note: sigmoid is only used for metric computation, not for the loss (BCEWithLogitsLoss includes sigmoid internally)
206
+ probs = torch.sigmoid(logits.squeeze(-1))
207
+ self.train_auroc(probs, labels.long())
208
+
209
+ # Step 5: Logging
210
+ # Design decision: train_loss uses on_step=True to monitor convergence trends,
211
+ # train_auroc uses on_epoch=True because per-step AUROC has little statistical significance.
212
+ self.log("train_loss", loss, prog_bar=True, on_step=True, on_epoch=True)
213
+ self.log(
214
+ "train_auroc",
215
+ self.train_auroc,
216
+ on_step=False,
217
+ on_epoch=True,
218
+ prog_bar=True,
219
+ )
220
+
221
+ return loss
222
+
223
+ # ─────────────────────────────────────────────────────────────
224
+ # Validation step
225
+ # ─────────────────────────────────────────────────────────────
226
+
227
+ def validation_step(self, batch: dict, batch_idx: int) -> None:
228
+ """
229
+ Single validation step: forward pass -> compute loss and full metric suite.
230
+
231
+ Lightning automatically handles the following during validation:
232
+ - Switches to model.eval() mode (disables Dropout, uses running mean for BatchNorm)
233
+ - Wraps in torch.no_grad(), skipping gradient computation to save memory
234
+
235
+ Args:
236
+ batch: Same as training_step
237
+ batch_idx: Index of the current batch
238
+ """
239
+ mirna_tokens = batch["mirna_tokens"]
240
+ target_tokens = batch["target_tokens"]
241
+ labels = batch["labels"]
242
+ attention_mask_mirna = batch["attention_mask_mirna"]
243
+ attention_mask_target = batch["attention_mask_target"]
244
+
245
+ logits = self.model(
246
+ mirna_tokens, target_tokens, attention_mask_mirna, attention_mask_target
247
+ )
248
+
249
+ loss = self.loss_fn(logits.squeeze(-1), labels)
250
+ probs = torch.sigmoid(logits.squeeze(-1))
251
+
252
+ # Update all validation metrics
253
+ self.val_auroc(probs, labels.long())
254
+ self.val_auprc(probs, labels.long())
255
+ self.val_acc(probs, labels.long())
256
+ self.val_f1(probs, labels.long())
257
+
258
+ # Design decision: all validation metrics use on_epoch=True, as they need full data to be statistically meaningful
259
+ # sync_dist=True automatically aggregates metrics across GPUs in multi-GPU settings
260
+ self.log("val_loss", loss, prog_bar=True, on_epoch=True, sync_dist=True)
261
+ self.log("val_auroc", self.val_auroc, on_epoch=True, prog_bar=True)
262
+ self.log("val_auprc", self.val_auprc, on_epoch=True)
263
+ self.log("val_acc", self.val_acc, on_epoch=True)
264
+ self.log("val_f1", self.val_f1, on_epoch=True)
265
+
266
+ # ─────────────────────────────────────────────────────────────
267
+ # Test step
268
+ # ─────────────────────────────────────────────────────────────
269
+
270
+ def test_step(self, batch: dict, batch_idx: int) -> None:
271
+ """
272
+ Single test step: same logic as validation_step, using separate test metric instances.
273
+
274
+ Test metrics are instantiated separately from validation metrics to avoid state
275
+ contamination. For example, val_auroc resets at the end of each validation epoch,
276
+ while test_auroc is only used when trainer.test() is called.
277
+ """
278
+ mirna_tokens = batch["mirna_tokens"]
279
+ target_tokens = batch["target_tokens"]
280
+ labels = batch["labels"]
281
+ attention_mask_mirna = batch["attention_mask_mirna"]
282
+ attention_mask_target = batch["attention_mask_target"]
283
+
284
+ logits = self.model(
285
+ mirna_tokens, target_tokens, attention_mask_mirna, attention_mask_target
286
+ )
287
+
288
+ loss = self.loss_fn(logits.squeeze(-1), labels)
289
+ probs = torch.sigmoid(logits.squeeze(-1))
290
+
291
+ # Update test metrics
292
+ self.test_auroc(probs, labels.long())
293
+ self.test_auprc(probs, labels.long())
294
+ self.test_acc(probs, labels.long())
295
+ self.test_f1(probs, labels.long())
296
+
297
+ self.log("test_loss", loss, on_epoch=True, sync_dist=True)
298
+ self.log("test_auroc", self.test_auroc, on_epoch=True)
299
+ self.log("test_auprc", self.test_auprc, on_epoch=True)
300
+ self.log("test_acc", self.test_acc, on_epoch=True)
301
+ self.log("test_f1", self.test_f1, on_epoch=True)
302
+
303
+ # ─────────────────────────────────────────────────────────────
304
+ # Optimizer and learning rate scheduling
305
+ # ─────────────────────────────────────────────────────────────
306
+
307
+ def configure_optimizers(self) -> dict:
308
+ """
309
+ Configure AdamW optimizer with differential learning rates and cosine annealing scheduler.
310
+
311
+ [Differential Learning Rates -- Why use different learning rates for different modules?]
312
+
313
+ Module Learning Rate Reason
314
+ ───────────── ───────────── ──────────────────────────────────
315
+ RNA-FM backbone base_lrΓ—0.01 Pretrained weights contain rich RNA structure/sequence
316
+ knowledge; a large LR would destroy this knowledge
317
+ (catastrophic forgetting)
318
+ Cross-attention base_lrΓ—0.1 Newly initialized module, but needs to stably learn
319
+ miRNA-target attention patterns
320
+ Classifier head base_lrΓ—1.0 Learns the binary classification decision boundary
321
+ from scratch; needs the highest LR for fast convergence
322
+
323
+ Design decision: The LR ratios [0.01, 0.1, 1.0] follow common transfer learning practice;
324
+ the paper "Universal Language Model Fine-tuning" (Howard & Ruder, 2018)
325
+ calls this "discriminative fine-tuning".
326
+
327
+ [CosineAnnealingLR Scheduler]
328
+ The learning rate decays from its initial value toward 0 following a cosine curve:
329
+ lr(t) = lr_min + 0.5 * (lr_max - lr_min) * (1 + cos(pi * t / T_max))
330
+ Advantage: fast learning early on, fine-grained adjustment later, avoiding instability
331
+ from sudden LR drops.
332
+
333
+ Returns:
334
+ Dictionary containing the optimizer and lr_scheduler
335
+ """
336
+ training_cfg = self.config["training"]
337
+ base_lr = training_cfg["lr"]
338
+ weight_decay = training_cfg.get("weight_decay", 1e-5)
339
+ scheduler_type = training_cfg.get("scheduler", "cosine")
340
+ max_epochs = training_cfg.get("max_epochs", 30)
341
+
342
+ # Design decision: 3 parameter groups correspond to the model's 3 semantic modules;
343
+ # learning rates decrease from downstream to upstream (farther from the task = smaller LR).
344
+ param_groups = [
345
+ {
346
+ "params": list(self.model.encoder.parameters()),
347
+ "lr": base_lr * 0.01,
348
+ "name": "backbone",
349
+ },
350
+ {
351
+ "params": list(self.model.cross_attention.parameters()),
352
+ "lr": base_lr * 0.1,
353
+ "name": "cross_attention",
354
+ },
355
+ {
356
+ "params": list(self.model.classifier.parameters()),
357
+ "lr": base_lr,
358
+ "name": "classifier",
359
+ },
360
+ ]
361
+
362
+ optimizer = torch.optim.AdamW(param_groups, weight_decay=weight_decay)
363
+
364
+ # Design decision: CosineAnnealingLR is a safe default choice --
365
+ # it does not require knowing total steps (unlike OneCycleLR), and provides smooth decay.
366
+ if scheduler_type == "cosine":
367
+ scheduler = CosineAnnealingLR(optimizer, T_max=max_epochs)
368
+ elif scheduler_type == "onecycle":
369
+ # OneCycleLR requires total_steps = steps_per_epoch * max_epochs,
370
+ # but at the configure_optimizers stage the DataLoader has not been created yet,
371
+ # so steps_per_epoch is unavailable. Therefore, fall back to CosineAnnealingLR.
372
+ # If OneCycleLR is needed, it should be configured in train.py via the Trainer's
373
+ # estimated_stepping_batches.
374
+ scheduler = CosineAnnealingLR(optimizer, T_max=max_epochs)
375
+ else:
376
+ scheduler = CosineAnnealingLR(optimizer, T_max=max_epochs)
377
+
378
+ return {
379
+ "optimizer": optimizer,
380
+ "lr_scheduler": {
381
+ "scheduler": scheduler,
382
+ # Design decision: interval='epoch' adjusts the learning rate once per epoch,
383
+ # which is more stable than 'step' (adjusting after every batch), suitable for small to medium datasets.
384
+ "interval": "epoch",
385
+ },
386
+ }
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ rna-fm
3
+ pytorch-lightning>=2.0
4
+ torchmetrics
5
+ pyyaml
6
+ scikit-learn
7
+ numpy
8
+ pandas
9
+ huggingface-hub