"""
Build a JSONL of paired (human-viral-sequence, length+GC-matched bacterial-CDS) records
for a "is this a human virus?" probe.

Positives: 6000 human viral sequences from human_viral_sequences.xlsx.
Negatives: VFDB CDSs (14,695 bacterial CDSs from MGnify MAGs, length 78-9492 bp, GC 0.20-0.75).

Match per positive: one VFDB CDS drawn at random from the not-yet-used candidates within
±20% length and ±0.05 GC, no replacement, deterministic (seed=42). Viral records longer
than 9492 bp are never matched (the longest VFDB CDS is used as a hard cap) and are
written to the unpaired file instead — these are mostly complete viral genomes (~44% of
the corpus).

Output: ~/MGnify/data/targeted_jsonl/human_viral/human_viral_v1.jsonl (positive/negative pairs)
        and human_viral_v1_unpaired.jsonl (viral records that could not be paired)
"""
from __future__ import annotations

import json
import random
from collections import defaultdict
from pathlib import Path

import pandas as pd


# Input spreadsheet of viral positives (read with pandas.read_excel).
VIRAL_XLSX = "/home/ror25cal/MGnify/human_viral_sequences.xlsx"
# Directory scanned for *.jsonl files that form the negative candidate pool.
VFDB_DIR = Path("/home/ror25cal/MGnify/data/targeted_jsonl/vfdb")
# Directory the paired / unpaired JSONL outputs are written to (created if missing).
OUT_DIR = Path("/home/ror25cal/MGnify/data/targeted_jsonl/human_viral")
# Seed for the RNG that shuffles the positives and picks each negative.
SEED = 42
# Relative length tolerance: a candidate must lie within vlen * (1 ± LEN_TOL).
LEN_TOL = 0.20
# Absolute GC-fraction tolerance: a candidate must lie within vgc ± GC_TOL.
GC_TOL = 0.05


def gc_content(seq: str) -> float:
    """Return the GC fraction of *seq*, computed over A/C/G/T characters only.

    Matching is case-insensitive. Characters other than A, C, G and T (for
    example N) are excluded from both numerator and denominator. A sequence
    containing no A/C/G/T characters yields 0.0.
    """
    bases = seq.upper()
    gc_total = bases.count("G") + bases.count("C")
    acgt_total = gc_total + bases.count("A") + bases.count("T")
    return gc_total / acgt_total if acgt_total else 0.0


def _clean(value):
    """Return *value* unchanged, or ``None`` when pandas treats it as missing."""
    return value if pd.notna(value) else None


def _opt_str(row, key):
    """Return ``str(row[key])``, or ``None`` when the field is absent or missing."""
    value = row.get(key)
    return str(value) if pd.notna(value) else None


def _load_vfdb_pool(vfdb_dir):
    """Read every ``*.jsonl`` file in *vfdb_dir* and return the usable records.

    Files are visited in sorted order so the pool order (and therefore the
    seeded matching) is reproducible. Blank lines are skipped, and only
    records whose ``extract_status`` is ``"ok"`` are kept.
    """
    pool = []
    for fp in sorted(vfdb_dir.glob("*.jsonl")):
        with fp.open(encoding="utf-8") as fh:
            for line in fh:
                if not line.strip():
                    continue  # tolerate trailing/blank lines instead of crashing in json.loads
                rec = json.loads(line)
                if rec.get("extract_status") == "ok":
                    pool.append(rec)
    return pool


def _unpaired_record(vr, drop_reason, **extra):
    """Build the inspection record for a viral row that could not be paired.

    The record holds *drop_reason*, any *extra* fields, every column of the
    row except ``sequence`` (missing values become ``None``), and the first
    200 characters of the sequence.
    """
    # NOTE(review): cell values are passed to json.dumps as-is; this assumes the
    # sheet holds only JSON-serializable scalars (no datetimes) — confirm.
    rec = {"drop_reason": drop_reason, **extra}
    rec.update((k, _clean(v)) for k, v in vr.to_dict().items() if k != "sequence")
    rec["sequence_first_200bp"] = str(vr["sequence"])[:200]
    return rec


def _write_jsonl(path, records):
    """Write *records* to *path*, one JSON object per line (empty file if none)."""
    with path.open("w", encoding="utf-8") as fh:
        fh.writelines(json.dumps(r) + "\n" for r in records)


def main():
    """Pair each viral positive with a length+GC-matched VFDB record and write JSONL.

    Steps:
      1. Load the viral spreadsheet and compute GC content per sequence.
      2. Load the VFDB pool from ``VFDB_DIR``.
      3. Visit the viral rows in seeded-shuffled order; for each, pick one
         unused pool record at random among those within ``LEN_TOL`` relative
         length and ``GC_TOL`` absolute GC of the viral record.
      4. Write the paired records (positive followed by its negative) and the
         unpaired viral records to ``OUT_DIR``, then print summary counts.
    """
    rng = random.Random(SEED)
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # ---- Load viral positives ----
    print("loading viral xlsx…")
    df = pd.read_excel(VIRAL_XLSX)
    print(f"  {len(df)} viral records loaded")
    df["gc_content"] = df["sequence"].astype(str).map(gc_content)
    print("  computed GC content for all viral records")

    # ---- Load VFDB negative pool ----
    print("loading VFDB JSONLs…")
    vfdb = _load_vfdb_pool(VFDB_DIR)
    print(f"  {len(vfdb)} VFDB records (both VF positives and matched-CDS negatives)")

    # The pool is scanned linearly for every viral record. Extract the two
    # match keys once so the inner loop does no per-iteration dict lookups.
    pool_keys = [(i, rec["cds_length"], rec["gc_content"]) for i, rec in enumerate(vfdb)]

    # Hard cap on viral length: the longest CDS in the VFDB pool.
    # NOTE(review): with a ±20% length tolerance, viral records up to
    # 9492 / (1 - LEN_TOL) bp could still match the longest CDS; the cap is kept
    # as-is so the produced dataset does not change.
    vfdb_max_len = 9492

    used_vfdb_idx: set[int] = set()
    paired_records = []
    unpaired_records = []
    n_too_long = 0
    n_no_match = 0

    # Shuffle viral records for unbiased no-replacement matching
    viral_rows = list(df.iterrows())
    rng.shuffle(viral_rows)

    for _orig_idx, vr in viral_rows:
        vlen = int(vr["sequence_len"])
        vgc = float(vr["gc_content"])

        if vlen > vfdb_max_len:
            n_too_long += 1
            unpaired_records.append(_unpaired_record(
                vr, f"viral length exceeds VFDB max ({vfdb_max_len} bp)",
            ))
            continue

        # Candidates: unused pool records within the length AND GC tolerance.
        len_lo = vlen * (1 - LEN_TOL)
        len_hi = vlen * (1 + LEN_TOL)
        gc_lo = vgc - GC_TOL
        gc_hi = vgc + GC_TOL
        candidates = [
            i for i, cds_len, cds_gc in pool_keys
            if i not in used_vfdb_idx
            and len_lo <= cds_len <= len_hi
            and gc_lo <= cds_gc <= gc_hi
        ]
        if not candidates:
            n_no_match += 1
            unpaired_records.append(_unpaired_record(
                vr, "no VFDB candidate within length+GC tolerance",
                viral_length=vlen, viral_gc=vgc,
            ))
            continue

        # Pick one randomly from candidates (rng-driven) and retire it.
        idx = rng.choice(candidates)
        neg = vfdb[idx]
        used_vfdb_idx.add(idx)

        viral_id = str(vr["sequence_id"])
        # Fall through null/empty identifiers so the id never embeds "None".
        neg_locus = neg.get("locus_tag") or neg.get("region_id") or f"vfdb_{idx}"
        product_name = _opt_str(vr, "product_name")

        # Positive first, then its negative (the file's pairing convention).
        paired_records.append({
            "region_id": f"VIRAL_{viral_id}",
            "is_positive": True,
            "label": "HUMAN_VIRAL",
            # `_opt_str(...) or default` maps NaN cells to the default; a bare
            # `value or default` would not, because NaN is truthy.
            "label_class": _opt_str(vr, "genome_type") or "unknown",
            "label_subclass": product_name,
            "source_db": _opt_str(vr, "db") or "GenBank",
            "source_accession": _opt_str(vr, "source_accession"),
            "sequence_id": viral_id,
            "seq_hash": _opt_str(vr, "seq_hash"),
            "organism": _opt_str(vr, "organism"),
            "gene_name": _opt_str(vr, "gene_name"),
            "product_name": product_name,
            "description": _opt_str(vr, "description"),
            "sample_source": _opt_str(vr, "sample_source"),
            "cds_length": vlen,
            "gc_content": vgc,
            "paired_with": neg_locus,
            "sequence": str(vr["sequence"]),
        })
        paired_records.append({
            "region_id": f"NEG_{neg_locus}_for_{viral_id}",
            "is_positive": False,
            "label": "negative",
            "label_class": neg.get("label_class"),
            "label_subclass": neg.get("label_subclass"),
            "source_db": "VFDB-derived MGnify-MAG-CDS",
            "source_accession": None,
            "vfdb_origin": neg.get("region_id"),            # original VFDB JSONL region_id
            "vfdb_paired_with": neg.get("paired_with"),     # the record's pairing in the VFDB JSONL
            "vfdb_is_positive": bool(neg.get("is_positive")),
            "locus_tag": neg.get("locus_tag"),
            "mag_id": neg.get("mag_id"),
            "species": neg.get("species"),
            "cds_length": int(neg["cds_length"]),
            "gc_content": float(neg["gc_content"]),
            "paired_with": viral_id,
            "sequence": str(neg["sequence"]),
        })

    print("\n=== matching summary ===")
    print(f"  viral positives:               {len(df)}")
    print(f"    too long (>{vfdb_max_len} bp, dropped): {n_too_long}")
    print(f"    no length+GC match:           {n_no_match}")
    print(f"    successfully paired:          {len(paired_records) // 2}")

    out_paired = OUT_DIR / "human_viral_v1.jsonl"
    _write_jsonl(out_paired, paired_records)
    print(f"\n  wrote {out_paired}  ({out_paired.stat().st_size/1024/1024:.1f} MB)")

    out_unpaired = OUT_DIR / "human_viral_v1_unpaired.jsonl"
    _write_jsonl(out_unpaired, unpaired_records)
    print(f"  wrote {out_unpaired}  ({out_unpaired.stat().st_size/1024:.1f} KB)  ← dropped/unpaired viral records, for inspection")

    # Quick stats by genome type for the paired viral set
    print("\n=== paired-viral genome_type breakdown ===")
    by_gt = defaultdict(int)
    for r in paired_records:
        if r["is_positive"]:
            by_gt[r["label_class"]] += 1
    for gt, n in sorted(by_gt.items(), key=lambda x: -x[1]):
        print(f"  {gt:20s}  {n}")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()