"""
Build combined Carbogrove + glycowork dataset for Bertint V5.

Steps:
  1. Load Carbogrove processed data (315K records)
  2. Load glycowork binding data (wide) → long (620K records)
  3. Merge and filter to proteins with sequences
  4. Compute per-experiment RANK PERCENTILE as training target
  5. Compute auxiliary targets (z-scores) for ablation
  6. Save combined dataset + sequence files

Output columns:
  protein_id        - canonical protein name
  glycan_wurcs      - WURCS string (for Bertose)
  target_raw        - original FractionBound or Z-score
  target_rank       - per-experiment rank percentile [0,1]   ← PRIMARY
  target_zscore     - z-score per source (global)             ← ABLATION
  target_zscore_exp - z-score within each experiment          ← ABLATION
  log_conc          - log10(concentration+1) or -1 (unknown)
  has_conc          - 1 if concentration known, 0 if not
  source            - 'carbogrove' or 'glycowork'
  data_source       - platform name (CG) or 'glycowork'
  exp_id            - experiment identifier
  exp_size          - records in this experiment

Canonical IDs:
  - Protein: amino acid sequence (from UniProt or glycowork 'target')
  - Glycan: WURCS string (for Bertose embeddings)
"""

import pandas as pd
import numpy as np
import csv
import json
import os
import logging
from pathlib import Path

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
)
logger = logging.getLogger(__name__)

# ── Paths ──────────────────────────────────────────────────────────
BASE = Path(__file__).parent.parent
DATA_DIR = BASE / '19274777-2'
PROCESSED_DIR = BASE / 'processed_v5'
OUTPUT_DIR = BASE / 'combined_dataset'

# ── Step 1: Load Carbogrove ────────────────────────────────────────
def load_carbogrove() -> pd.DataFrame:
    """Load Carbogrove processed data with recovered lectins."""
    logger.info("Loading Carbogrove data...")

    # Load main binding data
    cg = pd.read_csv(PROCESSED_DIR / 'training_data.csv')
    logger.info(f"  Base Carbogrove: {len(cg):,} records, "
                f"{cg['LectinName'].nunique()} lectins")

    # Load glycan info for WURCS mapping
    glycan_info = {}
    with open(DATA_DIR / 'AllGlycanInfo_V2.csv') as f:
        for row in csv.DictReader(f):
            wurcs = row.get('WURCS', '').strip()
            if wurcs:
                glycan_info[row['GlycanID']] = wurcs

    # Load ALL protein sequences:
    # 1. UniProt-fetched sequences (most complete)
    prot_seqs = {}
    cg_seq_path = OUTPUT_DIR / 'carbogrove_protein_sequences.csv'
    if cg_seq_path.exists():
        cg_seqs = pd.read_csv(cg_seq_path)
        for _, row in cg_seqs.iterrows():
            prot_seqs[row['protein_id']] = {
                'sequence': row['sequence'],
                'source': row['source'],
            }
        logger.info(f"  Loaded {len(prot_seqs)} Carbogrove protein sequences")

    # 2. Recovered sequences (lectins without UniProt IDs)
    recovered_path = DATA_DIR / 'recovered_sequences.csv'
    if recovered_path.exists():
        recovered = pd.read_csv(recovered_path)
        recovered = recovered[recovered['Status'] == 'RECOVERED']
        for _, row in recovered.iterrows():
            name = row['LectinName']
            if name not in prot_seqs and row['Sequence'] and len(str(row['Sequence'])) > 20:
                prot_seqs[name] = {
                    'sequence': row['Sequence'],
                    'source': row['Source'],
                }
        n_recovered = len(recovered)
        logger.info(f"  Added {n_recovered} recovered lectin sequences")

    # Add WURCS to Carbogrove
    cg['WURCS'] = cg['GlycanID'].map(glycan_info)
    n_with_wurcs = cg['WURCS'].notna().sum()
    logger.info(f"  Records with WURCS: {n_with_wurcs:,} / {len(cg):,}")

    # Drop records without WURCS (can't generate Bertose embeddings)
    cg = cg[cg['WURCS'].notna()].copy()
    logger.info(f"  After WURCS filter: {len(cg):,} records")

    # Load experiment metadata for DataSource (platform)
    data_info = {}
    with open(DATA_DIR / 'AllDataInfo_V2.csv') as f:
        for row in csv.DictReader(f):
            data_info[int(row['DataID'])] = row.get('DataSource', 'unknown')
    cg['data_source'] = cg['DataID'].map(data_info)
    logger.info(f"  DataSources: {cg['data_source'].nunique()} platforms")

    # Standardize columns (keep DataID for per-experiment rank)
    cg['source'] = 'carbogrove'
    cg['target_raw'] = cg['FractionBound']
    cg['log_conc'] = np.log10(cg['Concentration'] + 1)
    cg['has_conc'] = 1
    cg['exp_id'] = 'cg_' + cg['DataID'].astype(str)

    return cg, prot_seqs, glycan_info


# ── Step 2: Load & convert glycowork ──────────────────────────────
def load_glycowork(iupac_to_wurcs: dict) -> pd.DataFrame:
    """
    Load glycowork binding data, convert wide→long,
    filter to glycans with WURCS mappings.
    """
    logger.info("Loading glycowork data...")
    from glycowork.glycan_data.loader import glycan_binding

    gw = glycan_binding
    meta_cols = ['protein', 'target']
    glycan_cols = [c for c in gw.columns if c not in meta_cols]
    logger.info(f"  Wide format: {gw.shape[0]} proteins × {len(glycan_cols)} glycans")

    # Filter to glycans with WURCS
    mapped_glycans = [g for g in glycan_cols if g in iupac_to_wurcs]
    logger.info(f"  Glycans with WURCS: {len(mapped_glycans)} / {len(glycan_cols)}")

    # Wide → Long (only for mapped glycans)
    logger.info("  Converting wide → long format...")
    records = []
    skipped_nan_protein = 0
    skipped_short_seq = 0

    for _, row in gw.iterrows():
        protein_name = row['protein']
        protein_seq = row['target']

        # Skip rows with NaN protein name
        if pd.isna(protein_name) or str(protein_name).strip() == 'nan':
            skipped_nan_protein += 1
            continue

        protein_name = str(protein_name).strip()
        protein_seq = str(protein_seq) if pd.notna(protein_seq) else ''

        # Skip proteins without sequences
        if len(protein_seq) < 50:
            skipped_short_seq += 1
            continue

        for glycan_iupac in mapped_glycans:
            val = row[glycan_iupac]
            if pd.notna(val):
                records.append({
                    'protein_name': protein_name,
                    'protein_seq': protein_seq,
                    'glycan_iupac': glycan_iupac,
                    'WURCS': iupac_to_wurcs[glycan_iupac],
                    'target_raw': float(val),
                    'target_zscore': float(val),  # Already z-scored
                })

    logger.info(f"  Skipped {skipped_nan_protein} rows with NaN protein")
    logger.info(f"  Skipped {skipped_short_seq} rows with short/no sequence")

    gw_long = pd.DataFrame(records)
    logger.info(f"  Long format: {len(gw_long):,} records")
    logger.info(f"  Proteins: {gw_long['protein_name'].nunique()}")
    logger.info(f"  Glycans: {gw_long['glycan_iupac'].nunique()}")

    gw_long['source'] = 'glycowork'
    gw_long['log_conc'] = -1.0  # Sentinel: concentration unknown
    gw_long['has_conc'] = 0
    gw_long['data_source'] = 'glycowork'
    gw_long['exp_id'] = 'gw_' + gw_long['protein_name']

    return gw_long


# ── Step 3: Merge datasets ────────────────────────────────────────
def merge_datasets(
    cg: pd.DataFrame,
    gw: pd.DataFrame,
    prot_seqs: dict,
) -> pd.DataFrame:
    """
    Merge Carbogrove and glycowork into unified format.

    Unified columns:
      - protein_id: canonical ID (UniProt or glycowork name)
      - protein_seq: amino acid sequence (for ESM-C)
      - glycan_wurcs: WURCS string (for Bertose)
      - target_raw: original target value
      - target_zscore: z-scored target (per source)
      - log_conc: log10(concentration + 1) or NaN
      - source: 'carbogrove' or 'glycowork'
    """
    logger.info("Merging datasets...")

    # Standardize Carbogrove columns
    cg_unified = pd.DataFrame({
        'protein_id': cg['LectinName'],
        'glycan_wurcs': cg['WURCS'],
        'target_raw': cg['target_raw'],
        'log_conc': cg['log_conc'],
        'has_conc': cg['has_conc'],
        'source': cg['source'],
        'data_source': cg['data_source'],
        'exp_id': cg['exp_id'],
    })

    # Standardize glycowork columns
    gw_unified = pd.DataFrame({
        'protein_id': gw['protein_name'],
        'glycan_wurcs': gw['WURCS'],
        'target_raw': gw['target_raw'],
        'log_conc': gw['log_conc'],
        'has_conc': gw['has_conc'],
        'source': gw['source'],
        'data_source': gw['data_source'],
        'exp_id': gw['exp_id'],
    })

    # Concatenate
    combined = pd.concat([cg_unified, gw_unified], ignore_index=True)
    logger.info(f"  Combined (pre-dedup): {len(combined):,} records")
    logger.info(f"    Carbogrove: {(combined['source'] == 'carbogrove').sum():,}")
    logger.info(f"    glycowork: {(combined['source'] == 'glycowork').sum():,}")

    # ── Deduplication ──────────────────────────────────────────────
    # Two sources of duplicates:
    #   1. glycowork: 160 proteins have multiple rows in the binding
    #      matrix (same protein tested on different arrays), creating
    #      duplicate (protein, glycan) pairs with different Z-scores.
    #   2. Carbogrove: different GlycanIDs can map to the same WURCS
    #      (structurally identical glycans with different printed names),
    #      creating duplicate (protein, glycan, experiment) entries.
    #
    # Fix: average target_raw per (protein_id, glycan_wurcs, exp_id).
    # Other columns (log_conc, has_conc, source, data_source) are
    # constant within each group, so we take the first value.

    group_cols = ['protein_id', 'glycan_wurcs', 'exp_id']
    before = len(combined)

    combined = combined.groupby(group_cols, as_index=False).agg({
        'target_raw': 'mean',       # Average replicate measurements
        'log_conc': 'first',        # Constant within experiment
        'has_conc': 'first',        # Constant within source
        'source': 'first',          # Constant within experiment
        'data_source': 'first',     # Constant within experiment
    })

    after = len(combined)
    removed = before - after
    logger.info(f"  Deduplicated: {before:,} → {after:,} ({removed:,} duplicates removed)")

    return combined


# ── Step 4: Generate protein sequence file ─────────────────────────
def build_protein_sequence_file(
    combined: pd.DataFrame,
    prot_seqs: dict,
    gw_long: pd.DataFrame,
) -> pd.DataFrame:
    """Build protein_id → sequence mapping for ESM-C embedding generation."""
    logger.info("Building protein sequence file...")

    seq_map = {}

    # Carbogrove proteins: from UniProt-fetched + recovered
    for name, info in prot_seqs.items():
        if 'sequence' in info and len(str(info['sequence'])) > 20:
            seq_map[name] = str(info['sequence'])

    n_cg = len(seq_map)
    logger.info(f"  Carbogrove sequences: {n_cg}")

    # glycowork proteins: sequence in data
    gw_seqs = gw_long.groupby('protein_name')['protein_seq'].first()
    for name, seq in gw_seqs.items():
        if name not in seq_map and len(str(seq)) > 50:
            seq_map[name] = str(seq)

    logger.info(f"  glycowork sequences added: {len(seq_map) - n_cg}")

    # Check coverage
    all_proteins = set(combined['protein_id'].unique())
    covered = all_proteins & set(seq_map.keys())
    missing = all_proteins - set(seq_map.keys())

    logger.info(f"  Total unique proteins: {len(all_proteins)}")
    logger.info(f"  With sequences: {len(covered)} ({len(covered)/len(all_proteins)*100:.1f}%)")
    logger.info(f"  Missing sequences: {len(missing)}")
    if missing:
        logger.info(f"    Missing: {sorted(missing)}")

    # Drop records for proteins without sequences from combined
    records_before = len(combined)
    combined_filtered = combined[combined['protein_id'].isin(seq_map)].copy()
    records_after = len(combined_filtered)
    dropped = records_before - records_after
    if dropped > 0:
        logger.info(f"  Dropped {dropped:,} records with missing protein sequences")

    # Save
    seq_df = pd.DataFrame([
        {'protein_id': pid, 'sequence': seq, 'seq_len': len(seq)}
        for pid, seq in seq_map.items()
        if pid in set(combined_filtered['protein_id'].unique())
    ])
    return seq_df, combined_filtered


# ── Step 5: Rank-based normalization ───────────────────────────────
def compute_rank_targets(combined: pd.DataFrame) -> pd.DataFrame:
    """
    Compute per-experiment rank percentile + auxiliary z-score targets.

    Targets computed:
      target_rank       - rank percentile [0,1] within experiment (PRIMARY)
      target_zscore     - z-score per source (global)              (ABLATION)
      target_zscore_exp - z-score within each experiment           (ABLATION)
      exp_size          - number of records in this experiment
    """
    logger.info("Computing targets...")

    # 1. Rank within each experiment, normalized to [0, 1]
    combined['target_rank'] = combined.groupby('exp_id')['target_raw'].transform(
        lambda x: x.rank(method='average', pct=True)
    )

    # 2. Global z-score per source
    for src in ['carbogrove', 'glycowork']:
        mask = combined['source'] == src
        raw = combined.loc[mask, 'target_raw']
        combined.loc[mask, 'target_zscore'] = (
            (raw - raw.mean()) / (raw.std() + 1e-10)
        )
    logger.info(f"  Global z-scores computed per source")

    # 3. Z-score within each experiment
    combined['target_zscore_exp'] = combined.groupby('exp_id')['target_raw'].transform(
        lambda x: (x - x.mean()) / (x.std() + 1e-10) if x.std() > 0 else 0.0
    )
    logger.info(f"  Per-experiment z-scores computed")

    # 4. Experiment size (for potential weighting)
    combined['exp_size'] = combined.groupby('exp_id')['target_raw'].transform('count').astype(int)

    # Validation: check rank distribution
    n_exps = combined['exp_id'].nunique()
    exp_sizes = combined.groupby('exp_id').size()
    logger.info(f"  Ranked across {n_exps:,} experiments")
    logger.info(f"  Experiment sizes: min={exp_sizes.min()}, "
                f"median={exp_sizes.median():.0f}, max={exp_sizes.max()}")

    # Drop experiments with fewer than 5 glycans — too noisy for ranking
    small_exps = (exp_sizes < 5).sum()
    if small_exps > 0:
        valid_exps = exp_sizes[exp_sizes >= 5].index
        before = len(combined)
        combined = combined[combined['exp_id'].isin(valid_exps)].copy()
        after = len(combined)
        if before > after:
            logger.info(f"  Dropped {before - after:,} records from "
                        f"experiments with <5 glycans")

    # Per-source stats
    for src in ['carbogrove', 'glycowork']:
        sub = combined[combined['source'] == src]
        r = sub['target_rank']
        z = sub['target_zscore']
        logger.info(f"  [{src}] rank: mean={r.mean():.3f} std={r.std():.3f} | "
                    f"zscore: mean={z.mean():.3f} std={z.std():.3f}")

    return combined


# ── Step 7: Summary statistics ─────────────────────────────────────
def print_summary(combined: pd.DataFrame) -> None:
    """Print comprehensive dataset summary."""
    logger.info("=" * 70)
    logger.info("COMBINED DATASET SUMMARY")
    logger.info("=" * 70)

    for src in ['carbogrove', 'glycowork', 'all']:
        if src == 'all':
            sub = combined
            label = 'COMBINED'
        else:
            sub = combined[combined['source'] == src]
            label = src.upper()

        logger.info(f"\n  [{label}]")
        logger.info(f"    Records: {len(sub):,}")
        logger.info(f"    Proteins: {sub['protein_id'].nunique()}")
        logger.info(f"    Glycans (WURCS): {sub['glycan_wurcs'].nunique()}")
        logger.info(f"    Experiments: {sub['exp_id'].nunique()}")
        logger.info(f"    target_rank: mean={sub['target_rank'].mean():.3f}, "
                     f"std={sub['target_rank'].std():.3f}")
        logger.info(f"    target_zscore: mean={sub['target_zscore'].mean():.3f}, "
                     f"std={sub['target_zscore'].std():.3f}")

    # Column inventory
    logger.info(f"\n  Columns: {list(combined.columns)}")
    logger.info(f"  Unique glycan WURCS: {combined['glycan_wurcs'].nunique()}")
    logger.info(f"  Unique protein IDs: {combined['protein_id'].nunique()}")


# ── Main ───────────────────────────────────────────────────────────
def main() -> None:
    """Build combined dataset."""
    # Create output directory
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Step 1: Load Carbogrove
    cg, prot_seqs, glycan_info = load_carbogrove()

    # Load IUPAC → WURCS mapping
    iupac_wurcs_path = DATA_DIR / 'glycowork_iupac_to_wurcs.json'
    with open(iupac_wurcs_path) as f:
        iupac_to_wurcs = json.load(f)
    logger.info(f"Loaded {len(iupac_to_wurcs)} IUPAC→WURCS mappings")

    # Step 2: Load & convert glycowork
    gw = load_glycowork(iupac_to_wurcs)

    # Step 3: Merge
    combined = merge_datasets(cg, gw, prot_seqs)

    # Step 4: Protein sequences (also filters combined to drop missing)
    seq_df, combined_final = build_protein_sequence_file(combined, prot_seqs, gw)

    # Step 5: Rank-based normalization + auxiliary targets
    combined_final = compute_rank_targets(combined_final)

    # Save outputs
    logger.info("\nSaving outputs...")
    combined_final.to_csv(OUTPUT_DIR / 'combined_binding_data.csv', index=False)
    logger.info(f"  Saved combined_binding_data.csv: {len(combined_final):,} records")

    seq_df.to_csv(OUTPUT_DIR / 'protein_sequences.csv', index=False)
    logger.info(f"  Saved protein_sequences.csv: {len(seq_df)} proteins")

    # Save unique WURCS for Bertose embedding generation
    unique_wurcs = combined_final['glycan_wurcs'].unique()
    wurcs_df = pd.DataFrame({'wurcs': unique_wurcs})
    wurcs_df.to_csv(OUTPUT_DIR / 'unique_glycan_wurcs.csv', index=False)
    logger.info(f"  Saved unique_glycan_wurcs.csv: {len(wurcs_df)} glycans")

    # Step 7: Summary
    print_summary(combined_final)


if __name__ == '__main__':
    main()