#!/usr/bin/env python3
"""
Build English test sets from MELD (Friends) dataset.

Extracts 8 scenario-based test sets from MELD MP4 clips,
converts to WAV, and concatenates into single audio files
that simulate real phone calls for E2E pipeline testing.

Usage:
    python scripts/build_meld_test_sets.py

Output:
    data/meld_test/
    ├── 01_angry_fight.wav
    ├── 02_happy_loving.wav
    ├── ...
    ├── 08_calm_daily.wav
    ├── ground_truth.json     # per-utterance emotion labels
    └── README.md             # test set descriptions
"""

import csv
import json
import os
import subprocess
import sys
import tempfile
from collections import Counter
from pathlib import Path

# --- Configuration ---
# Parent of the directory that contains this script (symlinks resolved).
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Input archive; the functions below read both the label CSVs and the MP4
# clips directly from this zip.
ZIP_PATH = PROJECT_ROOT / "data" / "english_test.zip"
# Destination for the generated WAV files, ground_truth.json and README.md.
OUTPUT_DIR = PROJECT_ROOT / "data" / "meld_test"
SAMPLE_RATE = 16000  # 16kHz mono — matches our pipeline input

# 8 test scenarios — each maps to a specific MELD dialogue
#
# Keys of every entry:
#   tag             — stem of the output WAV file name and key in ground_truth.json
#   desc            — free-text description, copied into the ground truth
#   scenario        — short scenario summary (ground truth + README table)
#   primary_emotion — intended dominant emotion label for the set
#   split           — dataset split ("train" here); combined with dia_id to look
#                     up the dialogue rows and to build the clip paths
#   dia_id          — dialogue ID, kept as a string because it is compared with
#                     the raw CSV field and interpolated into file names
#
# NOTE(review): desc / primary_emotion are hand-written and are not validated
# against the CSV labels anywhere in this script — confirm when changing IDs.
TEST_SETS = [
    {
        "tag": "01_angry_fight",
        "desc": "Ross-Rachel breakup fight — anger dominant (S3E15)",
        "scenario": "Couple in a heated argument",
        "primary_emotion": "anger",
        "split": "train",
        "dia_id": "51",
    },
    {
        "tag": "02_happy_loving",
        "desc": "Monica-Chandler sweet moment — joy dominant (S5E14)",
        "scenario": "Couple being affectionate and playful",
        "primary_emotion": "joy",
        "split": "train",
        "dia_id": "1026",
    },
    {
        "tag": "03_sad_emotional",
        "desc": "Ross-Rachel emotional confession — sadness dominant (S3E25)",
        "scenario": "Emotional conversation with sadness and regret",
        "primary_emotion": "sadness",
        "split": "train",
        "dia_id": "312",
    },
    {
        "tag": "04_surprise_shock",
        "desc": "Ross-Rachel surprise revelations (S7E18)",
        "scenario": "Unexpected news and reactions",
        "primary_emotion": "surprise",
        "split": "train",
        "dia_id": "747",
    },
    {
        "tag": "05_fear_anxiety",
        "desc": "Monica-Chandler anxious situation — fear+mixed (S4E14)",
        "scenario": "Anxious and worried conversation",
        "primary_emotion": "fear",
        "split": "train",
        "dia_id": "109",
    },
    {
        "tag": "06_disgust_annoyance",
        "desc": "Family annoyance scene — disgust+anger (S6E9)",
        "scenario": "Annoyed and disgusted reactions",
        "primary_emotion": "disgust",
        "split": "train",
        "dia_id": "1025",
    },
    {
        "tag": "07_bittersweet",
        "desc": "Ross-Rachel bittersweet farewell — sadness+surprise (S5E5)",
        "scenario": "Mixed emotions: saying goodbye with conflicting feelings",
        "primary_emotion": "sadness",
        "split": "train",
        "dia_id": "676",
    },
    {
        "tag": "08_calm_daily",
        "desc": "Casual daily conversation — neutral baseline (S3E23)",
        "scenario": "Normal everyday chitchat (baseline)",
        "primary_emotion": "neutral",
        "split": "train",
        "dia_id": "450",
    },
]


def load_csv_from_zip(zip_path: Path) -> dict[str, list[dict]]:
    """Load the MELD utterance CSVs from the zip, grouped by dialogue.

    Args:
        zip_path: Archive that contains the per-split
            ``*_sent_emo_cleaned_processed.csv`` files.

    Returns:
        Mapping of ``"<split>_<Dialogue_ID>"`` to the CSV rows (as dicts) of
        that dialogue, in file order. A split whose CSV is missing from the
        archive, or whose CSV has no ``Dialogue_ID`` column, is skipped with
        a warning on stdout.
    """
    import io
    import zipfile

    csv_files = [
        ("train", "JSON files/JSON files/CSV Processed/train_sent_emo_cleaned_processed.csv"),
        ("dev", "JSON files/JSON files/CSV Processed/dev_sent_emo_cleaned_processed.csv"),
        ("test", "JSON files/JSON files/CSV Processed/test_sent_emo_cleaned_processed.csv"),
    ]

    dialogues: dict[str, list[dict]] = {}
    with zipfile.ZipFile(zip_path, "r") as zf:
        for split, csv_path in csv_files:
            # Guard only the member lookup: ZipFile.open raises KeyError for a
            # missing member, and a wider try block would also swallow a
            # KeyError caused by a missing column and mislabel it.
            try:
                raw = zf.open(csv_path)
            except KeyError:
                print(f"  Warning: {csv_path} not found in zip")
                continue

            # newline="" lets the csv module do its own newline handling, so
            # line breaks embedded in quoted fields are preserved.
            with raw, io.TextIOWrapper(raw, encoding="utf-8", newline="") as text:
                reader = csv.DictReader(text)
                if "Dialogue_ID" not in (reader.fieldnames or ()):
                    print(f"  Warning: {csv_path} has no Dialogue_ID column, skipping")
                    continue
                for row in reader:
                    key = f"{split}_{row['Dialogue_ID']}"
                    dialogues.setdefault(key, []).append(row)
    return dialogues


def find_mp4_path(split: str, dia_id: str, utt_id: str, available_files: set) -> str | None:
    """Find MP4 file path for a specific utterance."""
    patterns = [
        f"MELD.Raw/MELD.Raw/{split}/{split}_splits/dia{dia_id}_utt{utt_id}.mp4",
        f"MELD.Raw/MELD.Raw/{split}/{split}_splits_complete/dia{dia_id}_utt{utt_id}.mp4",
        f"MELD.Raw/MELD.Raw/{split}/output_repeated_splits_{split}/final_videos_{split}dia{dia_id}_utt{utt_id}.mp4",
    ]
    for p in patterns:
        if p in available_files:
            return p
    return None


def get_mp4_list_from_zip(zip_path: Path) -> set:
    """Collect the names of all archive members that end in ``.mp4``.

    Args:
        zip_path: Archive to scan; only its member listing is read.

    Returns:
        Set of member paths whose name ends with the exact suffix ``.mp4``.
    """
    import zipfile

    with zipfile.ZipFile(zip_path, "r") as archive:
        member_names = archive.namelist()

    mp4_names = set()
    for member in member_names:
        if member.endswith(".mp4"):
            mp4_names.add(member)
    return mp4_names


def _ffconcat_entry(path: Path) -> str:
    """Return one ``file '...'`` line for an ffmpeg concat list file."""
    # Inside single quotes ffmpeg takes every character literally, so an
    # embedded quote has to close the quoted run, emit \' and reopen it.
    escaped = str(path).replace("'", "'\\''")
    return f"file '{escaped}'\n"


def _wav_duration_sec(wav_path: Path) -> float:
    """Return the duration of a WAV file in seconds, or 0.0 if unknown."""
    import wave

    try:
        probe = subprocess.run(
            ["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
             "-of", "default=noprint_wrappers=1:nokey=1", str(wav_path)],
            capture_output=True, text=True,
        )
        return float(probe.stdout.strip())
    except (OSError, ValueError):
        # ffprobe is not installed or printed nothing parseable; the file may
        # still be fine, so fall back to reading the WAV header directly.
        pass

    try:
        with wave.open(str(wav_path), "rb") as wf:
            rate = wf.getframerate()
            return wf.getnframes() / rate if rate else 0.0
    except (wave.Error, OSError, EOFError):
        return 0.0


def extract_and_concat_wav(
    zip_path: Path, mp4_paths: list[str], output_wav: Path, sample_rate: int = 16000
) -> float:
    """Extract audio from MP4s in zip and concatenate into single WAV.

    Each clip is copied out of the archive into a temporary directory,
    converted by ``ffmpeg`` to mono 16-bit PCM at ``sample_rate``, and the
    resulting parts are joined in the given order with ffmpeg's concat
    demuxer.

    Args:
        zip_path: Archive that holds the MP4 clips.
        mp4_paths: Member paths inside the archive, in playback order.
        output_wav: Destination file; its parent directory is created.
        sample_rate: Output sample rate in Hz.

    Returns:
        Duration of the written WAV in seconds. 0.0 when no clip could be
        converted, when concatenation failed, or when the duration could not
        be determined. Clips that ffmpeg cannot convert are skipped with a
        warning.
    """
    import shutil
    import zipfile

    pcm_args = ["-ar", str(sample_rate), "-ac", "1", "-acodec", "pcm_s16le"]

    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        wav_parts = []

        # Extract each MP4 and convert to WAV
        with zipfile.ZipFile(zip_path, "r") as zf:
            for i, mp4_path in enumerate(mp4_paths):
                mp4_local = tmpdir / f"part_{i:03d}.mp4"
                wav_local = tmpdir / f"part_{i:03d}.wav"

                # Stream the member to disk instead of loading it whole.
                with zf.open(mp4_path) as src, open(mp4_local, "wb") as dst:
                    shutil.copyfileobj(src, dst)

                result = subprocess.run(
                    ["ffmpeg", "-y", "-i", str(mp4_local), *pcm_args, str(wav_local)],
                    capture_output=True,
                    text=True,
                )
                if result.returncode != 0:
                    # ffmpeg prints its banner first and the error last, so
                    # the tail of stderr is the informative part.
                    tail = result.stderr.strip()[-200:]
                    print(f"    Warning: ffmpeg failed for {mp4_path}: {tail}")
                    continue

                if wav_local.exists() and wav_local.stat().st_size > 0:
                    wav_parts.append(wav_local)

        if not wav_parts:
            return 0.0

        # Concatenate WAVs using ffmpeg concat
        list_file = tmpdir / "concat_list.txt"
        with open(list_file, "w", encoding="utf-8") as f:
            f.writelines(_ffconcat_entry(part) for part in wav_parts)

        output_wav.parent.mkdir(parents=True, exist_ok=True)
        result = subprocess.run(
            [
                "ffmpeg", "-y", "-f", "concat", "-safe", "0",
                "-i", str(list_file),
                *pcm_args,
                str(output_wav),
            ],
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            print(f"    Concat failed: {result.stderr.strip()[-300:]}")
            return 0.0

        return _wav_duration_sec(output_wav)


def main():
    """Build all configured test sets and write their metadata.

    Steps: load the label CSVs from the zip, list the MP4 clips in the zip,
    build one concatenated WAV per entry of ``TEST_SETS``, then write
    ``ground_truth.json`` and ``README.md`` into ``OUTPUT_DIR``.

    A test set is skipped (with a message) when its dialogue is not in the
    CSV data, when none of its clips are in the archive, or when audio
    extraction reports a duration of 0.0, which is the value
    ``extract_and_concat_wav`` returns on failure. Skipped sets do not appear
    in the ground truth or the README table.

    Exits with status 1 if the input zip does not exist.
    """
    print("=" * 60)
    print("  MELD English Test Set Builder")
    print("=" * 60)

    if not ZIP_PATH.exists():
        print(f"Error: {ZIP_PATH} not found")
        sys.exit(1)

    # 1. Load CSV data
    print("\n[1/4] Loading CSV data from zip...")
    dialogues = load_csv_from_zip(ZIP_PATH)
    print(f"  Loaded {len(dialogues)} dialogues")

    # 2. Get available MP4 files
    print("[2/4] Scanning MP4 files in zip...")
    mp4_files = get_mp4_list_from_zip(ZIP_PATH)
    print(f"  Found {len(mp4_files)} MP4 files")

    # 3. Process each test set
    print("[3/4] Building test sets...\n")
    ground_truth = {}
    summary_lines = []

    for ts in TEST_SETS:
        tag = ts["tag"]
        key = f"{ts['split']}_{ts['dia_id']}"
        utts = dialogues.get(key, [])

        if not utts:
            print(f"  ❌ {tag}: dialogue {key} not found")
            continue

        print(f"  📦 {tag} — {ts['desc']}")
        print(f"     {len(utts)} utterances", end="")

        # Find MP4 paths
        candidates = (
            find_mp4_path(ts["split"], ts["dia_id"], u["Utterance_ID"], mp4_files)
            for u in utts
        )
        mp4_paths = [p for p in candidates if p]

        print(f", {len(mp4_paths)}/{len(utts)} MP4s found")

        if not mp4_paths:
            print("     ❌ No MP4 files found, skipping")
            continue

        missing = len(utts) - len(mp4_paths)
        if missing:
            # The ground truth below lists every utterance, so make the
            # mismatch with the audio visible instead of silent.
            print(f"     Warning: {missing} utterance(s) have no clip and are absent from the audio")

        # Extract and concatenate
        output_wav = OUTPUT_DIR / f"{tag}.wav"
        duration = extract_and_concat_wav(ZIP_PATH, mp4_paths, output_wav, SAMPLE_RATE)
        if duration <= 0:
            # 0.0 is the failure value of extract_and_concat_wav; do not
            # report success or publish labels for audio that was not built.
            print(f"     ❌ {output_wav.name} could not be built, skipping")
            continue
        print(f"     ✅ {output_wav.name} — {duration:.1f}s")

        # Build ground truth
        emo_counts = Counter(u["Emotion"] for u in utts)
        ground_truth[tag] = {
            "description": ts["desc"],
            "scenario": ts["scenario"],
            "primary_emotion": ts["primary_emotion"],
            "source": f"MELD Friends S{utts[0]['Season']}E{utts[0]['Episode']} Dialogue {ts['dia_id']}",
            "duration_sec": round(duration, 1),
            "emotion_distribution": dict(emo_counts),
            "total_utterances": len(utts),
            "utterances": [
                {
                    "speaker": u["Speaker"],
                    "emotion": u["Emotion"],
                    "sentiment": u["Sentiment"],
                    "text": u["Utterance"],
                }
                for u in utts
            ],
        }

        summary_lines.append(
            f"| {tag} | {ts['scenario'][:40]} | {ts['primary_emotion']} | {duration:.1f}s | {len(utts)} utts | {dict(emo_counts)} |"
        )

    # 4. Save ground truth + README
    print("\n[4/4] Saving metadata...")

    # The extraction step only creates this directory when it writes a WAV;
    # create it here so the metadata can be saved even if every set failed.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    gt_path = OUTPUT_DIR / "ground_truth.json"
    with open(gt_path, "w", encoding="utf-8") as f:
        json.dump(ground_truth, f, indent=2, ensure_ascii=False)
    print(f"  ✅ {gt_path}")

    # Emotion alignment check
    our_labels = {"neutral", "joy", "sadness", "anger", "surprise", "fear", "disgust"}
    meld_labels = set()
    for gt in ground_truth.values():
        meld_labels.update(gt["emotion_distribution"].keys())

    readme_content = f"""# MELD English Test Sets

## Emotion Label Alignment

| UsTwo Pipeline (EN) | MELD Label | Match |
|---|---|---|
| neutral | neutral | ✅ Exact |
| joy | joy | ✅ Exact |
| sadness | sadness | ✅ Exact |
| anger | anger | ✅ Exact |
| surprise | surprise | ✅ Exact |
| fear | fear | ✅ Exact |
| disgust | disgust | ✅ Exact |

**7/7 labels match exactly.** No mapping needed.

## Test Sets

| File | Scenario | Primary Emotion | Duration | Utterances | Emotion Distribution |
|---|---|---|---|---|---|
{chr(10).join(summary_lines)}

## Source
- Dataset: MELD (Multimodal EmotionLines Dataset)
- Source: Friends TV series
- Paper: Poria et al., ACL 2019
- Each WAV is a full dialogue concatenated from per-utterance MP4 clips
- Audio: 16kHz mono PCM (matches pipeline input format)

## Usage
```bash
# Run pipeline on a single test set
python scripts/run_pipeline.py data/meld_test/01_angry_fight.wav

# Evaluate all test sets
python scripts/evaluate_meld_test.py
```
"""
    readme_path = OUTPUT_DIR / "README.md"
    with open(readme_path, "w", encoding="utf-8") as f:
        f.write(readme_content)
    print(f"  ✅ {readme_path}")

    # Summary
    print("\n" + "=" * 60)
    print("  DONE")
    print("=" * 60)
    total_files = len(list(OUTPUT_DIR.glob("*.wav")))
    print(f"  {total_files} WAV files in {OUTPUT_DIR}")
    print(f"  Ground truth: {gt_path}")
    print(f"  Emotion alignment: {len(our_labels & meld_labels)}/{len(our_labels)} exact match")
    unexpected = meld_labels - our_labels
    if unexpected:
        # Labels the pipeline has no counterpart for would otherwise go
        # unnoticed, because the count above only looks at the intersection.
        print(f"  Warning: labels outside the pipeline set: {sorted(unexpected)}")

if __name__ == "__main__":
    main()