#!/usr/bin/env python3
"""Prepare MELD test split for English fusion grid search.

Extracts mp4 → wav (16kHz mono) and builds a manifest with text + emotion labels.

Usage:
    python scripts/prepare_meld_fusion_data.py
"""
from __future__ import annotations

import csv
import io
import json
import logging
import subprocess
import tempfile
import zipfile
from collections import Counter
from pathlib import Path

# Configure logging for the script run; the module logger below uses it.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Emotion label names emitted into the manifest.
PROJECT_LABELS = [
    "neutral",
    "joy",
    "sadness",
    "anger",
    "surprise",
    "fear",
    "disgust",
]

# MELD emotion names coincide 1:1 with the project labels, so the mapping
# from MELD name to project label is the identity over PROJECT_LABELS.
MELD_LABEL_MAP = {emotion: emotion for emotion in PROJECT_LABELS}


def _index_test_clips(zf: zipfile.ZipFile) -> dict:
    """Map ``(dialogue_id, utterance_id)`` to the zip member of each test clip.

    Only members whose path contains ``output_repeated_splits_test`` and whose
    file name parses as ``dia{D}_utt{U}.mp4`` are indexed. macOS ``._*``
    metadata entries and names that do not parse are ignored.
    """
    clips = {}
    for name in zf.namelist():
        if "output_repeated_splits_test" not in name or not name.endswith(".mp4"):
            continue
        fname = Path(name).name
        if fname.startswith("._"):
            continue  # skip macOS metadata
        # Parse dia{D}_utt{U}.mp4
        try:
            parts = fname.replace(".mp4", "").split("_")
            dia_id = int(parts[0].replace("dia", ""))
            utt_id = int(parts[1].replace("utt", ""))
        except (ValueError, IndexError):
            continue
        clips[(dia_id, utt_id)] = name
    return clips


def _extract_wav(zf: zipfile.ZipFile, member: str, wav_path: Path) -> bool:
    """Convert one mp4 zip member into a 16kHz mono wav at ``wav_path``.

    ffmpeg writes to a sibling ``*.part`` file that is renamed onto
    ``wav_path`` only after ffmpeg exits successfully. A failed or timed-out
    conversion therefore never leaves a file at ``wav_path`` that a later run
    would mistake for a finished extraction.

    Returns:
        True when the wav was produced, False when ffmpeg exited non-zero.

    Raises:
        Whatever ``zf.read`` or ``subprocess.run`` raise (for example
        ``subprocess.TimeoutExpired``). Temporary files are removed in
        every case.
    """
    partial_path = wav_path.with_name(wav_path.name + ".part")
    try:
        # The temporary directory (and the mp4 inside it) is removed when the
        # ``with`` block exits, including on timeout or any other exception.
        with tempfile.TemporaryDirectory() as tmp_dir:
            mp4_path = Path(tmp_dir) / "clip.mp4"
            mp4_path.write_bytes(zf.read(member))

            # "-f wav" is given explicitly, so the ".part" output name does
            # not affect the container format ffmpeg writes.
            result = subprocess.run(
                ["ffmpeg", "-y", "-i", str(mp4_path),
                 "-ar", "16000", "-ac", "1", "-f", "wav",
                 str(partial_path)],
                capture_output=True, timeout=30,
            )

        if result.returncode != 0:
            stderr_lines = result.stderr.decode("utf-8", errors="replace").strip().splitlines()
            reason = stderr_lines[-1] if stderr_lines else "no stderr output"
            logger.warning("ffmpeg failed for %s (exit %d): %s", member, result.returncode, reason)
            return False

        partial_path.replace(wav_path)
        return True
    finally:
        # No-op after a successful rename; removes leftovers otherwise.
        partial_path.unlink(missing_ok=True)


def _prepare_sample(zf: zipfile.ZipFile, key: tuple, row: dict, mp4_name: str, audio_dir: Path):
    """Build the manifest entry for one utterance, extracting its wav if needed.

    Returns the entry dict, or ``None`` when the utterance is skipped because
    its emotion is not in ``MELD_LABEL_MAP``, its text is empty, or the audio
    extraction failed.
    """
    dia_id, utt_id = key

    label = MELD_LABEL_MAP.get(row["Emotion"])
    if label is None:
        logger.warning("Skipping dia%d_utt%d: unmapped emotion %r", dia_id, utt_id, row["Emotion"])
        return None

    text = row["Utterance"].strip()
    if not text:
        logger.warning("Skipping dia%d_utt%d: empty utterance text", dia_id, utt_id)
        return None

    wav_path = audio_dir / f"dia{dia_id}_utt{utt_id}.wav"

    # An existing wav is reused so the script can be re-run without
    # repeating the conversion.
    if not wav_path.exists():
        try:
            converted = _extract_wav(zf, mp4_name, wav_path)
        except Exception as e:
            # Deliberately broad: one unreadable or slow clip must not abort
            # the whole run; it is logged and counted as skipped instead.
            logger.warning("Failed dia%d_utt%d: %s", dia_id, utt_id, e)
            return None
        if not converted:
            return None

    return {
        "path": str(wav_path),
        "text": text,
        "label": label,
        "source": "meld_test",
        "dialogue_id": dia_id,
        "utterance_id": utt_id,
    }


def main() -> None:
    """Extract MELD test audio and write the fusion manifest.

    Reads ``data/english_test.zip``, matches rows of the test CSV against the
    test mp4 clips by ``(Dialogue_ID, Utterance_ID)``, converts each matched
    clip to a 16kHz mono wav under ``data/meld_fusion/audio`` and writes
    ``data/meld_fusion/manifest.json`` with one entry per usable utterance.
    Finally prints the per-label sample counts, most frequent first.
    """
    zip_path = Path("data/english_test.zip")
    output_dir = Path("data/meld_fusion")
    audio_dir = output_dir / "audio"
    audio_dir.mkdir(parents=True, exist_ok=True)

    manifest = []
    skipped = 0

    # The context manager closes the archive even if a step below raises.
    with zipfile.ZipFile(zip_path) as zf:
        # Step 1: Parse test CSV
        logger.info("Parsing MELD test CSV...")
        with zf.open("MELD.Raw/MELD.Raw/test_sent_emo.csv") as f:
            reader = csv.DictReader(io.TextIOWrapper(f, encoding="utf-8"))
            rows = list(reader)
        logger.info("MELD test: %d utterances", len(rows))

        # Build lookup: (dia_id, utt_id) → row
        csv_lookup = {
            (int(r["Dialogue_ID"]), int(r["Utterance_ID"])): r
            for r in rows
        }

        # Step 2: Find mp4 files in zip
        test_mp4s = _index_test_clips(zf)
        logger.info("Found %d test mp4 files (excluding macOS metadata)", len(test_mp4s))

        # Step 3: Match CSV ↔ mp4, extract wav
        matched_keys = sorted(csv_lookup.keys() & test_mp4s.keys())
        total = len(matched_keys)
        logger.info("Matched CSV↔mp4: %d", total)

        for done, key in enumerate(matched_keys, start=1):
            entry = _prepare_sample(zf, key, csv_lookup[key], test_mp4s[key], audio_dir)
            if entry is None:
                skipped += 1
            else:
                manifest.append(entry)

            # Reported for every 200th item, whether it was kept or skipped.
            if done % 200 == 0:
                logger.info("Processed %d / %d", done, total)

    # Step 4: Save manifest
    manifest_path = output_dir / "manifest.json"
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)

    logger.info("Saved %d samples to %s (skipped %d)", len(manifest), manifest_path, skipped)

    # Stats
    emotions = Counter(s["label"] for s in manifest)
    for e, c in emotions.most_common():
        print(f"  {e}: {c}")


# Run the preparation only when this file is executed as a script, so that
# importing the module does not trigger it.
if __name__ == "__main__":
    main()