#!/usr/bin/env python3 """ Build English test sets from MELD (Friends) dataset. Extracts 8 scenario-based test sets from MELD MP4 clips, converts to WAV, and concatenates into single audio files that simulate real phone calls for E2E pipeline testing. Usage: python scripts/build_meld_test_sets.py Output: data/meld_test/ ├── 01_angry_fight.wav ├── 02_happy_loving.wav ├── ... ├── 08_calm_daily.wav ├── ground_truth.json # per-utterance emotion labels └── README.md # test set descriptions """ import csv import json import os import subprocess import sys import tempfile from collections import Counter from pathlib import Path # --- Configuration --- PROJECT_ROOT = Path(__file__).resolve().parent.parent ZIP_PATH = PROJECT_ROOT / "data" / "english_test.zip" OUTPUT_DIR = PROJECT_ROOT / "data" / "meld_test" SAMPLE_RATE = 16000 # 16kHz mono — matches our pipeline input # 8 test scenarios — each maps to a specific MELD dialogue TEST_SETS = [ { "tag": "01_angry_fight", "desc": "Ross-Rachel breakup fight — anger dominant (S3E15)", "scenario": "Couple in a heated argument", "primary_emotion": "anger", "split": "train", "dia_id": "51", }, { "tag": "02_happy_loving", "desc": "Monica-Chandler sweet moment — joy dominant (S5E14)", "scenario": "Couple being affectionate and playful", "primary_emotion": "joy", "split": "train", "dia_id": "1026", }, { "tag": "03_sad_emotional", "desc": "Ross-Rachel emotional confession — sadness dominant (S3E25)", "scenario": "Emotional conversation with sadness and regret", "primary_emotion": "sadness", "split": "train", "dia_id": "312", }, { "tag": "04_surprise_shock", "desc": "Ross-Rachel surprise revelations (S7E18)", "scenario": "Unexpected news and reactions", "primary_emotion": "surprise", "split": "train", "dia_id": "747", }, { "tag": "05_fear_anxiety", "desc": "Monica-Chandler anxious situation — fear+mixed (S4E14)", "scenario": "Anxious and worried conversation", "primary_emotion": "fear", "split": "train", "dia_id": "109", }, { "tag": "06_disgust_annoyance", "desc": "Family annoyance scene — disgust+anger (S6E9)", "scenario": "Annoyed and disgusted reactions", "primary_emotion": "disgust", "split": "train", "dia_id": "1025", }, { "tag": "07_bittersweet", "desc": "Ross-Rachel bittersweet farewell — sadness+surprise (S5E5)", "scenario": "Mixed emotions: saying goodbye with conflicting feelings", "primary_emotion": "sadness", "split": "train", "dia_id": "676", }, { "tag": "08_calm_daily", "desc": "Casual daily conversation — neutral baseline (S3E23)", "scenario": "Normal everyday chitchat (baseline)", "primary_emotion": "neutral", "split": "train", "dia_id": "450", }, ] def load_csv_from_zip(zip_path: Path) -> dict[str, list[dict]]: """Load all CSV data from zip, grouped by split_diaID.""" import zipfile dialogues = {} with zipfile.ZipFile(zip_path, "r") as zf: csv_files = [ ("train", "JSON files/JSON files/CSV Processed/train_sent_emo_cleaned_processed.csv"), ("dev", "JSON files/JSON files/CSV Processed/dev_sent_emo_cleaned_processed.csv"), ("test", "JSON files/JSON files/CSV Processed/test_sent_emo_cleaned_processed.csv"), ] for split, csv_path in csv_files: try: with zf.open(csv_path) as f: import io reader = csv.DictReader(io.TextIOWrapper(f, encoding="utf-8")) for row in reader: key = f"{split}_{row['Dialogue_ID']}" dialogues.setdefault(key, []).append(row) except KeyError: print(f" Warning: {csv_path} not found in zip") return dialogues def find_mp4_path(split: str, dia_id: str, utt_id: str, available_files: set) -> str | None: """Find MP4 file path for a specific utterance.""" patterns = [ f"MELD.Raw/MELD.Raw/{split}/{split}_splits/dia{dia_id}_utt{utt_id}.mp4", f"MELD.Raw/MELD.Raw/{split}/{split}_splits_complete/dia{dia_id}_utt{utt_id}.mp4", f"MELD.Raw/MELD.Raw/{split}/output_repeated_splits_{split}/final_videos_{split}dia{dia_id}_utt{utt_id}.mp4", ] for p in patterns: if p in available_files: return p return None def get_mp4_list_from_zip(zip_path: Path) -> set: """Get set of all MP4 file paths in zip.""" import zipfile with zipfile.ZipFile(zip_path, "r") as zf: return {n for n in zf.namelist() if n.endswith(".mp4")} def extract_and_concat_wav( zip_path: Path, mp4_paths: list[str], output_wav: Path, sample_rate: int = 16000 ) -> float: """Extract audio from MP4s in zip and concatenate into single WAV.""" with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) wav_parts = [] # Extract each MP4 and convert to WAV import zipfile with zipfile.ZipFile(zip_path, "r") as zf: for i, mp4_path in enumerate(mp4_paths): mp4_local = tmpdir / f"part_{i:03d}.mp4" wav_local = tmpdir / f"part_{i:03d}.wav" # Extract MP4 with zf.open(mp4_path) as src, open(mp4_local, "wb") as dst: dst.write(src.read()) # Convert to WAV (16kHz mono) result = subprocess.run( [ "ffmpeg", "-y", "-i", str(mp4_local), "-ar", str(sample_rate), "-ac", "1", "-acodec", "pcm_s16le", str(wav_local), ], capture_output=True, text=True, ) if result.returncode != 0: print(f" Warning: ffmpeg failed for {mp4_path}: {result.stderr[:200]}") continue if wav_local.exists() and wav_local.stat().st_size > 0: wav_parts.append(wav_local) if not wav_parts: return 0.0 # Concatenate WAVs using ffmpeg concat list_file = tmpdir / "concat_list.txt" with open(list_file, "w") as f: for wp in wav_parts: f.write(f"file '{wp}'\n") output_wav.parent.mkdir(parents=True, exist_ok=True) result = subprocess.run( [ "ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", str(list_file), "-ar", str(sample_rate), "-ac", "1", "-acodec", "pcm_s16le", str(output_wav), ], capture_output=True, text=True, ) if result.returncode != 0: print(f" Concat failed: {result.stderr[:300]}") return 0.0 # Get duration probe = subprocess.run( ["ffprobe", "-v", "quiet", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", str(output_wav)], capture_output=True, text=True, ) try: return float(probe.stdout.strip()) except ValueError: return 0.0 def main(): print("=" * 60) print(" MELD English Test Set Builder") print("=" * 60) if not ZIP_PATH.exists(): print(f"Error: {ZIP_PATH} not found") sys.exit(1) # 1. Load CSV data print("\n[1/4] Loading CSV data from zip...") dialogues = load_csv_from_zip(ZIP_PATH) print(f" Loaded {len(dialogues)} dialogues") # 2. Get available MP4 files print("[2/4] Scanning MP4 files in zip...") mp4_files = get_mp4_list_from_zip(ZIP_PATH) print(f" Found {len(mp4_files)} MP4 files") # 3. Process each test set print("[3/4] Building test sets...\n") ground_truth = {} summary_lines = [] for ts in TEST_SETS: tag = ts["tag"] key = f"{ts['split']}_{ts['dia_id']}" utts = dialogues.get(key, []) if not utts: print(f" ❌ {tag}: dialogue {key} not found") continue print(f" 📦 {tag} — {ts['desc']}") print(f" {len(utts)} utterances", end="") # Find MP4 paths mp4_paths = [] for u in utts: p = find_mp4_path(ts["split"], ts["dia_id"], u["Utterance_ID"], mp4_files) if p: mp4_paths.append(p) print(f", {len(mp4_paths)}/{len(utts)} MP4s found") if not mp4_paths: print(f" ❌ No MP4 files found, skipping") continue # Extract and concatenate output_wav = OUTPUT_DIR / f"{tag}.wav" duration = extract_and_concat_wav(ZIP_PATH, mp4_paths, output_wav, SAMPLE_RATE) print(f" ✅ {output_wav.name} — {duration:.1f}s") # Build ground truth emo_counts = Counter(u["Emotion"] for u in utts) ground_truth[tag] = { "description": ts["desc"], "scenario": ts["scenario"], "primary_emotion": ts["primary_emotion"], "source": f"MELD Friends S{utts[0]['Season']}E{utts[0]['Episode']} Dialogue {ts['dia_id']}", "duration_sec": round(duration, 1), "emotion_distribution": dict(emo_counts), "total_utterances": len(utts), "utterances": [ { "speaker": u["Speaker"], "emotion": u["Emotion"], "sentiment": u["Sentiment"], "text": u["Utterance"], } for u in utts ], } summary_lines.append( f"| {tag} | {ts['scenario'][:40]} | {ts['primary_emotion']} | {duration:.1f}s | {len(utts)} utts | {dict(emo_counts)} |" ) # 4. Save ground truth + README print("\n[4/4] Saving metadata...") gt_path = OUTPUT_DIR / "ground_truth.json" with open(gt_path, "w", encoding="utf-8") as f: json.dump(ground_truth, f, indent=2, ensure_ascii=False) print(f" ✅ {gt_path}") # Emotion alignment check our_labels = {"neutral", "joy", "sadness", "anger", "surprise", "fear", "disgust"} meld_labels = set() for gt in ground_truth.values(): meld_labels.update(gt["emotion_distribution"].keys()) readme_content = f"""# MELD English Test Sets ## Emotion Label Alignment | UsTwo Pipeline (EN) | MELD Label | Match | |---|---|---| | neutral | neutral | ✅ Exact | | joy | joy | ✅ Exact | | sadness | sadness | ✅ Exact | | anger | anger | ✅ Exact | | surprise | surprise | ✅ Exact | | fear | fear | ✅ Exact | | disgust | disgust | ✅ Exact | **7/7 labels match exactly.** No mapping needed. ## Test Sets | File | Scenario | Primary Emotion | Duration | Utterances | Emotion Distribution | |---|---|---|---|---|---| {chr(10).join(summary_lines)} ## Source - Dataset: MELD (Multimodal EmotionLines Dataset) - Source: Friends TV series - Paper: Poria et al., ACL 2019 - Each WAV is a full dialogue concatenated from per-utterance MP4 clips - Audio: 16kHz mono PCM (matches pipeline input format) ## Usage ```bash # Run pipeline on a single test set python scripts/run_pipeline.py data/meld_test/01_angry_fight.wav # Evaluate all test sets python scripts/evaluate_meld_test.py ``` """ readme_path = OUTPUT_DIR / "README.md" with open(readme_path, "w", encoding="utf-8") as f: f.write(readme_content) print(f" ✅ {readme_path}") # Summary print("\n" + "=" * 60) print(" DONE") print("=" * 60) total_files = len(list(OUTPUT_DIR.glob("*.wav"))) print(f" {total_files} WAV files in {OUTPUT_DIR}") print(f" Ground truth: {gt_path}") print(f" Emotion alignment: {len(our_labels & meld_labels)}/{len(our_labels)} exact match") if __name__ == "__main__": main()