ustwo-api / scripts /build_meld_test_sets.py
asdfasdfqrqwer's picture
Deploy from GitHub 2026-04-23T03:56:31Z
c857b85
Raw
History Blame Contribute Delete
12.3 kB
#!/usr/bin/env python3
"""
Build English test sets from MELD (Friends) dataset.
Extracts 8 scenario-based test sets from MELD MP4 clips,
converts to WAV, and concatenates into single audio files
that simulate real phone calls for E2E pipeline testing.
Usage:
python scripts/build_meld_test_sets.py
Output:
data/meld_test/
├── 01_angry_fight.wav
├── 02_happy_loving.wav
├── ...
├── 08_calm_daily.wav
├── ground_truth.json # per-utterance emotion labels
└── README.md # test set descriptions
"""
import csv
import json
import os
import subprocess
import sys
import tempfile
from collections import Counter
from pathlib import Path
# --- Configuration ---
# Repository root: this script lives one directory below it (scripts/).
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Archive the annotation CSVs and MP4 clips are read from.
ZIP_PATH = PROJECT_ROOT / "data" / "english_test.zip"
# Directory the WAV files, ground_truth.json and README.md are written to.
OUTPUT_DIR = PROJECT_ROOT / "data" / "meld_test"
SAMPLE_RATE = 16000  # 16kHz mono — matches our pipeline input

# 8 test scenarios — each maps to a specific MELD dialogue.
#   tag             output file stem and key in ground_truth.json
#   desc            description stored in the ground truth and printed
#   scenario        short scenario text (truncated to 40 chars in the README)
#   primary_emotion label the scenario is meant to exercise
#   split / dia_id  combined as "<split>_<dia_id>" to look up the dialogue
TEST_SETS = [
    {
        "tag": "01_angry_fight",
        "desc": "Ross-Rachel breakup fight — anger dominant (S3E15)",
        "scenario": "Couple in a heated argument",
        "primary_emotion": "anger",
        "split": "train",
        "dia_id": "51",
    },
    {
        "tag": "02_happy_loving",
        "desc": "Monica-Chandler sweet moment — joy dominant (S5E14)",
        "scenario": "Couple being affectionate and playful",
        "primary_emotion": "joy",
        "split": "train",
        "dia_id": "1026",
    },
    {
        "tag": "03_sad_emotional",
        "desc": "Ross-Rachel emotional confession — sadness dominant (S3E25)",
        "scenario": "Emotional conversation with sadness and regret",
        "primary_emotion": "sadness",
        "split": "train",
        "dia_id": "312",
    },
    {
        "tag": "04_surprise_shock",
        "desc": "Ross-Rachel surprise revelations (S7E18)",
        "scenario": "Unexpected news and reactions",
        "primary_emotion": "surprise",
        "split": "train",
        "dia_id": "747",
    },
    {
        "tag": "05_fear_anxiety",
        "desc": "Monica-Chandler anxious situation — fear+mixed (S4E14)",
        "scenario": "Anxious and worried conversation",
        "primary_emotion": "fear",
        "split": "train",
        "dia_id": "109",
    },
    {
        "tag": "06_disgust_annoyance",
        "desc": "Family annoyance scene — disgust+anger (S6E9)",
        "scenario": "Annoyed and disgusted reactions",
        "primary_emotion": "disgust",
        "split": "train",
        "dia_id": "1025",
    },
    {
        "tag": "07_bittersweet",
        "desc": "Ross-Rachel bittersweet farewell — sadness+surprise (S5E5)",
        "scenario": "Mixed emotions: saying goodbye with conflicting feelings",
        "primary_emotion": "sadness",
        "split": "train",
        "dia_id": "676",
    },
    {
        "tag": "08_calm_daily",
        "desc": "Casual daily conversation — neutral baseline (S3E23)",
        "scenario": "Normal everyday chitchat (baseline)",
        "primary_emotion": "neutral",
        "split": "train",
        "dia_id": "450",
    },
]
def load_csv_from_zip(zip_path: Path) -> dict[str, list[dict]]:
    """Load the annotation CSVs from the zip, grouped by split and dialogue.

    The train/dev/test CSVs are read from fixed member paths inside the
    archive. A member that is absent, or that has no ``Dialogue_ID`` column,
    is skipped with a warning so the remaining splits are still loaded.

    Args:
        zip_path: Path to the dataset zip archive.

    Returns:
        Mapping of ``"<split>_<Dialogue_ID>"`` to that dialogue's rows
        (``csv.DictReader`` dicts) in file order.
    """
    import io
    import zipfile

    csv_files = [
        ("train", "JSON files/JSON files/CSV Processed/train_sent_emo_cleaned_processed.csv"),
        ("dev", "JSON files/JSON files/CSV Processed/dev_sent_emo_cleaned_processed.csv"),
        ("test", "JSON files/JSON files/CSV Processed/test_sent_emo_cleaned_processed.csv"),
    ]
    dialogues: dict[str, list[dict]] = {}
    with zipfile.ZipFile(zip_path, "r") as zf:
        for split, csv_path in csv_files:
            # Only the open() is guarded: ZipFile.open raises KeyError for a
            # missing member, and that is the one case reported as "not found".
            try:
                raw = zf.open(csv_path)
            except KeyError:
                print(f" Warning: {csv_path} not found in zip")
                continue
            # utf-8-sig drops a leading BOM so the first header name is clean;
            # newline="" lets the csv module handle newlines inside quoted
            # fields itself, as the csv documentation requires.
            with io.TextIOWrapper(raw, encoding="utf-8-sig", newline="") as text:
                reader = csv.DictReader(text)
                if "Dialogue_ID" not in (reader.fieldnames or []):
                    print(f" Warning: {csv_path} has no Dialogue_ID column, skipping")
                    continue
                for row in reader:
                    key = f"{split}_{row['Dialogue_ID']}"
                    dialogues.setdefault(key, []).append(row)
    return dialogues
def find_mp4_path(split: str, dia_id: str, utt_id: str, available_files: set) -> str | None:
"""Find MP4 file path for a specific utterance."""
patterns = [
f"MELD.Raw/MELD.Raw/{split}/{split}_splits/dia{dia_id}_utt{utt_id}.mp4",
f"MELD.Raw/MELD.Raw/{split}/{split}_splits_complete/dia{dia_id}_utt{utt_id}.mp4",
f"MELD.Raw/MELD.Raw/{split}/output_repeated_splits_{split}/final_videos_{split}dia{dia_id}_utt{utt_id}.mp4",
]
for p in patterns:
if p in available_files:
return p
return None
def get_mp4_list_from_zip(zip_path: Path) -> set:
    """Collect the names of all archive members ending in ``.mp4``.

    Args:
        zip_path: Path to the zip archive to scan.

    Returns:
        Set of member names (full paths inside the archive).
    """
    import zipfile

    mp4_names = set()
    with zipfile.ZipFile(zip_path, "r") as archive:
        for member in archive.namelist():
            if member.endswith(".mp4"):
                mp4_names.add(member)
    return mp4_names
def extract_and_concat_wav(
    zip_path: Path, mp4_paths: list[str], output_wav: Path, sample_rate: int = 16000
) -> float:
    """Extract audio from MP4s in a zip and concatenate it into a single WAV.

    Each clip is copied out of the archive into a temporary directory and
    converted with ffmpeg to mono 16-bit PCM at ``sample_rate``. The
    converted parts are then joined, in input order, with ffmpeg's concat
    demuxer. Clips that ffmpeg cannot convert are skipped with a warning.

    Args:
        zip_path: Zip archive containing the clips.
        mp4_paths: Archive member names of the clips, in playback order.
        output_wav: Destination WAV path; its parent directory is created
            once at least one clip has been converted.
        sample_rate: Output sample rate in Hz.

    Returns:
        Duration of ``output_wav`` in seconds as reported by ffprobe, or 0.0
        when no clip could be converted, the concat step failed, or the
        ffprobe output could not be parsed.
    """
    import shutil
    import zipfile

    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        wav_parts = []

        # Extract each MP4 and convert to WAV
        with zipfile.ZipFile(zip_path, "r") as zf:
            for i, mp4_path in enumerate(mp4_paths):
                mp4_local = tmpdir / f"part_{i:03d}.mp4"
                wav_local = tmpdir / f"part_{i:03d}.wav"

                # Stream the member to disk rather than holding the whole
                # clip in memory.
                with zf.open(mp4_path) as src, open(mp4_local, "wb") as dst:
                    shutil.copyfileobj(src, dst)

                # Convert to WAV (mono, 16-bit PCM). errors="replace" keeps a
                # non-UTF-8 byte in ffmpeg's stderr from raising while decoding.
                result = subprocess.run(
                    [
                        "ffmpeg", "-y", "-i", str(mp4_local),
                        "-ar", str(sample_rate),
                        "-ac", "1",
                        "-acodec", "pcm_s16le",
                        str(wav_local),
                    ],
                    capture_output=True,
                    text=True,
                    errors="replace",
                )
                if result.returncode != 0:
                    print(f" Warning: ffmpeg failed for {mp4_path}: {result.stderr[:200]}")
                    continue
                if wav_local.exists() and wav_local.stat().st_size > 0:
                    wav_parts.append(wav_local)

        if not wav_parts:
            return 0.0

        # Concatenate WAVs using ffmpeg concat
        list_file = tmpdir / "concat_list.txt"
        with open(list_file, "w", encoding="utf-8") as f:
            for wp in wav_parts:
                # Inside a single-quoted ffmpeg string a quote must be written
                # as '\'' (close, escaped quote, reopen).
                escaped = str(wp).replace("'", "'\\''")
                f.write(f"file '{escaped}'\n")

        output_wav.parent.mkdir(parents=True, exist_ok=True)
        result = subprocess.run(
            [
                "ffmpeg", "-y", "-f", "concat", "-safe", "0",
                "-i", str(list_file),
                "-ar", str(sample_rate),
                "-ac", "1",
                "-acodec", "pcm_s16le",
                str(output_wav),
            ],
            capture_output=True,
            text=True,
            errors="replace",
        )
        if result.returncode != 0:
            print(f" Concat failed: {result.stderr[:300]}")
            return 0.0

    # Get duration
    probe = subprocess.run(
        ["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1", str(output_wav)],
        capture_output=True, text=True,
    )
    try:
        return float(probe.stdout.strip())
    except ValueError:
        return 0.0
def main():
    """Build every configured test set and write its metadata.

    Steps:
        1. Load the annotation CSVs from ZIP_PATH, grouped by dialogue.
        2. List the MP4 clips available in the archive.
        3. For each entry of TEST_SETS, concatenate the dialogue's clips into
           OUTPUT_DIR/<tag>.wav and record per-utterance ground truth.
        4. Write ground_truth.json and README.md into OUTPUT_DIR.

    Exits with status 1 when ZIP_PATH does not exist. A test set whose
    dialogue, clips, or audio build is missing is reported and skipped; it
    then appears in neither the ground truth nor the README table.
    """
    print("=" * 60)
    print(" MELD English Test Set Builder")
    print("=" * 60)

    if not ZIP_PATH.exists():
        print(f"Error: {ZIP_PATH} not found")
        sys.exit(1)

    # 1. Load CSV data
    print("\n[1/4] Loading CSV data from zip...")
    dialogues = load_csv_from_zip(ZIP_PATH)
    print(f" Loaded {len(dialogues)} dialogues")

    # 2. Get available MP4 files
    print("[2/4] Scanning MP4 files in zip...")
    mp4_files = get_mp4_list_from_zip(ZIP_PATH)
    print(f" Found {len(mp4_files)} MP4 files")

    # 3. Process each test set
    print("[3/4] Building test sets...\n")
    ground_truth = {}
    summary_lines = []
    for ts in TEST_SETS:
        tag = ts["tag"]
        key = f"{ts['split']}_{ts['dia_id']}"
        utts = dialogues.get(key, [])
        if not utts:
            print(f" ❌ {tag}: dialogue {key} not found")
            continue
        print(f" 📦 {tag} — {ts['desc']}")
        print(f" {len(utts)} utterances", end="")

        # Find MP4 paths (utterances without a clip are left out of the audio)
        candidates = (
            find_mp4_path(ts["split"], ts["dia_id"], u["Utterance_ID"], mp4_files)
            for u in utts
        )
        mp4_paths = [p for p in candidates if p]
        print(f", {len(mp4_paths)}/{len(utts)} MP4s found")
        if not mp4_paths:
            print(" ❌ No MP4 files found, skipping")
            continue

        # Extract and concatenate
        output_wav = OUTPUT_DIR / f"{tag}.wav"
        duration = extract_and_concat_wav(ZIP_PATH, mp4_paths, output_wav, SAMPLE_RATE)
        if duration <= 0:
            # 0.0 is the helper's failure value; do not record ground truth
            # for audio that was not produced.
            print(f" ❌ {output_wav.name}: audio build failed, skipping")
            continue
        print(f" ✅ {output_wav.name} — {duration:.1f}s")

        # Build ground truth
        emo_counts = Counter(u["Emotion"] for u in utts)
        ground_truth[tag] = {
            "description": ts["desc"],
            "scenario": ts["scenario"],
            "primary_emotion": ts["primary_emotion"],
            "source": f"MELD Friends S{utts[0]['Season']}E{utts[0]['Episode']} Dialogue {ts['dia_id']}",
            "duration_sec": round(duration, 1),
            "emotion_distribution": dict(emo_counts),
            "total_utterances": len(utts),
            "utterances": [
                {
                    "speaker": u["Speaker"],
                    "emotion": u["Emotion"],
                    "sentiment": u["Sentiment"],
                    "text": u["Utterance"],
                }
                for u in utts
            ],
        }
        summary_lines.append(
            f"| {tag} | {ts['scenario'][:40]} | {ts['primary_emotion']} | {duration:.1f}s | {len(utts)} utts | {dict(emo_counts)} |"
        )

    # 4. Save ground truth + README
    print("\n[4/4] Saving metadata...")
    # The directory is otherwise only created when a WAV is written, so make
    # sure it exists even if every test set was skipped.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    gt_path = OUTPUT_DIR / "ground_truth.json"
    with open(gt_path, "w", encoding="utf-8") as f:
        json.dump(ground_truth, f, indent=2, ensure_ascii=False)
    print(f" ✅ {gt_path}")

    # Emotion alignment check
    our_labels = {"neutral", "joy", "sadness", "anger", "surprise", "fear", "disgust"}
    meld_labels = set()
    for gt in ground_truth.values():
        meld_labels.update(gt["emotion_distribution"].keys())

    table_rows = "\n".join(summary_lines)
    # Blank lines separate headings, tables and paragraphs so the Markdown
    # renders as distinct blocks.
    readme_content = f"""# MELD English Test Sets

## Emotion Label Alignment

| UsTwo Pipeline (EN) | MELD Label | Match |
|---|---|---|
| neutral | neutral | ✅ Exact |
| joy | joy | ✅ Exact |
| sadness | sadness | ✅ Exact |
| anger | anger | ✅ Exact |
| surprise | surprise | ✅ Exact |
| fear | fear | ✅ Exact |
| disgust | disgust | ✅ Exact |

**7/7 labels match exactly.** No mapping needed.

## Test Sets

| File | Scenario | Primary Emotion | Duration | Utterances | Emotion Distribution |
|---|---|---|---|---|---|
{table_rows}

## Source

- Dataset: MELD (Multimodal EmotionLines Dataset)
- Source: Friends TV series
- Paper: Poria et al., ACL 2019
- Each WAV is a full dialogue concatenated from per-utterance MP4 clips
- Audio: 16kHz mono PCM (matches pipeline input format)

## Usage

```bash
# Run pipeline on a single test set
python scripts/run_pipeline.py data/meld_test/01_angry_fight.wav
# Evaluate all test sets
python scripts/evaluate_meld_test.py
```
"""
    readme_path = OUTPUT_DIR / "README.md"
    with open(readme_path, "w", encoding="utf-8") as f:
        f.write(readme_content)
    print(f" ✅ {readme_path}")

    # Summary
    print("\n" + "=" * 60)
    print(" DONE")
    print("=" * 60)
    total_files = len(list(OUTPUT_DIR.glob("*.wav")))
    print(f" {total_files} WAV files in {OUTPUT_DIR}")
    print(f" Ground truth: {gt_path}")
    print(f" Emotion alignment: {len(our_labels & meld_labels)}/{len(our_labels)} exact match")


if __name__ == "__main__":
    main()