Spaces:

BBBAKERY
/

ustwo-api

Sleeping

App Files Files Community

ustwo-api / scripts /build_meld_test_sets.py

asdfasdfqrqwer

Deploy from GitHub 2026-04-23T03:56:31Z

c857b85 2 months ago

Raw

History Blame Contribute Delete

12.3 kB

	#!/usr/bin/env python3
	"""
	Build English test sets from MELD (Friends) dataset.

	Extracts 8 scenario-based test sets from MELD MP4 clips,
	converts to WAV, and concatenates into single audio files
	that simulate real phone calls for E2E pipeline testing.

	Usage:
	python scripts/build_meld_test_sets.py

	Output:
	data/meld_test/
	├── 01_angry_fight.wav
	├── 02_happy_loving.wav
	├── ...
	├── 08_calm_daily.wav
	├── ground_truth.json # per-utterance emotion labels
	└── README.md # test set descriptions
	"""

	import csv
	import json
	import os
	import subprocess
	import sys
	import tempfile
	from collections import Counter
	from pathlib import Path

	# --- Configuration ---
	# All locations are derived from this script's own path, two levels up.
	PROJECT_ROOT = Path(__file__).resolve().parent.parent
	# Source archive read by the builder.
	ZIP_PATH = PROJECT_ROOT.joinpath("data", "english_test.zip")
	# Directory that receives the WAV files, ground_truth.json and README.md.
	OUTPUT_DIR = PROJECT_ROOT.joinpath("data", "meld_test")
	# 16kHz mono — matches our pipeline input
	SAMPLE_RATE = 16_000

	# 8 test scenarios — each maps to a specific MELD dialogue.
	# Every entry carries the same six keys, in this order:
	#   tag, desc, scenario, primary_emotion, split, dia_id
	TEST_SETS = [
	    dict(
	        tag="01_angry_fight",
	        desc="Ross-Rachel breakup fight — anger dominant (S3E15)",
	        scenario="Couple in a heated argument",
	        primary_emotion="anger",
	        split="train",
	        dia_id="51",
	    ),
	    dict(
	        tag="02_happy_loving",
	        desc="Monica-Chandler sweet moment — joy dominant (S5E14)",
	        scenario="Couple being affectionate and playful",
	        primary_emotion="joy",
	        split="train",
	        dia_id="1026",
	    ),
	    dict(
	        tag="03_sad_emotional",
	        desc="Ross-Rachel emotional confession — sadness dominant (S3E25)",
	        scenario="Emotional conversation with sadness and regret",
	        primary_emotion="sadness",
	        split="train",
	        dia_id="312",
	    ),
	    dict(
	        tag="04_surprise_shock",
	        desc="Ross-Rachel surprise revelations (S7E18)",
	        scenario="Unexpected news and reactions",
	        primary_emotion="surprise",
	        split="train",
	        dia_id="747",
	    ),
	    dict(
	        tag="05_fear_anxiety",
	        desc="Monica-Chandler anxious situation — fear+mixed (S4E14)",
	        scenario="Anxious and worried conversation",
	        primary_emotion="fear",
	        split="train",
	        dia_id="109",
	    ),
	    dict(
	        tag="06_disgust_annoyance",
	        desc="Family annoyance scene — disgust+anger (S6E9)",
	        scenario="Annoyed and disgusted reactions",
	        primary_emotion="disgust",
	        split="train",
	        dia_id="1025",
	    ),
	    dict(
	        tag="07_bittersweet",
	        desc="Ross-Rachel bittersweet farewell — sadness+surprise (S5E5)",
	        scenario="Mixed emotions: saying goodbye with conflicting feelings",
	        primary_emotion="sadness",
	        split="train",
	        dia_id="676",
	    ),
	    dict(
	        tag="08_calm_daily",
	        desc="Casual daily conversation — neutral baseline (S3E23)",
	        scenario="Normal everyday chitchat (baseline)",
	        primary_emotion="neutral",
	        split="train",
	        dia_id="450",
	    ),
	]


	def load_csv_from_zip(zip_path: Path) -> dict[str, list[dict]]:
	    """Load the per-utterance CSV rows from the archive, grouped by dialogue.

	    The train, dev and test CSVs are read from their fixed locations inside
	    the zip. A CSV that is absent, or that has no ``Dialogue_ID`` column,
	    is reported with a warning and skipped; the other files are still read.

	    Args:
	        zip_path: Path to the zip archive containing the CSV files.

	    Returns:
	        A dict mapping ``"<split>_<Dialogue_ID>"`` to the list of CSV rows
	        (one dict per row, in file order) belonging to that dialogue.
	    """
	    import io
	    import zipfile

	    csv_dir = "JSON files/JSON files/CSV Processed"
	    dialogues = {}
	    with zipfile.ZipFile(zip_path, "r") as zf:
	        members = set(zf.namelist())
	        for split in ("train", "dev", "test"):
	            csv_path = f"{csv_dir}/{split}_sent_emo_cleaned_processed.csv"
	            # Test membership explicitly instead of catching KeyError around
	            # the whole read: a KeyError from a missing column must not be
	            # reported as a missing file.
	            if csv_path not in members:
	                print(f" Warning: {csv_path} not found in zip")
	                continue
	            with zf.open(csv_path) as f:
	                # newline="" lets the csv module handle line endings itself,
	                # including newlines embedded in quoted fields.
	                text = io.TextIOWrapper(f, encoding="utf-8", newline="")
	                reader = csv.DictReader(text)
	                if "Dialogue_ID" not in (reader.fieldnames or ()):
	                    print(f" Warning: {csv_path} has no 'Dialogue_ID' column, skipping")
	                    continue
	                for row in reader:
	                    key = f"{split}_{row['Dialogue_ID']}"
	                    dialogues.setdefault(key, []).append(row)
	    return dialogues


	def find_mp4_path(split: str, dia_id: str, utt_id: str, available_files: set) -> str \| None:
	"""Find MP4 file path for a specific utterance."""
	patterns = [
	f"MELD.Raw/MELD.Raw/{split}/{split}_splits/dia{dia_id}_utt{utt_id}.mp4",
	f"MELD.Raw/MELD.Raw/{split}/{split}_splits_complete/dia{dia_id}_utt{utt_id}.mp4",
	f"MELD.Raw/MELD.Raw/{split}/output_repeated_splits_{split}/final_videos_{split}dia{dia_id}_utt{utt_id}.mp4",
	]
	for p in patterns:
	if p in available_files:
	return p
	return None


	def get_mp4_list_from_zip(zip_path: Path) -> set:
	    """Collect the name of every archive member that ends in ``.mp4``."""
	    import zipfile

	    mp4_names = set()
	    with zipfile.ZipFile(zip_path, "r") as archive:
	        for member in archive.namelist():
	            if member.endswith(".mp4"):
	                mp4_names.add(member)
	    return mp4_names


	def extract_and_concat_wav(
	    zip_path: Path, mp4_paths: list[str], output_wav: Path, sample_rate: int = 16000
	) -> float:
	    """Extract audio from MP4s in zip and concatenate into single WAV.

	    Each listed member is copied out of the archive into a temporary
	    directory and converted with ffmpeg to mono 16-bit PCM at
	    ``sample_rate``. The converted parts are then joined, in the given
	    order, with ffmpeg's concat demuxer. A clip that ffmpeg cannot convert
	    is reported with a warning and left out of the result.

	    Args:
	        zip_path: Archive that holds the MP4 members.
	        mp4_paths: Member names to extract, in playback order.
	        output_wav: Destination file; its parent directory is created.
	        sample_rate: Output sample rate in Hz.

	    Returns:
	        Duration of the written WAV in seconds, or 0.0 when no clip could
	        be converted, the concatenation failed, or the duration could not
	        be determined.

	    Raises:
	        KeyError: If a name in ``mp4_paths`` is not a member of the archive.
	        FileNotFoundError: If the ``ffmpeg`` executable cannot be started.
	    """
	    import shutil
	    import wave
	    import zipfile

	    audio_args = ["-ar", str(sample_rate), "-ac", "1", "-acodec", "pcm_s16le"]
	    # ffmpeg echoes stream metadata on stderr, which is not guaranteed to be
	    # valid in the locale encoding; never let a decode error abort the build.
	    run_opts = {"capture_output": True, "text": True, "errors": "replace"}

	    with tempfile.TemporaryDirectory() as tmp:
	        workdir = Path(tmp)
	        wav_parts = []

	        # Extract each MP4 and convert to WAV
	        with zipfile.ZipFile(zip_path, "r") as zf:
	            for i, mp4_path in enumerate(mp4_paths):
	                mp4_local = workdir / f"part_{i:03d}.mp4"
	                wav_local = workdir / f"part_{i:03d}.wav"

	                # Stream the member to disk instead of reading it whole.
	                with zf.open(mp4_path) as src, open(mp4_local, "wb") as dst:
	                    shutil.copyfileobj(src, dst)

	                result = subprocess.run(
	                    ["ffmpeg", "-y", "-i", str(mp4_local), *audio_args, str(wav_local)],
	                    **run_opts,
	                )
	                # The source clip is no longer needed once conversion ran.
	                mp4_local.unlink(missing_ok=True)
	                if result.returncode != 0:
	                    print(f" Warning: ffmpeg failed for {mp4_path}: {result.stderr[:200]}")
	                    continue

	                if wav_local.exists() and wav_local.stat().st_size > 0:
	                    wav_parts.append(wav_local)

	        if not wav_parts:
	            return 0.0

	        # Concatenate WAVs using ffmpeg concat
	        list_file = workdir / "concat_list.txt"
	        with open(list_file, "w", encoding="utf-8") as f:
	            for wp in wav_parts:
	                # Inside the concat list a single quote must be written as '\''.
	                escaped = str(wp).replace("'", "'\\''")
	                f.write(f"file '{escaped}'\n")

	        output_wav.parent.mkdir(parents=True, exist_ok=True)
	        result = subprocess.run(
	            [
	                "ffmpeg", "-y", "-f", "concat", "-safe", "0",
	                "-i", str(list_file),
	                *audio_args,
	                str(output_wav),
	            ],
	            **run_opts,
	        )
	        if result.returncode != 0:
	            print(f" Concat failed: {result.stderr[:300]}")
	            return 0.0

	    # Get duration: ask ffprobe first, as before.
	    try:
	        probe = subprocess.run(
	            [
	                "ffprobe", "-v", "quiet", "-show_entries", "format=duration",
	                "-of", "default=noprint_wrappers=1:nokey=1", str(output_wav),
	            ],
	            **run_opts,
	        )
	        return float(probe.stdout.strip())
	    except (OSError, ValueError):
	        pass

	    # ffprobe was unavailable or printed nothing usable; the output is plain
	    # PCM, so its own header is enough to compute the length.
	    try:
	        with wave.open(str(output_wav), "rb") as wf:
	            return wf.getnframes() / float(wf.getframerate())
	    except (wave.Error, OSError, EOFError, ZeroDivisionError):
	        return 0.0


	def main():
	    """Build every configured MELD test set and write its metadata.

	    Steps:
	      1. Load the per-utterance CSV rows from ZIP_PATH.
	      2. Index the MP4 members of the archive.
	      3. For each TEST_SETS entry, locate its clips and build one
	         concatenated WAV in OUTPUT_DIR. A set is skipped when its dialogue
	         is unknown, none of its clips are found, or the WAV cannot be built.
	      4. Write ground_truth.json and README.md into OUTPUT_DIR.

	    The ground truth lists only utterances whose clip was found in the
	    archive, so the labels line up with what was sent for concatenation.
	    Exits with status 1 when ZIP_PATH does not exist.
	    """
	    print("=" * 60)
	    print(" MELD English Test Set Builder")
	    print("=" * 60)

	    if not ZIP_PATH.exists():
	        print(f"Error: {ZIP_PATH} not found")
	        sys.exit(1)

	    # 1. Load CSV data
	    print("\n[1/4] Loading CSV data from zip...")
	    dialogues = load_csv_from_zip(ZIP_PATH)
	    print(f" Loaded {len(dialogues)} dialogues")

	    # 2. Get available MP4 files
	    print("[2/4] Scanning MP4 files in zip...")
	    mp4_files = get_mp4_list_from_zip(ZIP_PATH)
	    print(f" Found {len(mp4_files)} MP4 files")

	    # 3. Process each test set
	    print("[3/4] Building test sets...\n")
	    ground_truth = {}
	    summary_lines = []

	    for ts in TEST_SETS:
	        tag = ts["tag"]
	        key = f"{ts['split']}_{ts['dia_id']}"
	        utts = dialogues.get(key, [])

	        if not utts:
	            print(f" ❌ {tag}: dialogue {key} not found")
	            continue

	        print(f" 📦 {tag} — {ts['desc']}")
	        print(f" {len(utts)} utterances", end="")

	        # Find MP4 paths, keeping each utterance paired with its clip so the
	        # ground truth can be limited to utterances that have audio.
	        located = [
	            (u, find_mp4_path(ts["split"], ts["dia_id"], u["Utterance_ID"], mp4_files))
	            for u in utts
	        ]
	        included = [u for u, p in located if p]
	        mp4_paths = [p for _, p in located if p]

	        print(f", {len(mp4_paths)}/{len(utts)} MP4s found")

	        if not mp4_paths:
	            print(" ❌ No MP4 files found, skipping")
	            continue

	        # Extract and concatenate
	        output_wav = OUTPUT_DIR / f"{tag}.wav"
	        duration = extract_and_concat_wav(ZIP_PATH, mp4_paths, output_wav, SAMPLE_RATE)
	        if duration <= 0:
	            # 0.0 is the helper's failure value; do not report success or
	            # publish ground truth for audio that was not produced.
	            print(f" ❌ {output_wav.name} could not be built, skipping")
	            continue
	        print(f" ✅ {output_wav.name} — {duration:.1f}s")

	        # Build ground truth
	        emo_counts = Counter(u["Emotion"] for u in included)
	        ground_truth[tag] = {
	            "description": ts["desc"],
	            "scenario": ts["scenario"],
	            "primary_emotion": ts["primary_emotion"],
	            "source": f"MELD Friends S{utts[0]['Season']}E{utts[0]['Episode']} Dialogue {ts['dia_id']}",
	            "duration_sec": round(duration, 1),
	            "emotion_distribution": dict(emo_counts),
	            "total_utterances": len(included),
	            "utterances": [
	                {
	                    "speaker": u["Speaker"],
	                    "emotion": u["Emotion"],
	                    "sentiment": u["Sentiment"],
	                    "text": u["Utterance"],
	                }
	                for u in included
	            ],
	        }

	        summary_lines.append(
	            f"| {tag} | {ts['scenario'][:40]} | {ts['primary_emotion']} | {duration:.1f}s | {len(included)} utts | {dict(emo_counts)} |"
	        )

	    # 4. Save ground truth + README
	    print("\n[4/4] Saving metadata...")

	    # The directory is otherwise only created when a WAV is written, so it
	    # may not exist yet if every test set was skipped.
	    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

	    gt_path = OUTPUT_DIR / "ground_truth.json"
	    with open(gt_path, "w", encoding="utf-8") as f:
	        json.dump(ground_truth, f, indent=2, ensure_ascii=False)
	    print(f" ✅ {gt_path}")

	    # Emotion alignment check
	    our_labels = {"neutral", "joy", "sadness", "anger", "surprise", "fear", "disgust"}
	    meld_labels = set()
	    for gt in ground_truth.values():
	        meld_labels.update(gt["emotion_distribution"])
	    unexpected = meld_labels - our_labels
	    if unexpected:
	        print(f" Warning: labels outside the pipeline label set: {sorted(unexpected)}")

	    summary_table = "\n".join(summary_lines)
	    readme_content = f"""# MELD English Test Sets

	## Emotion Label Alignment

	| UsTwo Pipeline (EN) | MELD Label | Match |
	|---|---|---|
	| neutral | neutral | ✅ Exact |
	| joy | joy | ✅ Exact |
	| sadness | sadness | ✅ Exact |
	| anger | anger | ✅ Exact |
	| surprise | surprise | ✅ Exact |
	| fear | fear | ✅ Exact |
	| disgust | disgust | ✅ Exact |

	7/7 labels match exactly. No mapping needed.

	## Test Sets

	| File | Scenario | Primary Emotion | Duration | Utterances | Emotion Distribution |
	|---|---|---|---|---|---|
	{summary_table}

	## Source
	- Dataset: MELD (Multimodal EmotionLines Dataset)
	- Source: Friends TV series
	- Paper: Poria et al., ACL 2019
	- Each WAV is a full dialogue concatenated from per-utterance MP4 clips
	- Audio: 16kHz mono PCM (matches pipeline input format)

	## Usage
	```bash
	# Run pipeline on a single test set
	python scripts/run_pipeline.py data/meld_test/01_angry_fight.wav

	# Evaluate all test sets
	python scripts/evaluate_meld_test.py
	```
	"""
	    readme_path = OUTPUT_DIR / "README.md"
	    with open(readme_path, "w", encoding="utf-8") as f:
	        f.write(readme_content)
	    print(f" ✅ {readme_path}")

	    # Summary
	    print("\n" + "=" * 60)
	    print(" DONE")
	    print("=" * 60)
	    total_files = sum(1 for _ in OUTPUT_DIR.glob("*.wav"))
	    print(f" {total_files} WAV files in {OUTPUT_DIR}")
	    print(f" Ground truth: {gt_path}")
	    print(f" Emotion alignment: {len(our_labels & meld_labels)}/{len(our_labels)} exact match")


	# Script entry point: run the builder only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	main()