Upload folder using huggingface_hub

3c1eb61 verified about 1 month ago

10.3 kB

	"""
	Download and prepare turn-taking evaluation datasets.

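	Datasets used:
	1. Switchboard (HuggingFace) - Two-speaker telephone conversations with timestamps
	   (this module loads the transcripts only and estimates turn timing from word counts)
	2. HCRC Map Task (Edinburgh) - Task-oriented dialogues with turn annotations
	   (a data directory is reserved for it; no downloader is defined in this module)
	3. Synthetic - Generated two-speaker tone conversations with exact turn boundaries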

	References:
	- Godfrey, J.J., Holliman, E.C., & McDaniel, J. (1992). SWITCHBOARD: Telephone speech
	corpus for research and development. ICASSP-92.
	- Anderson, A.H., et al. (1991). The HCRC Map Task Corpus. Language and Speech, 34(4).
	"""

	from __future__ import annotations

	import argparse
	import json
	import logging
	import os
	import shutil
	import urllib.request
	import zipfile
	from dataclasses import dataclass, field
	from pathlib import Path

	import numpy as np
	import soundfile as sf

	# Module-level logger, named after this module.
	log = logging.getLogger(__name__)

	# Everything this module reads or writes lives under a "data" directory
	# located next to this source file.
	DATA_DIR = Path(__file__).parent.joinpath("data")
	SWITCHBOARD_DIR = DATA_DIR.joinpath("switchboard")
	MAPTASK_DIR = DATA_DIR.joinpath("maptask")
	ANNOTATIONS_DIR = DATA_DIR.joinpath("annotations")


	@dataclass
	class TurnSegment:
	    """One stretch of speech attributed to a single speaker.

	    ``start`` and ``end`` are expressed in seconds; ``text`` holds the
	    transcript of the turn and defaults to the empty string.
	    """

	    speaker: str
	    start: float  # seconds
	    end: float  # seconds
	    text: str = ""

	    @property
	    def duration(self) -> float:
	        """Length of the turn in seconds (``end`` minus ``start``)."""
	        length = self.end - self.start
	        return length


	@dataclass
	class Conversation:
	    """A conversation: its audio reference, its turns, and derived labels."""

	    conv_id: str
	    audio_path: str
	    sample_rate: int
	    # Total duration in seconds.
	    duration: float
	    turns: list[TurnSegment] = field(default_factory=list)

	    # Derived labels.
	    # Timestamps of speaker changes.
	    turn_shifts: list[float] = field(default_factory=list)
	    # Timestamps where the same speaker continues after a pause.
	    holds: list[float] = field(default_factory=list)


	def download_switchboard_from_hf(max_conversations: int = 200) -> list[Conversation]:
	    """Download a Switchboard subset from HuggingFace as text-only conversations.

	    Streams the ``train`` split of ``hhoangphuoc/switchboard`` (falling back
	    to ``swda`` when that load fails), groups consecutive samples sharing a
	    conversation id, and converts each group into a ``Conversation`` with
	    estimated timing via ``_build_conversation_from_text``. The collected
	    conversations are also written out through ``_save_annotations``.

	    Args:
	        max_conversations: Upper bound on the number of conversations to
	            collect. Defaults to 200 to keep the benchmark feasible.

	    Returns:
	        The collected conversations, in stream order.

	    Raises:
	        ValueError: If ``max_conversations`` is smaller than 1.
	    """
	    from datasets import load_dataset

	    if max_conversations < 1:
	        raise ValueError(f"max_conversations must be >= 1, got {max_conversations}")

	    log.info("Downloading Switchboard from HuggingFace...")
	    SWITCHBOARD_DIR.mkdir(parents=True, exist_ok=True)

	    # Use the Switchboard subset available on HF
	    try:
	        ds = load_dataset("hhoangphuoc/switchboard", split="train", streaming=True)
	    except Exception:
	        # Deliberately broad: any failure of the primary source triggers the
	        # fallback. The original error is kept in the log for diagnosis.
	        log.warning("HF Switchboard not available, trying alternative...", exc_info=True)
	        ds = load_dataset("swda", split="train", streaming=True)

	    conversations: list[Conversation] = []
	    current_conv_id: str | None = None
	    current_turns: list[TurnSegment] = []
	    limit_reached = False

	    # NOTE(review): samples are assumed to be dict-like (they must support
	    # ``.get``) -- confirm against the dataset schema.
	    for sample in ds:
	        # When neither id key is present, the number of conversations built
	        # so far is used as the id.
	        conv_id = str(sample.get("conversation_no", sample.get("conv_id", len(conversations))))

	        if conv_id != current_conv_id:
	            # A new conversation starts: finalize the one accumulated so far.
	            if current_conv_id is not None and current_turns:
	                conv = _build_conversation_from_text(current_conv_id, current_turns)
	                if conv is not None:
	                    conversations.append(conv)
	                    if len(conversations) >= max_conversations:
	                        limit_reached = True
	                        break

	            current_conv_id = conv_id
	            current_turns = []

	        speaker = sample.get("caller", sample.get("speaker", "A"))
	        raw_text = sample.get("text", sample.get("utterance", ""))
	        # Strip before testing so whitespace-only utterances do not become
	        # empty turns.
	        text = raw_text.strip() if raw_text else ""
	        if text:
	            current_turns.append(TurnSegment(
	                speaker=str(speaker),
	                start=0.0,  # Timing is estimated when the conversation is built
	                end=0.0,
	                text=text,
	            ))

	    # The stream ended before the limit was hit: the conversation still being
	    # accumulated has not been finalized yet, so do it here.
	    if not limit_reached and current_conv_id is not None and current_turns:
	        conv = _build_conversation_from_text(current_conv_id, current_turns)
	        if conv is not None:
	            conversations.append(conv)

	    # Save annotations
	    _save_annotations(conversations, "switchboard")
	    log.info("Downloaded %d Switchboard conversations", len(conversations))
	    return conversations


	def download_candor_sample() -> list[Conversation]:
	    """
	    Placeholder for the CANDOR corpus, which is not fetched automatically.

	    Logs where the corpus can be obtained manually and returns an empty list.

	    Reference:
	    - Reece, A.G., et al. (2023). The CANDOR corpus: Insights from a large
	      multi-modal dataset of naturalistic conversation. Science Advances, 9(13).
	    """
	    pointers = (
	        "CANDOR corpus requires manual download from https://cadl.humlab.lu.se/candor/",
	        "See: https://www.science.org/doi/10.1126/sciadv.adf3197",
	    )
	    for message in pointers:
	        log.info(message)
	    return []


	def generate_synthetic_dataset(
	    n_conversations: int = 100,
	    min_turns: int = 10,
	    max_turns: int = 40,
	    sample_rate: int = 16000,
	) -> list[Conversation]:
	    """
	    Generate synthetic two-speaker conversations with ground-truth turn annotations.

	    This provides a controlled baseline where we know exact turn boundaries.
	    Speakers "A" and "B" strictly alternate; each turn lasts 0.5-5.0 s and is
	    rendered as a sine tone (200 Hz for A, 350 Hz for B, amplitude 0.3) on top
	    of Gaussian noise (std 0.01). Consecutive turns are separated by a gap
	    drawn from [-0.3, 1.5) s: a negative gap makes the next turn start before
	    the previous one has ended, and the overlapping audio is mixed additively.

	    One WAV file per conversation is written to ``DATA_DIR / "synthetic"`` and
	    the annotations are saved through ``_save_annotations``. The random
	    generator is seeded with 42, so repeated runs produce the same data.

	    Because speakers strictly alternate, every boundary is a turn shift and
	    ``holds`` stays empty for these conversations.

	    Args:
	        n_conversations: Number of conversations to generate.
	        min_turns: Minimum number of turns per conversation (inclusive).
	        max_turns: Maximum number of turns per conversation (inclusive).
	        sample_rate: Sample rate of the generated audio, in Hz.

	    Returns:
	        The generated conversations.

	    Raises:
	        ValueError: If ``min_turns`` is below 1 or ``max_turns`` is below
	            ``min_turns``.
	    """
	    # A conversation needs at least one turn: the total duration is read
	    # from the last turn below.
	    if min_turns < 1 or max_turns < min_turns:
	        raise ValueError(
	            f"need 1 <= min_turns <= max_turns, got min_turns={min_turns}, max_turns={max_turns}"
	        )

	    log.info("Generating %d synthetic conversations...", n_conversations)
	    synth_dir = DATA_DIR / "synthetic"
	    synth_dir.mkdir(parents=True, exist_ok=True)

	    conversations: list[Conversation] = []
	    rng = np.random.default_rng(42)
	    speakers = ("A", "B")
	    tone_hz = {"A": 200.0, "B": 350.0}

	    for i in range(n_conversations):
	        n_turns = int(rng.integers(min_turns, max_turns + 1))
	        turns: list[TurnSegment] = []
	        t = 0.0  # end time of the previous turn

	        for j in range(n_turns):
	            # Turn duration: 0.5 - 5.0 seconds
	            duration = rng.uniform(0.5, 5.0)
	            # Gap between turns: -0.3 (overlap) to 1.5 seconds
	            gap = rng.uniform(-0.3, 1.5) if j > 0 else 0.0

	            # Only guard against a negative timestamp. Clamping to ``t``
	            # instead would turn every negative gap into a zero gap and
	            # overlaps would never be generated.
	            start = max(t + gap, 0.0)
	            end = start + duration

	            turns.append(TurnSegment(
	                speaker=speakers[j % 2],
	                start=round(float(start), 3),
	                end=round(float(end), 3),
	                text=f"[synthetic turn {j}]",
	            ))
	            t = end

	        # Every turn is at least 0.5 s long and overlaps by at most 0.3 s, so
	        # end times strictly increase and the last turn ends the conversation.
	        total_duration = turns[-1].end

	        # Generate audio: sine waves at different frequencies per speaker
	        n_samples = int(total_duration * sample_rate)
	        audio = np.zeros(n_samples, dtype=np.float32)

	        for turn in turns:
	            s = int(turn.start * sample_rate)
	            e = min(int(turn.end * sample_rate), n_samples)
	            t_arr = np.arange(e - s) / sample_rate
	            tone = 0.3 * np.sin(2 * np.pi * tone_hz[turn.speaker] * t_arr)
	            # Mix rather than overwrite so that overlapping regions contain
	            # both speakers (peak 0.6, still below full scale).
	            audio[s:e] += tone.astype(np.float32)

	        # Add noise
	        audio += rng.normal(0, 0.01, n_samples).astype(np.float32)

	        audio_path = synth_dir / f"synth_{i:04d}.wav"
	        sf.write(str(audio_path), audio, sample_rate)

	        # Compute turn shifts and holds from each pair of consecutive turns.
	        turn_shifts: list[float] = []
	        holds: list[float] = []
	        for previous, current in zip(turns, turns[1:]):
	            if current.speaker != previous.speaker:
	                turn_shifts.append(current.start)
	            else:
	                holds.append(current.start)

	        conversations.append(Conversation(
	            conv_id=f"synth_{i:04d}",
	            audio_path=str(audio_path),
	            sample_rate=sample_rate,
	            duration=total_duration,
	            turns=turns,
	            turn_shifts=turn_shifts,
	            holds=holds,
	        ))

	    _save_annotations(conversations, "synthetic")
	    log.info("Generated %d synthetic conversations (%.1f hours)",
	             len(conversations), sum(c.duration for c in conversations) / 3600)
	    return conversations


	def _build_conversation_from_text(conv_id: str, turns: list[TurnSegment]) -> Conversation \| None:
	"""Build a Conversation from text-only turns by estimating timing."""
	if len(turns) < 3:
	return None

	# Estimate timing: ~150ms per word + 200ms gap
	t = 0.0
	for i, turn in enumerate(turns):
	words = len(turn.text.split())
	duration = max(0.5, words * 0.15)
	gap = 0.2 if i > 0 else 0.0
	turn.start = round(t + gap, 3)
	turn.end = round(turn.start + duration, 3)
	t = turn.end

	turn_shifts = []
	holds = []
	for k in range(1, len(turns)):
	if turns[k].speaker != turns[k - 1].speaker:
	turn_shifts.append(turns[k].start)
	else:
	holds.append(turns[k].start)

	return Conversation(
	conv_id=conv_id,
	audio_path="", # text-only
	sample_rate=16000,
	duration=turns[-1].end,
	turns=turns,
	turn_shifts=turn_shifts,
	holds=holds,
	)


	def _save_annotations(conversations: list[Conversation], name: str) -> None:
	    """Dump conversation annotations to ``<name>_annotations.json``.

	    The file is written inside ``ANNOTATIONS_DIR`` (created if missing) as a
	    JSON list with one object per conversation, holding its metadata, turn
	    counts, the turns themselves, and the derived turn-shift/hold timestamps.
	    """
	    ANNOTATIONS_DIR.mkdir(parents=True, exist_ok=True)

	    records = [
	        {
	            "conv_id": conv.conv_id,
	            "audio_path": conv.audio_path,
	            "sample_rate": conv.sample_rate,
	            "duration": conv.duration,
	            "n_turns": len(conv.turns),
	            "n_turn_shifts": len(conv.turn_shifts),
	            "n_holds": len(conv.holds),
	            "turns": [
	                {
	                    "speaker": seg.speaker,
	                    "start": seg.start,
	                    "end": seg.end,
	                    "text": seg.text,
	                }
	                for seg in conv.turns
	            ],
	            "turn_shifts": conv.turn_shifts,
	            "holds": conv.holds,
	        }
	        for conv in conversations
	    ]

	    target = ANNOTATIONS_DIR / f"{name}_annotations.json"
	    with target.open("w") as handle:
	        json.dump(records, handle, indent=2)
	    log.info("Saved %d annotations to %s", len(records), target)


	def load_annotations(name: str) -> list[Conversation]:
	    """Read ``<name>_annotations.json`` from ``ANNOTATIONS_DIR``.

	    Returns one ``Conversation`` per stored entry, with its turns rebuilt as
	    ``TurnSegment`` objects.

	    Raises:
	        FileNotFoundError: If the annotation file does not exist.
	    """
	    path = ANNOTATIONS_DIR / f"{name}_annotations.json"
	    if not path.exists():
	        raise FileNotFoundError(f"Annotations not found: {path}")

	    with path.open() as handle:
	        payload = json.load(handle)

	    def revive(entry):
	        # Turns are rebuilt first, then the conversation wrapping them.
	        segments = [TurnSegment(**raw_turn) for raw_turn in entry["turns"]]
	        return Conversation(
	            conv_id=entry["conv_id"],
	            audio_path=entry["audio_path"],
	            sample_rate=entry["sample_rate"],
	            duration=entry["duration"],
	            turns=segments,
	            turn_shifts=entry["turn_shifts"],
	            holds=entry["holds"],
	        )

	    return [revive(entry) for entry in payload]


	def _main() -> None:
	    """Parse the command line and build the requested dataset(s)."""
	    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

	    cli = argparse.ArgumentParser(description="Download turn-taking datasets")
	    cli.add_argument("--dataset", choices=["switchboard", "synthetic", "all"], default="all")
	    cli.add_argument("--n-synthetic", type=int, default=100)
	    options = cli.parse_args()

	    # "all" selects every dataset; synthetic data is produced first.
	    wants_everything = options.dataset == "all"

	    if wants_everything or options.dataset == "synthetic":
	        generate_synthetic_dataset(n_conversations=options.n_synthetic)

	    if wants_everything or options.dataset == "switchboard":
	        download_switchboard_from_hf()


	if __name__ == "__main__":
	    _main()