| """ |
| Download and prepare turn-taking evaluation datasets. |
| |
| Datasets used: |
| 1. Switchboard (HuggingFace) - Two-speaker telephone conversations with timestamps |
| 2. HCRC Map Task (Edinburgh) - Task-oriented dialogues with turn annotations |
| |
| References: |
| - Godfrey, J.J., Holliman, E.C., & McDaniel, J. (1992). SWITCHBOARD: Telephone speech |
| corpus for research and development. ICASSP-92. |
| - Anderson, A.H., et al. (1991). The HCRC Map Task Corpus. Language and Speech, 34(4). |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import logging |
| import os |
| import shutil |
| import urllib.request |
| import zipfile |
| from dataclasses import dataclass, field |
| from pathlib import Path |
|
|
| import numpy as np |
| import soundfile as sf |
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)


# All dataset artifacts live under a "data" directory next to this file.
DATA_DIR = Path(__file__).parent / "data"
SWITCHBOARD_DIR = DATA_DIR / "switchboard"  # created by download_switchboard_from_hf()
MAPTASK_DIR = DATA_DIR / "maptask"  # not referenced by any function visible in this file
ANNOTATIONS_DIR = DATA_DIR / "annotations"  # JSON files written by _save_annotations()
|
|
|
|
@dataclass
class TurnSegment:
    """One contiguous stretch of speech attributed to a single speaker.

    ``start`` and ``end`` are offsets from the beginning of the conversation;
    the rest of this module treats them as seconds.
    """

    speaker: str
    start: float
    end: float
    text: str = ""

    @property
    def duration(self) -> float:
        """Length of the turn, i.e. ``end`` minus ``start``."""
        begin, finish = self.start, self.end
        return finish - begin
|
|
|
|
@dataclass
class Conversation:
    """Turn-taking annotations for one conversation plus its audio metadata."""

    conv_id: str
    audio_path: str  # empty string when the source has no audio file
    sample_rate: int
    duration: float
    turns: list[TurnSegment] = field(default_factory=list)

    # Start times of turns, split by whether the speaker changed relative to
    # the previous turn (turn_shifts) or stayed the same (holds).
    turn_shifts: list[float] = field(default_factory=list)
    holds: list[float] = field(default_factory=list)
|
|
|
|
def download_switchboard_from_hf() -> list[Conversation]:
    """Stream Switchboard transcripts from HuggingFace and build conversations.

    Tries the ``hhoangphuoc/switchboard`` dataset first and falls back to
    ``swda`` if loading it fails. Consecutive samples that share a conversation
    id are grouped into one conversation, and at most 200 conversations are
    kept. Only the text is used: turn timings are estimated by
    ``_build_conversation_from_text`` and ``audio_path`` is left empty.

    Returns:
        The conversations that were built. They are also written to the
        annotations directory under the name ``"switchboard"``.
    """
    # Function-scope import: `datasets` is only required when this function runs.
    from datasets import load_dataset

    log.info("Downloading Switchboard from HuggingFace...")
    SWITCHBOARD_DIR.mkdir(parents=True, exist_ok=True)

    try:
        ds = load_dataset("hhoangphuoc/switchboard", split="train", streaming=True)
    except Exception as exc:  # deliberate best-effort: any failure triggers the fallback
        log.warning("HF Switchboard not available, trying alternative...")
        log.debug("Primary Switchboard load failed: %r", exc)
        ds = load_dataset("swda", split="train", streaming=True)

    conversations: list[Conversation] = []
    max_conversations = 200

    current_conv_id = None
    current_turns: list[TurnSegment] = []

    for sample in ds:
        # Samples without any id key fall back to the running conversation count.
        conv_id = str(
            sample.get("conversation_no", sample.get("conv_id", len(conversations)))
        )

        if conv_id != current_conv_id:
            # The id changed: the previous conversation is complete, so emit it.
            if current_conv_id is not None and current_turns:
                conv = _build_conversation_from_text(current_conv_id, current_turns)
                if conv is not None:
                    conversations.append(conv)
                    if len(conversations) >= max_conversations:
                        break

            current_conv_id = conv_id
            current_turns = []

        speaker = sample.get("caller", sample.get("speaker", "A"))
        text = sample.get("text", sample.get("utterance", ""))
        # Strip before testing so whitespace-only utterances do not become empty turns.
        text = text.strip() if text else ""
        if text:
            current_turns.append(TurnSegment(
                speaker=str(speaker),
                start=0.0,
                end=0.0,
                text=text,
            ))
    else:
        # The stream ended without reaching the cap. The last conversation never
        # sees an id change, so it must be flushed here or it would be lost.
        if current_conv_id is not None and current_turns:
            conv = _build_conversation_from_text(current_conv_id, current_turns)
            if conv is not None:
                conversations.append(conv)

    _save_annotations(conversations, "switchboard")
    log.info("Downloaded %d Switchboard conversations", len(conversations))
    return conversations
|
|
|
|
def download_candor_sample() -> list[Conversation]:
    """
    Placeholder for the CANDOR corpus, which this module does not fetch.

    Logs where the corpus and its paper can be found, then returns an empty
    list.

    Reference:
    - Reece, A.G., et al. (2023). The CANDOR corpus: Insights from a large
      multi-modal dataset of naturalistic conversation. Science Advances, 9(13).
    """
    pointers = (
        "CANDOR corpus requires manual download from https://cadl.humlab.lu.se/candor/",
        "See: https://www.science.org/doi/10.1126/sciadv.adf3197",
    )
    for message in pointers:
        log.info(message)
    return []
|
|
|
|
def generate_synthetic_dataset(
    n_conversations: int = 100,
    min_turns: int = 10,
    max_turns: int = 40,
    sample_rate: int = 16000,
    seed: int = 42,
) -> list[Conversation]:
    """
    Generate synthetic two-speaker conversations with ground-truth turn annotations.

    This provides a controlled baseline where we know exact turn boundaries.
    Speakers "A" and "B" strictly alternate. Each turn lasts 0.5-5.0 s and is
    rendered as a sine tone (200 Hz for A, 350 Hz for B) with low-level
    Gaussian noise added over the whole signal. The gap before a turn is drawn
    from [-0.3, 1.5) s, but negative draws are clamped to zero, so turns may be
    back-to-back but never overlap.

    One WAV file per conversation is written to ``DATA_DIR / "synthetic"`` and
    the annotations are saved under the name ``"synthetic"``.

    Args:
        n_conversations: Number of conversations to generate.
        min_turns: Minimum number of turns per conversation (inclusive, >= 1).
        max_turns: Maximum number of turns per conversation (inclusive).
        sample_rate: Sample rate of the generated audio in Hz.
        seed: Seed for the random generator. The default of 42 matches the
            value that used to be hard-coded, so default output is unchanged.

    Returns:
        The generated conversations, in generation order.

    Raises:
        ValueError: If ``min_turns`` is below 1 or exceeds ``max_turns``.
    """
    # A zero-turn conversation has no last turn to take the duration from.
    if min_turns < 1:
        raise ValueError(f"min_turns must be at least 1, got {min_turns}")
    if max_turns < min_turns:
        raise ValueError(
            f"max_turns ({max_turns}) must not be smaller than min_turns ({min_turns})"
        )

    log.info("Generating %d synthetic conversations...", n_conversations)
    synth_dir = DATA_DIR / "synthetic"
    synth_dir.mkdir(parents=True, exist_ok=True)

    conversations = []
    rng = np.random.default_rng(seed)
    speakers = ["A", "B"]

    for i in range(n_conversations):
        # NOTE: the order of RNG draws below determines the generated data;
        # keep it stable so a given seed keeps producing the same dataset.
        n_turns = int(rng.integers(min_turns, max_turns + 1))
        turns = []
        t = 0.0  # unrounded end time of the previous turn

        for j in range(n_turns):
            speaker = speakers[j % 2]
            duration = rng.uniform(0.5, 5.0)
            gap = rng.uniform(-0.3, 1.5) if j > 0 else 0.0

            # Negative gaps are clamped: a turn never starts before the
            # previous one has ended.
            start = max(t + gap, t)
            end = start + duration

            turns.append(TurnSegment(
                speaker=speaker,
                start=round(start, 3),
                end=round(end, 3),
                text=f"[synthetic turn {j}]",
            ))
            t = end

        total_duration = turns[-1].end
        n_samples = int(total_duration * sample_rate)
        audio = np.zeros(n_samples, dtype=np.float32)

        # Render each turn as a constant tone whose pitch identifies the speaker.
        for turn in turns:
            freq = 200.0 if turn.speaker == "A" else 350.0
            s = int(turn.start * sample_rate)
            e = min(int(turn.end * sample_rate), n_samples)
            t_arr = np.arange(e - s) / sample_rate
            audio[s:e] = 0.3 * np.sin(2 * np.pi * freq * t_arr).astype(np.float32)

        # Background noise over the entire signal, including the silent gaps.
        audio += rng.normal(0, 0.01, n_samples).astype(np.float32)

        audio_path = synth_dir / f"synth_{i:04d}.wav"
        sf.write(str(audio_path), audio, sample_rate)

        # Classify every turn after the first by whether the speaker changed.
        # With strictly alternating speakers, `holds` stays empty here.
        turn_shifts = []
        holds = []
        for prev_turn, next_turn in zip(turns, turns[1:]):
            if next_turn.speaker != prev_turn.speaker:
                turn_shifts.append(next_turn.start)
            else:
                holds.append(next_turn.start)

        conversations.append(Conversation(
            conv_id=f"synth_{i:04d}",
            audio_path=str(audio_path),
            sample_rate=sample_rate,
            duration=total_duration,
            turns=turns,
            turn_shifts=turn_shifts,
            holds=holds,
        ))

    _save_annotations(conversations, "synthetic")
    log.info("Generated %d synthetic conversations (%.1f hours)",
             len(conversations), sum(c.duration for c in conversations) / 3600)
    return conversations
|
|
|
|
| def _build_conversation_from_text(conv_id: str, turns: list[TurnSegment]) -> Conversation | None: |
| """Build a Conversation from text-only turns by estimating timing.""" |
| if len(turns) < 3: |
| return None |
|
|
| |
| t = 0.0 |
| for i, turn in enumerate(turns): |
| words = len(turn.text.split()) |
| duration = max(0.5, words * 0.15) |
| gap = 0.2 if i > 0 else 0.0 |
| turn.start = round(t + gap, 3) |
| turn.end = round(turn.start + duration, 3) |
| t = turn.end |
|
|
| turn_shifts = [] |
| holds = [] |
| for k in range(1, len(turns)): |
| if turns[k].speaker != turns[k - 1].speaker: |
| turn_shifts.append(turns[k].start) |
| else: |
| holds.append(turns[k].start) |
|
|
| return Conversation( |
| conv_id=conv_id, |
| audio_path="", |
| sample_rate=16000, |
| duration=turns[-1].end, |
| turns=turns, |
| turn_shifts=turn_shifts, |
| holds=holds, |
| ) |
|
|
|
|
def _save_annotations(conversations: list[Conversation], name: str) -> None:
    """Write the conversations to ``ANNOTATIONS_DIR / "<name>_annotations.json"``.

    The directory is created if needed. Each conversation becomes one JSON
    object holding its metadata, summary counts, the full turn list and the
    turn-shift / hold timestamps.
    """
    ANNOTATIONS_DIR.mkdir(parents=True, exist_ok=True)

    def as_record(conv):
        # Key order is kept stable so the written file layout does not change.
        turn_rows = [
            {"speaker": seg.speaker, "start": seg.start, "end": seg.end, "text": seg.text}
            for seg in conv.turns
        ]
        return {
            "conv_id": conv.conv_id,
            "audio_path": conv.audio_path,
            "sample_rate": conv.sample_rate,
            "duration": conv.duration,
            "n_turns": len(conv.turns),
            "n_turn_shifts": len(conv.turn_shifts),
            "n_holds": len(conv.holds),
            "turns": turn_rows,
            "turn_shifts": conv.turn_shifts,
            "holds": conv.holds,
        }

    records = [as_record(conv) for conv in conversations]

    target = ANNOTATIONS_DIR / f"{name}_annotations.json"
    with open(target, "w") as handle:
        json.dump(records, handle, indent=2)
    log.info("Saved %d annotations to %s", len(records), target)
|
|
|
|
def load_annotations(name: str) -> list[Conversation]:
    """Read ``ANNOTATIONS_DIR / "<name>_annotations.json"`` back into Conversations.

    Raises:
        FileNotFoundError: If the annotation file does not exist.
    """
    source = ANNOTATIONS_DIR / f"{name}_annotations.json"
    if not source.exists():
        raise FileNotFoundError(f"Annotations not found: {source}")

    with open(source) as handle:
        records = json.load(handle)

    def revive(record):
        # Rebuild the turn objects first, then the conversation that owns them.
        segments = [TurnSegment(**raw_turn) for raw_turn in record["turns"]]
        return Conversation(
            conv_id=record["conv_id"],
            audio_path=record["audio_path"],
            sample_rate=record["sample_rate"],
            duration=record["duration"],
            turns=segments,
            turn_shifts=record["turn_shifts"],
            holds=record["holds"],
        )

    return [revive(record) for record in records]
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: build the synthetic set and/or fetch Switchboard.
    logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)

    cli = argparse.ArgumentParser(description="Download turn-taking datasets")
    cli.add_argument("--dataset", default="all", choices=["switchboard", "synthetic", "all"])
    cli.add_argument("--n-synthetic", default=100, type=int)
    options = cli.parse_args()

    run_everything = options.dataset == "all"

    # Synthetic data is generated first, then Switchboard is downloaded.
    if run_everything or options.dataset == "synthetic":
        generate_synthetic_dataset(n_conversations=options.n_synthetic)

    if run_everything or options.dataset == "switchboard":
        download_switchboard_from_hf()
|
|