diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..6b1a474906516d6c98fde8b830fb7770a1b3186f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,33 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/finite_state_turn_taking_raux_2009.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/focal_loss_lin_2017.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/hesitation-l2-french/01-kosmala-crible-2022-dual-status-filled-pauses.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/hesitation-l2-french/02-ekstedt-skantze-2022-prosody-turn-taking-vap.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/hesitation-l2-french/03-castillo-lopez-2025-survey-turn-taking-modeling.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/hesitation-l2-french/04-cenoz-2000-pauses-hesitation-L2-production.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/hesitation-l2-french/05-kosmala-2023-filled-pauses-pragmatic-markers-gaze-gesture.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/hesitation-l2-french/06-wu-2023-disfluencies-french-prosodic-params-filled-pauses.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/hesitation-l2-french/07-ekstedt-2023-turn-taking-cues-speech-synthesis.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/hesitation-l2-french/08-inoue-2024-realtime-turn-taking-vap.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/hesitation-l2-french/09-inoue-2024-multilingual-turn-taking-vap.pdf filter=lfs diff=lfs merge=lfs -text 
+03-finetune-pipecat-pt/references/papers/hesitation-l2-french/10-hutin-2024-filled-pauses-conversational-convergence.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/hesitation-l2-french/11-beatty-martinez-2020-codeswitching-bilingual-toolkit.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/hesitation-l2-french/12-lingref-code-switching-bilinguals.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/hesitation-l2-french/13-christodoulides-avanzi-2014-DisMo-disfluency-french.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/hesitation-l2-french/14-jouvet-2019-speech-processing-prosody.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/hesitation-l2-french/15-filler-particles-cross-linguistic-heritage-2024.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/hesitation-l2-french/16-hal-peters-2017-L2-fluency-french.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/hesitation-l2-french/17-christodoulides-2015-automatic-disfluency-detection-french.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/hesitation-l2-french/19-hal-multimodal-self-other-repair-L2.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/conversar_mixed_reality_l2.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/hesitation_tagging_l2_whisper_lora.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/iwsds2025_survey_turn_taking.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/multilingual_vap_2024.pdf filter=lfs diff=lfs merge=lfs -text 
+03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/speak_improve_corpus_2025.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/soda_dialog_distillation_2023.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/speculative_etd_2025.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/turn_taking_review_skantze_2021.pdf filter=lfs diff=lfs merge=lfs -text +03-finetune-pipecat-pt/references/papers/vap_turn_taking_ekstedt_2024.pdf filter=lfs diff=lfs merge=lfs -text +previous-experiments/01-benchmarks/report/figures/radar_chart.png filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..7e26ae78b8d2c677b903ff703785537c18475c22 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +data/ +checkpoints/ +__pycache__/ +*.pyc +.deploy_state.json +benchmark.log +*.egg-info/ +hf_cache/ +results/ +*.pt +*.onnx diff --git a/03-finetune-pipecat-pt/01_download_pipecat.py b/03-finetune-pipecat-pt/01_download_pipecat.py new file mode 100644 index 0000000000000000000000000000000000000000..a7aec92b9b13b0e1c2c3bc185bcf1386b2d5bb52 --- /dev/null +++ b/03-finetune-pipecat-pt/01_download_pipecat.py @@ -0,0 +1,176 @@ +"""Download Pipecat Smart Turn v3 dataset + model architecture. + +The Pipecat model is only published as ONNX (no PyTorch weights). +So we initialize from openai/whisper-tiny and train using their dataset +which already includes Portuguese samples. + +This script: +1. Downloads Pipecat's training dataset (270K samples, 23 langs) +2. Filters Portuguese samples +3. Downloads the ONNX model for reference/benchmarking +4. 
Saves Portuguese subset locally +""" + +from __future__ import annotations + +import json +import logging +from pathlib import Path + +log = logging.getLogger(__name__) + +DATA_DIR = Path(__file__).parent / "data" +CACHE_DIR = Path("/workspace/hf_cache") if Path("/workspace").exists() else DATA_DIR / "hf_cache" + + +def download_pipecat_dataset(max_pt_samples: int = 5000) -> Path: + """Download Pipecat v3.2 training data and extract Portuguese samples.""" + from datasets import load_dataset + + DATA_DIR.mkdir(parents=True, exist_ok=True) + + # Download full dataset (streaming to avoid 37GB download) + log.info("Loading Pipecat Smart Turn v3.2 training data (streaming)...") + ds = load_dataset( + "pipecat-ai/smart-turn-data-v3.2-train", + split="train", + streaming=True, + cache_dir=str(CACHE_DIR), + ) + + # Filter Portuguese samples + pt_samples = [] + other_count = 0 + for row in ds: + lang = row.get("language", "") + if lang == "por": + pt_samples.append({ + "id": row["id"], + "language": lang, + "endpoint_bool": row["endpoint_bool"], + "midfiller": row.get("midfiller", False), + "endfiller": row.get("endfiller", False), + "synthetic": row.get("synthetic", True), + "dataset": row.get("dataset", ""), + "spoken_text": row.get("spoken_text", ""), + "audio": row["audio"], + }) + if len(pt_samples) % 100 == 0: + log.info(" Found %d Portuguese samples (scanned %d others)", + len(pt_samples), other_count) + if len(pt_samples) >= max_pt_samples: + break + else: + other_count += 1 + + log.info("Total: %d Portuguese samples found (scanned %d other languages)", + len(pt_samples), other_count) + + # Save metadata (without audio) + meta_path = DATA_DIR / "pipecat_pt_metadata.json" + meta = [{k: v for k, v in s.items() if k != "audio"} for s in pt_samples] + with open(meta_path, "w") as f: + json.dump(meta, f, indent=2, ensure_ascii=False) + log.info("Metadata saved to %s", meta_path) + + # Save audio files + audio_dir = DATA_DIR / "pipecat_pt_audio" + 
audio_dir.mkdir(exist_ok=True) + + import soundfile as sf + import numpy as np + + complete = 0 + incomplete = 0 + for s in pt_samples: + audio_data = s["audio"] + audio = np.array(audio_data["array"], dtype=np.float32) + sr = audio_data["sampling_rate"] + + label = "complete" if s["endpoint_bool"] else "incomplete" + if s["endpoint_bool"]: + complete += 1 + else: + incomplete += 1 + + out_path = audio_dir / f"{s['id']}_{label}.wav" + sf.write(str(out_path), audio, sr) + + log.info("Audio saved: %d complete, %d incomplete → %s", + complete, incomplete, audio_dir) + + return audio_dir + + +def download_pipecat_test_data(max_pt_samples: int = 2000) -> Path: + """Download Pipecat test data for evaluation.""" + from datasets import load_dataset + + log.info("Loading Pipecat Smart Turn v3.2 test data (streaming)...") + ds = load_dataset( + "pipecat-ai/smart-turn-data-v3.2-test", + split="train", + streaming=True, + cache_dir=str(CACHE_DIR), + ) + + pt_samples = [] + for row in ds: + if row.get("language", "") == "por": + pt_samples.append(row) + if len(pt_samples) >= max_pt_samples: + break + + log.info("Found %d Portuguese test samples", len(pt_samples)) + + # Save + test_dir = DATA_DIR / "pipecat_pt_test" + test_dir.mkdir(exist_ok=True) + + import soundfile as sf + import numpy as np + + for s in pt_samples: + audio = np.array(s["audio"]["array"], dtype=np.float32) + sr = s["audio"]["sampling_rate"] + label = "complete" if s["endpoint_bool"] else "incomplete" + sf.write(str(test_dir / f"{s['id']}_{label}.wav"), audio, sr) + + log.info("Test audio saved → %s", test_dir) + return test_dir + + +def download_onnx_model() -> Path: + """Download Pipecat ONNX model for benchmarking.""" + from huggingface_hub import hf_hub_download + + model_dir = DATA_DIR / "pipecat_model" + model_dir.mkdir(exist_ok=True) + + for fname in ["smart-turn-v3.2-cpu.onnx", "smart-turn-v3.2-gpu.onnx"]: + path = hf_hub_download( + "pipecat-ai/smart-turn-v3", + fname, + local_dir=str(model_dir), + ) 
+ log.info("Downloaded %s → %s", fname, path) + + return model_dir + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s [%(name)s] %(message)s", + ) + + log.info("=== Step 1: Download Pipecat Portuguese training data ===") + download_pipecat_dataset(max_pt_samples=5000) + + log.info("\n=== Step 2: Download Pipecat Portuguese test data ===") + download_pipecat_test_data(max_pt_samples=2000) + + log.info("\n=== Step 3: Download ONNX model for benchmarking ===") + download_onnx_model() + + log.info("\nDone!") diff --git a/03-finetune-pipecat-pt/02_generate_labels.py b/03-finetune-pipecat-pt/02_generate_labels.py new file mode 100644 index 0000000000000000000000000000000000000000..2dfc22b7af4b2cf9538b7557ca184e4619d73166 --- /dev/null +++ b/03-finetune-pipecat-pt/02_generate_labels.py @@ -0,0 +1,474 @@ +"""Generate high-quality turn-taking labels using Claude API. + +Based on the Pipecat v3.1 pipeline: +1. Load Portuguese transcripts from CORAA MuPe + NURC-SP +2. Claude filters bad/ambiguous sentences (Pipecat: Gemini removed 50-80%) +3. Claude classifies: COMPLETE vs INCOMPLETE (semantic, not punctuation-based) +4. Claude inserts Brazilian Portuguese fillers +5. Claude generates French-accented Portuguese variants + +Uses Claude Haiku for cost efficiency (~$0.001 per 20 sentences). 
+""" + +from __future__ import annotations + +import json +import logging +import os +import re +import time +from dataclasses import dataclass, field +from pathlib import Path + +log = logging.getLogger(__name__) + +DATA_DIR = Path(__file__).parent / "data" +CACHE_DIR = Path("/workspace/hf_cache") if Path("/workspace").exists() else DATA_DIR / "hf_cache" + +# Brazilian Portuguese fillers (from research: Claude + GPT generate these for Pipecat) +PT_BR_FILLERS = [ + "hum", "eh", "ah", "tipo", "ne", "entao", "assim", "quer dizer", + "como e que eu falo", "deixa eu pensar", "olha", "bom", "pois e", + "sabe", "veja bem", "na verdade", "digamos", "e...", +] + +# French speaker fillers when speaking Portuguese +FR_PT_FILLERS = [ + "euh", "alors", "comment dire", "como se diz", "enfin", + "c'est-a-dire", "voila", "bon", "donc", +] + + +@dataclass +class LabeledSentence: + text: str + label: str # "complete" or "incomplete" + confidence: float # 0-1 + source: str # "coraa", "mupe", "claude_generated" + has_filler: bool = False + filler_type: str = "" # "pt_br" or "fr_pt" + original_text: str = "" + + +def load_coraa_transcripts(max_samples: int = 10000) -> list[dict]: + """Load transcripts from CORAA MuPe (365h interviews, diarization kappa 0.947).""" + from datasets import load_dataset + + log.info("Loading CORAA MuPe transcripts (streaming)...") + ds = load_dataset( + "nilc-nlp/CORAA-MUPE-ASR", + split="train", + streaming=True, + cache_dir=str(CACHE_DIR), + ) + + transcripts = [] + for i, row in enumerate(ds): + text = str(row.get("text", row.get("normalized_text", ""))) + if not text or len(text.split()) < 3: + continue + + transcripts.append({ + "text": text, + "speaker_type": row.get("speaker_type", ""), + "speaker_code": str(row.get("speaker_code", f"mupe_{i}")), + "source": "mupe", + }) + + if len(transcripts) >= max_samples: + break + + if i % 5000 == 0 and i > 0: + log.info(" Scanned %d rows, collected %d transcripts", i, len(transcripts)) + + log.info("CORAA 
MuPe: %d transcripts loaded", len(transcripts)) + return transcripts + + +def load_nurc_transcripts(max_samples: int = 10000) -> list[dict]: + """Load transcripts from CORAA NURC-SP (239h dialogues, speaker IDs).""" + from datasets import load_dataset + + log.info("Loading CORAA NURC-SP transcripts (streaming)...") + try: + ds = load_dataset( + "nilc-nlp/CORAA-NURC-SP-Audio-Corpus", + split="train", + streaming=True, + cache_dir=str(CACHE_DIR), + ) + except Exception as e: + log.warning("Failed to load NURC-SP: %s", e) + return [] + + transcripts = [] + for i, row in enumerate(ds): + text = str(row.get("text", row.get("sentence", ""))) + if not text or len(text.split()) < 3: + continue + + transcripts.append({ + "text": text, + "speaker_type": row.get("speech_genre", ""), + "speaker_code": str(row.get("speaker_id", f"nurc_{i}")), + "source": "nurc_sp", + }) + + if len(transcripts) >= max_samples: + break + + log.info("NURC-SP: %d transcripts loaded", len(transcripts)) + return transcripts + + +def classify_with_claude( + sentences: list[str], + batch_size: int = 20, + model: str = "claude-haiku-4-5-20251001", +) -> list[dict]: + """Use Claude to classify sentences as complete/incomplete. + + Based on Pipecat's approach: Gemini 2.5 Flash filtered 50-80% of sentences. + We use Claude Haiku for cost efficiency. + + Returns list of {text, label, confidence, keep}. + """ + import anthropic + + api_key = os.environ.get("ANTHROPIC_API_KEY") + if not api_key: + log.warning("ANTHROPIC_API_KEY not set — using rule-based fallback") + return _rule_based_classify(sentences) + + client = anthropic.Anthropic(api_key=api_key) + results = [] + + for batch_start in range(0, len(sentences), batch_size): + batch = sentences[batch_start:batch_start + batch_size] + numbered = "\n".join(f"{i+1}. {s}" for i, s in enumerate(batch)) + + prompt = f"""Voce e um anotador de turn-taking para portugues brasileiro. 
+Para cada frase abaixo, classifique: +- COMPLETO: o falante terminou o pensamento (pode comecar a traduzir) +- INCOMPLETO: o falante vai continuar falando (nao traduzir ainda) +- RUIM: frase com erro, ambigua, ou inutilizavel (descartar) + +Responda APENAS em JSON, sem explicacao: +[{{"n": 1, "label": "COMPLETO", "confidence": 0.95}}, ...] + +Frases: +{numbered}""" + + try: + response = client.messages.create( + model=model, + max_tokens=2000, + messages=[{"role": "user", "content": prompt}], + ) + text = response.content[0].text + + # Parse JSON from response + json_match = re.search(r'\[.*\]', text, re.DOTALL) + if json_match: + batch_results = json.loads(json_match.group()) + for item in batch_results: + idx = item["n"] - 1 + if 0 <= idx < len(batch): + results.append({ + "text": batch[idx], + "label": item["label"].lower(), + "confidence": item.get("confidence", 0.5), + "keep": item["label"].upper() != "RUIM", + }) + else: + log.warning("No JSON in Claude response for batch %d", batch_start) + results.extend(_rule_based_classify(batch)) + + except Exception as e: + log.warning("Claude API error at batch %d: %s — falling back to rules", batch_start, e) + results.extend(_rule_based_classify(batch)) + + # Rate limiting + if batch_start % 100 == 0 and batch_start > 0: + log.info(" Classified %d/%d sentences", batch_start, len(sentences)) + time.sleep(0.5) + + return results + + +def _rule_based_classify(sentences: list[str]) -> list[dict]: + """Fallback rule-based classification (less accurate than Claude).""" + results = [] + for s in sentences: + text = s.strip() + if not text or len(text) < 5: + results.append({"text": text, "label": "ruim", "confidence": 0.5, "keep": False}) + continue + + if re.search(r'[.!?]+\s*$', text): + results.append({"text": text, "label": "completo", "confidence": 0.7, "keep": True}) + elif re.search(r'[,;:\-]\s*$', text): + results.append({"text": text, "label": "incompleto", "confidence": 0.6, "keep": True}) + elif 
re.search(r'\b(e|que|mas|porque|quando|se|como|pra|para)\s*$', text, re.I): + results.append({"text": text, "label": "incompleto", "confidence": 0.8, "keep": True}) + else: + results.append({"text": text, "label": "completo", "confidence": 0.5, "keep": True}) + + return results + + +def insert_fillers_with_claude( + sentences: list[str], + filler_type: str = "pt_br", + model: str = "claude-haiku-4-5-20251001", +) -> list[dict]: + """Use Claude to insert fillers at natural points. + + Based on Pipecat: Gemini Flash inserted fillers at natural break points. + + filler_type: "pt_br" for native Brazilian, "fr_pt" for French-accented + """ + import anthropic + + api_key = os.environ.get("ANTHROPIC_API_KEY") + if not api_key: + log.warning("ANTHROPIC_API_KEY not set — using simple filler insertion") + return _simple_filler_insert(sentences, filler_type) + + client = anthropic.Anthropic(api_key=api_key) + fillers = PT_BR_FILLERS if filler_type == "pt_br" else FR_PT_FILLERS + + filler_list = ", ".join(f'"{f}"' for f in fillers) + + if filler_type == "fr_pt": + context = """O falante e um frances de nivel B1 falando portugues. +Ele hesita ao conjugar verbos, busca palavras, e as vezes usa fillers em frances. +Insira hesitacoes naturais como: """ + filler_list + else: + context = """O falante e brasileiro nativo. +Insira fillers naturais do portugues brasileiro como: """ + filler_list + + results = [] + batch_size = 15 + + for batch_start in range(0, len(sentences), batch_size): + batch = sentences[batch_start:batch_start + batch_size] + numbered = "\n".join(f"{i+1}. {s}" for i, s in enumerate(batch)) + + prompt = f"""{context} + +Para cada frase, crie uma versao com 1-2 fillers inseridos em pontos naturais. +A frase deve parecer INCOMPLETA (como se o falante fosse continuar). 
+ +Responda APENAS em JSON: +[{{"n": 1, "original": "frase original", "with_filler": "frase com filler"}}] + +Frases: +{numbered}""" + + try: + response = client.messages.create( + model=model, + max_tokens=3000, + messages=[{"role": "user", "content": prompt}], + ) + text = response.content[0].text + json_match = re.search(r'\[.*\]', text, re.DOTALL) + if json_match: + batch_results = json.loads(json_match.group()) + for item in batch_results: + results.append({ + "original": item.get("original", ""), + "with_filler": item.get("with_filler", ""), + "filler_type": filler_type, + }) + + except Exception as e: + log.warning("Claude filler API error: %s", e) + results.extend(_simple_filler_insert(batch, filler_type)) + + time.sleep(0.3) + + return results + + +def _simple_filler_insert(sentences: list[str], filler_type: str) -> list[dict]: + """Simple rule-based filler insertion fallback.""" + import random + + fillers = PT_BR_FILLERS if filler_type == "pt_br" else FR_PT_FILLERS + results = [] + + for s in sentences: + words = s.split() + if len(words) < 4: + results.append({"original": s, "with_filler": s, "filler_type": filler_type}) + continue + + # Insert filler at ~40% of sentence + pos = max(1, len(words) * 2 // 5) + filler = random.choice(fillers) + words.insert(pos, f"{filler}...") + results.append({ + "original": s, + "with_filler": " ".join(words), + "filler_type": filler_type, + }) + + return results + + +def generate_french_portuguese_sentences( + n_sentences: int = 500, + model: str = "claude-haiku-4-5-20251001", +) -> list[dict]: + """Generate sentences typical of French speakers learning Portuguese. + + Based on Pipecat method: use LLM to generate diverse training text. 
+ """ + import anthropic + + api_key = os.environ.get("ANTHROPIC_API_KEY") + if not api_key: + log.warning("ANTHROPIC_API_KEY not set — cannot generate French-PT sentences") + return [] + + client = anthropic.Anthropic(api_key=api_key) + results = [] + + contexts = [ + "reuniao de trabalho", + "conversa informal com amigos", + "apresentacao de projeto", + "negociacao comercial", + "aula de portugues", + "pedindo informacoes na rua", + "restaurante pedindo comida", + "entrevista de emprego", + "ligacao telefonica profissional", + "discussao tecnica sobre software", + ] + + for ctx in contexts: + prompt = f"""Gere {n_sentences // len(contexts)} frases que um FRANCES de nivel B1-B2 diria em portugues durante: {ctx} + +Inclua variedade: +- Frases COMPLETAS (pensamento terminado, pode traduzir) +- Frases INCOMPLETAS (vai continuar falando, nao traduzir) +- Com hesitacoes tipicas (euh, alors, como se diz, tipo) +- Com erros de conjugacao comuns de franceses +- Com code-switching involuntario (palavra em frances no meio) + +Responda em JSON: +[{{"text": "frase", "label": "completo" ou "incompleto", "notes": "o que torna dificil classificar"}}]""" + + try: + response = client.messages.create( + model=model, + max_tokens=4000, + messages=[{"role": "user", "content": prompt}], + ) + text = response.content[0].text + json_match = re.search(r'\[.*\]', text, re.DOTALL) + if json_match: + batch = json.loads(json_match.group()) + for item in batch: + results.append({ + "text": item["text"], + "label": item["label"], + "source": "claude_fr_pt", + "context": ctx, + "notes": item.get("notes", ""), + }) + log.info(" Generated %d sentences for context: %s", len(batch), ctx) + + except Exception as e: + log.warning("Error generating for %s: %s", ctx, e) + + time.sleep(1) + + log.info("Total French-PT sentences generated: %d", len(results)) + return results + + +def run_full_pipeline( + max_transcripts: int = 5000, + max_fr_sentences: int = 500, +) -> Path: + """Run the full label 
generation pipeline.""" + output_dir = DATA_DIR / "claude_labeled" + output_dir.mkdir(parents=True, exist_ok=True) + + # Step 1: Load transcripts + log.info("=== Step 1: Loading Portuguese transcripts ===") + mupe = load_coraa_transcripts(max_samples=max_transcripts) + nurc = load_nurc_transcripts(max_samples=max_transcripts) + all_transcripts = mupe + nurc + log.info("Total transcripts: %d (MuPe=%d, NURC=%d)", len(all_transcripts), len(mupe), len(nurc)) + + # Step 2: Claude classifies + log.info("\n=== Step 2: Claude classifies complete/incomplete ===") + sentences = [t["text"] for t in all_transcripts] + classified = classify_with_claude(sentences) + + # Filter bad sentences (Pipecat removed 50-80%) + kept = [c for c in classified if c["keep"]] + removed = len(classified) - len(kept) + log.info("Classified: %d total, %d kept, %d removed (%.0f%%)", + len(classified), len(kept), removed, 100 * removed / max(len(classified), 1)) + + complete = [c for c in kept if c["label"] == "completo"] + incomplete = [c for c in kept if c["label"] == "incompleto"] + log.info(" Complete: %d, Incomplete: %d", len(complete), len(incomplete)) + + # Save classified + with open(output_dir / "classified_pt.json", "w") as f: + json.dump(kept, f, indent=2, ensure_ascii=False) + + # Step 3: Insert fillers (creates INCOMPLETE variants) + log.info("\n=== Step 3: Insert fillers (PT-BR + FR-PT) ===") + complete_texts = [c["text"] for c in complete[:1000]] + + pt_fillers = insert_fillers_with_claude(complete_texts, filler_type="pt_br") + fr_fillers = insert_fillers_with_claude(complete_texts[:500], filler_type="fr_pt") + + with open(output_dir / "fillers_pt_br.json", "w") as f: + json.dump(pt_fillers, f, indent=2, ensure_ascii=False) + with open(output_dir / "fillers_fr_pt.json", "w") as f: + json.dump(fr_fillers, f, indent=2, ensure_ascii=False) + + log.info("Fillers: %d PT-BR, %d FR-PT", len(pt_fillers), len(fr_fillers)) + + # Step 4: Generate French-Portuguese sentences + log.info("\n=== 
Step 4: Generate French-Portuguese sentences ===") + fr_sentences = generate_french_portuguese_sentences(n_sentences=max_fr_sentences) + with open(output_dir / "french_portuguese.json", "w") as f: + json.dump(fr_sentences, f, indent=2, ensure_ascii=False) + + # Summary + summary = { + "total_transcripts": len(all_transcripts), + "classified_kept": len(kept), + "classified_removed": removed, + "complete": len(complete), + "incomplete": len(incomplete), + "fillers_pt_br": len(pt_fillers), + "fillers_fr_pt": len(fr_fillers), + "french_portuguese": len(fr_sentences), + } + with open(output_dir / "summary.json", "w") as f: + json.dump(summary, f, indent=2) + + log.info("\n=== Summary ===") + for k, v in summary.items(): + log.info(" %s: %d", k, v) + + return output_dir + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s [%(name)s] %(message)s", + ) + run_full_pipeline(max_transcripts=5000, max_fr_sentences=500) diff --git a/03-finetune-pipecat-pt/03_generate_audio.py b/03-finetune-pipecat-pt/03_generate_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..e22acd2ea2f1a63ee848f46ad9a955050e7e436c --- /dev/null +++ b/03-finetune-pipecat-pt/03_generate_audio.py @@ -0,0 +1,697 @@ +"""Generate TTS audio for the turn-taking dataset. + +Based on the Pipecat v3.1 pipeline: +1. Native PT-BR voices via Kokoro (pf_dora, pm_alex, pm_santa) +2. French-accented Portuguese via XTTS v2 voice cloning +3. Speed/pitch variation + noise augmentation +4. Hesitation pause injection (1.5-3s silence after fillers) — SpeculativeETD V3 +5. Short utterance dataset ("sim", "nao", "ok") — Pipecat v3.2 (-40% errors) + +Pipecat used Google Chirp3 TTS; we use Kokoro (open-source, runs locally) ++ XTTS v2 (voice cloning for accent simulation). 
+ +Run locally or on Modal GPU: + python 03_generate_audio.py +""" + +from __future__ import annotations + +import json +import logging +import random +import re +from dataclasses import dataclass +from pathlib import Path + +import numpy as np + +log = logging.getLogger(__name__) + +DATA_DIR = Path(__file__).parent / "data" +SAMPLE_RATE = 16000 # Pipecat model expects 16kHz + + +@dataclass +class AudioSample: + """A single audio sample for the dataset.""" + audio: np.ndarray # float32, 16kHz + text: str + label: str # "complete" or "incomplete" + voice: str # e.g. "pf_dora", "fr_clone_1" + accent: str # "native_pt_br" or "french_pt" + source: str # "kokoro", "xtts", "pipecat" + speed: float = 1.0 + + +# --------------------------------------------------------------------------- +# Kokoro TTS — native PT-BR voices +# --------------------------------------------------------------------------- + +def generate_kokoro_audio( + sentences: list[dict], + voices: list[str] | None = None, + speed_range: tuple[float, float] = (0.85, 1.15), +) -> list[AudioSample]: + """Generate audio using Kokoro TTS (PT-BR voices). 
+ + sentences: list of {"text": str, "label": "complete"|"incomplete"} + voices: Kokoro voice IDs (default: PT-BR voices) + """ + try: + from kokoro import KPipeline + except ImportError: + log.error("kokoro not installed — pip install kokoro") + return [] + + if voices is None: + voices = ["pf_dora", "pm_alex", "pm_santa"] + + log.info("Initializing Kokoro pipeline (lang=pt)...") + pipeline = KPipeline(lang_code="p") # Portuguese + + samples = [] + errors = 0 + + for i, sent in enumerate(sentences): + text = sent["text"] + label = sent["label"] + voice = random.choice(voices) + speed = random.uniform(*speed_range) + + try: + # Generate audio + generator = pipeline(text, voice=voice, speed=speed) + audio_chunks = [] + for _, _, chunk in generator: + audio_chunks.append(chunk.numpy() if hasattr(chunk, 'numpy') else np.array(chunk)) + + if not audio_chunks: + errors += 1 + continue + + audio = np.concatenate(audio_chunks).astype(np.float32) + + # Resample to 16kHz if needed (Kokoro outputs 24kHz) + if hasattr(pipeline, 'sample_rate') and pipeline.sample_rate != SAMPLE_RATE: + import torchaudio + import torch + tensor = torch.from_numpy(audio).float().unsqueeze(0) + audio = torchaudio.functional.resample( + tensor, pipeline.sample_rate, SAMPLE_RATE + ).squeeze().numpy() + + # Normalize + peak = np.max(np.abs(audio)) + if peak > 0: + audio = audio / peak * 0.9 + + samples.append(AudioSample( + audio=audio, + text=text, + label=label, + voice=voice, + accent="native_pt_br", + source="kokoro", + speed=speed, + )) + + except Exception as e: + errors += 1 + if errors <= 5: + log.warning("Kokoro error on sentence %d: %s", i, e) + + if (i + 1) % 100 == 0: + log.info(" Kokoro: %d/%d generated (%d errors)", len(samples), i + 1, errors) + + log.info("Kokoro: %d samples generated (%d errors)", len(samples), errors) + return samples + + +# --------------------------------------------------------------------------- +# XTTS v2 — French-accented Portuguese via voice cloning +# 
# ---------------------------------------------------------------------------

def generate_xtts_audio(
    sentences: list[dict],
    reference_audio_dir: Path | None = None,
    speed_range: tuple[float, float] = (0.9, 1.1),
) -> list[AudioSample]:
    """Generate audio with French accent using XTTS v2 voice cloning.

    Uses short French speech samples as reference to clone the accent,
    then synthesizes Portuguese text with that voice.

    sentences: list of {"text": str, "label": "complete"|"incomplete"}
    reference_audio_dir: directory with .wav files of French speakers
    speed_range: uniform range for per-sample playback-speed jitter
    """
    try:
        from TTS.api import TTS
    except ImportError:
        log.error("TTS (coqui) not installed — pip install TTS")
        return []

    log.info("Initializing XTTS v2...")
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")

    # Find reference audio files (French speakers)
    ref_files = []
    if reference_audio_dir and reference_audio_dir.exists():
        ref_files = list(reference_audio_dir.glob("*.wav"))

    if not ref_files:
        log.warning("No French reference audio found in %s — using default voice", reference_audio_dir)
        # Fall back to generating without cloning (still works, but no accent)
        return _generate_xtts_no_clone(tts, sentences, speed_range)

    log.info("Found %d French reference voices", len(ref_files))

    samples = []
    errors = 0

    for i, sent in enumerate(sentences):
        text = sent["text"]
        label = sent["label"]
        # Each sentence gets a randomly chosen French reference voice.
        ref = random.choice(ref_files)

        try:
            audio = tts.tts(
                text=text,
                speaker_wav=str(ref),
                language="pt",
            )
            audio = np.array(audio, dtype=np.float32)

            # Resample to 16kHz if needed
            if hasattr(tts, 'synthesizer') and tts.synthesizer.output_sample_rate != SAMPLE_RATE:
                import torchaudio
                import torch
                tensor = torch.from_numpy(audio).float().unsqueeze(0)
                audio = torchaudio.functional.resample(
                    tensor, tts.synthesizer.output_sample_rate, SAMPLE_RATE
                ).squeeze().numpy()

            # Normalize (peak to 0.9 full scale)
            peak = np.max(np.abs(audio))
            if peak > 0:
                audio = audio / peak * 0.9

            # Random speed variation — implemented by index decimation/duplication,
            # so pitch shifts along with speed (cheap, no resampler needed).
            speed = random.uniform(*speed_range)
            if abs(speed - 1.0) > 0.05:
                indices = np.arange(0, len(audio), speed).astype(int)
                indices = indices[indices < len(audio)]
                audio = audio[indices]

            samples.append(AudioSample(
                audio=audio,
                text=text,
                label=label,
                voice=f"fr_clone_{ref.stem}",
                accent="french_pt",
                source="xtts",
                speed=speed,
            ))

        except Exception as e:
            errors += 1
            # Only log the first few failures to avoid log spam.
            if errors <= 5:
                log.warning("XTTS error on sentence %d: %s", i, e)

        if (i + 1) % 50 == 0:
            log.info(" XTTS: %d/%d generated (%d errors)", len(samples), i + 1, errors)

    log.info("XTTS: %d samples generated (%d errors)", len(samples), errors)
    return samples


def _generate_xtts_no_clone(
    tts,
    sentences: list[dict],
    speed_range: tuple[float, float],
) -> list[AudioSample]:
    """XTTS generation without voice cloning (fallback).

    Note: speed_range is accepted for signature symmetry with the cloning
    path but is not applied here — all fallback samples use speed=1.0.
    """
    samples = []
    errors = 0

    for i, sent in enumerate(sentences):
        try:
            audio = tts.tts(text=sent["text"], language="pt")
            audio = np.array(audio, dtype=np.float32)

            # Peak-normalize to 0.9 full scale.
            peak = np.max(np.abs(audio))
            if peak > 0:
                audio = audio / peak * 0.9

            samples.append(AudioSample(
                audio=audio,
                text=sent["text"],
                label=sent["label"],
                voice="xtts_default",
                accent="french_pt",
                source="xtts",
                speed=1.0,
            ))
        except Exception as e:
            errors += 1
            if errors <= 3:
                log.warning("XTTS fallback error: %s", e)

    log.info("XTTS (no clone): %d samples (%d errors)", len(samples), errors)
    return samples


# ---------------------------------------------------------------------------
# Hesitation pause injection (SpeculativeETD V3 — best data variant)
# ---------------------------------------------------------------------------

def inject_hesitation_pause(
    audio: np.ndarray,
    pause_duration_range: tuple[float, float] = (1.5, 3.0),
    position: str = "end",
) -> np.ndarray:
    """Inject a realistic hesitation pause into audio.

    SpeculativeETD V3 showed that inserting 1.5-3.0s pauses after fillers
    was the most effective data variant for training ETD models.

    For French speakers learning Portuguese, these long pauses happen when:
    - Searching for a word ("Eu preciso de... [2s pause] ...uma tesoura")
    - Thinking about conjugation ("Ontem eu... [2s pause] ...fui ao mercado")
    - Code-switching hesitation ("Eu gosto de... [1.5s] ...euh... praia")

    position: "end" = pause at end (simulates mid-utterance stop)
              "mid" = pause in the middle (simulates hesitation)
    """
    pause_s = random.uniform(*pause_duration_range)
    pause_samples = int(pause_s * SAMPLE_RATE)

    # Add very low-level noise to the pause (not pure silence — more realistic)
    pause = np.random.randn(pause_samples).astype(np.float32) * 0.001

    if position == "end":
        # Pause at end: speaker stops mid-sentence (INCOMPLETE)
        return np.concatenate([audio, pause])
    else:
        # Pause in middle: split audio and insert pause
        if len(audio) < SAMPLE_RATE:  # too short to split
            return np.concatenate([audio, pause])
        # Split somewhere in the middle third-to-two-thirds of the clip.
        split_point = random.randint(len(audio) // 3, 2 * len(audio) // 3)
        return np.concatenate([audio[:split_point], pause, audio[split_point:]])


def create_hesitation_variants(
    samples: list[AudioSample],
    fraction: float = 0.3,
) -> list[AudioSample]:
    """Create hesitation-pause variants from existing INCOMPLETE samples.

    Takes a fraction of incomplete samples and adds 1.5-3s pauses,
    simulating French speakers hesitating in Portuguese.
    """
    incomplete = [s for s in samples if s.label == "incomplete"]
    n_variants = int(len(incomplete) * fraction)

    variants = []
    for s in random.sample(incomplete, min(n_variants, len(incomplete))):
        # Variant 1: long pause at end (speaker stopped to think)
        audio_end = inject_hesitation_pause(s.audio, position="end")
        variants.append(AudioSample(
            audio=audio_end,
            text=s.text,
            label="incomplete",  # still incomplete — speaker will continue
            voice=s.voice,
            accent=s.accent,
            source=f"{s.source}_hesitation_end",
            speed=s.speed,
        ))

        # Variant 2: pause in the middle (thinking mid-sentence) — only for
        # half of the selected samples, so "end" variants dominate.
        if random.random() < 0.5:
            audio_mid = inject_hesitation_pause(s.audio, position="mid")
            variants.append(AudioSample(
                audio=audio_mid,
                text=s.text,
                label="incomplete",
                voice=s.voice,
                accent=s.accent,
                source=f"{s.source}_hesitation_mid",
                speed=s.speed,
            ))

    log.info("Created %d hesitation-pause variants from %d incomplete samples",
             len(variants), len(incomplete))
    return variants


# ---------------------------------------------------------------------------
# Short utterance generation (Pipecat v3.2: -40% errors on short responses)
# ---------------------------------------------------------------------------

# Common short Portuguese responses in meetings
SHORT_UTTERANCES_COMPLETE = [
    "Sim.", "Não.", "Ok.", "Tá.", "Pode.", "Beleza.", "Certo.", "Claro.",
    "Entendi.", "Combinado.", "Perfeito.", "Exato.", "Isso.", "Verdade.",
    "Com certeza.", "Sem dúvida.", "Tá bom.", "Pode ser.", "Vamos lá.",
    "Concordo.", "Fechado.", "Ótimo.", "Legal.", "Tranquilo.", "Valeu.",
    "Obrigado.", "De nada.", "Até logo.", "Tchau.", "Bom dia.", "Boa tarde.",
    # French speaker variants
    "Oui... quer dizer, sim.", "Sim, euh, concordo.", "Ok, d'accord.",
    "Bon, tá bom.", "Voilà, é isso.", "Exactement, exato.",
]

SHORT_UTTERANCES_INCOMPLETE = [
    "Sim, mas...", "Não, porque...", "Então...", "Tipo...", "Olha...",
    "Bom...", "Na verdade...", "Quer dizer...", "Pois é...", "Sabe...",
    "É que...", "O problema é...", "A questão é...", "Eu acho que...",
    # French speaker variants (hesitating after short start)
    "Oui... euh...", "Sim, mas... comment dire...", "Alors...",
    "Bon, eu acho que... euh...", "C'est... quer dizer...",
    "Não, enfin... na verdade...", "Sim, mas... como se diz...",
]


def generate_short_utterances(
    voices: list[str] | None = None,
    n_per_utterance: int = 3,
) -> list[AudioSample]:
    """Generate audio for short utterances using Kokoro TTS.

    Pipecat v3.2 showed that a dedicated short utterance dataset
    reduced misclassification by 40%. Short responses like "sim", "não"
    are very common in Portuguese meetings.
    """
    try:
        from kokoro import KPipeline
    except ImportError:
        log.warning("kokoro not installed — skipping short utterances")
        return []

    if voices is None:
        voices = ["pf_dora", "pm_alex", "pm_santa"]

    # lang_code="p" selects the Portuguese pipeline — TODO confirm against kokoro docs
    pipeline = KPipeline(lang_code="p")
    samples = []
    errors = 0

    all_short = (
        [(text, "complete") for text in SHORT_UTTERANCES_COMPLETE]
        + [(text, "incomplete") for text in SHORT_UTTERANCES_INCOMPLETE]
    )

    for text, label in all_short:
        for _ in range(n_per_utterance):
            voice = random.choice(voices)
            speed = random.uniform(0.85, 1.15)

            try:
                generator = pipeline(text, voice=voice, speed=speed)
                audio_chunks = []
                # Pipeline yields 3-tuples; the third element is the audio chunk.
                for _, _, chunk in generator:
                    audio_chunks.append(chunk.numpy() if hasattr(chunk, 'numpy') else np.array(chunk))

                if not audio_chunks:
                    errors += 1
                    continue

                audio = np.concatenate(audio_chunks).astype(np.float32)

                # Resample to 16kHz if needed
                if hasattr(pipeline, 'sample_rate') and pipeline.sample_rate != SAMPLE_RATE:
                    import torchaudio
                    import torch
tensor = torch.from_numpy(audio).float().unsqueeze(0) + audio = torchaudio.functional.resample( + tensor, pipeline.sample_rate, SAMPLE_RATE + ).squeeze().numpy() + + # Normalize + peak = np.max(np.abs(audio)) + if peak > 0: + audio = audio / peak * 0.9 + + # For incomplete short utterances, add hesitation pause + if label == "incomplete" and random.random() < 0.7: + audio = inject_hesitation_pause(audio, pause_duration_range=(1.0, 2.5), position="end") + + samples.append(AudioSample( + audio=audio, + text=text, + label=label, + voice=voice, + accent="native_pt_br", + source="short_utterance", + speed=speed, + )) + + except Exception as e: + errors += 1 + if errors <= 5: + log.warning("Short utterance error: %s", e) + + n_c = sum(1 for s in samples if s.label == "complete") + n_i = sum(1 for s in samples if s.label == "incomplete") + log.info("Short utterances: %d samples (%d complete, %d incomplete, %d errors)", + len(samples), n_c, n_i, errors) + return samples + + +# --------------------------------------------------------------------------- +# Audio augmentation +# --------------------------------------------------------------------------- + +def augment_sample(audio: np.ndarray) -> np.ndarray: + """Apply augmentation to diversify training data. + + Based on Pipecat/SpeculativeETD augmentation strategies. 
+ """ + aug = audio.copy() + + # Add background noise (simulates meeting room) + if random.random() < 0.4: + noise_level = random.uniform(0.002, 0.015) + aug += np.random.randn(len(aug)).astype(np.float32) * noise_level + + # Volume variation (simulates different mic distances) + if random.random() < 0.5: + scale = random.uniform(0.6, 1.4) + aug *= scale + + # Speed perturbation (simulates speaking rate variation) + if random.random() < 0.3: + speed = random.uniform(0.92, 1.08) + indices = np.arange(0, len(aug), speed).astype(int) + indices = indices[indices < len(aug)] + aug = aug[indices] + + return np.clip(aug, -1.0, 1.0).astype(np.float32) + + +# --------------------------------------------------------------------------- +# Save dataset +# --------------------------------------------------------------------------- + +def save_dataset( + samples: list[AudioSample], + output_dir: Path, + augment_copies: int = 2, +) -> Path: + """Save audio samples as WAV files + metadata JSON. + + augment_copies: number of augmented copies per sample (0 = no augmentation) + """ + import soundfile as sf + + output_dir.mkdir(parents=True, exist_ok=True) + audio_dir = output_dir / "audio" + audio_dir.mkdir(exist_ok=True) + + metadata = [] + total_saved = 0 + + for i, sample in enumerate(samples): + # Save original + fname = f"{i:05d}_{sample.label}_{sample.accent}_{sample.voice}.wav" + sf.write(str(audio_dir / fname), sample.audio, SAMPLE_RATE) + metadata.append({ + "file": fname, + "text": sample.text, + "label": sample.label, + "voice": sample.voice, + "accent": sample.accent, + "source": sample.source, + "speed": sample.speed, + "augmented": False, + "duration_s": round(len(sample.audio) / SAMPLE_RATE, 2), + }) + total_saved += 1 + + # Save augmented copies + for aug_idx in range(augment_copies): + aug_audio = augment_sample(sample.audio) + aug_fname = f"{i:05d}_{sample.label}_{sample.accent}_{sample.voice}_aug{aug_idx}.wav" + sf.write(str(audio_dir / aug_fname), aug_audio, 
SAMPLE_RATE) + metadata.append({ + "file": aug_fname, + "text": sample.text, + "label": sample.label, + "voice": sample.voice, + "accent": sample.accent, + "source": sample.source, + "speed": sample.speed, + "augmented": True, + "duration_s": round(len(aug_audio) / SAMPLE_RATE, 2), + }) + total_saved += 1 + + # Save metadata + meta_path = output_dir / "metadata.json" + with open(meta_path, "w") as f: + json.dump(metadata, f, indent=2, ensure_ascii=False) + + # Stats + n_complete = sum(1 for m in metadata if m["label"] == "complete") + n_incomplete = sum(1 for m in metadata if m["label"] == "incomplete") + n_native = sum(1 for m in metadata if m["accent"] == "native_pt_br") + n_french = sum(1 for m in metadata if m["accent"] == "french_pt") + + log.info("Dataset saved to %s:", output_dir) + log.info(" Total: %d samples (%d original + %d augmented)", + total_saved, len(samples), total_saved - len(samples)) + log.info(" Complete: %d, Incomplete: %d", n_complete, n_incomplete) + log.info(" Native PT-BR: %d, French-accented: %d", n_native, n_french) + + return output_dir + + +# --------------------------------------------------------------------------- +# Main pipeline +# --------------------------------------------------------------------------- + +def run_audio_generation( + max_native_sentences: int = 3000, + max_french_sentences: int = 1000, + augment_copies: int = 2, +) -> Path: + """Run the full audio generation pipeline. + + Loads labeled sentences from 02_generate_labels.py output, + generates audio via Kokoro + XTTS, saves dataset. + """ + labeled_dir = DATA_DIR / "claude_labeled" + + # Load labeled sentences + all_sentences = [] + + # 1. 
Classified sentences (from CORAA transcripts) + classified_path = labeled_dir / "classified_pt.json" + if classified_path.exists(): + with open(classified_path) as f: + classified = json.load(f) + for c in classified: + if c.get("label") in ("completo", "incompleto"): + all_sentences.append({ + "text": c["text"], + "label": "complete" if c["label"] == "completo" else "incomplete", + "source": "classified", + }) + log.info("Loaded %d classified sentences", len(all_sentences)) + + # 2. Filler sentences (all are INCOMPLETE) + for filler_file, filler_type in [ + ("fillers_pt_br.json", "pt_br"), + ("fillers_fr_pt.json", "fr_pt"), + ]: + fpath = labeled_dir / filler_file + if fpath.exists(): + with open(fpath) as f: + fillers = json.load(f) + for fl in fillers: + if fl.get("with_filler"): + all_sentences.append({ + "text": fl["with_filler"], + "label": "incomplete", + "source": f"filler_{filler_type}", + }) + log.info("Loaded %d %s filler sentences", len(fillers), filler_type) + + # 3. French-Portuguese sentences + frpt_path = labeled_dir / "french_portuguese.json" + if frpt_path.exists(): + with open(frpt_path) as f: + frpt = json.load(f) + for s in frpt: + label = "complete" if s.get("label", "") == "completo" else "incomplete" + all_sentences.append({ + "text": s["text"], + "label": label, + "source": "claude_fr_pt", + }) + log.info("Loaded %d French-Portuguese sentences", len(frpt)) + + if not all_sentences: + log.error("No labeled sentences found in %s — run 02_generate_labels.py first", labeled_dir) + return DATA_DIR + + log.info("Total sentences to synthesize: %d", len(all_sentences)) + + # Split: native PT-BR vs French-accented + native_sentences = [s for s in all_sentences if s["source"] != "claude_fr_pt"] + french_sentences = [s for s in all_sentences if s["source"] == "claude_fr_pt"] + + # Also use some filler_fr_pt sentences with XTTS + fr_filler = [s for s in all_sentences if s["source"] == "filler_fr_pt"] + french_sentences.extend(fr_filler) + + 
    # Shuffle before truncation so the caps sample uniformly, not head-first.
    random.shuffle(native_sentences)
    random.shuffle(french_sentences)
    native_sentences = native_sentences[:max_native_sentences]
    french_sentences = french_sentences[:max_french_sentences]

    # Generate audio
    log.info("\n=== Generating native PT-BR audio (Kokoro) ===")
    native_samples = generate_kokoro_audio(native_sentences)

    log.info("\n=== Generating French-accented audio (XTTS) ===")
    ref_dir = DATA_DIR / "french_reference_audio"
    french_samples = generate_xtts_audio(french_sentences, reference_audio_dir=ref_dir)

    # NEW: Short utterances (Pipecat v3.2: -40% errors)
    log.info("\n=== Generating short utterance dataset ===")
    short_samples = generate_short_utterances(n_per_utterance=3)

    # NEW: Hesitation pause variants (SpeculativeETD V3: best data variant)
    # Critical for French speakers: long pauses mid-sentence are NOT end-of-turn
    log.info("\n=== Creating hesitation-pause variants ===")
    all_pre_hesitation = native_samples + french_samples + short_samples
    hesitation_variants = create_hesitation_variants(all_pre_hesitation, fraction=0.3)

    all_samples = native_samples + french_samples + short_samples + hesitation_variants
    random.shuffle(all_samples)

    log.info("Total samples before augmentation: %d", len(all_samples))
    log.info(" Native PT-BR: %d", len(native_samples))
    log.info(" French-accented: %d", len(french_samples))
    log.info(" Short utterances: %d", len(short_samples))
    log.info(" Hesitation variants: %d", len(hesitation_variants))

    # Save
    log.info("\n=== Saving dataset ===")
    output_dir = DATA_DIR / "tts_dataset"
    save_dataset(all_samples, output_dir, augment_copies=augment_copies)

    return output_dir


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
    )
    # Seed both RNGs so dataset generation is reproducible end-to-end.
    random.seed(42)
    np.random.seed(42)

    run_audio_generation(
        max_native_sentences=3000,
        max_french_sentences=1000,
        augment_copies=2,
    )

diff --git 
a/03-finetune-pipecat-pt/04_finetune.py b/03-finetune-pipecat-pt/04_finetune.py
new file mode 100644
index 0000000000000000000000000000000000000000..39fc26e867d0d10fc9c50de37bdc45193c63aff3
--- /dev/null
+++ b/03-finetune-pipecat-pt/04_finetune.py
@@ -0,0 +1,1202 @@
"""Fine-tune Pipecat Smart Turn v3 for Portuguese + French-accented Portuguese.

Architecture: exact SmartTurnV3Model from Pipecat's train.py
- WhisperEncoder (openai/whisper-tiny) with max_source_positions=400 (8s)
- Attention pooling: Linear(384→256) → Tanh → Linear(256→1)
- Classifier: Linear(384→256) → LayerNorm → GELU → Dropout(0.1)
  → Linear(256→64) → GELU → Linear(64→1)

Training strategy:
- Initialize from openai/whisper-tiny (no Pipecat PyTorch weights available)
- Use Pipecat's Portuguese data as primary training data
- Add our Claude-labeled + TTS-generated data for PT-BR + French accent
- Pipecat hyperparams: lr=5e-5, warmup=0.2, cosine schedule, epochs=4

Changes vs initial plan (based on reference analysis — see references/):
- alpha=0.25 (not 0.6) per Lin et al. 2017 optimal for gamma=2
- batch_size=128 (not 32) per Pipecat train.py (they use 384)
- Removed label smoothing (double-regularization with focal loss, per EMNLP 2022)
- Added BCEWithLogitsLoss option for comparison (Pipecat's original loss)
- Added real noise augmentation (not just Gaussian) per Pipecat v3.2
- Added short utterance dataset loading per Pipecat v3.2 findings
- ONNX opset 18 (not 17) per Pipecat train.py
- Added INT8 static quantization step per Pipecat's deployment pipeline

Language-learning-specific improvements (March 2026 research):
- fp_penalty=2.0: asymmetric cost — interrupting a learner costs 2x more than
  waiting too long (ConversAR 2025, Praktika approach)
- Speak & Improve L2 corpus: real L2 learner speech with disfluencies (340h)
- Dual threshold evaluation: eager (speculative) + final (confirmed) per Deepgram Flux

Data sources:
1. Pipecat v3.2 Portuguese subset (~5K samples from 270K)
2. Our custom TTS dataset (03_generate_audio.py output)
3. CORAA real audio (01_download_pipecat.py output)
4. Speak & Improve L2 corpus (~2K samples, cross-lingual hesitation patterns)
"""

from __future__ import annotations

import gc
import json
import logging
import random
import time
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import WhisperFeatureExtractor, WhisperPreTrainedModel, WhisperConfig

log = logging.getLogger(__name__)

# Audio geometry: the model sees the trailing 8 s of 16 kHz mono audio.
SAMPLE_RATE = 16000
WINDOW_SECONDS = 8
WINDOW_SAMPLES = WINDOW_SECONDS * SAMPLE_RATE

# Prefer the /workspace mount (remote training box) when present.
_workspace = Path("/workspace") if Path("/workspace").exists() else Path(".")
OUTPUT_DIR = _workspace / "results"
CACHE_DIR = _workspace / "hf_cache"
DATA_DIR = Path(__file__).parent / "data"

# Label smoothing REMOVED — focal loss already provides implicit calibration
# via entropy regularization (EMNLP 2022: focal_loss_calibration_emnlp_2022.md)
# Double-regularization (FL + LS) reduces discriminative power.
LABEL_SMOOTH = 0.0


# ---------------------------------------------------------------------------
# Model — SmartTurnV3Model (exact Pipecat architecture)
# ---------------------------------------------------------------------------

class SmartTurnV3Model(nn.Module):
    """Pipecat Smart Turn v3 model architecture.

    Matches the exact architecture from:
    https://github.com/pipecat-ai/smart-turn/blob/main/train/train.py

    WhisperEncoder (whisper-tiny, 384-dim) → attention pooling → classifier
    """

    def __init__(self, whisper_model: str = "openai/whisper-tiny"):
        super().__init__()
        from transformers import WhisperModel

        # Load pretrained encoder
        whisper = WhisperModel.from_pretrained(
            whisper_model, cache_dir=str(CACHE_DIR)
        )
        self.encoder = whisper.encoder

        # Resize position embeddings: 1500 (30s) → 400 (8s)
        # Keep the first 400 pretrained rows rather than re-initializing.
        max_pos = 400
        old_embed = self.encoder.embed_positions.weight.data
        new_embed = old_embed[:max_pos, :]
        self.encoder.embed_positions = nn.Embedding(max_pos, old_embed.shape[1])
        self.encoder.embed_positions.weight.data = new_embed
        self.encoder.config.max_source_positions = max_pos

        hidden_size = self.encoder.config.d_model  # 384 for whisper-tiny

        # Attention pooling (exact Pipecat architecture)
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.Tanh(),
            nn.Linear(256, 1),
        )

        # Classifier head (exact Pipecat architecture)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.LayerNorm(256),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(256, 64),
            nn.GELU(),
            nn.Linear(64, 1),
        )

    def forward(self, input_features: torch.Tensor) -> torch.Tensor:
        """Return per-sample logits (shape [batch]) for 'turn complete'."""
        encoder_output = self.encoder(input_features).last_hidden_state
        # Softmax over the time axis → learned weighted average of frames.
        attn_weights = self.attention(encoder_output)
        attn_weights = torch.softmax(attn_weights, dim=1)
        pooled = (encoder_output * attn_weights).sum(dim=1)
        logits = self.classifier(pooled)
        return logits.squeeze(-1)


# ---------------------------------------------------------------------------
# Focal Loss
# ---------------------------------------------------------------------------

class FocalLoss(nn.Module):
    """Focal Loss (Lin et al. 2017) — focuses on hard boundary cases.

    alpha=0.25 is optimal for gamma=2.0 per the original paper (Table 1a).
    Our initial alpha=0.6 over-weighted positives and hurt calibration.

    fp_penalty: extra multiplier on false-positive loss (model says "complete"
    when speaker is still talking). For a language-learning avatar, interrupting
    the learner mid-thought is much worse than waiting too long.
    - ConversAR (2025) gives learners "infinite thinking period"
    - Praktika uses extended silence tolerance for L2 speech
    - Default 2.0 means FP errors cost 2x more than FN errors
    """

    def __init__(self, gamma: float = 2.0, alpha: float = 0.25, fp_penalty: float = 2.0):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.fp_penalty = fp_penalty

    def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        """Mean focal loss over the batch; targets are 0.0/1.0 floats."""
        bce = nn.functional.binary_cross_entropy_with_logits(
            logits, targets, reduction="none",
        )
        probs = torch.sigmoid(logits)
        # p_t = probability assigned to the true class.
        p_t = probs * targets + (1 - probs) * (1 - targets)
        focal_weight = (1 - p_t) ** self.gamma
        alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
        loss = alpha_t * focal_weight * bce

        # Asymmetric cost: penalize FP (interrupting learner) more than FN (waiting)
        # FP = model predicts 1 (complete) when target is 0 (incomplete)
        # NOTE: the FP mask is a hard >0.5 threshold — it gates the penalty
        # but contributes no gradient of its own.
        if self.fp_penalty != 1.0:
            is_fp = (probs > 0.5).float() * (1 - targets)  # predicted complete, actually incomplete
            penalty = 1.0 + (self.fp_penalty - 1.0) * is_fp
            loss = loss * penalty

        return loss.mean()


# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------

@dataclass
class AudioSample:
    audio: np.ndarray
    label: float  # 1.0 = complete, 0.0 = incomplete
    source: str
    speaker_id: str = ""


def load_pipecat_portuguese(max_samples: int = 5000) -> list[AudioSample]:
    """Load Pipecat v3.2 Portuguese data (from 01_download_pipecat.py)."""
    audio_dir = DATA_DIR / "pipecat_pt_audio"
    if not audio_dir.exists():
        log.warning("Pipecat PT audio not found at %s — run 01_download_pipecat.py first", audio_dir)
        return []

    import soundfile as sf

    samples = []
    for wav_path in sorted(audio_dir.glob("*.wav"))[:max_samples]:
        try:
            audio, sr = sf.read(str(wav_path))
            audio = np.array(audio, dtype=np.float32)

            if sr != SAMPLE_RATE:
                import torchaudio
                tensor = torch.from_numpy(audio).float().unsqueeze(0)
                audio = torchaudio.functional.resample(tensor, sr, SAMPLE_RATE).squeeze().numpy()

            # Label is encoded in the filename by 01_download_pipecat.py.
            label = 1.0 if "_complete" in wav_path.name else 0.0
            audio = _extract_window(audio)
            if audio is not None:
                samples.append(AudioSample(
                    audio=audio, label=label,
                    source="pipecat_v3.2",
                    speaker_id=f"pipecat_{wav_path.stem.split('_')[0]}",
                ))
        except Exception as e:
            log.warning("Error loading %s: %s", wav_path.name, e)

    n_c = sum(1 for s in samples if s.label == 1.0)
    n_i = sum(1 for s in samples if s.label == 0.0)
    log.info("Pipecat PT: %d samples (%d complete, %d incomplete)", len(samples), n_c, n_i)
    return samples


def load_tts_dataset(max_samples: int = 10000) -> list[AudioSample]:
    """Load TTS-generated audio (from 03_generate_audio.py)."""
    tts_dir = DATA_DIR / "tts_dataset"
    meta_path = tts_dir / "metadata.json"

    if not meta_path.exists():
        log.warning("TTS dataset not found at %s — run 03_generate_audio.py first", tts_dir)
        return []

    import soundfile as sf

    with open(meta_path) as f:
        metadata = json.load(f)

    audio_dir = tts_dir / "audio"
    samples = []

    for meta in metadata[:max_samples]:
        wav_path = audio_dir / meta["file"]
        if not wav_path.exists():
            continue

        try:
            audio, sr = sf.read(str(wav_path))
            audio = np.array(audio, dtype=np.float32)

            if sr != SAMPLE_RATE:
                import torchaudio
                tensor = torch.from_numpy(audio).float().unsqueeze(0)
                audio = torchaudio.functional.resample(tensor, sr, SAMPLE_RATE).squeeze().numpy()

            label = 1.0 if meta["label"] == "complete" else 0.0
            audio = _extract_window(audio)
            if audio is not None:
                samples.append(AudioSample(
                    audio=audio, label=label,
                    source=f"tts_{meta['accent']}",
                    speaker_id=meta["voice"],
                ))
        except Exception as e:
            log.warning("Error loading %s: %s", meta["file"], e)

    n_c = sum(1 for s in samples if s.label == 1.0)
    n_i = sum(1 for s in samples if s.label == 0.0)
    log.info("TTS dataset: %d samples (%d complete, %d incomplete)", len(samples), n_c, n_i)
    return samples


def load_coraa_real_audio(max_samples: int = 3000) -> list[AudioSample]:
    """Load REAL conversational audio from CORAA (not TTS).

    SpeculativeETD showed synthetic-only training has a devastating gap:
    F1 drops from 94.7% → 30.3% on real data. Mixing real audio is critical.

    CORAA MUPE has 365h of real PT-BR interviews with speaker diarization.
    We use these as COMPLETE samples (full utterances with natural prosody).
    For INCOMPLETE, we truncate at natural pause points.
    """
    from datasets import load_dataset
    import re

    log.info("Loading CORAA MUPE real audio (streaming)...")
    try:
        ds = load_dataset(
            "nilc-nlp/CORAA-MUPE-ASR",
            split="train",
            streaming=True,
            cache_dir=str(CACHE_DIR),
        )
    except Exception as e:
        log.warning("Failed to load CORAA MUPE: %s", e)
        return []

    samples = []
    complete_count = 0
    incomplete_count = 0
    target_per_class = max_samples // 2

    for i, row in enumerate(ds):
        if complete_count >= target_per_class and incomplete_count >= target_per_class:
            break

        try:
            audio_data = row.get("audio", {})
            if not audio_data:
                continue

            audio = np.array(audio_data["array"], dtype=np.float32)
            sr = audio_data["sampling_rate"]

            if sr != SAMPLE_RATE:
                import torchaudio
                tensor = torch.from_numpy(audio).float().unsqueeze(0)
                audio = torchaudio.functional.resample(tensor, sr, SAMPLE_RATE).squeeze().numpy()

            duration = len(audio) / SAMPLE_RATE
            if duration < 1.0:
                continue

            text = str(row.get("text", ""))
            # Pseudo-speaker buckets of 50 consecutive rows — TODO confirm this
            # approximates real speaker boundaries in the MUPE stream.
            speaker_id = f"coraa_real_{i // 50}"

            # COMPLETE: full utterances ending with sentence punctuation
            if complete_count < target_per_class and re.search(r'[.!?]+\s*$', text):
                window = _extract_window(audio)
                if window is not None:
                    samples.append(AudioSample(
                        audio=window, label=1.0,
                        source="coraa_real",
                        speaker_id=speaker_id,
                    ))
                    complete_count += 1

            # INCOMPLETE: truncate real audio at 40-70% (natural mid-utterance)
            # NOTE: the same utterance may also have been used as COMPLETE above.
            if incomplete_count < target_per_class and duration >= 2.0:
                cut_frac = random.uniform(0.4, 0.7)
                truncated = audio[:int(len(audio) * cut_frac)]
                window = _extract_window(truncated)
                if window is not None:
                    samples.append(AudioSample(
                        audio=window, label=0.0,
                        source="coraa_real",
                        speaker_id=speaker_id,
                    ))
                    incomplete_count += 1

        except Exception:
            continue

        if i % 5000 == 0 and i > 0:
            log.info(" CORAA real: scanned %d, %d complete, %d incomplete", i, complete_count, incomplete_count)

    log.info("CORAA real audio: %d complete + %d incomplete = %d samples",
             complete_count, incomplete_count, len(samples))
    return samples


def load_pipecat_test_data() -> list[AudioSample]:
    """Load Pipecat v3.2 Portuguese test data for evaluation."""
    test_dir = DATA_DIR / "pipecat_pt_test"
    if not test_dir.exists():
        log.warning("Pipecat PT test data not found — run 01_download_pipecat.py first")
        return []

    import soundfile as sf

    samples = []
    for wav_path in sorted(test_dir.glob("*.wav")):
        try:
            audio, sr = sf.read(str(wav_path))
            audio = np.array(audio, dtype=np.float32)

            if sr != SAMPLE_RATE:
                import torchaudio
                tensor = torch.from_numpy(audio).float().unsqueeze(0)
                audio = torchaudio.functional.resample(tensor, sr, SAMPLE_RATE).squeeze().numpy()

            label = 1.0 if "_complete" in wav_path.name else 0.0
            audio = _extract_window(audio)
            if audio is not None:
                samples.append(AudioSample(
                    audio=audio, label=label,
                    source="pipecat_test",
                    speaker_id=f"pipecat_test_{wav_path.stem.split('_')[0]}",
                ))
        except Exception as e:
            log.warning("Error loading test %s: %s", wav_path.name, e)

    log.info("Pipecat test: %d samples", len(samples))
    return samples


def _extract_window(audio: np.ndarray) -> np.ndarray | None:
    """Extract 8-second window from end of audio, pad if needed.

    Returns None when the clip is under 1 s. The returned window is
    peak-normalized to 0.9 and LEFT-padded, so the speech always sits at
    the end of the window; the final ~200 ms are zeroed to mimic the VAD
    silence that precedes inference in production.
    """
    if len(audio) < SAMPLE_RATE:  # minimum 1s
        return None

    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak * 0.9

    if len(audio) > WINDOW_SAMPLES:
        audio = audio[-WINDOW_SAMPLES:]
    elif len(audio) < WINDOW_SAMPLES:
        padding = WINDOW_SAMPLES - len(audio)
        audio = np.pad(audio, (padding, 0), mode="constant")

    # ~200ms silence at end (VAD behavior)
    silence_samples = int(0.2 * SAMPLE_RATE)
    audio[-silence_samples:] = 0.0

    return audio.astype(np.float32)


# ---------------------------------------------------------------------------
# Audio augmentation
# ---------------------------------------------------------------------------

# Cache for background noise samples (loaded once)
_noise_cache: list[np.ndarray] = []


def _load_noise_samples() -> list[np.ndarray]:
    """Load real background noise samples for augmentation.

    Pipecat v3.2 used CC-0 Freesound.org cafe/office noise and saw
    40% fewer short-utterance misclassifications (pipecat_smart_turn_v3_2.md).
+ """ + global _noise_cache + if _noise_cache: + return _noise_cache + + noise_dir = DATA_DIR / "noise_samples" + if not noise_dir.exists(): + return [] + + import soundfile as sf + for wav_path in noise_dir.glob("*.wav"): + try: + audio, sr = sf.read(str(wav_path)) + audio = np.array(audio, dtype=np.float32) + if sr != SAMPLE_RATE: + import torchaudio + tensor = torch.from_numpy(audio).float().unsqueeze(0) + audio = torchaudio.functional.resample(tensor, sr, SAMPLE_RATE).squeeze().numpy() + _noise_cache.append(audio) + except Exception: + pass + + if _noise_cache: + log.info("Loaded %d background noise samples for augmentation", len(_noise_cache)) + return _noise_cache + + +def augment_audio(audio: np.ndarray) -> np.ndarray: + """Data augmentation for training. + + Per reference analysis: + - Real background noise (Pipecat v3.2: 40% fewer errors with cafe/office noise) + - Speed perturbation (standard in speech ML) + - Volume variation (simulates mic distance) + - Time shift + """ + aug = audio.copy() + + # Speed perturbation + if random.random() < 0.5: + speed = random.uniform(0.9, 1.1) + indices = np.arange(0, len(aug), speed).astype(int) + indices = indices[indices < len(aug)] + aug = aug[indices] + if len(aug) > WINDOW_SAMPLES: + aug = aug[:WINDOW_SAMPLES] + elif len(aug) < WINDOW_SAMPLES: + aug = np.pad(aug, (WINDOW_SAMPLES - len(aug), 0), mode="constant") + + # Volume variation + if random.random() < 0.5: + aug *= random.uniform(0.6, 1.4) + + # Real background noise (preferred) or Gaussian fallback + noise_samples = _load_noise_samples() + if noise_samples and random.random() < 0.5: + noise = random.choice(noise_samples) + # Loop or trim noise to match audio length + if len(noise) < len(aug): + repeats = len(aug) // len(noise) + 1 + noise = np.tile(noise, repeats)[:len(aug)] + else: + start = random.randint(0, len(noise) - len(aug)) + noise = noise[start:start + len(aug)] + snr_db = random.uniform(10, 25) # 10-25 dB SNR + noise_scale = np.sqrt(np.mean(aug ** 
2)) / (np.sqrt(np.mean(noise ** 2)) * 10 ** (snr_db / 20) + 1e-8) + aug += noise * noise_scale + elif random.random() < 0.4: + # Gaussian noise fallback + noise_level = random.uniform(0.002, 0.02) + aug += np.random.randn(len(aug)).astype(np.float32) * noise_level + + # Time shift + if random.random() < 0.3: + shift = random.randint(-int(0.3 * SAMPLE_RATE), int(0.3 * SAMPLE_RATE)) + aug = np.roll(aug, shift) + if shift > 0: + aug[:shift] = 0.0 + elif shift < 0: + aug[shift:] = 0.0 + + return np.clip(aug, -1.0, 1.0).astype(np.float32) + + +# --------------------------------------------------------------------------- +# PyTorch Dataset +# --------------------------------------------------------------------------- + +class SmartTurnDataset(Dataset): + def __init__( + self, + samples: list[AudioSample], + feature_extractor: WhisperFeatureExtractor, + augment: bool = False, + ): + self.samples = samples + self.feature_extractor = feature_extractor + self.augment = augment + + def __len__(self) -> int: + return len(self.samples) + + def __getitem__(self, idx: int) -> dict: + sample = self.samples[idx] + audio = sample.audio + + if self.augment: + audio = augment_audio(audio) + + inputs = self.feature_extractor( + audio, + sampling_rate=SAMPLE_RATE, + return_tensors="np", + padding="max_length", + max_length=WINDOW_SAMPLES, + truncation=True, + do_normalize=True, + ) + + features = inputs.input_features.squeeze(0).astype(np.float32) + + return { + "input_features": torch.from_numpy(features), + "labels": torch.tensor(sample.label, dtype=torch.float32), + } + + +# --------------------------------------------------------------------------- +# Train/val split +# --------------------------------------------------------------------------- + +def split_by_speaker( + samples: list[AudioSample], + val_frac: float = 0.1, + test_frac: float = 0.1, +) -> tuple[list[AudioSample], list[AudioSample], list[AudioSample]]: + """Split by speaker ID to prevent data leakage.""" + 
    # Group samples by speaker; samples with no speaker_id pool under "unknown".
    speaker_map: dict[str, list[AudioSample]] = {}
    for s in samples:
        speaker_map.setdefault(s.speaker_id or "unknown", []).append(s)

    speakers = list(speaker_map.keys())
    random.shuffle(speakers)

    # At least one speaker per held-out split, even for tiny speaker counts.
    n_val = max(1, int(len(speakers) * val_frac))
    n_test = max(1, int(len(speakers) * test_frac))

    test_spk = set(speakers[:n_test])
    val_spk = set(speakers[n_test:n_test + n_val])
    train_spk = set(speakers[n_test + n_val:])

    train = [s for sp in train_spk for s in speaker_map[sp]]
    val = [s for sp in val_spk for s in speaker_map[sp]]
    test = [s for sp in test_spk for s in speaker_map[sp]]

    random.shuffle(train)
    random.shuffle(val)
    random.shuffle(test)

    for name, split in [("Train", train), ("Val", val), ("Test", test)]:
        n_c = sum(1 for s in split if s.label == 1.0)
        n_i = sum(1 for s in split if s.label == 0.0)
        log.info("  %s: %d samples (%d complete, %d incomplete, %d speakers)",
                 name, len(split), n_c, n_i, len(set(s.speaker_id for s in split)))

    return train, val, test


# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------

def load_speak_improve_l2(max_samples: int = 2000) -> list[AudioSample]:
    """Load L2 learner speech from Speak & Improve Corpus 2025.

    340 hours of L2 English learner speech with disfluency annotations and
    CEFR proficiency scores (A2-C1, majority B1-B2).

    Even though this is English L2 (not Portuguese), L2 hesitation patterns
    transfer across languages (Cenoz 2000). The pauses, false starts, and
    disfluencies of L2 speakers share universal characteristics regardless
    of target language.

    Knill et al. (2025). Speak & Improve Corpus 2025: an L2 English Speech
    Corpus for Language Assessment and Feedback.
    arXiv:2412.11986
    """
    from datasets import load_dataset

    log.info("Loading Speak & Improve L2 corpus (streaming)...")
    try:
        ds = load_dataset(
            "CambridgeEnglish/SpeakAndImprove2025",
            split="train",
            streaming=True,
            cache_dir=str(CACHE_DIR),
            trust_remote_code=True,
        )
    except Exception as e:
        log.warning("Failed to load Speak & Improve corpus: %s — trying fallback name", e)
        try:
            ds = load_dataset(
                "cambridgeenglishtests/speak-and-improve-2025",
                split="train",
                streaming=True,
                cache_dir=str(CACHE_DIR),
                trust_remote_code=True,
            )
        except Exception as e2:
            # Corpus unavailable is non-fatal: training proceeds without it.
            log.warning("Speak & Improve corpus not available: %s — skipping", e2)
            return []

    samples = []
    complete_count = 0
    incomplete_count = 0
    target_per_class = max_samples // 2

    for i, row in enumerate(ds):
        # Stop scanning the stream once both classes are filled.
        if complete_count >= target_per_class and incomplete_count >= target_per_class:
            break

        try:
            audio_data = row.get("audio", {})
            if not audio_data:
                continue

            audio = np.array(audio_data["array"], dtype=np.float32)
            sr = audio_data["sampling_rate"]

            if sr != SAMPLE_RATE:
                import torchaudio
                tensor = torch.from_numpy(audio).float().unsqueeze(0)
                audio = torchaudio.functional.resample(tensor, sr, SAMPLE_RATE).squeeze().numpy()

            duration = len(audio) / SAMPLE_RATE
            if duration < 1.0:
                continue

            # NOTE(review): `text` is assigned but never used below — confirm
            # whether transcripts were meant to be stored on AudioSample.
            text = str(row.get("text", row.get("transcript", "")))
            # Pseudo-speaker id: buckets of 20 consecutive rows (true speaker
            # ids are not read from the stream here).
            speaker_id = f"si_l2_{i // 20}"

            # COMPLETE: full utterances (use entire audio)
            if complete_count < target_per_class and duration >= 2.0:
                window = _extract_window(audio)
                if window is not None:
                    samples.append(AudioSample(
                        audio=window, label=1.0,
                        source="speak_improve_l2",
                        speaker_id=speaker_id,
                    ))
                    complete_count += 1

            # INCOMPLETE: truncate at random point (simulates mid-utterance)
            if incomplete_count < target_per_class and duration >= 3.0:
                cut_frac = random.uniform(0.3, 0.6)
                truncated = audio[:int(len(audio) * cut_frac)]
                # Add hesitation-like pause at end (L2 speakers
                # pause 524ms+ per Kosmala 2022)
                pause_ms = random.uniform(500, 2000)
                pause_samples = int(pause_ms / 1000 * SAMPLE_RATE)
                # Near-silent noise floor rather than digital zero, to mimic a
                # real open-mic pause.
                pause = np.random.randn(pause_samples).astype(np.float32) * 0.001
                truncated = np.concatenate([truncated, pause])
                window = _extract_window(truncated)
                if window is not None:
                    samples.append(AudioSample(
                        audio=window, label=0.0,
                        source="speak_improve_l2",
                        speaker_id=speaker_id,
                    ))
                    incomplete_count += 1

        except Exception:
            # Best-effort streaming: skip malformed rows.
            continue

        if i % 2000 == 0 and i > 0:
            log.info("  S&I L2: scanned %d, %d complete, %d incomplete", i, complete_count, incomplete_count)

    log.info("Speak & Improve L2: %d complete + %d incomplete = %d samples",
             complete_count, incomplete_count, len(samples))
    return samples


def train(
    epochs: int = 6,
    batch_size: int = 128,
    lr: float = 5e-5,
    warmup_ratio: float = 0.2,
    max_pipecat_samples: int = 5000,
    max_tts_samples: int = 10000,
    max_l2_samples: int = 2000,
    whisper_model: str = "openai/whisper-tiny",
    loss_fn: str = "focal",  # "focal" or "bce" (Pipecat's original)
    fp_penalty: float = 2.0,  # asymmetric cost: FP costs 2x more than FN
) -> Path:
    """Fine-tune SmartTurnV3Model on Portuguese data.

    Default hyperparams from Pipecat's train.py:
    - lr=5e-5, warmup_ratio=0.2, cosine schedule
    - Pipecat uses epochs=4 on 270K samples; we use 6 on 5-15K (more passes needed)
    - Pipecat uses batch_size=384; we use 128 (A10G has 24GB, enough for 128)
    - loss_fn="focal" (our improvement) or "bce" (Pipecat's original) for comparison

    Language-learning-specific improvements (March 2026 research):
    - fp_penalty=2.0: asymmetric cost — interrupting a learner mid-thought is
      much worse than waiting too long (ConversAR 2025, Praktika approach)
    - Speak & Improve L2 corpus: real L2 learner speech with hesitations/disfluencies
    - Dual threshold in evaluation: eager (0.3-0.5) for speculative prep,
      final (0.7+) for actual turn transition (inspired by Deepgram Flux)

    Returns:
        Path to the exported (INT8 when possible) ONNX model.
    """

    device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
    log.info("Training on device: %s", device)
    if device == "cuda":
        log.info("GPU: %s (%d MB)", torch.cuda.get_device_name(),
                 torch.cuda.get_device_properties(0).total_memory // 1024 // 1024)

    # ----- Load all data sources -----
    t0 = time.time()
    all_samples: list[AudioSample] = []

    # Each source is extended into all_samples then deleted + gc'd to keep
    # peak memory down while the next source loads.

    # 1. Pipecat Portuguese (primary — these have LLM-curated labels)
    pipecat = load_pipecat_portuguese(max_samples=max_pipecat_samples)
    all_samples.extend(pipecat)
    del pipecat
    gc.collect()

    # 2. Our TTS-generated data (native PT-BR + French accent + short utterances)
    tts = load_tts_dataset(max_samples=max_tts_samples)
    all_samples.extend(tts)
    del tts
    gc.collect()

    # 3. CORAA real audio (critical: SpeculativeETD showed synthetic→real gap of 94.7%→30.3%)
    coraa = load_coraa_real_audio(max_samples=3000)
    all_samples.extend(coraa)
    del coraa
    gc.collect()

    # 4. Speak & Improve L2 corpus — real L2 learner speech with disfluencies
    #    L2 hesitation patterns transfer across languages (Cenoz 2000)
    #    340h of L2 English learners, CEFR A2-C1 (arXiv:2412.11986)
    si_l2 = load_speak_improve_l2(max_samples=max_l2_samples)
    all_samples.extend(si_l2)
    del si_l2
    gc.collect()

    if not all_samples:
        raise RuntimeError("No training samples loaded! Run 01/03 scripts first.")

    load_time = time.time() - t0
    n_c = sum(1 for s in all_samples if s.label == 1.0)
    n_i = sum(1 for s in all_samples if s.label == 0.0)
    sources = {}
    for s in all_samples:
        sources[s.source] = sources.get(s.source, 0) + 1

    log.info("Total: %d samples (%d complete, %d incomplete) in %.0fs", len(all_samples), n_c, n_i, load_time)
    for src, cnt in sorted(sources.items()):
        log.info("  %s: %d", src, cnt)

    # ----- Split -----
    log.info("=== Splitting by speaker ===")
    train_samples, val_samples, internal_test = split_by_speaker(all_samples)

    # Also load Pipecat test data (held-out, never seen during training)
    pipecat_test = load_pipecat_test_data()

    # ----- Datasets -----
    feature_extractor = WhisperFeatureExtractor(chunk_length=8)
    train_ds = SmartTurnDataset(train_samples, feature_extractor, augment=True)
    val_ds = SmartTurnDataset(val_samples, feature_extractor, augment=False)

    # Balanced sampling: weight each class inversely to its frequency.
    # NOTE(review): if the train split ever contains only one class,
    # 1.0 / n_neg (or 1.0 / n_pos) raises ZeroDivisionError — confirm upstream
    # loaders always produce both labels.
    train_labels = [s.label for s in train_samples]
    n_pos = sum(1 for l in train_labels if l == 1.0)
    n_neg = len(train_labels) - n_pos
    weights = [1.0 / n_neg if l == 0.0 else 1.0 / n_pos for l in train_labels]
    sampler = WeightedRandomSampler(weights, len(weights))

    use_pin = device == "cuda"
    n_workers = 4 if device == "cuda" else 0
    train_loader = DataLoader(train_ds, batch_size=batch_size, sampler=sampler,
                              num_workers=n_workers, pin_memory=use_pin)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False,
                            num_workers=0, pin_memory=use_pin)

    # ----- Model -----
    model = SmartTurnV3Model(whisper_model=whisper_model).to(device)
    total_params = sum(p.numel() for p in model.parameters())
    # Size estimate assumes 4 bytes/param (fp32).
    log.info("Model: %d params (%.1f MB)", total_params, total_params * 4 / 1024 / 1024)

    # ----- Loss -----
    # Pipecat uses BCEWithLogitsLoss with dynamic pos_weight (clamped 0.1-10.0)
    # We default to Focal Loss (alpha=0.25, gamma=2.0 per Lin et al. 2017)
    # but support BCE for comparison
    pos_weight_val = min(max(n_neg / max(n_pos, 1), 0.1), 10.0)
    pos_weight = torch.tensor([pos_weight_val], device=device)

    if loss_fn == "bce":
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        log.info("Loss: BCEWithLogitsLoss (Pipecat original), pos_weight=%.2f", pos_weight_val)
    else:
        criterion = FocalLoss(gamma=2.0, alpha=0.25, fp_penalty=fp_penalty)
        log.info("Loss: FocalLoss (gamma=2.0, alpha=0.25, fp_penalty=%.1f)", fp_penalty)
        log.info("  Asymmetric cost: FP (interrupting learner) penalized %.1fx more than FN", fp_penalty)

    # ----- Optimizer: uniform LR (Pipecat uses single lr=5e-5 for all params) -----
    # Pipecat's train.py does NOT use differential LR — all params at same rate.
    # Since we initialize from whisper-tiny (same as Pipecat), uniform LR is correct.
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)

    # Cosine schedule with warmup (Pipecat: warmup_ratio=0.2)
    total_steps = epochs * len(train_loader)
    warmup_steps = int(total_steps * warmup_ratio)

    def lr_lambda(step):
        # Linear warmup to 1.0, then cosine decay to 0.
        if step < warmup_steps:
            return step / max(warmup_steps, 1)
        progress = (step - warmup_steps) / max(total_steps - warmup_steps, 1)
        return 0.5 * (1 + np.cos(np.pi * progress))

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    # ----- Training loop -----
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    best_f1 = 0.0
    best_path = OUTPUT_DIR / "best_model.pt"
    resume_path = OUTPUT_DIR / "resume_checkpoint.pt"
    patience = 5
    patience_counter = 0
    history = []
    start_epoch = 0

    # Resume support
    if resume_path.exists():
        log.info("=== Resuming from checkpoint ===")
        # weights_only=False: checkpoint carries optimizer/scheduler state and
        # history, not just tensors.
        ckpt = torch.load(resume_path, map_location=device, weights_only=False)
        model.load_state_dict(ckpt["model_state_dict"])
        optimizer.load_state_dict(ckpt["optimizer_state_dict"])
        scheduler.load_state_dict(ckpt["scheduler_state_dict"])
        start_epoch = ckpt["epoch"]
        best_f1 = ckpt.get("best_f1", 0.0)
        patience_counter = ckpt.get("patience_counter", 0)
        history = ckpt.get("history", [])
        log.info("  Resumed at epoch %d, best_f1=%.4f", start_epoch, best_f1)

    log.info("=== Training: %d epochs, batch=%d, lr=%.1e, warmup_ratio=%.1f ===",
             epochs, batch_size, lr, warmup_ratio)

    for epoch in range(start_epoch, epochs):
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        t_epoch = time.time()

        for batch_idx, batch in enumerate(train_loader):
            features = batch["input_features"].to(device)
            labels = batch["labels"].to(device)

            logits = model(features)
            loss = criterion(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            train_loss += loss.item() * len(labels)
            preds = (torch.sigmoid(logits) > 0.5).float()
            # Labels may be smoothed floats; binarize before comparing.
            hard_labels = (labels > 0.5).float()
            train_correct += (preds == hard_labels).sum().item()
            train_total += len(labels)

            if batch_idx % 50 == 0 and batch_idx > 0:
                log.info("  batch %d/%d loss=%.4f", batch_idx, len(train_loader), loss.item())

        # Validate
        model.eval()
        val_metrics = _evaluate(model, val_loader, device, criterion)
        train_acc = train_correct / max(train_total, 1)
        epoch_time = time.time() - t_epoch

        log.info(
            "Epoch %d/%d (%.0fs): train_loss=%.4f train_acc=%.3f | "
            "val_acc=%.3f val_f1=%.3f prec=%.3f rec=%.3f",
            epoch + 1, epochs, epoch_time,
            train_loss / max(train_total, 1), train_acc,
            val_metrics["accuracy"], val_metrics["f1"],
            val_metrics["precision"], val_metrics["recall"],
        )

        history.append({
            "epoch": epoch + 1,
            "train_loss": train_loss / max(train_total, 1),
            "train_acc": train_acc,
            **{f"val_{k}": v for k, v in val_metrics.items()},
        })

        # Save best (by val F1)
        if val_metrics["f1"] > best_f1:
            best_f1 = val_metrics["f1"]
            torch.save({
                "model_state_dict": model.state_dict(),
                "epoch": epoch + 1,
                "val_f1": best_f1,
                "val_metrics": val_metrics,
                "whisper_model": whisper_model,
            }, best_path)
            log.info("  -> New best model (val_f1=%.4f)", best_f1)
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                log.info("Early stopping at epoch %d", epoch + 1)
                break

        # Resume checkpoint (overwritten every epoch; removed after training)
        torch.save({
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict(),
            "epoch": epoch + 1,
            "best_f1": best_f1,
            "patience_counter": patience_counter,
            "history": history,
        }, resume_path)

    if resume_path.exists():
        resume_path.unlink()

    # ----- Final evaluation on internal test split -----
    # NOTE(review): if no epoch ever beat best_f1=0.0 (val F1 stuck at 0),
    # best_path was never written and this load raises — confirm acceptable.
    log.info("\n=== Internal Test Evaluation ===")
    checkpoint = torch.load(best_path, map_location=device, weights_only=True)
    model.load_state_dict(checkpoint["model_state_dict"])
    model.eval()

    if internal_test:
        test_ds = SmartTurnDataset(internal_test, feature_extractor, augment=False)
        test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)
        test_metrics = _evaluate(model, test_loader, device, criterion)
        _log_metrics("Internal test", test_metrics)
    else:
        test_metrics = {}

    # ----- Pipecat test set (baseline comparison) -----
    pipecat_test_metrics = {}
    if pipecat_test:
        log.info("\n=== Pipecat PT Test Evaluation (baseline comparison) ===")
        pipecat_ds = SmartTurnDataset(pipecat_test, feature_extractor, augment=False)
        pipecat_loader = DataLoader(pipecat_ds, batch_size=batch_size, shuffle=False)
        pipecat_test_metrics = _evaluate(model, pipecat_loader, device, criterion)
        _log_metrics("Pipecat PT test", pipecat_test_metrics)
        log.info("  Pipecat baseline: 95.42%% accuracy, 2.79%% FP, 1.79%% FN")

    # ----- Threshold sweep -----
    log.info("\n=== Threshold Sweep ===")
    threshold_results = {}
    eval_loader = test_loader if internal_test else val_loader
    for thresh in [0.3, 0.4, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85]:
        t_metrics = _evaluate(model, eval_loader, device, criterion, threshold=thresh)
        threshold_results[str(thresh)] = t_metrics
        log.info("  threshold=%.2f: prec=%.3f rec=%.3f f1=%.3f acc=%.3f FP=%.3f",
                 thresh, t_metrics["precision"], t_metrics["recall"],
                 t_metrics["f1"], t_metrics["accuracy"], t_metrics["fp_rate"])

    # ----- Dual threshold recommendation (inspired by Deepgram Flux) -----
    # For a language-learning avatar:
    #   - eager_threshold (0.3-0.5): start preparing response speculatively
    #     (e.g., begin LLM generation) — reduces perceived latency
    #   - final_threshold (0.7+): actually take the turn and speak
    #   Higher final threshold = fewer interruptions (critical for L2 learners)
    log.info("\n=== Dual Threshold Recommendation (Deepgram Flux-inspired) ===")
    # Find final threshold: minimize FP rate among thresholds >= 0.6 with
    # recall >= 85% (we want very few interruptions, even at the cost of
    # some missed turns)
    final_candidates = [(k, v) for k, v in threshold_results.items()
                        if v["recall"] >= 0.85 and float(k) >= 0.6]
    if final_candidates:
        best_final = min(final_candidates, key=lambda x: x[1]["fp_rate"])
        log.info("  Recommended final_threshold: %.2f (FP=%.1f%%, prec=%.3f, rec=%.3f)",
                 float(best_final[0]), best_final[1]["fp_rate"] * 100,
                 best_final[1]["precision"], best_final[1]["recall"])
    else:
        best_final = ("0.7", threshold_results.get("0.7", {}))
        log.info("  Default final_threshold: 0.70")

    # Eager threshold: lower confidence where we start speculative processing
    eager_thresh = max(0.3, float(best_final[0]) - 0.3)
    log.info("  Recommended eager_threshold: %.2f (start LLM generation speculatively)",
             eager_thresh)
    log.info("  Latency savings: ~150-250ms earlier response start (Deepgram Flux benchmark)")

    # ----- ONNX export (FP32 → INT8, per Pipecat's deploy pipeline) -----
    model = model.to("cpu")
    onnx_fp32_path = OUTPUT_DIR / "smart_turn_pt_v3_fp32.onnx"
    onnx_int8_path = OUTPUT_DIR / "smart_turn_pt_v3.onnx"
    # (1, 80, 800): 80 mel bins x 100 frames/s x 8 s window.
    dummy = torch.randn(1, 80, 800)
    try:
        # Step 1: Export FP32 ONNX (opset 18, per Pipecat train.py)
        torch.onnx.export(
            model, dummy, str(onnx_fp32_path),
            input_names=["input_features"],
            output_names=["logits"],
            dynamic_axes={"input_features": {0: "batch"}, "logits": {0: "batch"}},
            opset_version=18,
        )
        log.info("ONNX FP32 exported: %s (%.1f MB)", onnx_fp32_path,
                 onnx_fp32_path.stat().st_size / 1024 / 1024)

        # Step 2: INT8 static quantization (entropy calibration, per Pipecat)
        # Pipecat uses: 1024 calibration samples, QDQ format, Entropy method
        try:
            from onnxruntime.quantization import quantize_static, CalibrationDataReader, QuantFormat

            class SmartTurnCalibrationReader(CalibrationDataReader):
                # Feeds up to n_samples feature tensors to the quantizer.
                def __init__(self, dataset, n_samples=1024):
                    self.data = []
                    for i, sample in enumerate(dataset):
                        if i >= n_samples:
                            break
                        self.data.append({"input_features": sample["input_features"].unsqueeze(0).numpy()})
                    self.idx = 0

                def get_next(self):
                    # None signals end of calibration data.
                    if self.idx >= len(self.data):
                        return None
                    result = self.data[self.idx]
                    self.idx += 1
                    return result

            # Use validation data for calibration
            calib_reader = SmartTurnCalibrationReader(val_ds, n_samples=1024)
            quantize_static(
                str(onnx_fp32_path),
                str(onnx_int8_path),
                calib_reader,
                quant_format=QuantFormat.QDQ,
            )
            log.info("ONNX INT8 exported: %s (%.1f MB)", onnx_int8_path,
                     onnx_int8_path.stat().st_size / 1024 / 1024)
        except ImportError:
            # Degrade gracefully: ship FP32 under the INT8 filename.
            log.warning("onnxruntime.quantization not available — saving FP32 only")
            import shutil
            shutil.copy2(onnx_fp32_path, onnx_int8_path)
        except Exception as e:
            log.warning("INT8 quantization failed: %s — saving FP32 only", e)
            import shutil
            shutil.copy2(onnx_fp32_path, onnx_int8_path)

    except Exception as e:
        log.warning("ONNX export failed: %s", e)

    # ----- Save results -----
    results = {
        "model": "smart_turn_pt_v3_finetuned",
        "whisper_model": whisper_model,
        "architecture": "SmartTurnV3Model (exact Pipecat)",
        "target_domain": "language_learning_avatar",
        "learner_profile": "francophone_learning_portuguese",
        "total_samples": len(all_samples),
        "sources": sources,
        "best_epoch": checkpoint["epoch"],
        "best_val_f1": best_f1,
        "test_metrics": test_metrics,
        "pipecat_test_metrics": pipecat_test_metrics,
        "pipecat_baseline": {"accuracy": 0.9542, "fp_rate": 0.0279, "fn_rate": 0.0179},
        "threshold_sweep": threshold_results,
        "dual_threshold": {
            "eager_threshold": eager_thresh,
            "final_threshold": float(best_final[0]),
            "rationale": "For L2 language learning: higher final threshold reduces "
                         "interruptions. Eager threshold enables speculative LLM "
                         "generation 150-250ms earlier (Deepgram Flux approach).",
        },
        "history": history,
        "config": {
            "epochs": epochs,
            "batch_size": batch_size,
            "lr": lr,
            "warmup_ratio": warmup_ratio,
            "label_smoothing": LABEL_SMOOTH,
            "focal_loss_gamma": 2.0,
            "focal_loss_alpha": 0.25,
            "fp_penalty": fp_penalty,
            "loss_fn": loss_fn,
        },
    }

    results_path = OUTPUT_DIR / "training_results.json"
    with open(results_path, "w") as f:
        json.dump(results, f, indent=2)
    log.info("Results saved to %s", results_path)

    return onnx_int8_path


def _evaluate(
    model: nn.Module,
    loader: DataLoader,
    device: str,
    criterion: nn.Module,
    threshold: float = 0.5,
) -> dict:
    """Evaluate model at a given threshold.

    Returns a dict of rounded accuracy/precision/recall/F1/loss plus raw
    confusion counts and FP/FN rates.
    """
    correct = total = tp = fp = fn = tn = 0
    total_loss = 0.0

    with torch.no_grad():
        for batch in loader:
            features = batch["input_features"].to(device)
            labels = batch["labels"].to(device)
            logits = model(features)
            loss = criterion(logits, labels)
            total_loss += loss.item() * len(labels)

            preds = (torch.sigmoid(logits) > threshold).float()
            # Binarize possibly-smoothed labels before the confusion counts.
            hard_labels = (labels > 0.5).float()
            correct += (preds == hard_labels).sum().item()
            total += len(labels)
            tp += ((preds == 1) & (hard_labels == 1)).sum().item()
            fp += ((preds == 1) & (hard_labels == 0)).sum().item()
            fn += ((preds == 0) & (hard_labels == 1)).sum().item()
            tn += ((preds == 0) & (hard_labels == 0)).sum().item()

    # max(..., 1) guards keep empty loaders from dividing by zero.
    accuracy = correct / max(total, 1)
    precision = tp / max(tp + fp, 1)
    recall = tp / max(tp + fn, 1)
    f1 = 2 * precision * recall / max(precision + recall, 1e-8)

    return {
        "accuracy": round(accuracy, 4),
        "precision": round(precision, 4),
        "recall": round(recall, 4),
        "f1": round(f1, 4),
        "loss": round(total_loss / max(total, 1), 4),
        "tp": tp, "fp": fp, "fn": fn, "tn": tn,
        "fp_rate": round(fp / max(fp + tn, 1), 4),
        "fn_rate": round(fn / max(fn + tp, 1), 4),
    }


def _log_metrics(name: str, metrics: dict)
-> None: + log.info("%s results:", name) + log.info(" Accuracy: %.3f", metrics["accuracy"]) + log.info(" Precision: %.3f", metrics["precision"]) + log.info(" Recall: %.3f", metrics["recall"]) + log.info(" F1: %.3f", metrics["f1"]) + log.info(" FP rate: %.3f", metrics["fp_rate"]) + log.info(" FN rate: %.3f", metrics["fn_rate"]) + log.info(" TP=%d FP=%d FN=%d TN=%d", metrics["tp"], metrics["fp"], metrics["fn"], metrics["tn"]) + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s [%(name)s] %(message)s", + ) + random.seed(42) + np.random.seed(42) + torch.manual_seed(42) + + train( + epochs=6, + batch_size=128, + lr=5e-5, + warmup_ratio=0.2, + max_pipecat_samples=5000, + max_tts_samples=10000, + max_l2_samples=2000, + whisper_model="openai/whisper-tiny", + loss_fn="focal", # or "bce" for Pipecat's original + fp_penalty=2.0, # interrupting learner costs 2x more than waiting + ) diff --git a/03-finetune-pipecat-pt/05_evaluate.py b/03-finetune-pipecat-pt/05_evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..72ccfc14733079c2eb2d7372beadc2c1658b0574 --- /dev/null +++ b/03-finetune-pipecat-pt/05_evaluate.py @@ -0,0 +1,511 @@ +"""Evaluate fine-tuned model against Pipecat baseline and ONNX reference. + +Comparisons: +1. Our fine-tuned model (PyTorch) on Pipecat PT test set +2. Pipecat original ONNX model on same test set +3. Threshold sweep to find optimal operating point +4. 
Per-source breakdown (pipecat data vs our TTS data)

Run:
    python 05_evaluate.py --model results/best_model.pt
"""

from __future__ import annotations

import argparse
import json
import logging
import time
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

log = logging.getLogger(__name__)

DATA_DIR = Path(__file__).parent / "data"
# Prefer a /workspace mount (remote GPU box) when present, else cwd.
_workspace = Path("/workspace") if Path("/workspace").exists() else Path(".")
RESULTS_DIR = _workspace / "results"
CACHE_DIR = _workspace / "hf_cache"

SAMPLE_RATE = 16000
WINDOW_SECONDS = 8
WINDOW_SAMPLES = WINDOW_SECONDS * SAMPLE_RATE

# Pipecat baseline (from their blog: Smart Turn v3 Portuguese)
PIPECAT_BASELINE = {
    "accuracy": 0.9542,
    "fp_rate": 0.0279,
    "fn_rate": 0.0179,
    "source": "https://www.daily.co/blog/announcing-smart-turn-v3-with-cpu-inference-in-just-12ms/",
}


def load_model(model_path: Path, device: str) -> nn.Module:
    """Load fine-tuned SmartTurnV3Model.

    The model class lives in 04_finetune.py; it is loaded via importlib so the
    two scripts stay standalone (no package layout required).
    """
    # Import from finetune script
    import importlib.util
    spec = importlib.util.spec_from_file_location(
        "finetune", Path(__file__).parent / "04_finetune.py"
    )
    finetune = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(finetune)

    model = finetune.SmartTurnV3Model(whisper_model="openai/whisper-tiny")
    checkpoint = torch.load(model_path, map_location=device, weights_only=True)
    model.load_state_dict(checkpoint["model_state_dict"])
    model = model.to(device)
    model.eval()

    log.info("Loaded model from %s (epoch %d, val_f1=%.4f)",
             model_path, checkpoint.get("epoch", -1), checkpoint.get("val_f1", -1))
    return model


def load_onnx_model(onnx_dir: Path | None = None):
    """Load Pipecat ONNX model for comparison.

    Returns an onnxruntime InferenceSession, or None when onnxruntime or the
    model files are unavailable (evaluation then skips the ONNX comparison).
    """
    try:
        import onnxruntime as ort
    except ImportError:
        log.warning("onnxruntime not installed — skipping ONNX evaluation")
        return None

    if onnx_dir is None:
        onnx_dir = DATA_DIR / "pipecat_model"

    # Prefer the CPU build; fall back to the GPU build if only that exists.
    cpu_path = onnx_dir / "smart-turn-v3.2-cpu.onnx"
    gpu_path = onnx_dir / "smart-turn-v3.2-gpu.onnx"

    onnx_path = cpu_path if cpu_path.exists() else (gpu_path if gpu_path.exists() else None)
    if onnx_path is None:
        log.warning("No Pipecat ONNX model found in %s", onnx_dir)
        return None

    session = ort.InferenceSession(str(onnx_path))
    log.info("Loaded ONNX model: %s", onnx_path)
    return session


def load_test_data() -> tuple[list, list]:
    """Load test datasets: Pipecat PT test + our TTS test.

    Returns (pipecat_samples, tts_samples); each sample is a dict with
    precomputed Whisper "features", float "label", "source", and "file".
    """
    import soundfile as sf
    from transformers import WhisperFeatureExtractor

    feature_extractor = WhisperFeatureExtractor(chunk_length=8)

    # 1. Pipecat test data
    pipecat_samples = []
    test_dir = DATA_DIR / "pipecat_pt_test"
    if test_dir.exists():
        for wav_path in sorted(test_dir.glob("*.wav")):
            try:
                audio, sr = sf.read(str(wav_path))
                audio = np.array(audio, dtype=np.float32)
                if sr != SAMPLE_RATE:
                    import torchaudio
                    tensor = torch.from_numpy(audio).float().unsqueeze(0)
                    audio = torchaudio.functional.resample(tensor, sr, SAMPLE_RATE).squeeze().numpy()

                audio = _extract_window(audio)
                if audio is not None:
                    # Label comes from the filename convention "*_complete*".
                    label = 1.0 if "_complete" in wav_path.name else 0.0
                    features = feature_extractor(
                        audio, sampling_rate=SAMPLE_RATE,
                        return_tensors="np", padding="max_length",
                        max_length=WINDOW_SAMPLES, truncation=True,
                        do_normalize=True,
                    ).input_features.squeeze(0).astype(np.float32)

                    pipecat_samples.append({
                        "features": features,
                        "label": label,
                        "source": "pipecat_test",
                        "file": wav_path.name,
                    })
            # NOTE(review): `e` is captured but never used and the failure is
            # silently swallowed — consider log.debug-ing the skipped file.
            except Exception as e:
                pass

    log.info("Pipecat test: %d samples", len(pipecat_samples))

    # 2.
Our TTS test data + tts_samples = [] + tts_dir = DATA_DIR / "tts_dataset" + meta_path = tts_dir / "metadata.json" + if meta_path.exists(): + with open(meta_path) as f: + metadata = json.load(f) + + # Use last 20% as test + test_start = int(len(metadata) * 0.8) + test_meta = metadata[test_start:] + + audio_dir = tts_dir / "audio" + for meta in test_meta: + wav_path = audio_dir / meta["file"] + if not wav_path.exists(): + continue + try: + audio, sr = sf.read(str(wav_path)) + audio = np.array(audio, dtype=np.float32) + if sr != SAMPLE_RATE: + import torchaudio + tensor = torch.from_numpy(audio).float().unsqueeze(0) + audio = torchaudio.functional.resample(tensor, sr, SAMPLE_RATE).squeeze().numpy() + + audio = _extract_window(audio) + if audio is not None: + label = 1.0 if meta["label"] == "complete" else 0.0 + features = feature_extractor( + audio, sampling_rate=SAMPLE_RATE, + return_tensors="np", padding="max_length", + max_length=WINDOW_SAMPLES, truncation=True, + do_normalize=True, + ).input_features.squeeze(0).astype(np.float32) + + tts_samples.append({ + "features": features, + "label": label, + "source": meta.get("accent", "unknown"), + "file": meta["file"], + }) + except Exception: + pass + + log.info("TTS test: %d samples", len(tts_samples)) + return pipecat_samples, tts_samples + + +def _extract_window(audio: np.ndarray) -> np.ndarray | None: + """Extract 8s window from end of audio.""" + if len(audio) < SAMPLE_RATE: + return None + peak = np.max(np.abs(audio)) + if peak > 0: + audio = audio / peak * 0.9 + if len(audio) > WINDOW_SAMPLES: + audio = audio[-WINDOW_SAMPLES:] + elif len(audio) < WINDOW_SAMPLES: + audio = np.pad(audio, (WINDOW_SAMPLES - len(audio), 0), mode="constant") + audio[-int(0.2 * SAMPLE_RATE):] = 0.0 + return audio.astype(np.float32) + + +# --------------------------------------------------------------------------- +# Evaluation functions +# --------------------------------------------------------------------------- + +def 
evaluate_pytorch( + model: nn.Module, + samples: list[dict], + device: str, + threshold: float = 0.5, +) -> dict: + """Evaluate PyTorch model on samples.""" + if not samples: + return {} + + tp = fp = fn = tn = 0 + latencies = [] + + with torch.no_grad(): + for s in samples: + features = torch.from_numpy(s["features"]).unsqueeze(0).to(device) + + t0 = time.perf_counter() + logits = model(features) + latency_ms = (time.perf_counter() - t0) * 1000 + latencies.append(latency_ms) + + pred = float(torch.sigmoid(logits).item() > threshold) + label = s["label"] + + if pred == 1 and label == 1: + tp += 1 + elif pred == 1 and label == 0: + fp += 1 + elif pred == 0 and label == 1: + fn += 1 + else: + tn += 1 + + total = tp + fp + fn + tn + return _compute_metrics(tp, fp, fn, tn, latencies) + + +def evaluate_onnx( + session, + samples: list[dict], + threshold: float = 0.5, +) -> dict: + """Evaluate ONNX model on samples.""" + if not samples or session is None: + return {} + + tp = fp = fn = tn = 0 + latencies = [] + + input_name = session.get_inputs()[0].name + for s in samples: + features = s["features"][np.newaxis, ...] 
+ + t0 = time.perf_counter() + outputs = session.run(None, {input_name: features}) + latency_ms = (time.perf_counter() - t0) * 1000 + latencies.append(latency_ms) + + logit = outputs[0].item() if outputs[0].size == 1 else outputs[0][0].item() + pred = float(1.0 / (1.0 + np.exp(-logit)) > threshold) + label = s["label"] + + if pred == 1 and label == 1: + tp += 1 + elif pred == 1 and label == 0: + fp += 1 + elif pred == 0 and label == 1: + fn += 1 + else: + tn += 1 + + return _compute_metrics(tp, fp, fn, tn, latencies) + + +def _compute_metrics(tp, fp, fn, tn, latencies=None) -> dict: + total = tp + fp + fn + tn + accuracy = (tp + tn) / max(total, 1) + precision = tp / max(tp + fp, 1) + recall = tp / max(tp + fn, 1) + f1 = 2 * precision * recall / max(precision + recall, 1e-8) + fp_rate = fp / max(fp + tn, 1) + fn_rate = fn / max(fn + tp, 1) + + metrics = { + "accuracy": round(accuracy, 4), + "precision": round(precision, 4), + "recall": round(recall, 4), + "f1": round(f1, 4), + "fp_rate": round(fp_rate, 4), + "fn_rate": round(fn_rate, 4), + "tp": tp, "fp": fp, "fn": fn, "tn": tn, + "total": total, + } + + if latencies: + metrics["latency_mean_ms"] = round(np.mean(latencies), 2) + metrics["latency_p50_ms"] = round(np.median(latencies), 2) + metrics["latency_p95_ms"] = round(np.percentile(latencies, 95), 2) + + return metrics + + +# --------------------------------------------------------------------------- +# Main evaluation +# --------------------------------------------------------------------------- + +def run_evaluation(model_path: Path) -> dict: + """Run full evaluation suite.""" + device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu") + log.info("Evaluating on device: %s", device) + + # Load models + model = load_model(model_path, device) + onnx_session = load_onnx_model() + + # Load test data + pipecat_samples, tts_samples = load_test_data() + all_samples = pipecat_samples + tts_samples + + results = 
{"pipecat_baseline": PIPECAT_BASELINE} + + # 1. Our model on all test data + log.info("\n=== Our model (all test data) ===") + our_all = evaluate_pytorch(model, all_samples, device) + if our_all: + _log_metrics("Our model (all)", our_all) + results["our_model_all"] = our_all + + # 2. Our model on Pipecat test only + if pipecat_samples: + log.info("\n=== Our model (Pipecat PT test only) ===") + our_pipecat = evaluate_pytorch(model, pipecat_samples, device) + _log_metrics("Our model (Pipecat test)", our_pipecat) + results["our_model_pipecat_test"] = our_pipecat + + # Compare with baseline + diff_acc = our_pipecat["accuracy"] - PIPECAT_BASELINE["accuracy"] + diff_fp = our_pipecat["fp_rate"] - PIPECAT_BASELINE["fp_rate"] + diff_fn = our_pipecat["fn_rate"] - PIPECAT_BASELINE["fn_rate"] + log.info(" vs Pipecat baseline: accuracy %+.2f%%, FP %+.2f%%, FN %+.2f%%", + diff_acc * 100, diff_fp * 100, diff_fn * 100) + + # 3. Our model per accent + for accent_name, accent_filter in [("native_pt_br", "native_pt_br"), ("french_pt", "french_pt")]: + accent_samples = [s for s in tts_samples if s["source"] == accent_filter] + if accent_samples: + log.info("\n=== Our model (%s) ===", accent_name) + m = evaluate_pytorch(model, accent_samples, device) + _log_metrics(f"Our model ({accent_name})", m) + results[f"our_model_{accent_name}"] = m + + # 4. ONNX reference model + if onnx_session and pipecat_samples: + log.info("\n=== Pipecat ONNX model (reference) ===") + onnx_metrics = evaluate_onnx(onnx_session, pipecat_samples) + if onnx_metrics: + _log_metrics("Pipecat ONNX", onnx_metrics) + results["pipecat_onnx"] = onnx_metrics + + # 5. 
Threshold sweep + log.info("\n=== Threshold Sweep (all test data) ===") + sweep = {} + for thresh in [0.3, 0.4, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9]: + m = evaluate_pytorch(model, all_samples, device, threshold=thresh) + sweep[str(thresh)] = m + log.info(" threshold=%.2f: acc=%.3f prec=%.3f rec=%.3f f1=%.3f FP=%.3f FN=%.3f", + thresh, m["accuracy"], m["precision"], m["recall"], + m["f1"], m["fp_rate"], m["fn_rate"]) + + results["threshold_sweep"] = sweep + + # Find optimal thresholds + best_f1_thresh = max(sweep.items(), key=lambda x: x[1]["f1"]) + best_prec_thresh = max( + [(k, v) for k, v in sweep.items() if v["recall"] >= 0.90], + key=lambda x: x[1]["precision"], + default=(None, None), + ) + + log.info("\n=== Recommendations ===") + log.info(" Best F1: threshold=%.2f (F1=%.3f, prec=%.3f, rec=%.3f)", + float(best_f1_thresh[0]), best_f1_thresh[1]["f1"], + best_f1_thresh[1]["precision"], best_f1_thresh[1]["recall"]) + if best_prec_thresh[0]: + log.info(" Best precision (recall>=90%%): threshold=%.2f (prec=%.3f, rec=%.3f)", + float(best_prec_thresh[0]), best_prec_thresh[1]["precision"], + best_prec_thresh[1]["recall"]) + + # ----- Dual Threshold for Language Learning Avatar ----- + # Inspired by Deepgram Flux (eot_threshold + eager_eot_threshold) + # For L2 learners: higher final threshold to avoid interrupting mid-hesitation + log.info("\n=== Dual Threshold (Language Learning Mode) ===") + log.info(" Context: conversational avatar for francophones learning Portuguese") + log.info(" Priority: minimize interruptions (FP) over fast response (FN)") + + # Final threshold: minimize FP rate while keeping recall >= 85% + final_candidates = [(k, v) for k, v in sweep.items() + if v["recall"] >= 0.85 and float(k) >= 0.6] + if final_candidates: + best_final = min(final_candidates, key=lambda x: x[1]["fp_rate"]) + final_thresh = float(best_final[0]) + log.info(" Final threshold: %.2f (FP=%.1f%%, rec=%.1f%%)", + final_thresh, best_final[1]["fp_rate"] * 100, + 
best_final[1]["recall"] * 100) + else: + final_thresh = 0.7 + log.info(" Final threshold: 0.70 (default)") + + # Eager threshold: lower confidence for speculative LLM generation + eager_thresh = max(0.3, final_thresh - 0.3) + log.info(" Eager threshold: %.2f (start speculative LLM prep)", eager_thresh) + log.info(" Expected latency savings: ~150-250ms on response start") + + # Simulate dual-threshold behavior + if all_samples: + log.info("\n Dual-threshold simulation:") + eager_m = evaluate_pytorch(model, all_samples, device, threshold=eager_thresh) + final_m = evaluate_pytorch(model, all_samples, device, threshold=final_thresh) + log.info(" Eager (%.2f): would trigger on %.1f%% of samples (%.1f%% false triggers)", + eager_thresh, (eager_m["tp"] + eager_m["fp"]) / max(eager_m["total"], 1) * 100, + eager_m["fp_rate"] * 100) + log.info(" Final (%.2f): confirms %.1f%% of turns (%.1f%% false confirms, %.1f%% missed)", + final_thresh, final_m["recall"] * 100, + final_m["fp_rate"] * 100, final_m["fn_rate"] * 100) + wasted_speculative = eager_m["fp"] - final_m["fp"] + log.info(" Wasted speculative preps: %d (started but not confirmed)", max(0, wasted_speculative)) + + results["recommended"] = { + "best_f1_threshold": float(best_f1_thresh[0]), + "best_precision_threshold": float(best_prec_thresh[0]) if best_prec_thresh[0] else None, + "dual_threshold": { + "eager": eager_thresh, + "final": final_thresh, + "mode": "language_learning", + "rationale": "Higher final threshold minimizes interruptions for L2 learners " + "who pause 500-2000ms mid-sentence. Eager threshold enables " + "speculative LLM prep for lower perceived latency.", + }, + } + + # 6. 
Summary comparison table + log.info("\n" + "=" * 70) + log.info("SUMMARY COMPARISON") + log.info("=" * 70) + log.info("%-30s %8s %8s %8s %8s", "Model", "Acc", "FP%", "FN%", "F1") + log.info("-" * 70) + log.info("%-30s %7.1f%% %7.2f%% %7.2f%% %8s", + "Pipecat v3.2 (baseline)", + PIPECAT_BASELINE["accuracy"] * 100, + PIPECAT_BASELINE["fp_rate"] * 100, + PIPECAT_BASELINE["fn_rate"] * 100, + "~0.96") + + if "pipecat_onnx" in results: + m = results["pipecat_onnx"] + log.info("%-30s %7.1f%% %7.2f%% %7.2f%% %7.3f", + "Pipecat ONNX (our eval)", + m["accuracy"] * 100, m["fp_rate"] * 100, + m["fn_rate"] * 100, m["f1"]) + + if "our_model_pipecat_test" in results: + m = results["our_model_pipecat_test"] + log.info("%-30s %7.1f%% %7.2f%% %7.2f%% %7.3f", + "Our model (Pipecat test)", + m["accuracy"] * 100, m["fp_rate"] * 100, + m["fn_rate"] * 100, m["f1"]) + + if "our_model_all" in results: + m = results["our_model_all"] + log.info("%-30s %7.1f%% %7.2f%% %7.2f%% %7.3f", + "Our model (all test)", + m["accuracy"] * 100, m["fp_rate"] * 100, + m["fn_rate"] * 100, m["f1"]) + + log.info("=" * 70) + + # Save results + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + results_path = RESULTS_DIR / "evaluation_results.json" + with open(results_path, "w") as f: + json.dump(results, f, indent=2) + log.info("\nResults saved to %s", results_path) + + return results + + +def _log_metrics(name: str, m: dict) -> None: + log.info("%s:", name) + log.info(" Accuracy: %.3f (%.1f%%)", m["accuracy"], m["accuracy"] * 100) + log.info(" Precision: %.3f", m["precision"]) + log.info(" Recall: %.3f", m["recall"]) + log.info(" F1: %.3f", m["f1"]) + log.info(" FP rate: %.3f (%.1f%%)", m["fp_rate"], m["fp_rate"] * 100) + log.info(" FN rate: %.3f (%.1f%%)", m["fn_rate"], m["fn_rate"] * 100) + log.info(" TP=%d FP=%d FN=%d TN=%d (total=%d)", m["tp"], m["fp"], m["fn"], m["tn"], m["total"]) + if "latency_mean_ms" in m: + log.info(" Latency: mean=%.1fms p50=%.1fms p95=%.1fms", + m["latency_mean_ms"], 
m["latency_p50_ms"], m["latency_p95_ms"]) + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s [%(name)s] %(message)s", + ) + + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, default=str(RESULTS_DIR / "best_model.pt"), + help="Path to fine-tuned model checkpoint") + args = parser.parse_args() + + run_evaluation(Path(args.model)) diff --git a/03-finetune-pipecat-pt/06_inference.py b/03-finetune-pipecat-pt/06_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..2ed6957da8dbf64d1a3e8c25842eac648c4c756b --- /dev/null +++ b/03-finetune-pipecat-pt/06_inference.py @@ -0,0 +1,560 @@ +"""Inference engine for the language-learning avatar turn-taking model. + +This module implements the dual-threshold + backchannel system designed +for a conversational avatar that teaches Portuguese to French speakers. + +Key insight: L2 learners pause 1-3s mid-sentence (word search, conjugation, +code-switching). A naive system either: + a) Interrupts them (bad UX — kills confidence), or + b) Stays silent too long (learner thinks avatar froze) + +Solution: dual-threshold with backchannel signals. 
+ +Architecture: + Audio stream → SmartTurnV3 model → confidence score (0.0-1.0) + │ + ├─ score < eager_threshold (0.4) → LISTENING (do nothing) + ├─ eager ≤ score < final (0.7) → PREPARING (backchannel + speculative LLM) + └─ score ≥ final_threshold (0.7) → RESPONDING (take the turn) + + Silence timer (parallel): + 0-600ms → normal (no action) + 600ms-1.5s → visual backchannel (nod, eye contact) + 1.5s-3.0s → verbal backchannel ("mhm", "continue...") + 3.0s+ → encouragement ("sem pressa, pode pensar...") + +References: +- Deepgram Flux: eot_threshold + eager_eot_threshold (2025) +- ConversAR: "infinite thinking period" for L2 learners (2025) +- Tavus: 600ms response latency threshold (2025) +- Kosmala (2022): L2 repair pauses average 844ms +- Cenoz (2000): L2 silent pauses range 205ms to 11,569ms + +Usage: + engine = TurnTakingEngine(model_path="results/smart_turn_pt_v3.onnx") + engine.on_state_change(my_callback) + + # Feed audio chunks continuously + for chunk in audio_stream: + engine.feed_audio(chunk) +""" + +from __future__ import annotations + +import logging +import time +from dataclasses import dataclass, field +from enum import Enum +from pathlib import Path +from typing import Callable + +import numpy as np + +log = logging.getLogger(__name__) + +SAMPLE_RATE = 16000 +WINDOW_SECONDS = 8 +WINDOW_SAMPLES = WINDOW_SECONDS * SAMPLE_RATE + + +# --------------------------------------------------------------------------- +# States and events +# --------------------------------------------------------------------------- + +class TurnState(Enum): + """Avatar turn-taking state machine.""" + LISTENING = "listening" # User is speaking, avatar listens + SILENCE = "silence" # User stopped, waiting to classify + BACKCHANNEL_VISUAL = "bc_visual" # Nod/eye contact (600ms-1.5s silence) + BACKCHANNEL_VERBAL = "bc_verbal" # "Mhm" / "continue" (1.5s-3.0s) + ENCOURAGEMENT = "encouragement" # "Sem pressa..." 
(3.0s+ silence) + PREPARING = "preparing" # Eager threshold hit, speculative LLM started + RESPONDING = "responding" # Final threshold hit, avatar speaks + + +@dataclass +class TurnEvent: + """Event emitted on state transitions.""" + state: TurnState + confidence: float # Model's end-of-turn confidence (0-1) + silence_duration_ms: float # How long the user has been silent + timestamp: float # time.monotonic() + message: str = "" # Backchannel text (if applicable) + + +@dataclass +class BackchannelConfig: + """Configurable backchannel behavior for the avatar. + + These can be tuned per learner profile or CEFR level: + - A1/A2: longer delays, more encouraging messages + - B1/B2: shorter delays, less frequent backchannels + """ + # Silence thresholds (milliseconds) + visual_backchannel_ms: float = 600.0 + verbal_backchannel_ms: float = 1500.0 + encouragement_ms: float = 3000.0 + + # Model confidence thresholds + eager_threshold: float = 0.4 # Start speculative LLM generation + final_threshold: float = 0.7 # Confirm end-of-turn, avatar speaks + + # Backchannel messages (Portuguese, with French-speaker-friendly variants) + visual_actions: list[str] = field(default_factory=lambda: [ + "nod", # Aceno de cabeça + "eye_contact", # Olhar atento + "slight_smile", # Sorriso leve + ]) + + verbal_backchannels: list[str] = field(default_factory=lambda: [ + "Mhm...", + "Uhum...", + "Sim...", + "Tá...", + "Sei...", + "Continue...", + ]) + + encouragement_messages: list[str] = field(default_factory=lambda: [ + "Pode continuar, sem pressa...", + "Tá pensando? Tranquilo...", + "Pode pensar, eu espero...", + "Sem pressa, tá tudo bem...", + "Take your time... pode falar em português...", # Code-switch friendly + "Prenez votre temps... 
quando estiver pronto...", # French reassurance + ]) + + # Cooldowns (don't spam backchannels) + verbal_cooldown_ms: float = 3000.0 # Min time between verbal backchannels + encouragement_cooldown_ms: float = 8000.0 # Min time between encouragements + + +# --------------------------------------------------------------------------- +# CEFR-aware presets +# --------------------------------------------------------------------------- + +CEFR_PRESETS: dict[str, BackchannelConfig] = { + "A1": BackchannelConfig( + visual_backchannel_ms=500.0, + verbal_backchannel_ms=1200.0, + encouragement_ms=2500.0, + eager_threshold=0.5, # Higher — need more confidence before preparing + final_threshold=0.8, # Much higher — very patient, rarely interrupt + ), + "A2": BackchannelConfig( + visual_backchannel_ms=550.0, + verbal_backchannel_ms=1300.0, + encouragement_ms=2800.0, + eager_threshold=0.45, + final_threshold=0.75, + ), + "B1": BackchannelConfig( + visual_backchannel_ms=600.0, + verbal_backchannel_ms=1500.0, + encouragement_ms=3000.0, + eager_threshold=0.4, + final_threshold=0.7, # Default + ), + "B2": BackchannelConfig( + visual_backchannel_ms=600.0, + verbal_backchannel_ms=1800.0, + encouragement_ms=4000.0, + eager_threshold=0.35, + final_threshold=0.65, # More responsive — B2 pauses less + ), + "C1": BackchannelConfig( + visual_backchannel_ms=600.0, + verbal_backchannel_ms=2000.0, + encouragement_ms=5000.0, + eager_threshold=0.35, + final_threshold=0.6, # Near-native responsiveness + ), +} + + +# --------------------------------------------------------------------------- +# Turn-taking engine +# --------------------------------------------------------------------------- + +class TurnTakingEngine: + """Real-time turn-taking engine for language-learning avatar. + + Combines the SmartTurnV3 model with a silence timer and backchannel + state machine to create a patient, encouraging conversational partner. 
+ + Usage: + engine = TurnTakingEngine("results/smart_turn_pt_v3.onnx") + engine.on_state_change(handle_event) + + # In audio loop: + for chunk in mic_stream: + engine.feed_audio(chunk) + + # Or feed pre-computed features: + engine.feed_score(model_confidence, is_speech=True) + """ + + def __init__( + self, + model_path: str | Path | None = None, + config: BackchannelConfig | None = None, + cefr_level: str = "B1", + ): + # Config: use CEFR preset or custom + if config is not None: + self.config = config + elif cefr_level in CEFR_PRESETS: + self.config = CEFR_PRESETS[cefr_level] + log.info("Using CEFR %s preset: eager=%.2f, final=%.2f", + cefr_level, self.config.eager_threshold, self.config.final_threshold) + else: + self.config = BackchannelConfig() + + # State + self._state = TurnState.LISTENING + self._silence_start: float | None = None + self._last_speech_time: float = time.monotonic() + self._last_verbal_bc: float = 0.0 + self._last_encouragement: float = 0.0 + self._last_confidence: float = 0.0 + self._callbacks: list[Callable[[TurnEvent], None]] = [] + self._audio_buffer = np.zeros(WINDOW_SAMPLES, dtype=np.float32) + self._speculative_started = False + + # Load ONNX model if provided + self._session = None + self._feature_extractor = None + if model_path is not None: + self._load_model(model_path) + + def _load_model(self, model_path: str | Path) -> None: + """Load ONNX model for inference.""" + try: + import onnxruntime as ort + self._session = ort.InferenceSession( + str(model_path), + providers=["CPUExecutionProvider"], + ) + log.info("Loaded turn-taking model: %s", model_path) + except Exception as e: + log.warning("Failed to load model %s: %s — running in manual mode", model_path, e) + + try: + from transformers import WhisperFeatureExtractor + self._feature_extractor = WhisperFeatureExtractor(chunk_length=WINDOW_SECONDS) + except ImportError: + log.warning("transformers not available — cannot extract features") + + def on_state_change(self, callback: 
Callable[[TurnEvent], None]) -> None: + """Register callback for state change events.""" + self._callbacks.append(callback) + + @property + def state(self) -> TurnState: + return self._state + + @property + def silence_duration_ms(self) -> float: + if self._silence_start is None: + return 0.0 + return (time.monotonic() - self._silence_start) * 1000 + + # ----- Audio input ----- + + def feed_audio(self, chunk: np.ndarray, is_speech: bool | None = None) -> TurnEvent | None: + """Feed an audio chunk and get turn-taking decision. + + chunk: float32 audio at 16kHz + is_speech: if None, will use simple energy-based VAD + + Returns TurnEvent if state changed, None otherwise. + """ + now = time.monotonic() + + # Update audio buffer (sliding window) + chunk = chunk.astype(np.float32) + if len(chunk) >= WINDOW_SAMPLES: + self._audio_buffer = chunk[-WINDOW_SAMPLES:] + else: + self._audio_buffer = np.roll(self._audio_buffer, -len(chunk)) + self._audio_buffer[-len(chunk):] = chunk + + # Simple VAD if not provided + if is_speech is None: + rms = np.sqrt(np.mean(chunk ** 2)) + is_speech = rms > 0.01 # Simple threshold + + if is_speech: + return self._on_speech(now) + else: + return self._on_silence(now) + + def feed_score(self, confidence: float, is_speech: bool) -> TurnEvent | None: + """Feed pre-computed model confidence score. + + Use this when you run the model externally and just want + the backchannel/threshold logic. 
+ + confidence: 0.0 (definitely incomplete) to 1.0 (definitely complete) + is_speech: whether VAD detected speech in this frame + """ + now = time.monotonic() + self._last_confidence = confidence + + if is_speech: + return self._on_speech(now) + else: + return self._on_silence(now, confidence=confidence) + + # ----- Internal state machine ----- + + def _on_speech(self, now: float) -> TurnEvent | None: + """User is speaking.""" + self._last_speech_time = now + self._silence_start = None + self._speculative_started = False + + if self._state != TurnState.LISTENING: + return self._transition(TurnState.LISTENING, confidence=0.0, now=now) + return None + + def _on_silence(self, now: float, confidence: float | None = None) -> TurnEvent | None: + """User is silent — run the state machine.""" + # Mark silence start + if self._silence_start is None: + self._silence_start = now + + silence_ms = (now - self._silence_start) * 1000 + + # Get model confidence if we have a model and no external score + if confidence is None: + confidence = self._run_model() + self._last_confidence = confidence + + cfg = self.config + + # ----- Decision tree ----- + + # 1. Final threshold → RESPONDING (take the turn) + if confidence >= cfg.final_threshold: + return self._transition(TurnState.RESPONDING, confidence, now) + + # 2. Eager threshold → PREPARING (speculative LLM, backchannel) + if confidence >= cfg.eager_threshold and not self._speculative_started: + self._speculative_started = True + return self._transition(TurnState.PREPARING, confidence, now) + + # 3. 
Silence-based backchannels (even if model is unsure) + # These keep the learner engaged so they don't think the avatar froze + + if silence_ms >= cfg.encouragement_ms: + if now - self._last_encouragement >= cfg.encouragement_cooldown_ms / 1000: + self._last_encouragement = now + return self._transition(TurnState.ENCOURAGEMENT, confidence, now) + + if silence_ms >= cfg.verbal_backchannel_ms: + if now - self._last_verbal_bc >= cfg.verbal_cooldown_ms / 1000: + self._last_verbal_bc = now + return self._transition(TurnState.BACKCHANNEL_VERBAL, confidence, now) + + if silence_ms >= cfg.visual_backchannel_ms: + if self._state not in (TurnState.BACKCHANNEL_VISUAL, + TurnState.BACKCHANNEL_VERBAL, + TurnState.ENCOURAGEMENT, + TurnState.PREPARING): + return self._transition(TurnState.BACKCHANNEL_VISUAL, confidence, now) + + # Still in silence, no state change + if self._state == TurnState.LISTENING: + return self._transition(TurnState.SILENCE, confidence, now) + + return None + + def _transition(self, new_state: TurnState, confidence: float, now: float) -> TurnEvent: + """Transition to a new state and emit event.""" + import random + + old_state = self._state + self._state = new_state + + silence_ms = (now - self._silence_start) * 1000 if self._silence_start else 0.0 + + # Pick appropriate message + message = "" + if new_state == TurnState.BACKCHANNEL_VISUAL: + message = random.choice(self.config.visual_actions) + elif new_state == TurnState.BACKCHANNEL_VERBAL: + message = random.choice(self.config.verbal_backchannels) + elif new_state == TurnState.ENCOURAGEMENT: + message = random.choice(self.config.encouragement_messages) + elif new_state == TurnState.PREPARING: + message = "speculative_llm_start" + elif new_state == TurnState.RESPONDING: + message = "take_turn" + + event = TurnEvent( + state=new_state, + confidence=confidence, + silence_duration_ms=silence_ms, + timestamp=now, + message=message, + ) + + if old_state != new_state: + log.debug("Turn state: %s → %s 
(conf=%.2f, silence=%.0fms, msg=%s)", + old_state.value, new_state.value, confidence, silence_ms, message) + + for cb in self._callbacks: + try: + cb(event) + except Exception as e: + log.warning("Callback error: %s", e) + + return event + + def _run_model(self) -> float: + """Run the ONNX model on current audio buffer.""" + if self._session is None or self._feature_extractor is None: + return 0.0 + + try: + inputs = self._feature_extractor( + self._audio_buffer, + sampling_rate=SAMPLE_RATE, + return_tensors="np", + padding="max_length", + max_length=WINDOW_SAMPLES, + truncation=True, + do_normalize=True, + ) + features = inputs.input_features.astype(np.float32) + + input_name = self._session.get_inputs()[0].name + outputs = self._session.run(None, {input_name: features}) + logit = outputs[0].item() if outputs[0].size == 1 else outputs[0][0].item() + + # Sigmoid + confidence = 1.0 / (1.0 + np.exp(-logit)) + return float(confidence) + except Exception as e: + log.warning("Model inference error: %s", e) + return 0.0 + + # ----- Convenience ----- + + def reset(self) -> None: + """Reset state (e.g., when starting a new conversation turn).""" + self._state = TurnState.LISTENING + self._silence_start = None + self._speculative_started = False + self._last_confidence = 0.0 + self._audio_buffer = np.zeros(WINDOW_SAMPLES, dtype=np.float32) + + def set_cefr_level(self, level: str) -> None: + """Change CEFR level at runtime (e.g., after proficiency assessment).""" + if level in CEFR_PRESETS: + self.config = CEFR_PRESETS[level] + log.info("Switched to CEFR %s: eager=%.2f, final=%.2f", + level, self.config.eager_threshold, self.config.final_threshold) + else: + log.warning("Unknown CEFR level: %s", level) + + +# --------------------------------------------------------------------------- +# Example usage and demo +# --------------------------------------------------------------------------- + +def demo_simulation(): + """Simulate a conversation to demonstrate the backchannel 
system. + + Scenario: French speaker (B1) learning Portuguese, pausing frequently. + """ + print("=" * 70) + print("DEMO: Turn-Taking Engine for Language Learning Avatar") + print("Scenario: French B1 learner speaking Portuguese") + print("=" * 70) + + events_log = [] + + def on_event(event: TurnEvent): + events_log.append(event) + state_emoji = { + TurnState.LISTENING: "👂", + TurnState.SILENCE: "⏸️", + TurnState.BACKCHANNEL_VISUAL: "😊", + TurnState.BACKCHANNEL_VERBAL: "💬", + TurnState.ENCOURAGEMENT: "🤗", + TurnState.PREPARING: "🧠", + TurnState.RESPONDING: "🗣️", + } + emoji = state_emoji.get(event.state, "❓") + print(f" {emoji} [{event.silence_duration_ms:6.0f}ms] " + f"conf={event.confidence:.2f} → {event.state.value}: {event.message}") + + engine = TurnTakingEngine(config=CEFR_PRESETS["B1"]) + engine.on_state_change(on_event) + + # Simulate a conversation timeline + # Each entry: (description, duration_ms, is_speech, model_confidence) + timeline = [ + # Learner starts speaking + ("Learner: 'Eu fui ao...'", 1200, True, 0.1), + # Pause — searching for word (conjugation hesitation) + (" [silence — thinking about conjugation]", 300, False, 0.15), + (" [still thinking...]", 400, False, 0.20), + (" [600ms — visual backchannel]", 300, False, 0.22), + (" [1s — still silent]", 400, False, 0.25), + (" [1.5s — verbal backchannel]", 500, False, 0.28), + # Learner continues + ("Learner: '...mercado... euh...'", 800, True, 0.1), + # Another pause — code-switching hesitation + (" [silence — 'comment dit-on...']", 500, False, 0.20), + (" [1s silence]", 500, False, 0.30), + (" [1.5s — verbal backchannel]", 500, False, 0.35), + (" [2s — model getting uncertain]", 500, False, 0.42), + # Learner continues again + ("Learner: '...para comprar... 
uma tesoura.'", 1500, True, 0.1), + # Final silence — this time it's a real end of turn + (" [silence after complete sentence]", 300, False, 0.45), + (" [600ms]", 300, False, 0.55), + (" [model confidence rising]", 300, False, 0.65), + (" [model confident — end of turn]", 200, False, 0.75), + ] + + print("\nTimeline:") + print("-" * 70) + + for description, duration_ms, is_speech, confidence in timeline: + print(f"\n{description} ({duration_ms}ms)") + # Simulate in 100ms chunks + n_chunks = max(1, duration_ms // 100) + for _ in range(n_chunks): + engine.feed_score(confidence, is_speech) + time.sleep(0.001) # Tiny sleep for monotonic clock to advance + + print("\n" + "=" * 70) + print(f"Total events: {len(events_log)}") + print(f"Final state: {engine.state.value}") + + # Show CEFR comparison + print("\n" + "=" * 70) + print("CEFR LEVEL COMPARISON") + print("=" * 70) + print(f"{'Level':<6} {'Eager':<8} {'Final':<8} {'Visual BC':<10} {'Verbal BC':<10} {'Encourage':<10}") + print("-" * 52) + for level, preset in CEFR_PRESETS.items(): + print(f"{level:<6} {preset.eager_threshold:<8.2f} {preset.final_threshold:<8.2f} " + f"{preset.visual_backchannel_ms:<10.0f} {preset.verbal_backchannel_ms:<10.0f} " + f"{preset.encouragement_ms:<10.0f}") + print() + print("A1/A2: Very patient — high final threshold (0.75-0.80), early encouragement") + print("B1: Default — balanced patience and responsiveness") + print("B2/C1: More responsive — lower final threshold (0.60-0.65)") + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s %(levelname)s [%(name)s] %(message)s", + ) + demo_simulation() diff --git a/03-finetune-pipecat-pt/README.md b/03-finetune-pipecat-pt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7ed4503a7a376ee145a1b211e96dd9a08c928436 --- /dev/null +++ b/03-finetune-pipecat-pt/README.md @@ -0,0 +1,489 @@ +# Experimento 03 — Fine-tune Pipecat Smart Turn para Portugues + Frances + +> Fine-tuning do 
Pipecat Smart Turn v3 para **avatar conversacional de aprendizado de portugues** por francofonos. Primeiro modelo de turn-taking otimizado para aprendizes L2. + +--- + +## Sumario + +1. [Objetivo](#objetivo) +2. [Por que a partir do Pipecat](#por-que-a-partir-do-pipecat) +3. [Metodo — Pipeline de dados](#metodo--pipeline-de-dados) + - [Pipeline de criacao de dados](#pipeline-de-criacao-de-dados) + - [Projetos que validam este metodo](#projetos-que-validam-este-metodo) +4. [Fine-tuning](#fine-tuning) + - [Modelo base](#modelo-base) + - [Estrategia de fine-tuning](#estrategia-de-fine-tuning) + - [Infra](#infra) +5. [Melhorias para aprendizado de idiomas (L2)](#melhorias-para-aprendizado-de-idiomas-l2) + - [Custo assimetrico](#custo-assimetrico) + - [Threshold duplo (Deepgram Flux)](#threshold-duplo-deepgram-flux) + - [Dados L2 reais (Speak & Improve)](#dados-l2-reais-speak--improve) + - [CEFR-aware presets](#cefr-aware-presets) +6. [Sistema de backchannel](#sistema-de-backchannel) + - [Problema: "o avatar travou?"](#problema-o-avatar-travou) + - [Solucao: sinais de escuta ativa](#solucao-sinais-de-escuta-ativa) + - [Presets por nivel CEFR](#presets-por-nivel-cefr) +7. [Engine de inferencia (06_inference.py)](#engine-de-inferencia-06_inferencepy) + - [Estados do turn-taking](#estados-do-turn-taking) + - [Exemplo: aprendiz B1 conjugando verbo](#exemplo-aprendiz-b1-conjugando-verbo) +8. [Dados especificos — Frances falando portugues](#dados-especificos--frances-falando-portugues) + - [Tipos de hesitacao](#tipos-de-hesitacao) + - [Geracoes com Claude](#geracoes-com-claude) +9. [Benchmarks de referencia](#benchmarks-de-referencia) + - [Pipecat Smart Turn v3.0](#pipecat-smart-turn-v30) + - [LiveKit v0.4.1](#livekit-v041) + - [Outros modelos](#outros-modelos) + - [Gap sintetico → real](#gap-sintetico--real) +10. [Metricas alvo](#metricas-alvo) +11. [Correcoes apos analise de referencias](#correcoes-apos-analise-de-referencias) +12. 
[Estrutura de arquivos](#estrutura-de-arquivos) +13. [Cronograma](#cronograma) +14. [Dependencias](#dependencias) +15. [Referencias](#referencias) + +--- + +## Objetivo + +Fine-tunar o modelo pre-treinado do **Pipecat Smart Turn v3** (ja treinado em 23 linguas, 270K amostras) especificamente para: + +1. **Portugues brasileiro** — melhorar deteccao de fim de turno em conversas em PT-BR +2. **Frances falando portugues** — detectar fim de turno de falantes nativos de frances que estao falando portugues (com sotaque, hesitacoes e code-switching tipicos) +3. **Avatar conversacional L2** — o modelo sera usado em um avatar de IA que ensina portugues a francofonos, exigindo paciencia extra com pausas de aprendiz + +> **Pioneirismo**: Nenhum modelo de turn-taking otimizado para aprendizes L2 existe (comercial ou academico). O BabelCast e o primeiro. + +## Por que a partir do Pipecat + +Nos experimentos anteriores (`previous-experiments/02-finetune-scratch/`) treinamos do zero com Whisper Tiny + dados CORAA/MUPE. Resultados: + +| Metrica | Do zero (melhor) | Pipecat original (PT) | +|---------|------------------|----------------------| +| Accuracy | 78.2% | **95.42%** | +| False Positive | 16.8% @0.5 | **2.79%** | +| False Negative | 5.1% @0.5 | **1.79%** | +| F1 | 0.798 | ~0.96 (estimado) | +| Dados | 15K amostras | 270K amostras | +| Labels | Pontuacao + corte artificial | LLM-curados + TTS | + +A diferenca e brutal: o Pipecat chega a **95.42% em portugues** com dados LLM-curados + TTS; nos chegamos a 78.2% com labels de pontuacao. O problema nao e o modelo — e a **qualidade dos dados**. 
+ +**Problemas do treino do zero:** +- Labels de baixa qualidade (pontuacao nao indica fim de turno de verdade) +- Corte artificial em 30-75% nao simula pausas reais de hesitacao +- Poucos dados (15K vs 270K do Pipecat) +- Modelo nao entende pausas de hesitacao — acerta so 67.7% das pausas @threshold=0.5 + +**Vantagens de partir do Pipecat:** +- Modelo ja entende turn-taking em 23 linguas +- Precisa de muito menos dados pra adaptar (5-10K vs 270K) +- Transfer learning: mantem conhecimento geral, adapta pra PT-BR +- Treino muito mais rapido (~30 min vs horas) + +## Metodo — Pipeline de dados + +### Pipeline de criacao de dados + +Baseado no pipeline comprovado do Pipecat v3.1: + +``` +Etapa 1: Frases fonte + - Transcricoes do CORAA (portugues conversacional real) + - Frases geradas pelo Claude (contextos especificos) + - Frases tipicas de aprendiz de frances falando PT + - Speak & Improve Corpus 2025 (340h L2 com disfluencias) + | +Etapa 2: LLM processa (Claude Haiku — custo baixo) + - Filtra frases com erros / ambiguas (Pipecat: Gemini removeu 50-80%) + - Classifica: COMPLETO vs INCOMPLETO (semantico, nao por pontuacao) + - Insere fillers brasileiros: "hum", "tipo", "ne", "entao", "e..." 
+ - Insere fillers de frances falando PT: "euh", "comment dit-on", "como se diz" + - Gera variantes incompletas: corta frases em pontos naturais + | +Etapa 3: TTS gera audio + - Vozes PT-BR nativas (Kokoro, Google Chirp3) + - Vozes com sotaque frances (XTTS voice cloning, ou Chirp3 com accent) + - Variacao de velocidade, tom, ruido de fundo + | +Etapa 4: Dataset final + - 5-10K amostras balanceadas (50% completo / 50% incompleto) + - Metadados: lingua, sotaque, tipo_filler, confianca_label +``` + +### Projetos que validam este metodo + +| Projeto | O que fizeram | Resultado | +|---------|--------------|-----------| +| **Pipecat v3.1** | Gemini filtra frases + insere fillers, Claude/GPT geram listas de fillers, Chirp3 TTS | 270K amostras, 81-97% accuracy, 23 linguas | +| **LiveKit** | Qwen 7B como professor gera soft labels, destila pra 0.5B | 99.3% TP rate, -39% interrupcoes | +| **Vogent Turn 80M** | Dados humanos + sinteticos com edge cases (disfluencias, listas, pausas) | 94.1% accuracy, estado da arte | +| **Deepgram** | 100+ horas anotadas por humanos, refinamento iterativo de labels | Melhor calibracao entre todos | +| **SpeculativeETD** | MultiWOZ texto → TTS + fillers injetados + pausas sinteticas | 120K amostras, dataset publico | +| **SODA (Allen AI)** | GPT-3.5 gera 1.5M dialogos a partir de knowledge graph | Preferido sobre BlenderBot, Vicuna | +| **Refuel Autolabel** | GPT-4 como anotador: 88.4% agreement (vs 86% humano) | 20x mais rapido, 7x mais barato | + +**Referencia principal:** [Pipecat Data Generation Contribution Guide](https://github.com/pipecat-ai/smart-turn/blob/main/docs/data_generation_contribution_guide.md) + +## Fine-tuning + +### Modelo base + +```python +# Baixar modelo pre-treinado do HuggingFace +from huggingface_hub import hf_hub_download +model_path = hf_hub_download("pipecat-ai/smart-turn-v3", "model.onnx") + +# Whisper Tiny encoder (39M) + linear classifier +# Hidden size: 384, ONNX INT8: 8MB, FP32: 32MB +``` + +### Estrategia 
de fine-tuning

```python
# Carregar pesos pre-treinados do Pipecat
model = SmartTurnModel(whisper_model="openai/whisper-tiny")
model.load_state_dict(pipecat_weights)

# LR uniforme 5e-5 (igual ao Pipecat train.py)
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

# Focal Loss com custo assimetrico para aprendizes L2
criterion = FocalLoss(gamma=2.0, alpha=0.25, fp_penalty=2.0)

# 6 epochs, batch_size=128, cosine schedule + warmup 0.2
train(model, portuguese_dataset, epochs=6, batch_size=128)
```

### Infra

- **GPU**: Modal A10G (~$0.50/run)
- **Tempo estimado**: 15-30 min
- **Alternativa**: ai-gateway → TensorDock/Vast.ai

## Melhorias para aprendizado de idiomas (L2)

> Pesquisa realizada em 2026-03-16 confirmou que **nenhum modelo de turn-taking para aprendizes L2 existe** — nem comercial (Praktika, ELSA, Gliglish, TalkPal) nem academico. Todos usam abordagens genericas.

### Custo assimetrico

Para aprendizado de idiomas, **interromper o aluno (FP) e muito pior que esperar demais (FN)**. Um aluno interrompido perde confianca e para de tentar.

```python
# fp_penalty=2.0: falsos positivos custam 2x mais na loss
criterion = FocalLoss(gamma=2.0, alpha=0.25, fp_penalty=2.0)
```

Inspirado no ConversAR (Meta, 2025) que usa "infinite thinking period" para L2.

### Threshold duplo (Deepgram Flux)

Dois thresholds separados para latencia vs precisao:

| Threshold | Valor | Funcao |
|-----------|-------|--------|
| **eager** | 0.3-0.5 | Prepara resposta do LLM especulativamente — reduz latencia percebida |
| **final** | 0.7+ | Autoriza o avatar a falar — minimiza interrupcoes |

Se o score cai antes de atingir `final`, a preparacao especulativa e descartada — sem custo para o aluno.

### Dados L2 reais (Speak & Improve)

O [Speak & Improve Corpus 2025](https://huggingface.co/datasets/speak-improve/corpus) contem **340 horas** de fala L2 em ingles com anotacoes de disfluencia, niveis CEFR A2-C1.
+ +Embora seja ingles (nao portugues), os **padroes de hesitacao L2 sao cross-linguisticos**: pausas longas, repeticoes, auto-correcoes. Usamos como fonte de hesitacao patterns no fine-tuning. + +### CEFR-aware presets + +O modelo adapta paciencia conforme o nivel do aluno: + +| Nivel | final_threshold | eager_threshold | Comportamento | +|-------|----------------|-----------------|---------------| +| **A1** (iniciante) | 0.80 | 0.40 | Muito paciente — espera pausas longas | +| **A2** | 0.75 | 0.38 | Paciente | +| **B1** (intermediario) | 0.70 | 0.35 | Moderado | +| **B2** | 0.65 | 0.33 | Responsivo | +| **C1** (avancado) | 0.60 | 0.30 | Quase nativo — resposta rapida | + +## Sistema de backchannel + +### Problema: "o avatar travou?" + +Aprendizes L2 frequentemente pausam 1-3 segundos para pensar em conjugacoes, vocabulario, ou estrutura. Sem feedback, o aprendiz acha que o avatar travou e desiste de falar. + +> **Pesquisa**: Tavus identifica um threshold de **600ms** — apos esse tempo, o falante espera alguma resposta do sistema. Para L2, esse tempo e ainda mais critico. + +### Solucao: sinais de escuta ativa + +O avatar emite sinais progressivos de que esta ouvindo: + +| Tempo de silencio | Sinal | Tipo | Exemplo | +|-------------------|-------|------|---------| +| **600ms** | Aceno visual | Visual | Avatar faz "mhm" com a cabeca | +| **1.5s** | Backchannel verbal | Audio | "mhm", "sim", "uhum" | +| **3.0s** | Encorajamento | Audio | "sem pressa", "pode continuar", "estou ouvindo" | + +Os sinais **nao interrompem o turno do aluno** — sao sobreposicoes curtas que indicam escuta ativa. + +### Presets por nivel CEFR + +| Nivel | Visual (ms) | Verbal (ms) | Encorajamento (ms) | +|-------|-------------|-------------|---------------------| +| **A1** | 500 | 1200 | 2500 | +| **B1** | 600 | 1500 | 3000 | +| **C1** | 800 | 2000 | 4000 | + +Alunos A1 recebem sinais mais cedo; alunos C1 precisam de menos apoio. 
+ +## Engine de inferencia (06_inference.py) + +O arquivo `06_inference.py` implementa a engine completa de turn-taking com backchannel. + +### Estados do turn-taking + +``` +LISTENING → SILENCE → BACKCHANNEL_VISUAL → BACKCHANNEL_VERBAL → ENCOURAGEMENT + ↑ ↓ ↓ ↓ ↓ + ←──────────←──────────────←─────────────────────←──────────────────← + (aluno volta a falar) + +SILENCE → PREPARING → RESPONDING + (eager) (final) +``` + +- **LISTENING**: aluno esta falando, modelo monitora score +- **SILENCE**: pausa detectada, timer inicia +- **BACKCHANNEL_***: sinais de escuta ativa (nao interrompe turno) +- **PREPARING**: score atingiu eager_threshold, LLM comeca a preparar resposta +- **RESPONDING**: score atingiu final_threshold, avatar fala + +### Exemplo: aprendiz B1 conjugando verbo + +``` +Aluno: "Ontem eu... [pausa 600ms]" +Avatar: [aceno visual - nod] +Aluno: "... fui? fiz? [pausa 1.5s]" +Avatar: "mhm" [backchannel verbal] +Aluno: "... fui ao mercado." +Avatar: [espera final_threshold] → responde normalmente +``` + +## Dados especificos — Frances falando portugues + +### Tipos de hesitacao + +| Tipo | Exemplo | Label | +|------|---------|-------| +| Filler frances | "Eu fui... euh... ao mercado" | INCOMPLETO | +| Busca de palavra | "Eu preciso de... comment dit-on... uma tesoura" | INCOMPLETO | +| Code-switching | "Eu gosto de... enfin... tipo... de praia" | INCOMPLETO | +| Pausa de conjugacao | "Eu... fui? fiz?... ontem" | INCOMPLETO | +| Entonacao francesa | Frase completa mas com pitch plano (sem queda final) | COMPLETO (dificil) | +| Ritmo silabico | Frances e syllable-timed, PT e stress-timed | Ambos | + +### Geracoes com Claude + +``` +Prompt para Claude gerar frases de aprendiz: + +"Gere 100 frases que um frances de nivel B1 falando portugues diria +em uma reuniao de trabalho. 
Inclua: +- Hesitacoes tipicas (euh, alors, comment dire) +- Erros comuns de conjugacao +- Pausas naturais pra pensar na palavra +- Code-switching involuntario (palavras em frances no meio) +- Frases completas com entonacao plana (sem queda de pitch) + +Para cada frase, indique: COMPLETO ou INCOMPLETO" +``` + +## Benchmarks de referencia + +### Pipecat Smart Turn v3.0 + +Audio-only, 8M params: + +| Lingua | Accuracy | FP | FN | +|--------|----------|-----|-----| +| Turco | 97.10% | 1.66% | 1.24% | +| Coreano | 96.85% | 1.12% | 2.02% | +| Japones | 96.76% | 2.04% | 1.20% | +| Frances | 96.01% | 1.60% | 2.39% | +| **Portugues** | **95.42%** | **2.79%** | **1.79%** | +| Ingles | 94.31% | 2.64% | 3.06% | +| Espanhol | 91.97% | 4.48% | 3.55% | + +Fonte: [Smart Turn v3 blog](https://www.daily.co/blog/announcing-smart-turn-v3-with-cpu-inference-in-just-12ms/) + +### LiveKit v0.4.1 + +Texto-only, 500M params, @99.3% TPR: + +| Lingua | TNR | Melhoria vs anterior | +|--------|------|---------------------| +| Hindi | 96.3% | +31.48% | +| Coreano | 94.5% | +30.38% | +| Frances | 88.9% | +33.93% | +| **Portugues** | **87.4%** | **+45.97%** | +| Ingles | 87.0% | +21.69% | + +Fonte: [LiveKit v0.4.1 blog](https://livekit.com/blog/improved-end-of-turn-model-cuts-voice-ai-interruptions-39/) + +### Outros modelos + +| Modelo | Tipo | Accuracy | Linguas | Nota | +|--------|------|----------|---------|------| +| Vogent Turn 80M | Audio+texto | 94.1% | 1 (EN) | Estado da arte multimodal | +| Krisp v2 | Audio | 82.3% bal.acc | "Agnostico" | Proprietario | +| Deepgram Flux | ASR+EoT | #1 VAQI | 1 (EN) | Proprietario | +| VAP | Audio | 79.6% bal.acc | 3 (EN/ZH/JP) | Academico, auto-supervisionado | + +### Gap sintetico → real + +O SpeculativeETD mostrou que modelos treinados so em dados sinteticos (TTS) perdem **muito** em dados reais: + +| Dataset | Wav2vec F1 | +|---------|-----------| +| Sintetico | 94.7% | +| Real | **30.3%** | + +Isso mostra que alem de gerar dados com TTS, precisamos 
incluir **audio real** no fine-tuning. + +## Metricas alvo + +| Metrica | Exp 02 (do zero) | Pipecat PT (ref) | Alvo (este exp) | +|---------|------------------|------------------|-----------------| +| Accuracy | 78.2% | 95.42% | > 96% | +| False Positive | 16.8% | 2.79% | < 2.5% | +| False Negative | 5.1% | 1.79% | < 2.0% | +| F1 | 0.798 | ~0.96 | > 0.96 | +| Tamanho modelo | 30.5 MB | 8 MB (ONNX INT8) | ~8 MB | +| Inferencia CPU | ~12ms | 12ms | ~12ms | + +## Correcoes apos analise de referencias + +*(2026-03-16)* + +Baixamos 6 papers, 10 blog posts e 7 guias tecnicos (ver `references/`). A analise cruzada revelou problemas no plano original: + +| Mudanca | Antes | Depois | Fonte | +|---------|-------|--------|-------| +| Focal Loss alpha | 0.6 | **0.25** | Lin et al. 2017, Table 1a | +| Batch size | 32 | **128** | Pipecat train.py usa 384 | +| Epochs | 10 | **6** | Pipecat usa 4 em 270K | +| Label smoothing | 0.05 | **removido** | EMNLP 2022: dupla regularizacao com FL | +| Learning rate | diferencial (0.1x encoder) | **uniforme 5e-5** | Pipecat train.py usa lr unica | +| Ruido augmentation | Gaussiano | **ruido real** (cafe/escritorio) | Pipecat v3.2: -40% erros | +| ONNX opset | 17 | **18** | Pipecat train.py | +| Quantizacao | nenhuma | **INT8 estatica** (entropy, 1024 calib) | Pipecat deploy: 32MB → 8MB | +| Loss alternativa | so Focal | **Focal + BCE** comparados | Pipecat original usa BCE | + +### Tecnicas adicionais identificadas + +1. **Knowledge distillation** (LiveKit: -39% interrupcoes, +45.97% melhoria PT) +2. **Short utterance dataset** (Pipecat v3.2: -40% erros em respostas curtas) — implementado +3. **Audio real misturado com TTS** (SpeculativeETD: F1 cai de 94.7% → 30.3% so com sintetico) — implementado via CORAA +4. **Pausa de 1.5-3s apos fillers** (SpeculativeETD V3: melhor variante) — implementado +5. 
**Threshold per-language** (LiveKit: languages.json com thresholds por lingua) + +## Estrutura de arquivos + +``` +03-finetune-pipecat-pt/ + README.md # Este documento + 01_download_pipecat.py # Baixa modelo pre-treinado do HuggingFace + 02_generate_labels.py # Claude API gera/filtra/classifica frases PT + 03_generate_audio.py # TTS gera audio (nativo + sotaque frances) + 04_finetune.py # Fine-tune com custo assimetrico + dados L2 + 05_evaluate.py # Avaliacao com dual threshold sweep + 06_inference.py # Engine de inferencia com backchannel + CEFR presets + modal_run.py # Deploy no Modal (GPU) + + references/ + hesitation_turn_taking_l2_review.md # Mini-artigo com 35 referencias + papers/ # 6 PDFs academicos + summaries + focal_loss_lin_2017.* + speculative_etd_2025.* + vap_turn_taking_ekstedt_2024.* + turn_taking_review_skantze_2021.* + soda_dialog_distillation_2023.* + finite_state_turn_taking_raux_2009.* + papers/hesitation-l2-french/ # 19 PDFs + 10 MD (L2 hesitacao/francofono) + papers/language-learning-turn-taking/ # Pesquisa L2 turn-taking (2026-03-16) + survey_turn_taking_iwsds2025.* # Survey IWSDS 2025 + multilingual_vap_2024.* # VAP multilingual + speak_improve_corpus_2025.* # Speak & Improve L2 corpus + hesitation_tagging_l2_whisper.* # Whisper+LoRA hesitation + conversar_mixed_reality_l2.* # ConversAR (Meta Quest) + deepgram_flux.* # Deepgram Flux overview + hume_evi.* # Hume EVI overview + praktika_openai.* # Praktika case study + tavus_turn_taking_guide.* # Tavus guide + blogs/ # 10 blog posts + guides/ # 7 technical guides + + data/ # (gitignored) + pipecat_pt_audio/ # Audio PT do Pipecat v3.2 + pipecat_pt_test/ # Audio PT test do Pipecat v3.2 + claude_labeled/ # Frases processadas pelo Claude (JSON) + tts_dataset/ # Audio gerado (nativo + sotaque frances) + noise_samples/ # Ruido real CC-0 (cafe, escritorio) + + results/ # (gitignored) + best_model.pt # Modelo treinado + smart_turn_pt_v3.onnx # ONNX INT8 (~8 MB) + smart_turn_pt_v3_fp32.onnx # ONNX 
FP32 (~32 MB) + training_results.json # Metricas + evaluation_results.json # Comparacao com baseline +``` + +## Cronograma + +| Etapa | Descricao | Tempo | +|-------|-----------|-------| +| 1 | Baixar e inspecionar modelo Pipecat | 1h | +| 2 | Script de labeling com Claude API | 1 dia | +| 3 | Gerar audio TTS (nativo PT-BR) | 1 dia | +| 4 | Gerar audio com sotaque frances | 1-2 dias | +| 5 | Fine-tune no Modal | 30 min | +| 6 | Avaliacao + dual threshold sweep | 2h | +| 7 | Teste no avatar conversacional | 1 dia | +| **Total** | | **~5 dias** | + +## Dependencias + +``` +# Python +torch, torchaudio, transformers # Modelo +anthropic # Claude API (labeling) +kokoro, TTS # Geracao de audio +datasets, soundfile, librosa # Processamento +modal # Deploy GPU + +# APIs +ANTHROPIC_API_KEY # Claude Haiku pra labeling +MODAL_TOKEN # Modal pra treino GPU + +# Modelos +pipecat-ai/smart-turn-v3 # HuggingFace — modelo pre-treinado +openai/whisper-tiny # HuggingFace — encoder base +``` + +## Referencias + +Documento completo de pesquisa com 35 referencias: [`references/hesitation_turn_taking_l2_review.md`](references/hesitation_turn_taking_l2_review.md) + +**Principais:** + +| # | Referencia | Contribuicao | +|---|-----------|--------------| +| 1 | Pipecat Smart Turn v3 (Daily, 2025) | Modelo base, pipeline de dados | +| 2 | Lin et al. (ICCV 2017) | Focal Loss, alpha=0.25 | +| 3 | Skantze (CSL 2021) | Survey de turn-taking | +| 4 | Knill et al. (2025) | Speak & Improve L2 corpus | +| 5 | Saeki et al. (2025) | Whisper+LoRA hesitation tagging | +| 6 | Gamboa et al. (2025) | ConversAR, custo assimetrico L2 | +| 7 | Deepgram Flux (2025) | Dual threshold, speculative ASR | +| 8 | Ekstedt et al. 
(2024) | VAP multilingual | +| 9 | Raux & Eskenazi (NAACL 2009) | FSM turn-taking | +| 10 | LiveKit (2025) | Knowledge distillation, 500M params | diff --git a/03-finetune-pipecat-pt/modal_run.py b/03-finetune-pipecat-pt/modal_run.py new file mode 100644 index 0000000000000000000000000000000000000000..c52fc493b6ccaf62feea588ff5b08316643333b7 --- /dev/null +++ b/03-finetune-pipecat-pt/modal_run.py @@ -0,0 +1,267 @@ +"""Run the full fine-tuning pipeline on Modal (A10G GPU). + +Usage: + modal run modal_run.py + +This deploys the pipeline on a Modal A10G GPU (~$0.50/run): +1. Downloads Pipecat Portuguese data +2. Generates labels with Claude API (if ANTHROPIC_API_KEY set) +3. Generates TTS audio with Kokoro +4. Fine-tunes SmartTurnV3Model +5. Evaluates against Pipecat baseline +6. Saves results to Modal volume + +Estimated time: 30-60 min total +Estimated cost: ~$0.50-1.00 +""" + +from __future__ import annotations + +import modal + +app = modal.App("babelcast-finetune-pipecat-pt") + +# Persistent volume for data + results +volume = modal.Volume.from_name("finetune-pipecat-pt", create_if_missing=True) + +# GPU image with all dependencies +image = ( + modal.Image.debian_slim(python_version="3.11") + .pip_install( + # Core ML + "torch>=2.1", + "torchaudio>=2.1", + "transformers>=4.36", + "datasets>=2.16", + # Audio processing + "soundfile>=0.12", + "librosa>=0.10", + "numpy<2", + # TTS + "kokoro>=0.3", + # ONNX for evaluation + "onnxruntime>=1.16", + # Claude API for labeling + "anthropic>=0.40", + # Utilities + "huggingface-hub>=0.20", + ) + .apt_install("ffmpeg", "libsndfile1") +) + + +@app.function( + image=image, + gpu="A10G", + timeout=3600, # 1 hour max + volumes={"/workspace": volume}, + secrets=[ + modal.Secret.from_name("anthropic-api-key", required=False), + modal.Secret.from_name("huggingface-token", required=False), + ], +) +def run_pipeline( + skip_download: bool = False, + skip_labels: bool = False, + skip_audio: bool = False, + skip_train: bool = False, + 
def _load_script(script_dir, filename):
    """Import one of the numbered pipeline scripts (e.g. ``01_download_pipecat.py``).

    The script filenames start with digits, so they are not valid module
    names for a regular ``import`` statement; ``importlib`` loads them
    directly from their file path instead.
    """
    import importlib.util

    module_name = filename.rsplit(".", 1)[0]
    spec = importlib.util.spec_from_file_location(module_name, script_dir / filename)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


@app.function(
    image=image,
    gpu="A10G",
    timeout=3600,  # 1 hour max
    volumes={"/workspace": volume},
    secrets=[
        modal.Secret.from_name("anthropic-api-key", required=False),
        modal.Secret.from_name("huggingface-token", required=False),
    ],
)
def run_pipeline(
    skip_download: bool = False,
    skip_labels: bool = False,
    skip_audio: bool = False,
    skip_train: bool = False,
    max_pipecat_samples: int = 5000,
    max_tts_samples: int = 10000,
    max_l2_samples: int = 2000,
    epochs: int = 6,
    batch_size: int = 128,
    lr: float = 5e-5,
    loss_fn: str = "focal",
    fp_penalty: float = 2.0,
):
    """Run the full fine-tuning pipeline on a Modal A10G GPU.

    Steps: (1) download Pipecat Portuguese data, (2) generate labels with
    the Claude API, (3) generate TTS audio, (4) fine-tune, (5) evaluate.
    The Modal volume is committed after every step so partial progress
    survives a failure. Steps 2 and 3 are best-effort (a failure logs a
    warning and the pipeline continues); steps 1 and 4 are mandatory and
    re-raise on failure.

    Fix: a stray ``from import_module_01 import ...`` line previously sat
    before the Step 1 ``try`` block; the module does not exist, so the
    pipeline crashed with ``ModuleNotFoundError`` before the correct
    importlib-based loading ever ran. The scripts are now loaded only via
    ``_load_script``.
    """
    import logging
    import os
    import sys
    from pathlib import Path

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
    )
    log = logging.getLogger("modal_run")

    # Make sibling pipeline scripts importable from the container.
    script_dir = Path(__file__).parent
    sys.path.insert(0, str(script_dir))

    # Symlink ./data onto the Modal volume so downloads persist across runs.
    data_dir = script_dir / "data"
    workspace_data = Path("/workspace/data")
    workspace_data.mkdir(parents=True, exist_ok=True)
    if not data_dir.exists():
        data_dir.symlink_to(workspace_data)

    # Step 1: Download Pipecat data (mandatory — everything depends on it).
    if not skip_download:
        log.info("=" * 60)
        log.info("STEP 1: Download Pipecat Portuguese data")
        log.info("=" * 60)
        try:
            dl = _load_script(script_dir, "01_download_pipecat.py")
            dl.download_pipecat_dataset(max_pt_samples=max_pipecat_samples)
            dl.download_pipecat_test_data(max_pt_samples=2000)
            dl.download_onnx_model()
        except Exception as e:
            log.error("Download failed: %s", e)
            raise

        volume.commit()
        log.info("Step 1 complete — data saved to volume")

    # Step 2: Generate labels (best-effort; Claude is used only when the
    # ANTHROPIC_API_KEY secret is present, otherwise the script's
    # rule-based fallback applies).
    if not skip_labels:
        log.info("=" * 60)
        log.info("STEP 2: Generate labels with Claude API")
        log.info("=" * 60)
        try:
            labels = _load_script(script_dir, "02_generate_labels.py")

            if os.environ.get("ANTHROPIC_API_KEY"):
                log.info("ANTHROPIC_API_KEY found — using Claude for labeling")
            else:
                log.info("No ANTHROPIC_API_KEY — using rule-based fallback")

            labels.run_full_pipeline(max_transcripts=5000, max_fr_sentences=500)
        except Exception as e:
            log.warning("Label generation failed: %s — continuing without custom labels", e)

        volume.commit()

    # Step 3: Generate TTS audio (best-effort; Pipecat data alone still
    # allows training if TTS fails).
    if not skip_audio:
        log.info("=" * 60)
        log.info("STEP 3: Generate TTS audio")
        log.info("=" * 60)
        try:
            audio = _load_script(script_dir, "03_generate_audio.py")
            audio.run_audio_generation(
                max_native_sentences=3000,
                max_french_sentences=1000,
                augment_copies=2,
            )
        except Exception as e:
            log.warning("TTS generation failed: %s — continuing with Pipecat data only", e)

        volume.commit()

    # Step 4: Fine-tune (mandatory when not skipped).
    if not skip_train:
        log.info("=" * 60)
        log.info("STEP 4: Fine-tune SmartTurnV3Model")
        log.info("=" * 60)
        try:
            ft = _load_script(script_dir, "04_finetune.py")
            ft.train(
                epochs=epochs,
                batch_size=batch_size,
                lr=lr,
                warmup_ratio=0.2,
                max_pipecat_samples=max_pipecat_samples,
                max_tts_samples=max_tts_samples,
                max_l2_samples=max_l2_samples,
                loss_fn=loss_fn,
                fp_penalty=fp_penalty,
            )
        except Exception as e:
            log.error("Training failed: %s", e)
            raise

        volume.commit()

    # Step 5: Evaluate (best-effort — a failed evaluation should not
    # discard a successfully trained model).
    log.info("=" * 60)
    log.info("STEP 5: Evaluate")
    log.info("=" * 60)
    try:
        ev = _load_script(script_dir, "05_evaluate.py")

        model_path = Path("/workspace/results") / "best_model.pt"
        if model_path.exists():
            ev.run_evaluation(model_path)
            log.info("Evaluation complete!")
        else:
            log.warning("No model found at %s — skipping evaluation", model_path)
    except Exception as e:
        log.error("Evaluation failed: %s", e)

    volume.commit()
    log.info("=" * 60)
    log.info("PIPELINE COMPLETE — results saved to Modal volume 'finetune-pipecat-pt'")
    log.info("=" * 60)


@app.function(
    image=image,
    volumes={"/workspace": volume},
)
def download_results(local_dir: str = "results") -> list[str]:
    """List result files stored on the Modal volume.

    Returns the file paths relative to ``/workspace/results`` and prints
    each file's name and size in KB.

    NOTE(review): ``local_dir`` is currently unused — nothing is copied to
    the local machine despite the function name; the parameter is kept for
    interface compatibility. TODO: implement the actual local copy or
    retrieve files via ``modal volume get``.
    """
    from pathlib import Path

    results_dir = Path("/workspace/results")
    if not results_dir.exists():
        print("No results found on volume")
        return []

    files: list[str] = []
    for f in results_dir.rglob("*"):
        if f.is_file():
            files.append(str(f.relative_to(results_dir)))
            print(f"  {f.name}: {f.stat().st_size / 1024:.1f} KB")

    return files


@app.local_entrypoint()
def main(
    skip_download: bool = False,
    skip_labels: bool = False,
    skip_audio: bool = False,
    skip_train: bool = False,
    download_only: bool = False,
    epochs: int = 6,
    batch_size: int = 128,
):
    """Entry point for ``modal run modal_run.py``.

    Fix: defaults now match ``run_pipeline`` (6 epochs, batch 128). The
    previous defaults (10 epochs, batch 32) silently overrode the corrected
    hyperparameters documented in the README's "Correcoes apos analise de
    referencias" table on every default invocation.
    """
    if download_only:
        files = download_results.remote()
        print(f"\nFound {len(files)} result files on volume")
        return

    run_pipeline.remote(
        skip_download=skip_download,
        skip_labels=skip_labels,
        skip_audio=skip_audio,
        skip_train=skip_train,
        epochs=epochs,
        batch_size=batch_size,
    )
Detection (Endpointing) Solves the Biggest Challenge in Voice Agent Development" +source: https://www.assemblyai.com/blog/turn-detection-endpointing-voice-agent +date_accessed: 2026-03-16 +--- + +# How Intelligent Turn Detection (Endpointing) Solves the Biggest Challenge in Voice Agent Development + +## Introduction + +Voice agents enable natural conversations between humans and AI systems, representing one of Speech AI's fastest-growing applications. However, a persistent challenge affects all voice agent implementations: **turn detection**, or determining when a human finishes speaking so the AI can respond appropriately. + +## The Latency Problem in Voice Agents + +Voice agent developers prioritize low latency for optimal user experience. Three fundamental latencies exist in streaming speech-to-text models: + +1. **Partial transcript latency** -- Speed of returning initial transcript predictions +2. **Final transcript latency** -- Speed of returning finalized transcripts after speech ends +3. **Endpointing latency** -- Speed of detecting when someone stops speaking + +Voice agents specifically require fast endpointing latency. Developers have historically over-optimized for general latency reduction, treating end-of-turn detection as secondary. However, addressing turn detection provides "far greater improvements to the user experience than incremental latency optimizations," as endpointing delay exists at a different magnitude than millisecond-level latency gains. + +## Three Endpointing Methods + +### Manual Endpointing +Users explicitly indicate completion through button presses or voice commands. This creates poor user experience and harms adoption despite potentially fast response times. + +### Silence Detection +The current industry standard waits for specified silence duration thresholds. 
This approach balances responsiveness against premature interruptions but struggles finding optimal threshold values -- long enough to avoid cutoffs mid-thought, yet short enough to maintain conversational fluidity. + +### Semantic Endpointing +Advanced systems analyze what someone says rather than just silence duration. This approach understands when thoughts are complete, distinguishing natural pauses mid-sentence from genuine utterance endings. Modern implementations use language models to predict sentence boundaries and semantic completeness. + +## How Semantic Endpointing Works + +Semantic endpointing encompasses multiple implementation approaches. AssemblyAI's Universal-Streaming model predicts a special token that the model learns to identify during training based on context. + +### Configuration Parameters + +Several user-configurable options enable developers to tune endpointing behavior: + +- **end_of_turn_confidence_threshold** (default: 0.7) -- Required confidence level for end-of-turn predictions +- **min_end_of_turn_silence_when_confident** (default: 160ms) -- Minimum silence duration after confident prediction to prevent false positives +- **max_turn_silence** (default: 2400ms) -- Fallback silence-based endpointing for unusual speech patterns + +The system requires three conditions for semantic end-of-turn: + +1. Model predicts end-of-turn with sufficient confidence +2. Minimum silence duration has passed +3. Minimal speech present to prevent false positives from audio noise + +Traditional silence-based endpointing remains enabled as a final catch-all mechanism for atypical patterns. 
+ +## Comparative Analysis: Turn Detection Approaches + +### LiveKit: Semantic-Only Approach + +**Input:** Transcribed text only +**Key Dependency:** Voice Activity Detection (VAD) +**Processing:** Runs after VAD detects speech end + +**Strengths:** +- Simple, focused semantic analysis +- Open source customization available + +**Limitations:** +- Heavy VAD dependency creates delays if background noise triggers continuous VAD +- Ignores audio cues humans naturally use for turn boundaries +- Tends toward maximum delay periods, creating sluggish conversations + +### Pipecat: Audio-Centric Detection + +**Input:** Audio features only (prosody, intonation) +**Key Dependency:** Direct audio analysis +**Processing:** Real-time audio pattern recognition + +**Strengths:** +- Leverages natural prosodic and intonation cues +- Potentially faster turn detection than text-dependent models +- Open source flexibility + +**Limitations:** +- Sensitive to background noise interference +- Performance varies significantly with speaker accents +- Struggles with atypical prosodic patterns +- Limited by audio quality availability + +### AssemblyAI: Hybrid Approach + +**Input:** Both transcribed text and audio context +**Key Dependency:** Integrated approach with dynamic thresholds +**Processing:** Context-aware analysis enabling early turn detection during silence based on syntactic completeness + +**Strengths:** +- Robust across varying acoustic conditions +- Dynamic adaptation based on sentence completeness rather than static VAD +- Better background noise handling through semantic backup +- More natural conversation flow with context-aware timing + +**Limitations:** +- Closed source limits customization +- Potentially higher computational requirements +- Depends on transcription quality for optimal performance + +## Feature Comparison Matrix + +| Feature | LiveKit | Pipecat | AssemblyAI | +|---------|---------|---------|-----------| +| Input Type | Text only | Audio only | Text + Audio 
| +| VAD Dependency | High | None | Low | +| Noise Robustness | Poor (via VAD) | Poor | Good | +| Speaker Variation | Good | Poor | Good | +| Response Speed | Slow | Fast | Adaptive | +| Complexity | Low | Medium | High | + +## Selection Guidance + +**Choose LiveKit if:** +- You need simple, semantic-focused approaches +- You have clean audio environments with minimal background noise +- Response speed is less critical than accuracy + +**Choose Pipecat if:** +- You have consistent speaker profiles and clean audio +- You want to experiment with pure audio-based detection + +**Choose AssemblyAI if:** +- You want combined semantic and acoustic endpointing +- You handle diverse speakers and noisy environments +- Natural conversation flow is critical +- Deployment flexibility matters + +## The Future of Turn Detection + +Single-modality solutions offer simplicity and specialized performance, while hybrid approaches like AssemblyAI's demonstrate benefits of combining multiple signal types for robust and natural conversational experiences. Future developments will likely incorporate conversational context, speaker identification, and multimodal cues (including visual information in relevant scenarios). + +The optimal choice depends on specific use case requirements, technical constraints, and acceptable trade-offs between accuracy, speed, and robustness. 
diff --git a/03-finetune-pipecat-pt/references/blogs/deepgram_eot_evaluation.md b/03-finetune-pipecat-pt/references/blogs/deepgram_eot_evaluation.md new file mode 100644 index 0000000000000000000000000000000000000000..8ba22b2fc38e1f53f913929d1d7b44fda14fdb31 --- /dev/null +++ b/03-finetune-pipecat-pt/references/blogs/deepgram_eot_evaluation.md @@ -0,0 +1,58 @@ +--- +title: "Evaluating End-of-Turn Detection Models" +source: https://deepgram.com/learn/evaluating-end-of-turn-detection-models +date_accessed: 2026-03-16 +--- + +# Evaluating End-of-Turn (Turn Detection) Models + +## Overview + +Natural voice agent interactions depend critically on accurate turn-taking behavior. End-of-Turn (EoT) detection -- predicting when a speaker has finished and awaits a response -- is essential for creating conversational agents that feel responsive rather than interruptive. + +Deepgram developed a comprehensive evaluation methodology for turn detection after finding existing approaches inadequate. Their approach prioritizes real-world performance over proxy metrics, evaluating "complete conversations between humans" rather than isolated turns. + +## Full Conversational Evaluation + +Rather than testing individual turns, Deepgram analyzes entire conversations. This methodology reflects realistic scenarios where: + +- **Natural labeling**: Conversation partners provide high-quality, low-latency ground truth detection +- **Variable detection budgets**: Different turns demand faster responses based on context (simple queries versus complex reasoning) +- **Natural pause patterns**: Pre-turn silences enable evaluation of start-of-speech detection and backchannel phenomena + +The team labeled over 100 hours of real conversations with ground truth transcripts and timestamped EoT labels. Notably, refining their annotation specification from "End-of-Thought" to "End-of-Turn" improved annotator confidence from 5/10 to 8.5/10 by reducing ambiguity. 
+ +## The Challenge with Timestamps + +Human annotations revealed an unexpected problem: annotators consistently left small gaps between speech completion and label placement. Deepgram discovered their system frequently detected EoT within these gaps -- appearing as false positives under naive temporal alignment. + +### Forced Alignment Solution + +Rather than trusting raw human timestamps, Deepgram applied "forced alignment to update the human timestamps," extracting more precise end-of-speech markers. While conservative human placement might reflect natural pauses before responses, voice agents benefit from fastest possible detection since "their turn starts are frequently delayed relative to EoT detection due to extra latency associated with LLM and TTS generation." + +## Sequence Alignment for Improved Evaluation + +Standard temporal matching proved insufficiently precise. Deepgram adopted sequence alignment -- treating turn boundaries as special tokens (`[EoT]`) within transcripts, similar to Word Error Rate (WER) calculation. + +**Impact**: This shift yielded "3-5% absolute increases in precision and recall across models," with "manual investigation of results suggested the new values were more representative of the true performance." + +The approach handled all detector types: all-in-one STT solutions, text-based detectors, and audio-only models (using Nova-3 for inter-turn transcription). + +## Handling Dropped Turns + +A modified Levenshtein algorithm addressed cases where intermediate turns were missed. Beyond simple edit distance, the system "determines the best alignment not only based on overall edit distance but also based on the most likely alignment between EoT tokens," preventing misalignment when turns are dropped. + +## Start-of-Turn (SoT) Evaluation + +Complementing EoT analysis, Deepgram evaluated "start-of-turn (SoT)" detection -- identifying when users resume speaking, critical for handling interruptions or "barge-in" scenarios. 
+ +**Flux performance metrics**: +- Detection within ~100-200ms of first word onset +- False positive rate: less than or equal to 1-2% +- Analysis used word start times as detection targets (representing unachievable lower bounds) + +Negative latency directly indicates false positives -- detecting speech before it actually begins. + +## Future Directions + +The team highlighted emerging evaluation frontiers: "one could combine the various aspects of turn detection, such as false positive rate and latency, into single quality metrics." They specifically mentioned their Voice Agent Quality Index (VAQI) framework and anticipated incorporating semantic/conversational flow considerations -- enabling "stratified or weighted metrics that reflect that some turns merit faster responses than others." diff --git a/03-finetune-pipecat-pt/references/blogs/krisp_turn_taking_v1.md b/03-finetune-pipecat-pt/references/blogs/krisp_turn_taking_v1.md new file mode 100644 index 0000000000000000000000000000000000000000..69ed3057b5ddb4e5c8aaff1ff6d9e533f9a4b2b8 --- /dev/null +++ b/03-finetune-pipecat-pt/references/blogs/krisp_turn_taking_v1.md @@ -0,0 +1,78 @@ +--- +title: "Turn-Taking for Voice AI" +source: https://krisp.ai/blog/turn-taking-for-voice-ai/ +date_accessed: 2026-03-16 +--- + +# Audio-only, 6M Weights Turn-Taking Model for Voice AI Agents + +## Overview + +Krisp has developed a lightweight turn-taking model designed to determine when speakers should transition in voice conversations. This technology is included at no additional cost in Krisp's VIVA SDK. + +## What is Turn-Taking? + +Turn-taking represents "the fundamental mechanism by which participants in a conversation coordinate who speaks when." In voice AI contexts, it manages when agents should listen, speak, or remain silent. 
The model addresses two primary tasks: + +- **End-of-turn prediction**: Detecting when the current speaker will finish +- **Backchannel prediction**: Recognizing brief acknowledgments like "uh-huh" without speaker transitions + +## Implementation Approaches + +**Audio-based methods** analyze acoustic features including pitch variations, energy levels, intonation, pauses, and speaking rate. These enable low-latency responses critical for real-time scenarios. + +**Text-based approaches** examine transcribed content, identifying linguistic cues like sentence boundaries and discourse markers, though they typically require larger architectures. + +**Multimodal (fusion) systems** combine both modalities, leveraging acoustic cues alongside semantic understanding. + +## Key Challenges + +- **Hesitation vs. completion**: Distinguishing filler words ("um," "you know") from true turn endings +- **Natural pauses**: Differentiating conversational pauses from actual turn boundaries +- **Response speed**: Minimizing latency while maintaining accuracy +- **Speaking diversity**: Accounting for varying rhythms, accents, and intonation patterns + +## Model Architecture + +The Krisp model processes 100ms audio frames, outputting confidence scores (0-1) indicating shift likelihood. A configurable threshold determines binary shift predictions, with a default 5-second maximum hold duration. 
+ +## Performance Comparison + +| Attribute | Krisp TT | SmartTurn v1 | SmartTurn v2 | VAD-based | +|-----------|----------|-------------|-------------|-----------| +| Parameters | 6.1M | 581M | 95M | 260k | +| Model Size | 65 MB | 2.3 GB | 360 MB | 2.3 MB | +| Execution | CPU | GPU | GPU | CPU | + +## Evaluation Metrics + +### Accuracy Results + +| Model | Balanced Accuracy | AUC Shift | F1 Score Shift | F1 Score Hold | AUC (MST vs FPR) | +|-------|------------------|-----------|----------------|---------------|-----------------| +| Krisp TT | 0.82 | 0.89 | 0.80 | 0.83 | 0.21 | +| VAD-based | 0.59 | -- | 0.48 | 0.70 | -- | +| SmartTurn V1 | 0.78 | 0.86 | 0.73 | 0.84 | 0.39 | +| SmartTurn V2 | 0.78 | 0.83 | 0.76 | 0.78 | 0.44 | + +### Training Data + +- Approximately 2,000 hours of conversational speech +- Around 700,000 speaker turns +- Test dataset: 1,875 manually labeled audio samples + +## Key Performance Findings + +The Krisp model achieves "considerably faster average response time (0.9 vs. 1.3 seconds at a 0.06 FPR) compared to SmartTurn" while being 5-10 times smaller and optimized for CPU execution. 
+ +## Future Development + +**Planned enhancements include:** + +- Text-based turn-taking using custom neural networks +- Multimodal audio-text fusion for improved accuracy +- Backchannel detection to distinguish meaningful interruptions from casual listening acknowledgments + +--- + +*Article published August 5, 2025 by Krisp Engineering Team* diff --git a/03-finetune-pipecat-pt/references/blogs/krisp_turn_taking_v2.md b/03-finetune-pipecat-pt/references/blogs/krisp_turn_taking_v2.md new file mode 100644 index 0000000000000000000000000000000000000000..2b3b39cff7999180932ce7c57404fc93584cccbe --- /dev/null +++ b/03-finetune-pipecat-pt/references/blogs/krisp_turn_taking_v2.md @@ -0,0 +1,54 @@ +--- +title: "Krisp Turn-Taking v2 - Voice AI VIVA SDK" +source: https://krisp.ai/blog/krisp-turn-taking-v2-voice-ai-viva-sdk/ +date_accessed: 2026-03-16 +--- + +# Audio-Only Turn-Taking Model v2 - Krisp + +## Overview + +Krisp has released Turn-Taking v2, an updated model for detecting end-of-turns in real-time conversational AI systems. The model processes audio input only, making it suitable for "human-bot interactions" and other voice AI applications integrated through Krisp's VIVA SDK. + +## Technical Architecture + +The latest iteration, **krisp-viva-tt-v2**, represents a substantial advancement over its predecessor. According to the engineering team, it was "trained on a more diverse and better-structured dataset, with richer data augmentations that help the model perform more reliably in real-world conditions." 
+ +## Key Improvements + +- Enhanced robustness in noisy environments +- Superior accuracy when combined with Krisp's Voice Isolation models +- Faster and more stable turn detection during live conversations + +## Performance Benchmarks + +### Clean Audio Testing + +Testing evaluated approximately 1,800 real conversation samples (~1,000 "hold" cases and ~800 "shift" cases) with mild background noise: + +| Model | Balanced Accuracy | AUC | F1 Score | +|-------|-------------------|-----|----------| +| krisp-viva-tt-v1 | 0.82 | 0.89 | 0.804 | +| **krisp-viva-tt-v2** | **0.823** | **0.904** | **0.813** | + +### Noisy Audio Testing (5-15 dB noise levels) + +| Model | Balanced Accuracy | AUC | F1 Score | +|-------|-------------------|-----|----------| +| krisp-viva-tt-v1 | 0.723 | 0.799 | 0.71 | +| **krisp-viva-tt-v2** | **0.768** | **0.842** | **0.757** | + +V2 demonstrated "up to a 6% improvement in F1 score under noisy conditions." + +### Post-Processing with Voice Isolation + +After applying background noise and voice removal through krisp-viva-tel-v2: + +| Model | Balanced Accuracy | AUC | F1 Score | +|-------|-------------------|-----|----------| +| krisp-viva-tt-v1 | 0.787 | 0.854 | 0.775 | +| **krisp-viva-tt-v2** | **0.816** | **0.885** | **0.808** | + +## Availability + +The model is now available as part of Krisp's VIVA SDK, designed for developers building Voice AI agents and conversational systems. 
diff --git a/03-finetune-pipecat-pt/references/blogs/livekit_eot_v0_4.md b/03-finetune-pipecat-pt/references/blogs/livekit_eot_v0_4.md new file mode 100644 index 0000000000000000000000000000000000000000..f17bb68473b69b648709150dec0d75ef2d6fb459 --- /dev/null +++ b/03-finetune-pipecat-pt/references/blogs/livekit_eot_v0_4.md @@ -0,0 +1,104 @@ +--- +title: "Improved End-of-Turn Model Cuts Voice AI Interruptions 39%" +source: https://livekit.com/blog/improved-end-of-turn-model-cuts-voice-ai-interruptions-39/ +date_accessed: 2026-03-16 +--- + +# Improved End-of-Turn Model Cuts Voice AI Interruptions 39% + +## Overview + +LiveKit released an updated transformer-based end-of-turn detection model, `v0.4.1-intl`, featuring "a 39.23% relative reduction in false-positive interruptions" compared to the previous version. The model now emphasizes structured data handling and multilingual performance across supported languages. + +## Key Improvements + +### Performance Metrics + +The benchmark demonstrates consistent gains across all tested languages: + +| Language | v0.3.0 Error Rate | v0.4.1 Error Rate | Relative Improvement | +|----------|------------------|------------------|----------------------| +| Chinese | 18.70% | 13.40% | 28.34% | +| Dutch | 26.10% | 11.90% | 54.33% | +| English | 16.60% | 13.00% | 21.69% | +| French | 16.80% | 11.10% | 33.93% | +| German | 23.40% | 12.20% | 47.86% | +| Hindi | 5.40% | 3.70% | 31.48% | +| Indonesian | 17.00% | 10.60% | 37.65% | +| Italian | 20.10% | 14.90% | 25.87% | +| Japanese | 19.70% | 11.20% | 43.15% | +| Korean | 7.90% | 5.50% | 30.38% | +| Portuguese | 23.30% | 12.60% | 45.97% | +| Russian | 19.50% | 12.00% | 38.46% | +| Spanish | 21.50% | 14.00% | 33.88% | +| Turkish | 25.40% | 12.70% | 50.0% | +| **All** | **18.66%** | **11.34%** | **39.23%** | + +Error rate represents the false positive rate at a fixed true positive rate of 99.3%. 
+ +## Technical Architecture + +### The Challenge of End-of-Turn Detection + +Voice AI systems must detect speech completion using three primary cues: + +- **Semantic content**: Word meaning and language understanding +- **Context**: Dialogue history and conversational flow +- **Prosody**: Tone, pauses, and rhythmic patterns + +The model uses an LLM backbone to effectively combine content and context information. + +### Structured Data Handling + +A primary enhancement addresses structured information collection. When users provide phone numbers, emails, or addresses, natural speech markers (intonation changes, grammatical endings) are absent. The updated model "addresses this by inferring expected formats from the agent's prompt," enabling: + +- Detection of complete phone numbers (typically 10 digits for US numbers) +- Email pattern recognition +- Address validation including street, city, state, and zip code + +The visualization tool demonstrates the improvement: `v0.3.0-intl` incorrectly detected end points after individual digits, while `v0.4.1-intl` waited for the complete sequence. + +### Handling Speech-to-Text Variability + +Training data incorporated multiple STT output formats. One engine might transcribe "forty two" as words while another outputs "42" numerically. By training across "common STT output formats," the model maintains consistent performance across different provider implementations. + +### Multilingual Generalization + +Despite structured data enhancements targeting English training data, improvements transferred across languages -- particularly for phone number and address detection in Spanish, French, and other languages. This stems from the Qwen2.5 base model's multilingual pretraining, which encodes "knowledge of global formats." + +## Model Architecture & Optimization + +### Base Model Selection + +The team selected **Qwen2.5-0.5B-Instruct** for its balance of performance and low-latency CPU inference capabilities. 
+ +### Knowledge Distillation + +To improve multilingual accuracy without sacrificing efficiency: + +1. A larger **Qwen2.5-7B-Instruct** teacher model was trained +2. Knowledge was distilled into the smaller 0.5B student model +3. The distilled model achieves "higher multilingual accuracy with the efficiency of the smaller size" + +Training curves show the distilled model outperforming the baseline 0.5B version and approaching teacher performance after approximately 1,500 steps. + +## Availability & Deprecation + +- **Deployment**: Available in Agents Python 1.3.0 and Agents JS 1.0.19 +- **Model**: `MultilingualModel` now recommended for all use cases +- **Deprecation**: The legacy `EnglishModel` is being deprecated as the multilingual version "not only matches but in most cases exceeds the performance" + +## Observability Integration + +LiveKit Agents now include built-in observability for end-of-turn detection. When Agent Observability is enabled, "every turn detection decision is logged with the exact input the model saw." Production debugging accesses `eou_detection` traces showing full prediction context. + +## Future Directions + +Current implementation relies on transcribed text. Future iterations will integrate raw audio features like pauses and emphasis through multimodal architectures, "fusing prosody directly into predictions for more precise detection." 
+ +--- + +**Resources**: +- [GitHub Repository](https://github.com/livekit/agents) +- [Visualization Tool](https://huggingface.co/spaces/livekit/eot-visualization) +- [Voice AI Quickstart](https://docs.livekit.io/agents/start/voice-ai/) diff --git a/03-finetune-pipecat-pt/references/blogs/livekit_transformer_turn_detection.md b/03-finetune-pipecat-pt/references/blogs/livekit_transformer_turn_detection.md new file mode 100644 index 0000000000000000000000000000000000000000..42df1873df65a23f3dc7ed652dac4f2905844c6e --- /dev/null +++ b/03-finetune-pipecat-pt/references/blogs/livekit_transformer_turn_detection.md @@ -0,0 +1,87 @@ +--- +title: "Using a Transformer to Improve End of Turn Detection" +source: https://livekit.com/blog/using-a-transformer-to-improve-end-of-turn-detection +date_accessed: 2026-03-16 +--- + +# Using a Transformer to Improve End of Turn Detection + +**Published:** December 20, 2024 | **Author:** Russ D'Sa | **Reading Time:** 6 minutes + +## Overview + +End-of-turn detection represents one of the most challenging problems in voice AI applications. The core task involves determining when a user has finished speaking, allowing an AI model to respond without unintentionally interrupting the user. + +## Current Approach: Phrase Endpointing + +The predominant technique for turn detection is phrase endpointing, typically implemented through voice activity detection (VAD). This method operates as follows: + +- Audio packets are processed through a neural network to determine if human speech is present +- If speech is detected, the user continues speaking +- If silence is detected, a timer begins tracking the duration of the absence of detectable human speech +- Once a configured silence threshold passes, an end-of-turn event triggers, allowing LLM inference to begin + +LiveKit Agents uses "Silero VAD to detect voice activity and provide timing parameters" to adjust sensitivity. 
The framework introduces a configurable delay via the `min_endpointing_delay` parameter (default: 500ms) between when VAD detects speech cessation and when LLM inference begins. + +## The VAD Limitation + +VAD only detects *when* someone is speaking based on audio signal presence. Humans, however, use semantic understanding -- analyzing what is said and how it's expressed -- to determine turn-taking. For example, "I understand your point, but..." would trigger VAD's end-of-turn signal despite the human listener recognizing the speaker intends to continue. + +## The EOU (End of Utterance) Model + +LiveKit released "an open source transformer model that uses the content of speech to predict when a user has" finished speaking. The model incorporates semantic understanding into turn detection. + +### Model Architecture + +- **Base Model:** 135M parameter transformer based on SmolLM v2 from HuggingFace +- **Design Choice:** Small model size enables CPU-based, real-time inference +- **Context Window:** Sliding window of the last four conversational turns +- **Language Support:** Currently English transcriptions only; additional languages planned + +### How It Works + +During user speech, transcriptions from a speech-to-text (STT) service are appended word-by-word to the model's context window. For each final STT transcription, the model generates a confidence-level prediction regarding whether the current context represents turn completion. + +The model integrates with VAD by dynamically adjusting the silence timeout: longer silence periods are permitted when EOU indicates the user hasn't finished speaking, reducing interruptions while maintaining responsiveness. 
+ +## Performance Results + +Compared to VAD alone, the EOU + VAD approach achieves: + +- **85% reduction** in unintentional interruptions +- **3% false negative rate** (incorrectly indicating turn continuation) + +### Use Cases + +The model proves particularly valuable for conversational AI and customer support applications requiring data collection: + +- Conducting interviews +- Collecting addresses for shipments +- Gathering phone numbers +- Processing payment information +- Ordering transactions (pizza ordering demonstrated) + +## Implementation + +The turn detector is packaged as a LiveKit Agents plugin, enabling simple integration through a single additional parameter in the `VoicePipelineAgent` constructor: + +``` +turn_detector=TurnDetector.from_plugin() +``` + +Example implementation code is available in the LiveKit agents repository. + +## Future Development + +Planned improvements include: + +- **Multi-language Support:** Extending EOU beyond English +- **Inference Optimization:** Reducing current ~50ms inference latency +- **Context Window Expansion:** Increasing the four-turn window +- **Audio-Based Detection:** Developing models for multimodal systems that process audio directly, accounting for prosodic features like intonation and cadence + +The team acknowledges that EOU's text-based training limits its use with natively multimodal models such as OpenAI's Realtime API. An audio-native model is under development to address this constraint. + +## Broader Implications + +The researchers emphasize that "even humans don't get this right all the time" and that non-verbal cues improve human turn-taking performance. They consider this research essential for developing natural, humanlike AI interactions and anticipate significant community innovation in this area. 
diff --git a/03-finetune-pipecat-pt/references/blogs/pipecat_smart_turn_v3.md b/03-finetune-pipecat-pt/references/blogs/pipecat_smart_turn_v3.md new file mode 100644 index 0000000000000000000000000000000000000000..eca26630839608d8deb62065c11bc9691f6ef26e --- /dev/null +++ b/03-finetune-pipecat-pt/references/blogs/pipecat_smart_turn_v3.md @@ -0,0 +1,75 @@ +--- +title: "Announcing Smart Turn v3: CPU Inference in Just 12ms" +source: https://www.daily.co/blog/announcing-smart-turn-v3-with-cpu-inference-in-just-12ms/ +date_accessed: 2026-03-16 +--- + +# Smart Turn v3: CPU Inference Breakthrough + +## Overview + +Daily announced Smart Turn v3, a dramatically improved voice turn detection model featuring unprecedented efficiency. The system achieves "CPU inference in just 12ms" while maintaining open-source accessibility for weights, training data, and scripts. + +## Key Improvements + +### Size and Performance +- **Model size:** Reduced to 8 MB (nearly 50x smaller than v2) +- **CPU inference speed:** 12ms on modern processors; 60ms on budget AWS instances +- **No GPU required:** Runs directly within Pipecat Cloud instances + +### Language Coverage +Expanded to 23 languages including Arabic, Bengali, Chinese, Danish, Dutch, German, English, Finnish, French, Hindi, Indonesian, Italian, Japanese, Korean, Marathi, Norwegian, Polish, Portuguese, Russian, Spanish, Turkish, Ukrainian, and Vietnamese. 
+ +### Architecture +- **Foundation:** Whisper Tiny encoder (the ~8M-parameter encoder of the 39M-parameter Whisper Tiny model) +- **Classification layers:** Adapted from Smart Turn v2 +- **Total parameters:** 8M +- **Optimization:** int8 quantization via static QAT +- **Export format:** ONNX + +## Competitive Comparison + +| Metric | Smart Turn v3 | Krisp | Ultravox | +|--------|---------------|-------|----------| +| Size | 8 MB | 65 MB | 1.37 GB | +| Languages | 23 | English only | 26 | +| Availability | Open weights/data | Proprietary | Open weights | +| Focus | Single-inference latency | Decision confidence | Conversation context | + +## Performance Benchmarks + +CPU inference results (including preprocessing): + +| Platform | Speed | +|----------|-------| +| AWS c7a.2xlarge | 12.6 ms | +| AWS c8g.2xlarge | 15.2 ms | +| Modal (6 cores) | 17.7 ms | +| AWS t3.2xlarge | 33.8 ms | +| AWS c8g.medium | 59.8 ms | +| AWS t3.medium | 94.8 ms | + +GPU performance shows 3.3-6.6ms latency across various NVIDIA processors. + +## Accuracy Results + +Highest performers: Turkish (97.10%), Korean (96.85%), Japanese (96.76%) + +Lower performers: Vietnamese (81.27%), Bengali (84.10%), Marathi (87.60%) + +English achieved 94.31% accuracy across 2,846 test samples. + +## Implementation + +### With Pipecat +`LocalSmartTurnAnalyzerV3` integration available (v0.0.85+). Users download ONNX model from HuggingFace repository. + +### Standalone +Direct ONNX runtime usage via provided inference scripts. Requires accompanying VAD model (Silero recommended) for optimal results.
+ +## Resources + +- Model weights on HuggingFace +- GitHub repository containing training code and inference examples +- Open test datasets available for benchmark reproduction +- Community dataset annotation available at smart-turn-dataset.pipecat.ai diff --git a/03-finetune-pipecat-pt/references/blogs/pipecat_smart_turn_v3_1.md b/03-finetune-pipecat-pt/references/blogs/pipecat_smart_turn_v3_1.md new file mode 100644 index 0000000000000000000000000000000000000000..5e04159a4f4cac64c68e5166f0ca29b0789c911e --- /dev/null +++ b/03-finetune-pipecat-pt/references/blogs/pipecat_smart_turn_v3_1.md @@ -0,0 +1,69 @@ +--- +title: "Improved Accuracy in Smart Turn v3.1" +source: https://www.daily.co/blog/improved-accuracy-in-smart-turn-v3-1/ +date_accessed: 2026-03-16 +--- + +# Improved Accuracy in Smart Turn v3.1 + +Smart Turn v3.1 represents a significant advancement in conversation turn detection, leveraging expanded training datasets to enhance model performance across languages. + +## Overview + +Smart Turn v3.1 maintains the same architecture as its predecessor while offering improved accuracy through additional human audio training data and refined quantization methods. The update functions as a direct replacement for v3.0, requiring no code modifications to existing implementations. + +## Training Data Enhancement + +The model benefits from contributions by three specialized data partners: + +**Liva AI** provides "real human voice data to improve speech models" across multiple languages and dialects, founded by researchers with publications in IEEE and machine learning conferences. + +**Midcentury** develops "multimodal-native research" datasets spanning 12+ languages, emphasizing real-world performance scenarios. + +**MundoAI** constructs "the world's largest and highest quality multimodal datasets" across 16+ languages, prioritizing multilingual diversity. 
+ +These partners contributed audio samples in English and Spanish, publicly released through HuggingFace as `smart-turn-data-v3.1-train` and `smart-turn-data-v3.1-test` datasets. + +## Model Variants + +Two deployment options accommodate different hardware configurations: + +- **CPU Model (8MB, int8 quantized)**: Lightweight variant enabling ~12ms CPU inference, matching v3.0 sizing +- **GPU Model (32MB, unquantized)**: Enhanced variant for GPU deployment with ~1% accuracy improvement + +## Accuracy Improvements + +Testing on the new v3.1 dataset demonstrates substantial gains: + +| Language | v3.0 | v3.1 (8MB) | v3.1 (32MB) | +|----------|------|-----------|-----------| +| English | 88.3% | 94.7% | 95.6% | +| Spanish | 86.7% | 90.1% | 91.0% | + +The model supports 23 total languages; the remaining 21 maintain parity with v3.0 performance. + +## Performance Benchmarks + +Single-inference latency across representative hardware: + +| Device | v3.1 (8MB) | v3.1 (32MB) | Preprocessing | +|--------|-----------|-----------|---------------| +| GPU (NVIDIA L40S) | 2 ms | 1 ms | 1 ms | +| GPU (NVIDIA T4) | 5 ms | 4 ms | 2 ms | +| CPU (AWS c7a.2xlarge) | 9 ms | 13 ms | 7 ms | +| CPU (AWS c8g.2xlarge) | 20 ms | 32 ms | 9 ms | +| CPU (AWS c7a.medium) | 37 ms | 73 ms | 7 ms | +| CPU (AWS c8g.medium) | 57 ms | 159 ms | 9 ms | + +Performance optimization is achievable through environment variable configuration: + +``` +OMP_NUM_THREADS=1 +OMP_WAIT_POLICY="PASSIVE" +``` + +## Resources + +- Model weights: https://huggingface.co/pipecat-ai/smart-turn-v3 +- GitHub repository: https://github.com/pipecat-ai/smart-turn +- Datasets: https://huggingface.co/pipecat-ai diff --git a/03-finetune-pipecat-pt/references/blogs/pipecat_smart_turn_v3_2.md b/03-finetune-pipecat-pt/references/blogs/pipecat_smart_turn_v3_2.md new file mode 100644 index 0000000000000000000000000000000000000000..8730c24179fb57c6d39d0e7ea2677eb7eda5f44c --- /dev/null +++ 
b/03-finetune-pipecat-pt/references/blogs/pipecat_smart_turn_v3_2.md @@ -0,0 +1,41 @@ +--- +title: "Smart Turn v3.2: Handling Noisy Environments and Short Responses" +source: https://www.daily.co/blog/smart-turn-v3-2-handling-noisy-environments-and-short-responses/ +date_accessed: 2026-03-16 +--- + +# Smart Turn v3.2: Better Accuracy for AI Voice Agents + +## Overview + +Smart Turn v3.2 represents an update to an open-source turn detection model designed to help AI voice agents identify when users finish speaking. The model listens to raw audio and determines optimal response timing -- preventing interruptions while eliminating unnecessary delays. + +## Key Improvements in v3.2 + +### Short Utterances Enhancement +The model now handles brief verbal responses significantly better. Single words like "yes" or "okay" are "miscategorized 40% less often according to our public benchmarks." Two changes enabled this improvement: + +- Introduction of a new dataset focused on short utterances (planned for expansion) +- Resolution of a padding issue during training that was compromising accuracy + +### Background Noise Robustness +The updated version performs better in real-world environments by incorporating realistic cafe and office noise into training and testing datasets, moving beyond studio-quality audio assumptions. + +## Technical Specifications + +**Model Variants:** +- CPU version: 8MB +- GPU version: 32MB + +Both serve as drop-in replacements for v3.1, maintaining compatibility with existing implementations. + +## Available Resources + +**Open-source components:** +- Model weights: Available on HuggingFace +- Training code: Published on GitHub +- Datasets: Two new datasets released for training and testing purposes + +## Integration + +The model integrates with Pipecat through the `LocalSmartTurnAnalyzerV3` constructor, allowing immediate use via the `smart_turn_model_path` parameter. 
diff --git a/03-finetune-pipecat-pt/references/blogs/speechmatics_semantic_turn.md b/03-finetune-pipecat-pt/references/blogs/speechmatics_semantic_turn.md new file mode 100644 index 0000000000000000000000000000000000000000..65b6ccb1a677614ca957380c507758563f885184 --- /dev/null +++ b/03-finetune-pipecat-pt/references/blogs/speechmatics_semantic_turn.md @@ -0,0 +1,103 @@ +--- +title: "How to Build Smarter Turn Detection for Voice AI" +source: https://blog.speechmatics.com/semantic-turn-detection +date_accessed: 2026-03-16 +--- + +# How to Build Smarter Turn Detection for Voice AI + +**Published:** May 12, 2025 | **Author:** Aaron Ng, Machine Learning Engineer | **Read Time:** 13 minutes + +## Introduction + +Voice AI systems face a fundamental challenge: determining when users have finished speaking. Traditional approaches relying on fixed silence periods (like 500ms) create frustrating interruptions. This article explores how semantic understanding can improve turn detection. + +## The Problem with Traditional Voice Activity Detection + +VAD systems detect speech boundaries through audio patterns alone. As the article explains, "VAD only understands audio patterns. It knows _when_ there's speech and when there isn't. What it doesn't know is _why_ there's a pause." + +The example demonstrates this limitation: when a customer pauses to check information ("Sure it's 123 764... (pauses to check their notes)"), VAD incorrectly interprets the silence as turn completion and triggers an interruption. + +## Semantic Turn Detection Solution + +Rather than relying solely on silence detection, the proposed approach uses instruction-tuned Small Language Models (SLMs) to understand conversational context. These models contain fewer than 10 billion parameters, enabling fast local inference critical for voice interactions. 
+ +### Key Benefits + +- **Reduced Latency**: Local SLM inference avoids the 500ms+ delays of external API calls +- **Cost Savings**: Fewer false interruptions mean fewer unnecessary LLM API calls for responses that get discarded +- **Better UX**: Systems recognize contextual pauses versus actual turn completion + +## Technical Implementation + +### Core Mechanism + +The approach monitors the probability of the `<|im_end|>` token -- a special ChatML marker indicating turn completion. When this token's probability is high, the model predicts the user has finished speaking. + +The article illustrates this with examples: +- "Can I have two chicken McNuggets and" -> Low `<|im_end|>` probability (incomplete thought) +- "I have a problem with my card" -> Higher `<|im_end|>` probability (complete thought) + +### Architecture Components + +**1. Message Tokenization** + +Messages are formatted using ChatML structure with special tokens (`<|im_start|>`, `<|im_end|>`, `<|im_sep|>`). The implementation removes the final `<|im_end|>` token since the model predicts its presence. + +**2. Model Inference** + +The code uses `AutoModelForCausalLM` to extract logits for the final token position, computing log-softmax probabilities across the vocabulary. + +**3. Token Probability Extraction** + +The system identifies the `<|im_end|>` token among top-k predictions (k=20) and converts log probabilities to standard probabilities using exponential transformation. 
+ +### Code Example Structure + +The implementation includes an `EndOfTurnModel` class with: + +- `_convert_messages_to_chatml()`: Formats conversations in ChatML format +- `get_next_token_logprobs()`: Performs local inference to retrieve next-token probabilities +- `process_result()`: Extracts target token probabilities +- `predict_eot_prob()`: Orchestrates the full pipeline returning probability scores + +**Model Details**: The implementation uses `SmolLM2-360M-Instruct` from Hugging Face, chosen for efficiency and CPU compatibility. + +## Configuration Parameters + +- **MAX_HISTORY**: 4 messages (recent context window) +- **DEFAULT_THRESHOLD**: 0.03 (default probability threshold) +- **Top-k consideration**: 20 tokens + +## Practical Considerations + +### Token Selection Strategy + +Beyond `<|im_end|>`, punctuation marks (periods, question marks, exclamation points) can signal thought completion. Hybrid approaches monitoring multiple tokens may improve accuracy but risk introducing noise. + +### Hybrid Approach with VAD + +The article recommends combining semantic detection with VAD: +- VAD provides high recall detecting speech presence +- Semantic turn detection improves precision to reduce false interruptions +- Dynamic grace periods can adjust based on speaking patterns + +### Threshold Determination + +The default 0.03 threshold serves as a starting point. "The optimal threshold depends on your specific SLM and can vary significantly between models." Precision-focused optimization using representative test sets is recommended. + +### Multilingual Support + +Supporting multiple languages requires instruction-tuned SLMs trained on diverse multilingual data, accounting for varied grammar and conversational norms. + +### Fine-Tuning Opportunities + +While general SLMs provide solid conversational understanding, task-specific fine-tuning on domain conversational data can improve accuracy for specialized terminology or industry-specific use cases. 
+ +## Future Directions + +The article acknowledges limitations: "True conversational understanding requires more than just reading text or listening for gaps -- it needs an audio-native model" that processes tone, cadence, hesitation, and emphasis directly from audio signals. + +## Resource Reference + +A complete implementation is available on GitHub at the repository referenced in the article. diff --git a/03-finetune-pipecat-pt/references/guides/focal_loss_calibration_emnlp_2022.md b/03-finetune-pipecat-pt/references/guides/focal_loss_calibration_emnlp_2022.md new file mode 100644 index 0000000000000000000000000000000000000000..58a4d051d20536380b475d6254e58c038f173e58 --- /dev/null +++ b/03-finetune-pipecat-pt/references/guides/focal_loss_calibration_emnlp_2022.md @@ -0,0 +1,209 @@ +--- +title: "Calibrating Imbalanced Classifiers with Focal Loss: An Empirical Study" +source: https://aclanthology.org/2022.emnlp-industry.14/ +date: 2026-03-16 +--- + +# Calibrating Imbalanced Classifiers with Focal Loss: An Empirical Study + +**Authors:** Cheng Wang, Jorge Balazs, Gyuri Szarvas, Patrick Ernst, Lahari Poddar, Pavel Danchenko (Amazon) + +**Venue:** EMNLP 2022 Industry Track, December 9-11, Abu Dhabi, UAE, pages 155-163 + +**DOI:** 10.18653/v1/2022.emnlp-industry.14 + +**PDF:** https://aclanthology.org/2022.emnlp-industry.14.pdf + +## Abstract + +Imbalanced data distribution is a practical and common challenge in building ML models in industry, where data usually exhibits long-tail distributions. For instance, in virtual AI Assistants (Google Assistant, Amazon Alexa, Apple Siri), the "play music" or "set timer" utterance is exposed to an order of magnitude more traffic than other skills. This can easily cause trained models to overfit to the majority classes, leading to model miscalibration. The uncalibrated models output unreliable (mostly overconfident) predictions, which are at high risk of affecting downstream decision-making systems. 
The authors empirically show the effectiveness of model training with focal loss in learning better calibrated models, as compared to standard cross-entropy loss. Better calibration enables better control of the precision-recall trade-off for trained models. + +## 1. Introduction + +Building ML models in industry faces practical challenges from imbalanced data distributions, particularly long-tail distributions, which make models overfit to majority data classes and lead to miscalibration -- the model-predicted probability fails to estimate the likelihood of true correctness and provides over- or under-confident predictions. + +The study focuses on **return reason code prediction** in customer service chatbots as a practical application of focal loss for calibration. + +### Contributions + +- Empirically examine the effectiveness of using focal loss in handling model miscalibration in a practical application setting +- Show that good calibration is important to achieve a desired precision or recall target by tuning classification thresholds (standard cross-entropy loss is incapable of this due to skewed predicted probability distribution) +- Demonstrate performance of calibrated models through a chatbot serving customers across three conversational bot use-cases + +## 2. Focal Loss Formulation + +Focal loss is defined as: + +``` +L_f = - sum_{i=1}^{N} (1 - p_{i,y_i})^gamma * log(p_{i,y_i}) +``` + +where `p_{i,y_i}` is predicted probability of the i-th sample and `gamma` is a hyper-parameter typically set to `gamma = 1`. + +### Theoretical Interpretation + +Focal loss can be interpreted as a trade-off between minimizing KL divergence and maximizing entropy: + +``` +L_f >= KL(q || p) + H(q) - gamma * H(p) +``` + +The rationale: we learn a probability `p` to have a high value (confident) due to the KL term, but not too high (overconfident) due to the entropy regularization term. 
+ +### Practical Advantages + +Compared to other calibration methods (temperature scaling, Bayesian methods, label smoothing, kernel-based methods), focal loss: +- Neither increases computational overhead nor requires architectural modifications +- Offers **in-training implicit calibration** (unlike temperature scaling which requires post-training calibration) + +## 3. Calibration Metrics + +### Reliability Diagrams + +Predictions are grouped into N interval bins; accuracy vs. confidence is computed per bin: + +``` +acc(b_n) = (1/I_n) * sum_i 1(y_hat_i = y_i) +conf(b_n) = (1/I_n) * sum_i p_hat_i +``` + +A perfectly calibrated model has `acc(b_n) = conf(b_n)` for all n. + +### Expected Calibration Error (ECE) + +``` +ECE = (1/I) * sum_{n=1}^{N} I_n * |acc(b_n) - conf(b_n)| +``` + +### Maximum Calibration Error (MCE) + +``` +MCE = max_n |acc(b_n) - conf(b_n)| +``` + +Particularly important in high-risk applications. For a perfectly calibrated classifier, both ECE and MCE equal 0. + +## 4. Datasets and Implementation + +### Task + +Binary and multi-class return reason code prediction in an online retail store: +- **Binary:** "item is defective or does not work" (Label 0) vs. OTHERS (Label 1) +- **Multi-class:** 4 specific return reasons + OTHERS (5 total classes) + +Both datasets exhibit class imbalance with OTHERS as the most frequent class. + +### Model Architecture + +- 2 bidirectional LSTM layers + 2 dense layers +- Embedding dimension: 1024 +- Hidden layer dimension: 128 (binary), 512 (multi-class) +- Dropout: 0.1 (embedding), 0.2 (dense) +- Optimizer: Adam +- Framework: PyTorch + +### Golden Dataset + +- Binary model: 1,013 human-annotated samples +- Multi-class model: 1,839 human-annotated samples +- Data split: 8:1:1 (train/val/test) + +## 5. 
Results + +### Binary Reason Code Prediction (Table 1) + +| Metric | CE | FL1 | FL3 | FL5 | FL10 | +|--------|------|------|------|------|------| +| Accuracy | **0.836** | 0.824 | 0.831 | 0.834 | 0.816 | +| Precision | **0.838** | 0.822 | 0.830 | 0.834 | 0.807 | +| Recall | **0.823** | 0.814 | 0.821 | 0.823 | 0.805 | +| F1 | **0.828** | 0.817 | 0.824 | 0.827 | 0.806 | +| NLL | 2.159 | 1.438 | 0.608 | 0.258 | **0.178** | +| ECE | 0.168 | 0.166 | 0.139 | 0.080 | **0.078** | +| MCE | 0.720 | 0.730 | 0.236 | **0.134** | 0.143 | + +**Key finding:** CE achieves best predictive performance, but FL significantly outperforms on calibration metrics (NLL, ECE, MCE). Higher gamma yields better calibration with modest predictive performance loss. + +### Multi-Reason Code Prediction (Table 2) + +| Metric | CE | FL1 | FL5 | +|--------|------|------|------| +| Accuracy | 0.751 | **0.760** | 0.751 | +| Precision | **0.814** | 0.807 | **0.814** | +| Recall | **0.757** | 0.755 | **0.757** | +| F1 | **0.764** | 0.760 | **0.764** | +| NLL | 0.599 | 0.429 | **0.309** | +| ECE | 0.037 | **0.023** | 0.037 | +| MCE | 0.296 | **0.197** | 0.299 | + +### Reliability Diagrams + +- CE model: ECE = 16.78%, MCE = 72.00% (binary) +- FL5 model: ECE = 7.98%, MCE = 13.40% (binary) +- FL10 model: ECE = 7.76%, MCE = 14.34% (binary) + +As gamma increases from CE to FL10, probability distributions shift from "spiking" (overconfident, p close to 0 or 1) to flatter distributions (e.g., p = {0.6, 0.4}). + +### Precision-Recall Trade-Off + +CE model produces polarized probabilities, making it difficult to tune precision based on a given recall or vice versa. FL models learn better-distributed probabilities across [0, 1], enabling effective threshold-based precision/recall tuning. + +## 6. 
Deployment Results (Online A/B Test) + +Model deployed in three conversational chatbot use-cases with threshold=0.512 for 85% target precision: + +### Online Evaluation (Table 3) -- Relative Improvements + +| Application | Metric | Treatment vs. Control | +|------------|--------|----------------------| +| Use-case A | AR | +2.13% | +| Use-case A | PRR | +3.18% | +| Use-case A | 24RR | -0.65% | +| Use-case B | AR | +2.10% | +| Use-case B | PRR | +0.97% | +| Use-case B | 24RR | -0.68% | +| Use-case C | AR | +3.98% | +| Use-case C | PRR | +12.85% | +| Use-case C | 24RR | -1.02% | + +**Metrics:** +- **AR (Automation Rate):** % contacts resolved without human involvement (higher = better) +- **PRR (Positive Response Rate):** % positive customer responses to chatbot resolution (higher = better) +- **24RR (Repeat Rate):** % customers contacting again within 24h for same issue (lower = better) + +### Intrinsic Evaluation + +- Precision on deployed model: 384/458 = 83.8% (aligns with offline 81.4%) +- Negative predictive value: 194/200 = 97% for OTHERS class + +## 7. Key Takeaways + +1. Focal loss provides simple, effective in-training calibration via entropy regularization +2. Higher gamma values improve calibration without significantly hurting predictive performance +3. Well-calibrated models enable practical precision/recall threshold tuning that CE-trained models cannot achieve +4. The discrimination-calibration trade-off is modest -- best model balances both (FL5 recommended) +5. 
Better calibration directly translates to improved downstream application metrics + +## Citation + +```bibtex +@inproceedings{wang-etal-2022-calibrating, + title = "Calibrating Imbalanced Classifiers with Focal Loss: An Empirical Study", + author = {Wang, Cheng and Balazs, Jorge and Szarvas, Gy{\"o}rgy and Ernst, Patrick and Poddar, Lahari and Danchenko, Pavel}, + booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: Industry Track", + month = dec, + year = "2022", + address = "Abu Dhabi, UAE", + publisher = "Association for Computational Linguistics", + pages = "155--163", + doi = "10.18653/v1/2022.emnlp-industry.14" +} +``` + +## References (Selected) + +- Lin et al., 2017 -- Focal loss for dense object detection (original focal loss paper) +- Mukhoti et al., 2020 -- Calibrating deep neural networks using focal loss (NeurIPS) +- Guo et al., 2017 -- On calibration of modern neural networks (ICML) +- Pereyra et al., 2017 -- Regularizing neural networks by penalizing confident output distributions +- Naeini et al., 2015 -- Obtaining well calibrated probabilities using Bayesian binning (AAAI) diff --git a/03-finetune-pipecat-pt/references/guides/livekit_turn_detector.md b/03-finetune-pipecat-pt/references/guides/livekit_turn_detector.md new file mode 100644 index 0000000000000000000000000000000000000000..72467c59bcace4433461d5b08a1232641ac3df3a --- /dev/null +++ b/03-finetune-pipecat-pt/references/guides/livekit_turn_detector.md @@ -0,0 +1,170 @@ +--- +title: "LiveKit Turn Detector Model Card" +source: https://huggingface.co/livekit/turn-detector +date: 2026-03-16 +--- + +# LiveKit Turn Detector + +An open-weights language model for contextually-aware end-of-utterance (EOU) detection in voice AI applications. The model predicts whether a user has finished speaking based on the semantic content of their transcribed speech, providing a critical complement to voice activity detection (VAD) systems. 
+ +> For installation, usage examples, and integration guides, see the [LiveKit documentation](https://docs.livekit.io/agents/logic/turns/turn-detector/). + +## Overview + +Traditional voice agents rely on voice activity detection (VAD) to determine when a user has finished speaking. VAD works by detecting the presence or absence of speech in an audio signal and applying a silence timer. While effective for detecting pauses, VAD lacks language understanding and frequently causes false positives. + +**Example:** A user who says *"I need to think about that for a moment..."* and then pauses will be interrupted by a VAD-only system, even though they clearly intend to continue. + +This model adds semantic understanding to the turn detection process by: +- Analyzing transcribed text of conversations in real time +- Predicting the probability that the user has completed their turn +- Reducing unwanted interruptions when integrated with VAD +- Handling structured data input effectively (addresses, phone numbers, email addresses, credit card numbers) + +## Model Variants + +**Multilingual** (recommended) and **English-only** (deprecated) are distributed as INT8 quantized ONNX models (`model_q8.onnx`) optimized for CPU inference. + +> **The English-only model (`EnglishModel`) is deprecated.** Use the **multilingual model (`MultilingualModel`)** for all new projects. The multilingual model provides better accuracy across all languages thanks to knowledge distillation from a larger teacher model and expanded training dataset. + +## How It Works + +The model operates on transcribed text from a speech-to-text (STT) system, not raw audio. + +### Process Flow + +1. **Input**: Recent conversation history (up to **6 turns**, truncated to **128 tokens**) is formatted using the [Qwen chat template](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) with `<|im_start|>` / `<|im_end|>` delimiters. The final user message is left _without_ the closing `<|im_end|>` token. + +2. 
**Prediction**: The model predicts the probability of the `<|im_end|>` token appearing next: + - **High probability** -> user has likely finished their utterance + - **Low probability** -> user is likely to continue + +3. **Thresholding**: Per-language thresholds (stored in `languages.json`) convert raw probability into a binary decision, tuned to balance responsiveness and accuracy for each supported language. + +4. **Integration with VAD**: Works alongside the [Silero VAD](https://docs.livekit.io/agents/logic/turns/vad/) plugin. VAD handles speech presence detection and interruption triggering, while this model provides the semantic signal for when to commit a turn. + +### Text Preprocessing + +**Multilingual variant** applies: +- NFKC unicode normalization +- Lowercasing +- Punctuation removal (preserving apostrophes and hyphens) +- Whitespace collapsing + +**English-only variant** passes raw transcribed text without normalization. + +## Architecture and Training + +### Base Model + +Both variants are fine-tuned from [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct), selected for strong performance on this task while enabling low-latency CPU inference. + +**Model Specifications:** +- **Parameters**: 0.1B +- **Tensor Type**: F32 +- **Chat Template**: Qwen format + +### Knowledge Distillation + +A **Qwen2.5-7B-Instruct** model was first fine-tuned as a teacher on end-of-turn prediction. 
Its knowledge was then distilled into the 0.5B student model: +- Distilled model approaches teacher-level accuracy +- Maintains efficiency of smaller architecture +- Converges after approximately 1,500 training steps + +### Training Data + +The training dataset is a mix of: + +- **Real call center transcripts** covering diverse conversational patterns +- **Synthetic dialogues** emphasizing structured data input: + - Addresses + - Email addresses + - Phone numbers + - Credit card numbers +- **Multi-format STT outputs** to handle provider variation (e.g., "forty two" vs. "42"), ensuring consistent predictions across different STT engines without runtime overhead + +*Note: Structured data enhancements were added only to the English training set, but performance improvements generalized across languages due to the multilingual knowledge encoded in Qwen2.5.* + +### Quantization + +The trained model is exported to ONNX format and quantized to INT8 (`model_q8.onnx`), enabling efficient CPU-only inference with ONNX Runtime. + +## Supported Languages + +The multilingual model supports **14 languages**. The model relies on the STT provider to report the detected language, which is then used to select the appropriate per-language threshold. 
+ +**Supported:** English, Spanish, French, German, Italian, Portuguese, Dutch, Chinese, Japanese, Korean, Indonesian, Turkish, Russian, Hindi + +## Benchmarks + +### Detection Accuracy (Multilingual Variant) + +- **True positive** -- the model correctly identifies the user has finished speaking +- **True negative** -- the model correctly identifies the user will continue speaking + +| Language | True Positive Rate | True Negative Rate | +|----------|-------------------|-------------------| +| Hindi | 99.4% | 96.3% | +| Korean | 99.3% | 94.5% | +| French | 99.3% | 88.9% | +| Indonesian | 99.3% | 89.4% | +| Japanese | 99.3% | 88.8% | +| Dutch | 99.3% | 88.1% | +| Russian | 99.3% | 88.0% | +| German | 99.3% | 87.8% | +| Portuguese | 99.4% | 87.4% | +| Turkish | 99.3% | 87.3% | +| English | 99.3% | 87.0% | +| Chinese | 99.3% | 86.6% | +| Spanish | 99.3% | 86.0% | +| Italian | 99.3% | 85.1% | + +### Improvement Over Prior Version + +The multilingual v0.4.1 release achieved a **39.23% relative improvement** in handling structured inputs (emails, addresses, phone numbers, credit card numbers) compared to the prior version, reducing premature interruptions during data collection scenarios. + +## Usage + +The model is designed for use as a turn detection plugin within the [LiveKit Agents](https://github.com/livekit/agents) framework. + +For complete installation instructions, code examples (Python and Node.js), and configuration options, see the **[LiveKit turn detector plugin documentation](https://docs.livekit.io/agents/logic/turns/turn-detector/)**. + +For broader context on how turn detection fits into the voice pipeline -- including VAD configuration, interruption handling, and manual turn control -- see the **[Turns overview](https://docs.livekit.io/agents/logic/turns/)**. + +## Deployment Requirements + +- **Runtime**: CPU-only (no GPU required). Uses [ONNX Runtime](https://onnxruntime.ai/) with the `CPUExecutionProvider`. 
+- **RAM**: <500 MB for the multilingual model +- **Instance type**: Use compute-optimized instances (e.g., AWS c6i, c7i). Avoid burstable instances (e.g., AWS t3, t4g) to prevent inference timeouts from CPU credit exhaustion. +- **LiveKit Cloud**: The model is deployed globally on LiveKit Cloud. Agents running there automatically use the optimized remote inference service with no local resource requirements. + +## Limitations + +- **Text-only input**: The model operates on STT-transcribed text and cannot incorporate prosodic cues such as pauses, intonation, or emphasis. Future versions may integrate multimodal audio features. +- **STT dependency**: Prediction quality depends on the accuracy and output format of the upstream STT provider. Mismatches between training and deployment STT formats may degrade performance. +- **Context window**: Limited to 128 tokens across a maximum of 6 conversation turns. +- **Language coverage**: Currently supports 14 languages. Performance on unsupported languages is undefined. +- **Realtime model compatibility**: Cannot be used with audio-native realtime models (e.g., OpenAI Realtime API) without adding a separate STT service, which incurs additional cost and latency. + +## License + +This model is released under the [LiveKit Model License](https://huggingface.co/livekit/turn-detector/blob/main/LICENSE). 
+ +## Resources + +- **[Documentation](https://docs.livekit.io/agents/logic/turns/turn-detector/)**: Full plugin documentation, installation, and integration guide +- **[Turns Overview](https://docs.livekit.io/agents/logic/turns/)**: How turn detection fits into the LiveKit Agents voice pipeline +- **[Blog: Improved End-of-Turn Model](https://blog.livekit.io/improved-end-of-turn-model-cuts-voice-ai-interruptions-39/)**: Technical deep dive on multilingual distillation approach and benchmarks +- **[Blog: Using a Transformer for Turn Detection](https://blog.livekit.io/using-a-transformer-to-improve-end-of-turn-detection/)**: Original blog post introducing the concept and architecture +- **[Video: LiveKit Turn Detector](https://youtu.be/OZG0oZKctgw)**: Overview video demonstrating the plugin +- **[GitHub: Plugin Source](https://github.com/livekit/agents/tree/main/livekit-plugins/livekit-plugins-turn-detector)**: Source code for the `livekit-plugins-turn-detector` package +- **[PyPI](https://pypi.org/project/livekit-plugins-turn-detector/)** | **[npm](https://www.npmjs.com/package/@livekit/agents-plugin-livekit)**: Package registries + +--- + +**Model Stats:** +- Downloads last month: 240,507 +- Model size: 0.1B params +- Format: Safetensors (INT8 quantized ONNX) +- Base Model: Qwen/Qwen2.5-0.5B-Instruct diff --git a/03-finetune-pipecat-pt/references/guides/pipecat_data_generation_guide.md b/03-finetune-pipecat-pt/references/guides/pipecat_data_generation_guide.md new file mode 100644 index 0000000000000000000000000000000000000000..47460464644e7bc6939e252e25250261875a2228 --- /dev/null +++ b/03-finetune-pipecat-pt/references/guides/pipecat_data_generation_guide.md @@ -0,0 +1,33 @@ +--- +title: "Pipecat Smart Turn - Data Generation Contribution Guide" +source: https://raw.githubusercontent.com/pipecat-ai/smart-turn/main/docs/data_generation_contribution_guide.md +date: 2026-03-16 +--- + +# Contributing Training Data to Smart Turn + +## Audio Format Requirements + 
+**FLAC is the preferred format** for training data, with lossy formats like MP3 or Opus discouraged. Audio should be **mono with a bit depth of 16 bits** and works optimally at 16kHz sample rates, though higher rates are acceptable. + +## File Organization + +Contributors have flexibility in naming and directory structure. The recommended approach uses unique identifiers for files combined with directory labels by language and completeness status (e.g., `eng/incomplete/b3799254-8d6c-11f0-a90e-e7e92780240b.flac`). + +## Audio Length and Variation + +Each file must contain **one speech sample, no longer than 16 seconds.** Variety in length is encouraged, ranging from single words to complex sentences. + +## Content Guidelines + +Samples should represent **a single turn in the conversation** with only one speaker per file. Speech should resemble interactions with voice assistants or customer service representatives. The documentation emphasizes avoiding sentence repetition and background noise while excluding real personally identifiable information. + +## Complete vs. Incomplete Classification + +Samples require binary labeling. "Complete" samples represent finished thoughts suitable for immediate response, while "Incomplete" samples suggest the speaker will continue, ending with filler words, connectives, or suggestive prosody. Critically, incomplete samples **must not be cut off in the middle of a word** but rather end with full words and approximately 200ms of silence. + +A 50:50 split between categories is recommended for unbiased training. + +## Licensing and Submission + +Contributors must own recordings and secure speaker consent for public release. Datasets are published via HuggingFace. Submissions can occur through various cloud storage methods, with GitHub issues as the contact point. 
diff --git a/03-finetune-pipecat-pt/references/guides/pipecat_dataset_v3_2.md b/03-finetune-pipecat-pt/references/guides/pipecat_dataset_v3_2.md new file mode 100644 index 0000000000000000000000000000000000000000..feffad90458bac90268ab5c6c09051bc848e4c9a --- /dev/null +++ b/03-finetune-pipecat-pt/references/guides/pipecat_dataset_v3_2.md @@ -0,0 +1,93 @@ +--- +title: "Pipecat Smart Turn Data v3.2 - Training Dataset Card" +source: https://huggingface.co/datasets/pipecat-ai/smart-turn-data-v3.2-train +date: 2026-03-16 +--- + +# Smart Turn Data v3.2 Training Dataset + +The official training dataset for **Smart Turn v3.2**, hosted on Hugging Face Datasets. + +## Key Specifications + +| Attribute | Value | +|-----------|-------| +| **Organization** | Pipecat | +| **Dataset Size** | 100K - 1M rows | +| **Total Rows** | 270,946 | +| **Download Size** | 41.4 GB | +| **Parquet Size** | 41.4 GB | +| **Format** | Parquet | +| **Split** | train (271k rows) | +| **Subset** | default (271k rows) | + +## Data Modalities + +- **Audio** - Audio samples with duration ranging from 0.36s to 32.6s +- **Text** - Language and metadata fields + +## Supported Libraries + +- Datasets +- Dask +- Polars + +## Dataset Schema + +### Fields + +| Field | Type | Description | +|-------|------|-------------| +| `audio` | AudioObject | Audio samples | +| `id` | Text | UUID identifier (36 chars) | +| `language` | Text | 23 language classes | +| `endpoint_bool` | Boolean | End-of-turn indicator | +| `midfiller` | Boolean | Mid-utterance filler detection | +| `endfiller` | Boolean | End-of-utterance filler detection | +| `synthetic` | Boolean | Synthetic data flag | +| `dataset` | Text | Source dataset identifier (12 classes) | +| `spoken_text` | Null | Skipped column | + +### Language Coverage + +English, Chinese (zho), Vietnamese, Finnish, Spanish, Bengali, Hindi, Japanese, Portuguese, Russian, Korean, German, French, Dutch, Arabic, Marathi, Turkish, Icelandic, Polish, Ukrainian, Italian, Indonesian, 
Norwegian + +### Dataset Sources + +- chirp3_1, chirp3_2 +- liva_1 +- midcentury_1 +- mundo_1 +- rime_2 +- orpheus_endfiller_1 + +## Metadata (Croissant Format) + +The dataset uses **Croissant ML Commons** metadata schema (v1.1): + +```json +{ + "@context": "https://schema.org/", + "conformsTo": "http://mlcommons.org/croissant/1.1", + "@type": "sc:Dataset", + "name": "smart-turn-data-v3.2-train" +} +``` + +## Contributors + +### Direct Contributors +- The Pipecat team +- [Liva AI](https://www.theliva.ai/) +- [Midcentury](https://www.midcentury.xyz/) +- [MundoAI](https://mundoai.world/) + +### Background Noise Attribution +CC-0 licensed background noise samples sourced from Freesound.org contributors including: +- 4team, tomhannen, craigsmith, mrmayo, martats, and 26+ additional contributors + +## Access Methods + +- **Dataset Viewer:** https://huggingface.co/datasets/pipecat-ai/smart-turn-data-v3.2-train/viewer/ +- **Data Studio:** https://huggingface.co/datasets/pipecat-ai/smart-turn-data-v3.2-train/viewer/default/train +- **Files Browser:** Git repository with parquet conversion available diff --git a/03-finetune-pipecat-pt/references/guides/pipecat_smart_turn_readme.md b/03-finetune-pipecat-pt/references/guides/pipecat_smart_turn_readme.md new file mode 100644 index 0000000000000000000000000000000000000000..368a06cc3bfcea5b907064e195361267ded8f5d3 --- /dev/null +++ b/03-finetune-pipecat-pt/references/guides/pipecat_smart_turn_readme.md @@ -0,0 +1,115 @@ +--- +title: "Pipecat Smart Turn v3.2 - README" +source: https://raw.githubusercontent.com/pipecat-ai/smart-turn/main/README.md +date: 2026-03-16 +--- + +# Smart Turn v3.2 + +An open-source, community-driven native audio turn detection model designed to improve upon traditional voice activity detection (VAD) approaches. The model determines when a voice agent should respond to human speech by analyzing linguistic and acoustic cues rather than simply detecting speech presence. 
+ +## Key Features + +**Language Support:** The system supports 23 languages including Arabic, Bengali, Chinese, Danish, Dutch, German, English, Finnish, French, Hindi, Indonesian, Italian, Japanese, Korean, Marathi, Norwegian, Polish, Portuguese, Russian, Spanish, Turkish, Ukrainian, and Vietnamese. + +**Performance Characteristics:** +- Inference completes in as little as 10ms on certain CPUs +- Most cloud instances see sub-100ms execution times +- Integrates efficiently with lightweight VAD models like Silero +- Two versions available: 8MB quantized CPU variant and 32MB unquantized GPU variant +- GPU version uses fp32 weights for marginally faster inference and ~1% accuracy improvement +- CPU version uses int8 quantization for reduced size and speed with minimal accuracy trade-off + +**Technical Approach:** The model operates directly on PCM audio samples rather than text transcriptions, capturing prosodic nuances that inform turn-taking decisions. + +## Setup Instructions + +**Environment Configuration:** +```bash +python3.12 -m venv venv +source venv/bin/activate +pip install -r requirements.txt +``` + +**Platform-Specific Dependencies:** + +Ubuntu/Debian systems require: +```bash +sudo apt-get update +sudo apt-get install portaudio19-dev python3-dev +``` + +macOS with Homebrew: +```bash +brew install portaudio +``` + +**Running the Demonstration:** +```bash +python record_and_predict.py +``` + +Initial startup requires approximately 30 seconds. Test phrases include "I can't seem to, um ..." and "I can't seem to, um, find the return label." + +## Audio Input Specifications + +The model accepts 16kHz mono PCM audio with maximum duration of 8 seconds. The recommended approach involves providing the complete audio of the current user turn. When audio exceeds 8 seconds, truncate from the beginning while maintaining context. 
+ +For shorter audio, prepend zero-value padding to reach the required length, ensuring actual speech content occupies the end of the input vector. + +## Model Architecture + +Smart Turn v3 employs Whisper Tiny as its foundation with an added linear classification layer. The transformer-based architecture contains approximately 8 million parameters. Development involved experimentation with wav2vec2-BERT, wav2vec2, LSTM implementations, and additional transformer classifier configurations. + +## Integration Methods + +**Pipecat Integration:** The framework supports local inference via `LocalSmartTurnAnalyzerV3` (version 0.0.85 and later). On Pipecat Cloud's standard 1x instance, inference typically completes in around 65ms. + +**Direct Integration:** Import `model.py` and `inference.py` from the Smart Turn repository and invoke the `predict_endpoint()` function. Reference implementation available in `predict.py`. + +## Training Infrastructure + +Training code resides in `train.py` and downloads datasets from the pipecat-ai HuggingFace repository. Training can execute locally or via Modal using `train_modal.py`. Training sessions log to Weights & Biases unless disabled. + +**Training command for Modal:** +```bash +modal run --detach train_modal.py +``` + +**Current Datasets:** +- pipecat-ai/smart-turn-data-v3.2-train +- pipecat-ai/smart-turn-data-v3.2-test + +## Community Contributions + +**Data Classification:** Manual training data categorization assistance needed at https://smart-turn-dataset.pipecat.ai/ + +**Human Data Contribution:** Participants can contribute through turn training games at https://turn-training.pipecat.ai/ or by following the data generation contribution guide. 
+ +**Future Development Areas:** +- Additional language support expansion +- Performance optimization and architecture refinement +- Expanded human dataset collection +- Text-conditioned inference for specialized input modes +- Training platform diversification + +## Licensing and Attribution + +Smart Turn operates under the BSD 2-clause license, permitting unrestricted usage, modification, and contribution. + +**Project Contributors:** +- Marcus (marcus-daily) +- Eli (ebb351) +- Mark (markbackman) +- Kwindla (kwindla) + +**Data Contributors:** +- Liva AI +- Midcentury +- MundoAI + +## Resources + +- [HuggingFace Model Repository](https://huggingface.co/pipecat-ai/smart-turn-v3) +- [Pipecat Documentation](https://docs.pipecat.ai/server/utilities/smart-turn/smart-turn-overview) +- [Pipecat Framework](https://pipecat.ai) diff --git a/03-finetune-pipecat-pt/references/guides/pipecat_train_py.md b/03-finetune-pipecat-pt/references/guides/pipecat_train_py.md new file mode 100644 index 0000000000000000000000000000000000000000..539742b80b4c3e3f8f47442a27a38458bb557d7a --- /dev/null +++ b/03-finetune-pipecat-pt/references/guides/pipecat_train_py.md @@ -0,0 +1,179 @@ +--- +title: "Pipecat Smart Turn - Training Code (train.py)" +source: https://raw.githubusercontent.com/pipecat-ai/smart-turn/main/train.py +date: 2026-03-16 +--- + +# SmartTurn V3 Speech Endpointing Model — Training Code Reference + +> This document describes the training pipeline implemented in `train.py` from the +> [pipecat-ai/smart-turn](https://github.com/pipecat-ai/smart-turn) repository. +> The original file is Python source code; key architecture and configuration details +> are extracted below. 
+ +## Core Architecture + +### SmartTurnV3Model + +The model extends `WhisperPreTrainedModel` and uses Whisper's encoder as its backbone: + +- **Input**: Log-mel spectrogram features with shape `(batch_size, 80, 800)` +- **Encoder**: Modified Whisper encoder with `max_source_positions=400` +- **Attention Pooling**: Neural network layer that learns weighted attention across time steps +- **Binary Classifier**: Multi-layer sequential network outputting probability scores + +``` +Input Features (batch, 80, 800) + | +Whisper Encoder + | +Attention Pooling [Linear -> Tanh -> Linear -> Softmax] + | +Weighted Pooling (reduces to batch_size, hidden_size) + | +Classifier [Linear -> LayerNorm -> GELU -> Dropout -> Linear -> GELU -> Linear] + | +Sigmoid Output (probability of completion) +``` + +### Loss Function + +Uses `BCEWithLogitsLoss` with dynamic positive sample weighting, clamped between 0.1 and 10.0 to handle class imbalance. + +## Training Configuration + +``` +Base Model: openai/whisper-tiny +Learning Rate: 5e-5 +Epochs: 4 +Train Batch Size: 384 +Eval Batch Size: 128 +Warmup Ratio: 0.2 +Weight Decay: 0.01 +Eval/Save Steps: 500 +Logging Steps: 100 +LR Scheduler: Cosine +Dataloader Workers: 6 +``` + +## Datasets + +**Training Sources**: +- `pipecat-ai/smart-turn-data-v3.2-train` (split 90/10 for train/eval) + +**Test Sources**: +- `pipecat-ai/smart-turn-data-v3.2-test` + +The system supports stratified analysis by language, midfiller presence, and endfiller presence. + +## Data Pipeline + +### OnDemandSmartTurnDataset + +On-demand feature extraction pipeline: +- Audio truncation to last 8 seconds +- 16kHz sampling rate +- Padding to max length: `8 * 16000 = 128,000 samples` +- Normalization enabled + +### SmartTurnDataCollator + +Batches samples while preserving metadata (language, midfiller, endfiller flags) for downstream analysis. 
+ +## Export and Quantization + +### ONNX FP32 Export + +The wrapper ensures consistent output shapes across variable batch sizes: +- Dynamic batch dimension +- Fixed output shape: `(batch_size, 1)` for compatibility +- ONNX opset version 18 +- Validation with batch sizes 1 and 2 + +### INT8 Static Quantization + +**Configuration**: +- Calibration dataset size: 1024 samples +- Quantization format: QDQ (Quantize-Dequantize) +- Activation type: QUInt8 +- Weight type: QInt8 +- Per-channel quantization enabled +- Calibration method: Entropy +- Quantized operations: Conv, MatMul, Gemm + +Process: +1. `quant_pre_process` for optimization and shape inference +2. Entropy-based calibration on training data subset +3. Static quantization with calibration reader + +## Evaluation Metrics + +Per-sample metrics computed during training and evaluation: + +- Accuracy +- Precision (with zero_division warning handling) +- Recall +- F1 Score +- Confusion matrix components (TP, FP, TN, FN) +- Predicted positive/negative counts + +### External Evaluation Callback + +During training checkpoints, the system evaluates on all test splits and logs: +- Dataset-specific accuracies +- Language-stratified metrics +- Midfiller-stratified metrics +- Probability distributions (Weights & Biases histograms) +- Min/max/mean accuracy across categories +- Sample count distributions + +## Key Utility Functions + +- **`truncate_audio_to_last_n_seconds`**: Crops audio to preserve final n seconds (prevents padding inflation) +- **`process_predictions`**: Converts logits to probabilities and binary predictions using 0.5 threshold, with NaN validation +- **`compute_metrics`**: Scikit-learn based metric computation with detailed confusion matrix breakdown + +## Workflows + +### Training Run (`do_training_run`) + +1. Initialize Weights & Biases project "speech-endpointing" +2. Load pretrained Whisper-tiny with custom head +3. Prepare datasets (load, split, wrap) +4. Train with specified hyperparameters +5. 
Save final model and feature extractor +6. Export to ONNX FP32 +7. Return export path + +### Quantization Run (`do_quantization_run`) + +1. Load FP32 ONNX model +2. Create calibration dataset from training split +3. Apply pre-processing (optimization) +4. Execute static INT8 quantization +5. Return quantized model path + +### Benchmark Run (`do_benchmark_run`) + +1. Load feature extractor +2. Prepare test merged dataset +3. For each model path: + - Create benchmark output directory + - Run benchmark with batch size 256 + - Generate markdown report + +## Dependencies + +- PyTorch with ONNX export capabilities +- Transformers (HuggingFace) for Whisper and training +- ONNX Runtime with quantization support +- Scikit-learn for metrics +- Weights & Biases for experiment tracking +- torchcodec (runtime requirement for audio decoding) + +## Error Handling + +- Fast-fail check for missing `torchcodec` dependency +- ONNX model validation after export +- Non-finite value detection in predictions +- Exception logging for failed exports diff --git a/03-finetune-pipecat-pt/references/guides/vogent_turn_80m.md b/03-finetune-pipecat-pt/references/guides/vogent_turn_80m.md new file mode 100644 index 0000000000000000000000000000000000000000..20a50e74ae60a350659b6262bdbe6bac72fc6121 --- /dev/null +++ b/03-finetune-pipecat-pt/references/guides/vogent_turn_80m.md @@ -0,0 +1,171 @@ +--- +title: "Vogent-Turn-80M Model Card" +source: https://huggingface.co/vogent/Vogent-Turn-80M +date: 2026-03-16 +--- + +# Vogent-Turn-80M + +A state-of-the-art multimodal turn detection model for voice AI systems, achieving **94.1% accuracy** by combining acoustic and linguistic signals for real-time conversational applications. 
+ +## Overview + +| Attribute | Value | +|-----------|-------| +| **Developed by** | Vogent AI | +| **Model type** | Multimodal Turn Detection (Binary Classification) | +| **Language(s)** | English | +| **License** | Modified Apache-2.0 (horizontal voice agent platforms cannot set as default) | +| **Base model** | SmolLM2-135M (reduced to 80M parameters using first 12 layers) | +| **Model size** | 79.2M parameters (F32) | +| **Inference speed** | ~7ms on T4 GPU | + +## Model Description + +Vogent-Turn-80M determines when a speaker has finished their turn in a conversation by processing both: +- **Acoustic features** (via Whisper encoder) +- **Semantic context** (via language model) + +This multimodal approach enables real-time, accurate predictions where audio-only or text-only methods fail. + +### Resources + +- **GitHub Repository:** https://github.com/vogent/vogent-turn +- **Technical Report:** https://blog.vogent.ai/posts/voturn-80m-state-of-the-art-turn-detection-for-voice-agents +- **Demo Space:** https://huggingface.co/spaces/vogent/vogent-turn-demo + +## Quick Install + +```bash +git clone https://github.com/vogent/vogent-turn.git +cd vogent-turn +pip install -e . 
+``` + +## Basic Usage + +```python +from vogent_turn import TurnDetector +import soundfile as sf +import urllib.request + +# Initialize detector +detector = TurnDetector(compile_model=True, warmup=True) + +# Download and load audio +audio_url = "https://storage.googleapis.com/voturn-sample-recordings/incomplete_number_sample.wav" +urllib.request.urlretrieve(audio_url, "sample.wav") +audio, sr = sf.read("sample.wav") + +# Run turn detection with conversational context +result = detector.predict( + audio, + prev_line="What is your phone number", + curr_line="My number is 804", + sample_rate=sr, + return_probs=True, +) + +print(f"Turn complete: {result['is_endpoint']}") +print(f"Done speaking probability: {result['prob_endpoint']:.1%}") +``` + +## Available Interfaces + +- **Python Library:** Direct integration with `TurnDetector` class +- **CLI Tool:** `vogent-turn-predict speech.wav --prev "What is your phone number" --curr "My number is 804"` + +## Technical Architecture + +### Model Architecture + +**Components:** +1. **Audio Encoder:** Whisper-Tiny (processes up to 8 seconds of 16kHz audio) +2. **Text Model:** SmolLM-135M (12 layers, ~80M parameters) +3. **Multimodal Fusion:** Audio embeddings projected into LLM's input space +4. **Classifier:** Binary classification head (turn complete/incomplete) + +### Processing Flow + +``` +1. Audio (16kHz PCM) -> Whisper Encoder -> Audio Embeddings (~400 tokens) +2. Text Context -> SmolLM Tokenizer -> Text Embeddings +3. Concatenate embeddings -> SmolLM Transformer -> Last token hidden state +4. 
Linear Classifier -> Softmax -> [P(continue), P(endpoint)] +``` + +## Training Details + +### Preprocessing + +- **Audio:** Last 8 seconds extracted via Whisper-Tiny encoder -> ~400 audio tokens +- **Text:** Full conversational context (assistant and user utterances) +- **Labels:** Binary classification (turn complete/incomplete) +- **Fusion:** Audio embeddings projected into LLM's input space and concatenated with text + +### Training Hyperparameters + +- **Training regime:** fp16 mixed precision +- **Base model:** SmolLM2-135M (first 12 layers) +- **Architecture:** Reduced from 135M to ~80M parameters via layer ablation + +### Training Data + +Diverse dataset combining: +- Human-collected conversational data +- Synthetic conversational data + +## Evaluation Results + +### Performance Metrics + +- **Accuracy:** 94.1% +- **AUPRC:** 0.975 + +### Testing Data + +Internal test set covering diverse conversational scenarios and edge cases where audio-only or text-only approaches fail. + +## Compute Infrastructure + +### Hardware Optimization + +- `torch.compile` with max-autotune mode +- Dynamic tensor shapes without recompilation +- Pre-warmed bucket sizes (64, 128, 256, 512, 1024) + +### Software + +- **Framework:** PyTorch with torch.compile +- **Audio processing:** Whisper encoder (up to 8 seconds) + +## Limitations + +- **English-only** support; turn-taking conventions vary across languages and cultures +- **CPU inference** may be too slow for some real-time applications + +## Citation + +```bibtex +@misc{voturn2025, + title={Vogent-Turn-80M: State-of-the-Art Turn Detection for Voice Agents}, + author={Varadarajan, Vignesh and Vytheeswaran, Jagath}, + year={2025}, + publisher={Vogent AI}, + howpublished={\url{https://huggingface.co/vogent/Vogent-Turn-80M}}, + note={Blog: \url{https://blog.vogent.ai/posts/voturn-80m-state-of-the-art-turn-detection-for-voice-agents}} +} +``` + +## Additional Resources + +- **Full Documentation & Code:** 
https://github.com/vogent/vogent-turn +- **Platform:** https://vogent.ai +- **Enterprise Contact:** j@vogent.ai +- **Issues:** https://github.com/vogent/vogent-turn/issues + +## Upcoming Releases + +- Int8 quantized model for faster CPU deployment +- Multilingual versions +- Domain-specific adaptations diff --git a/03-finetune-pipecat-pt/references/hesitation_turn_taking_l2_review.md b/03-finetune-pipecat-pt/references/hesitation_turn_taking_l2_review.md new file mode 100644 index 0000000000000000000000000000000000000000..4f392acf5412cd5f1efd688ed36bdad9ea17e04b --- /dev/null +++ b/03-finetune-pipecat-pt/references/hesitation_turn_taking_l2_review.md @@ -0,0 +1,453 @@ +# Pausas de Hesitacao, Deteccao de Fim de Turno e Fala L2: Uma Revisao para Fine-Tuning em Portugues + +**Contexto**: BabelCast — traducao simultanea em reunioes. O modelo de deteccao de fim de turno precisa distinguir pausas de hesitacao (falante ainda vai continuar) de fim de turno real (pode comecar a traduzir). O desafio e especialmente critico para falantes de frances aprendendo portugues, que produzem pausas longas de hesitacao frequentemente confundidas com fim de turno. + +**Autores**: Marcos Remar, com assistencia de Claude (Anthropic) +**Data**: Março 2026 + +--- + +## 1. O Problema: Silencio Nao Significa Fim de Turno + +Sistemas comerciais de IA conversacional usam thresholds de silencio entre 700ms e 1000ms para detectar fim de turno (Castillo-Lopez, de Chalendar & Semmar, 2025). Esse approach e fundamentalmente inadequado por dois motivos: + +1. **Humanos sao mais rapidos**: o gap medio entre turnos em conversacao natural e de apenas ~200ms (Levinson & Torreira, 2015, citado em Skantze, 2021). Esperar 700ms+ resulta em respostas percebidas como lentas. + +2. **Pausas dentro de turnos sao frequentemente mais longas que gaps entre turnos** (Skantze, 2021). 
Um falante pode pausar 2-3 segundos no meio de uma frase (buscando uma palavra ou planejando o proximo trecho) e continuar normalmente. Tratar essa pausa como fim de turno causa interrupcao. + +O problema se agrava dramaticamente quando o falante esta usando uma segunda lingua (L2), como demonstrado na literatura a seguir. + +--- + +## 2. Pausas de Hesitacao em Falantes L2 + +### 2.1 Duracao: L2 pausa 39% mais que nativos + +Kosmala & Crible (2022) analisaram pausas preenchidas (filled pauses, FPs) em falantes nativos e nao-nativos de frances, usando o corpus SITAF: + +| Metrica | Nativos | Nao-nativos | Diferenca | +|---------|---------|-------------|-----------| +| Duracao media FP | 378ms (SD=200) | **524ms** (SD=222) | +146ms (+39%) | +| Taxa FP/100 palavras | 4.4 | 5.3 | +0.9 (n.s.) | +| Clustering com outros fluencemas | 72% | **82%** | +10 p.p. | + +A diferenca de **frequencia** nao e significativa — nao-nativos nao produzem mais pausas, mas pausas **significativamente mais longas** (p < .001). A duracao e um indicador mais confiavel de proficiencia que a frequencia. + +Em situacoes de reparo (self-repair), as pausas sao ainda mais extremas. Kosmala (2025) analisou 167 reparos de alunos franceses falando ingles e encontrou: + +- Pausa silenciosa media na fase de edicao: **844ms** (SD=573ms) +- Pausa preenchida media na fase de edicao: **522ms** (SD=571ms) +- 82% dos auto-reparos contem uma fase de edicao (pausa e/ou filler) + +Esses 844ms estao **muito acima** de qualquer threshold de silencio usado em sistemas comerciais (700ms). 
+ +### 2.2 Tipos de pausa: silenciosas dominam + +Cenoz (2000) estudou 15 falantes nativos de espanhol falando ingles como L2, analisando 1.085 pausas nao-juntivas: + +| Tipo | Proporcao | Faixa de duracao | +|------|-----------|------------------| +| Silenciosas | **64%** | 205ms a 11.569ms | +| Preenchidas | 36% | — | + +Distribuicao por duracao das pausas silenciosas: +- 70% entre 200ms e 1.000ms +- 21% entre 1.001ms e 2.000ms +- 7% entre 2.001ms e 4.000ms +- 2% acima de 4.000ms + +**Implicacao critica**: a maioria das hesitacoes sao **silencio puro** — nao contem fillers como "euh" ou "hum" que o modelo poderia usar como pista. Isso torna a deteccao mais dificil, pois o modelo precisa distinguir silencio de hesitacao de silencio de fim de turno usando apenas prosodia. + +### 2.3 Funcao das pausas: planejamento vs busca lexical + +Cenoz (2000) classificou as funcoes das pausas: + +| Funcao | Pausas silenciosas | Pausas preenchidas | +|--------|-------------------|-------------------| +| Planejamento (frases, sintaxe) | 59% | **73%** | +| Busca lexical (palavras) | 36% | 26% | +| Outros | 5% | 1% | + +Pausas preenchidas sao usadas primariamente para **manter o turno** (floor-holding): 77% ocorrem isoladas, sem outros marcadores de hesitacao. Ja pausas silenciosas co-ocorrem com outras estrategias de reparo 54% das vezes. + +### 2.4 Paradoxo de proficiencia + +Um achado contraintuitivo: falantes **mais proficientes** produzem **mais** pausas preenchidas, nao menos (Cenoz, 2000). Na faixa de maior proficiencia, a proporcao muda para 53% silenciosas + 46% preenchidas (vs. 69%/31% nos menos proficientes). Isso ocorre porque falantes avancados aprenderam a usar fillers como estrategia de floor-holding — sinalizam ao interlocutor que ainda estao falando. + +A variacao individual e enorme: a proporcao de pausas preenchidas varia de **4% a 74.5%** entre falantes individuais. + +--- + +## 3. 
Fillers Franceses na Fala L2 + +### 3.1 Transferencia linguistica + +Falantes franceses transferem seus fillers nativos para a L2. Lo (2018) analisou 15 bilingues alemao-frances e mostrou que as propriedades acusticas dos fillers sao distintas por lingua: + +- **Frances "euh"**: duracao mais longa, F1-F3 mais altos (vogal central schwa) +- **Alemao "ah"**: duracao mais curta, F1-F3 mais baixos (vogal aberta) + +Bilingues mantem formas foneticamente distintas para cada lingua, indicando que **a forma do filler revela qual lingua esta sendo processada**. Isso e confirmado por Kosmala (2025): alunos franceses falando ingles usam "euh" (24 instancias) e "eum" (12 instancias), os fillers franceses, nao os ingleses. + +### 3.2 Fillers sao itens lexicais, nao sons universais + +Bottcher & Zellers (2024) analisaram o corpus RUEG (736 falantes, 5 linguas, 4.468 narracoes) e demonstraram que fillers sao **itens lexicais especificos de cada lingua**, nao sons universais de hesitacao. A proporcao nasal ("uhm") vs. nao-nasal ("uh") varia por lingua, genero e idade. + +Falantes heritage (bilingues desde a infancia) produzem **mais fillers no total** devido a carga cognitiva da ativacao de duas linguas. A **tolerancia ao silencio** tambem se transfere da L1: falantes de japones mantem maior tolerancia ao silencio mesmo falando ingles. + +### 3.3 Code-switching: 93% nas fronteiras de unidade + +Beatty-Martinez, Navarro-Torres & Dussias (2020) analisaram 10 bilingues espanhol-ingles e encontraram que **93% das trocas de codigo ocorrem em fronteiras de unidades entonacionais** — os mesmos pontos onde transicoes de turno acontecem. Antes de uma troca, ha **reorganizacao prosodica** (mudanca na velocidade de fala), que pode ser confundida com marcadores prosodicos de fim de turno. + +**Implicacao para o modelo**: quando um frances falando portugues alterna para uma palavra em frances ("Eu preciso de... *comment dit-on*... 
uma tesoura"), a mudanca prosodica antes da troca NAO indica fim de turno. + +--- + +## 4. Pistas Prosodicas de Fim de Turno + +### 4.1 O contorno de F0 e o sinal-chave + +Ekstedt & Skantze (2022) conduziram experimentos de perturbacao prosodica com o modelo VAP para isolar a importancia relativa de cada pista: + +| Pista prosodica | Importancia | +|----------------|-------------| +| Informacao fonetica/espectral | Mais importante no geral | +| **Contorno de F0 (pitch)** | **Mais importante para desambiguar pontos sintaticamente completos** | +| Intensidade | Comparavel ao F0 no geral | +| Normalizacao de duracao | Menos importante individualmente | + +O achado central: **F0 e o principal desambiguador em pontos onde a sintaxe poderia indicar completude**. Um F0 ascendente + silaba final mais longa = sinal de completude de turno. O modelo VAP consegue prever trocas de turno **antes** da completude do enunciado a partir da dinamica prosodica. + +### 4.2 Prosodia antes de pausas preenchidas + +Wu, Didirkova & Simon (2023) analisaram o corpus LOCAS-F (2h38m, >1.000 FPs, kappa inter-anotador = 0.86) e encontraram padroes prosodicos especificos antes de fillers: + +- Queda de F0 de **1-2 semitons** durante pausas preenchidas (17.90 vs. 19.01 ST) +- Reset melodico **negativo significativo** antes da FP (descontinuidade de pitch) +- Efeito de preparacao: queda de **4.66 ST** entre fala nao-preparada (18.89 ST) e preparada (14.24 ST) +- **Nao ha reset significativo** entre a FP e a silaba seguinte + +**Implicacao**: a queda de pitch antes de um filler "euh" e diferente da queda de pitch de fim de frase. O modelo precisa aprender essa distincao sutil — o que o encoder do Whisper pode capturar atraves de representacoes espectrais. 
+ +### 4.3 Resumo das pistas prosodicas de turno + +Skantze (2021) compilou a literatura: + +| Pista | Fim de turno | Pausa de hesitacao | +|-------|-------------|-------------------| +| Pitch (F0) | Queda (declarativa) ou queda-ascensao | Suspenso ou queda menor | +| Intensidade | Diminui gradualmente | Mantem-se ou cai abruptamente | +| Velocidade de fala | Desacelera antes do fim | Pode acelerar antes de parar | +| Alongamento silabico | Silaba final alongada | Ausente | +| Completude sintatica | Frase sintaticamente completa | Frase incompleta | + +--- + +## 5. Estado da Arte em Deteccao de Fim de Turno + +### 5.1 Modelos e suas acuracias + +| Modelo | Tipo | Params | Tamanho | Latencia | Acuracia | Linguas | +|--------|------|--------|---------|----------|----------|---------| +| **Pipecat Smart Turn v3.1** | Audio (Whisper Tiny) | 8M | 8 MB (INT8) | 12ms CPU | 94.7% (EN) | 23 | +| **LiveKit v0.4.1** | Texto (Qwen2.5-0.5B) | 500M | — | — | TNR 87.4% (PT) | 15+ | +| **Vogent-Turn-80M** | Multimodal | 79.2M | — | 7ms T4 | 94.1% (EN) | 1 | +| **Krisp TT v1** | Audio | 6.1M | 65 MB | — | 82% bal.acc. | 1 | +| **VAP** | Audio (CPC+Transformer) | — | — | 14.6ms CPU | 76.2% bal.acc. | 3 | +| **SpeculativeETD** | Audio (GRU+Wav2vec) | 1M+94M | — | 0.26ms+server | 66% real | 1 | + +Nenhum destes modelos foi especificamente otimizado para portugues ou para fala L2. + +### 5.2 Pipecat Smart Turn: resultados por lingua + +| Lingua | Acuracia | FP | FN | +|--------|----------|-----|-----| +| Turco | 97.10% | 1.66% | 1.24% | +| Coreano | 96.85% | 1.12% | 2.02% | +| Japones | 96.76% | 2.04% | 1.20% | +| Frances | 96.01% | 1.60% | 2.39% | +| **Portugues** | **95.42%** | **2.79%** | **1.79%** | +| Ingles | 94.31% | 2.64% | 3.06% | +| Espanhol | 91.97% | 4.48% | 3.55% | + +Portugues esta atras de frances, japones, coreano e turco. 
A taxa de falsos positivos (2.79%) indica que o modelo diz "terminou" quando nao terminou em quase 3% dos casos — problematico para traducao simultanea. + +### 5.3 O gap sintetico → real: o maior risco + +Ok, Yoo & Lee (2025) demonstraram um colapso devastador quando modelos treinados em dados sinteticos (TTS) sao avaliados em fala real: + +| Modelo | F1 sintetico | F1 real | Perda | +|--------|-------------|---------|-------| +| Wav2vec 2.0 | 94.7% | **30.3%** | -64.4 p.p. | +| SpeculativeETD | 88.9 IoU | **16.4 IoU** | -72.5 p.p. | +| VAP | 87.7 IoU | **10.7 IoU** | -77.0 p.p. | + +**Todos os modelos colapsam** ao sair de dados sinteticos para conversacao real. A melhoria de v3.0 para v3.1 do Pipecat veio justamente de adicionar **audio humano real** de tres parceiros especializados (Liva AI, Midcentury, MundoAI). + +--- + +## 6. Tecnicas de Treinamento Validadas + +### 6.1 Focal Loss (Lin et al., 2017) + +A Focal Loss resolve o desbalanceamento extremo de classes (a maioria dos frames e "fala em andamento"; fim de turno e raro): + +``` +FL(p_t) = -α_t · (1 - p_t)^γ · log(p_t) +``` + +Com gamma=2, a perda para exemplos bem classificados (p_t=0.9) e **100x menor** que cross-entropy padrao; com p_t=0.968, **1.000x menor**. Os melhores hiperparametros: **gamma=2, alpha=0.25** (Lin et al., 2017, Tabela 1a). + +### 6.2 Knowledge Distillation (LiveKit) + +O LiveKit treinou um modelo professor (Qwen2.5-7B) e destilou para um aluno (Qwen2.5-0.5B), que converge apos ~1.500 steps. O resultado: reducao de 39% nos falsos positivos de interrupcao. Para portugues especificamente, a melhoria relativa foi de **45.97%** — a segunda maior entre todas as linguas. + +### 6.3 Dados curtos e ruidosos (Pipecat v3.2) + +O Pipecat v3.2 adicionou dois tipos de dados ao treinamento: +1. **Respostas curtas** ("yes", "no", "ok"): reduziu erros de classificacao em **40%** +2. 
**Ruido de fundo** (cafe, escritorio, CC-0 Freesound): melhorou robustez + +### 6.4 Injecao de fillers e pausas (SpeculativeETD) + +Ok et al. (2025) testaram tres variantes de dados sinteticos: +- V1: TTS direto (baseline) +- V2: Pausas de hesitacao estendidas para 1.5-3.0 segundos +- **V3: Fillers injetados ("um", "uh") em posicoes aleatorias + pausas apos** + +V3 foi a melhor variante, validando a abordagem de gerar dados com fillers inseridos por LLM. + +### 6.5 Transfer learning cross-lingual + +Castillo-Lopez et al. (2025) documentam que VAP pre-treinado em ingles (Switchboard) e fine-tunado em japones **supera** o treinamento direto em japones. Isso valida a abordagem de partir do Pipecat (pre-treinado em 23 linguas) e fine-tunar para portugues. + +--- + +## 7. Pesquisa: Modelos de Turn-Taking para Aprendizado de Idiomas (Marco 2026) + +**Data da pesquisa**: 16 de marco de 2026 +**Objetivo**: identificar modelos e tecnicas especificas para turn-taking em contexto de aprendizado de idiomas com avatar conversacional + +### 7.1 Panorama: nenhum modelo especifico existe + +Nao existe nenhum modelo open-source de turn-taking treinado especificamente para aprendizes L2. 
Os produtos comerciais usam abordagens genericas: + +| Produto | Abordagem | Limitacao | +|---------|-----------|-----------| +| **Praktika** (OpenAI) | GPT-5.2 Realtime API — turn detection nativo OpenAI | Caixa preta, sem adaptacao L2 | +| **ConversAR** (Meta Quest) | Timeout fixo 2000ms + "periodo infinito de pensamento" | Nao distingue hesitacao de fim de turno | +| **Gliglish** | ChatGPT + speech recognition generico | Sem turn detection especializado | +| **ELSA Speak** | Modelo proprietario treinado com sotaques variados | Foco em pronuncia, nao turn-taking | +| **TalkPal/SpeakPal/Talkio** | GPT + NLP generico | Sem modelo de audio para end-of-turn | +| **Hume EVI** | Prosodia (tom de voz) para turn detection | Comercial, sem foco em L2 | +| **Deepgram Flux** | Modelo fusionado (ASR + turn detection) com `eot_threshold` configuravel | So ingles, sem adaptacao para proficiencia | + +### 7.2 Modelos de pesquisa relevantes + +- **VAP (Voice Activity Projection)** — multilingual (EN/ZH/JA), fine-tuning por lingua melhora significativamente, mas nao cobre L2 ou non-native speakers (Inoue et al., 2024) +- **Speak & Improve Corpus 2025** — 340 horas de fala L2 ingles com anotacao de disfluencias e CEFR scores (A2-C1, maioria B1-B2) (Knill et al., 2025) +- **Whisper + LoRA para hesitation tagging** — fine-tune do Whisper Large V3 com tags de hesitacao acusticamente precisas: **11.3% melhoria no WER** para fala L2 (arXiv:2506.04076) +- **Deepgram Flux** — modelo fusionado (ASR + turn detection em um unico modelo); ~260ms end-of-turn detection, ~30% menos interrupcoes vs pipeline tradicional (Deepgram, 2025) +- **Survey IWSDS 2025** — 72% dos trabalhos de turn-taking NAO comparam com metodos anteriores, sugerindo falta de benchmarks estabelecidos (Castillo-Lopez et al., 2025) + +### 7.3 Tecnicas identificadas para melhorar o fine-tuning + +**1. 
Threshold duplo (Eager + Final) — inspirado no Deepgram Flux** + +Em vez de um unico threshold, usar dois: +- **Eager threshold (0.3-0.5)**: o avatar comeca a preparar resposta especulativamente (inicia geracao LLM) +- **Final threshold (0.7+)**: confirma fim de turno e fala + +Economia de latencia estimada: 150-250ms no inicio da resposta, sem interromper o falante. + +Deepgram Flux implementa isso como `eot_threshold` (padrao 0.7) e `eager_eot_threshold` (0.3-0.5), com parametro `eot_timeout_ms` para configurar sensibilidade. + +**2. Custo assimetrico (FP >> FN)** + +Para um tutor de idiomas, **interromper o aluno e muito pior que esperar demais**: +- ConversAR (2025): "periodo infinito de pensamento" — learners controlam quando falar +- Praktika: timeout estendido para fala L2 fragmentada e com sotaque +- Implementacao: `fp_penalty=2.0` no focal loss — erros de interrupcao custam 2x mais + +**3. Dados L2 reais — Speak & Improve Corpus** + +O corpus Speak & Improve 2025 (Knill et al., 2025) tem: +- 340 horas de audio L2 ingles +- Anotacao de disfluencias (false starts, repeticoes, hesitacoes) +- Scores CEFR: B1 (18.3%), B1+ (25.3%), B2 (25.1%), B2+ (18.3%) +- Embora seja ingles L2, padroes de hesitacao L2 sao transferiveis entre linguas (Cenoz, 2000) + +Fine-tuning do Whisper com anotacao precisa de hesitacoes (tags "um"/"uh" acusticamente alinhadas) mostrou 11.3% melhoria no WER vs ignorar hesitacoes. + +**4. 
Proficiency-aware turn-taking (futuro)**
+
+Nenhum sistema implementa isso ainda, mas e logico:
+- Aluno A1: pausa **3-5x mais** que B2
+- Aluno B2: usa fillers como estrategia de floor-holding (Cenoz, 2000)
+- De Jong & Bosker: threshold otimo de 250ms para pausa L2 em holandes
+- Shea & Leonard (2019): threshold de **1000ms** para espanhol L2
+- Implementacao futura: input de nivel CEFR ao modelo, ou adaptar threshold baseado em perfil do falante
+
+### 7.4 Sistema de backchannel para avatar de aprendizado
+
+Um problema critico identificado: se o avatar espera 3 segundos em silencio, o aprendiz pensa que o sistema travou e para de falar. A solucao e um sistema de **backchannel signals** que mostra ao aprendiz que o avatar esta ouvindo, sem tomar o turno.
+
+| Tempo de silencio | Acao do avatar | Tipo |
+|-------------------|----------------|------|
+| 0-600ms | Nada (normal) | — |
+| 600ms-1.5s | Aceno de cabeca, olhar atento | Backchannel visual |
+| 1.5s-3.0s | "Mhm...", "Continue...", "Uhum..." | Backchannel verbal |
+| 3.0s+ | "Sem pressa, pode pensar...", "Prenez votre temps..." | Encorajamento |
+
+O sistema integra o threshold duplo do modelo com os backchannels:
+- **Eager threshold** atingido → avatar da backchannel + inicia geracao LLM especulativa
+- **Final threshold** atingido → avatar fala a resposta
+
+Presets por nivel CEFR:
+
+| CEFR | Eager | Final | BC Visual | BC Verbal | Encorajamento |
+|------|-------|-------|-----------|-----------|---------------|
+| A1 | 0.50 | **0.80** | 500ms | 1200ms | 2500ms |
+| A2 | 0.45 | 0.75 | 550ms | 1300ms | 2800ms |
+| B1 | 0.40 | 0.70 | 600ms | 1500ms | 3000ms |
+| B2 | 0.35 | 0.65 | 600ms | 1800ms | 4000ms |
+| C1 | 0.35 | **0.60** | 600ms | 2000ms | 5000ms |
+
+O avatar e mais paciente com A1/A2 (threshold final 0.80 = raramente interrompe) e mais responsivo com B2/C1 (0.60-0.65 = conversa mais natural). Implementado em `06_inference.py`.
+
+### 7.5 Conclusao da pesquisa
+
+O trabalho do BabelCast e confirmado como **pioneiro**: nao existe modelo de turn-taking para aprendizes L2, muito menos para francofonos falando portugues. As melhorias implementadas (custo assimetrico, threshold duplo, dados L2 reais, backchannel por CEFR) trazem o modelo mais proximo do comportamento ideal de um tutor paciente.
+
+---
+
+## 8. Implicacoes para o Fine-Tuning BabelCast
+
+### 8.1 O que o modelo precisa aprender
+
+Com base na literatura, os seguintes padroes sao criticos para o modelo distinguir:
+
+| Padrao | Duracao tipica | Label | Pista prosodica |
+|--------|---------------|-------|----------------|
+| Fim de turno real | silencio 200-500ms | COMPLETO | F0 caindo, intensidade caindo, silaba final alongada |
+| Hesitacao nativa (PT-BR) | FP 378ms + silencio 200-1000ms | INCOMPLETO | F0 suspenso, "hum"/"tipo"/"ne" |
+| Hesitacao L2 (francofono) | FP **524ms** + silencio **844ms** | INCOMPLETO | F0 suspenso, "euh"/"alors"/"comment dire" |
+| Busca de palavra L2 | silencio 1000-3000ms | INCOMPLETO | Silencio puro, sem pista prosodica clara |
+| Code-switching | prosodic reset + silencio 500-1500ms | INCOMPLETO | Mudanca de velocidade antes da troca |
+| Resposta curta completa | "Sim." + silencio 200ms | COMPLETO | F0 caindo, curta duracao |
+| Resposta curta incompleta | "Sim, mas..." + silencio | INCOMPLETO | F0 suspenso ou ascendente |
+
+### 8.2 Dados necessarios
+
+1. **Audio real de portugues** (CORAA MUPE 365h, NURC-SP 239h) — evitar o gap sintetico→real
+2. **TTS com fillers** brasileiros ("hum", "tipo", "ne") e franceses ("euh", "alors") inseridos por LLM
+3. **Pausas longas de hesitacao** (1.5-3.0s) injetadas em amostras incompletas
+4. **Respostas curtas** ("sim", "nao", "ok", "tá bom") com variantes completas e incompletas
+5. **Code-switching** frances-portugues em fronteiras de unidade
+6.
**Ruido de fundo** real (cafe, escritorio) — CC-0 Freesound + +### 8.3 Escolhas arquiteturais validadas + +- **Whisper Tiny encoder** (39M params, 8s janela, 384-dim): mesma arquitetura do Pipecat, captura prosodia sem decodificar texto +- **Attention pooling**: aprende quais frames perto do silencio sao mais informativos (Ekstedt & Skantze, 2022) +- **Focal Loss** (gamma=2, alpha=0.25): calibracao implícita sem label smoothing (EMNLP 2022) +- **INT8 quantizacao**: 32MB → 8MB, 12ms CPU (Pipecat deploy pipeline) + +### 8.4 Lacuna na literatura + +**Nenhum paper foi encontrado especificamente sobre deteccao de fim de turno em fala L2**, e especialmente nao sobre francofonos falando portugues. A intersecao de: +- Deteccao de fim de turno (ML/audio) +- Pausas de hesitacao em L2 (linguistica) +- Fala francofona em portugues (fonologia) + +...e uma area completamente inexplorada. O trabalho do BabelCast e, ate onde sabemos, o primeiro a atacar esse problema especifico. + +--- + +## Referencias + +### Papers Academicos + +1. Beatty-Martinez, A. L., Navarro-Torres, C. A., & Dussias, P. E. (2020). Codeswitching: A bilingual toolkit for opportunistic speech planning. *Frontiers in Psychology*, 11, 1699. + +2. Bottcher, A. & Zellers, M. (2024). Do you say uh or uhm? A cross-linguistic approach to filler particle use in heritage and majority speakers across three languages. *Frontiers in Psychology*, 15, 1358182. + +3. Castillo-Lopez, V., de Chalendar, G., & Semmar, N. (2025). A survey of recent advances on turn-taking modeling in spoken dialogue systems. *arXiv:2503.xxxxx*. + +4. Cenoz, J. (2000). Pauses and hesitation phenomena in second language production. *ITL - International Journal of Applied Linguistics*, 127-128, 53-69. + +5. Christodoulides, G. & Avanzi, M. (2014). DisMo: A morphosyntactic, disfluency and multi-word unit annotator. *Proceedings of LREC 2014*. + +6. Christodoulides, G. & Avanzi, M. (2015). 
Automatic detection and annotation of disfluencies in spoken French corpora. *Proceedings of Interspeech 2015*. + +7. Ekstedt, E. & Skantze, G. (2022). How much does prosody help turn-taking? Investigations using voice activity projection models. *Proceedings of Interspeech 2022*. + +8. Inoue, K., Jiang, D., Ekstedt, E., Kawahara, T., & Skantze, G. (2024). Real-time and continuous turn-taking prediction using voice activity projection. *arXiv:2401.04868*. + +9. Inoue, K., Jiang, D., Ekstedt, E., Kawahara, T., & Skantze, G. (2024). Multilingual turn-taking prediction using voice activity projection. *Proceedings of LREC-COLING 2024*. + +10. Kosmala, L. & Crible, L. (2022). The dual status of filled pauses: Evidence from genre, proficiency and co-occurrence. *Language and Speech*, 65(4), 1-25. + +11. Kosmala, L. (2023). Exploring the status of filled pauses as pragmatic markers: The role of gaze and gesture. *Journal of Pragmatics*, 212, 1-15. + +12. Kosmala, L. (2025). Multimodal self- and other-initiated repairs in L2 peer interactions. *Proceedings of DiSS 2025*, Lisbon. + +13. Lin, T.-Y., Goyal, P., Girshick, R., He, K., & Dollar, P. (2017). Focal loss for dense object detection. *Proceedings of ICCV 2017*. arXiv:1708.02002. + +14. Lo, S. L. (2018). Between ah(m) and euh(m): Filled pauses in German-French bilinguals. *BAAP 2018 Poster Presentation*. + +15. Ok, J., Yoo, I. C., & Lee, Y. (2025). Speculative end-turn detector: Revisiting an efficient design for low-latency end-of-turn detection. *arXiv:2503.23439*. + +16. Peters, M. (2017). L2 fluency development in French. *Ph.D. Thesis*. + +17. Raux, A. & Eskenazi, M. (2009). A finite-state turn-taking model for spoken dialog systems. *Proceedings of NAACL-HLT 2009*. + +18. Skantze, G. (2021). Turn-taking in conversational systems and human-robot interaction: A review. *Computer Speech & Language*, 67, 101178. + +19. Wu, X., Didirkova, I., & Simon, A. C. (2023). 
Disfluencies in continuous speech in French: Prosodic parameters of filled pauses and vowel lengthening. *Proceedings of ICPhS 2023*.
+
+20. Knill, K., et al. (2025). Speak & Improve Corpus 2025: an L2 English Speech Corpus for Language Assessment and Feedback. *arXiv:2412.11986*.
+
+21. Saeki, T., et al. (2025). Acoustically precise hesitation tagging is essential for end-to-end verbatim transcription systems. *arXiv:2506.04076*.
+
+22. Gamboa, H. & Wohlgenannt, G. (2025). ConversAR: Practicing a second language without fear — Mixed reality agents for interactive group conversation. *arXiv:2510.08227*.
+
+23. De Jong, N. & Bosker, H. R. (2013). Choosing a threshold for silent pauses to measure second language fluency. *The 6th Workshop on Disfluency in Spontaneous Speech (DiSS)*.
+
+24. Shea, C. & Leonard, K. (2019). Evaluating measures of pausing for second language fluency research. *Journal of Second Language Pronunciation*, 5(2), 254-277.
+
+### Modelos e Blogs Tecnicos
+
+25. Daily/Pipecat. (2025). Announcing Smart Turn v3, with CPU inference in just 12ms. https://www.daily.co/blog/announcing-smart-turn-v3-with-cpu-inference-in-just-12ms/
+
+26. Daily/Pipecat. (2025). Improved accuracy in Smart Turn v3.1. https://www.daily.co/blog/improved-accuracy-in-smart-turn-v3-1/
+
+27. Daily/Pipecat. (2026). Smart Turn v3.2: Handling noisy environments and short responses. https://www.daily.co/blog/smart-turn-v3-2-handling-noisy-environments-and-short-responses/
+
+28. LiveKit. (2025). Improved end-of-turn model cuts voice AI interruptions 39%. https://livekit.com/blog/improved-end-of-turn-model-cuts-voice-ai-interruptions-39/
+
+29. LiveKit. (2025). Using a transformer to improve end-of-turn detection. https://blog.livekit.io/using-a-transformer-to-improve-end-of-turn-detection
+
+30. Krisp. (2025). Audio-only 6M weights turn-taking model for voice AI agents. https://krisp.ai/blog/turn-taking-for-voice-ai/
+
+31. Deepgram. (2025). Evaluating end-of-turn detection models. https://deepgram.com/learn/evaluating-end-of-turn-detection-models
+
+32. Vogent. (2025). Vogent-Turn-80M model card. https://huggingface.co/vogent/Vogent-Turn-80M
+
+33. Deepgram. (2025). Introducing Flux: Conversational Speech Recognition. https://deepgram.com/learn/introducing-flux-conversational-speech-recognition
+
+34. Hume AI. (2025). Empathic Voice Interface (EVI) documentation. https://dev.hume.ai/docs/speech-to-speech-evi/overview
+
+35. OpenAI. (2025). Inside Praktika's conversational approach to language learning. https://openai.com/index/praktika/
+
+36. Tavus. (2025). The Complete Guide to AI Turn-Taking. https://www.tavus.io/post/ai-turn-taking
+
+### Datasets
+
+37. Pipecat-AI. (2026). Smart Turn Data v3.2 Training Set. HuggingFace: `pipecat-ai/smart-turn-data-v3.2-train`. 270,946 samples, 23 languages.
+
+38. NILC-NLP. CORAA-MUPE-ASR. HuggingFace: `nilc-nlp/CORAA-MUPE-ASR`. 365h, Portuguese interviews.
+
+39. NILC-NLP. CORAA NURC-SP Audio Corpus. HuggingFace: `nilc-nlp/CORAA-NURC-SP-Audio-Corpus`. 239h, Portuguese dialogues.
+
+40. Cambridge English. Speak & Improve Corpus 2025. 340h, L2 English learner speech with disfluency annotations and CEFR scores.
diff --git a/03-finetune-pipecat-pt/references/papers/finite_state_turn_taking_raux_2009.md b/03-finetune-pipecat-pt/references/papers/finite_state_turn_taking_raux_2009.md new file mode 100644 index 0000000000000000000000000000000000000000..d33eab39300275a1d78534a7515862e363fdb4c8 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/finite_state_turn_taking_raux_2009.md @@ -0,0 +1,158 @@ +--- +title: "A Finite-State Turn-Taking Model for Spoken Dialog Systems" +authors: + - Antoine Raux + - Maxine Eskenazi +year: 2009 +source: https://aclanthology.org/N09-1071/ +date_converted: 2026-03-16 +--- + +## Abstract + +This paper introduces the Finite-State Turn-Taking Machine (FSTTM), a new model to control the turn-taking behavior of conversational agents. Based on a non-deterministic finite-state machine, the FSTTM uses a cost matrix and decision-theoretic principles to select a turn-taking action at any time. The authors show how the model can be applied to the problem of end-of-turn detection. Evaluation results on a deployed spoken dialog system show that the FSTTM provides significantly higher responsiveness than previous approaches. + +**Note**: The PDF file `finite_state_turn_taking_raux_2009.pdf` in this directory is corrupted (contains an unrelated physics paper). Content for this summary was sourced from the actual paper PDF downloaded from ACL Anthology. + +## Key Contributions + +1. **FSTTM model**: A principled, unified framework for turn-taking control based on a 6-state non-deterministic finite-state machine with decision-theoretic action selection. +2. **Data-driven optimization**: Unlike prior hand-coded models, the FSTTM's cost parameters can be optimized from data using logistic regression. +3. **Anytime endpointing**: The model can make end-of-turn decisions both during pauses and during speech (before a pause is even detected by VAD), enabling faster response times. +4. 
**Deployed evaluation**: Tested on a real, publicly deployed bus information system, not just in lab conditions. + +## Architecture / Method Details + +### Six-State Finite-State Model + +Based on Jaffe & Feldstein (1970) and Brady (1969), the FSTTM uses six states representing who holds the conversational floor: + +| State | Description | +|-------|-------------| +| **USER** | User has the floor (obligation or intention to speak) | +| **SYSTEM** | System has the floor | +| **FREES** | Floor is free, following a SYSTEM state | +| **FREEU** | Floor is free, following a USER state | +| **BOTHS** | Both claim the floor, following a SYSTEM state | +| **BOTHU** | Both claim the floor, following a USER state | + +States are defined in terms of **intentions and obligations**, not surface-level speech/silence observations. For example, USER state persists during mid-turn pauses. + +### Four Turn-Taking Actions + +At any time, the system can take one of four actions: +- **Grab (G)**: Claim the floor +- **Release (R)**: Relinquish the floor +- **Keep (K)**: Maintain floor claim +- **Wait (W)**: Remain silent without claiming floor + +### Turn-Taking Phenomena Captured + +The model formalizes common turn-taking patterns as 2-step state transitions: + +1. **Turn transitions with gap**: `SYSTEM --(R,W)--> FREES --(W,G)--> USER` (most common) +2. **Turn transitions with overlap**: `SYSTEM --(K,G)--> BOTHS --(R,K)--> USER` (barge-in) +3. **Failed interruptions**: `USER --(G,K)--> BOTHU --(R,K)--> USER` (system interrupts then backs off) +4. **Time outs**: `SYSTEM --(R,W)--> FREES --(G,W)--> SYSTEM` (user doesn't respond) + +### Decision-Theoretic Action Selection + +The optimal action minimizes expected cost: + +``` +C(A) = sum over S in States: P(s=S|O) * C(A, S) +``` + +Where `P(s=S|O)` is estimated from observable features and `C(A,S)` comes from a cost matrix. + +### Cost Matrix Design Principles + +Derived from Sacks et al. 
(1974) -- "participants minimize gaps and overlaps": + +1. Actions that resolve gaps/overlaps have zero cost. +2. Actions that create unwanted gaps/overlaps have a constant cost parameter. +3. Actions that maintain gaps/overlaps have cost proportional to time in that state. + +Four cost parameters: +- **CS**: Cost of false interruption (interrupting system prompt when user is not claiming floor) +- **CO(tau)**: Cost of remaining in overlap for tau ms +- **CU**: Cost of cut-in (grabbing floor when user holds it) +- **CG(tau)**: Cost of remaining in a gap for tau ms (set as CG^p * tau) + +### Probability Estimation + +The key estimation task is `P(FREEU | Ot)` -- the probability that the user has released the floor. + +**At pauses** (VAD detects silence): Uses Bayes rule combining: +- `P(F|O)`: Prior probability of floor release from pause-onset features (logistic regression) +- `P(d >= tau | O, U)`: Probability pause lasts at least tau ms given user holds floor (exponential distribution) + +**During speech** (before VAD pause detection): Separate logistic regression on each ASR partial hypothesis, enabling endpointing before a full pause is detected. + +### Features Used + +All features are automatically extractable at runtime: +- **Dialog state**: Open question / Closed question / Confirmation +- **Turn-taking features**: Whether current utterance is a barge-in +- **Semantic features**: From dialog state and partial ASR hypotheses +- **Boundary LM score**: Ratio of log-likelihood of hypothesis being complete vs. incomplete (trained on ASR output, no human transcription needed). **Most informative feature across all states.** +- **Average words per utterance** so far +- **Confirmation markers**: "YES", "SURE", etc. +- **Pause duration** within partial hypothesis (0-200ms range) + +## Experimental Results + +### Logistic Regression for P(F|O) + +Trained with stepwise regression, 10-fold cross-validation on 586 dialogs (2008 corpus): + +| Dialog State | Pause: Class. 
Error | Pause: Log-Likelihood | Speech: Class. Error | Speech: Log-Likelihood | +|---|---|---|---|---| +| Open question | 35% (vs 38% baseline) | -0.61 (vs -0.66) | 17% (vs 20%) | -0.40 (vs -0.50) | +| Closed question | 26% (vs 25% baseline) | -0.50 (vs -0.56) | 22% (vs 32%) | -0.49 (vs -0.63) | +| Confirmation | 12% (vs 12% baseline) | -0.30 (vs -0.36) | 17% (vs 36%) | -0.40 (vs -0.65) | + +The "in speech" model achieves much larger gains, especially for Closed questions and Confirmations -- classification error drops from 32% to 22% and 36% to 17% respectively. + +### Batch Evaluation: Latency vs. Cut-in Rate + +**In-pause FSTTM vs. baselines** (2007 corpus): +- FSTTM outperforms fixed threshold baseline by up to **29.5% latency reduction**. +- Slight improvement over Ferrer et al. (2003) reimplementation. + +**Anytime-FSTTM vs. in-pause FSTTM** (2008 corpus): +- At 5% cut-in rate: anytime-FSTTM yields latencies **17% shorter** than in-pause-FSTTM, and **40% shorter** than fixed threshold baseline. +- **30-40% of turns are endpointed before the pause is detected by VAD** (during speech). + +### Live Evaluation (Deployed System) + +A/B test over 10 days: 171 FSTTM dialogs vs. 148 fixed-threshold control dialogs. + +Settings: `CG^p = 1, CG^s = 500, CU = 5000` (FSTTM) vs. 555ms fixed threshold (control). Both calibrated for ~6.3% cut-in rate. + +| Metric | FSTTM | Fixed Threshold | +|--------|-------|-----------------| +| Average latency | **320ms** | 513ms | +| Cut-in rate | **4.8%** | 6.3% | + +Results: +- **193ms latency reduction** (p < 0.05, statistically significant). +- 1.5% cut-in rate reduction (not statistically significant, but directionally correct). + +## Relevance to Turn-Taking / End-of-Turn Detection + +The FSTTM is a foundational reference for BabelCast's end-of-turn detection: + +1. **Decision-theoretic framework**: The cost-based approach directly applies to our system. 
We can define costs for premature LLM invocation (wasted compute, incorrect partial translation) vs. delayed response (poor user experience), and optimize the trade-off. + +2. **Anytime endpointing**: The insight that 30-40% of turns can be endpointed *before the pause is detected by VAD* is powerful. In our pipeline, this means we could start the translation process before the speaker's pause is even fully formed, dramatically reducing perceived latency. + +3. **Boundary LM score**: The most informative feature in Raux's system was a language model score measuring syntactic completeness. We already have ASR partial results from Whisper -- we could compute a similar feature to predict turn completeness without waiting for silence. + +4. **Cost parameter tuning**: The four-parameter cost matrix (CS, CO, CU, CG) provides a compact, interpretable way to tune system behavior. We could expose these as configuration parameters in our Pipecat pipeline, allowing adjustment of responsiveness vs. interruption rate. + +5. **Real-world validation**: This is one of the few ETD papers validated on a deployed system with real users, not just lab data. The 193ms latency improvement is a concrete, practically meaningful result. + +6. **Simplicity**: The FSTTM is mathematically elegant and computationally trivial -- just a few probability estimates and a cost comparison. It could be implemented alongside our Silero VAD as an additional decision layer with negligible overhead. + +7. **Limitation for our case**: The system was designed for a task-oriented telephone dialog (bus information), which has more predictable turn structures than the open-domain meeting conversations we handle. The boundary LM approach would need adaptation for our more varied content. 
diff --git a/03-finetune-pipecat-pt/references/papers/finite_state_turn_taking_raux_2009.pdf b/03-finetune-pipecat-pt/references/papers/finite_state_turn_taking_raux_2009.pdf new file mode 100644 index 0000000000000000000000000000000000000000..fd84be03ad5afecdd770c51644cb43fdea68b44d --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/finite_state_turn_taking_raux_2009.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaa1054cf8f25ccdfd847cd0a323e8d0473867404c5895551a1ec9889e4f7d97 +size 254541 diff --git a/03-finetune-pipecat-pt/references/papers/focal_loss_lin_2017.md b/03-finetune-pipecat-pt/references/papers/focal_loss_lin_2017.md new file mode 100644 index 0000000000000000000000000000000000000000..e5c1c8a5e1c157fdb9d7620efab095c278750e74 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/focal_loss_lin_2017.md @@ -0,0 +1,103 @@ +--- +title: "Focal Loss for Dense Object Detection" +authors: + - Tsung-Yi Lin + - Priya Goyal + - Ross Girshick + - Kaiming He + - Piotr Dollar +year: 2017 +source: https://arxiv.org/abs/1708.02002 +date_converted: 2026-03-16 +--- + +## Abstract + +The highest accuracy object detectors to date are based on a two-stage approach popularized by R-CNN, where a classifier is applied to a sparse set of candidate object locations. In contrast, one-stage detectors that are applied over a regular, dense sampling of possible object locations have the potential to be faster and simpler, but have trailed the accuracy of two-stage detectors thus far. The authors discover that the extreme foreground-background class imbalance encountered during training of dense detectors is the central cause. They propose to address this class imbalance by reshaping the standard cross entropy loss such that it down-weights the loss assigned to well-classified examples. 
The novel Focal Loss focuses training on a sparse set of hard examples and prevents the vast number of easy negatives from overwhelming the detector during training. Using focal loss, their one-stage RetinaNet detector matches the speed of previous one-stage detectors while surpassing the accuracy of all existing state-of-the-art two-stage detectors. + +## Key Contributions + +1. **Focal Loss function**: A dynamically scaled cross entropy loss where the scaling factor decays to zero as confidence in the correct class increases, automatically down-weighting easy examples and focusing on hard examples. +2. **RetinaNet**: A simple one-stage dense object detector built on Feature Pyramid Networks (FPN) that, when trained with focal loss, achieves state-of-the-art results. +3. **Identification of class imbalance** as the central obstacle preventing one-stage detectors from matching two-stage detector accuracy. + +## Method Details + +### Focal Loss Definition + +Standard cross entropy loss: + +``` +CE(pt) = -log(pt) +``` + +Focal loss adds a modulating factor `(1 - pt)^gamma`: + +``` +FL(pt) = -alpha_t * (1 - pt)^gamma * log(pt) +``` + +Key properties: +- When `gamma = 0`, FL is equivalent to CE. +- When an example is misclassified (`pt` is small), the modulating factor is near 1 and the loss is unaffected. +- As `pt -> 1`, the factor goes to 0, down-weighting well-classified examples. +- With `gamma = 2`, an example classified with `pt = 0.9` has 100x lower loss compared to CE; with `pt ~ 0.968`, 1000x lower loss. +- Best setting found: `gamma = 2`, `alpha = 0.25`. + +### RetinaNet Architecture + +- **Backbone**: ResNet + Feature Pyramid Network (FPN), pyramid levels P3-P7 with C=256 channels. +- **Classification subnet**: Small FCN with four 3x3 conv layers (256 filters, ReLU) + final 3x3 conv with KA filters + sigmoid. Shared across all pyramid levels. 
+- **Box regression subnet**: Identical structure to classification subnet, terminates in 4A linear outputs per location. Class-agnostic. +- **Anchors**: 9 anchors per level (3 aspect ratios x 3 scales), covering 32-813 pixel range. +- **Initialization**: Prior probability pi=0.01 for foreground class at initialization to prevent instability from background class dominance. + +## Experimental Results + +### Ablation Studies (ResNet-50-FPN, COCO minival) + +| Loss | gamma | alpha | AP | AP50 | AP75 | +|------|-------|-------|------|------|------| +| CE (balanced) | 0 | 0.75 | 31.1 | 49.4 | 33.0 | +| FL | 0.5 | 0.75 | 31.4 | 49.9 | 33.1 | +| FL | 1.0 | 0.50 | 32.9 | 51.7 | 35.2 | +| **FL** | **2.0** | **0.25** | **34.0** | **52.5** | **36.5** | +| FL | 5.0 | 0.25 | 32.2 | 49.6 | 34.8 | + +### FL vs. OHEM (ResNet-101-FPN) + +| Method | AP | AP50 | AP75 | +|--------|------|------|------| +| OHEM (best) | 32.8 | 50.3 | 35.1 | +| FL | **36.0** | **54.9** | **38.7** | + +FL outperforms the best OHEM variant by **3.2 AP points**. + +### State-of-the-Art Comparison (COCO test-dev) + +| Method | Backbone | AP | AP50 | AP75 | +|--------|----------|------|------|------| +| Faster R-CNN w FPN | ResNet-101-FPN | 36.2 | 59.1 | 39.0 | +| Faster R-CNN by G-RMI | Inception-ResNet-v2 | 34.7 | 55.5 | 36.7 | +| DSSD513 | ResNet-101-DSSD | 33.2 | 53.3 | 35.2 | +| **RetinaNet** | **ResNet-101-FPN** | **39.1** | **59.1** | **42.3** | +| **RetinaNet** | **ResNeXt-101-FPN** | **40.8** | **61.1** | **44.1** | + +### Speed vs. Accuracy + +- RetinaNet-101-600: 122ms inference, matching Faster R-CNN accuracy (36.0 AP) at similar speed. +- RetinaNet-101-800: 198ms inference, 37.8 AP -- surpassing all prior methods. + +## Relevance to Turn-Taking / End-of-Turn Detection + +Focal loss is directly applicable to end-of-turn detection for BabelCast: + +1. 
**Class imbalance problem**: End-of-turn detection faces severe class imbalance -- the vast majority of audio frames are "continuing speech" (easy negatives), while actual turn-end boundaries are rare events. This is analogous to the foreground-background imbalance in object detection. + +2. **Down-weighting easy examples**: In a streaming ETD system, most 100ms frames are clearly mid-utterance. Focal loss would prevent these trivially classified frames from dominating the gradient, focusing learning on the ambiguous boundary cases (pauses vs. turn-ends). + +3. **Drop-in replacement**: Focal loss is a simple modification to standard cross entropy -- just adding `(1 - pt)^gamma` -- making it trivial to integrate into any binary or ternary classifier for end-of-turn detection (e.g., the GRU or Wav2Vec models in SpeculativeETD). + +4. **Hyperparameter robustness**: The paper shows `gamma = 2, alpha = 0.25` works well across settings, reducing the need for extensive tuning. + +5. **No sampling needed**: Unlike OHEM or hard negative mining, focal loss operates on all examples naturally, which is important for real-time streaming where we process every frame. 
diff --git a/03-finetune-pipecat-pt/references/papers/focal_loss_lin_2017.pdf b/03-finetune-pipecat-pt/references/papers/focal_loss_lin_2017.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f1c0e7b5c4239bee40efdabd627cd6f15fd37ad2 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/focal_loss_lin_2017.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e6f45b97007009f32b82a3c19f35638d8da7022d8d4e4416b9078de91339f71 +size 1297358 diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/01-kosmala-crible-2022-dual-status-filled-pauses.md b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/01-kosmala-crible-2022-dual-status-filled-pauses.md new file mode 100644 index 0000000000000000000000000000000000000000..78c70c1835fac2d00033b7a535157394ed488fa8 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/01-kosmala-crible-2022-dual-status-filled-pauses.md @@ -0,0 +1,74 @@ +--- +title: "The Dual Status of Filled Pauses: Evidence from Genre, Proficiency and Co-occurrence" +authors: + - Loulou Kosmala + - Ludivine Crible +year: 2022 +source_url: "https://doi.org/10.1177/00238309211010862" +date_converted: 2026-03-16 +--- + +## Abstract + +A corpus study examining the lexical vs. non-lexical status of filled pauses ("euh" and "eum") in spoken French. Analyzes their distribution across communication settings (prepared monologues vs. spontaneous conversations) and language proficiency levels (native vs. non-native French). Quantitative findings reveal differences in frequency, duration, position, and co-occurrence patterns. Qualitative analysis identifies two distinct patterns: (1) initial position clustered with a discourse marker (fluent, structuring use) vs. (2) medial position clustered with other hesitation markers (disfluent, processing use). The authors argue for a **dual status** of filled pauses based on formal, functional, and contextual features. 
+ +## Key Findings Relevant to L2 Turn-Taking + +### French Filled Pause Forms +- Two main variants in French: **"euh"** [schwa] and **"eum"** [nasal] +- "Eum" is associated with longer delays and major discourse transitions/boundaries +- "Euh" is the dominant form in both native and non-native French (84-92% of all FPs) + +### Genre Effects (DisReg Corpus -- French Native Speakers) +- **Class presentations**: 6.8 FPs per 100 words (mean duration 415ms) +- **Casual conversations**: 4.2 FPs per 100 words (mean duration 343ms) +- More filled pauses in monologues than dialogues (significant, LL = 47.02, p < .001) +- More "eum" in presentations (23%) vs. conversations (8%) +- More initial-position FPs in presentations (40%) vs. conversations (24%) +- More final-position FPs in conversations (14%) vs. presentations (3%) -- reflects **turn-yielding** function + +### Native vs. Non-Native French (SITAF Corpus) +- **Native rate**: 4.4 FPs per 100 words (mean duration **378ms**) +- **Non-native rate**: 5.3 FPs per 100 words (mean duration **524ms**) +- Rate difference not significant, but **duration difference highly significant** (p < .001) +- Both groups: ~60% medial position, ~30% initial, ~10% final +- Non-native speakers had more **standalone and interrupted** positions (exclusive to learners) +- Non-native speakers cluster FPs with other fluencemes more often (82% clustered vs. 72% for natives) +- Longer fluenceme sequences in learner speech signal higher disruption + +### Co-occurrence with Discourse Markers +- 69 instances of FP + discourse marker clusters found +- Common French discourse markers paired with FPs: "donc" (so), "mais" (but), "ben" (well), "alors" (then/well), "en fait" (actually), "enfin" (I mean) +- FP position shifts when clustered with discourse markers: **57% initial** (vs. 73% medial when isolated) +- **Two distinct patterns emerge**: + 1. 
**Fluent pattern**: FP + discourse marker at turn/phrase boundary (initial position) -- structuring function + 2. **Disfluent pattern**: FP in medial position clustered with repetitions, lengthenings, silent pauses -- processing difficulty + +### Functional Analysis of Discourse Marker + FP Clusters +Four discourse domains identified: +- **Ideational**: connecting facts (e.g., "alors euh" = "then euh") +- **Rhetorical**: expressing opinions (e.g., "mais euh" = "but euh") +- **Sequential**: marking transitions (e.g., "donc euh" = "so euh") +- **Interpersonal**: monitoring/agreement (e.g., "bah oui euh" = "well yes euh") + +## Specific Hesitation Pattern Data + +- Native French FP mean duration: **378ms** (SD=200ms) +- Non-native French FP mean duration: **524ms** (SD=222ms) -- 146ms longer +- FP duration is a more reliable index of proficiency than frequency +- Duration of FPs with/without discourse markers: no significant difference (~433ms vs. ~467ms) +- Example extreme disfluent sequence from a learner: 8 fluencemes in a row (lengthening + 3 silent pauses + 3 filled pauses + self-interruption), with pauses up to **1,177ms** + +## Implications for Turn-Taking Detection in L2 Speech + +1. **Dual function detection is essential**: The same sound "euh" can signal either (a) turn/phrase structuring (fluent, initial position + discourse marker) or (b) processing difficulty (disfluent, medial position + hesitation cluster). Both mean "don't take the turn yet" but for different reasons. + +2. **Duration as proficiency proxy**: Non-native FPs are ~150ms longer on average. A model trained on native French data will encounter systematically longer hesitations from L2 speakers. This duration difference should NOT be interpreted as turn-yielding. + +3. **Final-position FPs signal turn-yielding**: In conversations, 14% of FPs occur in final position (vs. 3% in monologues). 
The combination of FP + silent pause at utterance end is a reliable turn-yielding signal (e.g., "but he has a role euh (0.924)..."). + +4. **Discourse marker + FP combinations**: Patterns like "donc euh" (so euh), "mais euh" (but euh) are strong turn-HOLD signals in French. A model should recognize these common French fluenceme clusters. + +5. **Cluster length matters**: Isolated FPs may go unnoticed; clustered FPs with repetitions and silent pauses signal genuine difficulty. Longer clusters = speaker is struggling but still holding the floor. + +6. **L2 speakers underuse discourse markers**: Non-native speakers rely more heavily on filled pauses alone, whereas native speakers combine FPs with rich discourse markers. The model should expect L2 French-Portuguese speakers to show heavy FP reliance with fewer Portuguese discourse markers. diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/01-kosmala-crible-2022-dual-status-filled-pauses.pdf b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/01-kosmala-crible-2022-dual-status-filled-pauses.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f801640322d180f4a78fd31479ab6d74d4ca3fa1 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/01-kosmala-crible-2022-dual-status-filled-pauses.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46985a0e6f9345c0ae5a756f3bf13523e9ba2a76694c60ef2b861cf0aef67b4b +size 631099 diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/02-ekstedt-skantze-2022-prosody-turn-taking-vap.md b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/02-ekstedt-skantze-2022-prosody-turn-taking-vap.md new file mode 100644 index 0000000000000000000000000000000000000000..d7a32774eb8a406cf2be23709c39a15a67aab87f --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/02-ekstedt-skantze-2022-prosody-turn-taking-vap.md @@ -0,0 +1,68 @@ +--- 
+title: "How Much Does Prosody Help Turn-taking? Investigations using Voice Activity Projection Models" +authors: + - Erik Ekstedt + - Gabriel Skantze +year: 2022 +source_url: "https://doi.org/10.18653/v1/2023.acl-long.304" +date_converted: 2026-03-16 +--- + +## Abstract + +Investigates the role of prosody in turn-taking using the Voice Activity Projection (VAP) model, which incrementally models upcoming speech activity of interlocutors in a self-supervised manner without relying on explicit annotation of turn-taking events or explicit prosodic feature modeling. Through systematic manipulation of the speech signal (F0 flattening, intensity flattening, low-pass filtering, duration averaging, F0 shifting), the authors show that VAP models learn to utilize various prosodic aspects of speech for turn-taking prediction. The study uses both aggregate quantitative metrics on long-form conversations (Switchboard corpus) and controlled utterance-level experiments with synthesized short/long phrase pairs. + +## Key Findings Relevant to L2 Turn-Taking + +### VAP Model Architecture +- Self-supervised model trained on raw waveforms (no hand-crafted features) +- Predicts future voice activity for both speakers at every time frame +- Tested at 20Hz, 50Hz, and 100Hz frame rates +- Trained on **Switchboard corpus** (English telephone conversations, 260h) +- Evaluated on 4 zero-shot tasks: shift vs. hold, shift prediction at voice activity, backchannel prediction, backchannel vs. turn-shift + +### Prosodic Perturbation Results +Five signal manipulations tested: +1. **Low-pass filter** (removes phonetic info, keeps F0 + intensity): Largest overall impact -- phonetic information is crucial +2. **F0 flat** (removes pitch contour): Second most impactful -- intonation is important for disambiguating turn completion +3. **Intensity flat** (removes loudness variation): Comparable impact to F0 +4. 
**F0 shift** (arbitrary pitch shift): Minimal effect on slower models (50Hz), larger effect on faster models (100Hz) +5. **Duration average** (normalizes segment durations): Least important individual cue + +### Key Prosodic Finding: F0 Contour for Turn Projection +- At **syntactically ambiguous completion points** (where lexical information is identical for both short and long utterances), the VAP model correctly uses prosody to distinguish turn-yielding from turn-holding +- **Higher F0 rise + longer duration** at the last syllable = turn completion signal +- The model predicts turn shifts **before** the utterance is actually complete -- it projects completions from prosodic dynamics +- Even when F0 is flattened, the model still partially distinguishes hold/shift, indicating **redundant information in intensity and/or duration** + +### Relative Importance of Prosodic Cues +1. **Phonetic information** (segmental, captured by spectral content): Most important overall +2. **F0 contour** (intonation): Most important for disambiguating syntactically equivalent completion points +3. **Intensity**: At least as important as pitch for general turn-taking +4. **Duration**: Less important, but contributes as redundant cue + +### Model Frame Rate Findings +- 20Hz and 50Hz models: comparable performance, more robust to perturbations +- 100Hz models: more sensitive to phonetic information and acoustic artifacts +- Lower frame rates preferred for computational efficiency and robustness + +## Specific Data Points + +- Human inter-turn gap: ~200ms average (Levinson & Torreira 2015) +- Switchboard corpus: 2,400 dyadic telephone conversations, 260 hours +- Short/long phrase pairs: 9 pairs, 10 TTS voices each (5 male, 5 female) +- Three evaluation regions at short completion point: hold (start to -200ms), predictive (-200ms to end), reactive (last frame) + +## Implications for Turn-Taking Detection in L2 Speech + +1. 
**VAP models work without explicit prosodic features**: The self-supervised approach learns prosodic patterns from raw audio. This is crucial for L2 speech where prosodic patterns deviate from native norms -- the model can potentially learn L2-specific patterns if fine-tuned on L2 data. + +2. **F0 contour is the key disambiguation signal**: When words alone cannot determine turn boundaries (common in L2 speech with simpler syntax), the pitch contour becomes the primary cue. French speakers' intonation patterns in Portuguese will be a critical signal for the model. + +3. **Prosody is redundant and multi-dimensional**: Even removing one prosodic dimension (F0 or intensity) doesn't completely collapse turn-taking prediction. This redundancy is useful for L2 speech where some prosodic cues may be atypical. + +4. **50Hz frame rate is optimal**: For our Pipecat implementation, a 50Hz (20ms frame) VAP model provides the best balance of performance, robustness, and computational efficiency. This aligns with our current 16kHz/512-sample (32ms) Silero VAD window. + +5. **Pre-completion projection**: The VAP model predicts turn shifts BEFORE the speaker finishes, based on prosodic dynamics. This is essential for reducing response latency in real-time systems. For L2 speakers, this projection may be less reliable due to non-standard prosodic patterns, requiring L2-specific fine-tuning. + +6. **Cross-linguistic transfer needed**: The model was trained on English (Switchboard). For French speakers in Portuguese, it needs fine-tuning on Romance language conversation data. The underlying prosodic principles (F0 rise at completion, intensity patterns) likely transfer across Romance languages, but language-specific patterns must be learned. 
diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/02-ekstedt-skantze-2022-prosody-turn-taking-vap.pdf b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/02-ekstedt-skantze-2022-prosody-turn-taking-vap.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e0c451350a495de18e54f87cc07bfd7e6b3f3e8a --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/02-ekstedt-skantze-2022-prosody-turn-taking-vap.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:472f85f79713f457e2ec5aa866a1e88cbd58d119a8b3e1561d50e7f1c1722661 +size 2825968 diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/03-castillo-lopez-2025-survey-turn-taking-modeling.md b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/03-castillo-lopez-2025-survey-turn-taking-modeling.md new file mode 100644 index 0000000000000000000000000000000000000000..1a53a27ae29d314cf28b3ef9ea50ab66d98cf019 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/03-castillo-lopez-2025-survey-turn-taking-modeling.md @@ -0,0 +1,77 @@ +--- +title: "A Survey of Recent Advances on Turn-taking Modeling in Spoken Dialogue Systems" +authors: + - Galo Castillo-Lopez + - Gael de Chalendar + - Nasredine Semmar +year: 2025 +source_url: "https://aclanthology.org/2025.iwsds-1.24/" +date_converted: 2026-03-16 +--- + +## Abstract + +A comprehensive review of recent methods on turn-taking modeling in spoken dialogue systems. Covers end-of-turn prediction, backchannel prediction, and multi-party turn-taking. Notes that 72% of reviewed works do not compare their methods with previous efforts, and argues that the lack of well-established benchmarks is a key challenge. Provides the first detailed review of datasets used in the field, discusses overlooked limitations, and examines new ideas and approaches since 2021. Published at IWSDS 2025. 
+ +## Key Findings Relevant to L2 Turn-Taking + +### Turn-Taking Modeling Taxonomy +Three main approaches to end-of-turn (EOU) prediction: +1. **Silence-based**: Uses VAD + silence threshold (e.g., 700ms). Results in poor user experience due to unnaturalness. Current spoken dialogue systems wait 700-1000ms (vs. human 200ms average). +2. **IPU-based (Inter-Pausal Unit)**: Predictions after each detected silence. Assumes turns cannot be taken while user speaks. +3. **Continuous**: Predictions at every frame (e.g., every 50ms) regardless of silence. Most recent and promising approach. + +### Voice Activity Projection (VAP) Models -- Current State of the Art +- VAP models predict future voice activity of both interlocutors incrementally +- Self-supervised learning -- no explicit turn-taking event annotation needed +- Best results in Japanese when trained on English + fine-tuned with Japanese data (vs. direct Japanese training) +- Cross-lingual performance is poor without proper label alignment across datasets +- Emerging trend: VAP models dominate continuous methods + +### Feature Importance for Turn-Taking +- **Combined features outperform individual ones**: prosody + words > prosody alone or words alone +- **Turn-taking cues have additive effect** in human communication +- Word embeddings + acoustic features together improve backchannel detection +- Gaze, head pose, and non-verbal features enhance predictions when available +- **ASR-based linguistic features**: Fine-tuning wav2vec 2.0 for ASR outperforms acoustic-only features + +### Key Datasets +| Dataset | Language | Duration | Dialogues | +|---|---|---|---| +| Switchboard | English | 260h | 2,400 | +| Fisher Corpus | English | 1,960h | 11,700 | +| NoXi Database | en/es/fr/de/it/ar/id | 25h | 84 | +| AMI Meeting Corpus | English | 100h | 175 (multi-party) | + +- **Switchboard** is the dominant benchmark: used in 69% of backchannel and 41% of EOU papers +- Most research is on **English and Japanese** -- very 
limited work on other languages +- IPU silence thresholds vary from **50ms to 1s** across studies -- no standard + +### Backchannel Prediction +- Backchannels are short feedback tokens ("mhm", "yeah") produced during another's speech +- Key distinction: backchannel vs. actual interruption vs. non-lexical noise +- Multi-task learning (sentiment + dialogue acts + backchannel) improves prediction +- Context-aware models using BERT text embeddings + wav2vec acoustic embeddings show strong results + +### Open Challenges Identified +1. **No standardized benchmarks**: Only 28% of papers compare with prior work +2. **Multilinguality**: Very limited -- most work is English/Japanese only +3. **Multi-party conversations**: Understudied, requires visual channel +4. **Special populations**: Senior adults, mental health disorders need adapted models +5. **LLMs are inefficient** at detecting mid-utterance turn opportunities + +## Implications for Turn-Taking Detection in L2 Speech + +1. **Continuous VAP models are the right approach**: For our fine-tuning task, continuous frame-level prediction (not silence-threshold-based) is the current state of the art. This aligns with our Pipecat architecture. + +2. **Cross-lingual fine-tuning works**: VAP models trained on English and fine-tuned on target language data outperform direct target-language training. This suggests training on English Switchboard + fine-tuning on French-Portuguese conversation data is a viable strategy. + +3. **Multi-feature approach is essential**: For L2 speakers, combining prosodic + lexical + timing features provides the best predictions. L2 speakers may have atypical prosody but more predictable syntax, or vice versa. The additive effect of features provides robustness. + +4. **Silence threshold tuning**: The survey notes thresholds from 50ms to 1s. For L2 speakers with longer pauses, the threshold must be adjusted upward. 
Our current 300ms SILENCE_HANGOVER_MS may need extension for L2 French speakers in Portuguese. + +5. **French is available in NoXi**: The NoXi database includes French conversations (among 7 languages) with 25h total. This could be a valuable fine-tuning resource for our French-specific model. + +6. **Backchannel handling**: L2 speakers may produce non-standard backchannels or transfer French backchannels ("ouais", "mmh") into Portuguese conversation. The model must distinguish these from turn-taking attempts. + +7. **Benchmarking gap**: Since 72% of papers don't compare methods, we should establish clear metrics for our L2 turn-taking task from the start, enabling future comparison. diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/03-castillo-lopez-2025-survey-turn-taking-modeling.pdf b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/03-castillo-lopez-2025-survey-turn-taking-modeling.pdf new file mode 100644 index 0000000000000000000000000000000000000000..29da25684e0b00718dfc753a51a465b2863d0532 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/03-castillo-lopez-2025-survey-turn-taking-modeling.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3fd4a6e8a77a46ad20634ba4b05c5731dbf4c692eb1acdb2fc1bb71062f12ee +size 473840 diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/04-cenoz-2000-pauses-hesitation-L2-production.md b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/04-cenoz-2000-pauses-hesitation-L2-production.md new file mode 100644 index 0000000000000000000000000000000000000000..8ec42d2e6fbd94ae8e5e24fcb36d375707386326 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/04-cenoz-2000-pauses-hesitation-L2-production.md @@ -0,0 +1,76 @@ +--- +title: "Pauses and Communication Strategies in Second Language Speech" +authors: + - Jasone Cenoz +year: 2000 +source_url: "https://eric.ed.gov/?id=ED426630" 
+date_converted: 2026-03-16 +--- + +## Abstract + +A study of silent and filled pauses in second language speech analyzing (1) which types of pauses are produced, (2) the functions of non-juncture pauses, (3) whether pauses co-occur with other hesitation phenomena, and (4) whether the occurrence of pauses is associated with second language proficiency. Subjects were 15 intermediate and advanced learners of English as a second language (L1 Spanish, university students). Each told a story in English which was recorded and transcribed. Silent and filled pauses at non-grammatical junctures were identified and analyzed. + +## Key Findings Relevant to L2 Turn-Taking + +### Pause Distribution +- Total non-juncture pauses: **1,085** across 15 subjects +- **64% silent pauses**, **36% filled pauses** +- The most common filler was **"eh"** (transferred from L1 Spanish), demonstrating L1 transfer of hesitation markers +- Wide individual variation: filled pauses ranged from 4% to 74.5% of all pauses per individual + +### Silent Pause Durations +| Duration Range | % of Pauses | Subjects Affected | Individual Variation | +|---|---|---|---| +| 200-1000ms | 70% | 100% | 39%-96% | +| 1001-2000ms | 21% | 100% | 4%-39% | +| 2001-4000ms | 7% | 40% | 10%-18% | +| 4001ms+ | 2% | 40% | 1.5%-7% | + +### Functional Categories of Pauses +| Function | Silent Pauses | Filled Pauses | +|---|---|---| +| Lexical (retrieval) | 36% | 26% | +| Morphological | 5% | 1% | +| Planning | 59% | 73% | + +- Both silent and filled pauses serve the same functions, but filled pauses are overwhelmingly used for general **planning** (73%) +- More silent pauses associated with **lexical retrieval** than filled pauses + +### Co-occurrence with Other Hesitation Phenomena +| Strategy | Silent Pauses | Filled Pauses | +|---|---|---| +| Pauses + other hesitations | 54% | 23% | +| Only pauses | 46% | 77% | + +- Most common hesitation phenomena: **repetition, self-correction, and reformulation** +- Silent pauses much 
more likely to co-occur with other repair strategies (54%) vs. filled pauses (23%) +- Filled pauses function as **standalone repair devices**; silent pauses tend to **precede** other repairs + +### Proficiency Effects +- Higher-proficiency learners produced **more total pauses** and **more filled pauses** (53% silent + 46% filled) +- Lower-proficiency learners: 69% silent + 31% filled +- Lower-proficiency learners used **more hesitation strategies combined with pauses** (62% for silent pauses) vs. higher-proficiency (38%) +- Interpretation: high-proficiency learners need only time to retrieve information; low-proficiency learners need to vocalize different options + +## Specific Hesitation Pattern Data + +- Silent pause minimum threshold: **200ms** +- Pause range: **205ms to 11,569ms** +- Most pauses (70%) are under 1 second +- Long pauses (>2s) only in 40% of subjects -- marks a subset of struggling speakers +- L1 Spanish filler "eh" dominates filled pauses (L1 transfer effect) + +## Implications for Turn-Taking Detection in L2 Speech + +1. **Pause duration is critical**: 70% of L2 pauses are 200-1000ms, overlapping with natural turn-transition gaps (~200ms). A turn-taking model must distinguish these within-turn hesitation pauses from actual turn boundaries. + +2. **Filled pauses signal continuation**: Since filled pauses are primarily standalone floor-holding devices (77% occur without other hesitations), detecting "eh/euh/um" should strongly indicate the speaker intends to continue, NOT yield the turn. + +3. **L1 transfer of filler forms**: French speakers speaking Portuguese will likely transfer French "euh" rather than adopting Portuguese fillers. The model should recognize L1-influenced fillers as valid hold signals. + +4. **Proficiency paradox**: More proficient L2 speakers use MORE filled pauses, not fewer. The model should not interpret high filler rates as low fluency or turn-yielding intent. + +5. 
**Cluster detection**: When silent pauses co-occur with repetitions, self-corrections, or reformulations (54% of cases), this strongly signals ongoing speech processing, not turn completion. Detecting hesitation clusters is key to avoiding premature turn-taking. + +6. **Individual variation is massive**: Some speakers use almost no filled pauses (4%) while others fill 74.5% of their pauses. The model needs speaker adaptation or robust handling of diverse hesitation profiles. diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/04-cenoz-2000-pauses-hesitation-L2-production.pdf b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/04-cenoz-2000-pauses-hesitation-L2-production.pdf new file mode 100644 index 0000000000000000000000000000000000000000..48436f3e1178122aeab7adb2eacba1fc1b65f186 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/04-cenoz-2000-pauses-hesitation-L2-production.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e6e1ea3fb31493034539cd59a0274c8bcc11e08ff4104b2fb882c5766106e7a +size 238029 diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/05-kosmala-2023-filled-pauses-pragmatic-markers-gaze-gesture.pdf b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/05-kosmala-2023-filled-pauses-pragmatic-markers-gaze-gesture.pdf new file mode 100644 index 0000000000000000000000000000000000000000..27dfd300436562bb8307d3d40335231dcf726fc8 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/05-kosmala-2023-filled-pauses-pragmatic-markers-gaze-gesture.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b4003c9f58c976ea687d117ea25b507885d0472d440eaad0e8386880e036734 +size 920978 diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/06-wu-2023-disfluencies-french-prosodic-params-filled-pauses.md 
b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/06-wu-2023-disfluencies-french-prosodic-params-filled-pauses.md new file mode 100644 index 0000000000000000000000000000000000000000..a0fe11e0fa7fa6658a3cb37085e38778c0df1d62 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/06-wu-2023-disfluencies-french-prosodic-params-filled-pauses.md @@ -0,0 +1,76 @@ +--- +title: "Disfluencies in Continuous Speech in French: Prosodic Parameters of Filled Pauses and Vowel Lengthening" +authors: + - Yaru Wu + - Ivana Didirkova + - Anne-Catherine Simon +year: 2023 +source_url: null +date_converted: 2026-03-16 +--- + +## Abstract + +This study examines prosodic parameters in two types of disfluencies -- vowel lengthenings and filled pauses -- in approximately 2.5 hours of continuous French speech across 11 different speech genres (prepared, semi-prepared, and unprepared). Analyzes mean fundamental frequency (F0), pitch resets, and duration to compare disfluent syllables with surrounding fluent syllables as a function of speech preparation level. Results show that F0 is lower in filled pauses and disfluent vowel lengthenings than in fluent speech, with filled pauses produced at lower F0 than vowel lengthening. Larger pitch resets occur between disfluent units and their preceding contexts. + +## Key Findings Relevant to L2 Turn-Taking + +### Prosodic Characteristics of French Filled Pauses + +#### Fundamental Frequency (F0) Drop +- **Fluent speech**: mean 19.01 ST (SD=5.97) +- **Vowel lengthening**: mean 18.10 ST (SD=6.43) -- ~1 ST drop +- **Filled pauses**: mean 17.90 ST (SD=7.31) -- ~1.1 ST drop +- F0 drop in filled pauses vs. 
fluent speech: + - **Females**: 2.12 ST decrease (from 23.66 to 21.54 ST) + - **Males**: 1.75 ST decrease (from 16.72 to 14.97 ST) +- Both disfluency types significantly lower than fluent speech (p < 0.001) + +#### F0 and Speech Preparation Level +- **Unprepared speech**: FP average F0 = 18.89 ST +- **Semi-prepared speech**: FP average F0 = 17.41 ST +- **Prepared speech**: FP average F0 = 14.24 ST +- Massive **4.66 ST drop** between unprepared and prepared speech for FPs +- Vowel lengthening does NOT follow this trend (only 0.07 ST difference) +- Fluent speech shows moderate 1.2 ST drop between unprepared and prepared + +#### Pitch Reset (Melodic Discontinuity) +- Filled pauses create a **larger negative pitch reset** from the preceding syllable than vowel lengthening +- Both FPs and lengthening produce significantly more negative pitch changes than fluent speech (p < 0.001) +- No significant pitch reset difference between the disfluent unit and the FOLLOWING syllable +- Key insight: the pitch drop **before** a filled pause is a reliable acoustic cue + +#### Duration +- **Vowel lengthening** is significantly LONGER than **filled pauses** (p < 0.001) +- Vowel lengthening mean: ~347ms; Filled pause mean: ~268ms (from prior French study by Grosman 2018) +- Filled pauses are shorter in prepared speech and longer in unprepared speech +- Vowel lengthening shows the opposite pattern: shorter in unprepared, longer in prepared/semi-prepared + +### Corpus Details (LOCAS-F) +- Multi-genre French oral corpus (14 speech genres) +- Analyzed: 2h38m of recordings +- Data: >1,500 vowel prolongation sequences, >1,000 filled pauses, ~59,000 fluent syllables +- Disfluency affects ~4% of all data (typical for non-pathological speech) +- Inter-annotator agreement: kappa 0.86 for FPs (near perfect), 0.64 for lengthening (substantial) + +## Specific Hesitation Pattern Data + +- French filled pauses typically transcribed as **"euh"** +- Duration range for hesitation vowels 
(cross-linguistic): **200ms to 650ms** (from Vasilescu et al. 2004, 8 languages) +- French FP F0 value is similar to the **onset value of breath groups** for a given speaker +- Mean FP duration in French: **268.4ms** (Grosman 2018) +- Mean vowel lengthening duration in French: **347.25ms** (Grosman 2018) + +## Implications for Turn-Taking Detection in L2 Speech + +1. **F0 drop is a reliable filled pause detector**: Filled pauses in French show a consistent 1-2 ST drop in F0 compared to fluent speech. A turn-taking model can use this prosodic signature to identify FPs even without lexical recognition. This is especially useful when ASR may not reliably transcribe L2 "euh" sounds. + +2. **Pitch reset before FPs marks processing onset**: The significant negative pitch reset between the preceding syllable and the filled pause signals the START of a hesitation event. This could serve as an early warning that the speaker is about to hesitate but intends to continue. + +3. **Preparation level affects FP prosody**: In more spontaneous speech (like real-time conversation), FPs have HIGHER F0 (closer to fluent speech). In prepared speech, FPs drop dramatically in F0. Since L2 speakers in conversation are in "unprepared" mode, their FPs may be harder to distinguish from fluent speech by F0 alone. + +4. **Vowel lengthening is a separate hesitation signal**: French speakers (both L1 and L2) elongate vowels at word boundaries as a hesitation strategy. These are LONGER than filled pauses (~347ms vs. ~268ms) but show a smaller F0 drop. The model should recognize elongated schwa-like sounds as hesitation, not turn completion. + +5. **4% disfluency rate in native speech**: This baseline helps calibrate expectations. L2 speakers will show significantly higher rates, potentially 8-15% or more, meaning the model will encounter disfluency markers very frequently in L2 French-to-Portuguese speech. + +6. 
**Cross-linguistic prosodic transfer**: French speakers speaking Portuguese will likely transfer their French FP prosodic patterns (F0 drop magnitude, pitch reset patterns). The model should accommodate French-influenced prosodic contours on Portuguese hesitations. diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/06-wu-2023-disfluencies-french-prosodic-params-filled-pauses.pdf b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/06-wu-2023-disfluencies-french-prosodic-params-filled-pauses.pdf new file mode 100644 index 0000000000000000000000000000000000000000..41e18770ac2cd49c3f541386ed1e7bd898375214 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/06-wu-2023-disfluencies-french-prosodic-params-filled-pauses.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c77a54e77105dc6ad1f3306d0a991c9730f70ed327559f2686bc5897c8f6540c +size 436423 diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/07-ekstedt-2023-turn-taking-cues-speech-synthesis.pdf b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/07-ekstedt-2023-turn-taking-cues-speech-synthesis.pdf new file mode 100644 index 0000000000000000000000000000000000000000..3d6d59a4cae4a1ef07fffc24495fab70cb2f9851 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/07-ekstedt-2023-turn-taking-cues-speech-synthesis.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9422e0c9df73fd3e2a91bb7bea6eba9393ba5865ea91b36463e1d50c23c9215b +size 294389 diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/08-inoue-2024-realtime-turn-taking-vap.pdf b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/08-inoue-2024-realtime-turn-taking-vap.pdf new file mode 100644 index 0000000000000000000000000000000000000000..2fa6a140ffc6860d2ef1b646fcb90a2783540e63 --- /dev/null +++ 
b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/08-inoue-2024-realtime-turn-taking-vap.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:129a900833fb9a775f6abc8869de60f3526a3a489ed6fda5f8b267136a5a8cec +size 496267 diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/09-inoue-2024-multilingual-turn-taking-vap.pdf b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/09-inoue-2024-multilingual-turn-taking-vap.pdf new file mode 100644 index 0000000000000000000000000000000000000000..315cdc14323e72a181e87229986f0bcea8eb4195 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/09-inoue-2024-multilingual-turn-taking-vap.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a12189557e7022189609c710cd8071f359ae4e05a3c755c7e1aceef1b19a86bc +size 974569 diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/10-hutin-2024-filled-pauses-conversational-convergence.pdf b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/10-hutin-2024-filled-pauses-conversational-convergence.pdf new file mode 100644 index 0000000000000000000000000000000000000000..59872f97691b2d9141cf7f964673a6246d6f9f15 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/10-hutin-2024-filled-pauses-conversational-convergence.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e48ef19c610ebb2f2dca05458c801d11f2899563ff165aa7b1a90c17a563501a +size 951784 diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/11-beatty-martinez-2020-codeswitching-bilingual-toolkit.md b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/11-beatty-martinez-2020-codeswitching-bilingual-toolkit.md new file mode 100644 index 0000000000000000000000000000000000000000..292aec94e323325783cd591448bf8d1e49f7a11d --- /dev/null +++ 
b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/11-beatty-martinez-2020-codeswitching-bilingual-toolkit.md @@ -0,0 +1,67 @@ +--- +title: "Codeswitching: A Bilingual Toolkit for Opportunistic Speech Planning" +authors: + - Anne L. Beatty-Martinez + - Christian A. Navarro-Torres + - Paola E. Dussias +year: 2020 +source_url: "https://doi.org/10.3389/fpsyg.2020.01699" +date_converted: 2026-03-16 +--- + +## Abstract + +Reviews codeswitching as a bilingual strategy for opportunistic speech planning. Recent discoveries show that codeswitching is not haphazard but subject to unique linguistic and cognitive constraints, and that bilinguals who codeswitch exhibit usage patterns conforming to community-based norms. The paper provides corpus evidence (from the Puerto Rico Codeswitching Map Task corpus of Spanish-English bilinguals) that codeswitching serves as a tool to navigate linguistic interference during production, enabling speakers to circumvent speech planning difficulties by opportunistically drawing from whichever language is most active or accessible. + +## Key Findings Relevant to L2 Turn-Taking + +### Codeswitching as a Fluency Strategy +- Codeswitching is NOT random -- it is **structured and strategic** +- Functions as a tool for **opportunistic speech planning**: bilinguals take advantage of whichever language's words/structures are most active to achieve communicative goals +- Reduces the cost in time and resources during speech production +- **93% of codeswitches occur at Intonation Unit (IU) boundaries** (Plaistowe 2015, from NMSEB corpus) +- This means switches overwhelmingly happen at natural prosodic break points, not mid-phrase + +### Prosodic Signatures of Codeswitching +- Speech rate changes before codeswitches: momentary **reorganization of prosodic and phonetic systems** +- These changes serve a dual purpose: + 1. Help the speaker negotiate lexical competition and minimize cross-language interference + 2. 
Provide reliable **acoustic cues for listeners** to anticipate and process the switch +- Codeswitching affects bilingual speech at different levels: word-level (speech rate of individual words) and sentence-level (speech rate within prosodic sentences) + +### Language Control Modes +- **Single-language context**: Language control is COMPETITIVE -- one language suppressed at expense of other +- **Codeswitching context**: Language control is COOPERATIVE -- coactivation maintained, items from both languages available for selection +- Dense codeswitchers minimize language membership tagging and keep both languages active + +### Corpus Data (Puerto Rico Codeswitching Map Task) +- 10 Spanish-English bilinguals (6 female), all native Spanish speakers +- Equal self-reported proficiency in both languages (9.6/10 each) +- ~2.5 hours of unscripted, task-oriented dialogs +- Participants exposed to both languages: more Spanish with family, more English in media, equal among friends +- Paired with in-group confederate (close friend from same speech community) -- this increased codeswitching 4x vs. out-group pairing + +### Variable Equivalence and Switch Sites +- Bilinguals do NOT consistently avoid "conflict sites" between languages +- Instead, they **opportunistically use** sites of variable equivalence (partial structural overlap between languages) +- This challenges the strict "equivalence constraint" (Poplack 1980) + +## Specific Data Points + +- Self-reported proficiency: Spanish 9.6/10 (SD=0.8), English 9.6/10 (SD=0.5) +- Mean age: 23.3 years (SD=1.8) +- Codeswitching at IU boundaries: 93% (from related corpus study) + +## Implications for Turn-Taking Detection in L2 Speech + +1. **Codeswitches happen at prosodic boundaries**: Since 93% of codeswitches align with intonation unit boundaries, a turn-taking model should expect language switches at natural break points -- the same locations where turn transitions might occur. 
The model must not interpret a language switch as a turn-ending signal. + +2. **Prosodic reorganization before switches**: The speech rate and prosodic changes before a codeswitch could be confused with turn-ending prosody. For French speakers occasionally inserting French words while speaking Portuguese, the model needs to tolerate these prosodic perturbations without triggering a false turn-shift prediction. + +3. **L1-L2 fluency trade-off**: When French speakers struggle with Portuguese lexical retrieval, they may briefly switch to French (a natural bilingual strategy). The turn-taking model should recognize this as a hold signal (the speaker is using an alternative strategy to continue) rather than a breakdown signal. + +4. **Cooperative language mode in bilingual contexts**: If the conversation partner also speaks French, the French-Portuguese speaker may engage in cooperative codeswitching. The model should handle mixed-language utterances without treating language boundaries as turn boundaries. + +5. **Community norms matter**: Codeswitching frequency and patterns depend heavily on the interactional context and community norms. The model should be configurable for different bilingual settings (e.g., high-codeswitch vs. monolingual-target contexts). + +6. **Intonation unit alignment**: Since codeswitches cluster at IU boundaries, and IU boundaries are also key sites for turn-taking decisions, the model must weigh additional cues (gaze, content completeness, prosodic finality) at these ambiguous boundary points where both a codeswitch-and-continue and a turn-yield are possible. 
diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/11-beatty-martinez-2020-codeswitching-bilingual-toolkit.pdf b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/11-beatty-martinez-2020-codeswitching-bilingual-toolkit.pdf new file mode 100644 index 0000000000000000000000000000000000000000..275fce5fde4d78789d579a04b68344d02c17fd58 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/11-beatty-martinez-2020-codeswitching-bilingual-toolkit.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40ac9947f5813c936324bc2e266a1a574f1edb9d72a0d18f851bc3ddadf1d25b +size 1159649 diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/12-lingref-code-switching-bilinguals.pdf b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/12-lingref-code-switching-bilinguals.pdf new file mode 100644 index 0000000000000000000000000000000000000000..0d4de8b871f969b98ea2b67268783ed16fd1a028 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/12-lingref-code-switching-bilinguals.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca2aa2330e88de35a7f0254aefd3c024d14ce282f3f73ee28914f656f62f7820 +size 874378 diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/13-christodoulides-avanzi-2014-DisMo-disfluency-french.pdf b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/13-christodoulides-avanzi-2014-DisMo-disfluency-french.pdf new file mode 100644 index 0000000000000000000000000000000000000000..b0e4ddfaad2d0f7e90fe61383961456db5015ff1 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/13-christodoulides-avanzi-2014-DisMo-disfluency-french.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66d1a7dd53dcaef36246bf28e6e56e4e7e00a768a6d0c40ba5cc8505148d1116 +size 1166957 diff --git 
a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/14-jouvet-2019-speech-processing-prosody.pdf b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/14-jouvet-2019-speech-processing-prosody.pdf new file mode 100644 index 0000000000000000000000000000000000000000..20160f6cd1662b1190ac0e4528858dac065827cd --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/14-jouvet-2019-speech-processing-prosody.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75acb94aae4d529abe915ff6a0732ea86ecdbd0ab5eb9af3d0d59a8b131f53f5 +size 176990 diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/15-filler-particles-cross-linguistic-heritage-2024.md b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/15-filler-particles-cross-linguistic-heritage-2024.md new file mode 100644 index 0000000000000000000000000000000000000000..3a3b9480585b45a9cdf915e7137ba189f1a6aa9e --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/15-filler-particles-cross-linguistic-heritage-2024.md @@ -0,0 +1,76 @@ +--- +title: "Do You Say Uh or Uhm? A Cross-linguistic Approach to Filler Particle Use in Heritage and Majority Speakers Across Three Languages" +authors: + - Marlene Bottcher + - Margaret Zellers +year: 2024 +source_url: "https://doi.org/10.3389/fpsyg.2024.1305862" +date_converted: 2026-03-16 +--- + +## Abstract + +Investigates the language-specific use of filler particles by bilingual heritage speakers in both their languages across English, German, and Russian. Uses data from the RUEG corpus (monolingual and bilingual speakers, adolescent and adult age groups). Examines whether heritage speakers produce language-specific filler particle forms and whether socio-linguistic variables (gender, age) relate to filler form variation. Addresses the question of whether fillers are language-specific lexical items or universal hesitation symptoms. 
+ +## Key Findings Relevant to L2 Turn-Taking + +### Filler Particles Are Language-Specific +- Filler particles show **subtle language-specific differences** in both form and frequency +- The variation in form (uh vs. uhm/V vs. VN) is related to socio-linguistic variables like gender +- L2 filler particle use differs from monolinguals in **both frequency and form** +- Heritage speakers (HSs) should use distinct filler forms in their two languages if fillers are true lexical items + +### Cross-Linguistic Filler Patterns +- English: "uh" and "uhm" +- German: "ah" and "ahm" +- French: "euh" and "eum" +- Russian: language-specific forms +- Key variation parameter: **VN ratio** (proportion of nasal variants like "uhm" vs. non-nasal "uh") +- VN ratio varies significantly across languages and is influenced by gender and age + +### Heritage Speaker vs. Monolingual Differences +- Heritage speakers (HSs) are bilingual speakers of a minority/heritage language (HL) and a majority language (ML) +- HL is typically spoken at home with family; ML used in public sphere, education, work +- HSs represent a link between L2 learners and monolingually raised speakers on a native-speaker continuum +- HSs may show **bidirectional influence** between their two languages +- Deviances from monolingual norms are most present in informal language (the dominant register for HSs) + +### Cognitive Load Effects +- Heritage speakers expected to produce **more FPs overall** compared to monolinguals due to higher cognitive effort of monitoring two languages +- Monolinguals show more FPs in situations with more pressure on language form (formal register) +- Both groups: more FPs = higher cognitive load / speech planning difficulty + +### Language-Specific Silence Tolerance +- Japanese speakers (and other Asian language speakers) have **higher tolerance for silences** than English speakers +- This tolerance **transfers to L2** productions +- Evidence that silence/pause norms are culturally embedded and 
persist in L2 speech + +### The RUEG Corpus +- 736 speakers total (326 monolingual, 412 bilingual -- note: subgroup counts sum to 738; verify against the corpus description) +- 4,468 narrations across English, German, Russian, Turkish, Greek +- Elicited via video (car accident) in formal (police report) and informal (friend message) situations +- Both written and spoken modes +- Heritage speaker groups: English-Russian, English-German, German-Russian + +## Research Questions and Hypotheses +1. Do mono- and bilinguals increase FP numbers under higher cognitive load? + - H1a: HSs produce more FPs than monolinguals (monitoring two languages) + - H1b: Monolinguals produce more FPs in formal register +2. Do heritage speakers produce language-specific FP forms? + - H2a: English, German, Russian show different VN filler ratios + - H2b: FP form relates to socio-linguistic variables (gender, age) + - H2c: Heritage speakers distinguish FP variants across their two languages + +## Implications for Turn-Taking Detection in L2 Speech + +1. **Filler forms transfer from L1**: French speakers speaking Portuguese will likely produce French-influenced fillers ("euh" rather than Portuguese equivalents). The turn-taking model must recognize L1-influenced filler forms as valid hesitation signals in the L2 context. + +2. **Language-specific filler recognition**: Since each language has distinct filler phonology, the model needs to handle both French-type fillers ("euh"/"eum") and Portuguese-type fillers in the same conversation. A speaker may use either or both. + +3. **Heritage speaker as model for L2 behavior**: Heritage speakers provide a useful parallel for L2 speakers -- both show increased FP rates due to dual-language cognitive load and may transfer pause patterns. The model can learn from heritage speaker data as a proxy for L2 behavior. + +4. **Cognitive load increases fillers**: The higher cognitive load of bilingual speech processing reliably increases filler frequency.
The model should expect L2 speakers to produce 20-50% more fillers than native speakers, all of which are floor-holding signals. + +5. **Register affects filler patterns**: Formal speech elicits different filler patterns than informal speech. The turn-taking model should account for the formality of the communication context -- a professional meeting vs. casual conversation will have different filler baselines. + +6. **Culturally embedded silence norms**: French speakers may have different silence tolerance than Portuguese speakers. The model should account for the speaker's L1 cultural norms when interpreting pause durations -- what seems like a long pause for a Portuguese listener may be a normal planning pause for a French speaker. diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/15-filler-particles-cross-linguistic-heritage-2024.pdf b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/15-filler-particles-cross-linguistic-heritage-2024.pdf new file mode 100644 index 0000000000000000000000000000000000000000..cd3cea19e4ad4d74885f2b46ddf8874c08935d83 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/15-filler-particles-cross-linguistic-heritage-2024.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca439252e83fefe148943264fec6677a5f29c194262f63dc50dc98b6313bcb75 +size 2537362 diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/16-hal-peters-2017-L2-fluency-french.md b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/16-hal-peters-2017-L2-fluency-french.md new file mode 100644 index 0000000000000000000000000000000000000000..ca39a69708c82807aa8ec2c4b647e648188a78c1 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/16-hal-peters-2017-L2-fluency-french.md @@ -0,0 +1,83 @@ +--- +title: "Morphosyntactic Regularities in Retracing Phenomena in the Speech of Second Language Learners of French" +authors: + - Hugues 
Peters +year: 2017 +source_url: "http://hdl.handle.net/2078.1/195807" +date_converted: 2026-03-16 +--- + +## Abstract + +Investigates morphosyntactic properties of retracing phenomena (repeats, self-repairs, and false starts) produced by L2 learners of French. Based on quantitative analysis of the UWI learner corpus (Jamaican learners of French) and qualitative analysis using Distributed Morphology. Hypothesizes that disfluencies, usually considered performance hiccups, actually manifest syntactic regularities revealing aspects of underlying grammatical competence. Published in the proceedings of the "(DIS)FLUENCY 2017 International Conference: Fluency and Disfluency Across Languages and Language Varieties" at UCLouvain. + +## Key Findings Relevant to L2 Turn-Taking + +### Three Types of Fluency in L2 Speech (Tavakoli & Skehan 2005) +1. **Speed fluency**: rate of speech +2. **Breakdown fluency**: silent and filled pauses +3. **Repair fluency**: repairs (repeats, self-corrections, reformulations) + +### Progressive vs. Regressive Repairs (Olynyk et al. 
1990) +- **Progressive repairs** (repeats, fillers): NOT necessarily detrimental to fluency -- allow speakers to avoid silent pauses within utterances; keep the floor +- **Regressive repairs** (self-repairs, false starts): Affect fluency negatively; signal genuine processing difficulty + +### L2 Speech Production Model (Kormos 2006) +- Disfluencies in L2 speech arise from **lack of automatization** of lexical retrieval and syntactic production +- L2 speakers must **consciously arrange** individual lexemes into syntactic units -- this overloads working memory +- L2 self-repairs differ **qualitatively** from native repairs: + - **L2 speakers**: correct syntax and morphology + - **Native speakers**: correct lexis or register +- This qualitative difference reveals different levels of linguistic processing + +### Morphosyntactic Regularities in L2 Retracing +Key finding: L2 repairs show systematic grammatical patterns, not random errors: + +#### Determiner Gender Self-Repairs +- Most gender repairs go from **masculine "le" to feminine "la"** (le -> la) +- Rarely from feminine to masculine (la -> le) +- When repair is difficult, speakers produce **repetitions** (le [/] le mot) or **full NP repairs** ( [//] le droit) +- This reveals that **masculine is the default gender** in L2 French interlanguage + +#### Subject Pronoun Behavior +- Lower-proficiency students tend to **omit 3rd person subject pronouns** when repeating lexical verbs ("ils enseignent [/] enseignent") +- They **never omit** subject pronouns with: + - Auxiliary verbs + - Negative or interrogative structures + - 1st person pronouns +- These patterns reveal the **featural composition** given to clitic pronouns in the learner's interlanguage grammar + +### UWI Learner Corpus +- 10 Jamaican learners of French (L1: Jamaican Creole + Jamaican English) +- 67 one-on-one conversations over 19 months (longitudinal) +- 54,824 words total +- Transcribed using CHILDES conventions with morphological decomposition +- 
Learners had never been immersed in a French-speaking country + +### Conference Context (DIS)FLUENCY 2017 +The broader conference addressed fluency and disfluency across languages and language varieties, with key themes including: +- Filled pauses as discourse structure markers (not just disfluency) +- Cross-linguistic comparison of disfluency markers +- Filled pauses in recurrent multiword sequences becoming **constitutive parts** of formulaic language +- Disfluencies having both fluent and disfluent functions depending on context + +## Specific Hesitation Pattern Data + +- Typical L2 French speech example: multiple fillers ("&hum", "&euh"), repetitions, self-corrections within a single utterance +- L2 repair types: repeats, self-repairs, false starts +- Progressive repairs (repeats, fillers) serve as floor-holding devices +- Quantitative differences between L2 and native speakers in repair rates (L2 significantly higher, per Towell et al. 1996, Temple 2000, Hilton 2014) + +## Implications for Turn-Taking Detection in L2 Speech + +1. **Progressive repairs are floor-holding signals**: Repetitions and fillers are specifically used by L2 speakers to maintain the floor and avoid silent pauses. The model should interpret repetitions (e.g., "le le le mot") as strong hold signals, not as speaker confusion warranting a turn-take. + +2. **Repair type reveals processing level**: Lexical repairs (searching for a word) vs. morphological repairs (fixing grammar) vs. syntactic repairs (restructuring) have different temporal profiles. Lexical repairs tend to be shorter; syntactic restructuring takes longer and produces more complex disfluency patterns. + +3. **L2 repairs are qualitatively different from L1**: L2 speakers repair grammar and morphology far more than native speakers (who repair word choice and register). This means L2 speech will contain MORE frequent and DIFFERENT types of repairs. The model needs exposure to L2-specific repair patterns. + +4. 
**Default forms emerge during difficulty**: Under processing pressure, L2 speakers fall back to default forms (e.g., masculine determiners in French). These defaults may produce rapid self-corrections that look like false starts. The model should recognize gender/morphology corrections as within-turn processing, not turn-abandonment. + +5. **Longitudinal variation**: Over 19 months of learning, repair patterns change. The model should accommodate evolving disfluency profiles as L2 proficiency develops. + +6. **Formulaic sequences absorb pauses**: Research presented at the same conference showed that L2 speakers' filled pauses can become **constitutive parts** of multiword formulaic sequences (e.g., "I think euh that..."). These pause-formula combinations are actually markers of fluent, routinized delivery. The model should learn common formula+pause patterns as "normal" rather than disfluent. diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/16-hal-peters-2017-L2-fluency-french.pdf b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/16-hal-peters-2017-L2-fluency-french.pdf new file mode 100644 index 0000000000000000000000000000000000000000..69c7bbfefccb036e621bd0289eb20c543d260356 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/16-hal-peters-2017-L2-fluency-french.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd445df0fb8ff9a18f7cd6961eabf019408e436c714d2566f989bc6f92b44a78 +size 2113341 diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/17-christodoulides-2015-automatic-disfluency-detection-french.pdf b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/17-christodoulides-2015-automatic-disfluency-detection-french.pdf new file mode 100644 index 0000000000000000000000000000000000000000..5c7b1d61a3b373c0be1c55fd8cf9fdea7a5e0b8e --- /dev/null +++ 
b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/17-christodoulides-2015-automatic-disfluency-detection-french.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:293860dab7f9c6269986d36f6ba6bc36c6c59d5f9935b0692894834bea2618cc +size 527995 diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/18-lo-2018-filled-pauses-german-french-bilinguals-BAAP.md b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/18-lo-2018-filled-pauses-german-french-bilinguals-BAAP.md new file mode 100644 index 0000000000000000000000000000000000000000..c92b2af4fe484afc8d99fb9d7a96c722ad7b15d7 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/18-lo-2018-filled-pauses-german-french-bilinguals-BAAP.md @@ -0,0 +1,58 @@ +--- +title: "Exploring the Effects of Bilingualism on Filled Pauses: An Acoustic-Phonetic Perspective" +authors: + - Justin Jing Hoi Lo +year: 2018 +source_url: null +date_converted: 2026-03-16 +--- + +## Abstract + +Examines the phonetic realization of filled pauses (FPs) in bilingual speakers from an acoustic-phonetic perspective. Focuses on German-French simultaneous bilinguals to explore how knowledge of multiple languages influences the production of FPs. Addresses two key questions: (1) whether bilinguals differentiate FPs across their two languages, and (2) whether bilingual FPs are acoustically distinct from monolingual FPs in the same language. Data from 15 female German-French simultaneous bilinguals (HABLA corpus) compared with 20 female French monolinguals (NCCFr corpus). + +## Key Findings Relevant to L2 Turn-Taking + +### Language-Specific Filler Phonetics +- **Bilinguals differentiate their FPs by language**: German vs. 
French FPs from the same speakers are acoustically distinct +- German FPs ("ah"): **shorter duration**, **lower F1-F3 formants** +- French FPs ("euh"): **longer duration**, **higher F1-F3 formants** +- This demonstrates that filled pauses are NOT universal grunts but have **language-specific phonetic targets** + +### Bilingual vs. Monolingual French FPs +- French FPs differed between bilinguals and monolinguals **only in duration** (not formant structure) +- Vocalic quality (F1-F3) was similar between bilingual and monolingual French FPs +- The formant structure of "euh" is maintained even by bilinguals -- it is a stable phonetic category + +### Methodological Details +- Analyzed FP variant: **UH type** only ("euh" in French, "ah" in German) -- the non-nasal variant +- Measurements: midpoint frequencies of F1, F2, F3 and duration from each vocalic segment +- Data: spontaneous speech recordings in both languages from the same speakers +- Bilingual speakers: 15 female, simultaneous acquisition of both German and French from birth +- Monolingual speakers: 20 female French monolinguals (NCCFr corpus, Torreira et al. 2010) + +### Cross-Linguistic Patterns +- Prior research (Candea et al. 2005) established language-specific patterns in vocalic quality of FPs across languages +- Speakers also exhibit **personal, speaker-specific FP variants** (Künzel 1997) +- For bilinguals, both language-specificity and speaker-specificity are **intertwined** -- each speaker has distinct FPs per language + +## Specific Hesitation Pattern Data + +- French "euh": central vowel [ə] (schwa), longer, higher formants +- German "ah": open vowel, shorter, lower formants +- FP duration difference is the primary distinguishing feature between bilingual and monolingual productions +- Speaker-specific variation exists within the language-specific norms + +## Implications for Turn-Taking Detection in L2 Speech + +1. 
**French "euh" has a specific acoustic signature**: The formant structure (F1, F2, F3) of French filled pauses is distinct from other languages' fillers. A turn-taking model processing French speakers' Portuguese should be trained to recognize the French "euh" spectral pattern, which will likely persist even when the speaker is producing Portuguese. + +2. **Duration is the bilingual marker**: Since bilingual FPs differ from monolingual FPs primarily in duration (not formant quality), the model should expect French-Portuguese bilinguals to produce French-quality "euh" sounds but with potentially different durations than monolingual French speakers. + +3. **Language-specific FPs are real lexical items**: The fact that simultaneous bilinguals maintain distinct FP phonetics across languages supports treating FPs as language-specific "words" rather than involuntary sounds. This means the FP form can reveal which language is being processed, useful for detecting language switching. + +4. **Formant-based FP detection**: Since F1-F3 formants reliably distinguish French vs. German FPs, formant analysis could help a model determine whether a bilingual speaker is in "French processing mode" or "Portuguese processing mode" based on their filler pronunciation. This could improve turn-taking prediction by identifying the currently active language. + +5. **Speaker adaptation is needed**: Speaker-specific FP variants mean the model benefits from per-speaker calibration of what counts as a filled pause. During an initial calibration period, the model could learn each speaker's characteristic FP formant patterns and durations. + +6. **Simultaneous bilinguals maintain separation**: Even speakers who acquired both languages from birth maintain distinct FP phonetics. This means L2 speakers (sequential bilinguals) will likely show even MORE pronounced L1 influence on their L2 fillers, making French "euh" recognition in Portuguese speech essential. 
diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/18-lo-2018-filled-pauses-german-french-bilinguals-BAAP.pdf b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/18-lo-2018-filled-pauses-german-french-bilinguals-BAAP.pdf new file mode 100644 index 0000000000000000000000000000000000000000..28d65eac82c94af2832f1280f25013ee4ae1fed2 Binary files /dev/null and b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/18-lo-2018-filled-pauses-german-french-bilinguals-BAAP.pdf differ diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/19-hal-multimodal-self-other-repair-L2.md b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/19-hal-multimodal-self-other-repair-L2.md new file mode 100644 index 0000000000000000000000000000000000000000..0bcff50cbf14831b734d1afcbd14514fc9f24101 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/19-hal-multimodal-self-other-repair-L2.md @@ -0,0 +1,90 @@ +--- +title: "Multimodal Self- and Other-Initiated Repairs in L2 Peer Interactions" +authors: + - Loulou Kosmala +year: 2025 +source_url: "https://doi.org/10.21437/DiSS.2025-7" +date_converted: 2026-03-16 +--- + +## Abstract + +Examines the multimodal organization of self- and other-initiated repairs in L2 peer interactions between French secondary pupils speaking English. While self-repairs typically reflect errors and language difficulties (syntactic, lexical, phonological), repair mechanisms are not solely responses to errors but tools to negotiate meaning and manage misunderstandings, especially through other-repairs. Adopts a multimodal approach analyzing visual-gestural properties (face, head, hands). Results show a preference for self-repairs over other-repairs, especially those associated with lexical and syntactic difficulties. Published at DiSS 2025 (Disfluency in Spontaneous Speech Workshop), Lisbon. 
+ +## Key Findings Relevant to L2 Turn-Taking + +### Repair Distribution in L2 Peer Interactions +- Total repairs: **167** (from 12 pupils, 6 pairs) +- **119 self-repairs (71%)** vs. **48 other-repairs (29%)** +- Strong preference for self-repair over other-repair (significant, p = .006) +- Significant individual differences in repair types across speakers + +### Self-Repair Types +| Type | Count | Description | +|---|---|---| +| Repetitions | 46 | Most frequent -- repeating same word/phrase | +| Replacements | 29 | Substituting a single word or group | +| False starts | 26 | Cut-off word or sentence | +| Reformulations | 18 | Rephrasing a multi-word unit | + +- Significant difference between types (chi-square, p < .05) +- **Lexical** and **syntactic** repairs most frequent (46 and 38 respectively) +- Lexical issues resolved through **repetitions**; phonological issues through **false starts**; syntactic issues through **reformulations** (moderate-to-strong association, Cramer's V = .34) + +### Temporal Properties of Self-Repairs (Editing Phase) +- 98 out of 119 self-repairs (82%) contained an editing phase +- **Silent pauses** (SP): 46 instances, mean duration **844ms** (SD=573) +- **Filler particles** (FP): 19 instances -- forms: 24 "euh", 12 "eum", 1 "mm" +- **FP + SP combined**: 33 instances +- Mean FP duration: **522ms** (SD=571) +- Repetitions tend to be more frequently associated with FP+SP combinations (p = .037) +- Pause duration does NOT differ across repair types (non-significant for both FPs and SPs) + +### Other-Repair Types +| Type | Count | Description | +|---|---|---| +| Restricted offers | 26 | Seeking confirmation, offering candidate understanding | +| Open repairs | 12 | Signaling misunderstanding without specifying source | +| Restricted requests | 11 | Seeking specification of a specific element | + +- Restricted offers most frequent (significant, p = .014) +- Most other-repairs deal with **understanding** (20/48) and **acceptability** (20/48) 
problems +- Understanding problems trigger open repairs; acceptability problems trigger restricted offers/requests + +### Multimodal Patterns +| Feature | Other-Repair | Self-Repair | +|---|---|---| +| Gaze at interlocutor | 91.7% | 39.5% | +| Gaze averted | 2.1% | 29.4% | +| Gaze at paper | 6.3% | 21.8% | +| No gesture | 75.0% | 43.7% | +| Adaptor gesture | 16.7% | 38.7% | +| Smile | 35.4% | 15.1% | +| Neutral face | 33.3% | 58.0% | + +- **Self-repairs**: speakers avert gaze (29.4%), look at paper (21.8%), neutral face (58%), more adaptor gestures (38.7%) +- **Other-repairs**: speakers maintain gaze at interlocutor (91.7%), smile more (35.4%), fewer gestures (75% no gesture) +- Distinct multimodal signatures for self vs. other repair + +## Specific Hesitation Pattern Data + +- L2 pupils: aged 13-16, French secondary school, lower intermediate English +- Speaking time per participant: 1.4 to 3.6 minutes +- Proficiency levels: A (16-20/20), B (12-15), C (8-11), D (<=7) +- Self-repair editing phase silent pauses: mean **844ms** (much longer than native speaker pauses) +- Filler particle forms in French L2 English: "euh" (24), "eum" (12), "mm" (1) +- French fillers used even when speaking English -- strong L1 transfer + +## Implications for Turn-Taking Detection in L2 Speech + +1. **Repair sequences are NOT turn boundaries**: 82% of self-repairs include an editing phase (silent pause + filler). These pauses (mean 844ms) are well above typical turn-transition gaps (200ms) and silence thresholds (700ms). The model MUST NOT interpret repair pauses as turn completions. + +2. **L1 fillers persist in L2**: French learners use "euh" and "eum" even when speaking English. French speakers speaking Portuguese will almost certainly use French filler forms. The model must recognize French hesitation markers in Portuguese speech as floor-holding signals. + +3. 
**Gaze direction distinguishes repair types**: Self-repairs correlate with gaze aversion (looking away/down), while other-repairs correlate with direct gaze. If multimodal features are available, gaze can help distinguish between processing pauses (self-repair, don't interrupt) and comprehension checks (other-repair, response expected). + +4. **Repair clusters signal ongoing processing**: Self-repairs frequently combine FP + SP (33/119 cases = 28%), creating pause clusters of 1+ seconds. These are prime false-positive triggers for silence-based turn detectors. The model needs to recognize repair-in-progress patterns. + +5. **Other-repairs are turn-relevant**: When a listener initiates an other-repair (especially restricted offers like "you mean X?"), this IS a turn-taking event. The model should recognize other-repair initiations as legitimate turn entries, distinct from interruptions. + +6. **Proficiency affects repair patterns**: Lower-proficiency speakers produce more repairs with longer editing phases. The model should expect French speakers with lower Portuguese proficiency to show more and longer repair sequences. 
diff --git a/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/19-hal-multimodal-self-other-repair-L2.pdf b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/19-hal-multimodal-self-other-repair-L2.pdf new file mode 100644 index 0000000000000000000000000000000000000000..a17dbf40cc249d39d2758bc8e26fc5f34febcf87 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/hesitation-l2-french/19-hal-multimodal-self-other-repair-L2.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea56b28b9aaebaed39a48dd7a53e9e4c930a478a9977bb85a2f4a6f154a6545f +size 545507 diff --git a/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/conversar_mixed_reality_l2.pdf b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/conversar_mixed_reality_l2.pdf new file mode 100644 index 0000000000000000000000000000000000000000..dde5f5769a331aaa9531b4a7c841a8ae4af11f01 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/conversar_mixed_reality_l2.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13e8de53a56903b66eec5f19600c0116ccd6b2b4d1865002eecd930b9efc00a7 +size 7002717 diff --git a/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/conversar_mixed_reality_l2_2025.md b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/conversar_mixed_reality_l2_2025.md new file mode 100644 index 0000000000000000000000000000000000000000..1b5583ca4f2172f8c007ab843a946355ff9da28e --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/conversar_mixed_reality_l2_2025.md @@ -0,0 +1,71 @@ +# ConversAR: Mixed Reality Agents for Interactive Group Conversation (L2 Learning) + +**PDF:** `conversar_mixed_reality_l2.pdf` +**Source:** https://arxiv.org/pdf/2510.08227 +**Authors:** Mariana Fernandez-Espinosa, Kai Zhang, Jad Bendarkawi, et al. 
(University of Notre Dame, Princeton, NJIT) +**Venue:** arXiv 2025 (CHI-style HCI paper) + +## Summary + +Presents ConversAR, a Mixed Reality system using Generative AI and XR (Meta Quest 3) to support situated, personalized group conversations for L2 language learners. Features embodied AI agents, scene recognition, and generative 3D props anchored to real-world surroundings. Tested with 21 L2 learners. + +## System Architecture + +### Key Components +- **Platform**: Unity on Meta Quest 3 +- **LLM Backend**: OpenAI API (GPT-5) -- not fine-tuned +- **NPCs**: Two AI agents (Ready Player Me avatars + Mixamo animations) +- **Speech**: Speech-to-text + Text-to-speech pipeline +- **Scene Recognition**: Meta Passthrough Camera API -> GPT-5 for object recognition +- **3D Props**: Text-to-3D generative AI for contextual objects during conversation + +### Interaction Flow +1. **Getting to Know You Phase** (1-on-1): Single NPC assesses learner's level, interests, strengths/weaknesses +2. **Scene Capture**: System photographs physical surroundings, identifies objects +3. **Multi-party Conversation**: Two NPCs engage learner in group conversation grounded in their environment +4. **Dynamic Props**: 3D objects generated based on conversation topics, placed on physical surfaces + +### Turn-Taking Implementation +- **Supervisor LLM** manages turn-taking to ensure balanced engagement +- NPCs converse with each other for up to 3 consecutive turns (pilot-tested limit) +- NPCs consistently address the learner with direct questions or invitations +- Learner gets infinite thinking time (no timer pressure) +- Turn assignment is dynamic, based on natural dialogue flow (e.g., if learner mentions an NPC by name) + +## Formative Study Findings (10 SLA Educators) + +### Key Insights About L2 Learners in Group Conversations +1. **KI1**: Students fear mistakes and judgment when speaking +2. 
**KI2**: Students fear being left out when peers have higher proficiency (more proficient speakers dominate) +3. **KI3**: Students fear repeating mistakes because corrective feedback is missing (leads to "fossilization of errors") +4. **KI4**: Students disengage when activities feel irrelevant to their lives + +## Design Goals +1. **DG1**: Facilitate confidence through group conversations with AI peers (no social pressure) +2. **DG2**: Deliver supportive corrective feedback (recasts, clarification requests, metalinguistic feedback) +3. **DG3**: Create realistic, contextualized conversations grounded in physical environment +4. **DG4**: Interactive 3D props to sustain and deepen conversation +5. **DG5**: Adaptive scaffolding matching learner proficiency level (vocabulary, sentence complexity) + +## User Study Results (21 L2 Learners) +- System enhanced learner engagement +- Increased willingness to communicate +- Offered a safe space for speaking practice +- Generative 3D props helped sustain and deepen conversations +- Corrective feedback was well-received (implicit recasts preferred) + +## Corrective Feedback Strategies +- **Recasts**: NPC repeats learner's utterance correctly without explicitly pointing out error +- **Clarification requests**: Signal misunderstanding, prompting reformulation +- **Metalinguistic feedback**: Comments about correctness without giving the answer +- **Circumlocution**: Rephrasing concepts in simpler terms for low-proficiency learners + +## Relevance to Turn-Taking for Language Learners + +- **Group dynamics matter**: L2 learners face unique turn-taking challenges in group settings -- fear of speaking, proficiency gaps, dominance by stronger speakers. A turn-taking system must account for these. +- **Infinite thinking time**: The system deliberately removes time pressure, acknowledging that L2 speakers need more time to formulate responses. This contrasts sharply with standard end-of-turn detectors that use fixed silence thresholds. 
+- **Supervisor LLM for turn management**: Using an LLM to manage turn-taking (rather than acoustic signals alone) is a practical approach for L2 group conversations where acoustic cues may be unreliable. +- **Balanced participation**: The system ensures NPCs invite the learner to contribute, preventing the common classroom problem of silent learners. +- **Corrective feedback integrated into turn-taking**: The NPCs use their turn to provide implicit corrections (recasts), which is a pedagogically sound approach that doesn't disrupt conversational flow. +- **Situated/contextual conversations**: Grounding conversations in the physical environment provides natural topic anchors, reducing the cognitive load of thinking of what to say (a major source of long pauses in L2 speech). +- **Proficiency-adaptive language**: The system adjusts complexity based on assessed level, which is relevant for calibrating turn-taking expectations (lower proficiency -> expect longer pauses). diff --git a/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/deepgram_etd_evaluation.md b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/deepgram_etd_evaluation.md new file mode 100644 index 0000000000000000000000000000000000000000..1a7d62fea49385a2ad4d297b904d306e0e828595 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/deepgram_etd_evaluation.md @@ -0,0 +1,74 @@ +# Evaluating End-of-Turn Detection Models + +**Source:** https://deepgram.com/learn/evaluating-end-of-turn-detection-models +**Authors:** Jack Kearney (Staff Research Scientist) & Chau Luu (Senior Research Scientist) +**Published:** October 28, 2025 + +## Overview + +Natural voice agent interactions depend critically on accurate turn-taking behavior. An agent responding too quickly may interrupt users, while one that's overly cautious feels sluggish and unresponsive. 
+ +Detecting End-of-Turn (EoT) -- recognizing when a speaker has finished and awaits a response -- is fundamental to building conversational voice systems. Multiple solutions exist, ranging from integrated ASR+EoT systems like Deepgram Flux and AssemblyAI Universal-Streaming, to audio-focused approaches like Pipecat Smart Turn and Krisp Turn-Taking, and transcript-based solutions such as LiveKit's EoU model. + +The Deepgram team needed rigorous evaluation methodology to ensure Flux delivered superior EoT detection. Rather than relying on proxy metrics or isolated turn analysis, they developed a novel evaluation framework centered on complete, real conversational data and sophisticated sequence alignment techniques. + +## Full Conversational Evaluation + +The team evaluated models against entire human conversations rather than individual turns. This approach captures realistic conversational dynamics that isolated turn analysis misses: + +- **High-quality labels:** Natural counterparty speech provides reliable turn boundary detection +- **Realistic detection budgets:** Not all turns allow equal response latency; simple queries demand faster responses than complex ones +- **Natural pause patterns:** Pre-turn silence can be studied alongside turn endings, enabling investigation of start-of-speech detection and potential backchanneling + +The research dataset comprised over 100 hours of genuine conversations with groundtruth transcripts and timestamped EoT annotations. An important lesson emerged during labeling: using "End-of-Thought" instead of "End-of-Turn" created ambiguity. The revised specification -- asking annotators to mark where a voice agent should naturally begin speaking -- improved self-reported confidence from 5/10 to 8.5/10. 
+ +## The Trouble with Timestamps + +Initial attempts to match predicted EoT times against human-annotated labels revealed a critical problem: human annotators tended to be somewhat conservative in label placement, leaving a small gap between actual speech cessation and marked timing. This meant Flux sometimes detected EoT within these gaps, yet would register as incorrect using purely temporal comparison. + +To address this, researchers employed forced alignment to refine human timestamps, anchoring labels to precise speech endings extracted through acoustic analysis. While critics might argue that human labels better represent when naturally to speak (with a small pause), voice agent contexts differ from human conversation -- agents have separate latency from LLM and TTS processing, making early detection advantageous. + +### The Limitation of Time-Based Alignment + +A naive approach would simply match each groundtruth EoT to the earliest detection falling between turn end and the next speaker's start. However, this imposes unrealistic constraints: it forbids detection before acoustic speech completion, ignoring that sophisticated models might predict EoT before a word finishes. + +Relaxing this requirement by allowing slightly early detection introduces new hyperparameters without substantially improving reliability, according to manual review. + +## Using Sequence Alignment to Improve Turn Boundary Detection Evaluation + +The breakthrough came from applying sequence alignment -- the same technique used to calculate word error rate (WER) in speech-to-text -- to turn detection evaluation. By treating turn boundaries as special tokens (`[EoT]`) in transcripts, researchers leveraged transcript context to improve EoT prediction-to-groundtruth matching. + +### Example of Sequence Alignment Advantage + +Consider this scenario: + +``` +TRUTH: Hi Chau! [EoT-1] I'm fine thanks. [EoT-2] Sure that sounds great. [EoT-3] +PREDICTION: Hi! [EoT] Sure that sounds great. 
[EoT] +``` + +Purely temporal alignment would penalize the first `[EoT]` prediction with both a false positive and a false negative because it appeared 180ms early. However, sequence alignment recognizes the prediction aligns reasonably well with the groundtruth structure -- just slightly ahead of time. + +### Performance Impact + +Switching from pure temporal to sequence-alignment-based evaluation yielded dramatic improvements: 3-5% absolute increases in precision and recall across models for all-in-one solutions, text-based detectors, and audio-only systems. Manual investigation confirmed these improvements accurately reflected true performance. + +## Handling Dropped Turns + +Standard Levenshtein alignment can produce ambiguous results when turns are dropped. The team modified the Levenshtein algorithm to prioritize `[EoT]`-to-`[EoT]` alignment quality alongside overall edit distance, with timestamp context providing additional evidence. + +## Turn Start Evaluation + +Beyond EoT, detecting when users begin speaking (Start-of-Turn/SoT) matters for handling interruptions and barge-in scenarios. Faster SoT detection reduces overlapping speech and preserves natural interaction flow. + +### Flux SoT Performance + +Flux demonstrates robust SoT detection, generally identifying turn starts within approximately 100-200ms (roughly the duration of a typical first word), with false positive rates below 1-2%. + +## Key Takeaways + +1. **Complete conversation evaluation** beats isolated turn analysis for capturing realistic dynamics +2. **Sequence alignment** significantly outperforms pure timestamp matching for evaluation accuracy +3. **Forced alignment refinement** of human annotations improves label precision without removing human judgment +4. **Modified Levenshtein algorithms** better handle edge cases like dropped turns +5. 
**Comprehensive metrics** combining accuracy, latency, and false positives provide fuller performance pictures than isolated measurements diff --git a/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/deepgram_flux_conversational_asr.md b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/deepgram_flux_conversational_asr.md new file mode 100644 index 0000000000000000000000000000000000000000..8ed1c7abb64b5492bfbe064d30b7604a3026ad4b --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/deepgram_flux_conversational_asr.md @@ -0,0 +1,72 @@ +# Introducing Flux: Conversational Speech Recognition + +**Source:** https://deepgram.com/learn/introducing-flux-conversational-speech-recognition + +## Overview + +Deepgram Flux is described as "the first real-time Conversational Speech Recognition model built for voice agents." It addresses the critical challenge of determining when users have finished speaking by integrating turn detection directly into the ASR model. + +## Core Problem + +Traditional ASR systems were designed for transcription, not real-time conversation. Voice agent developers face an impossible tradeoff: + +- **Aggressive approach**: Agents interrupt mid-sentence, destroying user trust +- **Conservative approach**: Robotic pauses damage engagement + +The industry currently patches this gap by combining separate systems -- VADs, endpointing layers, and ASR pipelines -- creating complexity and latency issues. + +## Key Technical Innovations + +### Native Turn Detection + +Flux integrates turn-taking into the same model that produces transcripts: + +- Semantic awareness (distinguishing "because..." 
from completed thoughts) +- Fewer false cutoffs through full contextual understanding +- Eliminated pipeline delays since transcripts are ready when turns end + +### Performance Benchmarks + +- **Median latency reduction**: 200-600ms versus pipeline approaches +- **False interruption reduction**: ~30% fewer compared to alternatives +- **P90/P95 latency**: 1-1.5 seconds for most detections +- **Fast-response rate**: Majority of detections occur within 500ms + +### Transcription Quality + +- Matches Nova-3 on WER and WRR +- Preserves keyterm prompting capabilities +- Lowest WER on conversational audio benchmarks + +### Voice Agent Quality Index (VAQI) + +Flux ranked first in overall conversation quality metrics when tested on challenging real-world audio with background noise and disfluencies. + +## Developer API + +Flux replaces complex multi-system pipelines with two core events: + +- **StartOfTurn**: User begins speaking (enables barge-in) +- **EndOfTurn**: High confidence turn completion + +Configuration: +- `eot_threshold`: Confidence level (default 0.7) +- `eot_silence_threshold_ms`: Fallback silence duration (default 5000ms) + +### Eager End-of-Turn Detection + +For latency-critical applications: +- **EagerEndOfTurn**: Medium confidence (150-250ms earlier) +- **TurnResumed**: User continued speaking +- Tradeoff: 50-70% increase in LLM calls for faster responses + +## Integration Partners + +- Jambonz, Vapi, LiveKit, Pipecat, Cloudflare + +## Future Roadmap + +- Self-hosted deployment support +- Multilingual support +- Word-level timestamps +- Selective listening and backchanneling identification diff --git a/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/hesitation_tagging_l2_whisper_lora.pdf b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/hesitation_tagging_l2_whisper_lora.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e18ea9c75c6765cfb1182363b2daec1992efd7f8 --- /dev/null +++ 
b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/hesitation_tagging_l2_whisper_lora.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dbfeebbb920a66c87215e1a11536e24b6f3c05a15492e69cc72ba4ff33f97f4 +size 162480 diff --git a/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/hesitation_tagging_whisper_lora_2025.md b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/hesitation_tagging_whisper_lora_2025.md new file mode 100644 index 0000000000000000000000000000000000000000..31f6338b06d85772ad9de67633627800cf68145d --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/hesitation_tagging_whisper_lora_2025.md @@ -0,0 +1,67 @@ +# Acoustically Precise Hesitation Tagging Is Essential for End-to-End Verbatim Transcription Systems + +**PDF:** `hesitation_tagging_l2_whisper_lora.pdf` +**Source:** https://arxiv.org/pdf/2506.04076 +**Authors:** Jhen-Ke Lin, Hao-Chien Lu, Chung-Chun Wang, Hong-Yun Lin, Berlin Chen (National Taiwan Normal University) +**Venue:** arXiv 2025 (Speak & Improve Challenge 2025 submission) + +## Summary + +Demonstrates that acoustically precise hesitation tagging (labeling "um" and "uh" accurately) significantly improves ASR for L2 English learners when fine-tuning Whisper with LoRA. Compares three transcription schemes and shows that explicit filled-pause labeling yields an 11.3% relative WER improvement over omitting hesitations. + +## Method + +### Three Transcription Schemes Compared +1. **Pure**: All hesitation tags and punctuation removed (baseline) +2. **Rich**: Generic "#" tags for hesitations + punctuation markers (., ?, ...) +3. 
**Extra**: Acoustically precise "um"/"uh" tokens inferred by Gemini 2.0 Flash from audio-transcript pairs + punctuation + +### Gemini-based Annotation +- Used Google Gemini 2.0 Flash as an offline labeling tool +- Input: Rich transcription + corresponding audio +- Task: Infer acoustically plausible filled pauses from the generic "#" markers +- Cost: **Only $5 USD** to label entire training + dev set (thousands of utterances) +- Dramatically cheaper than human annotation + +### Fine-tuning Approach +- **Whisper Large V3** (1.55B params) for challenge tracks +- **Whisper Large V3 Turbo** (809M params, distilled) for post-challenge experiments +- **rsLoRA** (rank-stabilized Low-Rank Adaptation): rank=32, alpha=8, dropout=0.05 +- Applied to query, key, value, output projections + feedforward layers +- Trained on Speak & Improve Corpus 2025 (~55h transcribed subset) + +## Key Results + +### Baseline (no fine-tuning) +| Model | Parameters | WER | +|-------|-----------|-----| +| Whisper Small | 244M | 10.7% | +| Whisper Medium | 769M | 10.4% | +| Whisper Large V3 | 1.55B | 9.5% | +| Whisper Large V3 Turbo | 809M | 9.6% | + +### Challenge Results +- **Closed Track** (Pure scheme): 6.47% WER -- **1st place** +- **Open Track** (Extra scheme): 5.81% WER -- 3rd place + +### Post-Challenge: Transcription Scheme Comparison (Whisper Large V3 Turbo) +| Scheme | WER | vs. Pure | +|--------|-----|----------| +| Pure | 6.2% | baseline | +| Rich | 7.2% | +16.1% (worse!) 
| +| **Extra** | **5.5%** | **-11.3%** | + +### Critical Finding +- Generic "#" tags actually **hurt** performance (+16.1% relative WER increase) +- Acoustically precise "um"/"uh" **significantly help** (-11.3% relative WER improvement) +- Abstract/generic tags conflate different types of non-lexical vocalizations +- Real filled-pause tokens strengthen alignment between acoustic patterns and transcript output + +## Relevance to Turn-Taking for Language Learners + +- **Hesitations are turn-taking signals**: In L2 speech, filled pauses ("um", "uh") often signal that the speaker is still thinking and has NOT yielded the turn. Accurate detection of these pauses is critical for turn-taking systems. +- **Practical LoRA approach**: The LoRA fine-tuning approach (rank=32 on Whisper) is directly applicable to fine-tuning ASR for Portuguese L2 learners with minimal compute. +- **Cheap annotation via LLM**: Using Gemini/Claude to annotate hesitations at $5/dataset makes it feasible to create L2 Portuguese hesitation-annotated data. +- **Verbatim transcription enables turn-taking**: If the ASR system accurately captures filled pauses, the downstream turn-taking model gets richer signal about speaker state (thinking vs. yielding). +- **L2-specific challenges**: L2 speakers produce more and different hesitation patterns than L1 speakers. Models trained only on L1 speech will mishandle these. +- **Whisper as foundation**: Confirms Whisper Large V3 as a strong foundation for L2 speech, reducible from 9.5% to 5.5% WER with targeted fine-tuning on only ~55 hours of data. 
diff --git a/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/hume_evi_overview.md b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/hume_evi_overview.md new file mode 100644 index 0000000000000000000000000000000000000000..be5ed7318378bcca5e96f740657058e9d596be74 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/hume_evi_overview.md @@ -0,0 +1,49 @@ +# Hume EVI (Empathic Voice Interface) Overview + +**Source:** https://dev.hume.ai/docs/speech-to-speech-evi/overview + +## Overview + +Hume's Empathic Voice Interface (EVI) is a real-time voice AI system that analyzes emotional nuances in speech. It processes "tune, rhythm, and timbre of speech" to enable more natural interactions between humans and AI systems. + +## Core Capabilities + +- **Transcription**: Rapid ASR with expression measurements aligned to each sentence +- **Language Generation**: Speech-language model, optionally integrated with Anthropic and OpenAI APIs +- **Voice Response**: Streamed speech generation via the speech-language model +- **Low Latency**: Immediate responses by running integrated models on unified infrastructure + +## Empathic AI Features + +- **Timing Detection**: Determines appropriate response moments using vocal tone analysis +- **Prosody Understanding**: Measures user speech characteristics via integrated prosody modeling +- **Tonal Matching**: Generates responses reflecting the user's emotional state (apologetic for frustration, sympathetic for sadness) +- **Expression-Aware Responses**: Crafts linguistically appropriate answers based on vocal expression +- **Interruptibility**: Stops speaking when interrupted and resumes with proper context +- **Multilingual Support**: EVI 4-mini handles 11 languages; EVI 3 supports English only + +## Version Comparison + +| Feature | EVI 3 | EVI 4-mini | +|---------|-------|------------| +| Quick responses | Yes | No | +| Supplemental LLM required | Optional | 
Required | +| Languages | English only | 11 languages | + +## API Architecture + +- WebSocket connections for real-time dialogue +- Authentication via API keys or access tokens (query parameters) +- Session concurrency limits by subscription tier +- Maximum 30-minute session duration +- 16 MB message size limit + +## Developer Resources + +- Quickstart guides for Next.js, TypeScript, and Python +- WebSocket and REST API references +- Sample code repositories + +## Relevance to Turn-Taking + +EVI's approach to turn-taking is notable because it uses prosody and emotional analysis rather than just silence detection. The system analyzes vocal tone to determine when a user has finished speaking, which is particularly relevant for language learners who may have longer pauses, hesitations, and non-standard prosodic patterns. diff --git a/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/iwsds2025_survey_turn_taking.pdf b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/iwsds2025_survey_turn_taking.pdf new file mode 100644 index 0000000000000000000000000000000000000000..29da25684e0b00718dfc753a51a465b2863d0532 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/iwsds2025_survey_turn_taking.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3fd4a6e8a77a46ad20634ba4b05c5731dbf4c692eb1acdb2fc1bb71062f12ee +size 473840 diff --git a/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/multilingual_vap_2024.md b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/multilingual_vap_2024.md new file mode 100644 index 0000000000000000000000000000000000000000..29dc62a53643259d5c16f3583d6a0ba4072b8490 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/multilingual_vap_2024.md @@ -0,0 +1,57 @@ +# Multilingual Turn-taking Prediction Using Voice Activity Projection + +**PDF:** `multilingual_vap_2024.pdf` 
+**Source:** https://arxiv.org/pdf/2403.06487 +**Authors:** Koji Inoue, Bing'er Jiang, Erik Ekstedt, Tatsuya Kawahara, Gabriel Skantze +**Venue:** LREC-COLING 2024 (Kyoto University + KTH Royal Institute of Technology) + +## Summary + +Investigates whether the Voice Activity Projection (VAP) model -- a continuous turn-taking predictor that works on raw audio -- can be applied across languages. Tests on English, Mandarin, and Japanese (three different language families: Germanic, Sino-Tibetan, Japonic). + +## VAP Model Architecture + +- **Input**: Stereo audio (one channel per speaker), up to 20 seconds, 16kHz, 50Hz frame rate +- **Audio Encoder**: Contrastive Predictive Coding (CPC) pre-trained on English LibriSpeech, frozen during training +- **Architecture**: CPC encoder -> Self-attention Transformer (per channel) -> Cross-attention Transformer (interaction between channels) -> Linear layers for VAP + VAD predictions +- **Output**: Predicts joint voice activity of both speakers over a 2-second future window, divided into 4 bins (0-200ms, 200-600ms, 600-1200ms, 1200-2000ms), yielding 256 possible states + +## Research Questions & Answers + +### RQ1: Can a monolingual model transfer to other languages? +**No, not well.** A VAP model trained on one language does not make good predictions on other languages. Cross-lingual performance drops significantly. + +### RQ2: Can a single multilingual model match monolingual performance? +**Yes.** A multilingual model trained on all three languages performs on par with monolingual models across all languages. This is a key finding -- one model can handle multiple languages. + +### RQ3: Has the multilingual model learned to identify language? +**Yes.** Analysis shows the multilingual model has learned to discern the language of the input signal, suggesting it develops language-specific internal representations. + +### RQ4: How important is pitch? +Pitch (prosodic cue) is important for turn-taking prediction. 
Sensitivity analysis confirms the model leverages pitch information, which varies across languages: +- **Mandarin**: Turn-final pitch lowering for all words regardless of lexical tone +- **Japanese**: Turn transition time centers around 0ms +- **English**: More overlaps between turns + +### RQ5: Audio encoder effect? +Compared CPC (English pre-trained) with MMS (multilingual wav2vec 2.0). MMS provides benefits for multilingual scenarios but CPC still performs well given its English-only pre-training. + +## Cross-linguistic Turn-taking Differences +- Turn transition timing: Mandarin and Japanese ~0ms, English has more overlaps +- Intonation change at end of utterances is effective across all languages +- Backchannel frequency: Japanese highest, then English, then Mandarin +- Both universal tendencies and language-specific differences exist + +## Datasets Used +- **English**: Switchboard (telephone conversations) +- **Mandarin**: HKUST/MTS (telephone conversations) +- **Japanese**: CEJC (everyday conversations, with video) + +## Relevance to Turn-Taking for Language Learners + +- **Critical for L2 turn-taking**: Since monolingual models fail cross-linguistically, a turn-taking system for language learners MUST account for their L1 influence on turn-taking behavior +- A multilingual VAP model could handle learners from different L1 backgrounds with a single model +- L2 speakers may exhibit turn-taking patterns from their L1 (e.g., a Japanese L1 speaker learning English may produce more backchannels than expected) +- The model's ability to identify language could potentially be extended to identify L1 interference in L2 speech +- Pitch sensitivity is important because L2 speakers often transfer L1 prosodic patterns, which could confuse language-specific turn-taking detection +- **For Pipecat fine-tuning**: The VAP architecture (continuous prediction from raw audio using Transformers) is directly relevant as a baseline for adapting turn-taking to L2 Portuguese learners 
diff --git a/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/multilingual_vap_2024.pdf b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/multilingual_vap_2024.pdf new file mode 100644 index 0000000000000000000000000000000000000000..315cdc14323e72a181e87229986f0bcea8eb4195 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/multilingual_vap_2024.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a12189557e7022189609c710cd8071f359ae4e05a3c755c7e1aceef1b19a86bc +size 974569 diff --git a/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/praktika_openai_case_study.md b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/praktika_openai_case_study.md new file mode 100644 index 0000000000000000000000000000000000000000..79d5e2e5f7017184a4a67460dc2424a27d2ee23f --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/praktika_openai_case_study.md @@ -0,0 +1,40 @@ +# Praktika - OpenAI Case Study + +**Source:** https://openai.com/index/praktika/ (not accessible via automated fetch; content reconstructed from public information) + +**Note:** OpenAI's website blocks automated access (403). This summary is based on publicly available information about the Praktika case study. Please visit the URL directly in a browser to read the full case study. + +## Overview + +Praktika is an AI-powered English language learning app that uses OpenAI's technology to create interactive conversational practice experiences. The app features AI avatars that serve as language tutors, enabling learners to practice speaking English in realistic conversational scenarios. 
+ +## Key Features + +- **AI Avatar Tutors**: Lifelike AI characters that engage learners in conversation +- **Real-time Speech Interaction**: Uses OpenAI's Realtime API for low-latency voice conversations +- **Personalized Learning**: Adapts to individual learner levels and goals +- **Scenario-based Practice**: Real-world situations (job interviews, ordering food, etc.) +- **Pronunciation Feedback**: Real-time corrections and guidance +- **Progress Tracking**: Monitors learner improvement over time + +## Technology Stack + +- Built on OpenAI's GPT models for language understanding and generation +- Uses OpenAI's Realtime API for voice-to-voice interactions +- Implements turn-taking for natural conversational flow +- Handles L2 speaker disfluencies (hesitations, false starts, code-switching) + +## Relevance to Turn-Taking for Language Learners + +Praktika represents a key commercial example of turn-taking challenges in language learning: + +1. **L2 Speaker Patterns**: Learners pause longer, hesitate more, and have non-native prosody -- all of which confuse standard end-of-turn detectors +2. **Patience vs. Responsiveness**: The system must wait long enough for learners to formulate responses without creating awkward silences +3. **Scaffolding**: AI must recognize when a learner is struggling and offer help vs. when they are simply thinking +4. 
**Cultural Sensitivity**: Turn-taking norms vary across cultures; learners from different L1 backgrounds have different expectations + +## Scale + +- Millions of language learning sessions +- Available on iOS and Android +- One of the prominent showcases for OpenAI's Realtime API in education diff --git a/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/speak_improve_corpus_2025.md b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/speak_improve_corpus_2025.md new file mode 100644 index 0000000000000000000000000000000000000000..e2ed510f34dfb34f21611cb3a12f4e538548e995 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/speak_improve_corpus_2025.md @@ -0,0 +1,62 @@ +# Speak & Improve Corpus 2025: L2 English Speech Corpus for Language Assessment and Feedback + +**PDF:** `speak_improve_corpus_2025.pdf` +**Source:** https://arxiv.org/pdf/2412.11986 +**Authors:** Kate Knill, Diane Nicholls, Mark J.F. Gales, Mengjie Qian, Pawel Stroinski (Cambridge University) +**Venue:** arXiv 2024 (associated with Speak & Improve Challenge 2025) + +## Summary + +Introduces the Speak & Improve Corpus 2025, the most comprehensive publicly available L2 English learner speech corpus to date. Contains ~315 hours of L2 English learner audio with proficiency scores, plus a 55-hour subset with manual transcriptions (including disfluencies) and grammatical error correction annotations. 
+ +## Corpus Details + +### Scale +- ~315 hours of L2 English learner speech +- ~55 hours manually transcribed with disfluencies and error labels +- ~950 fully annotated test submissions +- ~2,500+ submissions with proficiency scores only +- Collected from 1.7 million users of the Speak & Improve platform (Dec 2018 - Sep 2024) + +### Speaker Diversity +- Speakers from across the globe (wide range of L1 backgrounds) +- CEFR proficiency levels: A2 (Elementary) to C1 (Advanced) +- Majority of data in B1-B2+ range (most common learner levels) + +### Test Structure (5 parts, based on Linguaskill Speaking Test) +1. **Interview**: 8 questions about themselves (10-20s each) +2. **Read Aloud**: 8 sentences (not included in corpus) +3. **Long Turn 1**: 1-minute opinion on a topic +4. **Long Turn 2**: 1-minute presentation about a graphic +5. **Communication Activity**: 5 questions on a topic (20s each) + +### Three-Phase Annotation +1. **Phase 1 - Scoring**: Audio quality score (3-5) + holistic CEFR score (1-6 per part) +2. **Phase 2 - Transcription**: Manual transcription including disfluencies (hesitations, false starts, repetitions), code-switching, pronunciation errors, phrase boundaries +3. 
**Phase 3 - Error Annotation**: Grammatical error correction on fluent transcriptions (disfluencies removed first) + +### Annotation Tags (Phase 2) +- Word-level: `backchannel`, `disfluency`, `partial`, `pronunciation` +- Word tags: `hesitation`, `code-switch`, `foreign-proper-noun`, `unknown` +- Phrase tags: `speech-unit-incomplete`, `speech-unit-statement`, `speech-unit-question` + +### Data Split +| Set | Submissions | Utterances | Hours (Trans) | Hours (SLA) | Words (Trans) | +|-----|-------------|------------|---------------|-------------|---------------| +| Dev | 438 | 5,616 | 22.9 | 35.3 | 140k | +| Eval | 442 | 5,642 | 22.7 | 35.4 | 140k | +| Train | 6,640 | 39,490 | 28.2 | 244.2 | 170k | + +## Key Innovations +- First corpus to provide **audio with grammatical error corrections** for spoken GEC research +- Handles complexities unique to spoken GEC: disfluencies, varied accents, spontaneous speech +- Available for non-commercial academic research via ELiT website + +## Relevance to Turn-Taking for Language Learners + +- **Disfluency patterns by proficiency level**: The corpus captures how hesitations, false starts, and pauses vary across CEFR levels -- directly relevant for training turn-taking models that must distinguish L2 thinking pauses from turn-yielding pauses +- **L1-dependent pronunciation**: Wide L1 diversity means the corpus captures how different L1 backgrounds affect speech patterns, including prosodic cues used in turn-taking +- **Monologic but structured**: While the tasks are monologic (not dialogic), the hesitation and disfluency annotations provide ground truth for understanding L2 speaker timing patterns +- **Phrase boundary annotations**: Can be used to study how L2 speakers signal phrase/turn boundaries differently from L1 speakers +- **Potential training data**: The 315 hours of scored L2 speech could be used to fine-tune ASR systems (like Whisper) that are more robust to L2 speech patterns, which is a prerequisite for accurate 
turn-taking in L2 conversations +- **Code-switching annotations**: Relevant for multilingual learners who may switch languages mid-turn, confusing standard turn-taking detectors diff --git a/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/speak_improve_corpus_2025.pdf b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/speak_improve_corpus_2025.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f403804ca9a33de90f6bb00364e31e213f5f4d78 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/speak_improve_corpus_2025.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bc7626ed26c90ca326bccc47e3d8e610f83b1b836ccef9e2e32a994e6a07fd9 +size 204406 diff --git a/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/survey_turn_taking_iwsds2025.md b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/survey_turn_taking_iwsds2025.md new file mode 100644 index 0000000000000000000000000000000000000000..128ff8a770675bcf30fe88001d22ae39268dc3ee --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/survey_turn_taking_iwsds2025.md @@ -0,0 +1,63 @@ +# Survey of Recent Advances on Turn-taking Modeling in Spoken Dialogue Systems (IWSDS 2025) + +**PDF:** `iwsds2025_survey_turn_taking.pdf` +**Source:** https://aclanthology.org/2025.iwsds-1.27.pdf +**Authors:** Galo Castillo-Lopez, Gael de Chalendar, Nasredine Semmar (Universite Paris-Saclay, CEA, List) +**Venue:** IWSDS 2025 (15th International Workshop on Spoken Dialogue Systems Technology) + +## Summary + +Comprehensive survey of recent methods on turn-taking modeling in spoken dialogue systems, with special attention on studies published after 2021 (building on Skantze 2021's earlier survey). The paper reviews end-of-turn prediction, backchannel prediction, and multi-party conversation turn-taking. 
+ +## Key Findings + +### The Problem +- Human-human conversation transitions take ~200ms on average +- Current spoken dialogue agents initiate turns after 700-1000ms gaps, resulting in unnatural conversations +- 72% of reviewed works do NOT compare their methods with previous efforts -- a major gap +- Lack of well-established benchmarks to monitor progress + +### Three Categories of End-of-Turn Methods +1. **Silence-based**: Simple VAD + silence threshold (e.g., 700ms). Poor user experience. +2. **IPU-based (Inter-Pausal Unit)**: Predictions made after each detected silence. Assume turns cannot be taken while user speaks. +3. **Continuous**: Constantly evaluate end-of-turn regardless of silences (e.g., every 50ms). Most promising. + +### Continuous Methods -- State of the Art +- **Voice Activity Projection (VAP)**: Predicts future voice activity for both speakers in a dialogue using cross-attention Transformers on raw audio. Emerging as the dominant approach. +- **TurnGPT**: GPT-2-based model fine-tuned on dialogue datasets for turn-completion prediction based on text features only. Outperforms previous work due to strong context representation. +- Combined prosodic + linguistic features consistently outperform individual feature types (additive effect of turn-taking cues). +- LLMs are currently **inefficient** at detecting mid-utterance turn-taking opportunities (Umair et al., 2024). 
+ +### IPU-based Methods +- LSTM-based architectures for prosodic/phonetic/lexical features +- CNN models effective when incorporating visual cues (eye, mouth, head motion) +- Speech acts as auxiliary tasks improve turn-taking performance +- Recent work: instruction fine-tuning on LLMs with HuBERT audio features + +### Datasets Used +- **Switchboard** (260h, 2.4K dyadic telephone dialogues) -- most used +- **Fisher Corpus** (1960h, 11.7K topic-oriented telephone conversations) +- **AMI/ICSI Meeting Corpus** (100h/72h, multi-party) +- Datasets predominantly in English; some in Japanese, Mandarin, French, German +- Notable gap: very few datasets for non-English and multi-party scenarios + +### Multi-party Conversations +- Much less explored than dyadic scenarios +- Additional complexity: addressee recognition, floor management +- Mainly studied in human-robot interaction contexts +- CEJC (Japanese) and AMI/ICSI (English) corpora used + +### Key Challenges Identified +1. No standardized benchmarks for comparing turn-taking models +2. Most research focuses on English; multilingual models needed +3. Integration of visual/multimodal features underexplored in VAP models +4. Real-time deployment challenges +5. 
Handling interruptions and overlaps poorly addressed + +## Relevance to Turn-Taking for Language Learners + +- L2 speakers produce longer pauses, more hesitations, and non-standard prosody -- all confuse standard EoT detectors +- The VAP model's continuous prediction approach could be adapted to handle L2 speaker patterns by training on L2 speech data +- TurnGPT's text-based approach might be less affected by L2 pronunciation issues but more affected by grammatical errors +- Multi-party turn-taking (poorly studied) is exactly the scenario in classroom/group language learning +- The finding that combined cues work better than individual ones suggests L2-adapted systems should use both audio and text features diff --git a/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/tavus_ai_turn_taking_guide.md b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/tavus_ai_turn_taking_guide.md new file mode 100644 index 0000000000000000000000000000000000000000..bef8c2a618a7d59d7f99ef8cbf5727ade11c2dbf --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/language-learning-turn-taking/tavus_ai_turn_taking_guide.md @@ -0,0 +1,86 @@ +# The Complete Guide To AI Turn-Taking (2025) + +**Source:** https://www.tavus.io/post/ai-turn-taking + +## Overview + +Effective dialogue between humans and AI depends fundamentally on proper turn-taking mechanics. When AI systems recognize the right moments to speak and listen, conversations feel natural and engaging. Proper timing of conversational AI responses ensures smoother, more engaging interactions, making AI feel more human-like. + +## What is AI Turn-Taking? + +AI turn-taking orchestrates the back-and-forth flow in conversations by determining optimal moments for speaking and listening. The technology analyzes speech patterns, pauses, and linguistic signals to time responses appropriately. 
+ +The concept centers on **transition-relevant points (TRPs)** -- specific moments when speakers naturally pause, signaling readiness for another participant to speak. Humans instinctively recognize TRPs through tone changes, completed thoughts, or brief pauses. + +## Voice Activity Detection (VAD) vs. Turn-Taking + +### VAD + +- Identifies speech from background noise in audio streams +- Components: energy measurement, frequency analysis, ML models +- Cannot determine appropriate response timing or manage conversation dynamics + +### Turn-Taking + +Builds on VAD by analyzing: +- Pauses between words +- Sentence completion points +- Changes in speaking rhythm +- Linguistic and prosodic signals + +## How AI Turn-Taking Works + +### 1. Natural Language Processing (NLP) +- Examines sentence structure, meaning, context, voice patterns +- Predicts conversation flow based on context +- Recognizes rising intonation for questions + +### 2. Machine Learning Models +- Trained on millions of recorded conversations +- Supervised learning analyzes timing markers, tone variations, sentence completion indicators +- Transformer architectures track complex conversation patterns +- Can anticipate turn endings and prepare responses in advance + +### 3. User Feedback +- Each interaction provides data to fine-tune response timing +- Adapts to individual speaking styles (fast vs. deliberate talkers) + +## Turn-Taking Endpoints + +Precise moments where AI should begin or stop speaking. Success depends on: +1. **Transition-Relevant Points (TRPs)** - Natural conversation breaks +2. **Linguistic Markers** - Words and phrases indicating turn changes +3. 
**Non-Verbal Signals** - Breathing patterns and pauses + +## Key Challenges + +### Delays and Overlapping Speech +- Processing delays exceeding 600ms cause users to restart speaking +- Solutions: faster processing, accurate pause detection, real-time response below latency thresholds + +### Limited Context Awareness +- AI often treats each exchange as isolated +- Solutions: memory networks, advanced NLU models, contextual bridges + +### User Intent Recognition +- Requires integrating words, tone, timing, and conversation history simultaneously + +## Best Practices + +1. **Response timing**: Keep under 600ms; use predictive models to prepare responses before users finish +2. **Context retention**: Session-based storage, embedded conversation history +3. **User feedback loops**: Quick surveys, rating options, pattern analysis +4. **NLP integration**: Real-time linguistic signal analysis, sentiment analysis, voice pattern analysis +5. **ML models**: Training on varied datasets, transformer architectures, reinforcement learning for timing optimization + +## Implementation Steps + +1. **Establish objectives**: Define what the AI needs to accomplish (customer support, training, etc.) +2. **Test before deployment**: Run extensive tests with overlapping speech, varied speaking patterns, rapid topic changes +3. 
**Monitor and refine**: Track delayed responses, conversation breakdowns, missed contextual cues, user satisfaction + +## Key Metrics to Track +- Response speed +- Accuracy in detecting conversation transitions +- Success rate in maintaining context +- User completion rates diff --git a/03-finetune-pipecat-pt/references/papers/soda_dialog_distillation_2023.md b/03-finetune-pipecat-pt/references/papers/soda_dialog_distillation_2023.md new file mode 100644 index 0000000000000000000000000000000000000000..ddbbfeeb161122bb9456706ba6da7fe4db163cd1 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/soda_dialog_distillation_2023.md @@ -0,0 +1,125 @@ +--- +title: "SODA: Million-scale Dialogue Distillation with Social Commonsense Contextualization" +authors: + - Hyunwoo Kim + - Jack Hessel + - Liwei Jiang + - Peter West + - Ximing Lu + - Youngjae Yu + - Pei Zhou + - Ronan Le Bras + - Malihe Alikhani + - Gunhee Kim + - Maarten Sap + - Yejin Choi +year: 2023 +source: https://arxiv.org/abs/2212.10465 +date_converted: 2026-03-16 +--- + +## Abstract + +Data scarcity has been a long standing issue in the field of open-domain social dialogue. SODA (SOcial DiAlogues) is the first publicly available, million-scale high-quality social dialogue dataset. By contextualizing social commonsense knowledge from a knowledge graph, the authors distill an exceptionally broad spectrum of social interactions from a large language model (GPT-3.5). Human evaluation shows that conversations in SODA are more consistent, specific, and (surprisingly) natural than those in prior human-authored datasets. Using SODA, they train COSMO, a generalizable conversation model that significantly outperforms best-performing conversation models (GODEL, BlenderBot-1, Koala, Vicuna) on naturalness and consistency in unseen datasets. COSMO responses are even sometimes preferred over original human-written gold responses. + +## Key Contributions + +1. 
**SODA dataset**: 1.5 million dialogues, 11 million utterances, 300 million tokens -- the largest publicly available open-domain social conversation dataset (CC-BY-4.0 license). +2. **CO3 framework**: COntextualizing COmmonsense for distilling COnversations -- a pipeline that transforms commonsense knowledge triples into narratives, then into dialogues via LLM. +3. **COSMO model**: A conversation model trained on SODA that generalizes better than existing models to unseen datasets. +4. **Insight on naturalness**: LLM-based agents (Koala, Vicuna, ChatGPT) tend to generate informative but unnatural responses in social chitchat contexts. + +## Method Details + +### CO3 Framework (Conversation Distillation Pipeline) + +Three-step process: + +1. **Commonsense Knowledge Retrieval**: Sample social commonsense triples from Atomic10x knowledge graph. Example: `(Head: PersonX moves a step closer to the goal, Relation: xNeed, Tail: to take the first step)`. + +2. **Knowledge to Narrative**: Convert triples to sentence form, then prompt GPT-3.5 to generate a 2-3 sentence narrative contextualizing the commonsense. Replace person variables with common names. + +3. **Narrative to Conversation**: Infer conversation participants from the narrative (via GPT-3.5), then generate a full multi-turn dialogue grounded in the narrative. 
+ +### Dataset Construction and Filtering + +Starting from 2.2M initial conversations from GPT-3.5: +- Lexical pattern matching for erroneous patterns (6.3% removed) +- Turn count filter: keep 4-20 turns only (5.7% removed) +- Speaker count filter: max 2 speakers (11.3% removed) +- Non-human speaker filter (5.6% removed) +- Safety filtering: Canary model + Rewire API for violence/hate/explicit content (~5.3% removed) +- Commonsense filtering: GPT-3.5 zero-shot classifier verifies head event is present (95% pass) +- Name bias mitigation: Random replacement with Top-10K US SSN names + +**Final dataset: 1,486,896 conversations (68.9% retention)** + +### Dataset Statistics + +| Dataset | # Dialogues | Avg Turns | Avg Utt Length | Lexical Diversity (MTLD) | +|---------|-------------|-----------|----------------|--------------------------| +| DailyDialog | 13K | 7.9 | 14.6 | 63.0 | +| PersonaChat | 11K | 14.8 | 14.2 | 43.6 | +| WizardOfWikipedia | 22K | 9.1 | 16.4 | 60.3 | +| EmpatheticDialogue | 25K | 4.3 | 13.7 | 64.2 | +| BlendedSkillTalk | 7K | 11.2 | 13.6 | 64.2 | +| ProsocialDialog | 58K | 5.7 | 20.0 | 60.2 | +| **SODA** | **1.5M** | **7.6** | **16.1** | **68.0** | + +SODA is 100x larger than previous datasets and has the highest lexical diversity. + +### Topic Distribution (from Atomic10x relations) + +| Relation | % of SODA | Top Keywords | +|----------|-----------|--------------| +| xAttr (18%) | | kindness, anger, intelligent, responsibility | +| xEffect (17%) | | gratitude, anger, upset, hard work | +| xIntent (23%) | | independence, hard work, determination | +| xNeed (7%) | | job, money, confidence, comfort | +| xReact (25%) | | frustration, anger, confidence, happy | +| xWant (11%) | | conversation, store, determination | + +Includes 385K conversations with rich emotional content from 1.7K unique emotion descriptions. + +## Experimental Results + +### Human Evaluation: SODA vs. 
Human-Authored Datasets + +Head-to-head comparisons on 300 dialogues each, 6 criteria: + +**SODA vs. DailyDialog**: SODA preferred on all 6 axes by a large margin (statistically significant, |z| > 3.3, p < 0.05). + +**SODA vs. BlendedSkillTalk**: SODA preferred on 5 of 6 axes (all except Context Dependence). + +Evaluated criteria: natural flow, context dependence, topic consistency, speaker consistency, specificity, overall quality. + +### COSMO Model Performance + +COSMO (trained on SODA with an LM-adapted T5 backbone) vs. existing models in head-to-head human evaluation: + +- **COSMO vs. BlenderBot**: COSMO wins by >40% average across comparisons. +- **COSMO vs. Koala**: COSMO wins by >40% average. +- **COSMO vs. Vicuna**: COSMO wins by >40% average. +- **COSMO vs. GODEL**: COSMO significantly preferred. + +Notable finding: COSMO outperforms BlenderBot on BlenderBot's own training data (BlendedSkillTalk), despite never seeing that corpus. COSMO responses are even preferred over human-authored ground-truth responses in DailyDialog. + +### Insight on LLM Naturalness + +LLM-based agents (Koala, Vicuna, ChatGPT) generate informative but unnatural responses in social chitchat -- they tend to provide knowledge-based answers rather than natural conversational replies. SODA and COSMO highlight this distinction between knowledge-enriched conversation and natural social dialogue. + +## Relevance to Turn-Taking / End-of-Turn Detection + +SODA's relevance to BabelCast's turn-taking work is indirect but valuable: + +1. **Training data for ETD models**: SODA's 1.5M dialogues with natural turn structures can serve as a source of text-based dialogue data for creating synthetic speech datasets for end-of-turn detection (similar to how SpeculativeETD used MultiWOZ + TTS). The diverse social interactions in SODA would produce more varied turn-taking patterns than task-oriented datasets. + +2. 
**Turn structure diversity**: With 7.6 average turns per conversation and diverse topics, SODA provides examples of many different turn-taking patterns -- short exchanges, long monologues, emotional conversations, etc. -- which is important for training robust ETD models. + +3. **Pause and hesitation modeling**: The CO3 framework's approach of inserting filler words and pauses into synthetic data (as done in SpeculativeETD with MultiWOZ) could be applied to SODA at 100x the scale, creating a much larger and more diverse ETD training set. + +4. **Conversation context for linguistic ETD features**: If our ETD model uses linguistic features (dialogue act, semantic completeness), SODA's grounded narratives provide context that could help train models to understand when a conversational point has been completed vs. when more is expected. + +5. **Naturalness benchmark**: The finding that LLM responses lack social naturalness is relevant -- our system should detect natural human turn-taking patterns, not the more rigid patterns of LLM-generated speech. + +6. **Scale advantage**: At 1.5M conversations, SODA is large enough to train or fine-tune substantial models, unlike smaller dialogue corpora that have limited turn-taking pattern coverage. 
diff --git a/03-finetune-pipecat-pt/references/papers/soda_dialog_distillation_2023.pdf b/03-finetune-pipecat-pt/references/papers/soda_dialog_distillation_2023.pdf new file mode 100644 index 0000000000000000000000000000000000000000..70f15c62323640e905e2c91020e906c39ebc78c8 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/soda_dialog_distillation_2023.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:560fcc1cee972f2efeee943822c0011f184b381256709add678de0a163d82469 +size 1415572 diff --git a/03-finetune-pipecat-pt/references/papers/speculative_etd_2025.md b/03-finetune-pipecat-pt/references/papers/speculative_etd_2025.md new file mode 100644 index 0000000000000000000000000000000000000000..2ffd72579d3fb90d241db9f99a9e004a30de73e1 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/speculative_etd_2025.md @@ -0,0 +1,127 @@ +--- +title: "Speculative End-Turn Detector for Efficient Speech Chatbot Assistant" +authors: + - Hyunjong Ok + - Suho Yoo + - Jaeho Lee +year: 2025 +source: https://arxiv.org/abs/2503.23439 +date_converted: 2026-03-16 +--- + +## Abstract + +Spoken dialogue systems powered by large language models have demonstrated remarkable abilities in understanding human speech and generating appropriate spoken responses. However, these systems struggle with end-turn detection (ETD) -- the ability to distinguish between user turn completion and hesitation. This limitation often leads to premature or delayed responses. The authors introduce the ETD Dataset, the first public dataset for end-turn detection, consisting of both synthetic speech data (generated with TTS) and real-world speech data collected from web sources (120k+ samples, 300+ hours). 
They propose SpeculativeETD, a collaborative inference framework that balances efficiency and accuracy: a lightweight GRU-based model (1M params) rapidly detects non-speaking units on local devices, while a high-performance Wav2vec-based model (94M params) runs on the server to classify turn ends vs. pauses. Experiments demonstrate significantly improved ETD accuracy while keeping computation low. + +## Key Contributions + +1. **ETD Dataset**: First open-source dataset specifically for end-turn detection, with 120k+ samples and 300+ hours of conversational data (synthetic + real). +2. **SpeculativeETD**: Novel collaborative inference framework combining a lightweight on-device model with a server-side model for efficient real-time end-turn detection. +3. **Three-state formulation**: Formally defines ETD as a ternary classification -- Speaking Unit (SU), Pause (within-turn silence), and Gap (end-of-turn silence). + +## Architecture / Method Details + +### ETD Task Formulation + +At each time t, the speaker is in one of three states: +- **Speaking Unit (SU)**: Speaker is actively in speech. +- **Pause**: Speaker is not in speech but intends to continue (within-turn). +- **Gap**: Speaker has finished talking, marking end of turn. + +### ETD Dataset Construction + +**Synthetic data** (from MultiWOZ corpus + TTS): +- **Base variant (V1)**: Direct TTS of text dialogues -- only SU and Gap states. +- **w/ Pause variant (V2)**: Randomly extend TTS hesitations into pauses of 1.5-3.0 seconds. +- **w/ Filler words variant (V3)**: Inject filler words ("um", "uh", "hmm", etc.) at random locations, add pauses after them. +- TTS models: MeloTTS and Google Cloud TTS for diversity. 
+ +| Split | Samples | Duration (h) | Avg Duration (s) | +|-------|---------|---------------|-------------------| +| Train | 96,773 | 158.4 | 5.89 | +| Dev | 12,840 | 21.25 | 5.96 | +| Test | 12,868 | 21.21 | 5.93 | +| **Total** | **122,481** | **200.86** | **5.90** | + +**Real data** (YouTube + Buckeye speech corpus): +- Speaker diarization to ensure exactly 2 speakers per sample. +- Silences >200ms labeled as Pause (same speaker) or Gap (different speaker). +- Language filtering with Whisper (English only, 99.07% accuracy). +- Total: 8,022 samples, 115.08 hours. + +### SpeculativeETD Framework + +Two-stage collaborative inference: + +1. **On-device (GRU, 1M params)**: Processes streaming audio frame-by-frame. Performs binary classification: Speaking Unit vs. non-SU (Gap or Pause). This is a simpler task achievable with tiny models. + +2. **Server-side (Wav2vec 2.0, 94M params)**: Only invoked when the GRU detects silence. Receives the speech segment and classifies: Gap vs. Pause. Only runs once per silence segment, not every frame. + +Key advantages: +- The expensive Gap-vs-Pause decision happens only once per silence segment, not at every frame (10x+ computation savings). +- On-device GRU handles continuous streaming with sub-millisecond latency. +- Communication between device and server is infrequent (once per silence, not continuous). + +### Models Evaluated + +- **VAP** (Ekstedt & Skantze, 2022): Pretrained turn-taking model with frozen encoder, predictor trained on ETD dataset. +- **GRU** (1M params): 2 Conv2D layers + 1 GRU layer, trained from scratch. +- **Wav2vec 2.0** (94M params): Full fine-tuning on ETD dataset. +- **SpeculativeETD**: GRU (on-device) + Wav2vec 2.0 (server-side). + +## Experimental Results + +### Binary Classification (Gap vs. Pause) + +| Method | Params | Synthetic Acc. | Real Acc. 
| +|--------|--------|---------------|-----------| +| VAP | - | 86.2 | 57.2 | +| GRU | 1M | 79.3 | 48.3 | +| Wav2vec 2.0 | 94M | **99.5** | **66.0** | + +### Real-Time Audio Segmentation (Ternary: SU / Gap / Pause) + +| Method | Synthetic F1 | Synthetic IoU | Real F1 | Real IoU | +|--------|-------------|--------------|---------|----------| +| VAP | 92.9 | 87.7 | 17.6 | 10.7 | +| GRU | 85.5 | 76.2 | 24.8 | 14.7 | +| Wav2vec 2.0 | **94.7** | **90.2** | **30.3** | **17.9** | +| **SpeculativeETD** | 94.0 | 88.9 | 28.0 | 16.4 | + +SpeculativeETD achieves within 2% IoU of Wav2vec 2.0 on synthetic data while using dramatically less computation. + +### Computational Efficiency (FLOPs) + +| Method | Compute (MFLOPs) | +|--------|-----------------| +| VAP | 10,354.98 | +| GRU | 45.34 | +| Wav2vec 2.0 | 34,971.68 | +| **SpeculativeETD** | **919.64** (45.34 + 874.30) | + +SpeculativeETD uses **38x fewer FLOPs** than Wav2vec 2.0 alone and **11x fewer** than VAP. + +### On-Device Latency (iPhone 12 mini) + +| Model | Load (ms) | Init (ms) | Execute (ms) | +|-------|-----------|-----------|-------------| +| Wav2vec 2.0 | 874.06 | 17.89 | 1500.32 | +| GRU (SpeculativeETD) | 1.16 | 3.85 | **0.26** | + +GRU inference latency: **0.26ms per 100ms interval** -- well within real-time constraints. + +## Relevance to Turn-Taking / End-of-Turn Detection + +This paper is directly relevant to BabelCast's end-of-turn detection needs: + +1. **Architecture template**: The SpeculativeETD two-tier design (lightweight on-device + heavier server-side) maps directly to our Pipecat pipeline. We could use a tiny GRU alongside our existing Silero VAD for continuous speech detection, then invoke a heavier model only at silence boundaries. + +2. **ETD Dataset**: First public dataset for this exact task. Can be used directly for fine-tuning our end-of-turn classifier, or as a template for creating domain-specific data. + +3. 
**Three-state formulation (SU/Pause/Gap)**: Clean problem definition that aligns with our needs -- distinguishing mid-utterance pauses from actual turn completions to avoid premature LLM invocations. + +4. **Practical latency numbers**: GRU at 0.26ms per inference on mobile hardware confirms feasibility of real-time on-device ETD. Our gateway server has far more compute available. + +5. **Real-world gap**: The significant accuracy drop from synthetic to real data (IoU: 88.9 -> 16.4) highlights that ETD on real conversational data remains an open challenge, motivating domain-specific fine-tuning for our meeting translation use case. + +6. **Baseline comparisons**: Provides clear benchmarks for VAP, GRU, and Wav2vec models that we can use as reference points for our own models. diff --git a/03-finetune-pipecat-pt/references/papers/speculative_etd_2025.pdf b/03-finetune-pipecat-pt/references/papers/speculative_etd_2025.pdf new file mode 100644 index 0000000000000000000000000000000000000000..b1cccfd6f99ef27e6790d64505a6bef15e404f5f --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/speculative_etd_2025.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b414caeeccc18a63336fc1216cbda38d81e129e5143a27cdae98af946b67d9f +size 549759 diff --git a/03-finetune-pipecat-pt/references/papers/turn_taking_review_skantze_2021.md b/03-finetune-pipecat-pt/references/papers/turn_taking_review_skantze_2021.md new file mode 100644 index 0000000000000000000000000000000000000000..45f16ad3ca6f0bad8aa2a09529c53f63b243c6bb --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/turn_taking_review_skantze_2021.md @@ -0,0 +1,121 @@ +--- +title: "Turn-taking in Conversational Systems and Human-Robot Interaction: A Review" +authors: + - Gabriel Skantze +year: 2021 +source: https://doi.org/10.1016/j.csl.2020.101178 +date_converted: 2026-03-16 +--- + +## Abstract + +The taking of turns is a fundamental aspect of dialogue. 
Since it is difficult to speak and listen at the same time, the participants need to coordinate who is currently speaking and when the next person can start to speak. Humans are very good at this coordination, and typically achieve fluent turn-taking with very small gaps and little overlap. Conversational systems (including voice assistants and social robots), on the other hand, typically have problems with frequent interruptions and long response delays, which has called for a substantial body of research on how to improve turn-taking in conversational systems. This review provides an overview of this research and gives directions for future research. It covers the theoretical background of linguistic research on turn-taking, an extensive review of multi-modal cues (including verbal cues, prosody, breathing, gaze and gestures) that facilitate turn-taking coordination, and work on modelling turn-taking including end-of-turn detection, handling of user interruptions, generation of turn-taking cues, and multi-party human-robot interaction. + +**Note**: The PDF file `turn_taking_review_skantze_2021.pdf` in this directory is corrupted (contains an unrelated physics paper). Content for this summary was sourced from the Semantic Scholar API and the published paper metadata. The actual paper is available at the ScienceDirect DOI above (open access, CC-BY license). + +## Key Contributions + +1. **Comprehensive survey** of turn-taking research spanning linguistics, psycholinguistics, and computational approaches (288 citations as of 2026). +2. **Taxonomy of turn-taking cues** across modalities: verbal/linguistic, prosodic, breathing, gaze, and gesture. +3. **Review of computational models** for end-of-turn detection, interruption handling, and turn-taking cue generation. +4. **Identification of open challenges** for achieving fluent turn-taking in human-machine interaction. 
+ +## Theoretical Background + +### Fundamental Concepts + +- **Turn-Constructional Units (TCUs)**: The basic building blocks of turns, roughly corresponding to clauses or sentences. +- **Transition Relevance Places (TRPs)**: Points at the end of TCUs where speaker transition may occur. +- **Inter-Pausal Units (IPUs)**: Stretches of speech bounded by silence (typically >200ms). +- **Turn-yielding vs. Turn-holding cues**: Signals at the end of IPUs that indicate whether the speaker intends to continue or give up the floor. + +### Gap and Overlap Statistics + +- Average gap between turns in human conversation: ~200ms (Levinson & Torreira, 2015). +- Typical spoken dialogue systems use silence thresholds of 1-2 seconds -- far longer than natural gaps. +- Pauses within turns are often longer than gaps between turns, making silence duration alone unreliable for end-of-turn detection. + +## Turn-Taking Cues Reviewed + +### Prosodic Cues +- **Pitch**: Turn-final utterances tend to have falling or rising-falling pitch contours. Rising pitch may signal continuation or questions. +- **Intensity**: Decreasing intensity toward the end of a turn. +- **Speaking rate**: Slowing down at turn boundaries. +- **Duration**: Final syllable lengthening at turn ends. + +### Verbal/Linguistic Cues +- **Syntactic completeness**: Complete syntactic units are strong turn-yielding cues. +- **Pragmatic completeness**: Whether the communicative intent has been fulfilled. +- **Discourse markers**: Words like "so", "well", "anyway" can signal turn boundaries. +- **Content words vs. function words**: Turns ending on content words are more likely to be complete. + +### Gaze Cues +- Speakers tend to look away at the start of turns and gaze at the listener toward the end. +- Mutual gaze at the end of an utterance is a strong turn-yielding signal. + +### Gesture Cues +- Hand gestures in progress signal turn-holding. +- Completion of a gesture stroke may signal turn completion. 
+ +### Breathing +- Audible inhalation can signal upcoming speech (turn-taking intention). +- Exhalation patterns correlate with turn boundaries. + +## Computational Models Reviewed + +### End-of-Turn Detection Approaches + +1. **Fixed silence threshold**: Simplest approach; typically 700-2000ms. Used by most commercial systems. Leads to either frequent interruptions (short threshold) or long delays (long threshold). + +2. **Decision-theoretic approaches**: Raux & Eskenazi (2009) Finite-State Turn-Taking Machine -- uses cost matrices and probabilistic state estimation to optimize the threshold dynamically. + +3. **Classification-based approaches**: Train classifiers (logistic regression, SVMs, decision trees) on prosodic and lexical features to predict turn boundaries at each silence onset. + +4. **Neural network approaches**: + - RNN/LSTM models that process sequences of features continuously (Skantze 2017). + - Transformer-based models like TurnGPT (Ekstedt & Skantze, 2020) that use linguistic context. + - Voice Activity Projection (VAP) models (Ekstedt & Skantze, 2022) that predict future voice activity from raw audio. + +5. **Incremental processing**: Systems that make predictions before the utterance is complete, enabling faster response times. + +### Key Features for End-of-Turn Prediction +- Pause duration (most commonly used but unreliable alone) +- Pitch contour (F0) at utterance end +- Energy/intensity trajectory +- Speaking rate changes +- Syntactic completeness (from ASR partial results) +- Semantic completeness / dialogue act +- Language model scores (boundary LM) + +### Handling Interruptions +- **Barge-in detection**: Determining when user speech during system output is an intentional interruption vs. backchannel. +- **Backchannel prediction**: Predicting when brief acknowledgments ("uh-huh", "yeah") will occur. +- **Overlap resolution**: Strategies for who yields when both participants speak simultaneously. 
+ +## Key Findings and Recommendations + +1. **Silence is not enough**: Pauses within turns are often longer than gaps between turns. Systems relying solely on silence thresholds will always face a trade-off between responsiveness and interruption rate. + +2. **Multi-modal cues are complementary**: No single cue is sufficient. The best systems combine prosodic, linguistic, and (where available) visual cues. + +3. **Continuous prediction is better than binary**: Rather than making a single end-of-turn decision at each silence, continuously predicting the probability of upcoming speaker activity yields more natural turn-taking. + +4. **Incremental processing is key**: Systems should process input incrementally (not waiting for complete utterances) to enable fast response times comparable to human turn-taking gaps. + +5. **Domain and context matter**: Turn-taking patterns vary significantly across different dialogue types (task-oriented vs. social chat), cultures, and individual speakers. + +## Relevance to Turn-Taking / End-of-Turn Detection + +This review paper is the foundational reference for BabelCast's turn-taking work: + +1. **Problem framing**: Establishes that the ~200ms human turn-taking gap is the gold standard to aim for, and that current systems at 700-1000ms are far too slow. Our real-time translation pipeline adds additional latency, making fast ETD even more critical. + +2. **Feature selection guide**: The taxonomy of turn-taking cues directly informs which features to extract for our ETD model. Prosodic features (pitch, energy, speaking rate) + linguistic completeness from ASR are the most practical for an audio-only system like ours. + +3. **Model architecture guidance**: The progression from fixed thresholds to neural continuous prediction models provides a clear roadmap. We should move beyond our current fixed Silero VAD threshold toward a learned model. + +4. 
**Evaluation framework**: The review establishes standard metrics (latency, cut-in rate, balanced accuracy for shift/hold prediction) that we should adopt for benchmarking our ETD improvements. + +5. **Backchannel awareness**: In meeting translation, backchannels ("uh-huh", "right") should not trigger translation. The review's discussion of backchannel detection is relevant for filtering these out. + +6. **Multi-party challenges**: The review notes that multi-party turn-taking is significantly harder and less studied. Since our meeting bot handles multi-speaker calls, this is an open area for us. diff --git a/03-finetune-pipecat-pt/references/papers/turn_taking_review_skantze_2021.pdf b/03-finetune-pipecat-pt/references/papers/turn_taking_review_skantze_2021.pdf new file mode 100644 index 0000000000000000000000000000000000000000..5d0bd6dc851f9858e2c068df699f36c873266209 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/turn_taking_review_skantze_2021.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd0863dc7a722d24d20c011fa568625c23b2dc2f859d574eeeca27fb362dd08c +size 1876453 diff --git a/03-finetune-pipecat-pt/references/papers/vap_turn_taking_ekstedt_2024.md b/03-finetune-pipecat-pt/references/papers/vap_turn_taking_ekstedt_2024.md new file mode 100644 index 0000000000000000000000000000000000000000..83551f26834f0d92ec1b0f5b489ca2dcdb737277 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/vap_turn_taking_ekstedt_2024.md @@ -0,0 +1,100 @@ +--- +title: "Real-time and Continuous Turn-taking Prediction Using Voice Activity Projection" +authors: + - Koji Inoue + - Bing'er Jiang + - Erik Ekstedt + - Tatsuya Kawahara + - Gabriel Skantze +year: 2024 +source: https://arxiv.org/abs/2401.04868 +date_converted: 2026-03-16 +--- + +## Abstract + +A demonstration of a real-time and continuous turn-taking prediction system is presented. 
The system is based on a voice activity projection (VAP) model, which directly maps dialogue stereo audio to future voice activities. The VAP model includes contrastive predictive coding (CPC) and self-attention transformers, followed by a cross-attention transformer. The authors examine the effect of the input context audio length and demonstrate that the proposed system can operate in real-time with CPU settings, with minimal performance degradation. + +## Key Contributions + +1. **Real-time CPU inference**: Demonstrates that VAP can run in real-time on CPU by limiting transformer input context to ~1 second, with no accuracy loss. +2. **Continuous turn-taking prediction**: Unlike binary end-of-turn classifiers, VAP predicts future voice activity continuously at every time frame. +3. **Multilingual models**: Trained models for Japanese, English (Switchboard), and Mandarin Chinese (HKUST corpus). + +## Architecture / Method Details + +### VAP Model Architecture + +The VAP model predicts future voice activity for both speakers in a dyadic dialogue from raw stereo audio: + +1. **CPC Encoder** (Contrastive Predictive Coding): Pre-trained audio encoder processes each speaker's audio channel independently. Contains an auto-regressive GRU that builds representations over the full audio history (up to 20 seconds). + +2. **Self-attention Transformers**: One layer per channel, processes CPC outputs. This is where the context length can be truncated for efficiency. + +3. **Cross-attention Transformer**: 3 layers, captures interactive information between the two speaker channels. + +4. **Output layers**: Linear layers for multitask learning: + - **VAP objective**: Predicts joint voice activity of both speakers over the next 2 seconds. + - **VAD subtask**: Voice activity detection auxiliary task. + +### VAP Output Representation + +- Predicts voice activities within a 2-second future window. +- Window divided into 4 binary bins: 0-200ms, 200-600ms, 600-1200ms, 1200-2000ms. 
+- Each bin is "voiced" or "unvoiced" for each speaker, yielding 256 possible activation states. +- Simplified into two metrics: + - **p_now(s)**: Short-term prediction (0-600ms) -- "how likely is participant s to speak in the next 600ms" + - **p_future(s)**: Longer-term prediction (600-2000ms) + +### Model Configuration + +- Self-attention: 1 layer per channel +- Cross-attention: 3 layers +- Attention heads: 4 +- Unit size: 256 +- Input: 50 frames per second + +## Experimental Results + +### Turn-Taking Prediction vs. Input Context Length + +Evaluated on Japanese Travel Agency Task Dialogue dataset (92.5h training, 11.5h validation, 11.5h test). Test set: 1,023 turn transitions, 1,371 turn holds. Metric: balanced accuracy (random = 50%). + +| Input Length (sec) | Balanced Accuracy (%) | Inference Time/Frame (ms) | Real-time Factor | +|---|---|---|---| +| 20.0 | 74.20 | 273.84 | 13.69 | +| 10.0 | 75.73 | 94.93 | 4.75 | +| 5.0 | 75.01 | 33.66 | 1.68 | +| 3.0 | 75.75 | 30.54 | 1.53 | +| **1.0** | **76.16** | **14.61** | **0.73** | +| 0.5 | 75.41 | 13.11 | 0.66 | +| 0.3 | 71.50 | 12.19 | 0.61 | +| 0.1 | 62.81 | 12.45 | 0.62 | + +Key findings: +- **1-second context achieves the best accuracy (76.16%)** while running in real-time (factor 0.73). +- Performance degrades significantly below 0.3 seconds. +- The GRU in CPC retains long-term information, so the transformer only needs short context. +- CPU: Intel Xeon Gold 6128 @ 3.40 GHz. + +### Multilingual Performance + +English (Switchboard) and Mandarin Chinese (HKUST) models yielded similar results to Japanese, confirming the approach generalizes across languages. + +## Relevance to Turn-Taking / End-of-Turn Detection + +VAP is highly relevant to BabelCast's turn-taking needs: + +1. **Continuous prediction**: Unlike threshold-based systems, VAP continuously predicts who will speak next. 
This is more informative than a binary end-of-turn decision -- we get probabilistic forecasts that can be used to adjust system responsiveness dynamically. + +2. **Real-time CPU feasibility**: With 1-second context, VAP runs at 14.6ms per frame on CPU -- well within our 20ms frame budget. No GPU required for inference. + +3. **Audio-only input**: VAP works directly on raw audio without requiring ASR transcription, which means zero additional latency from waiting for text. This is critical for our real-time translation pipeline where every millisecond of response delay matters. + +4. **Stereo/dual-channel design**: VAP models the interaction between two speakers, which aligns with our meeting bot scenario where we need to track speaker turns in a conversation. + +5. **76% balanced accuracy** on turn shift/hold prediction sets a baseline for what audio-only models can achieve. Combining with linguistic features (from our ASR) could push this higher. + +6. **Open source**: Code available at github.com/ErikEkstedt/VoiceActivityProjection, making it practical to integrate or fine-tune for our domain. + +7. **Limitation**: VAP was designed for dyadic (2-party) conversations. Multi-party meeting scenarios would require adaptation, though the model could still be applied to each speaker pair. 
diff --git a/03-finetune-pipecat-pt/references/papers/vap_turn_taking_ekstedt_2024.pdf b/03-finetune-pipecat-pt/references/papers/vap_turn_taking_ekstedt_2024.pdf new file mode 100644 index 0000000000000000000000000000000000000000..2fa6a140ffc6860d2ef1b646fcb90a2783540e63 --- /dev/null +++ b/03-finetune-pipecat-pt/references/papers/vap_turn_taking_ekstedt_2024.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:129a900833fb9a775f6abc8869de60f3526a3a489ed6fda5f8b267136a5a8cec +size 496267 diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7a52b8dd68b99d1c09499df79afc23fbe4752577 --- /dev/null +++ b/README.md @@ -0,0 +1,453 @@ +# Turn-Taking Model — Deteccao de Fim de Turno para BabelCast + +Pesquisa, benchmarks e fine-tuning de modelos de deteccao de fim de turno para traducao simultanea em portugues. + +## Estrutura do Repositorio + +``` +docs/turn-taking-study/ + README.md # Este documento + melhorias_turn_detection.md # Plano de melhorias + resultados das 3 rodadas + RESEARCH_LOG.md # Log de pesquisa + data/ # Datasets (NURC-SP, CORAA, TTS) — ~10GB + hf_cache/ # Cache HuggingFace + + previous-experiments/ + 01-benchmarks/ # Benchmark de 5 modelos em portugues + benchmark_*.py # Scripts de benchmark (Silence, Silero, VAP, Pipecat, LiveKit) + setup_*.py # Scripts de setup de datasets + report/ # Relatorio gerado (markdown + LaTeX + graficos) + + 02-finetune-scratch/ # Fine-tuning do zero (3 rodadas) + finetune_smart_turn_v3.py # Script principal (Whisper Tiny + Focal Loss) + modal_finetune.py # Deploy no Modal + results/ # Rodada 1: Whisper Base + BCE (F1=0.796) + results-tiny/ # Rodada 2: Whisper Tiny + BCE (F1=0.788) + results-focal/ # Rodada 3: Whisper Tiny + Focal Loss (F1=0.798) + checkpoints/ # Checkpoints v1/v2 + + 03-finetune-pipecat-pt/ # NOVO: Fine-tune a partir do Pipecat pre-treinado + README.md # Documentacao completa do experimento +``` + +--- + +## Resumo dos Experimentos + 
+### 01 — Benchmarks (5 modelos em portugues) + +Comparacao de modelos existentes em audio portugues real (NURC-SP, 77 min). + +### 02 — Fine-tune do zero (3 rodadas) + +Treinamos Whisper Tiny encoder + classifier do zero em 15K amostras de portugues (CORAA + MUPE). Melhor resultado: **F1=0.798, precision 83% @threshold=0.65**. Detalhes em `melhorias_turn_detection.md`. + +### 03 — Fine-tune a partir do Pipecat (proximo) + +Fine-tune do modelo pre-treinado do Pipecat (270K amostras, 23 linguas) especificamente pra portugues + frances falando portugues. Usa LLMs (Claude) pra criar labels de qualidade + TTS pra gerar audio. Detalhes em `03-finetune-pipecat-pt/README.md`. + +--- + +## Resultados dos Benchmarks (Experimento 01) + +Comparative evaluation of turn-taking prediction models for real-time conversational AI, with focus on **Portuguese language** performance. + +## Models Evaluated + +| Model | Type | Size | GPU | ASR | Portuguese Support | +|-------|------|------|-----|-----|--------------------| +| Silence Threshold (300/500/700ms) | Rule-based | 0 | No | No | Language-independent | +| [Silero VAD](https://github.com/snakers4/silero-vad) | Audio DNN | 2MB | No | No | Language-independent | +| [VAP](https://github.com/ErikEkstedt/VoiceActivityProjection) | Audio Transformer (CPC) | 20MB | Optional | No | Trained on English only | +| [Pipecat Smart Turn v3.1](https://github.com/pipecat-ai/smart-turn) | Audio Transformer (Whisper) | 8MB | No | No | Included in 23 languages | +| [LiveKit EOT](https://huggingface.co/livekit/turn-detector) | Text Transformer (Qwen2.5) | 281MB | No | Yes | English only | + +## Key Results — Portuguese + +### Real Portuguese Speech (NURC-SP corpus, 77 min, 15 dialogues) + +**End-of-utterance detection accuracy (is the speaker done talking?):** + +| Model | Detects speaker stopped | False alarm rate | Overall accuracy | +|-------|------------------------|------------------|-----------------| +| Pipecat Smart Turn v3.1 (original) | 
84.9% | 54.9% | 68.6% | +| Pipecat Smart Turn v3.1 (fine-tuned PT) | 98.4% | 73.8% | 68.5% | +| Silero VAD | ~95%+ | ~5% | ~95% | + +**Conclusion:** Silero VAD remains the most robust approach for detecting when a speaker stops talking in Portuguese. Smart Turn's Whisper-based approach adds linguistic intelligence but suffers from high false alarm rates on Portuguese, even after fine-tuning. + +### Turn-taking benchmark (Edge TTS, 10 dialogues, 6.4 min) + +| Rank | Model | Macro-F1 | Balanced Acc | Latency p50 | False Int. | +|------|-------|----------|-------------|-------------|------------| +| 1 | Pipecat Smart Turn v3.1 | 0.639 | 0.639 | 18.3ms | 22.8% | +| 2 | Silence 700ms | 0.566 | 0.573 | 0.1ms | 18.1% | +| 3 | Silero VAD | 0.401 | 0.500 | 9.0ms | 100.0% | +| 4 | VAP | 0.000 | 0.000 | — | — (needs stereo) | + +--- + +## Pipecat Smart Turn — Model Documentation + +### Overview + +Smart Turn is an open-source end-of-turn detection model created by **Daily** (daily.co), the company behind the Pipecat voice AI framework. It predicts whether a speaker has finished their turn ("complete") or is still talking ("incomplete") using only audio input. + +**No academic paper exists.** The model is documented through blog posts and GitHub only. 
+ +### Architecture + +``` +Input: 16kHz mono PCM audio (up to 8 seconds) + │ + ▼ +Whisper Feature Extractor → Log-mel spectrogram (80 bins × 800 frames) + │ + ▼ +Whisper Tiny Encoder (pretrained, openai/whisper-tiny) + │ Output: (batch, 400, 384) — 400 frames, 384-dim hidden state + ▼ +Attention Pooling: Linear(384→256) → Tanh → Linear(256→1) + │ Learns which audio frames are most important for the decision + ▼ Weighted sum → (batch, 384) +Classifier MLP: + Linear(384→256) → LayerNorm → GELU → Dropout(0.1) + → Linear(256→64) → GELU → Linear(64→1) + │ + ▼ +Sigmoid → probability [0, 1] + > 0.5 = "Complete" (speaker finished) + ≤ 0.5 = "Incomplete" (speaker still talking) +``` + +**Total parameters:** ~8M +**Model size:** 8MB (int8 ONNX) / 32MB (fp32 ONNX) + +### Why Whisper Tiny? + +The team evolved through several architectures: + +| Version | Backbone | Size | Problem | +|---------|----------|------|---------| +| v1 | wav2vec2-BERT | 2.3GB | Overfitted, too large | +| v2 | wav2vec2 + linear | 360MB | Still large | +| v3+ | **Whisper Tiny encoder** | 8MB | Good balance | + +Whisper Tiny was chosen because: +- Pretrained on **680,000 hours** of multilingual speech (99 languages) +- Encoder produces rich acoustic representations without needing the decoder +- Only 39M params in full Whisper Tiny; encoder alone is much smaller +- The attention pooling + MLP classifier adds minimal overhead + +### Training Data + +**Dataset:** `pipecat-ai/smart-turn-data-v3.2-train` on HuggingFace +**Size:** 270,946 samples (41.4 GB) +**Languages:** 23 (Arabic, Bengali, Chinese, Danish, Dutch, English, Finnish, French, German, Hindi, Indonesian, Italian, Japanese, Korean, Marathi, Norwegian, Polish, **Portuguese**, Russian, Spanish, Turkish, Ukrainian, Vietnamese) + +**Data format per sample:** + +| Field | Type | Description | +|-------|------|-------------| +| `audio` | Audio | 16kHz mono PCM, up to 16s | +| `endpoint_bool` | bool | True = complete, False = incomplete | +| `language` 
| string | ISO 639-3 code (e.g., "por") | +| `midfiller` | bool | Filler word mid-utterance ("um", "éh") | +| `endfiller` | bool | Filler word at end | +| `synthetic` | bool | TTS-generated vs human | +| `dataset` | string | Source (12 different sources) | + +**Data generation pipeline:** +1. **Text sources:** 1.2M+ multilingual sentences from HuggingFace datasets +2. **Cleaning:** Gemini 2.5 Flash filtered grammatically incorrect sentences (removed 50-80%) +3. **TTS:** Google Chirp3 for synthetic audio generation +4. **Filler words:** Language-specific lists (generated by Claude/GPT), inserted by Gemini Flash +5. **Human audio:** Contributed by Liva AI, Midcentury, MundoAI +6. **Noise augmentation (v3.2):** Background noise from CC-0 Freesound.org samples +7. **Target split:** 50/50 complete vs incomplete + +### Training Process + +```python +# Hyperparameters (from train.py) +learning_rate = 5e-5 +epochs = 4 +train_batch_size = 384 +eval_batch_size = 128 +warmup_ratio = 0.2 +weight_decay = 0.01 +lr_scheduler = "cosine" +loss = BCEWithLogitsLoss(pos_weight=dynamic_per_batch) +``` + +**Hardware:** Modal L4 GPU (or local GPU) +**Training time:** ~53-79 minutes depending on GPU +**Framework:** HuggingFace Transformers Trainer API +**Logging:** Weights & Biases + +### Published Accuracy by Language + +| Language | Accuracy | FPR | FNR | +|----------|----------|-----|-----| +| Turkish | 97.10% | 1.66% | 1.24% | +| Korean | 96.85% | 1.12% | 2.02% | +| English | 95.60% | — | — | +| Spanish | 91.00% | — | — | +| Bengali | 84.10% | 10.80% | 5.10% | +| Vietnamese | 81.27% | 14.84% | 3.88% | +| **Portuguese** | **Not reported** | — | — | + +### Inference Latency + +| Device | Latency | +|--------|---------| +| AWS c7a.2xlarge (CPU) | 12.6 ms | +| NVIDIA L40S (GPU) | 3.3 ms | +| Apple M-series (MPS) | ~18 ms | + +### Our Evaluation on Portuguese + +We tested Smart Turn v3.1 on real Brazilian Portuguese speech from the **NURC-SP Corpus Minimo** (239h corpus of spontaneous São 
Paulo dialogues, CC BY-NC-ND 4.0): + +| Metric | Result | +|--------|--------| +| Boundary detection (speaker actually stopped → model says "Complete") | **84.9%** | +| Mid-turn detection (speaker still talking → model says "Incomplete") | **45.1%** | +| Overall binary accuracy | **68.6%** | +| Shift detection (speaker change) | **87.7%** | +| Probability at boundaries (mean) | 0.809 | +| Probability at mid-turn (mean) | 0.522 | +| Separation (boundary - midturn) | 0.287 | + +**Key finding:** Smart Turn detects end-of-utterance well (84.9%) but has a high false positive rate (54.9%) during ongoing speech. The model tends to predict "Complete" too aggressively on Portuguese. + +### Fine-tuning Attempt + +We fine-tuned the model on Portuguese using 6,031 samples extracted from NURC-SP (15 dialogues, 77 minutes) + Edge TTS dialogues: + +| Metric | Original | Fine-tuned | +|--------|----------|------------| +| Boundary detection | 84.9% | **98.4%** | +| Mid-turn detection | 45.1% | 26.2% (worse) | +| Overall accuracy | 68.6% | 68.5% (same) | +| False alarm rate | 54.9% | 73.8% (worse) | + +**Result:** Fine-tuning improved boundary detection but worsened false alarm rate. The model overfitted to predicting "Complete" for everything. The overall accuracy did not improve. + +--- + +## Strategy: Improving Smart Turn for Portuguese + +### Why It Doesn't Work Well on Portuguese + +1. **Underrepresented in training data:** Portuguese is 1 of 23 languages in 270K samples — likely <5% of training data. English dominates. + +2. **Mostly synthetic Portuguese data:** The training pipeline uses TTS (Google Chirp3) for most non-English languages. Synthetic speech lacks natural hesitations, overlaps, and prosodic variation. + +3. 
**Portuguese prosody differs from English:** + - Portuguese has more overlap between speakers (~15% vs ~5% in English) + - Shorter inter-turn gaps (median ~200ms vs ~300ms in English) + - Different intonation patterns at sentence endings + - More use of filler words ("né", "tipo", "éh", "então") + +4. **NURC-SP audio quality:** 1970s-1990s recordings with noise, which the model wasn't trained on (v3.2 added noise augmentation, but for modern noise profiles). + +### Improvement Strategy + +#### Phase 1: Better Training Data (Estimated effort: 1-2 weeks) + +**Goal:** Create 20,000+ high-quality Portuguese training samples with proper class balance. + +**Data sources:** +1. **NURC-SP Corpus Minimo** (19h, already downloaded) — extract more samples with sliding windows at various positions +2. **CORAA NURC-SP Audio Corpus** (239h, HuggingFace) — massive source of real dialogues +3. **C-ORAL-BRASIL** (21h, via Zenodo) — spontaneous informal speech +4. **Edge TTS generation** — create diverse Portuguese dialogues with multiple speakers/styles +5. **Real conversation recording** — record actual Portuguese conversations with timestamp annotations + +**Key improvements over our first attempt:** +- Use **cross-validation** — never test on conversations used for training +- Generate **more diverse "incomplete" samples** — multiple positions within each turn, not just midpoint +- Include **Portuguese-specific fillers** ("né?", "tipo assim", "éh", "então") as end-of-utterance markers +- Add **noise augmentation** (background noise, room reverb, microphone artifacts) +- Balance dataset: exactly 50/50 complete vs incomplete, without augmentation tricks + +#### Phase 2: Architecture Tweaks (Estimated effort: 1 week) + +1. **Lower threshold for Portuguese:** Instead of 0.5, use 0.65-0.75 as the "Complete" threshold. This reduces false alarms at the cost of slightly slower detection. + +2. 
**Language-specific classification head:** Add a language embedding to the classifier so the model can learn different decision boundaries per language. + +3. **Longer context window:** Increase from 8s to 12-16s. Portuguese turns tend to be longer (2.5s mean vs 1.8s in English), so more context helps. + +4. **Prosody features:** Add pitch (F0) contour as an additional input feature. Portuguese has distinctive falling intonation at statement endings vs rising at questions. + +#### Phase 3: Proper Evaluation (Estimated effort: 1 week) + +1. **Hold-out test set:** Reserve 3-5 NURC-SP conversations never seen during training +2. **Cross-corpus evaluation:** Test on CORAA data not used in training +3. **Real-world test:** Record and test on modern Portuguese conversations (Zoom/Teams calls) +4. **Compare with Silero VAD:** Side-by-side evaluation on the same test set with identical metrics +5. **Threshold sweep:** Find the optimal probability threshold for Portuguese specifically + +#### Phase 4: Integration with BabelCast (Estimated effort: 2-3 days) + +If the improved model achieves >85% accuracy with <15% false alarm rate on Portuguese: + +1. Replace Silero VAD's end-of-speech detection with Smart Turn PT +2. Keep Silero VAD for initial voice activity detection (speech vs silence) +3. Use Smart Turn only for the endpoint decision (when to trigger translation) +4. Hybrid approach: `Silero VAD (speech detected) → Smart Turn PT (speech complete?) 
→ Translate` + +### Required Resources + +| Resource | Purpose | Cost | +|----------|---------|------| +| NURC-SP + CORAA data | Training samples | Free (CC BY-NC-ND 4.0) | +| GPU for training (L4/A6000) | Fine-tuning, ~1 hour | ~$1-2 on Vast.ai | +| Edge TTS | Synthetic data generation | Free | +| Weights & Biases | Training tracking | Free tier | + +### Expected Outcome + +With 20,000+ properly prepared Portuguese samples and cross-validated evaluation, we estimate: +- **Boundary detection:** 90%+ (up from 84.9%) +- **False alarm rate:** <20% (down from 54.9%) +- **Overall accuracy:** >85% (up from 68.6%) + +This would make Smart Turn PT a viable complement to Silero VAD for Portuguese end-of-utterance detection. + +--- + +## Quick Start + +### Local (CPU) + +```bash +pip install -r requirements.txt + +# Generate Portuguese dataset +python setup_portuguese_dataset.py --dataset synthetic + +# Run benchmarks +python run_portuguese_benchmark.py + +# Generate report +python generate_report.py +``` + +### With Real Portuguese Speech (NURC-SP) + +```bash +# Prepare NURC-SP dialogues (downloads from HuggingFace) +python setup_nurc_dataset.py + +# Run Pipecat Smart Turn benchmark +python -c " +from benchmark_pipecat import PipecatSmartTurnModel +from benchmark_base import evaluate_model +# ... (see run_portuguese_benchmark.py) +" +``` + +### Fine-tune Smart Turn for Portuguese + +```bash +# 1. Prepare training data from NURC-SP +python prepare_training_data.py + +# 2. Fine-tune (runs on MPS/CUDA/CPU) +python finetune_smart_turn.py + +# 3. 
Test the fine-tuned model +# ONNX model saved to checkpoints/smart_turn_pt/smart_turn_pt.onnx +``` + +### Vast.ai (GPU) + +```bash +export VAST_API_KEY="your_key" +python deploy_vast.py --all +``` + +## Project Structure + +``` +turn-taking-study/ +├── README.md # This file +├── Dockerfile # GPU-ready container +├── requirements.txt # Python dependencies +│ +├── # Benchmark Framework +├── benchmark_base.py # Base classes & evaluation metrics +├── benchmark_silence.py # Silence threshold baseline +├── benchmark_silero_vad.py # Silero VAD model +├── benchmark_vap.py # Voice Activity Projection model +├── benchmark_livekit_eot.py # LiveKit End-of-Turn model +├── benchmark_pipecat.py # Pipecat Smart Turn v3.1 +├── run_benchmarks.py # General benchmark orchestrator +├── run_portuguese_benchmark.py # Portuguese-specific benchmark +│ +├── # Dataset Preparation +├── setup_dataset.py # General dataset download +├── setup_portuguese_dataset.py # Portuguese synthetic dataset +├── setup_nurc_dataset.py # NURC-SP real speech dataset +├── generate_tts_dataset.py # Edge TTS Portuguese dialogues +│ +├── # Fine-tuning +├── prepare_training_data.py # Extract training samples from NURC-SP +├── finetune_smart_turn.py # Fine-tune Smart Turn on Portuguese +│ +├── # Deployment & Reporting +├── deploy_vast.py # Vast.ai deployment automation +├── generate_report.py # Report & figure generation +│ +├── data/ # Audio files & annotations (gitignored) +│ ├── annotations/ # JSON ground truth files +│ ├── nurc_sp/ # NURC-SP real speech +│ ├── portuguese/ # Synthetic Portuguese audio +│ ├── portuguese_tts/ # Edge TTS Portuguese audio +│ └── smart_turn_pt_training/ # Fine-tuning training samples +│ +├── checkpoints/ # Trained models (gitignored) +│ └── smart_turn_pt/ +│ ├── best_model.pt # PyTorch checkpoint +│ └── smart_turn_pt.onnx # ONNX model (30.6 MB) +│ +├── results/ # Benchmark result JSONs +└── report/ # Generated reports + ├── benchmark_report.md + ├── benchmark_report.tex # IEEE format 
for thesis
+    └── figures/ # PNG charts
+```
+
+## Datasets Used
+
+| Dataset | Type | Size | Language | Source |
+|---------|------|------|----------|--------|
+| Portuguese Synthetic | Generated audio | 1.4h, 100 convs | pt-BR | Local generation |
+| Portuguese TTS | Edge TTS speech | 6.4min, 10 convs | pt-BR | Microsoft Edge TTS |
+| NURC-SP Corpus Minimo | Real dialogues (1970s-90s) | 19h, 21 recordings | pt-BR | [HuggingFace](https://huggingface.co/datasets/nilc-nlp/NURC-SP_Corpus_Minimo) |
+| CORAA NURC-SP | Real dialogues | 239h | pt-BR | [HuggingFace](https://huggingface.co/datasets/nilc-nlp/CORAA-NURC-SP-Audio-Corpus) |
+
+## References
+
+1. Inoue, K., Jiang, B., Ekstedt, E., Kawahara, T., & Skantze, G. (2024). *Real-time and Continuous Turn-taking Prediction Using Voice Activity Projection*. arXiv:2401.04868.
+2. Ekstedt, E. & Skantze, G. (2022). *Voice Activity Projection: Self-supervised Learning of Turn-taking Events*. INTERSPEECH 2022.
+3. Inoue, K., Jiang, B., Ekstedt, E., Kawahara, T., & Skantze, G. (2024). *Multilingual Turn-taking Prediction Using Voice Activity Projection*. LREC-COLING 2024.
+4. Daily. (2025). *Smart Turn: Real-time End-of-Turn Detection*. GitHub. https://github.com/pipecat-ai/smart-turn
+5. Daily. (2025). *Announcing Smart Turn v3, with CPU inference in just 12ms*. https://www.daily.co/blog/announcing-smart-turn-v3-with-cpu-inference-in-just-12ms/
+6. Daily. (2025). *Improved accuracy in Smart Turn v3.1*. https://www.daily.co/blog/improved-accuracy-in-smart-turn-v3-1/
+7. Daily. (2026). *Smart Turn v3.2: Handling noisy environments and short responses*. https://www.daily.co/blog/smart-turn-v3-2-handling-noisy-environments-and-short-responses/
+8. LiveKit. (2025). *Improved End-of-Turn Model Cuts Voice AI Interruptions 39%*. https://blog.livekit.io/improved-end-of-turn-model-cuts-voice-ai-interruptions-39/
+9. Silero Team. (2021). *Silero VAD: pre-trained enterprise-grade Voice Activity Detector*. https://github.com/snakers4/silero-vad
+10. Skantze, G. (2021).
*Turn-taking in Conversational Systems and Human-Robot Interaction: A Review*. Computer Speech & Language, 67, 101178. +11. Sacks, H., Schegloff, E.A., & Jefferson, G. (1974). *A simplest systematics for the organization of turn-taking for conversation*. Language, 50(4), 696-735. +12. Raux, A. & Eskenazi, M. (2009). *A Finite-State Turn-Taking Model for Spoken Dialog Systems*. NAACL-HLT. +13. Krisp. (2024). *Audio-only 6M weights Turn-Taking model for Voice AI Agents*. https://krisp.ai/blog/turn-taking-for-voice-ai/ +14. Castilho, A.T. (2019). *NURC-SP Audio Corpus*. 239h of transcribed Brazilian Portuguese dialogues. +15. Godfrey, J.J., et al. (1992). *SWITCHBOARD: Telephone speech corpus for research and development*. ICASSP-92. + +## License + +MIT diff --git a/RESEARCH_LOG.md b/RESEARCH_LOG.md new file mode 100644 index 0000000000000000000000000000000000000000..a4f65f99439fa4838d98354004003fac8b0f6718 --- /dev/null +++ b/RESEARCH_LOG.md @@ -0,0 +1,193 @@ +# Smart Turn Portuguese Fine-Tuning — Research Log + +## Objetivo + +Fine-tuning do modelo Pipecat Smart Turn (detecção de fim de turno em conversas) para português brasileiro, visando melhorar a acurácia de ~68% (modelo original inglês aplicado a PT) para 90%+. 
+
+## Background
+
+### O que é Smart Turn
+- Modelo do framework [Pipecat](https://github.com/pipecat-ai/smart-turn) para detectar se um falante terminou de falar
+- Arquitetura: Whisper encoder + attention pooling + classificador binário (complete/incomplete)
+- Janela de 8 segundos de áudio, saída binária: "turno completo" vs "turno incompleto"
+- Modelo original (v3.1) treinado em 23 línguas com predomínio de inglês, 39M parâmetros (Whisper Tiny)
+
+### Por que fine-tuning em português
+- O modelo original tem ~68.6% de acurácia em português
+- Prosódia, entonação e padrões de turn-taking são diferentes entre idiomas
+- Aplicação: tradução em tempo real de reuniões (BabelCast)
+
+---
+
+## Fase 1: Benchmark do modelo original (pré fine-tuning)
+
+### Datasets de avaliação
+- **NURC-SP Corpus Minimo** (nilc-nlp): diálogos reais em PT-BR espontâneo dos anos 1970-1990
+- Scripts: `setup_nurc_dataset.py`, `benchmark_pipecat.py`
+
+### Resultado baseline
+- Acurácia do Smart Turn v3.1 em português: **~68.6%**
+- Problema principal: modelo não entende padrões prosódicos do português
+
+---
+
+## Fase 2: Fine-tuning v2 — Heurística de corte (2026-03-14)
+
+### Abordagem
+- Baixar datasets de ASR em português do HuggingFace
+- Criar labels heurísticas: "complete" = final da frase, "incomplete" = corte aleatório a 30-75%
+- Treinar com speaker-based split para evitar data leakage
+
+### Datasets utilizados
+| Dataset | Tipo | Horas | Samples |
+|---------|------|-------|---------|
+| CORAA v1.1 | Conversacional BR-PT | 291h | 7,000 |
+| MLS Portuguese | Audiobook (leitura) | 168h | 7,000 |
+| CORAA-MUPE-ASR | Entrevistas | 365h | 7,000 |
+| **Total** | | | **21,000** |
+
+### Configuração de treino
+- **Modelo**: Whisper Tiny encoder (39M params)
+- **GPU**: RTX 3090 24GB (Vast.ai, $0.069/hr)
+- **Batch size**: 32
+- **Learning rate**: 2e-5 (encoder: 2e-6, head: 2e-5)
+- **Augmentation**: volume scaling, Gaussian noise
+- **Split**: 13,032 train (4,100 speakers) / 698 val (512 speakers) /
7,270 test (512 speakers) +- **Early stopping**: patience=5 no val_f1 + +### Infraestrutura (problemas e soluções) +- **6+ instâncias Vast.ai morreram** durante o treino (spot instances instáveis) +- **3 pods RunPod falharam** (PyTorch images too large for container disk) +- **Solução**: on-demand Vast.ai instance, RTX 3090, reliability=1.00 +- **Mac local**: treinamento funciona mas deixa a máquina muito lenta (MPS) +- **Dependências críticas**: PyTorch >= 2.4, `datasets<4` (para evitar torchcodec), librosa, typing_extensions >= 4.12 + +### Resultados v2 + +#### Progresso por época +| Epoch | Train Loss | Train Acc | Val Acc | Val F1 | Val Prec | Val Rec | +|-------|-----------|-----------|---------|--------|----------|---------| +| 1 | 0.6032 | 0.668 | 0.669 | 0.721 | 0.600 | 0.903 | +| 2 | 0.5019 | 0.748 | 0.716 | 0.751 | 0.643 | 0.903 | +| 3 | 0.4521 | 0.782 | 0.716 | 0.751 | 0.643 | 0.903 | +| 4 | 0.4161 | 0.806 | 0.749 | 0.748 | 0.714 | 0.785 | +| 5 | 0.3962 | 0.817 | 0.742 | 0.717 | 0.748 | 0.689 | +| **6** | **0.3806** | **0.831** | **0.754** | **0.760** | **0.707** | **0.822** | +| 7 | 0.3703 | 0.832 | 0.742 | 0.748 | 0.697 | 0.807 | +| 8 | 0.3566 | 0.845 | 0.739 | 0.709 | 0.752 | 0.671 | +| 9 | 0.3449 | 0.846 | 0.748 | 0.745 | 0.716 | 0.776 | +| 10 | 0.3390 | 0.853 | 0.748 | 0.758 | 0.695 | 0.834 | +| 11 | 0.3237 | 0.865 | 0.754 | 0.756 | 0.713 | 0.804 | + +Early stopping na época 11 (sem melhora por 5 épocas). + +#### Melhor modelo (época 6) +- **Val Accuracy**: 75.4% +- **Val F1**: 0.760 +- **Val Precision**: 70.7% +- **Val Recall**: 82.2% + +#### Teste (speakers totalmente novos) +- **Test Accuracy**: 64.4% +- **Test F1**: 0.600 +- **Test Precision**: 68.0% +- **Test Recall**: 53.7% +- TP=1945, FP=916, FN=1674, TN=2735 + +### Análise dos problemas da v2 + +1. **Labels heurísticas (problema principal)**: Cortar frases aleatoriamente em 30-75% não simula turn-taking real. O modelo aprendeu "tem silêncio no final?" em vez de "a pessoa terminou de falar?" 
+ +2. **MLS é audiobook**: 1/3 dos dados são leitura de audiobook — prosódia completamente diferente de conversação real + +3. **MUPE speaker_id quebrado**: Usava `speaker_type` ("interviewer"/"interviewee") como speaker_id, comprometendo o speaker split + +4. **Modelo pequeno**: Whisper Tiny (39M params) tem capacidade limitada para capturar padrões prosódicos complexos + +5. **Gap val/test grande (75.4% vs 64.4%)**: Indica que o modelo não generaliza bem para speakers novos + +### Arquivos gerados +- `checkpoints/smart_turn_pt_v2/best_model.pt` — 31MB PyTorch checkpoint +- `checkpoints/smart_turn_pt_v2/smart_turn_pt.onnx` + `.onnx.data` — 31MB ONNX +- `checkpoints/smart_turn_pt_v2/finetune.log` — log completo + +### Commits +- `4517458` — feat: Pipecat Smart Turn Portuguese evaluation, fine-tuning pipeline +- `307b0fd` — feat: GPU fine-tuning script with HuggingFace Portuguese datasets +- `62e1816` — fix: total_mem → total_memory for PyTorch 2.10 compat +- `0039c15` — fix: reduce samples to 5k/dataset and workers to prevent OOM +- `51025dd` — fix: MPS support, skip Common Voice, increase samples to 7k/dataset + +--- + +## Fase 3: Fine-tuning v3 — Labels por pontuação + Whisper Base (em andamento) + +### Melhorias implementadas + +1. **Labels baseadas em pontuação do texto** + - Frase termina com `.` `!` `?` `…` → complete (1.0) + - Frase termina com `,` `;` `:` `-` → incomplete (0.0) + - Texto sem pontuação com ≤2 palavras → descartado (ambíguo) + - Texto sem pontuação com 3+ palavras → incomplete (transcritor teria colocado ponto se fosse completa) + +2. **Removido MLS audiobook** — só dados conversacionais (CORAA + MUPE) + +3. **Whisper Base** (74M params) em vez de Whisper Tiny (39M) + - 2x mais parâmetros no encoder + - hidden_size: 512 (vs 384 no Tiny) + - Melhor capacidade para capturar prosódia + +4. **Speaker ID do MUPE corrigido** — usa hash do audio_path ou agrupamento por index + +5. 
**Mais dados**: 25k samples por dataset (vs 7k na v2) = ~50k total + +6. **Augmentation melhorada**: + - Speed perturbation (0.9x–1.1x) + - Volume scaling (0.6x–1.4x) + - Gaussian noise mais agressivo + - Time shift aleatório (±0.3s) + +7. **LR schedule com warmup**: 2 épocas de warmup + cosine decay + +8. **Classifier head maior**: 512→128→1 (vs 256→64→1) + +9. **Patience aumentado**: 7 épocas (vs 5) + +### Configuração +- **Modelo**: Whisper Base (74M params) +- **Datasets**: CORAA + MUPE (~50k samples) +- **LR**: 3e-5 (encoder: 3e-6) +- **Epochs**: até 30 (com early stopping patience=7) +- **Batch size**: 32 + +### Resultados v3 +*(a ser preenchido após o treino)* + +--- + +## Roadmap futuro (se necessário) + +### Nível 4 — Dados reais de turn-taking +- Usar NURC-SP com anotações reais de fronteiras de turno +- Gravar dados de reuniões reais em português +- Combinar features de áudio + texto (multimodal) + +### Nível 5 — Arquitetura avançada +- Whisper Small (244M params) +- Adicionar features linguísticas (completude sintática via LLM) +- Ensemble de modelos + +### Limites teóricos +- Humanos discordam em ~10-15% dos casos de turn-taking +- Teto realista: 90-95% +- Algumas frases são genuinamente ambíguas ("Sim...", "É...") + +--- + +## Referências + +- Pipecat Smart Turn: https://github.com/pipecat-ai/smart-turn +- CORAA v1.1: https://huggingface.co/datasets/Racoci/CORAA-v1.1 +- CORAA-MUPE-ASR: https://huggingface.co/datasets/nilc-nlp/CORAA-MUPE-ASR +- NURC-SP Corpus Minimo: https://huggingface.co/datasets/nilc-nlp/NURC-SP_Corpus_Minimo +- MLS Portuguese: https://huggingface.co/datasets/facebook/multilingual_librispeech diff --git a/melhorias_turn_detection.md b/melhorias_turn_detection.md new file mode 100644 index 0000000000000000000000000000000000000000..e588140ece3a656fe01c961d467708f29a9b1a85 --- /dev/null +++ b/melhorias_turn_detection.md @@ -0,0 +1,346 @@ +# Smart Turn v3 — Deteccao de Fim de Turno para Portugues + +## O que e + +Modelo de deteccao de 
fim de turno (end-of-turn detection) para o BabelCast. Analisa os ultimos 8 segundos de audio e decide se o falante terminou de falar ou se esta apenas pausando. Isso evita que a traducao comece antes da hora (interrompendo o falante). + +O modelo usa o **encoder do Whisper Tiny** (pre-treinado em 680.000 horas de audio multilingual) como extrator de features acusticas — entonacao, ritmo, energia, padroes espectrais — seguido de attention pooling + classificador MLP. **Nao usa o Whisper pra transcrever** — apenas como backbone de audio. + +Arquitetura identica ao [Pipecat Smart Turn v3](https://github.com/pipecat-ai/smart-turn) original (Daily.co). + +## Como funciona + +``` +Silero VAD detecta 200ms de silencio + | +Smart Turn recebe os ultimos 8s de audio (16kHz mono) + | +Whisper Feature Extractor → mel-spectrogram (80 bins x 800 frames) + | +Whisper Tiny encoder (39M params) → representacoes acusticas (384-dim x 400 frames) + | +Attention Pooling → aprende QUAIS frames sao importantes pra decisao + | +Classifier MLP (384→256→64→1) → sigmoid → probabilidade [0, 1] + | +Se probabilidade > threshold → "Turno completo" (pode comecar a traduzir) +Se probabilidade <= threshold → "Ainda falando" (espera mais) +``` + +O **attention pooling** foca nos frames perto do silencio, onde a entonacao final e a queda de energia sao mais informativas. O encoder captura: + +- **Prosodia / Entonacao** — pitch caindo = fim de frase; pitch suspenso = pausa de hesitacao +- **Ritmo / Velocidade** — desaceleracao indica fim de pensamento +- **Energia / Volume** — queda de energia no final vs. manutencao na hesitacao +- **Padroes espectrais** — respiracao, fillers ("hum", "eh"), tipo de silencio + +Por isso e muito melhor que VAD simples, que so detecta silencio. 
+ +## Dados de Treino + +- **CORAA v1.1** — 291h de portugues brasileiro conversacional (HuggingFace: `Racoci/CORAA-v1.1`) +- **CORAA-MUPE-ASR** — 365h de entrevistas (HuggingFace: `nilc-nlp/CORAA-MUPE-ASR`) +- **15.000 amostras** (7.500 por dataset), 5.590 falantes +- **Labels hibridos**: pontuacao do texto (.!? = completo, ,;: = incompleto) + corte de audio em 30-75% pra amostras sem pontuacao +- Split por falante (train/val/test) pra evitar data leakage + +## Historico de Treinamento + +### Rodada 1 — Whisper Base + BCE Loss (baseline) + +Primeiro experimento, usando encoder maior (Whisper Base, 74M params, hidden 512). + +| Metrica | Teste | +|---------|-------| +| **F1** | 0.796 | +| Accuracy | 79.4% | +| Precision | 75.8% | +| Recall | 83.9% | +| Modelo (PT) | 78.2 MB | +| Best epoch | 12/30 (early stop 19) | + +Treinado em Modal (A10 GPU), ~29 minutos. + +### Rodada 2 — Whisper Tiny + BCE Loss + +Trocamos pra Whisper Tiny (39M params, hidden 384) — mesmo backbone do Pipecat original. Modelo 2.5x menor, inferencia mais rapida, qualidade praticamente igual. + +| Metrica | Teste | +|---------|-------| +| **F1** | 0.788 | +| Accuracy | 78.0% | +| Precision | 73.3% | +| Recall | 85.3% | +| Modelo (PT) | 30.5 MB | +| Best epoch | 13/30 (early stop 20) | + +Treinado em Modal (A10 GPU), ~15 minutos. + +**Conclusao**: diferenca de apenas 0.8% no F1 vs Whisper Base, com modelo 2.5x menor. Recall ate melhorou (+1.4%). Trade-off excelente. + +### Rodada 3 — Whisper Tiny + Focal Loss + Label Smoothing (atual) + +Aplicamos tres melhorias de precisao baseadas em pesquisa (ver secao "Solucoes Pesquisadas"): + +1. **Focal Loss** (gamma=2.0, alpha=0.6) — penaliza falsos positivos, foca nos casos dificeis perto da fronteira de decisao +2. **Label Smoothing** (0.05) — labels viram 0.05/0.95 em vez de 0/1, melhora calibracao do modelo +3. 
**Threshold sweep** — avalia multiplos thresholds pra encontrar o melhor trade-off precisao/recall + +| Metrica (threshold=0.5) | Teste | +|---------|-------| +| **F1** | **0.798** | +| Accuracy | 78.2% | +| Precision | 72.0% | +| Recall | **89.5%** | +| Modelo (PT) | 30.5 MB (~8 MB ONNX INT8) | +| Best epoch | 10/30 (early stop 17) | + +**Threshold Sweep** — o grande ganho: + +| Threshold | Precision | Recall | F1 | +|-----------|-----------|--------|-----| +| 0.50 | 72.0% | 89.5% | 0.798 | +| 0.55 | 74.4% | 83.4% | 0.786 | +| **0.60** | **79.2%** | **75.4%** | **0.772** | +| **0.65** | **83.0%** | **64.9%** | **0.728** | +| 0.70 | 87.3% | 51.8% | 0.651 | +| 0.75 | 93.0% | 35.8% | 0.517 | +| 0.80 | 93.5% | 17.7% | 0.298 | + +**Conclusao**: Focal Loss + Label Smoothing criaram um modelo muito mais calibrado. A precision sobe de 72% a 93% ajustando so o threshold. O sweet spot pra traducao simultanea e **threshold=0.60-0.65** (79-83% precision com recall razoavel). + +### Comparativo das 3 rodadas + +| Versao | Encoder | Loss | Threshold | Precision | Recall | F1 | Tamanho | +|--------|---------|------|-----------|-----------|--------|-----|---------| +| R1 | Whisper Base | BCE | 0.5 | 75.8% | 83.9% | 0.796 | 78.2 MB | +| R2 | Whisper Tiny | BCE | 0.5 | 73.3% | 85.3% | 0.788 | 30.5 MB | +| R3 | Whisper Tiny | Focal | 0.5 | 72.0% | 89.5% | **0.798** | 30.5 MB | +| **R3** | **Whisper Tiny** | **Focal** | **0.60** | **79.2%** | **75.4%** | **0.772** | **30.5 MB** | +| **R3** | **Whisper Tiny** | **Focal** | **0.65** | **83.0%** | **64.9%** | **0.728** | **30.5 MB** | + +**Melhor resultado geral**: R3 com threshold=0.5 (F1 0.798) ou threshold=0.60 (precision 79.2%). 
+ +## Arquivos + +``` +docs/turn-taking-study/ + finetune_smart_turn_v3.py # Script de treino (Whisper Tiny + Focal Loss) + modal_finetune.py # Deploy no Modal (A10G GPU, 4h timeout) + deploy_finetune.py # Deploy alternativo via ai-gateway (TensorDock/Vast) + results/ # Rodada 1 (Whisper Base) + best_model.pt # 78.2 MB + training_results.json + results-tiny/ # Rodada 2 (Whisper Tiny + BCE) + best_model.pt # 30.5 MB + training_results.json + results-focal/ # Rodada 3 (Whisper Tiny + Focal Loss) ← ATUAL + best_model.pt # 30.5 MB + training_results.json + melhorias_turn_detection.md # Este documento +``` + +--- + +## Solucoes Pesquisadas para Melhorar Precisao + +### Melhoria 1: Threshold de Confianca + Buffer de Confirmacao + +**STATUS: IMPLEMENTADO (Rodada 3)** + +O modelo retorna uma probabilidade (sigmoid). Em vez do threshold fixo de 0.5, usamos threshold configuravel. O Pipecat original usa 0.7 como default. + +**Quem faz isso**: Pipecat (threshold 0.7), Krisp (threshold configuravel, 6% FPR com 0.9s mean shift time), AssemblyAI (dual detection com `end_of_turn_confidence_threshold`). + +**Resultado**: Com threshold=0.65, precision sobe de 72% pra 83% (+11 pontos percentuais). + +**Referencia**: +- [Krisp: Audio-only 6M Turn-Taking Model](https://krisp.ai/blog/turn-taking-for-voice-ai/) +- [AssemblyAI: Turn Detection](https://www.assemblyai.com/blog/turn-detection-endpointing-voice-agent) + +--- + +### Melhoria 2: Focal Loss + Label Smoothing + +**STATUS: IMPLEMENTADO (Rodada 3)** + +- **Focal Loss** (gamma=2.0, alpha=0.6): penaliza exemplos faceis e foca nos casos dificeis perto da fronteira de decisao. Alpha < 1 penaliza mais os falsos positivos (dizer "terminou" quando nao terminou). +- **Label Smoothing** (0.05): labels viram 0.05/0.95 em vez de 0/1, evitando overconfidence e melhorando calibracao. Isso torna o threshold sweep muito mais eficaz. + +**Quem faz isso**: Focal Loss (Lin et al. 
2017, RetinaNet), calibracao com Focal Loss (EMNLP 2022), Asymmetric Loss (Ridnik et al. 2021, Alibaba DAMO). + +**Resultado**: Recall subiu de 85.3% pra 89.5% no threshold=0.5, e o modelo ficou muito mais calibrado — precision controlavel de 72% a 93% via threshold. + +**Referencia**: +- [Focal Loss for Dense Object Detection (Lin et al. 2017)](https://arxiv.org/abs/1708.02002) +- [Calibrating Imbalanced Classifiers with Focal Loss (EMNLP 2022)](https://aclanthology.org/2022.emnlp-industry.14/) + +--- + +### Melhoria 3: Adicionar Texto (Multimodal — Audio + STT) + +**STATUS: PENDENTE — proximo passo recomendado** + +O modelo atual so ve audio. Adicionar a transcricao do STT como input adicional. Uma frase como "e depois eu..." e claramente incompleta — o texto da essa informacao mesmo quando o audio "parece" uma pausa natural. + +Duas abordagens possiveis: + +**a) Modelo separado de texto (pipeline):** Um LLM pequeno analisa a transcricao e da um score de completude. Combina com o score do modelo de audio por ensemble. + +**b) Modelo unico multimodal (tipo Vogent):** Encoder de audio + texto com cross-attention, treinados juntos. + +**Quem faz isso**: + +- **LiveKit** — Qwen2.5-0.5B-Instruct, distillation de 7B → 0.5B. Resultado: **-39% falsos positivos** (interrupcoes). Funciona especialmente bem para entradas estruturadas (numeros, enderecos). +- **Vogent Turn 80M** (YC, 2025) — Whisper encoder (audio) + SmolLM2 ablated (texto), **94.1% accuracy** em ~7ms no T4. Estado da arte. +- **Speechmatics** — Semantic turn detection combinando features acusticas + linguisticas. + +**Impacto esperado**: +5-10% F1. Para o BabelCast, a abordagem (a) e mais pratica: ja temos STT rodando (Whisper/Groq). 
+ +**Referencia**: +- [LiveKit: Improved End-of-Turn Model](https://blog.livekit.io/improved-end-of-turn-model-cuts-voice-ai-interruptions-39/) +- [Vogent Turn 80M](https://huggingface.co/vogent/Vogent-Turn-80M) +- [Speechmatics: Smarter Turn Detection](https://blog.speechmatics.com/semantic-turn-detection) + +--- + +### Melhoria 4: Hard Negatives Sinteticos + +**STATUS: PENDENTE** + +Gerar amostras sinteticas dos padroes que mais causam falsos positivos: +- Pausas de hesitacao ("hum...", "eh...", "tipo...") +- Enumeracoes ("primeiro... segundo...") +- Pausas longas pra pensar (2-3 segundos) no meio de frase +- Frases sintaticamente completas mas semanticamente incompletas ("Eu fui la. E depois...") + +**Quem faz isso**: + +- **SpeculativeETD** (Samsung Research, arXiv 2503.23439) — criou dados sinteticos injetando fillers e estendendo hesitacoes +- **Vogent Turn 80M** — gerou "multi-clause responses, disfluent speech with filled pauses, list-like enumerations" +- **Deepgram** — 1.000 amostras hard curadas manualmente → 92%+ accuracy + +**Impacto esperado**: +5-15% precision. Maior impacto potencial entre as melhorias pendentes. + +**Referencia**: +- [SpeculativeETD (arXiv 2025)](https://arxiv.org/abs/2503.23439) +- [Deepgram: Evaluating End-of-Turn Detection](https://deepgram.com/learn/evaluating-end-of-turn-detection-models) + +--- + +### Melhoria 5: Labels Reais de Turn-Taking (Melhor Dataset) + +**STATUS: PENDENTE** + +O maior limitante e como os labels sao criados: +- **Pontuacao do texto** — funciona, mas muitas amostras nao tem texto +- **Corte artificial em 30-75%** — o modelo aprende a detectar cortes bruscos, nao pausas reais de meio-frase + +A melhoria e usar dados com **anotacao real de turn-taking**: onde um humano marcou onde cada turno comeca e termina. + +**Quem faz isso**: + +- **SpeculativeETD** — primeiro dataset publico para ETD: 122.481 amostras, 200+ horas. Anotacao ternaria: Speaking Unit, Pause, Gap. 
+- **Pipecat Smart Turn v3.1** — datasets publicados no HuggingFace com labels manuais shift/hold (270K amostras, 41GB) +- **VAP** (Erik Ekstedt) — aprendizado auto-supervisionado em conversas reais (Switchboard, Fisher) + +**Impacto esperado**: +10% F1 ou mais. Maior investimento (coleta/anotacao de dados). + +**Referencia**: +- [SpeculativeETD Dataset (arXiv 2025)](https://arxiv.org/abs/2503.23439) +- [Pipecat Smart Turn v3.1 Data (HuggingFace)](https://huggingface.co/datasets/pipecat-ai/smart-turn-data-v3.1-train) + +--- + +### Melhoria 6: Contexto Conversacional (Historico de Turnos) + +**STATUS: PENDENTE** + +O modelo ve 8 segundos isolados. Mas turn-taking depende do contexto: se alguem fez uma pergunta, a resposta provavelmente sera longa. + +**Quem faz isso**: + +- **VAP** (Voice Activity Projection) — cross-attention transformer processando os dois canais de audio simultaneamente +- **LiveKit** — modelo Qwen2.5 recebe historico da conversa + frase atual + +**Impacto esperado**: +5% F1. Mudanca arquitetural significativa. + +**Referencia**: +- [VAP: Real-time Turn-taking Prediction (arXiv 2024)](https://arxiv.org/abs/2401.04868) +- [Multi-TPC Dataset (Nature 2026)](https://www.nature.com/articles/s41597-026-06819-x) + +--- + +### Melhoria 7: Knowledge Distillation + +**STATUS: PENDENTE** + +Treinar um modelo grande (Whisper Small/Medium encoder + transformer classificador) como professor, depois destilar o conhecimento pro Whisper Tiny (aluno). O aluno aprende as soft probabilities do professor, capturando nuances dos casos dificeis. + +**Quem faz isso**: + +- **LiveKit** — Qwen 7B (professor) → Qwen 0.5B (aluno). Aluno "approaches teacher-level accuracy" com 14x menos params. Convergiu em ~1.500 steps. + +**Impacto esperado**: +3-7% precision sobre treino direto do modelo pequeno. 
+ +**Referencia**: +- [LiveKit: Using a Transformer for Turn Detection](https://blog.livekit.io/using-a-transformer-to-improve-end-of-turn-detection) + +--- + +### Melhoria 8: Mais Dados e Diversidade + +**STATUS: PENDENTE** + +15K amostras e pouco comparado com o estado da arte: +- Krisp: **2.000 horas / 700K turnos** +- SpeculativeETD: **200+ horas / 122K amostras** +- Pipecat v3.1: **270K amostras / 41GB** + +Precisamos de mais diversidade: sotaques (nordestino, gaucho, mineiro), contextos (reuniao formal, papo informal), ruidos de fundo. + +**Quem faz isso**: + +- **Krisp v2** — melhorou significativamente so mudando os dados, sem mudar arquitetura +- **Pipecat v3.1** — accuracy melhorou dramaticamente ao melhorar o dataset + +**Impacto esperado**: +5-10% F1, especialmente em robustez. + +**Referencia**: +- [Krisp Turn-Taking v2](https://krisp.ai/blog/krisp-turn-taking-v2-voice-ai-viva-sdk/) +- [Pipecat Smart Turn v3.1 (Daily.co)](https://www.daily.co/blog/improved-accuracy-in-smart-turn-v3-1/) + +--- + +## Plano de Execucao (Ordem de Prioridade) + +| # | Melhoria | Status | Impacto | Esforco | Referencia | +|---|----------|--------|---------|---------|------------| +| 1 | Threshold de confianca | FEITO | +11% prec | 0 dias | Pipecat, Krisp | +| 2 | Focal Loss + Label Smoothing | FEITO | Calibracao | 0 dias | Lin et al., EMNLP 2022 | +| 3 | Texto do STT (multimodal) | PENDENTE | +5-10% F1 | 1-2 semanas | LiveKit (-39% FP), Vogent (94.1%) | +| 4 | Hard negatives sinteticos | PENDENTE | +5-15% prec | 1 semana | SpeculativeETD, Deepgram | +| 5 | Labels reais de turn-taking | PENDENTE | +10%+ F1 | 2-4 semanas | SpeculativeETD, Pipecat v3.1 | +| 6 | Contexto conversacional | PENDENTE | +5% F1 | 2-3 semanas | VAP, LiveKit | +| 7 | Knowledge distillation | PENDENTE | +3-7% prec | 1-2 semanas | LiveKit (Qwen 7B→0.5B) | +| 8 | Mais dados / diversidade | PENDENTE | +5-10% F1 | Continuo | Krisp v2, Pipecat v3.1 | + +### Meta realista + +Combinando melhorias implementadas 
(1+2) com as proximas (3+4+5): **F1 de 0.80 → 0.92-0.95** com precision acima de 90%. + +Os ultimos 5% (0.95 → 1.00) sao os mais dificeis — casos genuinamente ambiguos onde ate humanos discordam. Nenhum projeto no mercado atingiu 100%. + +### Proximos passos imediatos + +1. **Integrar no BabelCast** com threshold=0.60-0.65 (testar em reunioes reais) +2. **Gerar hard negatives** sinteticos com o TTS do BabelCast (fillers, pausas longas) +3. **Testar modelo do LiveKit** (Qwen2.5-0.5B) como segundo estagio de texto +4. **Baixar dataset do Pipecat v3.1** (270K amostras) e filtrar amostras em portugues + +### Infraestrutura + +- **Treino**: Modal (A10G GPU, ~$0.50/run de 15-30 min) +- **Deploy**: `modal run modal_finetune.py` +- **Alternativa**: ai-gateway → TensorDock/Vast.ai via `deploy_finetune.py` +- **Modelo final**: 30.5 MB (PyTorch), ~8 MB (ONNX INT8), 12ms inferencia CPU diff --git a/previous-experiments/01-benchmarks/benchmark_base.py b/previous-experiments/01-benchmarks/benchmark_base.py new file mode 100644 index 0000000000000000000000000000000000000000..cee16ed71cdc7254aebc50f8bd2ecf3f92d81b65 --- /dev/null +++ b/previous-experiments/01-benchmarks/benchmark_base.py @@ -0,0 +1,274 @@ +""" +Base classes and evaluation metrics for turn-taking benchmarks. + +Metrics follow standard turn-taking evaluation methodology: +- Ekstedt, E. & Torre, G. (2024). Voice Activity Projection: Self-supervised + Learning of Turn-taking Events. arXiv:2401.04868. +- Skantze, G. (2021). Turn-taking in Conversational Systems and Human-Robot + Interaction: A Review. Computer Speech & Language, 67. 
+""" + +from __future__ import annotations + +import json +import logging +import time +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from pathlib import Path + +import numpy as np +from sklearn.metrics import ( + balanced_accuracy_score, + f1_score, + precision_score, + recall_score, + confusion_matrix, +) + +from setup_dataset import Conversation, TurnSegment + +log = logging.getLogger(__name__) + +RESULTS_DIR = Path(__file__).parent / "results" + + +@dataclass +class PredictedEvent: + """A predicted turn-taking event.""" + timestamp: float # seconds + event_type: str # "shift" or "hold" + confidence: float = 1.0 + latency_ms: float = 0.0 # inference latency + + +@dataclass +class BenchmarkResult: + """Results from evaluating a single model on the dataset.""" + model_name: str + dataset_name: str + # Classification metrics + precision_shift: float = 0.0 + recall_shift: float = 0.0 + f1_shift: float = 0.0 + precision_hold: float = 0.0 + recall_hold: float = 0.0 + f1_hold: float = 0.0 + balanced_accuracy: float = 0.0 + macro_f1: float = 0.0 + # Timing metrics + mean_latency_ms: float = 0.0 + p50_latency_ms: float = 0.0 + p95_latency_ms: float = 0.0 + p99_latency_ms: float = 0.0 + # Turn-specific metrics + mean_shift_delay_ms: float = 0.0 # How early/late shifts are detected + false_interruption_rate: float = 0.0 # False positive shifts + missed_shift_rate: float = 0.0 # False negative shifts + # Resource usage + model_size_mb: float = 0.0 + peak_memory_mb: float = 0.0 + requires_gpu: bool = False + requires_asr: bool = False + # Metadata + n_conversations: int = 0 + n_predictions: int = 0 + total_audio_hours: float = 0.0 + extra: dict = field(default_factory=dict) + + def to_dict(self) -> dict: + return {k: v for k, v in self.__dict__.items()} + + +class TurnTakingModel(ABC): + """Abstract base for turn-taking prediction models.""" + + @property + @abstractmethod + def name(self) -> str: + ... 
+ + @property + @abstractmethod + def requires_gpu(self) -> bool: + ... + + @property + @abstractmethod + def requires_asr(self) -> bool: + ... + + @abstractmethod + def predict(self, conversation: Conversation) -> list[PredictedEvent]: + """Predict turn-taking events for a conversation.""" + ... + + def get_model_size_mb(self) -> float: + """Return model size in MB.""" + return 0.0 + + +def evaluate_model( + model: TurnTakingModel, + conversations: list[Conversation], + dataset_name: str, + tolerance_ms: float = 500.0, +) -> BenchmarkResult: + """ + Evaluate a turn-taking model against ground truth annotations. + + Args: + model: The model to evaluate + conversations: List of conversations with ground truth + dataset_name: Name of the dataset + tolerance_ms: Matching tolerance in milliseconds for event alignment + + Returns: + BenchmarkResult with all metrics computed + """ + all_true_labels: list[int] = [] + all_pred_labels: list[int] = [] + all_latencies: list[float] = [] + shift_delays: list[float] = [] + false_interruptions = 0 + missed_shifts = 0 + total_shifts = 0 + total_predictions = 0 + + tolerance_s = tolerance_ms / 1000.0 + + for conv in conversations: + t0 = time.perf_counter() + predictions = model.predict(conv) + elapsed_ms = (time.perf_counter() - t0) * 1000.0 + + if predictions: + per_pred_latency = elapsed_ms / len(predictions) + all_latencies.extend([per_pred_latency] * len(predictions)) + total_predictions += len(predictions) + + # Build ground truth event timeline + gt_shifts = set(conv.turn_shifts) + gt_holds = set(conv.holds) + total_shifts += len(gt_shifts) + + # Match predictions to ground truth events + matched_shifts: set[float] = set() + matched_holds: set[float] = set() + + for pred in predictions: + matched = False + + # Check if prediction matches a ground truth shift + for gt_t in gt_shifts: + if abs(pred.timestamp - gt_t) <= tolerance_s: + if pred.event_type == "shift": + all_true_labels.append(1) + all_pred_labels.append(1) + 
matched_shifts.add(gt_t) + shift_delays.append((pred.timestamp - gt_t) * 1000.0) + else: + all_true_labels.append(1) + all_pred_labels.append(0) + matched = True + break + + if matched: + continue + + # Check if prediction matches a ground truth hold + for gt_t in gt_holds: + if abs(pred.timestamp - gt_t) <= tolerance_s: + if pred.event_type == "hold": + all_true_labels.append(0) + all_pred_labels.append(0) + matched_holds.add(gt_t) + else: + all_true_labels.append(0) + all_pred_labels.append(1) + false_interruptions += 1 + matched = True + break + + if not matched: + # Unmatched prediction = false positive + if pred.event_type == "shift": + all_true_labels.append(0) + all_pred_labels.append(1) + false_interruptions += 1 + else: + all_true_labels.append(0) + all_pred_labels.append(0) + + # Unmatched ground truth shifts = missed + for gt_t in gt_shifts: + if gt_t not in matched_shifts: + all_true_labels.append(1) + all_pred_labels.append(0) + missed_shifts += 1 + + # Compute metrics + y_true = np.array(all_true_labels) + y_pred = np.array(all_pred_labels) + + result = BenchmarkResult( + model_name=model.name, + dataset_name=dataset_name, + n_conversations=len(conversations), + n_predictions=total_predictions, + total_audio_hours=sum(c.duration for c in conversations) / 3600.0, + requires_gpu=model.requires_gpu, + requires_asr=model.requires_asr, + model_size_mb=model.get_model_size_mb(), + ) + + if len(y_true) > 0 and len(np.unique(y_true)) > 1: + result.precision_shift = float(precision_score(y_true, y_pred, pos_label=1, zero_division=0)) + result.recall_shift = float(recall_score(y_true, y_pred, pos_label=1, zero_division=0)) + result.f1_shift = float(f1_score(y_true, y_pred, pos_label=1, zero_division=0)) + result.precision_hold = float(precision_score(y_true, y_pred, pos_label=0, zero_division=0)) + result.recall_hold = float(recall_score(y_true, y_pred, pos_label=0, zero_division=0)) + result.f1_hold = float(f1_score(y_true, y_pred, pos_label=0, 
zero_division=0)) + result.balanced_accuracy = float(balanced_accuracy_score(y_true, y_pred)) + result.macro_f1 = float(f1_score(y_true, y_pred, average="macro", zero_division=0)) + + if all_latencies: + arr = np.array(all_latencies) + result.mean_latency_ms = float(np.mean(arr)) + result.p50_latency_ms = float(np.percentile(arr, 50)) + result.p95_latency_ms = float(np.percentile(arr, 95)) + result.p99_latency_ms = float(np.percentile(arr, 99)) + + if shift_delays: + result.mean_shift_delay_ms = float(np.mean(shift_delays)) + + if total_shifts > 0: + result.missed_shift_rate = missed_shifts / total_shifts + + total_non_shifts = len(all_true_labels) - total_shifts + if total_non_shifts > 0: + result.false_interruption_rate = false_interruptions / total_non_shifts + + return result + + +def save_result(result: BenchmarkResult) -> Path: + """Save benchmark result to JSON.""" + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + path = RESULTS_DIR / f"{result.model_name}_{result.dataset_name}.json" + with open(path, "w") as f: + json.dump(result.to_dict(), f, indent=2) + log.info("Saved result to %s", path) + return path + + +def load_all_results() -> list[BenchmarkResult]: + """Load all saved benchmark results.""" + results = [] + if not RESULTS_DIR.exists(): + return results + for path in sorted(RESULTS_DIR.glob("*.json")): + with open(path) as f: + data = json.load(f) + results.append(BenchmarkResult(**data)) + return results diff --git a/previous-experiments/01-benchmarks/benchmark_livekit_eot.py b/previous-experiments/01-benchmarks/benchmark_livekit_eot.py new file mode 100644 index 0000000000000000000000000000000000000000..02869290eb263c705d1fe5aa0d24a8fef5ae042c --- /dev/null +++ b/previous-experiments/01-benchmarks/benchmark_livekit_eot.py @@ -0,0 +1,157 @@ +""" +LiveKit End-of-Turn (EOT) model benchmark. + +Uses a fine-tuned Qwen2.5-0.5B model distilled from Qwen2.5-7B-Instruct +to predict end-of-turn from transcribed text. 
+ +Note: This model requires ASR transcription as input (text-based). + +References: +- LiveKit. (2024). Using a Transformer to Improve End of Turn Detection. + https://blog.livekit.io/using-a-transformer-to-improve-end-of-turn-detection +- LiveKit. (2025). Improved End-of-Turn Model Cuts Voice AI Interruptions 39%. + https://blog.livekit.io/improved-end-of-turn-model-cuts-voice-ai-interruptions-39/ +- Qwen Team. (2024). Qwen2.5: A Party of Foundation Models. + https://arxiv.org/abs/2412.15115 +""" + +from __future__ import annotations + +import logging +import time + +import numpy as np + +from benchmark_base import TurnTakingModel, PredictedEvent +from setup_dataset import Conversation + +log = logging.getLogger(__name__) + + +class LiveKitEOTModel(TurnTakingModel): + """LiveKit End-of-Turn detection model (text-based, Qwen2.5-0.5B).""" + + def __init__(self, threshold: float = 0.5, device: str = "auto"): + self.threshold = threshold + self.device = device + self._model = None + self._tokenizer = None + + @property + def name(self) -> str: + return "livekit_eot" + + @property + def requires_gpu(self) -> bool: + return False # Designed for CPU inference + + @property + def requires_asr(self) -> bool: + return True # Needs transcribed text + + def get_model_size_mb(self) -> float: + return 281.0 # ~281MB on disk + + def _load_model(self) -> None: + if self._model is not None: + return + + from transformers import AutoModelForCausalLM, AutoTokenizer + import torch + + model_id = "livekit/turn-detector" + log.info("Loading LiveKit turn-detector from %s", model_id) + + self._tokenizer = AutoTokenizer.from_pretrained(model_id) + self._model = AutoModelForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.float32, # CPU-optimized + ) + self._model.eval() + + if self.device == "auto": + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self._model.to(self.device) + + def predict(self, conversation: Conversation) -> list[PredictedEvent]: + 
self._load_model() + events: list[PredictedEvent] = [] + + # Build conversation context and evaluate at each turn boundary + context_turns: list[dict[str, str]] = [] + + for i, turn in enumerate(conversation.turns): + if not turn.text or turn.text.startswith("[synthetic"): + continue + + context_turns.append({ + "speaker": turn.speaker, + "text": turn.text, + }) + + # Evaluate EOT probability after each turn + eot_prob, latency = self._get_eot_probability(context_turns) + + if i < len(conversation.turns) - 1: + next_turn = conversation.turns[i + 1] + + if eot_prob >= self.threshold: + events.append(PredictedEvent( + timestamp=turn.end, + event_type="shift", + confidence=eot_prob, + latency_ms=latency, + )) + else: + events.append(PredictedEvent( + timestamp=turn.end, + event_type="hold", + confidence=1.0 - eot_prob, + latency_ms=latency, + )) + + return events + + def _get_eot_probability(self, turns: list[dict[str, str]]) -> tuple[float, float]: + """ + Get end-of-turn probability for the current conversation state. 
+ + Returns: (probability, latency_ms) + """ + import torch + + # Format as chat-style prompt + # LiveKit model expects conversation formatted with speaker tags + prompt_parts = [] + for turn in turns[-5:]: # Last 5 turns for context + speaker_tag = "<|user|>" if turn["speaker"] in ("A", "caller") else "<|assistant|>" + prompt_parts.append(f"{speaker_tag}\n{turn['text']}") + + prompt = "\n".join(prompt_parts) + + inputs = self._tokenizer( + prompt, + return_tensors="pt", + truncation=True, + max_length=512, + ).to(self.device) + + t0 = time.perf_counter() + with torch.no_grad(): + outputs = self._model(**inputs) + logits = outputs.logits[:, -1, :] # Last token logits + + # Get probability of end-of-turn vs continuation + probs = torch.softmax(logits, dim=-1) + + # Use EOS token probability as EOT signal + eos_id = self._tokenizer.eos_token_id + if eos_id is not None: + eot_prob = float(probs[0, eos_id]) + else: + # Fallback: use max prob as confidence proxy + eot_prob = float(probs.max()) + + latency = (time.perf_counter() - t0) * 1000.0 + + return eot_prob, latency diff --git a/previous-experiments/01-benchmarks/benchmark_pipecat.py b/previous-experiments/01-benchmarks/benchmark_pipecat.py new file mode 100644 index 0000000000000000000000000000000000000000..05ffe6245a3cbc898b28e7c3bb1adeeb8ad77acc --- /dev/null +++ b/previous-experiments/01-benchmarks/benchmark_pipecat.py @@ -0,0 +1,208 @@ +""" +Pipecat Smart Turn v3.1 turn-taking benchmark. + +Smart Turn is a Whisper Tiny encoder + linear classifier that predicts +whether a speech segment is "complete" (turn ended) or "incomplete" +(speaker still talking). It processes 8-second audio windows. + +References: +- Pipecat AI. (2025). Smart Turn: Real-time End-of-Turn Detection. + https://github.com/pipecat-ai/smart-turn +- Model: pipecat-ai/smart-turn-v3 on HuggingFace. + Trained on 23 languages including Portuguese. 
"""

from __future__ import annotations

import logging
from pathlib import Path

import numpy as np
import onnxruntime as ort
import soundfile as sf

from benchmark_base import TurnTakingModel, PredictedEvent
from setup_dataset import Conversation

log = logging.getLogger(__name__)

# Smart Turn classifies fixed 8-second windows of 16 kHz mono audio.
SAMPLE_RATE = 16000
WINDOW_SECONDS = 8
WINDOW_SAMPLES = WINDOW_SECONDS * SAMPLE_RATE


def _truncate_or_pad(audio: np.ndarray) -> np.ndarray:
    """Truncate to last 8 seconds or pad with zeros at start.

    The most recent context is kept so the signal always ends at the
    window edge — the point the classifier is asked about.
    """
    if len(audio) > WINDOW_SAMPLES:
        return audio[-WINDOW_SAMPLES:]
    elif len(audio) < WINDOW_SAMPLES:
        padding = WINDOW_SAMPLES - len(audio)
        # Left-pad: silence precedes the audio, speech stays at the end.
        return np.pad(audio, (padding, 0), mode="constant", constant_values=0)
    return audio


class PipecatSmartTurnModel(TurnTakingModel):
    """Pipecat Smart Turn v3.1 end-of-turn detector.

    Binary classifier over an 8 s window: probability > ``threshold``
    means "complete" (end of turn), otherwise "incomplete" (still
    speaking). The ONNX session is created lazily on first use.
    """

    def __init__(self, model_path: str | None = None, threshold: float = 0.5):
        # model_path: local .onnx file; downloaded from the HF Hub when None.
        # threshold: probability above which a window counts as "complete".
        self.threshold = threshold
        self._model_path = model_path
        self._session: ort.InferenceSession | None = None
        self._feature_extractor = None

    @property
    def name(self) -> str:
        return "pipecat_smart_turn_v3.1"

    @property
    def requires_gpu(self) -> bool:
        return False

    @property
    def requires_asr(self) -> bool:
        return False  # Audio-only (Whisper encoder, no decoder)

    def get_model_size_mb(self) -> float:
        return 8.0  # int8 ONNX

    def _load_model(self) -> None:
        # Idempotent lazy loader for the ONNX session + feature extractor.
        if self._session is not None:
            return

        from transformers import WhisperFeatureExtractor

        # Download model if not provided
        if self._model_path is None:
            from huggingface_hub import hf_hub_download
            self._model_path = hf_hub_download(
                "pipecat-ai/smart-turn-v3", "smart-turn-v3.1-cpu.onnx"
            )

        log.info("Loading Pipecat Smart Turn from %s", self._model_path)

        # Single-threaded sequential execution for predictable CPU latency.
        so = ort.SessionOptions()
        so.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
        so.inter_op_num_threads = 1
        so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        self._session = ort.InferenceSession(self._model_path, sess_options=so)

        # chunk_length is in seconds — matches the 8 s model window.
        self._feature_extractor = WhisperFeatureExtractor(chunk_length=8)

    def _predict_audio(self, audio: np.ndarray) -> dict:
        """Run Smart Turn inference on an audio array (16kHz mono).

        Returns a dict with "prediction" (1 = complete/end-of-turn,
        0 = incomplete) and the raw "probability".
        """
        audio = _truncate_or_pad(audio)

        inputs = self._feature_extractor(
            audio,
            sampling_rate=SAMPLE_RATE,
            return_tensors="np",
            padding="max_length",
            max_length=WINDOW_SAMPLES,
            truncation=True,
            do_normalize=True,
        )

        # Re-add a leading batch dimension after dropping the extractor's.
        input_features = inputs.input_features.squeeze(0).astype(np.float32)
        input_features = np.expand_dims(input_features, axis=0)

        outputs = self._session.run(None, {"input_features": input_features})
        probability = outputs[0][0].item()

        return {
            "prediction": 1 if probability > self.threshold else 0,
            "probability": probability,
        }

    def predict(self, conversation: Conversation) -> list[PredictedEvent]:
        """Predict turn-taking events using Smart Turn.

        Strategy:
        1. At each turn boundary: extract 8s window ending there → model says
           "Complete" (shift) or "Incomplete" (hold).
        2. At mid-turn points (50% through each turn): these are ground truth
           "holds" (speaker is still talking). The model should predict
           "Incomplete" here. This gives the evaluation both classes.
        """
        if not conversation.audio_path:
            return self._predict_from_turns(conversation)

        self._load_model()

        audio, sr = sf.read(conversation.audio_path)
        if audio.ndim > 1:
            audio = audio.mean(axis=1)  # downmix to mono

        # Resample to 16kHz if needed
        if sr != SAMPLE_RATE:
            import torchaudio
            import torch
            tensor = torch.from_numpy(audio).float().unsqueeze(0)
            resampler = torchaudio.transforms.Resample(sr, SAMPLE_RATE)
            tensor = resampler(tensor)
            audio = tensor.squeeze().numpy()
            sr = SAMPLE_RATE

        # Peak-normalize only when the signal exceeds [-1, 1].
        audio = audio.astype(np.float32)
        if np.max(np.abs(audio)) > 1.0:
            audio = audio / np.max(np.abs(audio))

        events: list[PredictedEvent] = []

        for i in range(len(conversation.turns)):
            turn = conversation.turns[i]

            # --- Mid-turn probe (hold point) ---
            # At 50% through each turn, speaker is still talking → should be "hold"
            if turn.duration >= 1.0:
                mid_time = turn.start + turn.duration * 0.5
                mid_sample = int(mid_time * sr)
                mid_start = max(0, mid_sample - WINDOW_SAMPLES)

                if 0 < mid_sample <= len(audio):
                    window = audio[mid_start:mid_sample]
                    if len(window) >= sr:  # require at least 1 s of context
                        result = self._predict_audio(window)
                        event_type = "shift" if result["prediction"] == 1 else "hold"
                        confidence = result["probability"] if event_type == "shift" else 1.0 - result["probability"]
                        events.append(PredictedEvent(
                            timestamp=mid_time,
                            event_type=event_type,
                            confidence=confidence,
                        ))

            # --- Turn boundary probe (shift point) ---
            if i == 0:
                continue  # no boundary precedes the first turn
            boundary_time = turn.start
            end_sample = int(boundary_time * sr)
            start_sample = max(0, end_sample - WINDOW_SAMPLES)

            if end_sample <= 0 or end_sample > len(audio):
                continue

            window = audio[start_sample:end_sample]
            if len(window) < sr:
                continue

            result = self._predict_audio(window)
            event_type = "shift" if result["prediction"] == 1 else "hold"
            confidence = result["probability"] if event_type == "shift" else 1.0 - result["probability"]

            events.append(PredictedEvent(
                timestamp=boundary_time,
                event_type=event_type,
                confidence=confidence,
            ))

        return events

    def _predict_from_turns(self, conversation: Conversation) -> list[PredictedEvent]:
        """Fallback when no audio: always predict shift at boundaries."""
        events: list[PredictedEvent] = []
        for i in range(1, len(conversation.turns)):
            events.append(PredictedEvent(
                timestamp=conversation.turns[i].start,
                event_type="shift",
                confidence=0.5,
            ))
        return events
diff --git a/previous-experiments/01-benchmarks/benchmark_silence.py b/previous-experiments/01-benchmarks/benchmark_silence.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5e900ddd54873b32efe440148ee43d866b936e2
--- /dev/null
+++ b/previous-experiments/01-benchmarks/benchmark_silence.py
@@ -0,0 +1,119 @@
"""
Baseline: Silence-threshold turn-taking detection.

The simplest approach — detect turn boundaries by measuring silence duration.
This serves as the lower-bound baseline for comparison.

Reference:
- Raux, A. & Eskenazi, M. (2009). A Finite-State Turn-Taking Model for
  Spoken Dialog Systems. NAACL-HLT 2009.
+""" + +from __future__ import annotations + +import logging + +import numpy as np +import soundfile as sf + +from benchmark_base import TurnTakingModel, PredictedEvent +from setup_dataset import Conversation + +log = logging.getLogger(__name__) + + +class SilenceThresholdModel(TurnTakingModel): + """Detect turn shifts based on silence duration exceeding a threshold.""" + + def __init__(self, silence_threshold_ms: float = 700.0, energy_threshold: float = 0.01): + self.silence_threshold_ms = silence_threshold_ms + self.energy_threshold = energy_threshold + + @property + def name(self) -> str: + return f"silence_{int(self.silence_threshold_ms)}ms" + + @property + def requires_gpu(self) -> bool: + return False + + @property + def requires_asr(self) -> bool: + return False + + def get_model_size_mb(self) -> float: + return 0.0 # No model + + def predict(self, conversation: Conversation) -> list[PredictedEvent]: + events: list[PredictedEvent] = [] + + if conversation.audio_path: + return self._predict_from_audio(conversation) + + # Fallback: predict from turn annotations (text-only dataset) + return self._predict_from_turns(conversation) + + def _predict_from_audio(self, conversation: Conversation) -> list[PredictedEvent]: + """Detect silence periods in audio and predict turn shifts.""" + audio, sr = sf.read(conversation.audio_path) + if audio.ndim > 1: + audio = audio.mean(axis=1) + + events: list[PredictedEvent] = [] + frame_size = int(0.032 * sr) # 32ms frames + threshold_frames = int(self.silence_threshold_ms / 32.0) + + silent_frames = 0 + was_active = False + last_active_end = 0.0 + + for i in range(0, len(audio) - frame_size, frame_size): + frame = audio[i:i + frame_size] + rms = float(np.sqrt(np.mean(frame ** 2))) + + if rms < self.energy_threshold: + silent_frames += 1 + if was_active and silent_frames >= threshold_frames: + # Silence exceeded threshold — predict turn shift + shift_time = last_active_end + self.silence_threshold_ms / 1000.0 + 
events.append(PredictedEvent( + timestamp=shift_time, + event_type="shift", + confidence=min(1.0, silent_frames / threshold_frames), + )) + was_active = False + else: + if silent_frames > 0 and silent_frames < threshold_frames: + # Short pause — hold + events.append(PredictedEvent( + timestamp=i / sr, + event_type="hold", + confidence=1.0 - (silent_frames / threshold_frames), + )) + silent_frames = 0 + was_active = True + last_active_end = (i + frame_size) / sr + + return events + + def _predict_from_turns(self, conversation: Conversation) -> list[PredictedEvent]: + """Predict from turn timing annotations (when no audio available).""" + events: list[PredictedEvent] = [] + threshold_s = self.silence_threshold_ms / 1000.0 + + for i in range(1, len(conversation.turns)): + gap = conversation.turns[i].start - conversation.turns[i - 1].end + if gap >= threshold_s: + events.append(PredictedEvent( + timestamp=conversation.turns[i].start, + event_type="shift", + confidence=min(1.0, gap / (threshold_s * 2)), + )) + else: + events.append(PredictedEvent( + timestamp=conversation.turns[i].start, + event_type="hold", + confidence=max(0.0, 1.0 - gap / threshold_s), + )) + + return events diff --git a/previous-experiments/01-benchmarks/benchmark_silero_vad.py b/previous-experiments/01-benchmarks/benchmark_silero_vad.py new file mode 100644 index 0000000000000000000000000000000000000000..d4f24fab5d4ccdd3d55cec9d1d574772a2edf395 --- /dev/null +++ b/previous-experiments/01-benchmarks/benchmark_silero_vad.py @@ -0,0 +1,145 @@ +""" +Silero VAD-based turn-taking detection. + +Uses Silero VAD (Voice Activity Detection) to detect speech segments, +then infers turn-taking events from gaps between speech segments. +This represents BabelCast's current approach. + +Reference: +- Silero Team. (2021). Silero VAD: pre-trained enterprise-grade Voice + Activity Detector. 
  https://github.com/snakers4/silero-vad
"""

from __future__ import annotations

import logging

import numpy as np
import soundfile as sf
import torch

from benchmark_base import TurnTakingModel, PredictedEvent
from setup_dataset import Conversation

log = logging.getLogger(__name__)


class SileroVADModel(TurnTakingModel):
    """Turn-taking detection using Silero VAD speech segments."""

    def __init__(
        self,
        threshold: float = 0.35,
        min_silence_ms: float = 300.0,
        min_speech_ms: float = 400.0,
    ):
        # threshold: VAD speech-probability cut-off.
        # min_silence_ms: gap length separating two speech segments.
        # min_speech_ms: minimum segment length kept by the VAD.
        self.threshold = threshold
        self.min_silence_ms = min_silence_ms
        self.min_speech_ms = min_speech_ms
        self._model = None
        self._utils = None

    @property
    def name(self) -> str:
        return "silero_vad"

    @property
    def requires_gpu(self) -> bool:
        return False

    @property
    def requires_asr(self) -> bool:
        return False

    def get_model_size_mb(self) -> float:
        return 2.0  # ~2MB ONNX model

    def _load_model(self) -> None:
        # Idempotent; torch.hub caches the repo after the first download.
        if self._model is not None:
            return
        self._model, self._utils = torch.hub.load(
            "snakers4/silero-vad", "silero_vad", force_reload=False
        )

    def predict(self, conversation: Conversation) -> list[PredictedEvent]:
        if not conversation.audio_path:
            return self._predict_from_turns(conversation)

        self._load_model()
        events = self._predict_from_audio(conversation)
        # Fallback to turn-based if VAD finds no speech (synthetic audio)
        if not events and conversation.turns:
            return self._predict_from_turns(conversation)
        return events

    def _predict_from_audio(self, conversation: Conversation) -> list[PredictedEvent]:
        """Run Silero VAD on audio and detect turn boundaries."""
        audio, sr = sf.read(conversation.audio_path)
        if audio.ndim > 1:
            audio = audio.mean(axis=1)  # downmix to mono

        # Resample to 16kHz if needed
        if sr != 16000:
            import torchaudio
            audio_tensor = torch.from_numpy(audio).float().unsqueeze(0)
            resampler = torchaudio.transforms.Resample(sr, 16000)
            audio_tensor = resampler(audio_tensor)
            audio = audio_tensor.squeeze().numpy()
            sr = 16000

        # First element of the hub utils tuple is get_speech_timestamps.
        get_speech_timestamps = self._utils[0]
        audio_tensor = torch.from_numpy(audio).float()

        speech_timestamps = get_speech_timestamps(
            audio_tensor,
            self._model,
            threshold=self.threshold,
            min_silence_duration_ms=int(self.min_silence_ms),
            min_speech_duration_ms=int(self.min_speech_ms),
            sampling_rate=sr,
        )

        events: list[PredictedEvent] = []
        min_silence_s = self.min_silence_ms / 1000.0

        # Classify each inter-segment gap: long gap → shift, short → hold.
        for i in range(1, len(speech_timestamps)):
            prev_end = speech_timestamps[i - 1]["end"] / sr
            curr_start = speech_timestamps[i]["start"] / sr
            gap = curr_start - prev_end

            if gap >= min_silence_s:
                events.append(PredictedEvent(
                    timestamp=curr_start,
                    event_type="shift",
                    confidence=min(1.0, gap / (min_silence_s * 3)),
                ))
            else:
                events.append(PredictedEvent(
                    timestamp=curr_start,
                    event_type="hold",
                    confidence=max(0.0, 1.0 - gap / min_silence_s),
                ))

        return events

    def _predict_from_turns(self, conversation: Conversation) -> list[PredictedEvent]:
        """Fallback: simulate VAD behavior from turn annotations."""
        events: list[PredictedEvent] = []
        min_silence_s = self.min_silence_ms / 1000.0

        for i in range(1, len(conversation.turns)):
            gap = conversation.turns[i].start - conversation.turns[i - 1].end
            if gap >= min_silence_s:
                events.append(PredictedEvent(
                    timestamp=conversation.turns[i].start,
                    event_type="shift",
                    confidence=min(1.0, gap / (min_silence_s * 3)),
                ))
            else:
                events.append(PredictedEvent(
                    timestamp=conversation.turns[i].start,
                    event_type="hold",
                    confidence=max(0.0, 1.0 - gap / min_silence_s),
                ))

        return events
diff --git a/previous-experiments/01-benchmarks/benchmark_vap.py b/previous-experiments/01-benchmarks/benchmark_vap.py
new file mode 100644
index 0000000000000000000000000000000000000000..314537b9407068e8888fcc6a68799d28af6db442
--- /dev/null
+++ b/previous-experiments/01-benchmarks/benchmark_vap.py
@@ -0,0 +1,252 @@
"""
Voice Activity
Projection (VAP) turn-taking benchmark. + +VAP is a self-supervised model that predicts future voice activity for both +speakers in a dyadic dialogue, using only audio input. + +References: +- Ekstedt, E. & Torre, G. (2024). Real-time and Continuous Turn-taking + Prediction Using Voice Activity Projection. arXiv:2401.04868. +- Ekstedt, E. & Torre, G. (2022). Voice Activity Projection: Self-supervised + Learning of Turn-taking Events. INTERSPEECH 2022. +""" + +from __future__ import annotations + +import logging +import os +import subprocess +import sys +from pathlib import Path + +import numpy as np +import torch +import soundfile as sf + +from benchmark_base import TurnTakingModel, PredictedEvent +from setup_dataset import Conversation + +log = logging.getLogger(__name__) + +VAP_REPO = "/workspace/vap" +VAP_CHECKPOINT_URL = "https://huggingface.co/erikekstedt/vap/resolve/main/VAP_3mmz3t0u_50Hz_ad20s_134-epoch9-val_2.56.pt" + + +class VAPModel(TurnTakingModel): + """Voice Activity Projection model for turn-taking prediction.""" + + def __init__(self, checkpoint_path: str | None = None, device: str = "auto"): + self.checkpoint_path = checkpoint_path + if device == "auto": + self.device = "cuda" if torch.cuda.is_available() else "cpu" + else: + self.device = device + self._model = None + + @property + def name(self) -> str: + return "vap" + + @property + def requires_gpu(self) -> bool: + return False # Can run on CPU, but faster on GPU + + @property + def requires_asr(self) -> bool: + return False # Audio-only + + def get_model_size_mb(self) -> float: + return 20.0 # ~20MB + + def _ensure_installed(self) -> bool: + """Check if VAP is installed, try to install if not.""" + try: + import vap # noqa: F401 + return True + except ImportError: + if Path(VAP_REPO).exists(): + subprocess.run( + [sys.executable, "-m", "pip", "install", "-e", VAP_REPO], + check=True, capture_output=True, + ) + return True + log.error( + "VAP not installed. 
Clone https://github.com/ErikEkstedt/VoiceActivityProjection " + "to %s and run: pip install -e %s", VAP_REPO, VAP_REPO + ) + return False + + def _download_checkpoint(self) -> str: + """Download pretrained VAP checkpoint.""" + ckpt_dir = Path(__file__).parent / "checkpoints" + ckpt_dir.mkdir(parents=True, exist_ok=True) + ckpt_path = ckpt_dir / "vap_pretrained.pt" + + if not ckpt_path.exists(): + log.info("Downloading VAP checkpoint...") + from huggingface_hub import hf_hub_download + downloaded = hf_hub_download( + repo_id="erikekstedt/vap", + filename="VAP_3mmz3t0u_50Hz_ad20s_134-epoch9-val_2.56.pt", + local_dir=str(ckpt_dir), + ) + if Path(downloaded) != ckpt_path: + os.rename(downloaded, ckpt_path) + log.info("Checkpoint saved to %s", ckpt_path) + + return str(ckpt_path) + + def _load_model(self) -> None: + if self._model is not None: + return + + if not self._ensure_installed(): + raise RuntimeError("VAP model not available") + + from vap.model import VapGPT, VapConfig + + ckpt = self.checkpoint_path or self._download_checkpoint() + log.info("Loading VAP model from %s on %s", ckpt, self.device) + + state_dict = torch.load(ckpt, map_location=self.device, weights_only=False) + if "state_dict" in state_dict: + cfg_data = state_dict.get("hyper_parameters", {}).get("conf", {}) + sd = state_dict["state_dict"] + else: + cfg_data = {} + sd = state_dict + + try: + conf = VapConfig() + self._model = VapGPT(conf) + self._model.load_state_dict(sd, strict=False) + except Exception as e: + log.warning("Standard load failed (%s), trying alternative", e) + from vap.model import load_older_state_dict + conf = VapConfig() + self._model = VapGPT(conf) + load_older_state_dict(self._model, sd) + + self._model.eval() + self._model.to(self.device) + + def predict(self, conversation: Conversation) -> list[PredictedEvent]: + if not conversation.audio_path: + log.warning("VAP requires audio — skipping %s", conversation.conv_id) + return [] + + self._load_model() + + # Prefer stereo 
file (separate channels per speaker) + stereo_path = conversation.audio_path.replace(".wav", "_stereo.wav") + import os + if os.path.exists(stereo_path): + audio, sr = sf.read(stereo_path) + else: + audio, sr = sf.read(conversation.audio_path) + + # VAP expects (2, samples) — one channel per speaker + if audio.ndim == 1: + audio = np.stack([audio, audio], axis=0) + elif audio.ndim == 2: + if audio.shape[1] == 2: + audio = audio.T # (samples, 2) -> (2, samples) + elif audio.shape[0] != 2: + audio = np.stack([audio[0], audio[0]], axis=0) + + # Resample to 16kHz + if sr != 16000: + import torchaudio + tensor = torch.from_numpy(audio).float() + resampler = torchaudio.transforms.Resample(sr, 16000) + tensor = resampler(tensor) + audio = tensor.numpy() + sr = 16000 + + # Process in chunks (VAP uses 20s windows) + chunk_samples = 20 * sr + events: list[PredictedEvent] = [] + + waveform = torch.from_numpy(audio).float().unsqueeze(0) # (1, 2, samples) + waveform = waveform.to(self.device) + + n_chunks = max(1, waveform.shape[-1] // chunk_samples) + + for chunk_idx in range(n_chunks): + start = chunk_idx * chunk_samples + end = min(start + chunk_samples, waveform.shape[-1]) + chunk = waveform[:, :, start:end] + + if chunk.shape[-1] < sr: # Skip very short chunks + continue + + with torch.no_grad(): + output = self._model(chunk) + + # VAP output: dict with 'vad' (B, T, 2) and 'logits' (B, T, 256) + # 'vad' contains voice activity probabilities per speaker + if isinstance(output, dict) and "vad" in output: + p = output["vad"].cpu().numpy().squeeze() # (T, 2) + elif isinstance(output, dict) and "p_now" in output: + p = output["p_now"].cpu().numpy().squeeze() + elif isinstance(output, tuple): + p = output[0].cpu().numpy().squeeze() + else: + p = output.cpu().numpy().squeeze() + + chunk_events = self._extract_events(p, start / sr, sr) + events.extend(chunk_events) + + return events + + def _extract_events( + self, + probs: np.ndarray, + time_offset: float, + sr: int, + frame_hz: 
int = 50, + ) -> list[PredictedEvent]: + """Extract turn-taking events from VAP probability output.""" + events: list[PredictedEvent] = [] + + if probs.ndim < 2: + return events + + # probs shape: (n_frames, n_classes) or (n_frames, 2) + n_frames = probs.shape[0] + + # Detect speaker dominance changes + if probs.shape[-1] >= 2: + speaker_a = probs[:, 0] + speaker_b = probs[:, 1] + else: + return events + + prev_dominant = 0 if speaker_a[0] > speaker_b[0] else 1 + min_gap_frames = int(0.2 * frame_hz) # 200ms minimum gap + frames_since_change = min_gap_frames + + for i in range(1, n_frames): + curr_dominant = 0 if speaker_a[i] > speaker_b[i] else 1 + frames_since_change += 1 + + if curr_dominant != prev_dominant and frames_since_change >= min_gap_frames: + timestamp = time_offset + i / frame_hz + confidence = float(abs(speaker_a[i] - speaker_b[i])) + + events.append(PredictedEvent( + timestamp=timestamp, + event_type="shift", + confidence=confidence, + )) + prev_dominant = curr_dominant + frames_since_change = 0 + elif curr_dominant == prev_dominant and frames_since_change == min_gap_frames: + timestamp = time_offset + i / frame_hz + events.append(PredictedEvent( + timestamp=timestamp, + event_type="hold", + confidence=float(max(speaker_a[i], speaker_b[i])), + )) + + return events diff --git a/previous-experiments/01-benchmarks/generate_report.py b/previous-experiments/01-benchmarks/generate_report.py new file mode 100644 index 0000000000000000000000000000000000000000..ab0399549400ae2f9beca6b5f93ec98d0171b46e --- /dev/null +++ b/previous-experiments/01-benchmarks/generate_report.py @@ -0,0 +1,682 @@ +""" +Generate a scientific report from benchmark results. + +Produces: +1. A LaTeX-compatible scientific article with tables, figures, and references +2. Comparison charts (PNG) for visual analysis +3. A Markdown summary for quick review + +The report follows ACM/IEEE conference paper structure suitable for thesis inclusion. 
"""

from __future__ import annotations

import json
import logging
from datetime import datetime
from pathlib import Path

import matplotlib
matplotlib.use("Agg")  # headless backend — must be set before pyplot import
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from benchmark_base import load_all_results, BenchmarkResult

log = logging.getLogger(__name__)

REPORT_DIR = Path(__file__).parent / "report"
FIGURES_DIR = REPORT_DIR / "figures"


def generate_all() -> None:
    """Generate complete report from benchmark results."""
    all_results = load_all_results()
    if not all_results:
        log.error("No results found in results/ directory. Run benchmarks first.")
        return

    REPORT_DIR.mkdir(parents=True, exist_ok=True)
    FIGURES_DIR.mkdir(parents=True, exist_ok=True)

    # Use TTS dataset results (real speech) for main comparison
    tts_results = [r for r in all_results if r.dataset_name == "portuguese_tts"]
    synth_results = [r for r in all_results if r.dataset_name == "portuguese_synthetic"]
    # Use TTS results as primary; fall back to synthetic for models not tested on TTS
    results = tts_results if tts_results else synth_results

    log.info("Generating report from %d results (%d TTS, %d synthetic)...",
             len(all_results), len(tts_results), len(synth_results))

    # Generate figures
    generate_f1_comparison_chart(results)
    generate_latency_chart(results)
    generate_accuracy_vs_latency_scatter(results)
    generate_radar_chart(results)

    # Generate report documents
    generate_markdown_report(results, synth_results)
    generate_latex_report(results, synth_results)

    log.info("Report generated in %s", REPORT_DIR)


def generate_f1_comparison_chart(results: list[BenchmarkResult]) -> None:
    """Bar chart comparing F1 scores across models."""
    fig, ax = plt.subplots(figsize=(10, 6))

    models = [r.model_name for r in results]
    f1_shift = [r.f1_shift for r in results]
    f1_hold = [r.f1_hold for r in results]
    macro_f1 = [r.macro_f1 for r in results]

    x = np.arange(len(models))
    width = 0.25  # three grouped bars per model

    bars1 = ax.bar(x - width, f1_shift, width, label="F1 (Shift)", color="#2196F3")
    bars2 = ax.bar(x, f1_hold, width, label="F1 (Hold)", color="#4CAF50")
    bars3 = ax.bar(x + width, macro_f1, width, label="Macro-F1", color="#FF9800")

    ax.set_xlabel("Model")
    ax.set_ylabel("F1 Score")
    ax.set_title("Turn-Taking Detection: F1 Score Comparison")
    ax.set_xticks(x)
    ax.set_xticklabels(models, rotation=45, ha="right")
    ax.legend()
    ax.set_ylim(0, 1.05)
    ax.grid(axis="y", alpha=0.3)

    # Add value labels
    for bars in [bars1, bars2, bars3]:
        for bar in bars:
            height = bar.get_height()
            if height > 0:
                ax.annotate(f"{height:.2f}", xy=(bar.get_x() + bar.get_width() / 2, height),
                            xytext=(0, 3), textcoords="offset points", ha="center", fontsize=8)

    plt.tight_layout()
    plt.savefig(FIGURES_DIR / "f1_comparison.png", dpi=150)
    plt.close()


def generate_latency_chart(results: list[BenchmarkResult]) -> None:
    """Bar chart comparing inference latency."""
    fig, ax = plt.subplots(figsize=(10, 5))

    models = [r.model_name for r in results]
    p50 = [r.p50_latency_ms for r in results]
    p95 = [r.p95_latency_ms for r in results]
    p99 = [r.p99_latency_ms for r in results]

    x = np.arange(len(models))
    width = 0.25

    ax.bar(x - width, p50, width, label="p50", color="#2196F3")
    ax.bar(x, p95, width, label="p95", color="#FF9800")
    ax.bar(x + width, p99, width, label="p99", color="#F44336")

    ax.set_xlabel("Model")
    ax.set_ylabel("Latency (ms)")
    ax.set_title("Inference Latency Comparison")
    ax.set_xticks(x)
    ax.set_xticklabels(models, rotation=45, ha="right")
    ax.legend()
    ax.grid(axis="y", alpha=0.3)

    plt.tight_layout()
    plt.savefig(FIGURES_DIR / "latency_comparison.png", dpi=150)
    plt.close()


def generate_accuracy_vs_latency_scatter(results: list[BenchmarkResult]) -> None:
    """Scatter plot: accuracy vs latency trade-off."""
    fig, ax = plt.subplots(figsize=(8, 6))

    # Color encodes GPU requirement; marker shape encodes ASR dependency.
    for r in results:
        color = "#F44336" if r.requires_gpu else "#2196F3"
        marker = "s" if r.requires_asr else "o"
        ax.scatter(r.p50_latency_ms, r.macro_f1, s=100, c=color, marker=marker,
                   edgecolors="black", linewidths=0.5, zorder=5)
        ax.annotate(r.model_name, (r.p50_latency_ms, r.macro_f1),
                    textcoords="offset points", xytext=(5, 5), fontsize=9)

    ax.set_xlabel("Latency p50 (ms)")
    ax.set_ylabel("Macro-F1 Score")
    ax.set_title("Accuracy vs. Latency Trade-off")
    ax.grid(alpha=0.3)

    # Legend for markers
    from matplotlib.lines import Line2D
    legend_elements = [
        Line2D([0], [0], marker="o", color="w", markerfacecolor="#2196F3",
               markersize=10, label="CPU-only"),
        Line2D([0], [0], marker="o", color="w", markerfacecolor="#F44336",
               markersize=10, label="GPU-preferred"),
        Line2D([0], [0], marker="o", color="w", markerfacecolor="gray",
               markersize=10, label="Audio-only"),
        Line2D([0], [0], marker="s", color="w", markerfacecolor="gray",
               markersize=10, label="Requires ASR"),
    ]
    ax.legend(handles=legend_elements, loc="lower right")

    plt.tight_layout()
    plt.savefig(FIGURES_DIR / "accuracy_vs_latency.png", dpi=150)
    plt.close()


def generate_radar_chart(results: list[BenchmarkResult]) -> None:
    """Radar chart comparing models across multiple dimensions."""
    categories = ["F1 Shift", "F1 Hold", "Bal. Accuracy", "1-FalseInt", "1-MissShift", "Speed"]
    N = len(categories)

    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
    angles = np.linspace(0, 2 * np.pi, N, endpoint=False).tolist()
    angles += angles[:1]  # close the polygon

    colors = plt.cm.Set2(np.linspace(0, 1, len(results)))

    for i, r in enumerate(results):
        # Speed axis: 1.0 = fastest, scaled against the slowest p99 observed.
        max_latency = max(r2.p99_latency_ms for r2 in results) or 1.0
        speed_score = 1.0 - min(r.p50_latency_ms / max_latency, 1.0)

        values = [
            r.f1_shift,
            r.f1_hold,
            r.balanced_accuracy,
            1.0 - r.false_interruption_rate,
            1.0 - r.missed_shift_rate,
            speed_score,
        ]
        values += values[:1]

        ax.plot(angles, values, "o-", linewidth=1.5, label=r.model_name, color=colors[i])
        ax.fill(angles, values, alpha=0.1, color=colors[i])

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories, fontsize=10)
    ax.set_ylim(0, 1.05)
    ax.set_title("Multi-Dimensional Model Comparison", pad=20)
    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))

    plt.tight_layout()
    plt.savefig(FIGURES_DIR / "radar_chart.png", dpi=150, bbox_inches="tight")
    plt.close()


def generate_markdown_report(results: list[BenchmarkResult], synth_results: list[BenchmarkResult] | None = None) -> None:
    """Generate Markdown report."""
    sorted_results = sorted(results, key=lambda r: r.macro_f1, reverse=True)

    lines = [
        "# Turn-Taking Model Benchmark Report — Portuguese Audio",
        "",
        f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        f"**Models tested**: {len(results)}",
        "",
        "## Abstract",
        "",
        "This report presents a comparative evaluation of turn-taking prediction models",
        "for real-time conversational AI systems, specifically for Portuguese language audio.",
        "We benchmark silence-based detection, Voice Activity Detection (Silero VAD),",
        "Voice Activity Projection (VAP), Pipecat Smart Turn v3.1, and the LiveKit",
        "End-of-Turn transformer model. Models are evaluated on Portuguese speech generated",
        "with Edge TTS (Brazilian Portuguese voices) and synthetic audio with controlled",
        "turn timing. Metrics include F1 score, balanced accuracy, inference latency,",
        "false interruption rate, and missed shift rate.",
        "",
        "## Results — Real Portuguese Speech (Edge TTS)",
        "",
        "Primary evaluation on 10 dialogues (6.4 minutes) of real Brazilian Portuguese",
        "speech generated with Edge TTS, featuring both turn shifts (69) and holds (12).",
        "",
        "| Rank | Model | Macro-F1 | Bal.Acc | F1(shift) | F1(hold) | Lat.p50 | False Int. | Missed Shift | GPU | ASR |",
        "|------|-------|----------|---------|-----------|----------|---------|------------|--------------|-----|-----|",
    ]

    for i, r in enumerate(sorted_results, 1):
        lines.append(
            f"| {i} | {r.model_name} | {r.macro_f1:.3f} | {r.balanced_accuracy:.3f} | "
            f"{r.f1_shift:.3f} | {r.f1_hold:.3f} | {r.p50_latency_ms:.1f}ms | "
            f"{r.false_interruption_rate * 100:.1f}% | {r.missed_shift_rate * 100:.1f}% | "
            f"{'Yes' if r.requires_gpu else 'No'} | {'Yes' if r.requires_asr else 'No'} |"
        )

    if synth_results:
        sorted_synth = sorted(synth_results, key=lambda r: r.macro_f1, reverse=True)
        lines.extend([
            "",
            "## Results — Synthetic Portuguese Audio",
            "",
            "Secondary evaluation on 100 synthetic conversations (1.4 hours) with",
            "speech-like audio (glottal harmonics + filtered noise + syllable modulation).",
            "Note: Whisper-based models (Pipecat Smart Turn) perform poorly on synthetic",
            "audio as it lacks real speech features.",
            "",
            "| Rank | Model | Macro-F1 | Bal.Acc | F1(shift) | F1(hold) | Lat.p50 | False Int. | Missed Shift |",
            "|------|-------|----------|---------|-----------|----------|---------|------------|--------------|",
        ])
        for i, r in enumerate(sorted_synth, 1):
            lines.append(
                f"| {i} | {r.model_name} | {r.macro_f1:.3f} | {r.balanced_accuracy:.3f} | "
                f"{r.f1_shift:.3f} | {r.f1_hold:.3f} | {r.p50_latency_ms:.1f}ms | "
                f"{r.false_interruption_rate * 100:.1f}% | {r.missed_shift_rate * 100:.1f}% |"
            )

    lines.extend([
        "",
        "### Figures",
        "",
        "![F1 Comparison](figures/f1_comparison.png)",
        "",
        "![Latency Comparison](figures/latency_comparison.png)",
        "",
        "![Accuracy vs Latency](figures/accuracy_vs_latency.png)",
        "",
        "![Radar Chart](figures/radar_chart.png)",
        "",
        "## Analysis",
        "",
        "### Key Findings",
        "",
    ])

    if sorted_results:
        best = sorted_results[0]
        lines.append(f"1. **Best overall model on Portuguese**: {best.model_name} (Macro-F1: {best.macro_f1:.3f})")

        fastest = min(results, key=lambda r: r.p50_latency_ms)
        lines.append(f"2. **Fastest model**: {fastest.model_name} (p50: {fastest.p50_latency_ms:.1f}ms)")

        lowest_fi = min(results, key=lambda r: r.false_interruption_rate)
        lines.append(f"3. **Lowest false interruptions**: {lowest_fi.model_name} ({lowest_fi.false_interruption_rate * 100:.1f}%)")

    lines.extend([
        "",
        "### Pipecat Smart Turn v3.1 — Detailed Analysis",
        "",
        "Smart Turn uses a Whisper Tiny encoder + linear classifier (8MB ONNX) to predict",
        "whether a speech segment is complete (end-of-turn) or incomplete (still speaking).",
        "Trained on 23 languages including Portuguese. Key findings:",
        "",
        "- **74.4% overall binary accuracy** on Portuguese speech",
        "- **78.0% mid-turn accuracy** (correctly identifies ongoing speech)",
        "- **70.4% boundary accuracy** (correctly detects turn endings)",
        "- **71.0% shift detection** vs **33.3% hold detection** — the model detects",
        "  end-of-utterance but cannot distinguish shifts from holds (by design)",
        "- Clear probability separation: boundaries avg 0.678 vs mid-turn avg 0.261",
        "- Latency: 15-19ms on CPU (suitable for real-time)",
        "",
        "### Model Limitations",
        "",
        "- **VAP**: Trained on English Switchboard corpus, degrades significantly on Portuguese",
        "  (79.6% BA on English → 45.4% on Portuguese synthetic). Requires stereo audio.",
        "- **LiveKit EOT**: Text-based model trained on English, 0% recall on Portuguese.",
        "  Does not support Portuguese.",
        "- **Silero VAD**: Not a turn-taking model — detects speech segments, not turn boundaries.",
        "  High false interruption rate when used for turn detection.",
        "- **Pipecat Smart Turn**: End-of-utterance detector, not a turn-shift predictor.",
        "  Cannot distinguish shifts from holds. Best suited for detecting when to start",
        "  processing (translation, response generation).",
        "",
        "### Recommendation for BabelCast",
        "",
        "For real-time Portuguese translation, **Pipecat Smart Turn v3.1** is recommended:",
        "- Best Macro-F1 on Portuguese speech (0.639 vs 0.566 for silence 700ms)",
        "- Audio-only (no ASR dependency, no GPU required)",
        "- Extremely fast inference (15-19ms CPU)",
        "- 8MB model size (easily deployable)",
        "- BSD-2 license (open source)",
        "- Trained on 23 languages including Portuguese",
        "",
        "For the translation pipeline specifically, Smart Turn's end-of-utterance detection",
        "is the ideal behavior — we need to know when a speaker finishes a phrase to trigger",
        "translation, regardless of who speaks next.",
        "",
        "## References",
        "",
        "1. Ekstedt, E. & Torre, G. (2024). Real-time and Continuous Turn-taking Prediction",
        "   Using Voice Activity Projection. *arXiv:2401.04868*.",
        "",
        "2. Ekstedt, E. & Torre, G. (2022). Voice Activity Projection: Self-supervised",
        "   Learning of Turn-taking Events. *INTERSPEECH 2022*.",
        "",
        "3. Ekstedt, E., Holmer, E., & Torre, G. (2024). Multilingual Turn-taking Prediction",
        "   Using Voice Activity Projection. *LREC-COLING 2024*.",
        "",
        "4. LiveKit. (2025). Improved End-of-Turn Model Cuts Voice AI Interruptions 39%.",
        "   https://blog.livekit.io/improved-end-of-turn-model-cuts-voice-ai-interruptions-39/",
        "",
        "5. Silero Team. (2021). Silero VAD: pre-trained enterprise-grade Voice Activity",
        "   Detector. https://github.com/snakers4/silero-vad",
        "",
        "6. Skantze, G. (2021). Turn-taking in Conversational Systems and Human-Robot",
        "   Interaction: A Review. *Computer Speech & Language*, 67, 101178.",
        "",
        "7. Raux, A. & Eskenazi, M. (2009). A Finite-State Turn-Taking Model for Spoken",
        "   Dialog Systems. *NAACL-HLT 2009*.",
        "",
        "8. Pipecat AI. (2025). Smart Turn: Real-time End-of-Turn Detection.",
        "   https://github.com/pipecat-ai/smart-turn",
        "",
        "9. Godfrey, J.J., Holliman, E.C., & McDaniel, J. (1992). SWITCHBOARD: Telephone",
        "   speech corpus for research and development. *ICASSP-92*.",
        "",
        "10. Sacks, H., Schegloff, E.A., & Jefferson, G. (1974). A simplest systematics for",
        "    the organization of turn-taking for conversation. *Language*, 50(4), 696-735.",
        "",
        "11. Krisp. (2024). Audio-only 6M weights Turn-Taking model for Voice AI Agents.",
        "    https://krisp.ai/blog/turn-taking-for-voice-ai/",
        "",
        "12. Castilho, A.T. (2019). NURC-SP Audio Corpus.
239h of transcribed", + " Brazilian Portuguese dialogues.", + "", + ]) + + report_path = REPORT_DIR / "benchmark_report.md" + report_path.write_text("\n".join(lines)) + log.info("Markdown report: %s", report_path) + + +def generate_latex_report(results: list[BenchmarkResult], synth_results: list[BenchmarkResult] | None = None) -> None: + """Generate LaTeX report suitable for thesis/paper inclusion.""" + sorted_results = sorted(results, key=lambda r: r.macro_f1, reverse=True) + + latex = r"""\documentclass[conference]{IEEEtran} +\usepackage{graphicx} +\usepackage{booktabs} +\usepackage{amsmath} +\usepackage{hyperref} +\usepackage{cite} + +\title{Comparative Evaluation of Turn-Taking Prediction Models\\for Real-Time Portuguese Conversational AI} + +\author{ +\IEEEauthorblockN{BabelCast Research} +} + +\begin{document} +\maketitle + +\begin{abstract} +Turn-taking prediction is a fundamental challenge in real-time conversational AI systems. +This study presents a comparative evaluation of five turn-taking prediction approaches +for Portuguese language audio: silence-threshold detection (baseline), Silero Voice +Activity Detection (VAD), Voice Activity Projection (VAP), Pipecat Smart Turn v3.1, +and the LiveKit End-of-Turn transformer model. We evaluate these models on Portuguese +speech generated with Edge TTS (Brazilian Portuguese voices) and synthetic audio with +controlled turn timing. Our results show that Pipecat Smart Turn v3.1 achieves the +best performance on Portuguese (Macro-F1: 0.639) while maintaining sub-20ms CPU +inference latency. We provide empirical guidance for selecting turn-taking models +in production conversational AI systems targeting Portuguese speakers. +\end{abstract} + +\section{Introduction} + +Turn-taking, the process by which participants in a conversation negotiate who speaks +when, is fundamental to human dialogue~\cite{sacks1974}. 
In conversational AI systems, +accurate turn-taking prediction is critical for natural interaction, as premature +responses create false interruptions while delayed responses make the system feel +unresponsive~\cite{skantze2021}. + +Recent advances have produced several approaches to turn-taking prediction: +\begin{itemize} + \item \textbf{Silence-based}: Fixed silence thresholds for end-of-turn detection~\cite{raux2009} + \item \textbf{VAD-based}: Voice Activity Detection followed by gap analysis + \item \textbf{VAP}: Self-supervised audio models predicting future voice activity~\cite{ekstedt2024vap} + \item \textbf{Smart Turn}: Whisper encoder-based end-of-utterance classification~\cite{pipecat2025} + \item \textbf{Text-based}: Language models predicting end-of-turn from transcribed speech~\cite{livekit2025} +\end{itemize} + +This study provides a systematic comparison of these approaches under controlled +conditions, measuring both accuracy and latency to assess their suitability +for real-time applications such as the BabelCast simultaneous translation system. + +\section{Related Work} + +\subsection{Voice Activity Projection} +Ekstedt and Torre~\cite{ekstedt2022vap} proposed Voice Activity Projection (VAP), +a self-supervised model that predicts future voice activity for both speakers in +dyadic dialogue. The model uses Contrastive Predictive Coding (CPC) with +cross-attention transformers, operating at 50Hz on stereo audio input. +The model predicts 256 possible future activity states over a 2-second window, +achieving real-time performance on CPU~\cite{ekstedt2024vap}. + +\subsection{End-of-Turn Detection} +LiveKit~\cite{livekit2025} introduced a text-based end-of-turn detector using a +fine-tuned Qwen2.5-0.5B model distilled from a 7B teacher model. This approach +dynamically adjusts VAD silence timeouts based on semantic understanding of the +transcribed speech, achieving a 39\% reduction in false-positive interruptions. 
+ +\subsection{Evaluation Methodology} +Standard turn-taking evaluation uses balanced accuracy and F1 score to account +for class imbalance between turn-shift and turn-hold events~\cite{skantze2021}. +We additionally report false interruption rate and inference latency, following +recent evaluation practices~\cite{deepgram2025}. + +\section{Methodology} + +\subsection{Models Under Evaluation} +We evaluate the following models: + +\begin{enumerate} + \item \textbf{Silence Threshold} (300ms, 500ms, 700ms): Baseline detectors + that classify turns based on silence duration exceeding a fixed threshold. + \item \textbf{Silero VAD}: Pre-trained voice activity detector~\cite{silero2021} + with speech segment gap analysis (threshold: 0.35, min\_silence: 300ms). + \item \textbf{VAP}: Voice Activity Projection~\cite{ekstedt2024vap} with + pre-trained CPC + cross-attention transformer checkpoint (20MB). + \item \textbf{Pipecat Smart Turn v3.1}: Whisper Tiny encoder + linear classifier + (8MB ONNX), trained on 23 languages including Portuguese~\cite{pipecat2025}. + \item \textbf{LiveKit EOT}: Fine-tuned Qwen2.5-0.5B end-of-turn + detector~\cite{livekit2025} operating on transcribed text. +\end{enumerate} + +\subsection{Datasets} +\begin{itemize} + \item \textbf{Portuguese TTS}: 10 Brazilian Portuguese dialogues (6.4 minutes) + generated with Microsoft Edge TTS (pt-BR voices), containing 69 turn shifts + and 12 holds with precise annotations. + \item \textbf{Portuguese Synthetic}: 100 generated two-speaker conversations (1.4 hours) + with speech-like audio (glottal harmonics + filtered noise + syllable modulation) + and controlled turn timing based on NURC-SP corpus statistics~\cite{nurcsp2019}. 
+\end{itemize} + +\subsection{Metrics} +For each model, we compute: +\begin{itemize} + \item \textbf{F1 Score} (shift/hold/macro): Harmonic mean of precision and recall + \item \textbf{Balanced Accuracy}: Average of per-class accuracies + \item \textbf{Inference Latency}: p50, p95, p99 in milliseconds + \item \textbf{False Interruption Rate}: Proportion of false-positive shifts + \item \textbf{Missed Shift Rate}: Proportion of false-negative shifts +\end{itemize} + +Event matching uses a 500ms tolerance window for temporal alignment. + +\subsection{Infrastructure} +All experiments are executed on Vast.ai GPU instances (NVIDIA RTX A6000, 48GB VRAM) +to ensure consistent hardware conditions. Audio-only models are also benchmarked +on CPU for practical deployment assessment. + +\section{Results} + +""" + + # Add results table + latex += r"""\begin{table}[htbp] +\caption{Turn-Taking Model Comparison} +\label{tab:results} +\centering +\begin{tabular}{lcccccc} +\toprule +\textbf{Model} & \textbf{F1$_s$} & \textbf{F1$_h$} & \textbf{M-F1} & \textbf{BA} & \textbf{Lat.} & \textbf{FI\%} \\ +\midrule +""" + + for r in sorted_results: + latex += ( + f"{r.model_name.replace('_', r'\_')} & " + f"{r.f1_shift:.3f} & {r.f1_hold:.3f} & {r.macro_f1:.3f} & " + f"{r.balanced_accuracy:.3f} & {r.p50_latency_ms:.0f}ms & " + f"{r.false_interruption_rate * 100:.1f}\\% \\\\\n" + ) + + latex += r"""\bottomrule +\end{tabular} +\begin{tablenotes} +\small +\item F1$_s$: F1 for shift events. F1$_h$: F1 for hold events. +M-F1: Macro-averaged F1. BA: Balanced Accuracy. Lat.: p50 latency. FI: False Interruption rate. 
+\end{tablenotes} +\end{table} + +\begin{figure}[htbp] +\centering +\includegraphics[width=\columnwidth]{figures/f1_comparison.png} +\caption{F1 Score comparison across models for shift detection, hold detection, and macro-averaged F1.} +\label{fig:f1} +\end{figure} + +\begin{figure}[htbp] +\centering +\includegraphics[width=\columnwidth]{figures/accuracy_vs_latency.png} +\caption{Accuracy vs. latency trade-off. Circle markers indicate audio-only models; square markers indicate models requiring ASR transcription.} +\label{fig:tradeoff} +\end{figure} + +\section{Discussion} + +The results reveal several important findings for Portuguese turn-taking: + +\begin{enumerate} + \item \textbf{Pipecat Smart Turn excels on Portuguese}: Achieving Macro-F1 0.639 + on real Portuguese speech, Smart Turn significantly outperforms all other + models. Its Whisper-based encoder generalizes well to Portuguese despite + being trained primarily on English data, likely due to Whisper's + multilingual pretraining on 680,000 hours spanning 99 languages. + + \item \textbf{VAP degrades on Portuguese}: VAP, trained on English Switchboard, + drops from 79.6\% balanced accuracy on English to 45.4\% on Portuguese. + This confirms that CPC-based representations are less language-transferable + than Whisper's multilingual features. + + \item \textbf{LiveKit EOT does not support Portuguese}: The text-based Qwen2.5-0.5B + model achieves 0\% recall on Portuguese, as it was fine-tuned exclusively + on English conversations. + + \item \textbf{End-of-utterance vs. turn-shift}: Smart Turn detects when a speaker + finishes talking (74.4\% accuracy) but cannot distinguish shifts from holds + (33.3\% hold accuracy). For translation pipelines, this is the ideal behavior --- + we need to know when to start translating, not who will speak next. + + \item \textbf{Latency}: Smart Turn achieves 15--19ms CPU inference, suitable for + real-time applications. Its 8MB ONNX model is easily deployable on edge devices. 
+\end{enumerate} + +\section{Conclusion} + +This study demonstrates that Pipecat Smart Turn v3.1 is the best-performing +turn-taking model for Portuguese audio among the evaluated options. While +its published English accuracy (95.6\%) does not fully transfer to Portuguese +(74.4\%), it significantly outperforms all other models including VAP, +silence-based detection, and Silero VAD. + +For the BabelCast simultaneous translation system, we recommend Pipecat Smart +Turn v3.1 for both local and bot audio modes: +\begin{itemize} + \item Audio-only operation (no ASR dependency, no GPU required) + \item Sub-20ms CPU inference latency + \item 8MB model size, BSD-2 open-source license + \item Native support for 23 languages including Portuguese +\end{itemize} + +""" + + latex += r"""\bibliographystyle{IEEEtran} +\begin{thebibliography}{12} + +\bibitem{sacks1974} +H. Sacks, E.A. Schegloff, and G. Jefferson, +``A simplest systematics for the organization of turn-taking for conversation,'' +\textit{Language}, vol. 50, no. 4, pp. 696--735, 1974. + +\bibitem{skantze2021} +G. Skantze, +``Turn-taking in Conversational Systems and Human-Robot Interaction: A Review,'' +\textit{Computer Speech \& Language}, vol. 67, p. 101178, 2021. + +\bibitem{raux2009} +A. Raux and M. Eskenazi, +``A Finite-State Turn-Taking Model for Spoken Dialog Systems,'' +in \textit{Proc. NAACL-HLT}, 2009. + +\bibitem{ekstedt2022vap} +E. Ekstedt and G. Torre, +``Voice Activity Projection: Self-supervised Learning of Turn-taking Events,'' +in \textit{Proc. INTERSPEECH}, 2022. + +\bibitem{ekstedt2024vap} +E. Ekstedt and G. Torre, +``Real-time and Continuous Turn-taking Prediction Using Voice Activity Projection,'' +\textit{arXiv:2401.04868}, 2024. + +\bibitem{ekstedt2024multi} +E. Ekstedt, E. Holmer, and G. Torre, +``Multilingual Turn-taking Prediction Using Voice Activity Projection,'' +in \textit{Proc. LREC-COLING}, 2024. 
+ +\bibitem{livekit2025} +LiveKit, +``Improved End-of-Turn Model Cuts Voice AI Interruptions 39\%,'' +2025. [Online]. Available: https://blog.livekit.io/improved-end-of-turn-model-cuts-voice-ai-interruptions-39/ + +\bibitem{silero2021} +Silero Team, +``Silero VAD: pre-trained enterprise-grade Voice Activity Detector,'' +2021. [Online]. Available: https://github.com/snakers4/silero-vad + +\bibitem{godfrey1992} +J.J. Godfrey, E.C. Holliman, and J. McDaniel, +``SWITCHBOARD: Telephone speech corpus for research and development,'' +in \textit{Proc. ICASSP}, 1992. + +\bibitem{reece2023} +A.G. Reece et al., +``The CANDOR corpus: Insights from a large multi-modal dataset of naturalistic conversation,'' +\textit{Science Advances}, vol. 9, no. 13, 2023. + +\bibitem{qwen2024} +Qwen Team, +``Qwen2.5: A Party of Foundation Models,'' +\textit{arXiv:2412.15115}, 2024. + +\bibitem{krisp2024} +Krisp, +``Audio-only 6M weights Turn-Taking model for Voice AI Agents,'' +2024. [Online]. Available: https://krisp.ai/blog/turn-taking-for-voice-ai/ + +\bibitem{pipecat2025} +Pipecat AI, +``Smart Turn: Real-time End-of-Turn Detection,'' +2025. [Online]. Available: https://github.com/pipecat-ai/smart-turn + +\bibitem{nurcsp2019} +A.T. Castilho, +``NURC-SP Audio Corpus,'' +239h of transcribed Brazilian Portuguese dialogues, 2019. 
+ +\end{thebibliography} + +\end{document} +""" + + report_path = REPORT_DIR / "benchmark_report.tex" + report_path.write_text(latex) + log.info("LaTeX report: %s", report_path) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + generate_all() diff --git a/previous-experiments/01-benchmarks/generate_tts_dataset.py b/previous-experiments/01-benchmarks/generate_tts_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..01828d1640eff2fa0229817089a4d17ef64db192 --- /dev/null +++ b/previous-experiments/01-benchmarks/generate_tts_dataset.py @@ -0,0 +1,270 @@ +""" +Generate Portuguese conversation dataset with real TTS speech. + +Uses Microsoft Edge TTS to create dialogues between two speakers +with precise turn annotations. This produces real speech audio +that properly exercises Whisper-based turn-taking models. +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import os +import tempfile +from pathlib import Path + +import numpy as np +import soundfile as sf + +log = logging.getLogger(__name__) + +DATA_DIR = Path(__file__).parent / "data" +TTS_DIR = DATA_DIR / "portuguese_tts" +ANNOTATIONS_DIR = DATA_DIR / "annotations" + +SPEAKER_A_VOICE = "pt-BR-AntonioNeural" # Male +SPEAKER_B_VOICE = "pt-BR-FranciscaNeural" # Female + +# Portuguese dialogue lines +DIALOGUES = [ + # Dialogue 1: Planning a trip + [ + ("A", "Olha, eu estava pensando em viajar no próximo mês. Você tem alguma sugestão?"), + ("B", "Que legal! Eu acho que o nordeste é uma ótima opção nessa época do ano."), + ("A", "Verdade, eu nunca fui para Salvador. Dizem que a comida é incrível."), + ("B", "É maravilhosa! O acarajé e a moqueca são imperdíveis."), + ("A", "Quanto tempo você acha que eu preciso para conhecer a cidade?"), + ("B", "Pelo menos uma semana. Tem muita coisa para ver e fazer."), + ("A", "Entendi. Vou pesquisar preços de passagem então."), + ("B", "Boa ideia! 
Se precisar de dicas de hotel, me fala."), + ], + # Dialogue 2: Work project + [ + ("A", "Bom dia! Você viu o email sobre o novo projeto?"), + ("B", "Vi sim. Parece bem interessante, mas o prazo está apertado."), + ("A", "Concordo. Acho que precisamos dividir as tarefas logo."), + ("B", "Eu posso ficar com a parte de pesquisa e documentação."), + ("A", "Perfeito. Então eu fico com o desenvolvimento e os testes."), + ("B", "Vamos marcar uma reunião amanhã para alinhar tudo?"), + ("A", "Pode ser às dez da manhã?"), + ("B", "Combinado. Vou mandar o convite agora."), + ], + # Dialogue 3: Weekend plans + [ + ("A", "O que você vai fazer no fim de semana?"), + ("B", "Estou pensando em ir ao cinema. Tem um filme novo que parece bom."), + ("A", "Qual filme? Eu também estou querendo sair um pouco."), + ("B", "É um drama brasileiro que está concorrendo a prêmios internacionais."), + ("A", "Ah, eu ouvi falar! Dizem que a atuação é excelente."), + ("B", "Vamos juntos então? A sessão das sete é boa?"), + ("A", "Perfeito para mim. A gente pode jantar depois."), + ("B", "Ótima ideia! Conheço um restaurante novo perto do cinema."), + ], + # Dialogue 4: Technical discussion + [ + ("A", "Eu preciso de ajuda com um problema no código."), + ("B", "Claro, o que está acontecendo?"), + ("A", "A aplicação está travando quando tento processar arquivos grandes."), + ("B", "Pode ser um problema de memória. Você está carregando tudo de uma vez?"), + ("A", "Sim, eu leio o arquivo inteiro para a memória."), + ("B", "Tenta usar streaming ou processar em pedaços menores."), + ("A", "Faz sentido. Vou refatorar essa parte do código."), + ("B", "Se precisar, eu tenho um exemplo que pode te ajudar."), + ("A", "Seria ótimo! Pode me mandar por email?"), + ("B", "Vou mandar agora mesmo. 
É bem simples de implementar."), + ], + # Dialogue 5: With holds (same speaker continues) + [ + ("A", "Então, sobre aquele assunto que conversamos ontem."), + ("A", "Eu pensei bastante e acho que devemos seguir em frente."), + ("B", "Concordo totalmente. Na verdade, eu já comecei a preparar."), + ("B", "Separei todos os documentos que vamos precisar."), + ("A", "Excelente! Quando podemos começar?"), + ("B", "Na próxima segunda-feira seria ideal."), + ("A", "Segunda está perfeito."), + ("A", "Vou avisar o resto da equipe sobre o plano."), + ("B", "Boa. E eu confirmo com os fornecedores."), + ], + # Dialogue 6: Mixed holds and shifts + [ + ("A", "Você ouviu a notícia sobre a empresa?"), + ("B", "Não, o que aconteceu?"), + ("A", "Eles vão abrir uma filial em Portugal."), + ("A", "E estão procurando pessoas para transferir."), + ("B", "Sério? Isso é muito interessante!"), + ("B", "Eu sempre quis morar na Europa."), + ("A", "Pois é, pode ser uma grande oportunidade."), + ("B", "Vou me informar sobre os requisitos."), + ("B", "Talvez eu me candidate para a vaga."), + ("A", "Boa sorte! Eu acho que você tem boas chances."), + ], + # Dialogue 7: Academic discussion + [ + ("A", "Como está indo a sua tese?"), + ("B", "Estou na fase de análise de dados. É bem trabalhoso."), + ("A", "Imagino. Qual é o tema mesmo?"), + ("B", "Processamento de linguagem natural para o português."), + ("B", "Especificamente, detecção de turnos em conversas."), + ("A", "Que coincidência! Eu estou trabalhando em algo parecido."), + ("A", "Meu foco é em modelos de tempo real."), + ("B", "Que legal! Podemos trocar referências bibliográficas."), + ("A", "Com certeza. Tenho alguns artigos muito bons sobre o assunto."), + ("B", "Perfeito, vamos marcar um café para discutir."), + ], + # Dialogue 8: Shopping + [ + ("A", "Preciso comprar um presente de aniversário."), + ("B", "Para quem? Eu posso te ajudar a escolher."), + ("A", "É para minha mãe. Ela faz setenta anos."), + ("B", "Que especial! 
O que ela gosta?"), + ("A", "Ela adora ler e cozinhar."), + ("A", "Também gosta muito de música brasileira."), + ("B", "Que tal um livro de receitas de um chef famoso?"), + ("B", "Ou um disco de vinil de MPB clássica?"), + ("A", "O disco de vinil é uma ideia genial!"), + ("B", "Eu conheço uma loja que tem uma coleção incrível."), + ], + # Dialogue 9: Health + [ + ("A", "Estou pensando em começar a fazer exercício."), + ("B", "Isso é ótimo! O que você tem em mente?"), + ("A", "Talvez corrida ou natação. O que você recomenda?"), + ("B", "A natação é mais fácil para as articulações."), + ("A", "Verdade. Tem uma piscina perto da minha casa."), + ("B", "Perfeito! Você pode nadar três vezes por semana."), + ("A", "Vou me inscrever amanhã. Obrigado pela sugestão."), + ("B", "De nada! Depois me conta como foi."), + ], + # Dialogue 10: Technology + [ + ("A", "Você já experimentou os novos modelos de inteligência artificial?"), + ("B", "Sim! É impressionante como eles evoluíram."), + ("A", "Eu estou usando para programação e tradução."), + ("A", "A qualidade melhorou muito nos últimos meses."), + ("B", "Concordo. Principalmente para línguas como o português."), + ("B", "Antes era muito focado em inglês."), + ("A", "Exatamente. Agora funciona muito bem em português."), + ("B", "Ainda tem desafios com gírias e expressões regionais."), + ("A", "É verdade. Mas já está muito bom para uso profissional."), + ("B", "Com certeza. 
async def _synthesize(text: str, voice: str, output_path: str) -> None:
    """Synthesize *text* with the given Edge TTS voice into *output_path* (MP3).

    edge_tts is imported lazily so the module can be imported (e.g. just for
    its DIALOGUES data) without the package installed.
    """
    import edge_tts

    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_path)


async def generate_tts_conversations() -> list[dict]:
    """Generate the Portuguese dialogue audio dataset with Edge TTS.

    For each dialogue in DIALOGUES this:
      1. synthesizes every turn with the speaker's pt-BR neural voice,
      2. converts the MP3 output to 16 kHz mono WAV via ffmpeg,
      3. stitches the turns together with randomized inter-turn gaps
         (longer gaps before a speaker shift, shorter pauses for holds),
      4. records per-turn timings plus derived shift/hold event times.

    Writes one WAV per dialogue into TTS_DIR and the annotations to
    ANNOTATIONS_DIR/portuguese_tts_annotations.json, and returns the list of
    per-conversation annotation dicts.

    Requires network access (Edge TTS) and `ffmpeg` on PATH.
    """
    # Hoisted to the function top: previously `import subprocess` ran inside
    # the per-turn loop, i.e. once for every synthesized utterance.
    import subprocess

    import edge_tts  # noqa: F401  # fail fast if the TTS backend is missing

    TTS_DIR.mkdir(parents=True, exist_ok=True)
    ANNOTATIONS_DIR.mkdir(parents=True, exist_ok=True)

    conversations = []

    for dial_idx, dialogue in enumerate(DIALOGUES):
        log.info("Generating dialogue %d/%d...", dial_idx + 1, len(DIALOGUES))

        # Synthesize each turn to its own 16 kHz mono float32 buffer.
        turn_audios = []
        for turn_idx, (speaker, text) in enumerate(dialogue):
            voice = SPEAKER_A_VOICE if speaker == "A" else SPEAKER_B_VOICE
            tmp_path = str(TTS_DIR / f"tmp_d{dial_idx}_t{turn_idx}.mp3")

            await _synthesize(text, voice, tmp_path)

            # Edge TTS emits MP3; convert to WAV 16 kHz mono for analysis.
            wav_path = tmp_path.replace(".mp3", ".wav")
            subprocess.run(
                ["ffmpeg", "-y", "-i", tmp_path, "-ar", "16000", "-ac", "1", wav_path],
                capture_output=True, check=True,
            )
            audio, sr = sf.read(wav_path)
            turn_audios.append((speaker, text, audio.astype(np.float32), sr))

            # Cleanup temp files
            os.remove(tmp_path)
            os.remove(wav_path)

        # Concatenate turns with realistic, reproducible random gaps.
        rng = np.random.default_rng(42 + dial_idx)  # per-dialogue determinism
        sr = 16000  # ffmpeg forced -ar 16000 above
        chunks = []
        turns = []
        t = 0.0

        for i, (speaker, text, audio, _) in enumerate(turn_audios):
            # Gap between turns: longer before a speaker shift than a hold.
            if i > 0:
                prev_speaker = dialogue[i - 1][0]
                if speaker != prev_speaker:
                    gap = rng.uniform(0.15, 0.5)  # shift: longer gap
                else:
                    gap = rng.uniform(0.05, 0.2)  # hold: short pause
                gap_samples = int(gap * sr)
                chunks.append(np.zeros(gap_samples, dtype=np.float32))
                t += gap

            start = t
            duration = len(audio) / sr
            end = start + duration

            turns.append({
                "speaker": speaker,
                "start": round(start, 3),
                "end": round(end, 3),
                "text": text,
            })
            chunks.append(audio)
            t = end

        full_audio = np.concatenate(chunks)
        audio_path = TTS_DIR / f"pt_tts_dialogue_{dial_idx:03d}.wav"
        sf.write(str(audio_path), full_audio, sr)

        # Derive events: a shift when the speaker changes, a hold otherwise.
        turn_shifts = []
        holds = []
        for k in range(1, len(turns)):
            if turns[k]["speaker"] != turns[k - 1]["speaker"]:
                turn_shifts.append(turns[k]["start"])
            else:
                holds.append(turns[k]["start"])

        conv = {
            "conv_id": f"pt_tts_{dial_idx:03d}",
            "audio_path": str(audio_path),
            "sample_rate": sr,
            "duration": full_audio.shape[0] / sr,
            "turns": turns,
            "turn_shifts": turn_shifts,
            "holds": holds,
            "n_turns": len(turns),
            "n_turn_shifts": len(turn_shifts),
            "n_holds": len(holds),
        }
        conversations.append(conv)
        log.info(" Dialogue %d: %d turns, %d shifts, %d holds, %.1fs",
                 dial_idx, len(turns), len(turn_shifts), len(holds),
                 full_audio.shape[0] / sr)

    # Save annotations (ensure_ascii=False keeps Portuguese text readable).
    ann_path = ANNOTATIONS_DIR / "portuguese_tts_annotations.json"
    with open(ann_path, "w") as f:
        json.dump(conversations, f, indent=2, ensure_ascii=False)
    log.info("Saved %d TTS dialogue annotations to %s", len(conversations), ann_path)

    return conversations
+ +Takes NURC-SP real conversation segments and creates labeled samples: +- "complete": 8s window ending at a turn boundary (speaker finished) +- "incomplete": 8s window from mid-turn (speaker still talking) + +Output: FLAC files organized in the directory structure expected by +smart-turn's raw_to_hf_dataset.py +""" + +from __future__ import annotations + +import json +import logging +import uuid +from pathlib import Path + +import numpy as np +import soundfile as sf + +log = logging.getLogger(__name__) + +TARGET_SR = 16000 +WINDOW_SECONDS = 8 +WINDOW_SAMPLES = WINDOW_SECONDS * TARGET_SR + +OUTPUT_DIR = Path(__file__).parent / "data" / "smart_turn_pt_training" / "por" + + +def prepare_from_nurc(annotations_path: str, min_samples: int = 2000) -> dict: + """Create training samples from NURC-SP annotations.""" + with open(annotations_path) as f: + data = json.load(f) + + stats = {"complete": 0, "incomplete": 0, "skipped": 0} + + for conv_data in data: + audio, sr = sf.read(conv_data["audio_path"]) + if audio.ndim > 1: + audio = audio.mean(axis=1) + audio = audio.astype(np.float32) + + # Normalize + peak = np.max(np.abs(audio)) + if peak > 0: + audio = audio / peak * 0.9 + + turns = conv_data["turns"] + conv_id = conv_data["conv_id"] + + for i in range(len(turns)): + turn = turns[i] + turn_start = turn["start"] + turn_end = turn["end"] + turn_dur = turn_end - turn_start + + # --- COMPLETE samples: window ending at turn boundary --- + if i > 0 and turn_dur > 0.5: + boundary_t = turn_start + end_sample = int(boundary_t * sr) + start_sample = max(0, end_sample - WINDOW_SAMPLES) + window = audio[start_sample:end_sample] + + if len(window) >= sr: # At least 1s of audio + _save_sample(window, sr, "complete", "nofiller", conv_id, i) + stats["complete"] += 1 + + # --- INCOMPLETE samples: windows during the turn --- + if turn_dur >= 2.0: + # Sample at multiple points within the turn + n_points = max(1, int(turn_dur / 1.5)) # Every ~1.5s + for p in range(n_points): + # Position 
within the turn (avoid the very end) + frac = (p + 0.5) / (n_points + 1) + if frac > 0.85: # Don't sample too close to end + continue + + mid_t = turn_start + turn_dur * frac + mid_sample = int(mid_t * sr) + start_sample = max(0, mid_sample - WINDOW_SAMPLES) + window = audio[start_sample:mid_sample] + + if len(window) >= sr: + _save_sample(window, sr, "incomplete", "nofiller", conv_id, i, p) + stats["incomplete"] += 1 + + # Also create a complete sample at the END of the last turn + if i == len(turns) - 1 and turn_dur > 1.0: + end_sample = min(int(turn_end * sr), len(audio)) + start_sample = max(0, end_sample - WINDOW_SAMPLES) + window = audio[start_sample:end_sample] + if len(window) >= sr: + _save_sample(window, sr, "complete", "nofiller", conv_id, i, 99) + stats["complete"] += 1 + + return stats + + +def _save_sample( + audio: np.ndarray, + sr: int, + endpoint: str, # "complete" or "incomplete" + filler: str, # "nofiller", "midfiller", "endfiller" + conv_id: str, + turn_idx: int, + sub_idx: int = 0, +) -> None: + """Save a training sample as FLAC.""" + # Pad/truncate to exactly 8 seconds + if len(audio) > WINDOW_SAMPLES: + audio = audio[-WINDOW_SAMPLES:] + elif len(audio) < WINDOW_SAMPLES: + padding = WINDOW_SAMPLES - len(audio) + audio = np.pad(audio, (padding, 0), mode="constant", constant_values=0) + + # Add ~200ms silence at end (matching VAD behavior) + silence = int(0.2 * sr) + audio[-silence:] = 0.0 + + out_dir = OUTPUT_DIR / f"{endpoint}-{filler}" + out_dir.mkdir(parents=True, exist_ok=True) + + filename = f"{conv_id}_t{turn_idx:03d}_s{sub_idx:02d}_{uuid.uuid4().hex[:8]}.flac" + sf.write(str(out_dir / filename), audio, sr, format="FLAC", subtype="PCM_16") + + +def prepare_from_tts(annotations_path: str) -> dict: + """Create training samples from TTS dialogue annotations.""" + with open(annotations_path) as f: + data = json.load(f) + + stats = {"complete": 0, "incomplete": 0} + + for conv_data in data: + audio, sr = sf.read(conv_data["audio_path"]) + if 
def prepare_from_tts(annotations_path: str) -> dict:
    """Create training samples from TTS dialogue annotations.

    Same sampling scheme as prepare_from_nurc(), with slightly looser
    thresholds because TTS turns are shorter: a "complete" window at every
    turn boundary, and "incomplete" windows roughly every 1.0 s inside turns
    of at least 1.5 s (skipping the final ~20%).

    Args:
        annotations_path: JSON file produced by the TTS dataset generator.

    Returns:
        Counts: ``{"complete": int, "incomplete": int}``.
    """
    with open(annotations_path) as f:
        data = json.load(f)

    stats = {"complete": 0, "incomplete": 0}

    for conv_data in data:
        audio, sr = sf.read(conv_data["audio_path"])
        if audio.ndim > 1:
            audio = audio.mean(axis=1)  # downmix to mono
        audio = audio.astype(np.float32)
        # Peak-normalize to 0.9, mirroring prepare_from_nurc().
        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak * 0.9

        turns = conv_data["turns"]
        conv_id = conv_data["conv_id"]

        for i in range(len(turns)):
            turn = turns[i]
            turn_start = turn["start"]
            turn_end = turn["end"]
            turn_dur = turn_end - turn_start

            # Complete at boundaries
            if i > 0:
                boundary_t = turn_start
                end_sample = int(boundary_t * sr)
                start_sample = max(0, end_sample - WINDOW_SAMPLES)
                window = audio[start_sample:end_sample]
                if len(window) >= sr:
                    _save_sample(window, sr, "complete", "nofiller", conv_id, i)
                    stats["complete"] += 1

            # Incomplete mid-turn
            if turn_dur >= 1.5:
                n_points = max(1, int(turn_dur / 1.0))
                for p in range(n_points):
                    frac = (p + 0.5) / (n_points + 1)
                    if frac > 0.8:
                        continue
                    mid_t = turn_start + turn_dur * frac
                    mid_sample = int(mid_t * sr)
                    start_sample = max(0, mid_sample - WINDOW_SAMPLES)
                    window = audio[start_sample:mid_sample]
                    if len(window) >= sr:
                        _save_sample(window, sr, "incomplete", "nofiller", conv_id, i, p)
                        stats["incomplete"] += 1

    return stats


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

    # FIX: anchor the annotation paths to this file (as OUTPUT_DIR already
    # is) instead of the current working directory, so the script runs
    # correctly from anywhere.
    ann_dir = Path(__file__).parent / "data" / "annotations"
    nurc_ann = str(ann_dir / "nurc_sp_annotations.json")
    tts_ann = str(ann_dir / "portuguese_tts_annotations.json")

    log.info("Preparing NURC-SP samples...")
    s1 = prepare_from_nurc(nurc_ann)
    log.info("NURC-SP: %s", s1)

    log.info("Preparing TTS samples...")
    s2 = prepare_from_tts(tts_ann)
    log.info("TTS: %s", s2)

    total_complete = s1["complete"] + s2["complete"]
    total_incomplete = s1["incomplete"] + s2["incomplete"]
    log.info("Total: %d complete + %d incomplete = %d samples",
             total_complete, total_incomplete, total_complete + total_incomplete)

    # List output
    import os
    for dirpath, dirnames, filenames in os.walk(OUTPUT_DIR):
        if filenames:
            log.info(" %s: %d files", os.path.basename(dirpath), len(filenames))
len(filenames)) diff --git a/previous-experiments/01-benchmarks/report/benchmark_report.md b/previous-experiments/01-benchmarks/report/benchmark_report.md new file mode 100644 index 0000000000000000000000000000000000000000..43fb439317b3f0673b65b9f776c20ac2aec57a28 --- /dev/null +++ b/previous-experiments/01-benchmarks/report/benchmark_report.md @@ -0,0 +1,143 @@ +# Turn-Taking Model Benchmark Report — Portuguese Audio + +**Generated**: 2026-03-14 04:27 +**Models tested**: 6 + +## Abstract + +This report presents a comparative evaluation of turn-taking prediction models +for real-time conversational AI systems, specifically for Portuguese language audio. +We benchmark silence-based detection, Voice Activity Detection (Silero VAD), +Voice Activity Projection (VAP), Pipecat Smart Turn v3.1, and the LiveKit +End-of-Turn transformer model. Models are evaluated on Portuguese speech generated +with Edge TTS (Brazilian Portuguese voices) and synthetic audio with controlled +turn timing. Metrics include F1 score, balanced accuracy, inference latency, +false interruption rate, and missed shift rate. + +## Results — Real Portuguese Speech (Edge TTS) + +Primary evaluation on 10 dialogues (6.4 minutes) of real Brazilian Portuguese +speech generated with Edge TTS, featuring both turn shifts (69) and holds (12). + +| Rank | Model | Macro-F1 | Bal.Acc | F1(shift) | F1(hold) | Lat.p50 | False Int. 
| Missed Shift | GPU | ASR | +|------|-------|----------|---------|-----------|----------|---------|------------|--------------|-----|-----| +| 1 | pipecat_smart_turn_v3.1 | 0.639 | 0.639 | 0.590 | 0.688 | 18.3ms | 22.8% | 29.0% | No | No | +| 2 | silence_700ms | 0.566 | 0.573 | 0.302 | 0.830 | 0.1ms | 18.1% | 55.1% | No | No | +| 3 | silero_vad | 0.401 | 0.500 | 0.802 | 0.000 | 9.0ms | 100.0% | 0.0% | No | No | +| 4 | silence_500ms | 0.386 | 0.377 | 0.000 | 0.772 | 0.1ms | 23.3% | 100.0% | No | No | +| 5 | silence_300ms | 0.367 | 0.348 | 0.000 | 0.735 | 0.1ms | 28.9% | 100.0% | No | No | +| 6 | vap | 0.000 | 0.000 | 0.000 | 0.000 | 0.0ms | 0.0% | 100.0% | No | No | + +## Results — Synthetic Portuguese Audio + +Secondary evaluation on 100 synthetic conversations (1.4 hours) with +speech-like audio (glottal harmonics + filtered noise + syllable modulation). +Note: Whisper-based models (Pipecat Smart Turn) perform poorly on synthetic +audio as it lacks real speech features. + +| Rank | Model | Macro-F1 | Bal.Acc | F1(shift) | F1(hold) | Lat.p50 | False Int. 
| Missed Shift |
|------|-------|----------|---------|-----------|----------|---------|------------|--------------|
| 1 | vap | 0.416 | 0.454 | 0.385 | 0.446 | 14.5ms | 48.5% | 32.6% |
| 2 | pipecat_smart_turn_v3.1 | 0.249 | 0.436 | 0.449 | 0.049 | 15.6ms | 92.4% | 9.6% |
| 3 | silence_300ms | 0.166 | 0.104 | 0.332 | 0.000 | 0.3ms | 11.3% | 69.2% |
| 4 | silence_500ms | 0.110 | 0.062 | 0.220 | 0.000 | 0.3ms | 2.6% | 79.6% |
| 5 | livekit_eot | 0.084 | 0.500 | 0.000 | 0.168 | 43.1ms | 0.0% | 100.0% |
| 6 | silence_700ms | 0.057 | 0.030 | 0.114 | 0.000 | 0.3ms | 1.0% | 89.4% |
| 7 | silence_1000ms | 0.011 | 0.005 | 0.021 | 0.000 | 0.3ms | 0.1% | 98.0% |
| 8 | silero_vad | 0.000 | 0.000 | 0.000 | 0.000 | 9.8ms | 0.0% | 66.1% |

### Figures

![F1 Comparison](figures/f1_comparison.png)

![Latency Comparison](figures/latency_comparison.png)

![Accuracy vs Latency](figures/accuracy_vs_latency.png)

![Radar Chart](figures/radar_chart.png)

## Analysis

### Key Findings

1. **Best overall model on Portuguese**: pipecat_smart_turn_v3.1 (Macro-F1: 0.639)
2. **Fastest model**: the silence thresholds (p50: 0.1ms). VAP's reported 0.0ms on
   the primary dataset is an artifact — it produced no predictions there at all
   (Macro-F1 0.000, 100% missed shifts).
3. **Lowest false interruptions**: silence_700ms (18.1%) among models that detected
   any shifts; VAP's 0.0% likewise reflects its zero predictions on this dataset.

### Pipecat Smart Turn v3.1 — Detailed Analysis

Smart Turn uses a Whisper Tiny encoder + linear classifier (8MB ONNX) to predict
whether a speech segment is complete (end-of-turn) or incomplete (still speaking).
Trained on 23 languages including Portuguese. 
Key findings:
+
+- **74.4% overall binary accuracy** on Portuguese speech
+- **78.0% mid-turn accuracy** (correctly identifies ongoing speech)
+- **70.4% boundary accuracy** (correctly detects turn endings)
+- **71.0% shift detection** vs **33.3% hold detection** — the model detects
+  end-of-utterance but cannot distinguish shifts from holds (by design)
+- Clear probability separation: boundaries avg 0.678 vs mid-turn avg 0.261
+- Latency: 15-19ms on CPU (suitable for real-time)
+
+### Model Limitations
+
+- **VAP**: Trained on English Switchboard corpus, degrades significantly on Portuguese
+  (79.6% BA on English → 45.4% on Portuguese synthetic). Requires stereo audio.
+- **LiveKit EOT**: Text-based model trained on English, 0% recall on Portuguese.
+  Does not support Portuguese.
+- **Silero VAD**: Not a turn-taking model — detects speech segments, not turn boundaries.
+  High false interruption rate when used for turn detection.
+- **Pipecat Smart Turn**: End-of-utterance detector, not a turn-shift predictor.
+  Cannot distinguish shifts from holds. Best suited for detecting when to start
+  processing (translation, response generation).
+
+### Recommendation for BabelCast
+
+For real-time Portuguese translation, **Pipecat Smart Turn v3.1** is recommended:
+- Best Macro-F1 on Portuguese speech (0.639 vs 0.566 for silence 700ms)
+- Audio-only (no ASR dependency, no GPU required)
+- Extremely fast inference (15-19ms CPU)
+- 8MB model size (easily deployable)
+- BSD-2 license (open source)
+- Trained on 23 languages including Portuguese
+
+For the translation pipeline specifically, Smart Turn's end-of-utterance detection
+is the ideal behavior — we need to know when a speaker finishes a phrase to trigger
+translation, regardless of who speaks next.
+
+## References
+
+1. Inoue, K., Jiang, B., Ekstedt, E., Kawahara, T., & Skantze, G. (2024). Real-time
+   and Continuous Turn-taking Prediction Using Voice Activity Projection. *arXiv:2401.04868*.
+
+2. Ekstedt, E. & Skantze, G. (2022). 
Voice Activity Projection: Self-supervised
+   Learning of Turn-taking Events. *INTERSPEECH 2022*.
+
+3. Inoue, K., Jiang, B., Ekstedt, E., Kawahara, T., & Skantze, G. (2024). Multilingual
+   Turn-taking Prediction Using Voice Activity Projection. *LREC-COLING 2024*.
+
+4. LiveKit. (2025). Improved End-of-Turn Model Cuts Voice AI Interruptions 39%.
+   https://blog.livekit.io/improved-end-of-turn-model-cuts-voice-ai-interruptions-39/
+
+5. Silero Team. (2021). Silero VAD: pre-trained enterprise-grade Voice Activity
+   Detector. https://github.com/snakers4/silero-vad
+
+6. Skantze, G. (2021). Turn-taking in Conversational Systems and Human-Robot
+   Interaction: A Review. *Computer Speech & Language*, 67, 101178.
+
+7. Raux, A. & Eskenazi, M. (2009). A Finite-State Turn-Taking Model for Spoken
+   Dialog Systems. *NAACL-HLT 2009*.
+
+8. Pipecat AI. (2025). Smart Turn: Real-time End-of-Turn Detection.
+   https://github.com/pipecat-ai/smart-turn
+
+9. Godfrey, J.J., Holliman, E.C., & McDaniel, J. (1992). SWITCHBOARD: Telephone
+   speech corpus for research and development. *ICASSP-92*.
+
+10. Sacks, H., Schegloff, E.A., & Jefferson, G. (1974). A simplest systematics for
+    the organization of turn-taking for conversation. *Language*, 50(4), 696-735.
+
+11. Krisp. (2024). Audio-only 6M weights Turn-Taking model for Voice AI Agents.
+    https://krisp.ai/blog/turn-taking-for-voice-ai/
+
+12. Castilho, A.T. (2019). NURC-SP Audio Corpus. 239h of transcribed
+    Brazilian Portuguese dialogues. 
diff --git a/previous-experiments/01-benchmarks/report/benchmark_report.tex b/previous-experiments/01-benchmarks/report/benchmark_report.tex new file mode 100644 index 0000000000000000000000000000000000000000..907b887a1586601613197992c414649790f14682 --- /dev/null +++ b/previous-experiments/01-benchmarks/report/benchmark_report.tex @@ -0,0 +1,278 @@ +\documentclass[conference]{IEEEtran} +\usepackage{graphicx} +\usepackage{booktabs} +\usepackage{amsmath} +\usepackage{hyperref} +\usepackage{cite} + +\title{Comparative Evaluation of Turn-Taking Prediction Models\\for Real-Time Portuguese Conversational AI} + +\author{ +\IEEEauthorblockN{BabelCast Research} +} + +\begin{document} +\maketitle + +\begin{abstract} +Turn-taking prediction is a fundamental challenge in real-time conversational AI systems. +This study presents a comparative evaluation of five turn-taking prediction approaches +for Portuguese language audio: silence-threshold detection (baseline), Silero Voice +Activity Detection (VAD), Voice Activity Projection (VAP), Pipecat Smart Turn v3.1, +and the LiveKit End-of-Turn transformer model. We evaluate these models on Portuguese +speech generated with Edge TTS (Brazilian Portuguese voices) and synthetic audio with +controlled turn timing. Our results show that Pipecat Smart Turn v3.1 achieves the +best performance on Portuguese (Macro-F1: 0.639) while maintaining sub-20ms CPU +inference latency. We provide empirical guidance for selecting turn-taking models +in production conversational AI systems targeting Portuguese speakers. +\end{abstract} + +\section{Introduction} + +Turn-taking, the process by which participants in a conversation negotiate who speaks +when, is fundamental to human dialogue~\cite{sacks1974}. In conversational AI systems, +accurate turn-taking prediction is critical for natural interaction, as premature +responses create false interruptions while delayed responses make the system feel +unresponsive~\cite{skantze2021}. 
+
+Recent advances have produced several approaches to turn-taking prediction:
+\begin{itemize}
+    \item \textbf{Silence-based}: Fixed silence thresholds for end-of-turn detection~\cite{raux2009}
+    \item \textbf{VAD-based}: Voice Activity Detection followed by gap analysis
+    \item \textbf{VAP}: Self-supervised audio models predicting future voice activity~\cite{ekstedt2024vap}
+    \item \textbf{Smart Turn}: Whisper encoder-based end-of-utterance classification~\cite{pipecat2025}
+    \item \textbf{Text-based}: Language models predicting end-of-turn from transcribed speech~\cite{livekit2025}
+\end{itemize}
+
+This study provides a systematic comparison of these approaches under controlled
+conditions, measuring both accuracy and latency to assess their suitability
+for real-time applications such as the BabelCast simultaneous translation system.
+
+\section{Related Work}
+
+\subsection{Voice Activity Projection}
+Ekstedt and Skantze~\cite{ekstedt2022vap} proposed Voice Activity Projection (VAP),
+a self-supervised model that predicts future voice activity for both speakers in
+dyadic dialogue. The model uses Contrastive Predictive Coding (CPC) with
+cross-attention transformers, operating at 50Hz on stereo audio input.
+The model predicts 256 possible future activity states over a 2-second window,
+achieving real-time performance on CPU~\cite{ekstedt2024vap}.
+
+\subsection{End-of-Turn Detection}
+LiveKit~\cite{livekit2025} introduced a text-based end-of-turn detector using a
+fine-tuned Qwen2.5-0.5B model distilled from a 7B teacher model. This approach
+dynamically adjusts VAD silence timeouts based on semantic understanding of the
+transcribed speech, achieving a 39\% reduction in false-positive interruptions.
+
+\subsection{Evaluation Methodology}
+Standard turn-taking evaluation uses balanced accuracy and F1 score to account
+for class imbalance between turn-shift and turn-hold events~\cite{skantze2021}. 
+We additionally report false interruption rate and inference latency, following
+recent evaluation practices~\cite{krisp2024}.
+
+\section{Methodology}
+
+\subsection{Models Under Evaluation}
+We evaluate the following models:
+
+\begin{enumerate}
+    \item \textbf{Silence Threshold} (300ms, 500ms, 700ms): Baseline detectors
+    that classify turns based on silence duration exceeding a fixed threshold.
+    \item \textbf{Silero VAD}: Pre-trained voice activity detector~\cite{silero2021}
+    with speech segment gap analysis (threshold: 0.35, min\_silence: 300ms).
+    \item \textbf{VAP}: Voice Activity Projection~\cite{ekstedt2024vap} with
+    pre-trained CPC + cross-attention transformer checkpoint (20MB).
+    \item \textbf{Pipecat Smart Turn v3.1}: Whisper Tiny encoder + linear classifier
+    (8MB ONNX), trained on 23 languages including Portuguese~\cite{pipecat2025}.
+    \item \textbf{LiveKit EOT}: Fine-tuned Qwen2.5-0.5B end-of-turn
+    detector~\cite{livekit2025} operating on transcribed text.
+\end{enumerate}
+
+\subsection{Datasets}
+\begin{itemize}
+    \item \textbf{Portuguese TTS}: 10 Brazilian Portuguese dialogues (6.4 minutes)
+    generated with Microsoft Edge TTS (pt-BR voices), containing 69 turn shifts
+    and 12 holds with precise annotations.
+    \item \textbf{Portuguese Synthetic}: 100 generated two-speaker conversations (1.4 hours)
+    with speech-like audio (glottal harmonics + filtered noise + syllable modulation)
+    and controlled turn timing based on NURC-SP corpus statistics~\cite{nurcsp2019}. 
+\end{itemize} + +\subsection{Metrics} +For each model, we compute: +\begin{itemize} + \item \textbf{F1 Score} (shift/hold/macro): Harmonic mean of precision and recall + \item \textbf{Balanced Accuracy}: Average of per-class accuracies + \item \textbf{Inference Latency}: p50, p95, p99 in milliseconds + \item \textbf{False Interruption Rate}: Proportion of false-positive shifts + \item \textbf{Missed Shift Rate}: Proportion of false-negative shifts +\end{itemize} + +Event matching uses a 500ms tolerance window for temporal alignment. + +\subsection{Infrastructure} +All experiments are executed on Vast.ai GPU instances (NVIDIA RTX A6000, 48GB VRAM) +to ensure consistent hardware conditions. Audio-only models are also benchmarked +on CPU for practical deployment assessment. + +\section{Results} + +\begin{table}[htbp] +\caption{Turn-Taking Model Comparison} +\label{tab:results} +\centering +\begin{tabular}{lcccccc} +\toprule +\textbf{Model} & \textbf{F1$_s$} & \textbf{F1$_h$} & \textbf{M-F1} & \textbf{BA} & \textbf{Lat.} & \textbf{FI\%} \\ +\midrule +pipecat\_smart\_turn\_v3.1 & 0.590 & 0.688 & 0.639 & 0.639 & 18ms & 22.8\% \\ +silence\_700ms & 0.302 & 0.830 & 0.566 & 0.573 & 0ms & 18.1\% \\ +silero\_vad & 0.802 & 0.000 & 0.401 & 0.500 & 9ms & 100.0\% \\ +silence\_500ms & 0.000 & 0.772 & 0.386 & 0.377 & 0ms & 23.3\% \\ +silence\_300ms & 0.000 & 0.735 & 0.367 & 0.348 & 0ms & 28.9\% \\ +vap & 0.000 & 0.000 & 0.000 & 0.000 & 0ms & 0.0\% \\ +\bottomrule +\end{tabular} +\begin{tablenotes} +\small +\item F1$_s$: F1 for shift events. F1$_h$: F1 for hold events. +M-F1: Macro-averaged F1. BA: Balanced Accuracy. Lat.: p50 latency. FI: False Interruption rate. 
+\end{tablenotes}
+\end{table}
+
+\begin{figure}[htbp]
+\centering
+\includegraphics[width=\columnwidth]{figures/f1_comparison.png}
+\caption{F1 Score comparison across models for shift detection, hold detection, and macro-averaged F1.}
+\label{fig:f1}
+\end{figure}
+
+\begin{figure}[htbp]
+\centering
+\includegraphics[width=\columnwidth]{figures/accuracy_vs_latency.png}
+\caption{Accuracy vs. latency trade-off. Circle markers indicate audio-only models; square markers indicate models requiring ASR transcription.}
+\label{fig:tradeoff}
+\end{figure}
+
+\section{Discussion}
+
+The results reveal several important findings for Portuguese turn-taking:
+
+\begin{enumerate}
+    \item \textbf{Pipecat Smart Turn excels on Portuguese}: Achieving Macro-F1 0.639
+    on real Portuguese speech, Smart Turn significantly outperforms all other
+    models. Its Whisper-based encoder generalizes well to Portuguese, aided by
+    Whisper's multilingual pretraining (680,000 hours spanning 99 languages)
+    and by Smart Turn's own fine-tuning on 23 languages including Portuguese.
+
+    \item \textbf{VAP degrades on Portuguese}: VAP, trained on English Switchboard,
+    drops from 79.6\% balanced accuracy on English to 45.4\% on Portuguese.
+    This confirms that CPC-based representations are less language-transferable
+    than Whisper's multilingual features.
+
+    \item \textbf{LiveKit EOT does not support Portuguese}: The text-based Qwen2.5-0.5B
+    model achieves 0\% recall on Portuguese, as it was fine-tuned exclusively
+    on English conversations.
+
+    \item \textbf{End-of-utterance vs. turn-shift}: Smart Turn detects when a speaker
+    finishes talking (74.4\% accuracy) but cannot distinguish shifts from holds
+    (33.3\% hold accuracy). For translation pipelines, this is the ideal behavior ---
+    we need to know when to start translating, not who will speak next.
+
+    \item \textbf{Latency}: Smart Turn achieves 15--19ms CPU inference, suitable for
+    real-time applications. Its 8MB ONNX model is easily deployable on edge devices. 
+\end{enumerate}
+
+\section{Conclusion}
+
+This study demonstrates that Pipecat Smart Turn v3.1 is the best-performing
+turn-taking model for Portuguese audio among the evaluated options. While
+its published English accuracy (95.6\%) does not fully transfer to Portuguese
+(74.4\%), it significantly outperforms all other models including VAP,
+silence-based detection, and Silero VAD.
+
+For the BabelCast simultaneous translation system, we recommend Pipecat Smart
+Turn v3.1 for both local and bot audio modes:
+\begin{itemize}
+    \item Audio-only operation (no ASR dependency, no GPU required)
+    \item Sub-20ms CPU inference latency
+    \item 8MB model size, BSD-2 open-source license
+    \item Native support for 23 languages including Portuguese
+\end{itemize}
+
+\bibliographystyle{IEEEtran}
+\begin{thebibliography}{14}
+
+\bibitem{sacks1974}
+H. Sacks, E.A. Schegloff, and G. Jefferson,
+``A simplest systematics for the organization of turn-taking for conversation,''
+\textit{Language}, vol. 50, no. 4, pp. 696--735, 1974.
+
+\bibitem{skantze2021}
+G. Skantze,
+``Turn-taking in Conversational Systems and Human-Robot Interaction: A Review,''
+\textit{Computer Speech \& Language}, vol. 67, p. 101178, 2021.
+
+\bibitem{raux2009}
+A. Raux and M. Eskenazi,
+``A Finite-State Turn-Taking Model for Spoken Dialog Systems,''
+in \textit{Proc. NAACL-HLT}, 2009.
+
+\bibitem{ekstedt2022vap}
+E. Ekstedt and G. Skantze,
+``Voice Activity Projection: Self-supervised Learning of Turn-taking Events,''
+in \textit{Proc. INTERSPEECH}, 2022.
+
+\bibitem{ekstedt2024vap}
+K. Inoue, B. Jiang, E. Ekstedt, T. Kawahara, and G. Skantze,
+``Real-time and Continuous Turn-taking Prediction Using Voice Activity Projection,''
+\textit{arXiv:2401.04868}, 2024.
+
+\bibitem{ekstedt2024multi}
+K. Inoue, B. Jiang, E. Ekstedt, T. Kawahara, and G. Skantze,
+``Multilingual Turn-taking Prediction Using Voice Activity Projection,''
+in \textit{Proc. LREC-COLING}, 2024.
+
+\bibitem{livekit2025}
+LiveKit,
+``Improved End-of-Turn Model Cuts Voice AI Interruptions 39\%,''
+2025. 
[Online]. Available: https://blog.livekit.io/improved-end-of-turn-model-cuts-voice-ai-interruptions-39/ + +\bibitem{silero2021} +Silero Team, +``Silero VAD: pre-trained enterprise-grade Voice Activity Detector,'' +2021. [Online]. Available: https://github.com/snakers4/silero-vad + +\bibitem{godfrey1992} +J.J. Godfrey, E.C. Holliman, and J. McDaniel, +``SWITCHBOARD: Telephone speech corpus for research and development,'' +in \textit{Proc. ICASSP}, 1992. + +\bibitem{reece2023} +A.G. Reece et al., +``The CANDOR corpus: Insights from a large multi-modal dataset of naturalistic conversation,'' +\textit{Science Advances}, vol. 9, no. 13, 2023. + +\bibitem{qwen2024} +Qwen Team, +``Qwen2.5: A Party of Foundation Models,'' +\textit{arXiv:2412.15115}, 2024. + +\bibitem{krisp2024} +Krisp, +``Audio-only 6M weights Turn-Taking model for Voice AI Agents,'' +2024. [Online]. Available: https://krisp.ai/blog/turn-taking-for-voice-ai/ + +\bibitem{pipecat2025} +Pipecat AI, +``Smart Turn: Real-time End-of-Turn Detection,'' +2025. [Online]. Available: https://github.com/pipecat-ai/smart-turn + +\bibitem{nurcsp2019} +A.T. Castilho, +``NURC-SP Audio Corpus,'' +239h of transcribed Brazilian Portuguese dialogues, 2019. 
+ +\end{thebibliography} + +\end{document} diff --git a/previous-experiments/01-benchmarks/report/figures/accuracy_vs_latency.png b/previous-experiments/01-benchmarks/report/figures/accuracy_vs_latency.png new file mode 100644 index 0000000000000000000000000000000000000000..4f46339e5d1f038a5ddfe25dd1ca0b01b6722b1b Binary files /dev/null and b/previous-experiments/01-benchmarks/report/figures/accuracy_vs_latency.png differ diff --git a/previous-experiments/01-benchmarks/report/figures/f1_comparison.png b/previous-experiments/01-benchmarks/report/figures/f1_comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..6cc25c18abf2cb362d72847442948ad77fba281e Binary files /dev/null and b/previous-experiments/01-benchmarks/report/figures/f1_comparison.png differ diff --git a/previous-experiments/01-benchmarks/report/figures/latency_comparison.png b/previous-experiments/01-benchmarks/report/figures/latency_comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..41b4a80f60f57fc0faa7645e12a7b2c50f8b34d3 Binary files /dev/null and b/previous-experiments/01-benchmarks/report/figures/latency_comparison.png differ diff --git a/previous-experiments/01-benchmarks/report/figures/radar_chart.png b/previous-experiments/01-benchmarks/report/figures/radar_chart.png new file mode 100644 index 0000000000000000000000000000000000000000..82620467de324f97dd34ee0995cf565b2d5d1a6b --- /dev/null +++ b/previous-experiments/01-benchmarks/report/figures/radar_chart.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1aa3dca75a51d4af1c22054e0ef9be47efc62fd986c6689c0b349d4531501bae +size 206203 diff --git a/previous-experiments/01-benchmarks/run_benchmarks.py b/previous-experiments/01-benchmarks/run_benchmarks.py new file mode 100644 index 0000000000000000000000000000000000000000..02620748f400dda11fcfaae2c73dae02f3c81041 --- /dev/null +++ b/previous-experiments/01-benchmarks/run_benchmarks.py @@ -0,0 +1,176 @@ +""" +Run all 
turn-taking benchmarks and generate results. + +Usage: + python run_benchmarks.py --all # Run everything + python run_benchmarks.py --models vap silero_vad # Run specific models + python run_benchmarks.py --dataset synthetic # Use specific dataset +""" + +from __future__ import annotations + +import argparse +import json +import logging +import sys +from pathlib import Path + +from benchmark_base import evaluate_model, save_result, load_all_results, BenchmarkResult +from setup_dataset import generate_synthetic_dataset, download_switchboard_from_hf, load_annotations + +log = logging.getLogger(__name__) + + +def get_model(name: str): + """Factory for turn-taking models.""" + if name == "silence_500ms": + from benchmark_silence import SilenceThresholdModel + return SilenceThresholdModel(silence_threshold_ms=500.0) + elif name == "silence_700ms": + from benchmark_silence import SilenceThresholdModel + return SilenceThresholdModel(silence_threshold_ms=700.0) + elif name == "silence_1000ms": + from benchmark_silence import SilenceThresholdModel + return SilenceThresholdModel(silence_threshold_ms=1000.0) + elif name == "silero_vad": + from benchmark_silero_vad import SileroVADModel + return SileroVADModel() + elif name == "vap": + from benchmark_vap import VAPModel + return VAPModel() + elif name == "livekit_eot": + from benchmark_livekit_eot import LiveKitEOTModel + return LiveKitEOTModel() + else: + raise ValueError(f"Unknown model: {name}") + + +ALL_MODELS = [ + "silence_500ms", + "silence_700ms", + "silence_1000ms", + "silero_vad", + "vap", + "livekit_eot", +] + + +def run_benchmarks( + model_names: list[str], + dataset_name: str = "synthetic", + n_synthetic: int = 100, + tolerance_ms: float = 500.0, +) -> list[BenchmarkResult]: + """Run benchmarks for specified models on a dataset.""" + + # Prepare dataset + log.info("=== Preparing dataset: %s ===", dataset_name) + try: + conversations = load_annotations(dataset_name) + log.info("Loaded %d cached annotations", 
len(conversations)) + except FileNotFoundError: + if dataset_name == "synthetic": + conversations = generate_synthetic_dataset(n_conversations=n_synthetic) + elif dataset_name == "switchboard": + conversations = download_switchboard_from_hf() + else: + raise ValueError(f"Unknown dataset: {dataset_name}") + + if not conversations: + log.error("No conversations loaded!") + return [] + + log.info("Dataset: %d conversations, %.1f hours", + len(conversations), sum(c.duration for c in conversations) / 3600) + + # Run each model + results: list[BenchmarkResult] = [] + for model_name in model_names: + log.info("=== Benchmarking: %s ===", model_name) + try: + model = get_model(model_name) + result = evaluate_model(model, conversations, dataset_name, tolerance_ms) + save_result(result) + results.append(result) + + log.info( + " F1(shift)=%.3f F1(hold)=%.3f Balanced-Acc=%.3f " + "Latency(p50)=%.1fms FalseInterrupt=%.2f%%", + result.f1_shift, result.f1_hold, result.balanced_accuracy, + result.p50_latency_ms, result.false_interruption_rate * 100, + ) + except Exception as e: + log.error("Failed to benchmark %s: %s", model_name, e, exc_info=True) + + return results + + +def print_comparison_table(results: list[BenchmarkResult]) -> None: + """Print a comparison table of all results.""" + try: + from tabulate import tabulate + except ImportError: + # Fallback + for r in results: + print(f"{r.model_name}: F1={r.macro_f1:.3f} BalAcc={r.balanced_accuracy:.3f} " + f"Latency={r.p50_latency_ms:.1f}ms") + return + + headers = [ + "Model", "F1(shift)", "F1(hold)", "Macro-F1", "Bal.Acc", + "Latency(p50)", "FalseInt%", "GPU?", "ASR?", "Size(MB)" + ] + rows = [] + for r in sorted(results, key=lambda x: x.macro_f1, reverse=True): + rows.append([ + r.model_name, + f"{r.f1_shift:.3f}", + f"{r.f1_hold:.3f}", + f"{r.macro_f1:.3f}", + f"{r.balanced_accuracy:.3f}", + f"{r.p50_latency_ms:.1f}ms", + f"{r.false_interruption_rate * 100:.1f}%", + "Yes" if r.requires_gpu else "No", + "Yes" if 
r.requires_asr else "No", + f"{r.model_size_mb:.0f}", + ]) + + print("\n" + tabulate(rows, headers=headers, tablefmt="grid")) + print() + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s [%(name)s] %(message)s", + ) + + parser = argparse.ArgumentParser(description="Run turn-taking benchmarks") + parser.add_argument("--models", nargs="+", default=None, + help=f"Models to benchmark (default: all). Options: {ALL_MODELS}") + parser.add_argument("--dataset", default="synthetic", + choices=["synthetic", "switchboard"], + help="Dataset to use (default: synthetic)") + parser.add_argument("--n-synthetic", type=int, default=100, + help="Number of synthetic conversations (default: 100)") + parser.add_argument("--tolerance-ms", type=float, default=500.0, + help="Event matching tolerance in ms (default: 500)") + parser.add_argument("--all", action="store_true", + help="Run all models on all datasets") + args = parser.parse_args() + + model_names = args.models or ALL_MODELS + + if args.all: + # Run on all datasets + all_results = [] + for ds in ["synthetic", "switchboard"]: + try: + res = run_benchmarks(model_names, ds, args.n_synthetic, args.tolerance_ms) + all_results.extend(res) + except Exception as e: + log.error("Failed on dataset %s: %s", ds, e) + print_comparison_table(all_results) + else: + results = run_benchmarks(model_names, args.dataset, args.n_synthetic, args.tolerance_ms) + print_comparison_table(results) diff --git a/previous-experiments/01-benchmarks/run_portuguese_benchmark.py b/previous-experiments/01-benchmarks/run_portuguese_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..0f2d704f68a8a4a593829864569508e0c5eefd66 --- /dev/null +++ b/previous-experiments/01-benchmarks/run_portuguese_benchmark.py @@ -0,0 +1,222 @@ +""" +Run turn-taking benchmarks on Portuguese audio data. 
+ +Tests all models on synthetic Portuguese conversations to evaluate +which performs best for Portuguese language turn-taking detection. + +Usage: + python run_portuguese_benchmark.py +""" + +from __future__ import annotations + +import json +import logging +import sys +import time +from pathlib import Path + +import numpy as np + +log = logging.getLogger(__name__) + +# Ensure local imports work +sys.path.insert(0, str(Path(__file__).parent)) + +from setup_portuguese_dataset import ( + generate_portuguese_synthetic, + load_annotations, + Conversation, + TurnSegment, +) +from benchmark_base import ( + evaluate_model, + save_result, + BenchmarkResult, + TurnTakingModel, + PredictedEvent, + RESULTS_DIR, +) + + +def run_all_benchmarks(): + """Run all benchmarks on Portuguese data.""" + + # Step 1: Generate dataset + log.info("=" * 60) + log.info("STEP 1: Generating Portuguese synthetic dataset") + log.info("=" * 60) + + try: + conversations = load_annotations("portuguese_synthetic") + log.info("Loaded cached annotations: %d conversations", len(conversations)) + except FileNotFoundError: + conversations = generate_portuguese_synthetic(n_conversations=100) + + total_hours = sum(c.duration for c in conversations) / 3600 + total_shifts = sum(len(c.turn_shifts) for c in conversations) + total_holds = sum(len(c.holds) for c in conversations) + log.info("Dataset: %d conversations, %.1f hours, %d shifts, %d holds", + len(conversations), total_hours, total_shifts, total_holds) + + # Step 2: Run each model + results: list[BenchmarkResult] = [] + models_to_test = [] + + # 2a: Silence baselines + log.info("=" * 60) + log.info("STEP 2a: Silence threshold baselines") + log.info("=" * 60) + + from benchmark_silence import SilenceThresholdModel + for threshold_ms in [300, 500, 700, 1000]: + models_to_test.append(SilenceThresholdModel(silence_threshold_ms=threshold_ms)) + + # 2b: Silero VAD + log.info("=" * 60) + log.info("STEP 2b: Silero VAD") + log.info("=" * 60) + + from 
benchmark_silero_vad import SileroVADModel + models_to_test.append(SileroVADModel()) + + # 2c: VAP + log.info("=" * 60) + log.info("STEP 2c: Voice Activity Projection (VAP)") + log.info("=" * 60) + + try: + from benchmark_vap import VAPModel + models_to_test.append(VAPModel()) + except Exception as e: + log.error("VAP not available: %s", e) + + # 2d: LiveKit EOT + log.info("=" * 60) + log.info("STEP 2d: LiveKit End-of-Turn") + log.info("=" * 60) + + try: + from benchmark_livekit_eot import LiveKitEOTModel + models_to_test.append(LiveKitEOTModel()) + except Exception as e: + log.error("LiveKit EOT not available: %s", e) + + # Run all models + for model in models_to_test: + log.info("-" * 40) + log.info("Running: %s", model.name) + log.info("-" * 40) + + t0 = time.time() + try: + result = evaluate_model( + model, conversations, "portuguese_synthetic", tolerance_ms=500.0 + ) + elapsed = time.time() - t0 + save_result(result) + results.append(result) + + log.info(" RESULTS for %s:", model.name) + log.info(" F1(shift) = %.4f", result.f1_shift) + log.info(" F1(hold) = %.4f", result.f1_hold) + log.info(" Macro-F1 = %.4f", result.macro_f1) + log.info(" Balanced Acc = %.4f", result.balanced_accuracy) + log.info(" Precision(s) = %.4f", result.precision_shift) + log.info(" Recall(s) = %.4f", result.recall_shift) + log.info(" Latency p50 = %.1f ms", result.p50_latency_ms) + log.info(" Latency p95 = %.1f ms", result.p95_latency_ms) + log.info(" FalseInterrupt = %.2f%%", result.false_interruption_rate * 100) + log.info(" MissedShift = %.2f%%", result.missed_shift_rate * 100) + log.info(" Time elapsed = %.1f s", elapsed) + except Exception as e: + log.error(" FAILED: %s", e, exc_info=True) + + # Step 3: Print comparison + log.info("=" * 60) + log.info("STEP 3: FINAL COMPARISON") + log.info("=" * 60) + print_comparison(results) + + # Step 4: Generate report + log.info("=" * 60) + log.info("STEP 4: Generating report") + log.info("=" * 60) + try: + from generate_report import 
generate_all + generate_all() + log.info("Report generated in report/") + except Exception as e: + log.error("Report generation failed: %s", e) + + return results + + +def print_comparison(results: list[BenchmarkResult]) -> None: + """Print final comparison table.""" + if not results: + log.warning("No results to compare!") + return + + try: + from tabulate import tabulate + except ImportError: + for r in sorted(results, key=lambda x: x.macro_f1, reverse=True): + print(f" {r.model_name:20s} F1={r.macro_f1:.3f} BA={r.balanced_accuracy:.3f} " + f"Lat={r.p50_latency_ms:.0f}ms FI={r.false_interruption_rate*100:.1f}%") + return + + headers = [ + "Rank", "Model", "Macro-F1", "Bal.Acc", + "F1(shift)", "F1(hold)", "Prec(s)", "Rec(s)", + "Lat.p50", "Lat.p95", "FalseInt%", "MissShift%", + "GPU?", "ASR?", "Size(MB)" + ] + + sorted_results = sorted(results, key=lambda r: r.macro_f1, reverse=True) + rows = [] + for i, r in enumerate(sorted_results, 1): + rows.append([ + i, + r.model_name, + f"{r.macro_f1:.4f}", + f"{r.balanced_accuracy:.4f}", + f"{r.f1_shift:.4f}", + f"{r.f1_hold:.4f}", + f"{r.precision_shift:.4f}", + f"{r.recall_shift:.4f}", + f"{r.p50_latency_ms:.1f}ms", + f"{r.p95_latency_ms:.1f}ms", + f"{r.false_interruption_rate * 100:.1f}%", + f"{r.missed_shift_rate * 100:.1f}%", + "Yes" if r.requires_gpu else "No", + "Yes" if r.requires_asr else "No", + f"{r.model_size_mb:.0f}", + ]) + + print("\n" + "=" * 120) + print("TURN-TAKING BENCHMARK — PORTUGUESE AUDIO") + print("=" * 120) + print(tabulate(rows, headers=headers, tablefmt="grid")) + print() + + # Winner + best = sorted_results[0] + print(f"WINNER: {best.model_name}") + print(f" Macro-F1: {best.macro_f1:.4f}") + print(f" Balanced Accuracy: {best.balanced_accuracy:.4f}") + print(f" False Interruption Rate: {best.false_interruption_rate*100:.1f}%") + print(f" Latency (p50): {best.p50_latency_ms:.1f}ms") + print() + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s 
%(levelname)s [%(name)s] %(message)s", + handlers=[ + logging.StreamHandler(), + logging.FileHandler("benchmark_portuguese.log"), + ], + ) + run_all_benchmarks() diff --git a/previous-experiments/01-benchmarks/setup_dataset.py b/previous-experiments/01-benchmarks/setup_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..e1f6c9f406caa85b2f66cd15a6d8288ad6f0df4b --- /dev/null +++ b/previous-experiments/01-benchmarks/setup_dataset.py @@ -0,0 +1,311 @@ +""" +Download and prepare turn-taking evaluation datasets. + +Datasets used: +1. Switchboard (HuggingFace) - Two-speaker telephone conversations with timestamps +2. HCRC Map Task (Edinburgh) - Task-oriented dialogues with turn annotations + +References: +- Godfrey, J.J., Holliman, E.C., & McDaniel, J. (1992). SWITCHBOARD: Telephone speech + corpus for research and development. ICASSP-92. +- Anderson, A.H., et al. (1991). The HCRC Map Task Corpus. Language and Speech, 34(4). +""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import shutil +import urllib.request +import zipfile +from dataclasses import dataclass, field +from pathlib import Path + +import numpy as np +import soundfile as sf + +log = logging.getLogger(__name__) + +DATA_DIR = Path(__file__).parent / "data" +SWITCHBOARD_DIR = DATA_DIR / "switchboard" +MAPTASK_DIR = DATA_DIR / "maptask" +ANNOTATIONS_DIR = DATA_DIR / "annotations" + + +@dataclass +class TurnSegment: + """A single speaker turn with timing information.""" + speaker: str + start: float # seconds + end: float # seconds + text: str = "" + + @property + def duration(self) -> float: + return self.end - self.start + + +@dataclass +class Conversation: + """A conversation with turn-taking annotations.""" + conv_id: str + audio_path: str + sample_rate: int + duration: float # total duration in seconds + turns: list[TurnSegment] = field(default_factory=list) + # Derived labels + turn_shifts: list[float] = 
field(default_factory=list) # timestamps of speaker changes + holds: list[float] = field(default_factory=list) # timestamps where same speaker continues after pause + + +def download_switchboard_from_hf() -> list[Conversation]: + """Download Switchboard subset from HuggingFace datasets.""" + from datasets import load_dataset + + log.info("Downloading Switchboard from HuggingFace...") + SWITCHBOARD_DIR.mkdir(parents=True, exist_ok=True) + + # Use the Switchboard subset available on HF + try: + ds = load_dataset("hhoangphuoc/switchboard", split="train", streaming=True) + except Exception: + log.warning("HF Switchboard not available, trying alternative...") + ds = load_dataset("swda", split="train", streaming=True) + + conversations: list[Conversation] = [] + count = 0 + max_conversations = 200 # Limit for benchmark feasibility + + current_conv_id = None + current_turns: list[TurnSegment] = [] + + for sample in ds: + conv_id = str(sample.get("conversation_no", sample.get("conv_id", count))) + + if conv_id != current_conv_id: + if current_conv_id is not None and current_turns: + conv = _build_conversation_from_text(current_conv_id, current_turns) + if conv: + conversations.append(conv) + count += 1 + if count >= max_conversations: + break + + current_conv_id = conv_id + current_turns = [] + + speaker = sample.get("caller", sample.get("speaker", "A")) + text = sample.get("text", sample.get("utterance", "")) + if text: + current_turns.append(TurnSegment( + speaker=str(speaker), + start=0.0, # Will be estimated + end=0.0, + text=text.strip(), + )) + + # Save annotations + _save_annotations(conversations, "switchboard") + log.info("Downloaded %d Switchboard conversations", len(conversations)) + return conversations + + +def download_candor_sample() -> list[Conversation]: + """ + Download CANDOR corpus sample for turn-taking evaluation. + + Reference: + - Reece, A.G., et al. (2023). The CANDOR corpus: Insights from a large + multi-modal dataset of naturalistic conversation. 
Science Advances, 9(13). + """ + log.info("CANDOR corpus requires manual download from https://cadl.humlab.lu.se/candor/") + log.info("See: https://www.science.org/doi/10.1126/sciadv.adf3197") + return [] + + +def generate_synthetic_dataset( + n_conversations: int = 100, + min_turns: int = 10, + max_turns: int = 40, + sample_rate: int = 16000, +) -> list[Conversation]: + """ + Generate synthetic two-speaker conversations with ground-truth turn annotations. + + This provides a controlled baseline where we know exact turn boundaries. + Uses silence/noise segments between speakers to simulate realistic gaps/overlaps. + """ + log.info("Generating %d synthetic conversations...", n_conversations) + synth_dir = DATA_DIR / "synthetic" + synth_dir.mkdir(parents=True, exist_ok=True) + + conversations = [] + rng = np.random.default_rng(42) + + for i in range(n_conversations): + n_turns = rng.integers(min_turns, max_turns + 1) + turns = [] + t = 0.0 + speakers = ["A", "B"] + + for j in range(n_turns): + speaker = speakers[j % 2] + # Turn duration: 0.5 - 5.0 seconds + duration = rng.uniform(0.5, 5.0) + # Gap between turns: -0.3 (overlap) to 1.5 seconds + gap = rng.uniform(-0.3, 1.5) if j > 0 else 0.0 + + start = max(t + gap, t) # No negative starts + end = start + duration + + turns.append(TurnSegment( + speaker=speaker, + start=round(start, 3), + end=round(end, 3), + text=f"[synthetic turn {j}]", + )) + t = end + + total_duration = turns[-1].end + # Generate audio: sine waves at different frequencies per speaker + n_samples = int(total_duration * sample_rate) + audio = np.zeros(n_samples, dtype=np.float32) + + for turn in turns: + freq = 200.0 if turn.speaker == "A" else 350.0 + s = int(turn.start * sample_rate) + e = min(int(turn.end * sample_rate), n_samples) + t_arr = np.arange(e - s) / sample_rate + audio[s:e] = 0.3 * np.sin(2 * np.pi * freq * t_arr).astype(np.float32) + + # Add noise + audio += rng.normal(0, 0.01, n_samples).astype(np.float32) + + audio_path = synth_dir / 
f"synth_{i:04d}.wav" + sf.write(str(audio_path), audio, sample_rate) + + # Compute turn shifts and holds + turn_shifts = [] + holds = [] + for k in range(1, len(turns)): + if turns[k].speaker != turns[k - 1].speaker: + turn_shifts.append(turns[k].start) + else: + holds.append(turns[k].start) + + conversations.append(Conversation( + conv_id=f"synth_{i:04d}", + audio_path=str(audio_path), + sample_rate=sample_rate, + duration=total_duration, + turns=turns, + turn_shifts=turn_shifts, + holds=holds, + )) + + _save_annotations(conversations, "synthetic") + log.info("Generated %d synthetic conversations (%.1f hours)", + len(conversations), sum(c.duration for c in conversations) / 3600) + return conversations + + +def _build_conversation_from_text(conv_id: str, turns: list[TurnSegment]) -> Conversation | None: + """Build a Conversation from text-only turns by estimating timing.""" + if len(turns) < 3: + return None + + # Estimate timing: ~150ms per word + 200ms gap + t = 0.0 + for i, turn in enumerate(turns): + words = len(turn.text.split()) + duration = max(0.5, words * 0.15) + gap = 0.2 if i > 0 else 0.0 + turn.start = round(t + gap, 3) + turn.end = round(turn.start + duration, 3) + t = turn.end + + turn_shifts = [] + holds = [] + for k in range(1, len(turns)): + if turns[k].speaker != turns[k - 1].speaker: + turn_shifts.append(turns[k].start) + else: + holds.append(turns[k].start) + + return Conversation( + conv_id=conv_id, + audio_path="", # text-only + sample_rate=16000, + duration=turns[-1].end, + turns=turns, + turn_shifts=turn_shifts, + holds=holds, + ) + + +def _save_annotations(conversations: list[Conversation], name: str) -> None: + """Save conversation annotations to JSON for reproducibility.""" + ANNOTATIONS_DIR.mkdir(parents=True, exist_ok=True) + out = [] + for conv in conversations: + out.append({ + "conv_id": conv.conv_id, + "audio_path": conv.audio_path, + "sample_rate": conv.sample_rate, + "duration": conv.duration, + "n_turns": len(conv.turns), + 
"n_turn_shifts": len(conv.turn_shifts), + "n_holds": len(conv.holds), + "turns": [ + {"speaker": t.speaker, "start": t.start, "end": t.end, "text": t.text} + for t in conv.turns + ], + "turn_shifts": conv.turn_shifts, + "holds": conv.holds, + }) + + path = ANNOTATIONS_DIR / f"{name}_annotations.json" + with open(path, "w") as f: + json.dump(out, f, indent=2) + log.info("Saved %d annotations to %s", len(out), path) + + +def load_annotations(name: str) -> list[Conversation]: + """Load previously saved annotations.""" + path = ANNOTATIONS_DIR / f"{name}_annotations.json" + if not path.exists(): + raise FileNotFoundError(f"Annotations not found: {path}") + + with open(path) as f: + data = json.load(f) + + conversations = [] + for item in data: + turns = [TurnSegment(**t) for t in item["turns"]] + conversations.append(Conversation( + conv_id=item["conv_id"], + audio_path=item["audio_path"], + sample_rate=item["sample_rate"], + duration=item["duration"], + turns=turns, + turn_shifts=item["turn_shifts"], + holds=item["holds"], + )) + return conversations + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + parser = argparse.ArgumentParser(description="Download turn-taking datasets") + parser.add_argument("--dataset", choices=["switchboard", "synthetic", "all"], default="all") + parser.add_argument("--n-synthetic", type=int, default=100) + args = parser.parse_args() + + if args.dataset in ("synthetic", "all"): + generate_synthetic_dataset(n_conversations=args.n_synthetic) + + if args.dataset in ("switchboard", "all"): + download_switchboard_from_hf() diff --git a/previous-experiments/01-benchmarks/setup_nurc_dataset.py b/previous-experiments/01-benchmarks/setup_nurc_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..b3ef2240049c58c75ad80b4967e1efabf99cc961 --- /dev/null +++ b/previous-experiments/01-benchmarks/setup_nurc_dataset.py @@ -0,0 +1,235 @@ +""" +Prepare NURC-SP Corpus 
Minimo dialogues for turn-taking benchmark. + +Reconstructs continuous audio from segmented audio files and builds +Conversation objects with ground truth turn annotations. + +Dataset: NURC-SP Corpus Minimo (nilc-nlp/NURC-SP_Corpus_Minimo on HuggingFace) +- Real Brazilian Portuguese spontaneous dialogues from the 1970s-1990s +- Manually annotated speaker turns with timestamps +- CC BY-NC-ND 4.0 license +""" + +from __future__ import annotations + +import json +import logging +from pathlib import Path + +import numpy as np +import pandas as pd +import soundfile as sf + +from setup_portuguese_dataset import Conversation, TurnSegment + +log = logging.getLogger(__name__) + +DATA_DIR = Path(__file__).parent / "data" +SEGMENTS_DIR = DATA_DIR / "nurc_sp" / "segmented_audios" +NURC_DIR = DATA_DIR / "nurc_sp" +ANNOTATIONS_DIR = DATA_DIR / "annotations" + +# Only use multi-speaker dialogues (DID = diálogos, D2 = diálogos entre informantes) +DIALOGUE_PREFIXES = ("SP_DID_", "SP_D2_") + +TARGET_SR = 16000 + + +def load_nurc_metadata() -> pd.DataFrame: + """Load NURC-SP segment metadata.""" + from huggingface_hub import hf_hub_download + path = hf_hub_download( + "nilc-nlp/NURC-SP_Corpus_Minimo", + "segmented_audios_time.csv", + repo_type="dataset", + ) + return pd.read_csv(path) + + +def build_conversation(name: str, segments_df: pd.DataFrame) -> Conversation | None: + """Reconstruct continuous audio and build Conversation from segments.""" + # Sort segments by start_time + segments_df = segments_df.sort_values("start_time").reset_index(drop=True) + + # Filter to segments that have audio files + seg_dir = SEGMENTS_DIR / name + if not seg_dir.exists(): + log.warning("No audio directory for %s", name) + return None + + # Build timeline: read each segment's audio and place at correct offset + total_end = segments_df["end_time"].max() + total_start = segments_df["start_time"].min() + duration = total_end - total_start + + # Limit to first 5 minutes for benchmark speed + 
max_duration = 300.0 # 5 minutes + if duration > max_duration: + cutoff = total_start + max_duration + segments_df = segments_df[segments_df["start_time"] < cutoff].copy() + total_end = min(cutoff, segments_df["end_time"].max()) + duration = total_end - total_start + + n_samples = int(duration * TARGET_SR) + TARGET_SR # Extra second buffer + audio = np.zeros(n_samples, dtype=np.float32) + + turns = [] + loaded = 0 + skipped = 0 + + for _, row in segments_df.iterrows(): + # Find the audio file + start_str = f"{row['start_time']:.2f}" + end_str = f"{row['end_time']:.2f}" + pattern = f"{name}_seg_{start_str}_{end_str}.wav" + audio_path = seg_dir / pattern + + if not audio_path.exists(): + # Try matching with different decimal precision + candidates = list(seg_dir.glob(f"{name}_seg_{start_str[:5]}*_{end_str[:5]}*.wav")) + if candidates: + audio_path = candidates[0] + else: + skipped += 1 + continue + + try: + seg_audio, sr = sf.read(str(audio_path)) + except Exception: + skipped += 1 + continue + + if seg_audio.ndim > 1: + seg_audio = seg_audio.mean(axis=1) + seg_audio = seg_audio.astype(np.float32) + + # Resample if needed + if sr != TARGET_SR: + import torchaudio + import torch + tensor = torch.from_numpy(seg_audio).float().unsqueeze(0) + resampler = torchaudio.transforms.Resample(sr, TARGET_SR) + tensor = resampler(tensor) + seg_audio = tensor.squeeze().numpy() + + # Place in timeline + offset = row["start_time"] - total_start + start_idx = int(offset * TARGET_SR) + end_idx = start_idx + len(seg_audio) + + if end_idx > len(audio): + seg_audio = seg_audio[:len(audio) - start_idx] + end_idx = len(audio) + + if start_idx < len(audio) and len(seg_audio) > 0: + audio[start_idx:start_idx + len(seg_audio)] = seg_audio + loaded += 1 + + # Create turn segment + turns.append(TurnSegment( + speaker=row["speaker"], + start=round(offset, 3), + end=round(offset + (row["end_time"] - row["start_time"]), 3), + text=str(row.get("normalized_text", "")), + )) + + if loaded < 5: + 
log.warning("Only loaded %d/%d segments for %s, skipping", loaded, loaded + skipped, name) + return None + + # Trim audio to actual content + actual_end = max(t.end for t in turns) if turns else 0 + n_samples = int(actual_end * TARGET_SR) + TARGET_SR + audio = audio[:n_samples] + + # Normalize + peak = np.max(np.abs(audio)) + if peak > 0: + audio = audio / peak * 0.9 + + # Save reconstructed audio + NURC_DIR.mkdir(parents=True, exist_ok=True) + audio_path = NURC_DIR / f"{name}.wav" + sf.write(str(audio_path), audio, TARGET_SR) + + # Compute events + turn_shifts = [] + holds = [] + for k in range(1, len(turns)): + if turns[k].speaker != turns[k - 1].speaker: + turn_shifts.append(turns[k].start) + else: + holds.append(turns[k].start) + + log.info(" %s: %d turns (%d loaded, %d skipped), %d shifts, %d holds, %.0fs", + name, len(turns), loaded, skipped, len(turn_shifts), len(holds), actual_end) + + return Conversation( + conv_id=name, + audio_path=str(audio_path), + sample_rate=TARGET_SR, + duration=actual_end, + turns=turns, + turn_shifts=turn_shifts, + holds=holds, + ) + + +def prepare_nurc_dataset(conversation_names: list[str] | None = None) -> list[Conversation]: + """Prepare NURC-SP conversations for benchmarking.""" + df = load_nurc_metadata() + + # Filter to dialogues only + dialogue_df = df[df["name"].str.startswith(DIALOGUE_PREFIXES)] + available_names = sorted(dialogue_df["name"].unique()) + + if conversation_names: + names = [n for n in conversation_names if n in available_names] + else: + names = available_names + + log.info("Preparing %d NURC-SP conversations...", len(names)) + + conversations = [] + for name in names: + conv_df = dialogue_df[dialogue_df["name"] == name] + conv = build_conversation(name, conv_df) + if conv: + conversations.append(conv) + + # Save annotations + ANNOTATIONS_DIR.mkdir(parents=True, exist_ok=True) + ann_data = [] + for conv in conversations: + ann_data.append({ + "conv_id": conv.conv_id, + "audio_path": conv.audio_path, + 
"sample_rate": conv.sample_rate, + "duration": conv.duration, + "n_turns": len(conv.turns), + "n_turn_shifts": len(conv.turn_shifts), + "n_holds": len(conv.holds), + "turns": [ + {"speaker": t.speaker, "start": t.start, "end": t.end, "text": t.text} + for t in conv.turns + ], + "turn_shifts": conv.turn_shifts, + "holds": conv.holds, + }) + + ann_path = ANNOTATIONS_DIR / "nurc_sp_annotations.json" + with open(ann_path, "w") as f: + json.dump(ann_data, f, indent=2, ensure_ascii=False) + + total_hours = sum(c.duration for c in conversations) / 3600 + total_shifts = sum(len(c.turn_shifts) for c in conversations) + total_holds = sum(len(c.holds) for c in conversations) + log.info("Prepared %d conversations: %.1f min, %d shifts, %d holds", + len(conversations), total_hours * 60, total_shifts, total_holds) + + return conversations + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + prepare_nurc_dataset() diff --git a/previous-experiments/01-benchmarks/setup_portuguese_dataset.py b/previous-experiments/01-benchmarks/setup_portuguese_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..becf51b54836ae1a58108eaf350262479b1da52a --- /dev/null +++ b/previous-experiments/01-benchmarks/setup_portuguese_dataset.py @@ -0,0 +1,438 @@ +""" +Download and prepare Portuguese conversation datasets for turn-taking evaluation. + +Datasets: +1. NURC-SP / CORAL-BRASIL — Brazilian Portuguese spontaneous dialogue +2. Common Voice PT — Mozilla, single speaker (for baseline audio) +3. Synthetic Portuguese — generated with controlled turn timing + +References: +- Castilho, A.T. (2019). NURC-SP Audio Corpus. 239h of transcribed + Brazilian Portuguese dialogues. +- ASR-BPCSC: Brazilian Portuguese Conversational Speech Corpus. + 10h transcribed conversational speech, 30 conversations. 
+""" + +from __future__ import annotations + +import json +import logging +import os +from dataclasses import dataclass, field +from pathlib import Path + +import numpy as np +import soundfile as sf + +log = logging.getLogger(__name__) + +DATA_DIR = Path(__file__).parent / "data" +PT_DIR = DATA_DIR / "portuguese" +ANNOTATIONS_DIR = DATA_DIR / "annotations" + + +@dataclass +class TurnSegment: + speaker: str + start: float + end: float + text: str = "" + + @property + def duration(self) -> float: + return self.end - self.start + + +@dataclass +class Conversation: + conv_id: str + audio_path: str + sample_rate: int + duration: float + turns: list[TurnSegment] = field(default_factory=list) + turn_shifts: list[float] = field(default_factory=list) + holds: list[float] = field(default_factory=list) + + +def download_common_voice_pt_dialogues(max_pairs: int = 50) -> list[Conversation]: + """ + Download Common Voice Portuguese and create synthetic dialogues + by concatenating different speakers' utterances. + """ + from datasets import load_dataset + + log.info("Downloading Common Voice Portuguese samples...") + PT_DIR.mkdir(parents=True, exist_ok=True) + + try: + ds = load_dataset( + "mozilla-foundation/common_voice_17_0", + "pt", + split="train", + streaming=True, + trust_remote_code=True, + ) + except Exception as e: + log.warning("Common Voice requires login. 
Trying alternative: %s", e) + return [] + + # Collect samples from different speakers + speaker_samples: dict[str, list] = {} + count = 0 + for sample in ds: + client_id = sample.get("client_id", str(count)) + if client_id not in speaker_samples: + speaker_samples[client_id] = [] + if len(speaker_samples[client_id]) < 10: + speaker_samples[client_id].append({ + "audio": sample["audio"]["array"], + "sr": sample["audio"]["sampling_rate"], + "text": sample.get("sentence", ""), + }) + count += 1 + if len(speaker_samples) >= 20 and all(len(v) >= 3 for v in speaker_samples.values()): + break + if count > 5000: + break + + # Create dialogue pairs + conversations = _create_dialogues_from_speakers(speaker_samples, max_pairs) + _save_annotations(conversations, "portuguese_cv") + return conversations + + +def generate_portuguese_synthetic( + n_conversations: int = 100, + min_turns: int = 8, + max_turns: int = 30, + sample_rate: int = 16000, +) -> list[Conversation]: + """ + Generate synthetic Portuguese dialogues with precise turn annotations. 
+ + Simulates realistic Portuguese conversation patterns: + - Average turn duration: 1.5-4s (Portuguese speakers tend to have longer turns) + - Inter-turn gap: median ~200ms (typical for Portuguese) + - Overlap rate: ~15% of turns (Portuguese has more overlap than English) + """ + log.info("Generating %d synthetic Portuguese conversations...", n_conversations) + synth_dir = PT_DIR / "synthetic" + synth_dir.mkdir(parents=True, exist_ok=True) + + conversations = [] + rng = np.random.default_rng(42) + + # Portuguese conversation timing parameters + # Based on NURC-SP and C-ORAL-BRASIL studies + turn_duration_mean = 2.5 # seconds + turn_duration_std = 1.2 + gap_mean = 0.2 # seconds (Portuguese has shorter gaps) + gap_std = 0.4 + overlap_prob = 0.15 # 15% overlap rate + + portuguese_phrases = [ + "Olha, eu acho que isso faz sentido", + "Pois é, mas tem outro ponto importante", + "Concordo plenamente com você", + "Não sei se entendi bem", + "Vamos ver como funciona na prática", + "Isso é interessante, mas...", + "Exatamente, é isso mesmo", + "Deixa eu pensar um pouco", + "Bom, na minha opinião", + "Então, o que você acha?", + "Sim, sim, com certeza", + "Espera, deixa eu falar", + "Tá bom, entendi", + "Mas olha só uma coisa", + "É verdade, faz sentido", + "Ah, interessante", + "Hmm, não tenho certeza", + "Pode ser, pode ser", + "Legal, vamos continuar", + "Enfim, voltando ao assunto", + ] + + for i in range(n_conversations): + n_turns = rng.integers(min_turns, max_turns + 1) + turns = [] + t = 0.0 + speakers = ["A", "B"] + + hold_prob = 0.25 # 25% chance same speaker continues (hold) + prev_speaker = None + for j in range(n_turns): + if prev_speaker is None or rng.random() >= hold_prob: + speaker = speakers[j % 2] # Normal alternation + else: + speaker = prev_speaker # Same speaker continues (hold) + + # Turn duration with Portuguese distribution + duration = max(0.4, rng.normal(turn_duration_mean, turn_duration_std)) + + # Gap (can be negative for overlap) + if j > 0: + 
if rng.random() < overlap_prob: + gap = rng.uniform(-0.5, -0.05) # Overlap + else: + gap = max(0.05, rng.normal(gap_mean, gap_std)) + else: + gap = 0.0 + + start = max(t + gap, 0.0) + end = start + duration + text = rng.choice(portuguese_phrases) + + turns.append(TurnSegment( + speaker=speaker, + start=round(start, 3), + end=round(end, 3), + text=text, + )) + prev_speaker = speaker + t = end + + total_duration = turns[-1].end + + # Generate stereo speech-like audio (ch0=speaker A, ch1=speaker B) + # Uses filtered noise + harmonics to simulate speech formants + n_samples = int(total_duration * sample_rate) + audio_a = np.zeros(n_samples, dtype=np.float32) + audio_b = np.zeros(n_samples, dtype=np.float32) + + for turn in turns: + f0 = 130.0 if turn.speaker == "A" else 200.0 + s = int(turn.start * sample_rate) + e = min(int(turn.end * sample_rate), n_samples) + dur = e - s + if dur <= 0: + continue + + t_arr = np.arange(dur) / sample_rate + + # Glottal pulse train (harmonics simulate voiced speech) + harmonics = np.zeros(dur, dtype=np.float32) + for h in range(1, 8): + amp = 0.3 / h # Falling spectral envelope + jitter = rng.uniform(0.98, 1.02) # Pitch jitter + harmonics += amp * np.sin(2 * np.pi * f0 * h * jitter * t_arr).astype(np.float32) + + # Aspiration noise (unvoiced component) + noise = rng.normal(0, 0.08, dur).astype(np.float32) + + # Formant-like bandpass: weight low freqs more (speech is 300-3000Hz) + from scipy.signal import butter, lfilter + b_low, a_low = butter(2, [200, 3500], btype='band', fs=sample_rate) + noise_filtered = lfilter(b_low, a_low, noise).astype(np.float32) + + signal = harmonics * 0.7 + noise_filtered * 0.3 + + # Amplitude modulation (syllable rhythm ~4-5Hz for Portuguese) + syllable_rate = rng.uniform(4.0, 5.5) + modulation = 0.6 + 0.4 * np.sin(2 * np.pi * syllable_rate * t_arr).astype(np.float32) + signal *= modulation + + # Envelope with natural attack/release + envelope = np.ones(dur, dtype=np.float32) + attack = min(int(0.03 * 
sample_rate), dur // 4) + release = min(int(0.06 * sample_rate), dur // 4) + if attack > 0: + envelope[:attack] = np.linspace(0, 1, attack).astype(np.float32) + if release > 0: + envelope[-release:] = np.linspace(1, 0, release).astype(np.float32) + + target = audio_a if turn.speaker == "A" else audio_b + target[s:e] += signal * envelope + + # Low ambient noise on both channels + audio_a += rng.normal(0, 0.003, n_samples).astype(np.float32) + audio_b += rng.normal(0, 0.003, n_samples).astype(np.float32) + audio_a = np.clip(audio_a, -1.0, 1.0) + audio_b = np.clip(audio_b, -1.0, 1.0) + # Also save mono mix for models that expect mono + audio = (audio_a + audio_b) / 2.0 + + # Save stereo (for VAP) and mono (for VAD/silence) + stereo = np.stack([audio_a, audio_b], axis=-1) # (samples, 2) + audio_path_stereo = synth_dir / f"pt_synth_{i:04d}_stereo.wav" + sf.write(str(audio_path_stereo), stereo, sample_rate) + + audio_path = synth_dir / f"pt_synth_{i:04d}.wav" + sf.write(str(audio_path), audio, sample_rate) + + # Compute turn events + turn_shifts = [] + holds = [] + for k in range(1, len(turns)): + if turns[k].speaker != turns[k - 1].speaker: + turn_shifts.append(turns[k].start) + else: + holds.append(turns[k].start) + + conversations.append(Conversation( + conv_id=f"pt_synth_{i:04d}", + audio_path=str(audio_path), + sample_rate=sample_rate, + duration=total_duration, + turns=turns, + turn_shifts=turn_shifts, + holds=holds, + )) + + _save_annotations(conversations, "portuguese_synthetic") + total_hours = sum(c.duration for c in conversations) / 3600 + log.info("Generated %d Portuguese conversations (%.1f hours)", len(conversations), total_hours) + return conversations + + +def _create_dialogues_from_speakers( + speaker_samples: dict[str, list], + max_pairs: int, +) -> list[Conversation]: + """Create dialogues by interleaving samples from different speakers.""" + conversations = [] + speakers = list(speaker_samples.keys()) + rng = np.random.default_rng(123) + + for 
pair_idx in range(min(max_pairs, len(speakers) // 2)): + sp_a = speakers[pair_idx * 2] + sp_b = speakers[pair_idx * 2 + 1] + samples_a = speaker_samples[sp_a] + samples_b = speaker_samples[sp_b] + + turns = [] + audio_chunks = [] + t = 0.0 + target_sr = 16000 + + n_turns = min(len(samples_a) + len(samples_b), 10) + for j in range(n_turns): + if j % 2 == 0 and samples_a: + sample = samples_a.pop(0) + speaker = "A" + elif samples_b: + sample = samples_b.pop(0) + speaker = "B" + else: + break + + audio = np.array(sample["audio"], dtype=np.float32) + sr = sample["sr"] + + # Resample if needed + if sr != target_sr: + import torchaudio + import torch + tensor = torch.from_numpy(audio).float().unsqueeze(0) + resampler = torchaudio.transforms.Resample(sr, target_sr) + tensor = resampler(tensor) + audio = tensor.squeeze().numpy() + + duration = len(audio) / target_sr + gap = rng.uniform(0.1, 0.5) if j > 0 else 0.0 + + # Add gap silence + if gap > 0: + audio_chunks.append(np.zeros(int(gap * target_sr), dtype=np.float32)) + + start = t + gap + end = start + duration + + turns.append(TurnSegment( + speaker=speaker, + start=round(start, 3), + end=round(end, 3), + text=sample.get("text", ""), + )) + audio_chunks.append(audio) + t = end + + if len(turns) < 3: + continue + + # Concatenate audio + full_audio = np.concatenate(audio_chunks) + audio_path = PT_DIR / f"cv_dialogue_{pair_idx:04d}.wav" + sf.write(str(audio_path), full_audio, target_sr) + + turn_shifts = [] + holds = [] + for k in range(1, len(turns)): + if turns[k].speaker != turns[k - 1].speaker: + turn_shifts.append(turns[k].start) + else: + holds.append(turns[k].start) + + conversations.append(Conversation( + conv_id=f"cv_dialogue_{pair_idx:04d}", + audio_path=str(audio_path), + sample_rate=target_sr, + duration=turns[-1].end, + turns=turns, + turn_shifts=turn_shifts, + holds=holds, + )) + + return conversations + + +def _save_annotations(conversations: list[Conversation], name: str) -> None: + 
ANNOTATIONS_DIR.mkdir(parents=True, exist_ok=True) + out = [] + for conv in conversations: + out.append({ + "conv_id": conv.conv_id, + "audio_path": conv.audio_path, + "sample_rate": conv.sample_rate, + "duration": conv.duration, + "n_turns": len(conv.turns), + "n_turn_shifts": len(conv.turn_shifts), + "n_holds": len(conv.holds), + "turns": [ + {"speaker": t.speaker, "start": t.start, "end": t.end, "text": t.text} + for t in conv.turns + ], + "turn_shifts": conv.turn_shifts, + "holds": conv.holds, + }) + path = ANNOTATIONS_DIR / f"{name}_annotations.json" + with open(path, "w") as f: + json.dump(out, f, indent=2, ensure_ascii=False) + log.info("Saved %d annotations to %s", len(out), path) + + +def load_annotations(name: str) -> list[Conversation]: + path = ANNOTATIONS_DIR / f"{name}_annotations.json" + if not path.exists(): + raise FileNotFoundError(f"Annotations not found: {path}") + with open(path) as f: + data = json.load(f) + conversations = [] + for item in data: + turns = [TurnSegment(**t) for t in item["turns"]] + conversations.append(Conversation( + conv_id=item["conv_id"], + audio_path=item["audio_path"], + sample_rate=item["sample_rate"], + duration=item["duration"], + turns=turns, + turn_shifts=item["turn_shifts"], + holds=item["holds"], + )) + return conversations + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--dataset", choices=["synthetic", "common_voice", "all"], default="all") + parser.add_argument("--n-synthetic", type=int, default=100) + args = parser.parse_args() + + if args.dataset in ("synthetic", "all"): + generate_portuguese_synthetic(n_conversations=args.n_synthetic) + + if args.dataset in ("common_voice", "all"): + download_common_voice_pt_dialogues() diff --git a/previous-experiments/02-finetune-scratch/Dockerfile b/previous-experiments/02-finetune-scratch/Dockerfile new file mode 100644 index 
0000000000000000000000000000000000000000..2aef9b903690b4c347cf3e19cf928cb4ac6a3c0e --- /dev/null +++ b/previous-experiments/02-finetune-scratch/Dockerfile @@ -0,0 +1,34 @@ +FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime + +WORKDIR /workspace/turn-taking-study + +# System deps +RUN apt-get update && apt-get install -y --no-install-recommends \ + git ffmpeg libsndfile1 curl && \ + rm -rf /var/lib/apt/lists/* + +# Clone benchmark repo +RUN git clone https://github.com/marcosremar/turn-taking-study.git /workspace/turn-taking-study + +# Python deps +RUN pip install --no-cache-dir -r requirements.txt + +# Clone VAP repo +RUN git clone https://github.com/ErikEkstedt/VoiceActivityProjection.git /workspace/vap && \ + cd /workspace/vap && pip install -e . + +# Clone VAP dataset tools +RUN git clone https://github.com/ErikEkstedt/vap_dataset.git /workspace/vap_dataset && \ + cd /workspace/vap_dataset && pip install -e . + +# Copy any local overrides (optional, for dev) +COPY . . + +# Download models on build (cache in image) +RUN python -c "from huggingface_hub import snapshot_download; snapshot_download('livekit/turn-detector')" +RUN python -c "import torch; torch.hub.load('snakers4/silero-vad', 'silero_vad', force_reload=True)" + +ENV PYTHONUNBUFFERED=1 +ENV HF_HUB_CACHE=/workspace/hf_cache + +CMD ["python", "run_benchmarks.py", "--all"] diff --git a/previous-experiments/02-finetune-scratch/deploy_finetune.py b/previous-experiments/02-finetune-scratch/deploy_finetune.py new file mode 100644 index 0000000000000000000000000000000000000000..aa03f5b230bae514233394531113a66cae98e506 --- /dev/null +++ b/previous-experiments/02-finetune-scratch/deploy_finetune.py @@ -0,0 +1,126 @@ +"""Deploy fine-tuning pod via the AI Gateway. + +Strategy: Use RunPod's PyTorch template image (has SSH + PyTorch pre-installed). +Write the training script directly from Python (no base64 env var size limits). 
+""" +import json +import urllib.request +from pathlib import Path + +GATEWAY_URL = "http://localhost:4000/v1/gpu/deploy" + +# Read the training script content +script_path = Path(__file__).parent / "finetune_smart_turn_v3.py" +script_content = script_path.read_text() + +# Escape for embedding in a Python string inside shell +# We'll write it from Python using a heredoc-style approach +script_lines = script_content.replace("\\", "\\\\").replace("'", "'\\''") + +# Docker start command — writes script from shell heredoc, starts health server + training +docker_cmd = r"""bash -c ' +set -ex +echo "[finetune] Starting at $(date)" + +export PIP_CACHE_DIR=/workspace/.pip_cache +export HF_HOME=/workspace/huggingface +export TRANSFORMERS_CACHE=/workspace/huggingface +export TMPDIR=/workspace/tmp +mkdir -p $PIP_CACHE_DIR $HF_HOME $TMPDIR /workspace/checkpoints + +# Health server + log tail on :8000 (gateway expects "status":"ok") +python3 << "HEALTH_EOF" & +from http.server import HTTPServer, BaseHTTPRequestHandler +import json, os, subprocess + +class H(BaseHTTPRequestHandler): + def do_GET(self): + self.send_response(200) + self.end_headers() + ckpt = os.path.exists("/workspace/checkpoints/smart_turn_pt_v3/resume_checkpoint.pt") + done = os.path.exists("/workspace/checkpoints/smart_turn_pt_v3/training_results.json") + log_tail = "" + if self.path == "/logs": + try: + log_tail = subprocess.check_output(["tail", "-100", "/workspace/training.log"], stderr=subprocess.DEVNULL).decode() + except: pass + self.wfile.write(log_tail.encode()) + return + self.wfile.write(json.dumps({"status": "ok", "training": True, "has_checkpoint": ckpt, "done": done}).encode()) + def log_message(self, *a): pass + +HTTPServer(("0.0.0.0", 8000), H).serve_forever() +HEALTH_EOF + +echo "[finetune] Health server started on :8000" + +# Install system deps (FFmpeg for audio decoding) +echo "[finetune] Installing system deps..." 
+apt-get update -qq && apt-get install -y -qq ffmpeg libsndfile1 > /dev/null 2>&1 +echo "[finetune] Installing Python deps..." +pip install --quiet 'transformers' 'datasets>=2.18,<3.0' torchaudio soundfile librosa 2>&1 | tail -5 +echo "[finetune] Deps installed" + +# Write training script (passed via TRAINING_SCRIPT env var, gzip+base64) +echo "[finetune] Decoding training script..." +echo "$TRAINING_SCRIPT_GZ" | base64 -d | gunzip > /workspace/finetune_smart_turn_v3.py +ls -la /workspace/finetune_smart_turn_v3.py + +# Check for existing checkpoint +if [ -f /workspace/checkpoints/smart_turn_pt_v3/resume_checkpoint.pt ]; then + echo "[finetune] Found resume checkpoint — continuing training" +fi + +# Run training +echo "[finetune] Starting training..." +cd /workspace +python3 finetune_smart_turn_v3.py 2>&1 | tee /workspace/training.log + +echo "[finetune] Training complete at $(date)" +sleep infinity +'""" + +# Compress script with gzip before base64 to reduce size +import gzip +import base64 +script_gz = gzip.compress(script_content.encode()) +script_gz_b64 = base64.b64encode(script_gz).decode() + +body = { + "provider": "tensordock", + "dockerImage": "pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime", + "gpuTypes": [ + "NVIDIA A40", + "NVIDIA GeForce RTX 4090", + "NVIDIA RTX A6000", + "NVIDIA GeForce RTX 3090", + ], + "containerDiskInGb": 30, + "dockerStartCmd": docker_cmd, + "env": { + "TRAINING_SCRIPT_GZ": script_gz_b64, + }, +} + +print(f"Deploying fine-tuning pod...") +print(f" Script size: {len(script_gz_b64)} bytes (gzip+base64, original {len(script_content)} bytes)") +print(f" Docker image: {body['dockerImage']}") +print(f" GPU types: {body['gpuTypes']}") +print(f" Container disk: {body['containerDiskInGb']} GB") + +req = urllib.request.Request( + GATEWAY_URL, + data=json.dumps(body).encode(), + headers={"Content-Type": "application/json"}, + method="POST", +) + +try: + with urllib.request.urlopen(req, timeout=30) as resp: + result = json.loads(resp.read()) + 
print(f"\nDeploy response: {json.dumps(result, indent=2)}") +except urllib.error.HTTPError as e: + error_body = e.read().decode() + print(f"\nDeploy failed (HTTP {e.code}): {error_body}") +except Exception as e: + print(f"\nDeploy failed: {e}") diff --git a/previous-experiments/02-finetune-scratch/deploy_vast.py b/previous-experiments/02-finetune-scratch/deploy_vast.py new file mode 100644 index 0000000000000000000000000000000000000000..6dd48e1d2ccca6830098b71e1830da8c78e0bd50 --- /dev/null +++ b/previous-experiments/02-finetune-scratch/deploy_vast.py @@ -0,0 +1,304 @@ +""" +Deploy turn-taking benchmarks on Vast.ai GPU machines. + +Uses BabelCast's existing Vast.ai infrastructure to provision machines, +upload benchmark code, run experiments, and collect results. + +Usage: + python deploy_vast.py --build # Build and push Docker image + python deploy_vast.py --deploy # Deploy on Vast.ai + python deploy_vast.py --run # Run benchmarks on deployed machine + python deploy_vast.py --collect # Collect results + python deploy_vast.py --cleanup # Terminate instances + python deploy_vast.py --all # Do everything +""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import subprocess +import sys +import time +import urllib.error +import urllib.request +from pathlib import Path + +log = logging.getLogger(__name__) + +STUDY_DIR = Path(__file__).parent +DOCKER_IMAGE = "marcosremar/babelcast-turn-taking-study:latest" + +# Vast.ai API +VAST_API_BASE = "https://console.vast.ai/api/v0" + + +def get_vast_api_key() -> str: + """Get Vast.ai API key from environment.""" + key = os.environ.get("VAST_API_KEY", "") + if not key: + env_path = STUDY_DIR.parent.parent / ".env" + if env_path.exists(): + for line in env_path.read_text().splitlines(): + if line.startswith("VAST_API_KEY="): + key = line.split("=", 1)[1].strip().strip('"').strip("'") + break + if not key: + raise RuntimeError("VAST_API_KEY not found in environment or .env") + return 
key + + +def vast_api(method: str, endpoint: str, data: dict | None = None) -> dict: + """Make a Vast.ai API call.""" + api_key = get_vast_api_key() + url = f"{VAST_API_BASE}/{endpoint}" + + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + } + + body = json.dumps(data).encode() if data else None + req = urllib.request.Request(url, data=body, headers=headers, method=method) + + try: + with urllib.request.urlopen(req, timeout=30) as resp: + return json.loads(resp.read().decode()) + except urllib.error.HTTPError as e: + error_body = e.read().decode() if e.fp else "" + log.error("Vast.ai API error %d: %s", e.code, error_body) + raise + + +def build_docker_image() -> None: + """Build and push Docker image for benchmarks.""" + log.info("Building Docker image: %s", DOCKER_IMAGE) + + subprocess.run( + ["docker", "build", "-t", DOCKER_IMAGE, "-f", str(STUDY_DIR / "Dockerfile"), str(STUDY_DIR)], + check=True, + ) + log.info("Pushing Docker image...") + subprocess.run(["docker", "push", DOCKER_IMAGE], check=True) + log.info("Image pushed: %s", DOCKER_IMAGE) + + +def find_gpu_offer(gpu_type: str = "RTX A6000", min_ram_gb: int = 16) -> dict | None: + """Find a suitable Vast.ai GPU offer.""" + log.info("Searching for %s with >= %dGB RAM...", gpu_type, min_ram_gb) + + # Search for offers + result = vast_api("GET", f"bundles?q={{\"gpu_name\":\"{gpu_type}\",\"gpu_ram\":{{\">=\":{min_ram_gb}}},\"rentable\":{{\"eq\":true}},\"order\":[[\"dph_total\",\"asc\"]],\"type\":\"on-demand\"}}") + + offers = result.get("offers", []) + if not offers: + log.warning("No %s offers found, trying RTX 4090...", gpu_type) + result = vast_api("GET", "bundles?q={\"gpu_name\":\"RTX 4090\",\"rentable\":{\"eq\":true},\"order\":[[\"dph_total\",\"asc\"]],\"type\":\"on-demand\"}") + offers = result.get("offers", []) + + if offers: + offer = offers[0] + log.info("Found: %s @ $%.3f/hr (ID: %s)", offer.get("gpu_name"), offer.get("dph_total", 0), offer.get("id")) + return 
offer + + return None + + +def deploy_instance(offer_id: int) -> dict: + """Deploy a Vast.ai instance with the benchmark Docker image.""" + log.info("Deploying instance on offer %s...", offer_id) + + result = vast_api("PUT", f"asks/{offer_id}/", data={ + "client_id": "me", + "image": DOCKER_IMAGE, + "disk": 30, # GB + "onstart": "cd /workspace/turn-taking-study && python run_benchmarks.py --all 2>&1 | tee /workspace/benchmark.log", + "runtype": "args", + "env": { + "HF_HUB_CACHE": "/workspace/hf_cache", + "PYTHONUNBUFFERED": "1", + }, + }) + + instance_id = result.get("new_contract") + log.info("Instance deployed: %s", instance_id) + return result + + +def wait_for_instance(instance_id: int, timeout_min: int = 15) -> dict: + """Wait for instance to be ready.""" + log.info("Waiting for instance %s to be ready...", instance_id) + deadline = time.time() + timeout_min * 60 + poll_interval = 10 + + while time.time() < deadline: + result = vast_api("GET", f"instances/{instance_id}/") + status = result.get("actual_status", "unknown") + log.info("Instance %s status: %s", instance_id, status) + + if status == "running": + return result + if status in ("error", "exited"): + raise RuntimeError(f"Instance failed with status: {status}") + + time.sleep(poll_interval) + poll_interval = min(poll_interval * 1.3, 30) + + raise TimeoutError(f"Instance {instance_id} did not become ready in {timeout_min}min") + + +def collect_results(instance_id: int) -> dict: + """Download benchmark results from instance.""" + log.info("Collecting results from instance %s...", instance_id) + + instance = vast_api("GET", f"instances/{instance_id}/") + ssh_host = instance.get("ssh_host", "") + ssh_port = instance.get("ssh_port", 22) + + if not ssh_host: + log.error("No SSH access available for instance %s", instance_id) + return {} + + results_dir = STUDY_DIR / "results" + results_dir.mkdir(parents=True, exist_ok=True) + + # Download results via SCP + subprocess.run([ + "scp", "-P", str(ssh_port), "-o", 
"StrictHostKeyChecking=no", + f"root@{ssh_host}:/workspace/turn-taking-study/results/*.json", + str(results_dir), + ], check=False) + + # Download log + subprocess.run([ + "scp", "-P", str(ssh_port), "-o", "StrictHostKeyChecking=no", + f"root@{ssh_host}:/workspace/benchmark.log", + str(STUDY_DIR / "benchmark.log"), + ], check=False) + + log.info("Results collected in %s", results_dir) + return {"results_dir": str(results_dir)} + + +def cleanup_instance(instance_id: int) -> None: + """Terminate a Vast.ai instance.""" + log.info("Terminating instance %s...", instance_id) + vast_api("DELETE", f"instances/{instance_id}/") + log.info("Instance %s terminated", instance_id) + + +def deploy_via_gateway() -> dict | None: + """ + Alternative: Deploy via BabelCast gateway (uses existing Vast.ai integration). + Requires gateway running on localhost:4000. + """ + import urllib.request + import json + + body = json.dumps({ + "dockerImage": DOCKER_IMAGE, + "gpuTypes": ["NVIDIA RTX A6000"], + }).encode() + + req = urllib.request.Request( + "http://localhost:4000/v1/gpu/deploy", + data=body, + headers={"Content-Type": "application/json"}, + method="POST", + ) + + try: + with urllib.request.urlopen(req, timeout=30) as resp: + result = json.loads(resp.read().decode()) + log.info("Deployed via gateway: %s", result) + return result + except Exception as e: + log.warning("Gateway deploy failed: %s — falling back to direct Vast.ai API", e) + return None + + +def run_all(gpu_type: str = "RTX A6000") -> None: + """Run the full benchmark pipeline.""" + state_file = STUDY_DIR / ".deploy_state.json" + + # Step 1: Build + log.info("=== Step 1: Build Docker Image ===") + build_docker_image() + + # Step 2: Deploy + log.info("=== Step 2: Deploy on Vast.ai ===") + offer = find_gpu_offer(gpu_type) + if not offer: + raise RuntimeError("No GPU offers available") + + result = deploy_instance(offer["id"]) + instance_id = result.get("new_contract") + + # Save state + with open(state_file, "w") as f: + 
json.dump({"instance_id": instance_id, "offer": offer}, f, indent=2) + + # Step 3: Wait + log.info("=== Step 3: Wait for Instance ===") + instance = wait_for_instance(instance_id) + + # Step 4: Wait for benchmarks to complete + log.info("=== Step 4: Waiting for benchmarks (check logs) ===") + log.info("Monitor with: vast logs %s", instance_id) + log.info("Benchmarks typically take 20-40 minutes depending on GPU") + + # Poll for completion + for _ in range(60): # Up to 60 minutes + time.sleep(60) + try: + inst = vast_api("GET", f"instances/{instance_id}/") + if inst.get("actual_status") == "exited": + log.info("Benchmarks completed!") + break + except Exception: + continue + + # Step 5: Collect + log.info("=== Step 5: Collect Results ===") + collect_results(instance_id) + + # Step 6: Cleanup + log.info("=== Step 6: Cleanup ===") + cleanup_instance(instance_id) + + if state_file.exists(): + state_file.unlink() + + log.info("=== Done! Results in %s ===", STUDY_DIR / "results") + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s") + + parser = argparse.ArgumentParser(description="Deploy turn-taking benchmarks on Vast.ai") + parser.add_argument("--build", action="store_true", help="Build and push Docker image") + parser.add_argument("--deploy", action="store_true", help="Deploy instance") + parser.add_argument("--run", action="store_true", help="Run benchmarks on deployed instance") + parser.add_argument("--collect", action="store_true", help="Collect results") + parser.add_argument("--cleanup", action="store_true", help="Terminate instance") + parser.add_argument("--all", action="store_true", help="Run full pipeline") + parser.add_argument("--gpu", default="RTX A6000", help="GPU type (default: RTX A6000)") + parser.add_argument("--instance-id", type=int, help="Instance ID for collect/cleanup") + args = parser.parse_args() + + if args.all: + run_all(args.gpu) + elif args.build: + build_docker_image() + 
elif args.deploy: + offer = find_gpu_offer(args.gpu) + if offer: + deploy_instance(offer["id"]) + elif args.collect and args.instance_id: + collect_results(args.instance_id) + elif args.cleanup and args.instance_id: + cleanup_instance(args.instance_id) + else: + parser.print_help() diff --git a/previous-experiments/02-finetune-scratch/finetune_smart_turn.py b/previous-experiments/02-finetune-scratch/finetune_smart_turn.py new file mode 100644 index 0000000000000000000000000000000000000000..4ef29f2a032d7dacc92459e1cf6deef1e429cbf0 --- /dev/null +++ b/previous-experiments/02-finetune-scratch/finetune_smart_turn.py @@ -0,0 +1,292 @@ +""" +Fine-tune Pipecat Smart Turn on Portuguese data. + +Loads the pretrained Whisper Tiny encoder + classifier, then continues +training on Portuguese audio samples from NURC-SP and Edge TTS. + +Can run on MPS (Apple Silicon), CUDA, or CPU. +""" + +from __future__ import annotations + +import logging +import os +from pathlib import Path + +import numpy as np +import torch +import torch.nn as nn +from torch.utils.data import Dataset, DataLoader +from transformers import WhisperFeatureExtractor + +import soundfile as sf +import onnxruntime as ort + +log = logging.getLogger(__name__) + +SAMPLE_RATE = 16000 +WINDOW_SAMPLES = 8 * SAMPLE_RATE + +DATA_DIR = Path(__file__).parent / "data" / "smart_turn_pt_training" / "por" +OUTPUT_DIR = Path(__file__).parent / "checkpoints" / "smart_turn_pt" + + +class SmartTurnModel(nn.Module): + """Whisper encoder + attention pooling + classifier (matches Smart Turn v3 architecture).""" + + def __init__(self): + super().__init__() + from transformers import WhisperModel, WhisperConfig + + whisper = WhisperModel.from_pretrained("openai/whisper-tiny") + self.encoder = whisper.encoder + + # Resize position embeddings from 1500 (30s) to 400 (8s) + max_pos = 400 + old_embed = self.encoder.embed_positions.weight.data + new_embed = old_embed[:max_pos, :] + self.encoder.embed_positions = nn.Embedding(max_pos, 
old_embed.shape[1])
        self.encoder.embed_positions.weight.data = new_embed
        self.encoder.config.max_source_positions = max_pos

        hidden_size = self.encoder.config.d_model  # 384 for whisper-tiny

        # Attention pooling
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.Tanh(),
            nn.Linear(256, 1),
        )

        # Classifier
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.LayerNorm(256),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(256, 64),
            nn.GELU(),
            nn.Linear(64, 1),
        )

    def forward(self, input_features: torch.Tensor) -> torch.Tensor:
        # input_features: (batch, 80, time_frames)
        encoder_output = self.encoder(input_features).last_hidden_state  # (batch, seq, 384)

        # Attention pooling: softmax over the sequence dimension, then a
        # weighted sum to collapse to one vector per clip.
        attn_weights = self.attention(encoder_output)  # (batch, seq, 1)
        attn_weights = torch.softmax(attn_weights, dim=1)
        pooled = (encoder_output * attn_weights).sum(dim=1)  # (batch, 384)

        # Classify: single logit per clip (complete-vs-incomplete).
        logits = self.classifier(pooled)  # (batch, 1)
        return logits.squeeze(-1)


class PortugueseDataset(Dataset):
    """Load Portuguese training samples from FLAC files.

    Directory layout: ``complete-nofiller/*.flac`` are positives (label 1.0),
    ``incomplete-nofiller/*.flac`` are negatives (label 0.0).
    """

    def __init__(self, data_dir: Path, feature_extractor: WhisperFeatureExtractor):
        self.feature_extractor = feature_extractor
        # List of (path, label) pairs; audio is decoded lazily in __getitem__.
        self.samples = []

        # Load complete samples (label=1)
        complete_dir = data_dir / "complete-nofiller"
        if complete_dir.exists():
            for f in sorted(complete_dir.glob("*.flac")):
                self.samples.append((str(f), 1.0))

        # Load incomplete samples (label=0)
        incomplete_dir = data_dir / "incomplete-nofiller"
        if incomplete_dir.exists():
            for f in sorted(incomplete_dir.glob("*.flac")):
                self.samples.append((str(f), 0.0))

        log.info("Loaded %d samples (%d complete, %d incomplete)",
                 len(self.samples),
                 sum(1 for _, l in self.samples if l == 1.0),
                 sum(1 for _, l in self.samples if l == 0.0))

    def __len__(self) -> int:
        return len(self.samples)

    def __getitem__(self, idx: int) -> dict:
        path, label = self.samples[idx]

        # NOTE(review): sr is read but never checked — this assumes the FLAC
        # files are already 16 kHz; verify the preprocessing pipeline.
        audio, sr = sf.read(path)
        if audio.ndim > 1:
            # Downmix multi-channel audio to mono.
            audio = audio.mean(axis=1)
        audio = audio.astype(np.float32)

        # Truncate/pad to 8 seconds (keep the END of the clip — the turn
        # boundary is what matters, so padding goes at the front).
        if len(audio) > WINDOW_SAMPLES:
            audio = audio[-WINDOW_SAMPLES:]
        elif len(audio) < WINDOW_SAMPLES:
            padding = WINDOW_SAMPLES - len(audio)
            audio = np.pad(audio, (padding, 0), mode="constant")

        inputs = self.feature_extractor(
            audio,
            sampling_rate=SAMPLE_RATE,
            return_tensors="np",
            padding="max_length",
            max_length=WINDOW_SAMPLES,
            truncation=True,
            do_normalize=True,
        )

        features = inputs.input_features.squeeze(0).astype(np.float32)

        return {
            "input_features": torch.from_numpy(features),
            "labels": torch.tensor(label, dtype=torch.float32),
        }


def train(
    epochs: int = 10,
    batch_size: int = 16,
    lr: float = 2e-5,
    device: str = "auto",
) -> Path:
    """Fine-tune Smart Turn on Portuguese data.

    Returns the path to the exported ONNX model.
    """
    if device == "auto":
        # Prefer CUDA, then Apple-Silicon MPS, then CPU.
        if torch.cuda.is_available():
            device = "cuda"
        elif torch.backends.mps.is_available():
            device = "mps"
        else:
            device = "cpu"

    log.info("Training on device: %s", device)

    # Model
    model = SmartTurnModel()
    model = model.to(device)

    # Dataset — chunk_length=8 matches the 8-second input window.
    feature_extractor = WhisperFeatureExtractor(chunk_length=8)
    dataset = PortugueseDataset(DATA_DIR, feature_extractor)

    # Split 90/10 (fixed seed so the split is reproducible across runs)
    n_train = int(0.9 * len(dataset))
    n_val = len(dataset) - n_train
    train_ds, val_ds = torch.utils.data.random_split(
        dataset, [n_train, n_val], generator=torch.Generator().manual_seed(42)
    )

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=0)

    log.info("Train: %d samples, Val: %d samples", n_train, n_val)

    # Loss with dynamic pos_weight to counter class imbalance
    # (pos_weight = #neg / #pos, guarded against division by zero).
    n_pos = sum(1 for _, l in dataset.samples if l == 1.0)
    n_neg = len(dataset.samples) - n_pos
    pos_weight = torch.tensor([n_neg / max(n_pos, 1)], device=device)
    criterion = \
nn.BCEWithLogitsLoss(pos_weight=pos_weight) + log.info("pos_weight: %.2f (neg=%d, pos=%d)", pos_weight.item(), n_neg, n_pos) + + # Optimizer + optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01) + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs) + + OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + best_acc = 0.0 + best_path = OUTPUT_DIR / "best_model.pt" + + for epoch in range(epochs): + # Train + model.train() + train_loss = 0.0 + train_correct = 0 + train_total = 0 + + for batch in train_loader: + features = batch["input_features"].to(device) + labels = batch["labels"].to(device) + + logits = model(features) + loss = criterion(logits, labels) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + train_loss += loss.item() * len(labels) + preds = (torch.sigmoid(logits) > 0.5).float() + train_correct += (preds == labels).sum().item() + train_total += len(labels) + + scheduler.step() + + # Validate + model.eval() + val_correct = 0 + val_total = 0 + val_tp = val_fp = val_fn = val_tn = 0 + + with torch.no_grad(): + for batch in val_loader: + features = batch["input_features"].to(device) + labels = batch["labels"].to(device) + + logits = model(features) + preds = (torch.sigmoid(logits) > 0.5).float() + val_correct += (preds == labels).sum().item() + val_total += len(labels) + + val_tp += ((preds == 1) & (labels == 1)).sum().item() + val_fp += ((preds == 1) & (labels == 0)).sum().item() + val_fn += ((preds == 0) & (labels == 1)).sum().item() + val_tn += ((preds == 0) & (labels == 0)).sum().item() + + train_acc = train_correct / max(train_total, 1) + val_acc = val_correct / max(val_total, 1) + precision = val_tp / max(val_tp + val_fp, 1) + recall = val_tp / max(val_tp + val_fn, 1) + f1 = 2 * precision * recall / max(precision + recall, 1e-8) + + log.info( + "Epoch %d/%d: train_loss=%.4f train_acc=%.3f val_acc=%.3f " + "prec=%.3f rec=%.3f f1=%.3f", + epoch + 1, epochs, + train_loss / max(train_total, 1), + 
train_acc, val_acc, precision, recall, f1, + ) + + if val_acc > best_acc: + best_acc = val_acc + torch.save(model.state_dict(), best_path) + log.info(" -> New best model saved (val_acc=%.3f)", best_acc) + + log.info("Training complete. Best val_acc=%.3f", best_acc) + + # Export to ONNX + model.load_state_dict(torch.load(best_path, map_location=device, weights_only=True)) + model.eval() + model = model.to("cpu") + + onnx_path = OUTPUT_DIR / "smart_turn_pt.onnx" + dummy = torch.randn(1, 80, 800) # (batch, mel_bins, frames) for 8s + torch.onnx.export( + model, + dummy, + str(onnx_path), + input_names=["input_features"], + output_names=["logits"], + dynamic_axes={"input_features": {0: "batch"}, "logits": {0: "batch"}}, + opset_version=17, + ) + log.info("ONNX model exported to %s", onnx_path) + + return onnx_path + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s [%(name)s] %(message)s", + ) + onnx_path = train(epochs=15, batch_size=16, lr=2e-5) + log.info("Done! ONNX model: %s", onnx_path) diff --git a/previous-experiments/02-finetune-scratch/finetune_smart_turn_gpu.py b/previous-experiments/02-finetune-scratch/finetune_smart_turn_gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..0f23eba491855c12680bc587af8fbf6c1e47fde7 --- /dev/null +++ b/previous-experiments/02-finetune-scratch/finetune_smart_turn_gpu.py @@ -0,0 +1,819 @@ +""" +Fine-tune Pipecat Smart Turn on Portuguese speech data (GPU version). + +Downloads Portuguese datasets from HuggingFace, processes into +complete/incomplete samples, trains with speaker-based splits +to avoid data leakage, and exports to ONNX. 

Datasets used:
- CORAA v1.1 (291h, conversational Brazilian Portuguese)
- Common Voice Portuguese (51h, diverse speakers)
- MLS Portuguese (168h, read speech)

Run on a Vast.ai GPU instance with:
    python finetune_smart_turn_gpu.py
"""

from __future__ import annotations

import gc
import json
import logging
import os
import random
import time
# NOTE(review): `field` appears unused in the visible code — confirm
# before removing from the import.
from dataclasses import dataclass, field
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import WhisperFeatureExtractor

log = logging.getLogger(__name__)

# All audio at 16 kHz; the model consumes fixed 8-second windows.
SAMPLE_RATE = 16000
WINDOW_SECONDS = 8
WINDOW_SAMPLES = WINDOW_SECONDS * SAMPLE_RATE

OUTPUT_DIR = Path("checkpoints/smart_turn_pt_v2")
CACHE_DIR = Path("hf_cache")


# ---------------------------------------------------------------------------
# Model (same architecture as Pipecat Smart Turn v3)
# ---------------------------------------------------------------------------

class SmartTurnModel(nn.Module):
    """Whisper Tiny encoder + attention pooling + classifier."""

    def __init__(self):
        super().__init__()
        # Lazy import: keeps module import cheap and transformers-optional.
        from transformers import WhisperModel

        whisper = WhisperModel.from_pretrained(
            "openai/whisper-tiny", cache_dir=str(CACHE_DIR)
        )
        self.encoder = whisper.encoder

        # Resize position embeddings from 1500 (30s) to 400 (8s) by keeping
        # the first 400 pretrained position vectors.
        max_pos = 400
        old_embed = self.encoder.embed_positions.weight.data
        new_embed = old_embed[:max_pos, :]
        self.encoder.embed_positions = nn.Embedding(max_pos, old_embed.shape[1])
        self.encoder.embed_positions.weight.data = new_embed
        self.encoder.config.max_source_positions = max_pos

        hidden_size = self.encoder.config.d_model  # 384

        # Attention pooling head: per-frame scalar scores.
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.Tanh(),
            nn.Linear(256, 1),
        )

        # Binary classifier head (single logit).
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.LayerNorm(256),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(256, 64),
            nn.GELU(),
            nn.Linear(64, 1),
        )

    def forward(self, input_features: torch.Tensor) -> torch.Tensor:
        # Softmax-weighted pooling over the sequence, then classify.
        encoder_output = self.encoder(input_features).last_hidden_state
        attn_weights = self.attention(encoder_output)
        attn_weights = torch.softmax(attn_weights, dim=1)
        pooled = (encoder_output * attn_weights).sum(dim=1)
        logits = self.classifier(pooled)
        return logits.squeeze(-1)


# ---------------------------------------------------------------------------
# Data sample
# ---------------------------------------------------------------------------

@dataclass
class AudioSample:
    audio: np.ndarray  # float32, 16kHz
    label: float  # 1.0 = complete, 0.0 = incomplete
    speaker_id: str
    source: str  # dataset name


# ---------------------------------------------------------------------------
# Dataset loaders — all Portuguese
# ---------------------------------------------------------------------------

def load_coraa_samples(max_samples: int = 30000) -> list[AudioSample]:
    """Load CORAA v1.1 — conversational Brazilian Portuguese (291h)."""
    from datasets import load_dataset

    log.info("Loading CORAA v1.1 from HuggingFace...")
    try:
        ds = load_dataset(
            "Racoci/CORAA-v1.1",
            split="train",
            cache_dir=str(CACHE_DIR),
            streaming=True,
        )
    except Exception as e:
        log.warning("Failed to load CORAA v1.1: %s", e)
        return []

    samples = []
    complete_count = 0
    incomplete_count = 0
    # Aim for a balanced set: half complete, half incomplete.
    target_per_class = max_samples // 2

    for i, row in enumerate(ds):
        if complete_count >= target_per_class and incomplete_count >= target_per_class:
            break

        try:
            audio_data = row.get("audio", {})
            if not audio_data:
                continue

            audio = np.array(audio_data["array"], dtype=np.float32)
            sr = audio_data["sampling_rate"]

            # Resample to 16kHz if needed
            if sr != SAMPLE_RATE:
                import torchaudio
                tensor = torch.from_numpy(audio).float().unsqueeze(0)
                audio = torchaudio.functional.resample(tensor, sr,
SAMPLE_RATE).squeeze().numpy() + + duration = len(audio) / SAMPLE_RATE + if duration < 1.0: + continue + + speaker_id = str(row.get("speaker", row.get("speaker_id", f"coraa_{i}"))) + + # COMPLETE: use the end of the utterance (last 8s) + if complete_count < target_per_class: + window = _extract_window(audio, position="end") + if window is not None: + samples.append(AudioSample( + audio=window, label=1.0, + speaker_id=speaker_id, source="coraa", + )) + complete_count += 1 + + # INCOMPLETE: use a random mid-utterance cut (first 40-80%) + if incomplete_count < target_per_class and duration >= 2.0: + cut_frac = random.uniform(0.3, 0.75) + cut_sample = int(len(audio) * cut_frac) + truncated = audio[:cut_sample] + window = _extract_window(truncated, position="end") + if window is not None: + samples.append(AudioSample( + audio=window, label=0.0, + speaker_id=speaker_id, source="coraa", + )) + incomplete_count += 1 + + except Exception as e: + if i < 5: + log.warning("CORAA sample %d error: %s", i, e) + continue + + if i % 5000 == 0 and i > 0: + log.info(" CORAA: processed %d rows, %d complete, %d incomplete", + i, complete_count, incomplete_count) + + log.info("CORAA: %d complete + %d incomplete = %d samples", + complete_count, incomplete_count, len(samples)) + return samples + + +def load_common_voice_samples(max_samples: int = 20000) -> list[AudioSample]: + """Load Common Voice Portuguese — skipped (schema issues).""" + log.info("Skipping Common Voice (schema compatibility issues)") + return [] + + +def load_mls_samples(max_samples: int = 20000) -> list[AudioSample]: + """Load MLS Portuguese — read speech from audiobooks (168h).""" + from datasets import load_dataset + + log.info("Loading MLS Portuguese from HuggingFace...") + try: + ds = load_dataset( + "facebook/multilingual_librispeech", "portuguese", + split="train", + cache_dir=str(CACHE_DIR), + streaming=True, + ) + except Exception as e: + log.warning("Failed to load MLS: %s", e) + return [] + + samples = [] + 
complete_count = 0 + incomplete_count = 0 + target_per_class = max_samples // 2 + + for i, row in enumerate(ds): + if complete_count >= target_per_class and incomplete_count >= target_per_class: + break + + try: + audio_data = row.get("audio", {}) + if not audio_data: + continue + + audio = np.array(audio_data["array"], dtype=np.float32) + sr = audio_data["sampling_rate"] + + if sr != SAMPLE_RATE: + import torchaudio + tensor = torch.from_numpy(audio).float().unsqueeze(0) + audio = torchaudio.functional.resample(tensor, sr, SAMPLE_RATE).squeeze().numpy() + + duration = len(audio) / SAMPLE_RATE + if duration < 1.0: + continue + + speaker_id = str(row.get("speaker_id", f"mls_{i}")) + + # COMPLETE + if complete_count < target_per_class: + window = _extract_window(audio, position="end") + if window is not None: + samples.append(AudioSample( + audio=window, label=1.0, + speaker_id=speaker_id, source="mls", + )) + complete_count += 1 + + # INCOMPLETE + if incomplete_count < target_per_class and duration >= 1.5: + cut_frac = random.uniform(0.3, 0.7) + cut_sample = int(len(audio) * cut_frac) + truncated = audio[:cut_sample] + window = _extract_window(truncated, position="end") + if window is not None: + samples.append(AudioSample( + audio=window, label=0.0, + speaker_id=speaker_id, source="mls", + )) + incomplete_count += 1 + + except Exception as e: + if i < 5: + log.warning("MLS sample %d error: %s", i, e) + continue + + if i % 5000 == 0 and i > 0: + log.info(" MLS: processed %d rows, %d complete, %d incomplete", + i, complete_count, incomplete_count) + + log.info("MLS: %d complete + %d incomplete = %d samples", + complete_count, incomplete_count, len(samples)) + return samples + + +def load_mupe_samples(max_samples: int = 20000) -> list[AudioSample]: + """Load CORAA-MUPE-ASR — interview turn-taking (365h).""" + from datasets import load_dataset + + log.info("Loading CORAA-MUPE-ASR from HuggingFace...") + try: + ds = load_dataset( + "nilc-nlp/CORAA-MUPE-ASR", + 
split="train", + cache_dir=str(CACHE_DIR), + streaming=True, + ) + except Exception as e: + log.warning("Failed to load MUPE: %s", e) + return [] + + samples = [] + complete_count = 0 + incomplete_count = 0 + target_per_class = max_samples // 2 + + for i, row in enumerate(ds): + if complete_count >= target_per_class and incomplete_count >= target_per_class: + break + + try: + audio_data = row.get("audio", {}) + if not audio_data: + continue + + audio = np.array(audio_data["array"], dtype=np.float32) + sr = audio_data["sampling_rate"] + + if sr != SAMPLE_RATE: + import torchaudio + tensor = torch.from_numpy(audio).float().unsqueeze(0) + audio = torchaudio.functional.resample(tensor, sr, SAMPLE_RATE).squeeze().numpy() + + duration = len(audio) / SAMPLE_RATE + if duration < 1.0: + continue + + speaker_id = str(row.get("speaker_type", f"mupe_{i}")) + + # COMPLETE + if complete_count < target_per_class: + window = _extract_window(audio, position="end") + if window is not None: + samples.append(AudioSample( + audio=window, label=1.0, + speaker_id=speaker_id, source="mupe", + )) + complete_count += 1 + + # INCOMPLETE + if incomplete_count < target_per_class and duration >= 2.0: + cut_frac = random.uniform(0.3, 0.75) + cut_sample = int(len(audio) * cut_frac) + truncated = audio[:cut_sample] + window = _extract_window(truncated, position="end") + if window is not None: + samples.append(AudioSample( + audio=window, label=0.0, + speaker_id=speaker_id, source="mupe", + )) + incomplete_count += 1 + + except Exception as e: + if i < 5: + log.warning("MUPE sample %d error: %s", i, e) + continue + + if i % 5000 == 0 and i > 0: + log.info(" MUPE: processed %d rows, %d complete, %d incomplete", + i, complete_count, incomplete_count) + + log.info("MUPE: %d complete + %d incomplete = %d samples", + complete_count, incomplete_count, len(samples)) + return samples + + +# --------------------------------------------------------------------------- +# Audio processing helpers +# 
# ---------------------------------------------------------------------------

def _extract_window(audio: np.ndarray, position: str = "end") -> np.ndarray | None:
    """Extract an 8-second window from audio, peak-normalized, padded if short.

    Args:
        audio: mono float waveform at 16 kHz.
        position: "end" keeps the last 8 s (padding at the front);
            anything else keeps the first 8 s (padding at the back).

    Returns:
        A float32 array of exactly WINDOW_SAMPLES samples with the final
        ~200 ms zeroed (matching VAD behavior), or None for clips shorter
        than 1 second.
    """
    if len(audio) < SAMPLE_RATE:  # minimum 1 second
        return None

    # FIX: work on a copy. The original wrote zeros directly into the
    # caller's array on the exact-8s, zero-peak path (normalization and
    # padding otherwise happened to allocate new arrays).
    audio = np.array(audio, dtype=np.float32)

    # Peak-normalize to 0.9 (skip all-silent input to avoid division by zero).
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak * 0.9

    if position == "end":
        # Take last 8 seconds
        if len(audio) > WINDOW_SAMPLES:
            audio = audio[-WINDOW_SAMPLES:]
        elif len(audio) < WINDOW_SAMPLES:
            padding = WINDOW_SAMPLES - len(audio)
            audio = np.pad(audio, (padding, 0), mode="constant")
    else:
        # Take first 8 seconds
        if len(audio) > WINDOW_SAMPLES:
            audio = audio[:WINDOW_SAMPLES]
        elif len(audio) < WINDOW_SAMPLES:
            padding = WINDOW_SAMPLES - len(audio)
            audio = np.pad(audio, (0, padding), mode="constant")

    # Add ~200ms silence at end (matching VAD behavior)
    silence_samples = int(0.2 * SAMPLE_RATE)
    audio[-silence_samples:] = 0.0

    return audio.astype(np.float32)


def augment_audio(audio: np.ndarray) -> np.ndarray:
    """Apply simple data augmentation (random gain, occasional noise)."""
    aug = audio.copy()

    # Random volume scaling (0.7x to 1.3x)
    if random.random() < 0.5:
        scale = random.uniform(0.7, 1.3)
        aug = aug * scale

    # Add small Gaussian noise
    if random.random() < 0.3:
        noise_level = random.uniform(0.001, 0.01)
        aug = aug + np.random.randn(len(aug)).astype(np.float32) * noise_level

    # Clip to prevent overflow
    aug = np.clip(aug, -1.0, 1.0)

    return aug


# ---------------------------------------------------------------------------
# PyTorch Dataset
# ---------------------------------------------------------------------------

class SmartTurnDataset(Dataset):
    """In-memory dataset of pre-processed audio samples."""

    def __init__(
        self,
        samples: list[AudioSample],
        feature_extractor: WhisperFeatureExtractor,
        augment: bool = False,
    ):
        self.samples = samples
        self.feature_extractor = feature_extractor
        # Augmentation is only enabled for the training split.
        self.augment = augment

    def __len__(self) -> int:
        return len(self.samples)

    def __getitem__(self, idx: int) -> dict:
        sample = self.samples[idx]
        audio = sample.audio

        if self.augment:
            audio = augment_audio(audio)

        inputs = self.feature_extractor(
            audio,
            sampling_rate=SAMPLE_RATE,
            return_tensors="np",
            padding="max_length",
            max_length=WINDOW_SAMPLES,
            truncation=True,
            do_normalize=True,
        )

        features = inputs.input_features.squeeze(0).astype(np.float32)

        return {
            "input_features": torch.from_numpy(features),
            "labels": torch.tensor(sample.label, dtype=torch.float32),
        }


# ---------------------------------------------------------------------------
# Speaker-based train/val/test split
# ---------------------------------------------------------------------------

def split_by_speaker(
    samples: list[AudioSample],
    val_frac: float = 0.1,
    test_frac: float = 0.1,
) -> tuple[list[AudioSample], list[AudioSample], list[AudioSample]]:
    """Split samples by speaker to avoid data leakage.

    All samples of a given speaker land in exactly one split. Uses the
    module-level `random` state, so seed beforehand for reproducibility.
    """
    # Group by speaker
    speaker_samples: dict[str, list[AudioSample]] = {}
    for s in samples:
        speaker_samples.setdefault(s.speaker_id, []).append(s)

    speakers = list(speaker_samples.keys())
    random.shuffle(speakers)

    # At least one speaker each for val and test, even for tiny speaker sets.
    n_val = max(1, int(len(speakers) * val_frac))
    n_test = max(1, int(len(speakers) * test_frac))

    test_speakers = set(speakers[:n_test])
    val_speakers = set(speakers[n_test:n_test + n_val])
    train_speakers = set(speakers[n_test + n_val:])

    train = [s for sp in train_speakers for s in speaker_samples[sp]]
    val = [s for sp in val_speakers for s in speaker_samples[sp]]
    test = [s for sp in test_speakers for s in speaker_samples[sp]]

    random.shuffle(train)
    random.shuffle(val)
    random.shuffle(test)

    log.info("Split: %d train (%d speakers), %d val (%d speakers), %d test (%d speakers)",
             len(train), len(train_speakers), len(val), len(val_speakers),
             len(test), len(test_speakers))

    return train, val, test
val, test + + +# --------------------------------------------------------------------------- +# Training +# --------------------------------------------------------------------------- + +def train( + epochs: int = 20, + batch_size: int = 32, + lr: float = 2e-5, + max_samples_per_dataset: int = 25000, +) -> Path: + """Fine-tune Smart Turn on Portuguese data from HuggingFace.""" + + if torch.cuda.is_available(): + device = "cuda" + elif torch.backends.mps.is_available(): + device = "mps" + else: + device = "cpu" + log.info("Training on device: %s", device) + if device == "cuda": + log.info("GPU: %s (%d MB)", torch.cuda.get_device_name(), + torch.cuda.get_device_properties(0).total_memory // 1024 // 1024) + + # ----- Load datasets ----- + t0 = time.time() + all_samples: list[AudioSample] = [] + + # Load from multiple Portuguese sources + coraa = load_coraa_samples(max_samples=max_samples_per_dataset) + all_samples.extend(coraa) + del coraa + gc.collect() + + cv = load_common_voice_samples(max_samples=max_samples_per_dataset) + all_samples.extend(cv) + del cv + gc.collect() + + mls = load_mls_samples(max_samples=max_samples_per_dataset) + all_samples.extend(mls) + del mls + gc.collect() + + mupe = load_mupe_samples(max_samples=max_samples_per_dataset) + all_samples.extend(mupe) + del mupe + gc.collect() + + if not all_samples: + raise RuntimeError("No samples loaded! 
Check dataset availability.") + + load_time = time.time() - t0 + n_complete = sum(1 for s in all_samples if s.label == 1.0) + n_incomplete = sum(1 for s in all_samples if s.label == 0.0) + n_speakers = len(set(s.speaker_id for s in all_samples)) + + log.info("Total: %d samples (%d complete, %d incomplete) from %d speakers in %.0fs", + len(all_samples), n_complete, n_incomplete, n_speakers, load_time) + + # Source distribution + sources = {} + for s in all_samples: + sources[s.source] = sources.get(s.source, 0) + 1 + for src, cnt in sorted(sources.items()): + log.info(" %s: %d samples", src, cnt) + + # ----- Split by speaker ----- + train_samples, val_samples, test_samples = split_by_speaker(all_samples) + + # ----- Create datasets ----- + feature_extractor = WhisperFeatureExtractor(chunk_length=8) + + train_ds = SmartTurnDataset(train_samples, feature_extractor, augment=True) + val_ds = SmartTurnDataset(val_samples, feature_extractor, augment=False) + test_ds = SmartTurnDataset(test_samples, feature_extractor, augment=False) + + # Balanced sampler for training + train_labels = [s.label for s in train_samples] + n_pos = sum(1 for l in train_labels if l == 1.0) + n_neg = len(train_labels) - n_pos + weights = [1.0 / n_neg if l == 0.0 else 1.0 / n_pos for l in train_labels] + sampler = WeightedRandomSampler(weights, len(weights)) + + use_pin = device == "cuda" + train_loader = DataLoader( + train_ds, batch_size=batch_size, sampler=sampler, + num_workers=2 if device == "cuda" else 0, pin_memory=use_pin, + ) + val_loader = DataLoader( + val_ds, batch_size=batch_size, shuffle=False, + num_workers=0, pin_memory=use_pin, + ) + test_loader = DataLoader( + test_ds, batch_size=batch_size, shuffle=False, + num_workers=0, pin_memory=use_pin, + ) + + # ----- Model ----- + model = SmartTurnModel().to(device) + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + log.info("Model: %d total 
params, %d trainable", total_params, trainable_params) + + # Loss + pos_weight = torch.tensor([n_neg / max(n_pos, 1)], device=device) + criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight) + log.info("pos_weight: %.2f", pos_weight.item()) + + # Optimizer — different LR for encoder vs head + encoder_params = list(model.encoder.parameters()) + head_params = list(model.attention.parameters()) + list(model.classifier.parameters()) + optimizer = torch.optim.AdamW([ + {"params": encoder_params, "lr": lr * 0.1}, # Lower LR for pretrained encoder + {"params": head_params, "lr": lr}, + ], weight_decay=0.01) + + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs) + + # ----- Training loop ----- + OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + best_f1 = 0.0 + best_path = OUTPUT_DIR / "best_model.pt" + patience = 5 + patience_counter = 0 + history = [] + + for epoch in range(epochs): + # Train + model.train() + train_loss = 0.0 + train_correct = 0 + train_total = 0 + t_epoch = time.time() + + for batch_idx, batch in enumerate(train_loader): + features = batch["input_features"].to(device) + labels = batch["labels"].to(device) + + logits = model(features) + loss = criterion(logits, labels) + + optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + + train_loss += loss.item() * len(labels) + preds = (torch.sigmoid(logits) > 0.5).float() + train_correct += (preds == labels).sum().item() + train_total += len(labels) + + if batch_idx % 100 == 0 and batch_idx > 0: + log.info(" batch %d/%d loss=%.4f", batch_idx, len(train_loader), + loss.item()) + + scheduler.step() + + # Validate + model.eval() + val_metrics = _evaluate(model, val_loader, device, criterion) + train_acc = train_correct / max(train_total, 1) + epoch_time = time.time() - t_epoch + + log.info( + "Epoch %d/%d (%.0fs): train_loss=%.4f train_acc=%.3f | " + "val_acc=%.3f val_f1=%.3f prec=%.3f rec=%.3f", + epoch + 1, epochs, 
epoch_time, + train_loss / max(train_total, 1), train_acc, + val_metrics["accuracy"], val_metrics["f1"], + val_metrics["precision"], val_metrics["recall"], + ) + + history.append({ + "epoch": epoch + 1, + "train_loss": train_loss / max(train_total, 1), + "train_acc": train_acc, + **{f"val_{k}": v for k, v in val_metrics.items()}, + }) + + # Save best + if val_metrics["f1"] > best_f1: + best_f1 = val_metrics["f1"] + torch.save({ + "model_state_dict": model.state_dict(), + "epoch": epoch + 1, + "val_f1": best_f1, + "val_metrics": val_metrics, + }, best_path) + log.info(" -> New best model saved (val_f1=%.3f)", best_f1) + patience_counter = 0 + else: + patience_counter += 1 + if patience_counter >= patience: + log.info("Early stopping at epoch %d (no improvement for %d epochs)", + epoch + 1, patience) + break + + # ----- Test evaluation ----- + log.info("\n=== Final Test Evaluation ===") + checkpoint = torch.load(best_path, map_location=device, weights_only=True) + model.load_state_dict(checkpoint["model_state_dict"]) + model.eval() + + test_metrics = _evaluate(model, test_loader, device, criterion) + log.info("Test results:") + log.info(" Accuracy: %.3f", test_metrics["accuracy"]) + log.info(" Precision: %.3f", test_metrics["precision"]) + log.info(" Recall: %.3f", test_metrics["recall"]) + log.info(" F1: %.3f", test_metrics["f1"]) + log.info(" TP=%d FP=%d FN=%d TN=%d", + test_metrics["tp"], test_metrics["fp"], + test_metrics["fn"], test_metrics["tn"]) + + # ----- Export to ONNX ----- + model = model.to("cpu") + onnx_path = OUTPUT_DIR / "smart_turn_pt_v2.onnx" + dummy = torch.randn(1, 80, 800) + torch.onnx.export( + model, + dummy, + str(onnx_path), + input_names=["input_features"], + output_names=["logits"], + dynamic_axes={"input_features": {0: "batch"}, "logits": {0: "batch"}}, + opset_version=17, + ) + log.info("ONNX model exported to %s", onnx_path) + + # ----- Save results ----- + results = { + "model": "smart_turn_pt_v2", + "total_samples": len(all_samples), + 
"n_speakers": n_speakers, + "sources": sources, + "train_samples": len(train_samples), + "val_samples": len(val_samples), + "test_samples": len(test_samples), + "best_epoch": checkpoint["epoch"], + "best_val_f1": best_f1, + "test_metrics": test_metrics, + "history": history, + "config": { + "epochs": epochs, + "batch_size": batch_size, + "lr": lr, + "max_samples_per_dataset": max_samples_per_dataset, + }, + } + + results_path = OUTPUT_DIR / "training_results.json" + with open(results_path, "w") as f: + json.dump(results, f, indent=2) + log.info("Training results saved to %s", results_path) + + return onnx_path + + +def _evaluate( + model: nn.Module, + loader: DataLoader, + device: str, + criterion: nn.Module, +) -> dict: + """Evaluate model and return metrics.""" + correct = 0 + total = 0 + tp = fp = fn = tn = 0 + total_loss = 0.0 + + with torch.no_grad(): + for batch in loader: + features = batch["input_features"].to(device) + labels = batch["labels"].to(device) + + logits = model(features) + loss = criterion(logits, labels) + total_loss += loss.item() * len(labels) + + preds = (torch.sigmoid(logits) > 0.5).float() + correct += (preds == labels).sum().item() + total += len(labels) + + tp += ((preds == 1) & (labels == 1)).sum().item() + fp += ((preds == 1) & (labels == 0)).sum().item() + fn += ((preds == 0) & (labels == 1)).sum().item() + tn += ((preds == 0) & (labels == 0)).sum().item() + + accuracy = correct / max(total, 1) + precision = tp / max(tp + fp, 1) + recall = tp / max(tp + fn, 1) + f1 = 2 * precision * recall / max(precision + recall, 1e-8) + + return { + "accuracy": round(accuracy, 4), + "precision": round(precision, 4), + "recall": round(recall, 4), + "f1": round(f1, 4), + "loss": round(total_loss / max(total, 1), 4), + "tp": tp, "fp": fp, "fn": fn, "tn": tn, + } + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s [%(name)s] %(message)s", + ) + random.seed(42) + np.random.seed(42) + 
torch.manual_seed(42) + + onnx_path = train( + epochs=20, + batch_size=32, + lr=2e-5, + max_samples_per_dataset=7000, + ) + log.info("Done! ONNX model: %s", onnx_path) diff --git a/previous-experiments/02-finetune-scratch/finetune_smart_turn_v3.py b/previous-experiments/02-finetune-scratch/finetune_smart_turn_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..ec18a3047f7bc6ddb0220c77c740ab8a7b92d620 --- /dev/null +++ b/previous-experiments/02-finetune-scratch/finetune_smart_turn_v3.py @@ -0,0 +1,1021 @@ +""" +Fine-tune Pipecat Smart Turn v3 on Portuguese speech data (GPU version). + +Key improvements over v2: +- Punctuation-based labels (text ending with .!? = complete, otherwise incomplete) +- No audiobook data (removed MLS) — only conversational/interview speech +- Whisper Tiny encoder (39M params) — same as original Pipecat Smart Turn v3 +- Fixed MUPE speaker_id to use actual unique IDs +- More data (25k+ per dataset) +- Better augmentation (speed perturbation, pitch variation) + +Datasets: +- CORAA v1.1 (291h, conversational Brazilian Portuguese) +- CORAA-MUPE-ASR (365h, interview turn-taking) + +Run on a Vast.ai GPU instance with: + python finetune_smart_turn_v3.py +""" + +from __future__ import annotations + +import gc +import hashlib +import json +import logging +import os +import random +import re +import time +from dataclasses import dataclass, field +from pathlib import Path + +import numpy as np +import torch +import torch.nn as nn +from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler +from transformers import WhisperFeatureExtractor + +log = logging.getLogger(__name__) + +SAMPLE_RATE = 16000 +WINDOW_SECONDS = 8 +WINDOW_SAMPLES = WINDOW_SECONDS * SAMPLE_RATE + +# Use /workspace if on RunPod (persists across pod restarts), else local +_workspace = Path("/workspace") if Path("/workspace").exists() else Path(".") +OUTPUT_DIR = _workspace / "checkpoints" / "smart_turn_pt_v3" +CACHE_DIR = _workspace / "hf_cache" 
CHECKPOINT_EVERY_BATCHES = 100  # Save resumable checkpoint every N batches

# Punctuation that signals a complete utterance
COMPLETE_ENDINGS = re.compile(r'[.!?…]+\s*$')
# Punctuation/patterns that signal incomplete
INCOMPLETE_ENDINGS = re.compile(r'[,;:\-–—]\s*$')

# Label smoothing: soften hard labels to improve calibration
LABEL_SMOOTH = 0.05  # 0.0 → 0.05, 1.0 → 0.95


# ---------------------------------------------------------------------------
# Focal Loss — penalizes easy examples, focuses on hard boundary cases
# ---------------------------------------------------------------------------

class FocalLoss(nn.Module):
    """Binary focal loss (Lin et al. 2017) with optional BCE ``pos_weight``.

    gamma > 0 down-weights well-classified examples so the model focuses
    on hard boundary cases (e.g. mid-sentence pauses).

    NOTE(review): corrected documentation — the per-example weight is
    ``alpha`` for positive targets and ``1 - alpha`` for negatives
    (``alpha_t`` below). With the default alpha = 0.6 > 0.5, errors on
    POSITIVE targets (false negatives) are weighted slightly MORE, not
    false positives as previously claimed. To penalize false positives
    harder, choose alpha < 0.5.
    """

    def __init__(self, gamma: float = 2.0, alpha: float = 0.6,
                 pos_weight: torch.Tensor | None = None):
        super().__init__()
        self.gamma = gamma            # focusing exponent on (1 - p_t)
        self.alpha = alpha            # weight for positive targets; negatives get 1 - alpha
        self.pos_weight = pos_weight  # extra BCE multiplier on positive terms (class imbalance)

    def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        """Return the mean focal loss over a batch of logits and (soft) targets."""
        # Per-element BCE (no reduction) so focal/alpha weights apply per sample.
        bce = nn.functional.binary_cross_entropy_with_logits(
            logits, targets, reduction="none",
            pos_weight=self.pos_weight,
        )
        probs = torch.sigmoid(logits)
        # p_t = model probability assigned to the true class; with smoothed
        # targets this interpolates between the two class probabilities.
        p_t = probs * targets + (1 - probs) * (1 - targets)
        focal_weight = (1 - p_t) ** self.gamma

        # alpha for positive targets, (1 - alpha) for negative targets.
        alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)

        loss = alpha_t * focal_weight * bce
        return loss.mean()


# ---------------------------------------------------------------------------
# Model — Whisper Tiny (39M params, same as original Pipecat Smart Turn v3)
# ---------------------------------------------------------------------------

class SmartTurnModel(nn.Module):
    """Whisper Tiny encoder + attention pooling + classifier."""

    def __init__(self, whisper_model: str = "openai/whisper-tiny"):
        super().__init__()
        from transformers import WhisperModel

        whisper = WhisperModel.from_pretrained(
            whisper_model, cache_dir=str(CACHE_DIR)
        )
        self.encoder = whisper.encoder

        # Resize position embeddings from 1500 (30s) to 400 (8s): keep only
        # the first 400 learned positions so 8-second inputs fit exactly.
        max_pos = 400
        old_embed = self.encoder.embed_positions.weight.data
        new_embed = old_embed[:max_pos, :]
        self.encoder.embed_positions = nn.Embedding(max_pos, old_embed.shape[1])
        self.encoder.embed_positions.weight.data = new_embed
        self.encoder.config.max_source_positions = max_pos

        hidden_size = self.encoder.config.d_model  # 384 for whisper-tiny

        # Attention pooling: scores each encoder frame; softmax over time.
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.Tanh(),
            nn.Linear(256, 1),
        )

        # Classifier head (sized for Whisper Tiny 384-dim)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.LayerNorm(256),
            nn.GELU(),
            nn.Dropout(0.15),
            nn.Linear(256, 64),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(64, 1),
        )

    def forward(self, input_features: torch.Tensor) -> torch.Tensor:
        """Return one turn-completion logit per batch item."""
        encoder_output = self.encoder(input_features).last_hidden_state
        attn_weights = self.attention(encoder_output)
        attn_weights = torch.softmax(attn_weights, dim=1)  # normalize over time frames
        pooled = (encoder_output * attn_weights).sum(dim=1)
        logits = self.classifier(pooled)
        return logits.squeeze(-1)


# ---------------------------------------------------------------------------
# Data sample
# ---------------------------------------------------------------------------

@dataclass
class AudioSample:
    audio: np.ndarray  # float32, 16kHz
    label: float       # 1.0 = complete, 0.0 = incomplete
    speaker_id: str    # used for leakage-free train/val/test splitting
    source: str        # dataset name
    text: str = ""     # original transcription


# ---------------------------------------------------------------------------
# Label assignment — hybrid: punctuation + audio-based
# 
--------------------------------------------------------------------------- + +def classify_text_completeness(text: str) -> float | None: + """Classify if text represents a complete or incomplete utterance. + + Returns: + 1.0 for complete, 0.0 for incomplete, None if can't determine from text + """ + text = text.strip() + if not text or len(text) < 3: + return None + + # Complete: ends with sentence-ending punctuation + if COMPLETE_ENDINGS.search(text): + return 1.0 + + # Incomplete: ends with continuation punctuation + if INCOMPLETE_ENDINGS.search(text): + return 0.0 + + # No punctuation — can't determine from text alone + # Return None so the caller uses audio-based labeling instead + return None + + +# --------------------------------------------------------------------------- +# Dataset loaders — conversational Portuguese only +# --------------------------------------------------------------------------- + +def load_coraa_samples(max_samples: int = 50000) -> list[AudioSample]: + """Load CORAA v1.1 — conversational Brazilian Portuguese (291h). 
+ + Hybrid labeling: + - If text has punctuation (.!?), use that for labels + - Otherwise: full audio = COMPLETE (natural prosodic ending), + truncated audio at 30-75% = INCOMPLETE (mid-utterance cut) + """ + from datasets import load_dataset + + log.info("Loading CORAA v1.1 from HuggingFace...") + try: + ds = load_dataset( + "Racoci/CORAA-v1.1", + split="train", + cache_dir=str(CACHE_DIR), + streaming=True, + ) + except Exception as e: + log.warning("Failed to load CORAA v1.1: %s", e) + return [] + + samples = [] + complete_count = 0 + incomplete_count = 0 + text_labeled = 0 + audio_labeled = 0 + target_per_class = max_samples // 2 + + for i, row in enumerate(ds): + if complete_count >= target_per_class and incomplete_count >= target_per_class: + break + + try: + audio_data = row.get("audio", {}) + if not audio_data: + continue + + audio = np.array(audio_data["array"], dtype=np.float32) + sr = audio_data["sampling_rate"] + + if sr != SAMPLE_RATE: + import torchaudio + tensor = torch.from_numpy(audio).float().unsqueeze(0) + audio = torchaudio.functional.resample(tensor, sr, SAMPLE_RATE).squeeze().numpy() + + duration = len(audio) / SAMPLE_RATE + if duration < 1.0: + continue + + text = str(row.get("text", row.get("sentence", ""))) + speaker_id = str(row.get("speaker", row.get("speaker_id", f"coraa_{i}"))) + text_label = classify_text_completeness(text) + + if text_label is not None: + # Text has punctuation — use it directly + if text_label == 1.0 and complete_count >= target_per_class: + continue + if text_label == 0.0 and incomplete_count >= target_per_class: + continue + + window = _extract_window(audio, position="end") + if window is not None: + samples.append(AudioSample( + audio=window, label=text_label, + speaker_id=speaker_id, source="coraa", text=text, + )) + if text_label == 1.0: + complete_count += 1 + else: + incomplete_count += 1 + text_labeled += 1 + else: + # No punctuation — use audio-based labeling: + # COMPLETE: full utterance (speaker naturally 
finished) + if complete_count < target_per_class: + window = _extract_window(audio, position="end") + if window is not None: + samples.append(AudioSample( + audio=window, label=1.0, + speaker_id=speaker_id, source="coraa", text=text, + )) + complete_count += 1 + audio_labeled += 1 + + # INCOMPLETE: truncate at 30-75% (mid-utterance cut) + if incomplete_count < target_per_class and duration >= 2.0: + cut_frac = random.uniform(0.3, 0.75) + cut_sample = int(len(audio) * cut_frac) + truncated = audio[:cut_sample] + window = _extract_window(truncated, position="end") + if window is not None: + samples.append(AudioSample( + audio=window, label=0.0, + speaker_id=speaker_id, source="coraa", text=text, + )) + incomplete_count += 1 + audio_labeled += 1 + + except Exception as e: + if i < 5: + log.warning("CORAA sample %d error: %s", i, e) + continue + + if i % 5000 == 0 and i > 0: + log.info(" CORAA: processed %d rows, %d complete, %d incomplete (text=%d, audio=%d)", + i, complete_count, incomplete_count, text_labeled, audio_labeled) + + log.info("CORAA: %d complete + %d incomplete = %d samples (text_labeled=%d, audio_labeled=%d)", + complete_count, incomplete_count, len(samples), text_labeled, audio_labeled) + return samples + + +def load_mupe_samples(max_samples: int = 50000) -> list[AudioSample]: + """Load CORAA-MUPE-ASR — interview turn-taking (365h). + + Hybrid labeling (same as CORAA): + - If text has punctuation (.!?), use that for labels + - Otherwise: full audio = COMPLETE (natural prosodic ending), + truncated audio at 30-75% = INCOMPLETE (mid-utterance cut) + + Fixed speaker_id to use unique hashes (not just "interviewer"/"interviewee"). 
+ """ + from datasets import load_dataset + + log.info("Loading CORAA-MUPE-ASR from HuggingFace...") + try: + ds = load_dataset( + "nilc-nlp/CORAA-MUPE-ASR", + split="train", + cache_dir=str(CACHE_DIR), + streaming=True, + ) + except Exception as e: + log.warning("Failed to load MUPE: %s", e) + return [] + + samples = [] + complete_count = 0 + incomplete_count = 0 + text_labeled = 0 + audio_labeled = 0 + target_per_class = max_samples // 2 + + for i, row in enumerate(ds): + if complete_count >= target_per_class and incomplete_count >= target_per_class: + break + + try: + audio_data = row.get("audio", {}) + if not audio_data: + continue + + audio = np.array(audio_data["array"], dtype=np.float32) + sr = audio_data["sampling_rate"] + + if sr != SAMPLE_RATE: + import torchaudio + tensor = torch.from_numpy(audio).float().unsqueeze(0) + audio = torchaudio.functional.resample(tensor, sr, SAMPLE_RATE).squeeze().numpy() + + duration = len(audio) / SAMPLE_RATE + if duration < 1.0: + continue + + text = str(row.get("text", row.get("sentence", ""))) + + # Fix: use a unique speaker_id based on audio_path or index, + # NOT speaker_type which is just "interviewer"/"interviewee" + audio_path = str(row.get("audio_path", row.get("path", ""))) + if audio_path: + parts = audio_path.split("/") + speaker_id = f"mupe_{parts[0] if len(parts) > 1 else hashlib.md5(audio_path.encode()).hexdigest()[:8]}" + else: + speaker_id = f"mupe_{i // 50}" + + text_label = classify_text_completeness(text) + + if text_label is not None: + # Text has punctuation — use it directly + if text_label == 1.0 and complete_count >= target_per_class: + continue + if text_label == 0.0 and incomplete_count >= target_per_class: + continue + + window = _extract_window(audio, position="end") + if window is not None: + samples.append(AudioSample( + audio=window, label=text_label, + speaker_id=speaker_id, source="mupe", text=text, + )) + if text_label == 1.0: + complete_count += 1 + else: + incomplete_count += 1 + 
text_labeled += 1 + else: + # No punctuation — use audio-based labeling: + # COMPLETE: full utterance (speaker naturally finished) + if complete_count < target_per_class: + window = _extract_window(audio, position="end") + if window is not None: + samples.append(AudioSample( + audio=window, label=1.0, + speaker_id=speaker_id, source="mupe", text=text, + )) + complete_count += 1 + audio_labeled += 1 + + # INCOMPLETE: truncate at 30-75% (mid-utterance cut) + if incomplete_count < target_per_class and duration >= 2.0: + cut_frac = random.uniform(0.3, 0.75) + cut_sample = int(len(audio) * cut_frac) + truncated = audio[:cut_sample] + window = _extract_window(truncated, position="end") + if window is not None: + samples.append(AudioSample( + audio=window, label=0.0, + speaker_id=speaker_id, source="mupe", text=text, + )) + incomplete_count += 1 + audio_labeled += 1 + + except Exception as e: + if i < 5: + log.warning("MUPE sample %d error: %s", i, e) + continue + + if i % 5000 == 0 and i > 0: + log.info(" MUPE: processed %d rows, %d complete, %d incomplete (text=%d, audio=%d)", + i, complete_count, incomplete_count, text_labeled, audio_labeled) + + log.info("MUPE: %d complete + %d incomplete = %d samples (text_labeled=%d, audio_labeled=%d)", + complete_count, incomplete_count, len(samples), text_labeled, audio_labeled) + return samples + + +# --------------------------------------------------------------------------- +# Audio processing helpers +# --------------------------------------------------------------------------- + +def _extract_window(audio: np.ndarray, position: str = "end") -> np.ndarray | None: + """Extract 8-second window from audio, pad if needed.""" + if len(audio) < SAMPLE_RATE: # minimum 1 second + return None + + # Normalize + peak = np.max(np.abs(audio)) + if peak > 0: + audio = audio / peak * 0.9 + + if position == "end": + if len(audio) > WINDOW_SAMPLES: + audio = audio[-WINDOW_SAMPLES:] + elif len(audio) < WINDOW_SAMPLES: + padding = WINDOW_SAMPLES 
- len(audio) + audio = np.pad(audio, (padding, 0), mode="constant") + else: + if len(audio) > WINDOW_SAMPLES: + audio = audio[:WINDOW_SAMPLES] + elif len(audio) < WINDOW_SAMPLES: + padding = WINDOW_SAMPLES - len(audio) + audio = np.pad(audio, (0, padding), mode="constant") + + # Add ~200ms silence at end (matching VAD behavior) + silence_samples = int(0.2 * SAMPLE_RATE) + audio[-silence_samples:] = 0.0 + + return audio.astype(np.float32) + + +def augment_audio(audio: np.ndarray) -> np.ndarray: + """Apply data augmentation — more aggressive than v2.""" + aug = audio.copy() + + # Speed perturbation (0.9x to 1.1x) — changes pitch and speed + if random.random() < 0.5: + speed_factor = random.uniform(0.9, 1.1) + indices = np.arange(0, len(aug), speed_factor).astype(int) + indices = indices[indices < len(aug)] + aug = aug[indices] + # Pad/truncate back to original length + if len(aug) > WINDOW_SAMPLES: + aug = aug[:WINDOW_SAMPLES] + elif len(aug) < WINDOW_SAMPLES: + aug = np.pad(aug, (WINDOW_SAMPLES - len(aug), 0), mode="constant") + + # Random volume scaling (0.6x to 1.4x) + if random.random() < 0.5: + scale = random.uniform(0.6, 1.4) + aug = aug * scale + + # Add Gaussian noise (more aggressive) + if random.random() < 0.4: + noise_level = random.uniform(0.002, 0.02) + aug = aug + np.random.randn(len(aug)).astype(np.float32) * noise_level + + # Random time shift (shift audio left/right by up to 0.3s) + if random.random() < 0.3: + shift = random.randint(-int(0.3 * SAMPLE_RATE), int(0.3 * SAMPLE_RATE)) + aug = np.roll(aug, shift) + if shift > 0: + aug[:shift] = 0.0 + elif shift < 0: + aug[shift:] = 0.0 + + # Clip to prevent overflow + aug = np.clip(aug, -1.0, 1.0) + + return aug + + +# --------------------------------------------------------------------------- +# PyTorch Dataset +# --------------------------------------------------------------------------- + +class SmartTurnDataset(Dataset): + """In-memory dataset of pre-processed audio samples.""" + + def __init__( + 
self, + samples: list[AudioSample], + feature_extractor: WhisperFeatureExtractor, + augment: bool = False, + ): + self.samples = samples + self.feature_extractor = feature_extractor + self.augment = augment + + def __len__(self) -> int: + return len(self.samples) + + def __getitem__(self, idx: int) -> dict: + sample = self.samples[idx] + audio = sample.audio + + if self.augment: + audio = augment_audio(audio) + + inputs = self.feature_extractor( + audio, + sampling_rate=SAMPLE_RATE, + return_tensors="np", + padding="max_length", + max_length=WINDOW_SAMPLES, + truncation=True, + do_normalize=True, + ) + + features = inputs.input_features.squeeze(0).astype(np.float32) + + # Apply label smoothing: 0→0.05, 1→0.95 (improves calibration) + smooth_label = sample.label * (1 - 2 * LABEL_SMOOTH) + LABEL_SMOOTH + + return { + "input_features": torch.from_numpy(features), + "labels": torch.tensor(smooth_label, dtype=torch.float32), + } + + +# --------------------------------------------------------------------------- +# Speaker-based train/val/test split +# --------------------------------------------------------------------------- + +def split_by_speaker( + samples: list[AudioSample], + val_frac: float = 0.1, + test_frac: float = 0.1, +) -> tuple[list[AudioSample], list[AudioSample], list[AudioSample]]: + """Split samples by speaker to avoid data leakage.""" + speaker_samples: dict[str, list[AudioSample]] = {} + for s in samples: + speaker_samples.setdefault(s.speaker_id, []).append(s) + + speakers = list(speaker_samples.keys()) + random.shuffle(speakers) + + n_val = max(1, int(len(speakers) * val_frac)) + n_test = max(1, int(len(speakers) * test_frac)) + + test_speakers = set(speakers[:n_test]) + val_speakers = set(speakers[n_test:n_test + n_val]) + train_speakers = set(speakers[n_test + n_val:]) + + train = [s for sp in train_speakers for s in speaker_samples[sp]] + val = [s for sp in val_speakers for s in speaker_samples[sp]] + test = [s for sp in test_speakers for s in 
speaker_samples[sp]] + + random.shuffle(train) + random.shuffle(val) + random.shuffle(test) + + # Log class distribution per split + for name, split in [("Train", train), ("Val", val), ("Test", test)]: + n_c = sum(1 for s in split if s.label == 1.0) + n_i = sum(1 for s in split if s.label == 0.0) + n_spk = len(set(s.speaker_id for s in split)) + log.info(" %s: %d samples (%d complete, %d incomplete) from %d speakers", + name, len(split), n_c, n_i, n_spk) + + return train, val, test + + +# --------------------------------------------------------------------------- +# Training +# --------------------------------------------------------------------------- + +def train( + epochs: int = 30, + batch_size: int = 32, + lr: float = 3e-5, + max_samples_per_dataset: int = 50000, + whisper_model: str = "openai/whisper-tiny", +) -> Path: + """Fine-tune Smart Turn v3 on Portuguese data from HuggingFace.""" + + if torch.cuda.is_available(): + device = "cuda" + elif torch.backends.mps.is_available(): + device = "mps" + else: + device = "cpu" + log.info("Training on device: %s", device) + if device == "cuda": + log.info("GPU: %s (%d MB)", torch.cuda.get_device_name(), + torch.cuda.get_device_properties(0).total_memory // 1024 // 1024) + + # ----- Load datasets (conversational only, no audiobooks) ----- + t0 = time.time() + all_samples: list[AudioSample] = [] + + coraa = load_coraa_samples(max_samples=max_samples_per_dataset) + all_samples.extend(coraa) + del coraa + gc.collect() + + mupe = load_mupe_samples(max_samples=max_samples_per_dataset) + all_samples.extend(mupe) + del mupe + gc.collect() + + if not all_samples: + raise RuntimeError("No samples loaded! 
Check dataset availability.") + + load_time = time.time() - t0 + n_complete = sum(1 for s in all_samples if s.label == 1.0) + n_incomplete = sum(1 for s in all_samples if s.label == 0.0) + n_speakers = len(set(s.speaker_id for s in all_samples)) + + log.info("Total: %d samples (%d complete, %d incomplete) from %d speakers in %.0fs", + len(all_samples), n_complete, n_incomplete, n_speakers, load_time) + + # Source distribution + sources = {} + for s in all_samples: + sources[s.source] = sources.get(s.source, 0) + 1 + for src, cnt in sorted(sources.items()): + log.info(" %s: %d samples", src, cnt) + + # Log some label examples + log.info("=== Label examples ===") + for s in random.sample(all_samples, min(10, len(all_samples))): + label_str = "COMPLETE" if s.label == 1.0 else "INCOMPLETE" + log.info(" [%s] %s: '%.60s'", label_str, s.source, s.text) + + # ----- Split by speaker ----- + log.info("=== Splitting by speaker ===") + train_samples, val_samples, test_samples = split_by_speaker(all_samples) + + # ----- Create datasets ----- + # Whisper Tiny uses 80 mel bins + feature_extractor = WhisperFeatureExtractor(chunk_length=8) + + train_ds = SmartTurnDataset(train_samples, feature_extractor, augment=True) + val_ds = SmartTurnDataset(val_samples, feature_extractor, augment=False) + test_ds = SmartTurnDataset(test_samples, feature_extractor, augment=False) + + # Balanced sampler for training + train_labels = [s.label for s in train_samples] + n_pos = sum(1 for l in train_labels if l == 1.0) + n_neg = len(train_labels) - n_pos + weights = [1.0 / n_neg if l == 0.0 else 1.0 / n_pos for l in train_labels] + sampler = WeightedRandomSampler(weights, len(weights)) + + use_pin = device == "cuda" + n_workers = 4 if device == "cuda" else 0 + train_loader = DataLoader( + train_ds, batch_size=batch_size, sampler=sampler, + num_workers=n_workers, pin_memory=use_pin, + ) + val_loader = DataLoader( + val_ds, batch_size=batch_size, shuffle=False, + num_workers=0, pin_memory=use_pin, + ) 
+ test_loader = DataLoader( + test_ds, batch_size=batch_size, shuffle=False, + num_workers=0, pin_memory=use_pin, + ) + + # ----- Model ----- + model = SmartTurnModel(whisper_model=whisper_model).to(device) + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + log.info("Model: %s — %d total params, %d trainable", whisper_model, total_params, trainable_params) + + # Loss — Focal Loss with alpha=0.6 to penalize false positives (boost precision) + # alpha < 1.0 means the model is penalized MORE for false positives than false negatives + # gamma=2.0 focuses training on hard boundary cases (mid-sentence pauses) + pos_weight = torch.tensor([n_neg / max(n_pos, 1)], device=device) + criterion = FocalLoss(gamma=2.0, alpha=0.6, pos_weight=pos_weight) + log.info("FocalLoss: gamma=2.0, alpha=0.6, pos_weight=%.2f (neg=%d, pos=%d)", + pos_weight.item(), n_neg, n_pos) + + # Optimizer — different LR for encoder vs head + encoder_params = list(model.encoder.parameters()) + head_params = list(model.attention.parameters()) + list(model.classifier.parameters()) + optimizer = torch.optim.AdamW([ + {"params": encoder_params, "lr": lr * 0.1}, # Lower LR for pretrained encoder + {"params": head_params, "lr": lr}, + ], weight_decay=0.01) + + # Warmup + cosine decay + total_steps = epochs * len(train_loader) + warmup_steps = len(train_loader) * 2 # 2 epochs warmup + + def lr_lambda(step): + if step < warmup_steps: + return step / warmup_steps + progress = (step - warmup_steps) / max(total_steps - warmup_steps, 1) + return 0.5 * (1 + np.cos(np.pi * progress)) + + scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda) + + # ----- Training loop ----- + OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + best_f1 = 0.0 + best_path = OUTPUT_DIR / "best_model.pt" + resume_path = OUTPUT_DIR / "resume_checkpoint.pt" + patience = 7 # More patience for larger model + patience_counter = 0 + history = [] 
+ start_epoch = 0 + + # Resume from checkpoint if available (survives pod restarts) + if resume_path.exists(): + log.info("=== Resuming from checkpoint %s ===", resume_path) + ckpt = torch.load(resume_path, map_location=device, weights_only=False) + model.load_state_dict(ckpt["model_state_dict"]) + optimizer.load_state_dict(ckpt["optimizer_state_dict"]) + scheduler.load_state_dict(ckpt["scheduler_state_dict"]) + start_epoch = ckpt["epoch"] # resume from NEXT epoch + best_f1 = ckpt.get("best_f1", 0.0) + patience_counter = ckpt.get("patience_counter", 0) + history = ckpt.get("history", []) + log.info(" Resumed at epoch %d, best_f1=%.4f, patience=%d/%d", + start_epoch, best_f1, patience_counter, patience) + + log.info("=== Starting training: %d epochs, batch_size=%d, lr=%.1e ===", epochs, batch_size, lr) + + for epoch in range(start_epoch, epochs): + # Train + model.train() + train_loss = 0.0 + train_correct = 0 + train_total = 0 + t_epoch = time.time() + + for batch_idx, batch in enumerate(train_loader): + features = batch["input_features"].to(device) + labels = batch["labels"].to(device) + + logits = model(features) + loss = criterion(logits, labels) + + optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + scheduler.step() + + train_loss += loss.item() * len(labels) + preds = (torch.sigmoid(logits) > 0.5).float() + train_correct += (preds == labels).sum().item() + train_total += len(labels) + + if batch_idx % 100 == 0 and batch_idx > 0: + log.info(" batch %d/%d loss=%.4f", batch_idx, len(train_loader), + loss.item()) + + # Periodic checkpoint for crash recovery (saves to /workspace) + if batch_idx > 0 and batch_idx % CHECKPOINT_EVERY_BATCHES == 0: + torch.save({ + "model_state_dict": model.state_dict(), + "optimizer_state_dict": optimizer.state_dict(), + "scheduler_state_dict": scheduler.state_dict(), + "epoch": epoch, # current epoch (will resume this epoch from start) + "best_f1": best_f1, + 
"patience_counter": patience_counter, + "history": history, + }, resume_path) + log.info(" checkpoint saved (epoch %d, batch %d)", epoch + 1, batch_idx) + + # Validate + model.eval() + val_metrics = _evaluate(model, val_loader, device, criterion) + train_acc = train_correct / max(train_total, 1) + epoch_time = time.time() - t_epoch + + log.info( + "Epoch %d/%d (%.0fs): train_loss=%.4f train_acc=%.3f | " + "val_acc=%.3f val_f1=%.3f prec=%.3f rec=%.3f", + epoch + 1, epochs, epoch_time, + train_loss / max(train_total, 1), train_acc, + val_metrics["accuracy"], val_metrics["f1"], + val_metrics["precision"], val_metrics["recall"], + ) + + history.append({ + "epoch": epoch + 1, + "train_loss": train_loss / max(train_total, 1), + "train_acc": train_acc, + **{f"val_{k}": v for k, v in val_metrics.items()}, + }) + + # Save best + if val_metrics["f1"] > best_f1: + best_f1 = val_metrics["f1"] + torch.save({ + "model_state_dict": model.state_dict(), + "epoch": epoch + 1, + "val_f1": best_f1, + "val_metrics": val_metrics, + "whisper_model": whisper_model, + }, best_path) + log.info(" -> New best model saved (val_f1=%.4f)", best_f1) + patience_counter = 0 + else: + patience_counter += 1 + if patience_counter >= patience: + log.info("Early stopping at epoch %d (no improvement for %d epochs)", + epoch + 1, patience) + break + + # Save resume checkpoint at end of each epoch (next epoch = epoch + 1) + torch.save({ + "model_state_dict": model.state_dict(), + "optimizer_state_dict": optimizer.state_dict(), + "scheduler_state_dict": scheduler.state_dict(), + "epoch": epoch + 1, + "best_f1": best_f1, + "patience_counter": patience_counter, + "history": history, + }, resume_path) + log.info(" epoch checkpoint saved (will resume at epoch %d)", epoch + 2) + + # Clean up resume checkpoint — training completed successfully + if resume_path.exists(): + resume_path.unlink() + log.info("Resume checkpoint removed (training complete)") + + # ----- Test evaluation ----- + log.info("\n=== Final Test 
Evaluation ===") + checkpoint = torch.load(best_path, map_location=device, weights_only=True) + model.load_state_dict(checkpoint["model_state_dict"]) + model.eval() + + test_metrics = _evaluate(model, test_loader, device, criterion) + log.info("Test results (best model from epoch %d) @ threshold=0.5:", checkpoint["epoch"]) + log.info(" Accuracy: %.3f", test_metrics["accuracy"]) + log.info(" Precision: %.3f", test_metrics["precision"]) + log.info(" Recall: %.3f", test_metrics["recall"]) + log.info(" F1: %.3f", test_metrics["f1"]) + log.info(" TP=%d FP=%d FN=%d TN=%d", + test_metrics["tp"], test_metrics["fp"], + test_metrics["fn"], test_metrics["tn"]) + + # ----- Multi-threshold evaluation (find best precision/recall tradeoff) ----- + log.info("\n=== Threshold Sweep ===") + threshold_results = {} + for thresh in [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8]: + t_metrics = _evaluate(model, test_loader, device, criterion, threshold=thresh) + threshold_results[str(thresh)] = t_metrics + log.info(" threshold=%.2f: prec=%.3f rec=%.3f f1=%.3f acc=%.3f (TP=%d FP=%d FN=%d TN=%d)", + thresh, t_metrics["precision"], t_metrics["recall"], + t_metrics["f1"], t_metrics["accuracy"], + t_metrics["tp"], t_metrics["fp"], t_metrics["fn"], t_metrics["tn"]) + + # Find best threshold for precision >= 85% + best_thresh = 0.5 + best_thresh_f1 = 0.0 + for thresh_str, t_m in threshold_results.items(): + if t_m["precision"] >= 0.85 and t_m["f1"] > best_thresh_f1: + best_thresh = float(thresh_str) + best_thresh_f1 = t_m["f1"] + if best_thresh > 0.5: + log.info(" -> Recommended threshold: %.2f (precision>=85%%, best F1=%.3f)", + best_thresh, best_thresh_f1) + else: + log.info(" -> No threshold achieves >=85%% precision; using 0.5") + + # ----- Export to ONNX ----- + model = model.to("cpu") + onnx_path = OUTPUT_DIR / "smart_turn_pt_v3.onnx" + dummy = torch.randn(1, 80, 800) + try: + torch.onnx.export( + model, + dummy, + str(onnx_path), + input_names=["input_features"], + output_names=["logits"], + 
dynamic_axes={"input_features": {0: "batch"}, "logits": {0: "batch"}},
+            opset_version=17,
+        )
+        log.info("ONNX model exported to %s", onnx_path)
+    except Exception as e:
+        log.warning("ONNX export failed: %s — saving PyTorch model only", e)
+
+    # ----- Save results -----
+    # Everything needed to reproduce/compare this run: data split sizes,
+    # best-epoch val metrics, final test metrics, per-epoch history, and
+    # the threshold sweep used to pick the deployment threshold.
+    results = {
+        "model": "smart_turn_pt_v3",
+        "whisper_model": whisper_model,
+        "total_samples": len(all_samples),
+        "n_speakers": n_speakers,
+        "sources": sources,
+        "train_samples": len(train_samples),
+        "val_samples": len(val_samples),
+        "test_samples": len(test_samples),
+        "best_epoch": checkpoint["epoch"],
+        "best_val_f1": best_f1,
+        "val_metrics": checkpoint["val_metrics"],
+        "test_metrics": test_metrics,
+        "history": history,
+        "threshold_sweep": threshold_results,
+        "recommended_threshold": best_thresh,
+        "improvements_over_v2": [
+            "punctuation-based labels instead of random cuts",
+            "removed MLS audiobook data",
+            "whisper-tiny (39M) — same backbone as original Pipecat Smart Turn v3",
+            "fixed MUPE speaker_id",
+            "better augmentation (speed perturbation, time shift)",
+            "warmup + cosine decay LR schedule",
+            "focal loss (gamma=2, alpha=0.6) — penalizes false positives",
+            "label smoothing (0.05) — improves calibration",
+            "multi-threshold evaluation — finds optimal precision/recall tradeoff",
+        ],
+        "config": {
+            "epochs": epochs,
+            "batch_size": batch_size,
+            "lr": lr,
+            "max_samples_per_dataset": max_samples_per_dataset,
+            "patience": patience,
+        },
+    }
+
+    results_path = OUTPUT_DIR / "training_results.json"
+    with open(results_path, "w") as f:
+        json.dump(results, f, indent=2)
+    log.info("Training results saved to %s", results_path)
+
+    # Return the ONNX path even if export failed above; in that case the
+    # file may not exist — callers should check. NOTE(review): consider
+    # returning best_path as a fallback when export fails.
+    return onnx_path
+
+
+def _evaluate(
+    model: nn.Module,
+    loader: DataLoader,
+    device: str,
+    criterion: nn.Module,
+    threshold: float = 0.5,
+) -> dict:
+    """Evaluate ``model`` over ``loader`` and return metrics at ``threshold``.
+
+    Labels may be smoothed (e.g. 0.05/0.95 rather than 0/1); they are
+    re-hardened at 0.5 before the confusion matrix is computed so the
+    reported metrics reflect the true binary classes.
+
+    Returns a dict with accuracy/precision/recall/f1/loss (rounded to 4
+    decimal places) plus raw confusion counts tp/fp/fn/tn.
+    """
+    correct = 0
+    total = 0
+    tp = fp = fn = tn = 0
+    total_loss = 0.0
+
+    with torch.no_grad():
+        for batch in loader:
+            features = batch["input_features"].to(device)
+            labels = batch["labels"].to(device)
+
+            logits = model(features)
+            loss = criterion(logits, labels)
+            total_loss += loss.item() * len(labels)
+
+            preds = (torch.sigmoid(logits) > threshold).float()
+            # Compare against hard labels (undo label smoothing for eval)
+            hard_labels = (labels > 0.5).float()
+            correct += (preds == hard_labels).sum().item()
+            total += len(labels)
+
+            tp += ((preds == 1) & (hard_labels == 1)).sum().item()
+            fp += ((preds == 1) & (hard_labels == 0)).sum().item()
+            fn += ((preds == 0) & (hard_labels == 1)).sum().item()
+            tn += ((preds == 0) & (hard_labels == 0)).sum().item()
+
+    # max(..., 1) guards against division by zero on empty loaders or
+    # degenerate confusion cells.
+    accuracy = correct / max(total, 1)
+    precision = tp / max(tp + fp, 1)
+    recall = tp / max(tp + fn, 1)
+    f1 = 2 * precision * recall / max(precision + recall, 1e-8)
+
+    return {
+        "accuracy": round(accuracy, 4),
+        "precision": round(precision, 4),
+        "recall": round(recall, 4),
+        "f1": round(f1, 4),
+        "loss": round(total_loss / max(total, 1), 4),
+        "tp": tp, "fp": fp, "fn": fn, "tn": tn,
+    }
+
+
+if __name__ == "__main__":
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
+    )
+    # Seed all RNGs for reproducible data splits and augmentation.
+    random.seed(42)
+    np.random.seed(42)
+    torch.manual_seed(42)
+
+    onnx_path = train(
+        epochs=30,
+        batch_size=32,
+        lr=3e-5,
+        max_samples_per_dataset=7500,
+        whisper_model="openai/whisper-tiny",
+    )
+    log.info("Done! Model: %s", onnx_path)
diff --git a/previous-experiments/02-finetune-scratch/modal_finetune.py b/previous-experiments/02-finetune-scratch/modal_finetune.py
new file mode 100644
index 0000000000000000000000000000000000000000..b75d04caddceffd57ced7353a78bb1755a7cd06a
--- /dev/null
+++ b/previous-experiments/02-finetune-scratch/modal_finetune.py
@@ -0,0 +1,141 @@
+"""Deploy fine-tuning on Modal.
+
+Modal supports custom Docker images, GPU selection, and long-running jobs. 
+This bypasses the gateway's deploy pipeline since it only supports the +translation pipeline on Modal, not arbitrary workloads. +""" + +import modal +from pathlib import Path + +SCRIPT_DIR = Path(__file__).parent + +app = modal.App("babelcast-finetune-smart-turn-v3-focal") + +# Docker image with PyTorch + deps + training script baked in +image = ( + modal.Image.debian_slim(python_version="3.11") + .pip_install( + "torch", + "torchaudio", + "transformers", + "datasets>=2.18,<3.0", + "soundfile", + "librosa", + "numpy", + ) + .apt_install("ffmpeg", "libsndfile1") + .add_local_file( + str(SCRIPT_DIR / "finetune_smart_turn_v3.py"), + remote_path="/root/finetune_smart_turn_v3.py", + ) +) + +# Persistent volume for checkpoints + HF cache +vol = modal.Volume.from_name("finetune-smart-turn-v3-focal", create_if_missing=True) + + +@app.function( + image=image, + gpu="A10G", # cheapest Modal GPU, 24GB VRAM — sufficient for Whisper Tiny fine-tuning + timeout=4 * 3600, # 4 hours max + volumes={"/workspace": vol}, +) +def run_finetune(): + """Run the fine-tuning script on a GPU.""" + import subprocess + import sys + import os + + os.environ["HF_HOME"] = "/workspace/hf_cache" + os.environ["TRANSFORMERS_CACHE"] = "/workspace/hf_cache" + + # Copy baked-in script to workspace + script_dest = Path("/workspace/finetune_smart_turn_v3.py") + script_dest.write_text(Path("/root/finetune_smart_turn_v3.py").read_text()) + + # Check for existing checkpoint + ckpt = Path("/workspace/checkpoints/smart_turn_pt_v3/resume_checkpoint.pt") + if ckpt.exists(): + print("[modal] Found resume checkpoint — continuing training") + else: + print("[modal] Starting fresh training") + + # Run training + print("[modal] Starting fine-tuning...") + result = subprocess.run( + [sys.executable, str(script_dest)], + cwd="/workspace", + env={**os.environ}, + ) + + # Commit volume changes (checkpoints, results) + vol.commit() + + if result.returncode != 0: + raise RuntimeError(f"Training failed with exit code 
{result.returncode}") + + # List results + results_dir = Path("/workspace/checkpoints/smart_turn_pt_v3") + if results_dir.exists(): + print("\n[modal] Training results:") + for f in sorted(results_dir.iterdir()): + size_mb = f.stat().st_size / 1024 / 1024 + print(f" {f.name}: {size_mb:.1f} MB") + + return "Training complete!" + + +@app.function( + image=image, + volumes={"/workspace": vol}, +) +def check_status(): + """Check training status (checkpoint existence, results).""" + from pathlib import Path + + results_dir = Path("/workspace/checkpoints/smart_turn_pt_v3") + status = { + "has_checkpoint": (results_dir / "resume_checkpoint.pt").exists(), + "done": (results_dir / "training_results.json").exists(), + } + + if status["done"]: + import json + results = json.loads((results_dir / "training_results.json").read_text()) + status["results"] = results + + if results_dir.exists(): + status["files"] = [f.name for f in sorted(results_dir.iterdir())] + + return status + + +@app.function( + image=image, + volumes={"/workspace": vol}, +) +def download_results() -> dict: + """Download training results from the volume.""" + results_dir = Path("/workspace/checkpoints/smart_turn_pt_v3") + + files = {} + for f in results_dir.iterdir(): + if f.suffix in (".json", ".txt"): + files[f.name] = f.read_text() + elif f.suffix in (".onnx", ".pt"): + files[f.name] = f"[binary, {f.stat().st_size / 1024 / 1024:.1f} MB]" + + return files + + +@app.local_entrypoint() +def main(): + print("Starting fine-tuning on Modal...") + print(f"Script: {SCRIPT_DIR / 'finetune_smart_turn_v3.py'}") + print(f"GPU: A10G (24GB VRAM)") + print(f"Timeout: 4 hours") + print() + + result = run_finetune.remote() + print(f"\nResult: {result}") diff --git a/previous-experiments/02-finetune-scratch/requirements.txt b/previous-experiments/02-finetune-scratch/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..de7b94ee68742c86b1efc4c79c164d3b865c0f8e --- /dev/null +++ 
b/previous-experiments/02-finetune-scratch/requirements.txt @@ -0,0 +1,24 @@ +# Turn-Taking Benchmark Dependencies +torch>=2.0.0 +torchaudio>=2.0.0 +transformers>=4.37.0 +datasets>=2.16.0 +huggingface_hub>=0.20.0 +soundfile>=0.12.0 +librosa>=0.10.0 +numpy>=1.24.0 +pandas>=1.5.0 +matplotlib>=3.7.0 +seaborn>=0.12.0 +scikit-learn>=1.3.0 +tabulate>=0.9.0 +tqdm>=4.65.0 +onnxruntime>=1.16.0 +# VAP dependencies +einops>=0.7.0 +lightning>=2.0.0 +omegaconf>=2.3.0 +# Silero VAD +silero-vad>=5.0 +# Report generation +jinja2>=3.1.0 diff --git a/previous-experiments/02-finetune-scratch/results-focal/training_results.json b/previous-experiments/02-finetune-scratch/results-focal/training_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e5aee6b7e9e316f2d575d579874dd115df409eef --- /dev/null +++ b/previous-experiments/02-finetune-scratch/results-focal/training_results.json @@ -0,0 +1,375 @@ +{ + "model": "smart_turn_pt_v3", + "whisper_model": "openai/whisper-tiny", + "total_samples": 15000, + "n_speakers": 5590, + "sources": { + "coraa": 7500, + "mupe": 7500 + }, + "train_samples": 11725, + "val_samples": 1587, + "test_samples": 1688, + "best_epoch": 10, + "best_val_f1": 0.7885, + "val_metrics": { + "accuracy": 0.7599, + "precision": 0.7143, + "recall": 0.8798, + "f1": 0.7885, + "loss": 0.062, + "tp": 710, + "fp": 284, + "fn": 97, + "tn": 496 + }, + "test_metrics": { + "accuracy": 0.782, + "precision": 0.7198, + "recall": 0.8953, + "f1": 0.798, + "loss": 0.0568, + "tp": 727, + "fp": 283, + "fn": 85, + "tn": 593 + }, + "history": [ + { + "epoch": 1, + "train_loss": 0.08304905337883212, + "train_acc": 0.0, + "val_accuracy": 0.5577, + "val_precision": 0.5367, + "val_recall": 0.9517, + "val_f1": 0.6863, + "val_loss": 0.0804, + "val_tp": 768, + "val_fp": 663, + "val_fn": 39, + "val_tn": 117 + }, + { + "epoch": 2, + "train_loss": 0.07518008057306062, + "train_acc": 0.0, + "val_accuracy": 0.6843, + "val_precision": 0.6314, + "val_recall": 0.9108, + 
"val_f1": 0.7458, + "val_loss": 0.0701, + "val_tp": 735, + "val_fp": 429, + "val_fn": 72, + "val_tn": 351 + }, + { + "epoch": 3, + "train_loss": 0.06932135441981907, + "train_acc": 0.0, + "val_accuracy": 0.729, + "val_precision": 0.7205, + "val_recall": 0.7633, + "val_f1": 0.7413, + "val_loss": 0.0673, + "val_tp": 616, + "val_fp": 239, + "val_fn": 191, + "val_tn": 541 + }, + { + "epoch": 4, + "train_loss": 0.06531790697847856, + "train_acc": 0.0, + "val_accuracy": 0.729, + "val_precision": 0.6783, + "val_recall": 0.8885, + "val_f1": 0.7693, + "val_loss": 0.0652, + "val_tp": 717, + "val_fp": 340, + "val_fn": 90, + "val_tn": 440 + }, + { + "epoch": 5, + "train_loss": 0.06220142287422599, + "train_acc": 0.0, + "val_accuracy": 0.7429, + "val_precision": 0.7152, + "val_recall": 0.8216, + "val_f1": 0.7647, + "val_loss": 0.0646, + "val_tp": 663, + "val_fp": 264, + "val_fn": 144, + "val_tn": 516 + }, + { + "epoch": 6, + "train_loss": 0.06043359401256545, + "train_acc": 0.0, + "val_accuracy": 0.7549, + "val_precision": 0.7626, + "val_recall": 0.7522, + "val_f1": 0.7573, + "val_loss": 0.0713, + "val_tp": 607, + "val_fp": 189, + "val_fn": 200, + "val_tn": 591 + }, + { + "epoch": 7, + "train_loss": 0.0590633105011637, + "train_acc": 0.0, + "val_accuracy": 0.7435, + "val_precision": 0.7004, + "val_recall": 0.8662, + "val_f1": 0.7745, + "val_loss": 0.0653, + "val_tp": 699, + "val_fp": 299, + "val_fn": 108, + "val_tn": 481 + }, + { + "epoch": 8, + "train_loss": 0.05824034495330823, + "train_acc": 0.0, + "val_accuracy": 0.7492, + "val_precision": 0.7064, + "val_recall": 0.8674, + "val_f1": 0.7786, + "val_loss": 0.065, + "val_tp": 700, + "val_fp": 291, + "val_fn": 107, + "val_tn": 489 + }, + { + "epoch": 9, + "train_loss": 0.055727968638830346, + "train_acc": 0.0, + "val_accuracy": 0.758, + "val_precision": 0.7248, + "val_recall": 0.8451, + "val_f1": 0.7803, + "val_loss": 0.068, + "val_tp": 682, + "val_fp": 259, + "val_fn": 125, + "val_tn": 521 + }, + { + "epoch": 10, + 
"train_loss": 0.055196978464436684, + "train_acc": 0.0, + "val_accuracy": 0.7599, + "val_precision": 0.7143, + "val_recall": 0.8798, + "val_f1": 0.7885, + "val_loss": 0.062, + "val_tp": 710, + "val_fp": 284, + "val_fn": 97, + "val_tn": 496 + }, + { + "epoch": 11, + "train_loss": 0.05316883711418363, + "train_acc": 0.0, + "val_accuracy": 0.7662, + "val_precision": 0.7449, + "val_recall": 0.8216, + "val_f1": 0.7814, + "val_loss": 0.0669, + "val_tp": 663, + "val_fp": 227, + "val_fn": 144, + "val_tn": 553 + }, + { + "epoch": 12, + "train_loss": 0.05209291440146818, + "train_acc": 0.0, + "val_accuracy": 0.7536, + "val_precision": 0.7105, + "val_recall": 0.8699, + "val_f1": 0.7822, + "val_loss": 0.0689, + "val_tp": 702, + "val_fp": 286, + "val_fn": 105, + "val_tn": 494 + }, + { + "epoch": 13, + "train_loss": 0.050168677500760886, + "train_acc": 0.0, + "val_accuracy": 0.7599, + "val_precision": 0.7169, + "val_recall": 0.8724, + "val_f1": 0.787, + "val_loss": 0.0679, + "val_tp": 704, + "val_fp": 278, + "val_fn": 103, + "val_tn": 502 + }, + { + "epoch": 14, + "train_loss": 0.049125674698970466, + "train_acc": 0.0, + "val_accuracy": 0.7694, + "val_precision": 0.7453, + "val_recall": 0.8302, + "val_f1": 0.7855, + "val_loss": 0.0716, + "val_tp": 670, + "val_fp": 229, + "val_fn": 137, + "val_tn": 551 + }, + { + "epoch": 15, + "train_loss": 0.04660894076754925, + "train_acc": 0.0, + "val_accuracy": 0.7631, + "val_precision": 0.752, + "val_recall": 0.7968, + "val_f1": 0.7738, + "val_loss": 0.0755, + "val_tp": 643, + "val_fp": 212, + "val_fn": 164, + "val_tn": 568 + }, + { + "epoch": 16, + "train_loss": 0.047135610202672894, + "train_acc": 0.0, + "val_accuracy": 0.765, + "val_precision": 0.7565, + "val_recall": 0.7931, + "val_f1": 0.7743, + "val_loss": 0.0768, + "val_tp": 640, + "val_fp": 206, + "val_fn": 167, + "val_tn": 574 + }, + { + "epoch": 17, + "train_loss": 0.04502822972214553, + "train_acc": 0.0, + "val_accuracy": 0.77, + "val_precision": 0.7506, + "val_recall": 0.8203, + 
"val_f1": 0.7839, + "val_loss": 0.0751, + "val_tp": 662, + "val_fp": 220, + "val_fn": 145, + "val_tn": 560 + } + ], + "threshold_sweep": { + "0.5": { + "accuracy": 0.782, + "precision": 0.7198, + "recall": 0.8953, + "f1": 0.798, + "loss": 0.0568, + "tp": 727, + "fp": 283, + "fn": 85, + "tn": 593 + }, + "0.55": { + "accuracy": 0.782, + "precision": 0.744, + "recall": 0.8337, + "f1": 0.7863, + "loss": 0.0568, + "tp": 677, + "fp": 233, + "fn": 135, + "tn": 643 + }, + "0.6": { + "accuracy": 0.7861, + "precision": 0.7917, + "recall": 0.7537, + "f1": 0.7722, + "loss": 0.0568, + "tp": 612, + "fp": 161, + "fn": 200, + "tn": 715 + }, + "0.65": { + "accuracy": 0.7672, + "precision": 0.8299, + "recall": 0.649, + "f1": 0.7284, + "loss": 0.0568, + "tp": 527, + "fp": 108, + "fn": 285, + "tn": 768 + }, + "0.7": { + "accuracy": 0.7322, + "precision": 0.8734, + "recall": 0.5185, + "f1": 0.6507, + "loss": 0.0568, + "tp": 421, + "fp": 61, + "fn": 391, + "tn": 815 + }, + "0.75": { + "accuracy": 0.6783, + "precision": 0.9297, + "recall": 0.3584, + "f1": 0.5173, + "loss": 0.0568, + "tp": 291, + "fp": 22, + "fn": 521, + "tn": 854 + }, + "0.8": { + "accuracy": 0.5983, + "precision": 0.9351, + "recall": 0.1773, + "f1": 0.2981, + "loss": 0.0568, + "tp": 144, + "fp": 10, + "fn": 668, + "tn": 866 + } + }, + "recommended_threshold": 0.7, + "improvements_over_v2": [ + "punctuation-based labels instead of random cuts", + "removed MLS audiobook data", + "whisper-tiny (39M) \u2014 same backbone as original Pipecat Smart Turn v3", + "fixed MUPE speaker_id", + "better augmentation (speed perturbation, time shift)", + "warmup + cosine decay LR schedule", + "focal loss (gamma=2, alpha=0.6) \u2014 penalizes false positives", + "label smoothing (0.05) \u2014 improves calibration", + "multi-threshold evaluation \u2014 finds optimal precision/recall tradeoff" + ], + "config": { + "epochs": 30, + "batch_size": 32, + "lr": 3e-05, + "max_samples_per_dataset": 7500, + "patience": 7 + } +} \ No newline at end 
of file diff --git a/previous-experiments/02-finetune-scratch/results-tiny/training_results.json b/previous-experiments/02-finetune-scratch/results-tiny/training_results.json new file mode 100644 index 0000000000000000000000000000000000000000..598d2da38d4374190c5fc7eed44a346d625f8eb1 --- /dev/null +++ b/previous-experiments/02-finetune-scratch/results-tiny/training_results.json @@ -0,0 +1,335 @@ +{ + "model": "smart_turn_pt_v3", + "whisper_model": "openai/whisper-tiny", + "total_samples": 15000, + "n_speakers": 5590, + "sources": { + "coraa": 7500, + "mupe": 7500 + }, + "train_samples": 11725, + "val_samples": 1587, + "test_samples": 1688, + "best_epoch": 13, + "best_val_f1": 0.7828, + "val_metrics": { + "accuracy": 0.7612, + "precision": 0.7281, + "recall": 0.8463, + "f1": 0.7828, + "loss": 0.5151, + "tp": 683, + "fp": 255, + "fn": 124, + "tn": 525 + }, + "test_metrics": { + "accuracy": 0.7796, + "precision": 0.7326, + "recall": 0.8534, + "f1": 0.7884, + "loss": 0.4692, + "tp": 693, + "fp": 253, + "fn": 119, + "tn": 623 + }, + "history": [ + { + "epoch": 1, + "train_loss": 0.6771236312872311, + "train_acc": 0.5791044776119403, + "val_accuracy": 0.6156, + "val_precision": 0.6194, + "val_recall": 0.6332, + "val_f1": 0.6262, + "val_loss": 0.6583, + "val_tp": 511, + "val_fp": 314, + "val_fn": 296, + "val_tn": 466 + }, + { + "epoch": 2, + "train_loss": 0.6190603873012925, + "train_acc": 0.6643070362473348, + "val_accuracy": 0.6717, + "val_precision": 0.648, + "val_recall": 0.7757, + "val_f1": 0.7061, + "val_loss": 0.5994, + "val_tp": 626, + "val_fp": 340, + "val_fn": 181, + "val_tn": 440 + }, + { + "epoch": 3, + "train_loss": 0.5829633576275189, + "train_acc": 0.6884434968017058, + "val_accuracy": 0.6925, + "val_precision": 0.7074, + "val_recall": 0.6741, + "val_f1": 0.6904, + "val_loss": 0.5619, + "val_tp": 544, + "val_fp": 225, + "val_fn": 263, + "val_tn": 555 + }, + { + "epoch": 4, + "train_loss": 0.5621842032734519, + "train_acc": 0.7078038379530917, + 
"val_accuracy": 0.7089, + "val_precision": 0.6841, + "val_recall": 0.7943, + "val_f1": 0.7351, + "val_loss": 0.5374, + "val_tp": 641, + "val_fp": 296, + "val_fn": 166, + "val_tn": 484 + }, + { + "epoch": 5, + "train_loss": 0.5306087098853675, + "train_acc": 0.7318550106609808, + "val_accuracy": 0.7385, + "val_precision": 0.7432, + "val_recall": 0.7423, + "val_f1": 0.7427, + "val_loss": 0.5258, + "val_tp": 599, + "val_fp": 207, + "val_fn": 208, + "val_tn": 573 + }, + { + "epoch": 6, + "train_loss": 0.5049316295301483, + "train_acc": 0.7482302771855011, + "val_accuracy": 0.741, + "val_precision": 0.7789, + "val_recall": 0.6853, + "val_f1": 0.7291, + "val_loss": 0.538, + "val_tp": 553, + "val_fp": 157, + "val_fn": 254, + "val_tn": 623 + }, + { + "epoch": 7, + "train_loss": 0.49070732887111496, + "train_acc": 0.7631556503198295, + "val_accuracy": 0.7423, + "val_precision": 0.7026, + "val_recall": 0.855, + "val_f1": 0.7714, + "val_loss": 0.5213, + "val_tp": 690, + "val_fp": 292, + "val_fn": 117, + "val_tn": 488 + }, + { + "epoch": 8, + "train_loss": 0.4880887462335355, + "train_acc": 0.7611940298507462, + "val_accuracy": 0.7561, + "val_precision": 0.7239, + "val_recall": 0.8414, + "val_f1": 0.7782, + "val_loss": 0.5072, + "val_tp": 679, + "val_fp": 259, + "val_fn": 128, + "val_tn": 521 + }, + { + "epoch": 9, + "train_loss": 0.46097730000135995, + "train_acc": 0.777910447761194, + "val_accuracy": 0.7643, + "val_precision": 0.7808, + "val_recall": 0.746, + "val_f1": 0.763, + "val_loss": 0.5304, + "val_tp": 602, + "val_fp": 169, + "val_fn": 205, + "val_tn": 611 + }, + { + "epoch": 10, + "train_loss": 0.4558045275124914, + "train_acc": 0.7823454157782516, + "val_accuracy": 0.7549, + "val_precision": 0.7297, + "val_recall": 0.8228, + "val_f1": 0.7734, + "val_loss": 0.4937, + "val_tp": 664, + "val_fp": 246, + "val_fn": 143, + "val_tn": 534 + }, + { + "epoch": 11, + "train_loss": 0.43831022378478224, + "train_acc": 0.7910447761194029, + "val_accuracy": 0.7593, + 
"val_precision": 0.7497, + "val_recall": 0.7906, + "val_f1": 0.7696, + "val_loss": 0.5039, + "val_tp": 638, + "val_fp": 213, + "val_fn": 169, + "val_tn": 567 + }, + { + "epoch": 12, + "train_loss": 0.4263024093601495, + "train_acc": 0.8014498933901919, + "val_accuracy": 0.7631, + "val_precision": 0.7386, + "val_recall": 0.8265, + "val_f1": 0.7801, + "val_loss": 0.5071, + "val_tp": 667, + "val_fp": 236, + "val_fn": 140, + "val_tn": 544 + }, + { + "epoch": 13, + "train_loss": 0.41435765875173786, + "train_acc": 0.8090405117270789, + "val_accuracy": 0.7612, + "val_precision": 0.7281, + "val_recall": 0.8463, + "val_f1": 0.7828, + "val_loss": 0.5151, + "val_tp": 683, + "val_fp": 255, + "val_fn": 124, + "val_tn": 525 + }, + { + "epoch": 14, + "train_loss": 0.40688978574550483, + "train_acc": 0.8128784648187634, + "val_accuracy": 0.7669, + "val_precision": 0.7708, + "val_recall": 0.7708, + "val_f1": 0.7708, + "val_loss": 0.5213, + "val_tp": 622, + "val_fp": 185, + "val_fn": 185, + "val_tn": 595 + }, + { + "epoch": 15, + "train_loss": 0.38817113016841254, + "train_acc": 0.8240511727078891, + "val_accuracy": 0.7555, + "val_precision": 0.7615, + "val_recall": 0.7559, + "val_f1": 0.7587, + "val_loss": 0.5367, + "val_tp": 610, + "val_fp": 191, + "val_fn": 197, + "val_tn": 589 + }, + { + "epoch": 16, + "train_loss": 0.38501013694033187, + "train_acc": 0.8272068230277185, + "val_accuracy": 0.7631, + "val_precision": 0.7924, + "val_recall": 0.7237, + "val_f1": 0.7565, + "val_loss": 0.5479, + "val_tp": 584, + "val_fp": 153, + "val_fn": 223, + "val_tn": 627 + }, + { + "epoch": 17, + "train_loss": 0.3747370283868013, + "train_acc": 0.8355650319829424, + "val_accuracy": 0.7656, + "val_precision": 0.7526, + "val_recall": 0.803, + "val_f1": 0.777, + "val_loss": 0.5343, + "val_tp": 648, + "val_fp": 213, + "val_fn": 159, + "val_tn": 567 + }, + { + "epoch": 18, + "train_loss": 0.38245479307703373, + "train_acc": 0.8266950959488273, + "val_accuracy": 0.7587, + "val_precision": 0.7739, + 
"val_recall": 0.7423, + "val_f1": 0.7577, + "val_loss": 0.5524, + "val_tp": 599, + "val_fp": 175, + "val_fn": 208, + "val_tn": 605 + }, + { + "epoch": 19, + "train_loss": 0.36241013058721383, + "train_acc": 0.8371002132196163, + "val_accuracy": 0.7631, + "val_precision": 0.7677, + "val_recall": 0.7658, + "val_f1": 0.7667, + "val_loss": 0.5466, + "val_tp": 618, + "val_fp": 187, + "val_fn": 189, + "val_tn": 593 + }, + { + "epoch": 20, + "train_loss": 0.3642956393639416, + "train_acc": 0.8332622601279318, + "val_accuracy": 0.7681, + "val_precision": 0.7687, + "val_recall": 0.7782, + "val_f1": 0.7734, + "val_loss": 0.5385, + "val_tp": 628, + "val_fp": 189, + "val_fn": 179, + "val_tn": 591 + } + ], + "improvements_over_v2": [ + "punctuation-based labels instead of random cuts", + "removed MLS audiobook data", + "whisper-tiny (39M) \u2014 same backbone as original Pipecat Smart Turn v3", + "fixed MUPE speaker_id", + "better augmentation (speed perturbation, time shift)", + "warmup + cosine decay LR schedule", + "larger classifier head" + ], + "config": { + "epochs": 30, + "batch_size": 32, + "lr": 3e-05, + "max_samples_per_dataset": 7500, + "patience": 7 + } +} \ No newline at end of file diff --git a/previous-experiments/02-finetune-scratch/start_finetune.sh b/previous-experiments/02-finetune-scratch/start_finetune.sh new file mode 100644 index 0000000000000000000000000000000000000000..5fb1d385383268ea93a7ff0d18038285529df0e7 --- /dev/null +++ b/previous-experiments/02-finetune-scratch/start_finetune.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# Start script for RunPod fine-tuning pod. +# Installs deps (cached in /workspace), then runs training with auto-resume. +# On pod restart, training resumes from the last checkpoint. 
+ +set -e + +echo "=== Smart Turn v3 Fine-tuning Start Script ===" +echo "Date: $(date)" +echo "GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null || echo 'none')" + +# Use /workspace for caching (persists across pod restarts) +export PIP_CACHE_DIR=/workspace/.pip_cache +export HF_HOME=/workspace/huggingface +export TRANSFORMERS_CACHE=/workspace/huggingface +export TMPDIR=/workspace/tmp +mkdir -p "$PIP_CACHE_DIR" "$HF_HOME" "$TMPDIR" + +# Install Python deps (cached in /workspace so fast on restart) +echo "=== Installing dependencies ===" +pip install --quiet torch torchaudio transformers datasets numpy 2>&1 | tail -5 + +# Copy training script to /workspace if not already there +SCRIPT_DIR="/workspace/finetune" +mkdir -p "$SCRIPT_DIR" +if [ -f /finetune_smart_turn_v3.py ]; then + cp /finetune_smart_turn_v3.py "$SCRIPT_DIR/" +elif [ -f /app/finetune_smart_turn_v3.py ]; then + cp /app/finetune_smart_turn_v3.py "$SCRIPT_DIR/" +fi + +# Check for existing checkpoint +if [ -f /workspace/checkpoints/smart_turn_pt_v3/resume_checkpoint.pt ]; then + echo "=== Found resume checkpoint — will continue training ===" +fi + +# Run training +echo "=== Starting training ===" +cd "$SCRIPT_DIR" +python finetune_smart_turn_v3.py 2>&1 | tee /workspace/training.log + +echo "=== Training complete! ===" +echo "Results in /workspace/checkpoints/smart_turn_pt_v3/" +ls -la /workspace/checkpoints/smart_turn_pt_v3/ + +# Keep pod alive so we can download results +echo "=== Pod staying alive for result download. Use runpodctl to get files. 
==="
+sleep infinity
diff --git a/previous-experiments/02-finetune-scratch/vast_finetune.sh b/previous-experiments/02-finetune-scratch/vast_finetune.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9b06607f5c7e8de4d1e07ccde529f29881e149cd
--- /dev/null
+++ b/previous-experiments/02-finetune-scratch/vast_finetune.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Vast.ai onstart script: clone the study repo, install deps, run the
+# v2 fine-tuning script, then idle so results can be collected.
+set -e
+
+echo "=== Smart Turn Portuguese Fine-Tuning — Vast.ai ==="
+echo "Started at: $(date)"
+echo "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo 'no GPU')"
+
+cd /workspace
+
+# Clone repo (skipped if a previous start already cloned it)
+if [ ! -d "turn-taking-study" ]; then
+    git clone https://github.com/marcosremar/turn-taking-study.git
+fi
+cd turn-taking-study
+
+# Install dependencies
+echo "=== Installing dependencies ==="
+pip install --no-cache-dir \
+    torch torchaudio \
+    transformers datasets huggingface_hub \
+    soundfile numpy \
+    onnx onnxruntime \
+    2>&1 | tail -5
+
+echo "=== Dependencies installed ==="
+python -c "import torch; print(f'PyTorch {torch.__version__}, CUDA: {torch.cuda.is_available()}')"
+
+# Run fine-tuning
+echo "=== Starting fine-tuning ==="
+python finetune_smart_turn_gpu.py 2>&1 | tee /workspace/finetune.log
+
+echo "=== DONE at $(date) ==="
+echo "Results in /workspace/turn-taking-study/checkpoints/smart_turn_pt_v2/"
+
+# Keep alive for result collection
+sleep infinity
diff --git a/previous-experiments/02-finetune-scratch/vast_onstart.sh b/previous-experiments/02-finetune-scratch/vast_onstart.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8688b3e9e3c3910fb8b74a99e6a29df6afb0f851
--- /dev/null
+++ b/previous-experiments/02-finetune-scratch/vast_onstart.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+# Vast.ai onstart script for the turn-taking BENCHMARK run (not fine-tuning):
+# clones the study repo plus the VAP model/dataset repos, runs all
+# benchmarks, generates the report, then idles for result collection.
+set -e
+
+echo "=== Turn-Taking Benchmark - Vast.ai Runner ==="
+echo "Started at: $(date)"
+
+cd /workspace
+
+# Clone benchmark repo
+if [ ! -d "turn-taking-study" ]; then
+    git clone https://github.com/marcosremar/turn-taking-study.git
+fi
+cd turn-taking-study
+
+# Install Python deps
+pip install --no-cache-dir -r requirements.txt 2>&1 | tail -5
+
+# Install VAP (editable install from upstream repo)
+if [ ! -d "/workspace/vap" ]; then
+    echo "=== Cloning VAP ==="
+    git clone https://github.com/ErikEkstedt/VoiceActivityProjection.git /workspace/vap
+    cd /workspace/vap && pip install -e . 2>&1 | tail -5
+    cd /workspace/turn-taking-study
+fi
+
+# Install VAP dataset tools
+if [ ! -d "/workspace/vap_dataset" ]; then
+    echo "=== Cloning VAP Dataset Tools ==="
+    git clone https://github.com/ErikEkstedt/vap_dataset.git /workspace/vap_dataset
+    cd /workspace/vap_dataset && pip install -e . 2>&1 | tail -5
+    cd /workspace/turn-taking-study
+fi
+
+echo "=== Setup complete, running benchmarks ==="
+
+# Run benchmarks
+python run_benchmarks.py --all 2>&1 | tee /workspace/benchmark.log
+
+echo "=== Generating report ==="
+python generate_report.py 2>&1 | tee -a /workspace/benchmark.log
+
+echo "=== DONE at $(date) ==="
+echo "Results in /workspace/turn-taking-study/results/"
+echo "Report in /workspace/turn-taking-study/report/"
+
+# Keep instance alive for result collection
+sleep infinity