"""
Space 2 — Delivery Analyzer.

Thin-client architecture (the free-tools version of the Mistral Voxtral
pipeline pattern described in research-journal.md, Weeks 5–7):

    audio upload
      -> Hugging Face Inference API: openai/whisper-small
         (return_timestamps='word' for word-level start/end times)
      -> pure-Python prosodic feature extraction
      -> Gradio output

No local model loading. The Space boots in seconds on free-tier CPU because
it doesn't hold any weights in memory — Whisper runs on Hugging Face's
servers via the Inference API and this Space just formats the request and
reads the response.

Requires a Hugging Face token in the HF_TOKEN Space secret (read access is
enough; the free Inference API tier is rate-limited but sufficient for demo
use).

See research-journal.md, Week 8, for the first real numbers table and
Week 10 for the end-to-end evaluation.
"""

import json
import os
import statistics
from typing import Any

import gradio as gr
import requests

HF_TOKEN = os.environ.get("HF_TOKEN", "")
WHISPER_URL = "https://api-inference.huggingface.co/models/openai/whisper-small"

# A gap between consecutive words counts as a pause only above this length.
PAUSE_THRESHOLD_SECONDS = 0.4
# Below this many words the features are considered too noisy to report.
MIN_WORDS_FOR_RELIABLE_FEATURES = 20


class ShortClipError(ValueError):
    """Raised when a clip has too few transcribed words for reliable features.

    Subclasses ValueError so existing callers that catch ValueError keep
    working, while `analyze` can tell this case apart from unrelated
    ValueErrors (bad JSON, malformed numbers, ...).
    """


def transcribe_with_word_timestamps(audio_path: str) -> dict[str, Any]:
    """Send audio file to the HF Inference API and ask for word-level timestamps.

    Args:
        audio_path: Path of the audio file to upload.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        RuntimeError: If HF_TOKEN is unset, the API answers with a non-200
            status, or the response body is not valid JSON.
    """
    if not HF_TOKEN:
        raise RuntimeError(
            "HF_TOKEN is not set. Add it as a Space secret "
            "(Settings -> Variables and secrets -> New secret)."
        )

    with open(audio_path, "rb") as f:
        data = f.read()

    # NOTE(review): Content-Type is always sent as audio/wav, whatever the
    # actual format of the uploaded file is — confirm the API tolerates this.
    headers = {
        "Authorization": f"Bearer {HF_TOKEN}",
        "Content-Type": "audio/wav",
    }
    params = {"return_timestamps": "word"}

    response = requests.post(
        WHISPER_URL,
        headers=headers,
        params=params,
        data=data,
        timeout=120,
    )
    if response.status_code != 200:
        raise RuntimeError(
            f"Inference API error {response.status_code}: {response.text[:500]}"
        )

    try:
        return response.json()
    except ValueError as err:
        # JSON decode errors are ValueError subclasses; re-raise as RuntimeError
        # so they are reported like every other API failure of this function.
        raise RuntimeError(
            f"Inference API returned a non-JSON response: {response.text[:500]}"
        ) from err


def extract_words_with_times(api_response: dict[str, Any]) -> list[dict[str, Any]]:
    """Normalize the Whisper API response into a list of {word, start, end} dicts.

    Entries are read from the "chunks" key, falling back to "words". Each
    entry may carry its text under "text" or "word", and its times either as
    a two-element "timestamp" or as separate "start"/"end" keys.

    Malformed entries (not a dict, empty text, missing or non-numeric times,
    timestamp that is not a pair) are skipped rather than raising. A response
    that is not a dict yields an empty list.

    Args:
        api_response: Decoded JSON body returned by the transcription call.

    Returns:
        One dict per usable word, in response order, with a stripped "word"
        and float "start"/"end" values.
    """
    if not isinstance(api_response, dict):
        return []

    chunks = api_response.get("chunks") or api_response.get("words") or []
    words: list[dict[str, Any]] = []

    for chunk in chunks:
        if not isinstance(chunk, dict):
            continue

        raw_text = chunk.get("text") or chunk.get("word") or ""
        # Strip before the emptiness check so whitespace-only chunks are not
        # counted as words.
        text = raw_text.strip() if isinstance(raw_text, str) else ""
        if not text:
            continue

        stamp = chunk.get("timestamp") or (chunk.get("start"), chunk.get("end"))
        if not isinstance(stamp, (list, tuple)) or len(stamp) != 2:
            continue

        start, end = stamp
        if start is None or end is None:
            continue

        try:
            words.append({"word": text, "start": float(start), "end": float(end)})
        except (TypeError, ValueError):
            # Non-numeric time value: treat the chunk as unusable.
            continue

    return words


def compute_prosodic_features(words: list[dict[str, Any]]) -> dict[str, float]:
    """Compute the four features Prea's journal defines in Week 6.

    1. Words per minute over the whole clip.
    2. Number of pauses longer than PAUSE_THRESHOLD_SECONDS.
    3. Variance of the durations of those pauses.
    4. Variance of words-per-minute across the first, middle, and last
       thirds of the speech.

    Args:
        words: {word, start, end} dicts in spoken order.

    Returns:
        A dict with the four rounded features plus "num_words" and
        "total_duration_seconds".

    Raises:
        ShortClipError: If fewer than MIN_WORDS_FOR_RELIABLE_FEATURES words
            were supplied (a ValueError subclass).
        ValueError: If the span from first word start to last word end is
            zero or negative.
    """
    word_count = len(words)
    if word_count < MIN_WORDS_FOR_RELIABLE_FEATURES:
        raise ShortClipError(
            f"Only {word_count} words transcribed. "
            f"Need at least {MIN_WORDS_FOR_RELIABLE_FEATURES} for reliable features."
        )

    total_duration = words[-1]["end"] - words[0]["start"]
    if total_duration <= 0:
        raise ValueError("Clip has zero or negative duration after transcription.")

    wpm_overall = word_count / (total_duration / 60.0)

    # Pauses: gaps between one word's end and the next word's start.
    gaps = (nxt["start"] - cur["end"] for cur, nxt in zip(words, words[1:]))
    pause_durations = [gap for gap in gaps if gap > PAUSE_THRESHOLD_SECONDS]
    pause_count = len(pause_durations)
    # Population variance needs at least two samples to be meaningful here.
    pause_variance = (
        statistics.pvariance(pause_durations) if pause_count >= 2 else 0.0
    )

    # Speaking-rate variance across thirds. The last third absorbs the
    # remainder when the word count is not divisible by three.
    third = word_count // 3
    rate_variance = 0.0
    if third >= 2:
        sections = (
            words[:third],
            words[third : 2 * third],
            words[2 * third :],
        )
        rates = []
        for section in sections:
            section_duration = section[-1]["end"] - section[0]["start"]
            # Sections with a non-positive span are left out of the variance.
            if section_duration > 0:
                rates.append(len(section) / (section_duration / 60.0))
        if len(rates) >= 2:
            rate_variance = statistics.pvariance(rates)

    return {
        "wpm_overall": round(wpm_overall, 1),
        "pause_count_over_400ms": pause_count,
        "pause_duration_variance": round(pause_variance, 3),
        "speaking_rate_variance_across_thirds": round(rate_variance, 1),
        "num_words": word_count,
        "total_duration_seconds": round(total_duration, 1),
    }


def analyze(audio_path: str) -> tuple[str, str, str]:
    """Run the full pipeline for one clip and format the three UI outputs.

    Args:
        audio_path: Path of the clip to analyze; falsy when no clip was
            provided.

    Returns:
        A (transcript, feature summary, feature JSON) tuple. On any failure
        the first element carries a human-readable message and the other two
        are empty strings.
    """
    if not audio_path:
        return "Please upload or record an audio clip.", "", ""

    try:
        api_response = transcribe_with_word_timestamps(audio_path)
        words = extract_words_with_times(api_response)
        if not words:
            return (
                "Whisper returned no word-level timestamps. Try a longer clip or "
                "check that the audio is a recognizable language.",
                "",
                "",
            )
        features = compute_prosodic_features(words)
    except ShortClipError as e:
        # Only the too-few-words case is a short-clip warning; other
        # ValueErrors fall through to the generic error branch below.
        return f"Short-clip warning: {e}", "", ""
    except Exception as e:  # UI boundary: show the message instead of crashing.
        return f"Error: {e}", "", ""

    transcript = " ".join(w["word"] for w in words)
    feature_lines = [
        f"Speaking rate (wpm): {features['wpm_overall']}",
        f"Pauses longer than 400 ms: {features['pause_count_over_400ms']}",
        f"Pause-duration variance: {features['pause_duration_variance']}",
        f"Speaking-rate variance (thirds): {features['speaking_rate_variance_across_thirds']}",
        f"Words transcribed: {features['num_words']}",
        f"Clip length (s): {features['total_duration_seconds']}",
    ]
    return transcript, "\n".join(feature_lines), json.dumps(features, indent=2)


demo = gr.Interface(
    fn=analyze,
    inputs=gr.Audio(
        sources=["upload", "microphone"],
        type="filepath",
        label="Debate or speech clip (10 seconds to 4 minutes)",
    ),
    outputs=[
        gr.Textbox(label="Transcript (Whisper-small)", lines=6),
        gr.Textbox(label="Prosodic features", lines=8),
        gr.Code(label="Raw feature JSON", language="json"),
    ],
    title="Delivery Analyzer — Space 2",
    description=(
        "This is Prea's Space 2 — the thin-client delivery analyzer. "
        "Uploads audio to the Hugging Face Inference API for Whisper-small "
        "transcription with word-level timestamps, then computes four prosodic "
        "features in pure Python: words per minute, pause count above 400 ms, "
        "pause-duration variance, and speaking-rate variance across thirds of "
        "the clip. No local model weights are loaded in this Space, so it "
        "boots in seconds on free-tier CPU. See research-journal.md, Weeks "
        "7–8, for the architectural pivot that led to this design."
    ),
    allow_flagging="never",
    theme=gr.themes.Soft(),
)


if __name__ == "__main__":
    demo.launch()