"""
Space 2 — Delivery Analyzer.
Thin-client architecture (the free-tools version of the Mistral Voxtral
pipeline pattern described in research-journal.md, Weeks 5–7):
audio upload
-> Hugging Face Inference API: openai/whisper-small
(return_timestamps='word' for word-level start/end times)
-> pure-Python prosodic feature extraction
-> Gradio output
No local model loading. The Space boots in seconds on free-tier CPU because
it doesn't hold any weights in memory — Whisper runs on Hugging Face's
servers via the Inference API and this Space just formats the request and
reads the response.
Requires a Hugging Face token in the HF_TOKEN Space secret (read access is
enough; the free Inference API tier is rate-limited but sufficient for
demo use).
See research-journal.md, Week 8, for the first real numbers table and
Week 10 for the end-to-end evaluation.
"""
import json
import os
import statistics
from typing import Any
import gradio as gr
import requests
# Access token for the Hugging Face Inference API, read from the environment.
# An empty string means "not configured".
HF_TOKEN = os.getenv("HF_TOKEN", "")

# Inference API endpoint for the hosted openai/whisper-small model.
WHISPER_URL = "https://api-inference.huggingface.co/models/openai/whisper-small"

# A gap between consecutive words counts as a pause only above this length.
PAUSE_THRESHOLD_SECONDS = 0.4

# Below this many transcribed words the feature computation refuses to run.
MIN_WORDS_FOR_RELIABLE_FEATURES = 20
# Content-Type values for the audio containers a user is likely to upload.
# Anything not listed here falls back to "audio/wav".
_AUDIO_CONTENT_TYPES = {
    ".wav": "audio/wav",
    ".mp3": "audio/mpeg",
    ".flac": "audio/flac",
    ".ogg": "audio/ogg",
    ".webm": "audio/webm",
}


def transcribe_with_word_timestamps(audio_path: str) -> dict[str, Any]:
    """Send an audio file to the HF Inference API and ask for word-level timestamps.

    Args:
        audio_path: Path of the audio file to upload. Its extension selects
            the Content-Type header; unknown extensions are sent as
            "audio/wav".

    Returns:
        The decoded JSON body of the API response.

    Raises:
        RuntimeError: If HF_TOKEN is empty, the API answers with a non-200
            status, or the response body is not valid JSON.
        OSError: If the audio file cannot be read.
    """
    if not HF_TOKEN:
        raise RuntimeError(
            "HF_TOKEN is not set. Add it as a Space secret "
            "(Settings -> Variables and secrets -> New secret)."
        )

    with open(audio_path, "rb") as f:
        data = f.read()

    # Uploaded clips are not necessarily WAV, so label the payload by its
    # extension instead of always claiming "audio/wav".
    extension = os.path.splitext(audio_path)[1].lower()
    headers = {
        "Authorization": f"Bearer {HF_TOKEN}",
        "Content-Type": _AUDIO_CONTENT_TYPES.get(extension, "audio/wav"),
    }
    # NOTE(review): return_timestamps is sent as a URL query parameter —
    # confirm the endpoint honours it in this form for raw-bytes uploads.
    params = {"return_timestamps": "word"}

    response = requests.post(
        WHISPER_URL,
        headers=headers,
        params=params,
        data=data,
        timeout=120,
    )
    if response.status_code != 200:
        raise RuntimeError(
            f"Inference API error {response.status_code}: {response.text[:500]}"
        )

    # A JSON decoding failure is a ValueError subclass; surface it as a
    # RuntimeError like every other API failure raised by this function.
    try:
        return response.json()
    except ValueError as err:
        raise RuntimeError(
            f"Inference API returned a non-JSON response: {response.text[:500]}"
        ) from err
def extract_words_with_times(api_response: dict[str, Any]) -> list[dict[str, Any]]:
    """Normalize the Whisper API response into a list of {word, start, end} dicts.

    Entries are read from the "chunks" key, or from "words" when "chunks" is
    missing or empty. Each entry may carry its text under "text" or "word"
    and its timing either as a two-item "timestamp" sequence or as separate
    "start" / "end" keys.

    Malformed entries are skipped rather than raising: blank or
    whitespace-only text, a timestamp that is not a two-item sequence, a
    missing start or end, or a non-numeric time value.

    Args:
        api_response: Decoded JSON body returned by the transcription API.

    Returns:
        One dict per usable word with keys "word" (stripped text), "start"
        and "end" (floats), in the order the API returned them. An empty
        list if the response is not a dict or holds no usable entries.
    """
    if not isinstance(api_response, dict):
        return []

    chunks = api_response.get("chunks") or api_response.get("words") or []
    words: list[dict[str, Any]] = []
    for chunk in chunks:
        if not isinstance(chunk, dict):
            continue

        # Strip before the emptiness check so whitespace-only text is not
        # counted as a word.
        raw_text = chunk.get("text") or chunk.get("word") or ""
        text = raw_text.strip() if isinstance(raw_text, str) else ""
        if not text:
            continue

        span = chunk.get("timestamp") or (chunk.get("start"), chunk.get("end"))
        if not isinstance(span, (list, tuple)) or len(span) != 2:
            continue
        start, end = span
        if start is None or end is None:
            continue

        try:
            words.append({"word": text, "start": float(start), "end": float(end)})
        except (TypeError, ValueError):
            # Non-numeric time values: drop this entry, keep the rest.
            continue
    return words
def compute_prosodic_features(words: list[dict[str, Any]]) -> dict[str, float]:
    """Derive the delivery metrics from a list of timed words.

    The returned dict holds:
      * overall words per minute for the clip,
      * the number of inter-word gaps longer than PAUSE_THRESHOLD_SECONDS,
      * the population variance of those gap durations,
      * the population variance of words-per-minute across the first,
        middle and last thirds of the words,
      * the word count and the clip duration in seconds.

    Raises:
        ValueError: If fewer than MIN_WORDS_FOR_RELIABLE_FEATURES words are
            given, or if the span from the first word's start to the last
            word's end is not positive.
    """
    word_count = len(words)
    if word_count < MIN_WORDS_FOR_RELIABLE_FEATURES:
        raise ValueError(
            f"Only {len(words)} words transcribed. "
            f"Need at least {MIN_WORDS_FOR_RELIABLE_FEATURES} for reliable features."
        )

    total_duration = words[-1]["end"] - words[0]["start"]
    if total_duration <= 0:
        raise ValueError("Clip has zero or negative duration after transcription.")
    wpm_overall = word_count / (total_duration / 60.0)

    # A pause is the silence between one word's end and the next word's start.
    gaps = (
        following["start"] - current["end"]
        for current, following in zip(words, words[1:])
    )
    pause_durations = [gap for gap in gaps if gap > PAUSE_THRESHOLD_SECONDS]
    pause_variance = 0.0
    if len(pause_durations) >= 2:
        pause_variance = statistics.pvariance(pause_durations)

    # Speaking rate per third; the last third absorbs any remainder words.
    rate_variance = 0.0
    third = word_count // 3
    if third >= 2:
        sections = (words[:third], words[third : 2 * third], words[2 * third :])
        timed_sections = [
            (len(section), section[-1]["end"] - section[0]["start"])
            for section in sections
        ]
        # Sections with a non-positive span cannot yield a rate; leave them out.
        rates = [
            size / (span / 60.0) for size, span in timed_sections if span > 0
        ]
        if len(rates) >= 2:
            rate_variance = statistics.pvariance(rates)

    return {
        "wpm_overall": round(wpm_overall, 1),
        "pause_count_over_400ms": len(pause_durations),
        "pause_duration_variance": round(pause_variance, 3),
        "speaking_rate_variance_across_thirds": round(rate_variance, 1),
        "num_words": len(words),
        "total_duration_seconds": round(total_duration, 1),
    }
def analyze(audio_path: str) -> tuple[str, str, str]:
    """Run the full pipeline for one clip and format the results for the UI.

    Args:
        audio_path: Path of the uploaded or recorded audio file. A falsy
            value means no clip was provided.

    Returns:
        A 3-tuple of strings: the transcript, a human-readable feature
        summary, and the features as indented JSON. On any failure the first
        item carries the message and the other two are empty strings.
    """
    if not audio_path:
        return "Please upload or record an audio clip.", "", ""

    # Transcription and parsing failures are reported as plain errors. They
    # are kept out of the feature-computation try block so that a ValueError
    # raised here (e.g. a JSON decoding error) is not shown as a short-clip
    # warning.
    try:
        api_response = transcribe_with_word_timestamps(audio_path)
        words = extract_words_with_times(api_response)
    except Exception as e:
        return f"Error: {e}", "", ""

    if not words:
        return (
            "Whisper returned no word-level timestamps. Try a longer clip or "
            "check that the audio is a recognizable language.",
            "",
            "",
        )

    # Only compute_prosodic_features' own ValueError gets the short-clip
    # wording.
    try:
        features = compute_prosodic_features(words)
    except ValueError as e:
        return f"Short-clip warning: {e}", "", ""
    except Exception as e:
        return f"Error: {e}", "", ""

    transcript = " ".join(w["word"] for w in words)
    feature_lines = [
        f"Speaking rate (wpm): {features['wpm_overall']}",
        f"Pauses longer than 400 ms: {features['pause_count_over_400ms']}",
        f"Pause-duration variance: {features['pause_duration_variance']}",
        f"Speaking-rate variance (thirds): {features['speaking_rate_variance_across_thirds']}",
        f"Words transcribed: {features['num_words']}",
        f"Clip length (s): {features['total_duration_seconds']}",
    ]
    return transcript, "\n".join(feature_lines), json.dumps(features, indent=2)
# Gradio UI: one audio input wired to analyze(), three read-only outputs.
_clip_input = gr.Audio(
    sources=["upload", "microphone"],
    type="filepath",  # analyze() receives a path on disk
    label="Debate or speech clip (10 seconds to 4 minutes)",
)

# Order matches the 3-tuple returned by analyze().
_result_panels = [
    gr.Textbox(label="Transcript (Whisper-small)", lines=6),
    gr.Textbox(label="Prosodic features", lines=8),
    gr.Code(label="Raw feature JSON", language="json"),
]

_DEMO_DESCRIPTION = (
    "This is Prea's Space 2 — the thin-client delivery analyzer. "
    "Uploads audio to the Hugging Face Inference API for Whisper-small "
    "transcription with word-level timestamps, then computes four prosodic "
    "features in pure Python: words per minute, pause count above 400 ms, "
    "pause-duration variance, and speaking-rate variance across thirds of "
    "the clip. No local model weights are loaded in this Space, so it "
    "boots in seconds on free-tier CPU. See research-journal.md, Weeks "
    "7–8, for the architectural pivot that led to this design."
)

demo = gr.Interface(
    fn=analyze,
    inputs=_clip_input,
    outputs=_result_panels,
    title="Delivery Analyzer — Space 2",
    description=_DEMO_DESCRIPTION,
    allow_flagging="never",
    theme=gr.themes.Soft(),
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|