File size: 7,451 Bytes
7e7994f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
"""
Space 2 — Delivery Analyzer.

Thin-client architecture (the free-tools version of the Mistral Voxtral
pipeline pattern described in research-journal.md, Weeks 5–7):

    audio upload
        -> Hugging Face Inference API: openai/whisper-small
           (return_timestamps='word' for word-level start/end times)
        -> pure-Python prosodic feature extraction
        -> Gradio output

No local model loading. The Space boots in seconds on free-tier CPU because
it doesn't hold any weights in memory — Whisper runs on Hugging Face's
servers via the Inference API and this Space just formats the request and
reads the response.

Requires a Hugging Face token in the HF_TOKEN Space secret (read access is
enough; the free Inference API tier is rate-limited but sufficient for
demo use).

See research-journal.md, Week 8, for the first real numbers table and
Week 10 for the end-to-end evaluation.
"""

import json
import os
import statistics
from typing import Any

import gradio as gr
import requests

# Hugging Face access token, read from the environment; empty string if unset.
HF_TOKEN = os.getenv("HF_TOKEN", "")
# Inference API endpoint the audio is POSTed to.
WHISPER_URL = "https://api-inference.huggingface.co/models/openai/whisper-small"
# A gap between consecutive words counts as a pause only when longer than this.
PAUSE_THRESHOLD_SECONDS = 0.4
# Feature extraction refuses clips with fewer transcribed words than this.
MIN_WORDS_FOR_RELIABLE_FEATURES = 20


def transcribe_with_word_timestamps(audio_path: str) -> dict[str, Any]:
    """Send an audio file to the HF Inference API and ask for word-level timestamps.

    The file is read fully into memory and POSTed as the raw request body to
    ``WHISPER_URL`` with ``return_timestamps=word`` as a query parameter.

    Args:
        audio_path: Path to the audio file on local disk.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        RuntimeError: If ``HF_TOKEN`` is empty, the API answers with a
            non-200 status, or the response body is not valid JSON.
        OSError: If the audio file cannot be opened or read.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import mimetypes

    if not HF_TOKEN:
        raise RuntimeError(
            "HF_TOKEN is not set. Add it as a Space secret "
            "(Settings -> Variables and secrets -> New secret)."
        )

    # Label the upload with its actual media type instead of always claiming
    # WAV. Unknown or non-audio guesses fall back to the previous hard-coded
    # value, so those cases behave exactly as before.
    guessed_type, _ = mimetypes.guess_type(audio_path)
    if guessed_type in ("audio/x-wav", "audio/vnd.wave"):
        # mimetypes reports WAV under an alias; keep the header value that
        # .wav uploads have always been sent with.
        guessed_type = "audio/wav"
    if guessed_type and guessed_type.startswith("audio/"):
        content_type = guessed_type
    else:
        content_type = "audio/wav"

    with open(audio_path, "rb") as f:
        data = f.read()
    headers = {
        "Authorization": f"Bearer {HF_TOKEN}",
        "Content-Type": content_type,
    }
    params = {"return_timestamps": "word"}
    response = requests.post(
        WHISPER_URL,
        headers=headers,
        params=params,
        data=data,
        timeout=120,
    )
    if response.status_code != 200:
        raise RuntimeError(
            f"Inference API error {response.status_code}: {response.text[:500]}"
        )
    try:
        return response.json()
    except ValueError as err:
        # JSON decode errors are ValueError subclasses; re-raise as
        # RuntimeError so callers that treat ValueError as a "clip too short"
        # signal do not mislabel a malformed API response.
        raise RuntimeError(
            f"Inference API returned a non-JSON response: {response.text[:500]}"
        ) from err


def extract_words_with_times(api_response: dict[str, Any]) -> list[dict[str, Any]]:
    """Normalize the Whisper API response into a list of {word, start, end} dicts.

    Entries are read from the response's ``"chunks"`` list, or ``"words"`` if
    ``"chunks"`` is missing or empty. Each entry supplies its text under
    ``"text"`` or ``"word"``, and its times either as a two-item
    ``"timestamp"`` sequence or as separate ``"start"`` / ``"end"`` keys.

    Malformed entries are skipped rather than raising: non-dict entries,
    text that is missing or blank after stripping, timestamps that are not
    a two-item sequence, a missing start or end, and times that cannot be
    converted to ``float``.

    Args:
        api_response: Decoded JSON response from the transcription call.

    Returns:
        One ``{"word": str, "start": float, "end": float}`` dict per usable
        entry, in response order. Empty if the response is not a dict or has
        no usable entries.
    """
    if not isinstance(api_response, dict):
        return []

    chunks = api_response.get("chunks") or api_response.get("words") or []
    words: list[dict[str, Any]] = []
    for chunk in chunks:
        if not isinstance(chunk, dict):
            continue

        text = chunk.get("text") or chunk.get("word") or ""
        # Strip before the emptiness check so whitespace-only tokens are
        # dropped instead of being stored as empty words.
        word = text.strip() if isinstance(text, str) else ""
        if not word:
            continue

        ts = chunk.get("timestamp") or (chunk.get("start"), chunk.get("end"))
        if not isinstance(ts, (list, tuple)) or len(ts) != 2:
            continue
        start, end = ts
        if start is None or end is None:
            continue

        try:
            start_seconds = float(start)
            end_seconds = float(end)
        except (TypeError, ValueError):
            # Unparseable times are treated like any other malformed entry.
            continue
        words.append({"word": word, "start": start_seconds, "end": end_seconds})
    return words


def compute_prosodic_features(words: list[dict[str, Any]]) -> dict[str, float]:
    """Derive the delivery metrics from a list of timed words.

    Each item in ``words`` is read through its ``"start"`` and ``"end"``
    keys. The returned dict holds:

    * ``wpm_overall`` — words per minute from the first word's start to the
      last word's end.
    * ``pause_count_over_400ms`` — number of gaps between consecutive words
      that exceed PAUSE_THRESHOLD_SECONDS.
    * ``pause_duration_variance`` — population variance of those gaps
      (0.0 when there are fewer than two).
    * ``speaking_rate_variance_across_thirds`` — population variance of the
      words-per-minute rate of the first, middle, and last thirds.
    * ``num_words`` and ``total_duration_seconds`` — the word count and the
      overall span in seconds.

    Raises:
        ValueError: If fewer than MIN_WORDS_FOR_RELIABLE_FEATURES words are
            given, or the overall span is zero or negative.
    """
    word_count = len(words)
    if word_count < MIN_WORDS_FOR_RELIABLE_FEATURES:
        raise ValueError(
            f"Only {word_count} words transcribed. "
            f"Need at least {MIN_WORDS_FOR_RELIABLE_FEATURES} for reliable features."
        )

    clip_seconds = words[-1]["end"] - words[0]["start"]
    if clip_seconds <= 0:
        raise ValueError("Clip has zero or negative duration after transcription.")

    overall_rate = word_count / (clip_seconds / 60.0)

    # Silence between one word's end and the next word's start; keep only
    # the gaps long enough to count as pauses.
    long_gaps = []
    for current, following in zip(words, words[1:]):
        gap = following["start"] - current["end"]
        if gap > PAUSE_THRESHOLD_SECONDS:
            long_gaps.append(gap)

    gap_variance = 0.0
    if len(long_gaps) >= 2:
        gap_variance = statistics.pvariance(long_gaps)

    # Split into three runs; the last run absorbs any remainder words.
    segment_size = word_count // 3
    rate_spread = 0.0
    if segment_size >= 2:
        segments = (
            words[:segment_size],
            words[segment_size : 2 * segment_size],
            words[2 * segment_size :],
        )
        segment_rates = []
        for segment in segments:
            seconds = segment[-1]["end"] - segment[0]["start"]
            # Segments with a non-positive span have no usable rate.
            if seconds > 0:
                segment_rates.append(len(segment) / (seconds / 60.0))
        if len(segment_rates) >= 2:
            rate_spread = statistics.pvariance(segment_rates)

    return {
        "wpm_overall": round(overall_rate, 1),
        "pause_count_over_400ms": len(long_gaps),
        "pause_duration_variance": round(gap_variance, 3),
        "speaking_rate_variance_across_thirds": round(rate_spread, 1),
        "num_words": word_count,
        "total_duration_seconds": round(clip_seconds, 1),
    }


def analyze(audio_path: str) -> tuple[str, str, str]:
    """Transcribe a clip and report its prosodic features.

    Never raises: every failure is turned into a message in the first
    element of the returned tuple, with the other two elements empty.

    Args:
        audio_path: Path to the audio file; a falsy value means no clip
            was provided.

    Returns:
        A ``(transcript, feature_text, feature_json)`` tuple on success, or
        ``(message, "", "")`` when there is no input, no timed words came
        back, the clip is rejected by feature extraction, or any step fails.
    """
    if not audio_path:
        return "Please upload or record an audio clip.", "", ""

    # Transcription and parsing failures are plain errors. They are kept out
    # of the feature-extraction try below so a ValueError raised here (for
    # example a JSON decode failure) is not reported as a short-clip warning.
    try:
        api_response = transcribe_with_word_timestamps(audio_path)
        words = extract_words_with_times(api_response)
    except Exception as e:  # top-level UI boundary: show the message, don't crash
        return f"Error: {e}", "", ""

    if not words:
        return (
            "Whisper returned no word-level timestamps. Try a longer clip or "
            "check that the audio is a recognizable language.",
            "",
            "",
        )

    try:
        features = compute_prosodic_features(words)
    except ValueError as e:
        # ValueError is how feature extraction rejects an unusable clip.
        return f"Short-clip warning: {e}", "", ""
    except Exception as e:  # top-level UI boundary
        return f"Error: {e}", "", ""

    transcript = " ".join(w["word"] for w in words)
    feature_lines = [
        f"Speaking rate (wpm):                {features['wpm_overall']}",
        f"Pauses longer than 400 ms:          {features['pause_count_over_400ms']}",
        f"Pause-duration variance:            {features['pause_duration_variance']}",
        f"Speaking-rate variance (thirds):    {features['speaking_rate_variance_across_thirds']}",
        f"Words transcribed:                  {features['num_words']}",
        f"Clip length (s):                    {features['total_duration_seconds']}",
    ]
    return transcript, "\n".join(feature_lines), json.dumps(features, indent=2)


# Gradio UI: one audio input wired to analyze(), whose three return values
# fill the three output components below in order.
demo = gr.Interface(
    fn=analyze,
    inputs=gr.Audio(
        sources=["upload", "microphone"],
        # analyze() takes a path string, so ask for the clip as a file path.
        type="filepath",
        label="Debate or speech clip (10 seconds to 4 minutes)",
    ),
    outputs=[
        # 1st return value: the transcript, or a status/error message.
        gr.Textbox(label="Transcript (Whisper-small)", lines=6),
        # 2nd return value: the human-readable feature table.
        gr.Textbox(label="Prosodic features", lines=8),
        # 3rd return value: the same features serialized as JSON.
        gr.Code(label="Raw feature JSON", language="json"),
    ],
    title="Delivery Analyzer — Space 2",
    description=(
        "This is Prea's Space 2 — the thin-client delivery analyzer. "
        "Uploads audio to the Hugging Face Inference API for Whisper-small "
        "transcription with word-level timestamps, then computes four prosodic "
        "features in pure Python: words per minute, pause count above 400 ms, "
        "pause-duration variance, and speaking-rate variance across thirds of "
        "the clip. No local model weights are loaded in this Space, so it "
        "boots in seconds on free-tier CPU. See research-journal.md, Weeks "
        "7–8, for the architectural pivot that led to this design."
    ),
    allow_flagging="never",
    theme=gr.themes.Soft(),
)

# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()