Spaces:

profplate
/

space2-delivery-analyzer

Paused

File size: 7,451 Bytes

7e7994f

"""
Space 2 — Delivery Analyzer.

Thin-client architecture (the free-tools version of the Mistral Voxtral
pipeline pattern described in research-journal.md, Weeks 5–7):

    audio upload
        -> Hugging Face Inference API: openai/whisper-small
           (return_timestamps='word' for word-level start/end times)
        -> pure-Python prosodic feature extraction
        -> Gradio output

No local model loading. The Space boots in seconds on free-tier CPU because
it doesn't hold any weights in memory — Whisper runs on Hugging Face's
servers via the Inference API and this Space just formats the request and
reads the response.

Requires a Hugging Face token in the HF_TOKEN Space secret (read access is
enough; the free Inference API tier is rate-limited but sufficient for
demo use).

See research-journal.md, Week 8, for the first real numbers table and
Week 10 for the end-to-end evaluation.
"""

import json
import os
import statistics
from typing import Any

import gradio as gr
import requests

# Hugging Face API token read from the environment; empty string when unset
# (transcribe_with_word_timestamps refuses to run without it).
HF_TOKEN = os.environ.get("HF_TOKEN", "")
# Inference API endpoint for the openai/whisper-small model.
WHISPER_URL = "https://api-inference.huggingface.co/models/openai/whisper-small"
# A gap between consecutive words counts as a pause only when it is strictly
# longer than this many seconds.
PAUSE_THRESHOLD_SECONDS = 0.4
# compute_prosodic_features raises ValueError below this word count.
MIN_WORDS_FOR_RELIABLE_FEATURES = 20


def transcribe_with_word_timestamps(audio_path: str) -> dict[str, Any]:
    """Send an audio file to the HF Inference API and ask for word-level timestamps.

    Args:
        audio_path: Path of the audio file to upload. The whole file is read
            into memory and sent as the raw request body.

    Returns:
        The JSON object decoded from the API response.

    Raises:
        RuntimeError: If HF_TOKEN is unset, the API answers with a non-200
            status, or a 200 response does not carry a JSON object.
        OSError: If the audio file cannot be opened or read. Network failures
            raised by ``requests.post`` also propagate unchanged.
    """
    if not HF_TOKEN:
        raise RuntimeError(
            "HF_TOKEN is not set. Add it as a Space secret "
            "(Settings -> Variables and secrets -> New secret)."
        )
    with open(audio_path, "rb") as f:
        data = f.read()
    # NOTE(review): the Content-Type is always audio/wav, even when the
    # uploaded file is another format — confirm the API sniffs the real format.
    headers = {
        "Authorization": f"Bearer {HF_TOKEN}",
        "Content-Type": "audio/wav",
    }
    # NOTE(review): the option is sent as a URL query parameter — confirm the
    # endpoint honors it there rather than only in a JSON "parameters" payload.
    params = {"return_timestamps": "word"}
    response = requests.post(
        WHISPER_URL,
        headers=headers,
        params=params,
        data=data,
        timeout=120,
    )
    if response.status_code != 200:
        raise RuntimeError(
            f"Inference API error {response.status_code}: {response.text[:500]}"
        )
    # A JSON decoding failure is a ValueError subclass; convert it so callers
    # that treat ValueError as a "clip too short" signal are not misled.
    try:
        payload = response.json()
    except ValueError as err:
        raise RuntimeError(
            f"Inference API returned a non-JSON body: {response.text[:500]}"
        ) from err
    # Downstream code calls .get() on the result, so anything but a JSON
    # object is reported here with a clear message instead of failing later.
    if not isinstance(payload, dict):
        raise RuntimeError(
            "Inference API returned an unexpected payload of type "
            f"{type(payload).__name__}; expected a JSON object."
        )
    return payload


def extract_words_with_times(api_response: dict[str, Any]) -> list[dict[str, Any]]:
    """Normalize the Whisper API response into a list of {word, start, end} dicts.

    Entries are read from the ``"chunks"`` key, falling back to ``"words"``.
    Each entry may carry its text under ``"text"`` or ``"word"`` and its times
    either as a two-item ``"timestamp"`` sequence or as separate ``"start"`` /
    ``"end"`` keys.

    Args:
        api_response: Decoded JSON object returned by the transcription call.

    Returns:
        One dict per usable entry with the stripped word and float start/end
        times, in the order the API returned them. Entries are skipped when
        the text is empty or whitespace-only, when either time is missing,
        or when the timestamp is malformed (wrong length or non-numeric).
    """
    chunks = api_response.get("chunks") or api_response.get("words") or []
    words: list[dict[str, Any]] = []
    for c in chunks:
        # Strip before the emptiness check so whitespace-only tokens are not
        # recorded as empty words (which would inflate the word count).
        word = (c.get("text") or c.get("word") or "").strip()
        if not word:
            continue
        ts = c.get("timestamp") or (c.get("start"), c.get("end"))
        # Anything other than a (start, end) pair cannot be unpacked safely.
        if not isinstance(ts, (list, tuple)) or len(ts) != 2:
            continue
        start, end = ts
        if start is None or end is None:
            continue
        try:
            words.append({"word": word, "start": float(start), "end": float(end)})
        except (TypeError, ValueError):
            # Non-numeric timestamp: skip this entry rather than abort the clip.
            continue
    return words


def compute_prosodic_features(words: list[dict[str, Any]]) -> dict[str, float]:
    """Derive delivery metrics from a list of timed words.

    The result reports:

    * overall words per minute, measured from the first word's start to the
      last word's end;
    * how many inter-word gaps exceed PAUSE_THRESHOLD_SECONDS;
    * the population variance of those gap durations (0.0 with fewer than two);
    * the population variance of words per minute across the first, middle,
      and last thirds of the word list (0.0 when fewer than two thirds have a
      positive duration);
    * the word count and the total duration in seconds.

    Raises:
        ValueError: If fewer than MIN_WORDS_FOR_RELIABLE_FEATURES words are
            given, or the span from first start to last end is not positive.
    """
    word_count = len(words)
    if word_count < MIN_WORDS_FOR_RELIABLE_FEATURES:
        raise ValueError(
            f"Only {word_count} words transcribed. "
            f"Need at least {MIN_WORDS_FOR_RELIABLE_FEATURES} for reliable features."
        )

    total_duration = words[-1]["end"] - words[0]["start"]
    if total_duration <= 0:
        raise ValueError("Clip has zero or negative duration after transcription.")

    # Collect every silence between neighbouring words that is long enough
    # to count as a pause.
    long_gaps = []
    for current, following in zip(words, words[1:]):
        gap = following["start"] - current["end"]
        if gap > PAUSE_THRESHOLD_SECONDS:
            long_gaps.append(gap)
    pause_variance = 0.0
    if len(long_gaps) >= 2:
        pause_variance = statistics.pvariance(long_gaps)

    # Per-third speaking rate; any remainder words land in the last third.
    rate_variance = 0.0
    per_third = word_count // 3
    if per_third >= 2:
        sections = (
            words[:per_third],
            words[per_third : 2 * per_third],
            words[2 * per_third :],
        )
        section_rates = []
        for section in sections:
            span = section[-1]["end"] - section[0]["start"]
            if span > 0:
                section_rates.append(len(section) / (span / 60.0))
        if len(section_rates) >= 2:
            rate_variance = statistics.pvariance(section_rates)

    wpm_overall = word_count / (total_duration / 60.0)

    return {
        "wpm_overall": round(wpm_overall, 1),
        "pause_count_over_400ms": len(long_gaps),
        "pause_duration_variance": round(pause_variance, 3),
        "speaking_rate_variance_across_thirds": round(rate_variance, 1),
        "num_words": word_count,
        "total_duration_seconds": round(total_duration, 1),
    }


def analyze(audio_path: str) -> tuple[str, str, str]:
    """Run the full pipeline for one clip and format the three UI outputs.

    Args:
        audio_path: Path of the uploaded or recorded audio file; falsy when
            nothing was provided.

    Returns:
        A ``(transcript, feature_text, feature_json)`` tuple. On any failure
        the first item holds a user-facing message and the other two are
        empty strings; no exception escapes this function.
    """
    if not audio_path:
        return "Please upload or record an audio clip.", "", ""

    # Transcription and parsing failures are always reported as errors.
    # Keeping them out of the ValueError handler below prevents, for example,
    # a malformed API response from being shown as a "Short-clip warning".
    try:
        api_response = transcribe_with_word_timestamps(audio_path)
        words = extract_words_with_times(api_response)
    except Exception as e:
        return f"Error: {e}", "", ""

    if not words:
        return (
            "Whisper returned no word-level timestamps. Try a longer clip or "
            "check that the audio is a recognizable language.",
            "",
            "",
        )

    # Only the feature computation signals an unusable clip via ValueError.
    try:
        features = compute_prosodic_features(words)
    except ValueError as e:
        return f"Short-clip warning: {e}", "", ""
    except Exception as e:
        return f"Error: {e}", "", ""

    transcript = " ".join(w["word"] for w in words)
    feature_lines = [
        f"Speaking rate (wpm):                {features['wpm_overall']}",
        f"Pauses longer than 400 ms:          {features['pause_count_over_400ms']}",
        f"Pause-duration variance:            {features['pause_duration_variance']}",
        f"Speaking-rate variance (thirds):    {features['speaking_rate_variance_across_thirds']}",
        f"Words transcribed:                  {features['num_words']}",
        f"Clip length (s):                    {features['total_duration_seconds']}",
    ]
    return transcript, "\n".join(feature_lines), json.dumps(features, indent=2)


# Gradio UI: one audio input (file upload or microphone, handed to analyze()
# as a file path) and three outputs matching analyze()'s return tuple —
# transcript, human-readable feature table, and the raw feature JSON.
# NOTE(review): allow_flagging is reported as deprecated in favor of
# flagging_mode in newer Gradio releases — confirm against the pinned version.
demo = gr.Interface(
    fn=analyze,
    inputs=gr.Audio(
        sources=["upload", "microphone"],
        type="filepath",
        label="Debate or speech clip (10 seconds to 4 minutes)",
    ),
    outputs=[
        gr.Textbox(label="Transcript (Whisper-small)", lines=6),
        gr.Textbox(label="Prosodic features", lines=8),
        gr.Code(label="Raw feature JSON", language="json"),
    ],
    title="Delivery Analyzer — Space 2",
    description=(
        "This is Prea's Space 2 — the thin-client delivery analyzer. "
        "Uploads audio to the Hugging Face Inference API for Whisper-small "
        "transcription with word-level timestamps, then computes four prosodic "
        "features in pure Python: words per minute, pause count above 400 ms, "
        "pause-duration variance, and speaking-rate variance across thirds of "
        "the clip. No local model weights are loaded in this Space, so it "
        "boots in seconds on free-tier CPU. See research-journal.md, Weeks "
        "7–8, for the architectural pivot that led to this design."
    ),
    allow_flagging="never",
    theme=gr.themes.Soft(),
)

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()