import json
import os
import time
from pathlib import Path

import gradio as gr
import requests


SERVER_URL = os.environ.get("CRISPASR_SERVER_URL", "http://127.0.0.1:8080").rstrip("/")
SPACE_TITLE = os.environ.get("CRISPASR_SPACE_TITLE", "CrispASR")
DEFAULT_LANGUAGE = os.environ.get("CRISPASR_LANGUAGE", "en")
DEFAULT_MODEL = os.environ.get("CRISPASR_MODEL", "auto")
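# Pick the first non-empty entry from the comma-separated CRISPASR_API_KEYS list, if any.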
API_KEY = next((key.strip() for key in os.environ.get("CRISPASR_API_KEYS", "").split(",") if key.strip()), "")

# Display label -> (backend, model identifier, default language), passed to the server's /load endpoint.
MODEL_CHOICES = {
    "Whisper base multilingual (~147 MB)": ("whisper", "auto", "en"),
    "Parakeet TDT 0.6B v3 Q4_K (~467 MB)": ("parakeet", "auto", "en"),
    "Qwen3 ASR 0.6B Q4_K (~500 MB)": ("qwen3", "auto", "en"),
    "Cohere Transcribe Q4_K (~550 MB)": ("cohere", "auto", "en"),
}


def log(message: str):
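    """Print a timestamped, flushed log line so messages show up promptly in the Space logs."""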
    print(f"[{time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}] hf-space-app: {message}", flush=True)


def _request(method: str, path: str, **kwargs):
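    """Issue an HTTP request against the CrispASR server, adding bearer auth when an API key is set.

    The generous 300-second timeout leaves room for long transcription uploads.
    """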
    if API_KEY:
        headers = dict(kwargs.pop("headers", {}) or {})
        headers.setdefault("Authorization", f"Bearer {API_KEY}")
        kwargs["headers"] = headers
    return requests.request(method, f"{SERVER_URL}{path}", timeout=300, **kwargs)


def fetch_status():
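    """Probe /health and /v1/models; return (status, health payload, model list) for the UI."""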
    try:
        log("fetch_status: probing /health and /v1/models")
        health = _request("GET", "/health")
        health.raise_for_status()
        models = _request("GET", "/v1/models")
        models.raise_for_status()
        health_json = health.json()
        models_json = models.json()
        model_names = [item.get("id", "") for item in models_json.get("data", [])]
        log(f"fetch_status: ready models={model_names if model_names else ['(none)']}")
        return (
            "ready",
            json.dumps(health_json, indent=2, ensure_ascii=False),
            "\n".join(model_names) if model_names else "(no models reported)",
        )
    except Exception as exc:
        log(f"fetch_status: waiting error={type(exc).__name__}: {exc}")
        return "starting", f"{type(exc).__name__}: {exc}", DEFAULT_MODEL


def wait_for_server():
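    """Poll the server once per second for up to 300 attempts; return the last observed status."""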
    log("wait_for_server: start")
    last_status = "starting"
    last_health = ""
    last_models = DEFAULT_MODEL
    for i in range(300):
        last_status, last_health, last_models = fetch_status()
        if last_status == "ready":
            log(f"wait_for_server: ready after {i + 1} probe(s)")
            break
        time.sleep(1)
    if last_status != "ready":
        log("wait_for_server: timeout, app staying up in starting state")
    return last_status, last_health, last_models


def transcribe(audio_path: str, language: str, prompt: str, temperature: float, response_format: str):
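    """POST the audio file to /v1/audio/transcriptions; return (transcript text, raw response)."""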
    if not audio_path:
        raise gr.Error("Upload or record audio first.")

    file_path = Path(audio_path)
    if not file_path.exists():
        raise gr.Error("Audio file is not available anymore.")

    log(
        f"transcribe: file={file_path.name} language={language or 'default'} "
        f"response_format={response_format} temperature={temperature:.2f} prompt={'yes' if prompt else 'no'}"
    )

    data = {
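        # "loaded-model" is a placeholder: the server is expected to transcribe with
        # whichever model is currently loaded; the field is kept for OpenAI API shape.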
        "model": "loaded-model",
        "response_format": response_format,
        "temperature": f"{temperature:.2f}",
    }

    if language and language != "auto":
        data["language"] = language
    if prompt:
        data["prompt"] = prompt
    with file_path.open("rb") as f:
        response = _request(
            "POST",
            "/v1/audio/transcriptions",
            files={"file": (file_path.name, f, "application/octet-stream")},
            data=data,
        )

    if response.status_code >= 400:
        log(f"transcribe: error status={response.status_code} body={response.text[:400]}")
        raise gr.Error(f"{response.status_code}: {response.text}")

    content_type = response.headers.get("content-type", "")
    log(f"transcribe: ok status={response.status_code} content_type={content_type}")
    if response_format == "verbose_json" or "application/json" in content_type:
        payload = response.json()
        text = payload.get("text", "") if isinstance(payload, dict) else ""
        log(f"transcribe: json text_len={len(text)}")
        return text, json.dumps(payload, indent=2, ensure_ascii=False)

    text = response.text.strip()
    log(f"transcribe: text text_len={len(text)}")
    return text, text


def load_model(choice: str, language: str):
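    """POST the chosen backend/model/language to /load, then refresh the server status."""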
    backend, model, default_language = MODEL_CHOICES.get(choice, MODEL_CHOICES["Whisper base multilingual (~147 MB)"])
    language = language or default_language
    log(f"load_model: choice={choice} backend={backend} model={model} language={language}")
    response = _request(
        "POST",
        "/load",
        files={
            "backend": (None, backend),
            "model": (None, model),
            "language": (None, language),
        },
    )
    if response.status_code >= 400:
        log(f"load_model: error status={response.status_code} body={response.text[:400]}")
        raise gr.Error(f"{response.status_code}: {response.text}")
    status, health, models = fetch_status()
    log(f"load_model: ok backend={backend}")
    return status, health, models, language


with gr.Blocks(title=SPACE_TITLE) as demo:
    gr.Markdown(
        f"""# {SPACE_TITLE}

Offline speech transcription via CrispASR's OpenAI-compatible server.

- Server URL: `{SERVER_URL}`
- Default model: `{DEFAULT_MODEL}`
"""
    )

    with gr.Row():
        status = gr.Textbox(label="Server status", interactive=False)
        models = gr.Textbox(label="Loaded model(s)", interactive=False)
    health = gr.Code(label="/health", language="json", interactive=False)
    refresh = gr.Button("Refresh server status")

    with gr.Row():
        model_choice = gr.Dropdown(list(MODEL_CHOICES.keys()), value="Whisper base multilingual (~147 MB)", label="Model")
        load = gr.Button("Load selected model")

    with gr.Row():
        audio = gr.Audio(label="Audio", type="filepath", sources=["upload", "microphone"])
        with gr.Column():
            language = gr.Textbox(value=DEFAULT_LANGUAGE, label="Language", placeholder="auto or ISO-639-1 code")
            response_format = gr.Dropdown(
                ["text", "verbose_json"], value="verbose_json", label="Response format"
            )
            temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Temperature")
            prompt = gr.Textbox(label="Prompt", placeholder="Optional prompt or context")
            submit = gr.Button("Transcribe", variant="primary")

    transcript = gr.Textbox(label="Transcript", lines=12)
    raw = gr.Code(label="Raw response", language="json")

    refresh.click(fetch_status, outputs=[status, health, models])
    load.click(load_model, inputs=[model_choice, language], outputs=[status, health, models, language])
    submit.click(
        transcribe,
        inputs=[audio, language, prompt, temperature, response_format],
        outputs=[transcript, raw],
    )
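    # Populate the status widgets on first page load, polling until the server is ready.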
    demo.load(wait_for_server, outputs=[status, health, models])


if __name__ == "__main__":
    log(f"launch: server_url={SERVER_URL} default_model={DEFAULT_MODEL} default_language={DEFAULT_LANGUAGE}")
    demo.launch(
        server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
        server_port=int(os.environ.get("GRADIO_SERVER_PORT", "7860")),
    )