Upload 3 files

- README.md +24 -14
- app.py +200 -0
- requirements.txt +6 -0
README.md
CHANGED

@@ -1,14 +1,24 @@
----
-title:
-emoji:
-colorFrom:
-colorTo:
-sdk: gradio
-sdk_version:
-app_file: app.py
-pinned: false
-
-
----
-
-
+---
+title: veureu-asr
+emoji: 🗣️
+colorFrom: pink
+colorTo: pink
+sdk: gradio
+sdk_version: "4.44.1"
+app_file: app.py
+pinned: false
+---
+
+# 🗣️ veureu-asr (Aina faster-whisper · Català · ZeroGPU)
+
+Catalan speech recognition based on **faster-whisper** (CTranslate2) with the **projecte-aina** model.
+
+## Endpoints (Gradio)
+- **`/api/predict`** — input: `[ <audio_file>, "ca", true, true ]` → output:
+```json
+{
+  "text": "…",
+  "segments": [{"start": 0.1, "end": 1.9, "text": "…"}],
+  "language": "ca",
+  "info": {"duration": 12.3, "device": "cuda", "compute_type": "float16"}
+}
+```
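For reference, a minimal client-side sketch of that endpoint using `gradio_client` (the Space id `veureu/asr` and `sample.wav` are assumed placeholders; adjust to your deployment):

```python
# Sketch: calling the /predict endpoint with gradio_client.
# "veureu/asr" and "sample.wav" are placeholders, not confirmed values.
from gradio_client import Client, handle_file

client = Client("veureu/asr")
result = client.predict(
    handle_file("sample.wav"),  # audio_file
    "ca",                       # language
    True,                       # timestamps
    True,                       # vad_filter
    api_name="/predict",
)
print(result["text"])
for seg in result["segments"]:
    print(f"[{seg['start']} -> {seg['end']}] {seg['text']}")
```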
app.py
ADDED

@@ -0,0 +1,200 @@
+# app.py — veureu/asr (Aina faster-whisper Catalan · ZeroGPU) — ENGINE-compatible
+from __future__ import annotations
+import os
+from typing import Dict, Any, List, Optional
+
+import gradio as gr
+import spaces
+
+# faster-whisper (CTranslate2)
+from faster_whisper import WhisperModel
+
+# =========================
+# Config and lazy loading
+# =========================
+# By default we use the projecte-aina Catalan fine-tune on HF.
+# Change MODEL_ID to the exact repo you use (e.g. "projecte-aina/faster-whisper-large-v3-ca-3catparla").
+MODEL_ID = os.environ.get("MODEL_ID", "projecte-aina/faster-whisper-large-v3-ca-3catparla")
+
+# Detect whether a GPU is available (ZeroGPU) -> fp16; otherwise INT8.
+HAS_CUDA = os.environ.get("CUDA_VISIBLE_DEVICES") not in (None, "", "-1")
+DEVICE = "cuda" if HAS_CUDA else "cpu"
+COMPUTE_TYPE = "float16" if HAS_CUDA else "int8"  # "int8_float16" also works on low-memory GPUs
+
+_model: Optional[WhisperModel] = None
+
+def _lazy_model() -> WhisperModel:
+    global _model
+    if _model is None:
+        _model = WhisperModel(
+            MODEL_ID,
+            device=DEVICE,
+            compute_type=COMPUTE_TYPE,
+            download_root=os.environ.get("HF_HOME") or None,  # optional
+        )
+    return _model
+
+# ==================================
+# Transcription core (Catalan)
+# ==================================
+def _transcribe_core(
+    audio_path: str,
+    language: str = "ca",
+    task: str = "transcribe",
+    vad_filter: bool = True,
+    beam_size: int = 5,
+    temperature: float = 0.0,
+    word_timestamps: bool = False,
+) -> Dict[str, Any]:
+    """
+    Returns:
+    {
+      "text": "transcripció…",
+      "segments": [
+        {"start": 0.10, "end": 1.92, "text": "…"},
+        ...
+      ],
+      "language": "ca",
+      "info": { "duration": ..., "device": "cuda/cpu", "compute_type": "float16/int8" }
+    }
+    """
+    model = _lazy_model()
+
+    # faster-whisper yields a generator of segments plus an info object
+    segments, info = model.transcribe(
+        audio_path,
+        language=language or "ca",
+        task=task,
+        vad_filter=vad_filter,
+        beam_size=int(beam_size),
+        temperature=float(temperature),
+        word_timestamps=bool(word_timestamps),
+    )
+
+    segs: List[Dict[str, Any]] = []
+    full_text_parts: List[str] = []
+    for seg in segments:
+        text = (seg.text or "").strip()
+        full_text_parts.append(text)
+        segs.append({
+            "start": round(float(seg.start), 3) if seg.start is not None else None,
+            "end": round(float(seg.end), 3) if seg.end is not None else None,
+            "text": text,
+        })
+
+    out = {
+        "text": " ".join([t for t in full_text_parts if t]),
+        "segments": segs,
+        "language": language or "ca",
+        "info": {
+            "duration": getattr(info, "duration", None),
+            "device": DEVICE,
+            "compute_type": COMPUTE_TYPE,
+        },
+    }
+    return out
+
+# ==========================
+# Gradio endpoints (API/UI)
+# ==========================
+
+# 1) /predict — the endpoint the ENGINE uses via gradio_client.
+# Minimal signature: just the audio; everything else uses defaults.
+@spaces.GPU  # ZeroGPU: attach a GPU for the duration of this call
+def predict_for_engine(
+    audio_file,  # gr.Audio or gr.File
+    language: str = "ca",
+    timestamps: bool = True,
+    vad_filter: bool = True,
+) -> Dict[str, Any]:
+    """
+    The ENGINE normally calls: client.predict(<audio_path>, api_name="/predict")
+    We return a dict with 'text' and 'segments'.
+    """
+    # Gradio may pass a dict ({'path': …} in Gradio 4, {'name': …} in older
+    # versions), a plain path string, or a file-like object.
+    path = None
+    if isinstance(audio_file, dict):
+        path = audio_file.get("path") or audio_file.get("name")
+    elif isinstance(audio_file, str):
+        path = audio_file
+    elif hasattr(audio_file, "name"):
+        path = audio_file.name
+
+    if not path:
+        return {"text": "", "segments": [], "language": language, "info": {"error": "no_audio"}}
+
+    return _transcribe_core(
+        path,
+        language=language or "ca",
+        task="transcribe",
+        vad_filter=bool(vad_filter),
+        beam_size=5,
+        temperature=0.0,
+        word_timestamps=bool(timestamps),
+    )
+
+# 2) /transcribe — alternative endpoint with more controls (useful for manual/HTTP testing)
+@spaces.GPU
+def transcribe_advanced(
+    audio_file,
+    language: str = "ca",
+    task: str = "transcribe",  # "transcribe" | "translate"
+    vad_filter: bool = True,
+    beam_size: int = 5,
+    temperature: float = 0.0,
+    word_timestamps: bool = False,
+) -> Dict[str, Any]:
+    path = None
+    if isinstance(audio_file, dict):
+        path = audio_file.get("path") or audio_file.get("name")
+    elif isinstance(audio_file, str):
+        path = audio_file
+    elif hasattr(audio_file, "name"):
+        path = audio_file.name
+    if not path:
+        return {"text": "", "segments": [], "language": language, "info": {"error": "no_audio"}}
+
+    return _transcribe_core(
+        path,
+        language=language or "ca",
+        task=task or "transcribe",
+        vad_filter=bool(vad_filter),
+        beam_size=int(beam_size),
+        temperature=float(temperature),
+        word_timestamps=bool(word_timestamps),
+    )
+
+# =================
+# Demo UI
+# =================
+with gr.Blocks(title="Aina faster-whisper (Català) · ZeroGPU") as demo:
+    gr.Markdown("## Aina faster-whisper (Català) · ZeroGPU\nCatalan speech recognition with the projecte-aina fine-tune.")
+
+    with gr.Row():
+        with gr.Column():
+            inp = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio (WAV/MP3/MP4, etc.)")
+            lang = gr.Textbox(label="language", value="ca")
+            ts = gr.Checkbox(label="timestamps", value=True)
+            vad = gr.Checkbox(label="VAD filter", value=True)
+            btn = gr.Button("Transcribe (ENGINE /predict)", variant="primary")
+        with gr.Column():
+            out = gr.JSON(label="/predict output")
+
+    btn.click(predict_for_engine, [inp, lang, ts, vad], out, api_name="predict")
+
+    # Advanced section
+    gr.Markdown("---\n### Advanced (/transcribe)")
+    with gr.Row():
+        with gr.Column():
+            inp2 = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio")
+            lang2 = gr.Textbox(label="language", value="ca")
+            task2 = gr.Dropdown(["transcribe", "translate"], value="transcribe", label="task")
+            vad2 = gr.Checkbox(label="VAD filter", value=True)
+            beam2 = gr.Slider(1, 10, value=5, step=1, label="beam_size")
+            temp2 = gr.Slider(0.0, 1.5, value=0.0, step=0.1, label="temperature")
+            wts2 = gr.Checkbox(label="word_timestamps", value=False)
+            btn2 = gr.Button("Transcribe (advanced)")
+        with gr.Column():
+            out2 = gr.JSON(label="/transcribe output")
+
+    btn2.click(transcribe_advanced, [inp2, lang2, task2, vad2, beam2, temp2, wts2], out2, api_name="transcribe")
+
+# Gradio 4 removed queue(concurrency_count=…); default_concurrency_limit replaces it.
+demo.queue(default_concurrency_limit=1, max_size=8).launch()
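Since the Space defers all model loading to `_lazy_model()`, the model choice itself can be smoke-tested locally with faster-whisper alone before deploying. A minimal sketch, assuming a local `sample.wav` (placeholder) and CPU/int8 so it runs without a GPU:

```python
# Sketch: local CPU smoke test of the Catalan model, independent of Gradio.
# "sample.wav" is a placeholder audio file.
from faster_whisper import WhisperModel

model = WhisperModel(
    "projecte-aina/faster-whisper-large-v3-ca-3catparla",
    device="cpu",
    compute_type="int8",  # same fallback the Space uses without CUDA
)
segments, info = model.transcribe("sample.wav", language="ca", vad_filter=True)
print(f"duration: {info.duration:.1f}s")
for seg in segments:
    print(f"[{seg.start:.2f} -> {seg.end:.2f}] {seg.text.strip()}")
```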
requirements.txt
ADDED

@@ -0,0 +1,6 @@
+gradio>=4.44.1
+spaces>=0.25.0
+faster-whisper>=1.0
+ctranslate2>=4.3
+numpy<2.0        # general stability with audio libraries
+soundfile>=0.12  # reads WAV/OGG/FLAC, etc.