VeuReu committed on
Commit
df60057
·
verified ·
1 Parent(s): 5513e38

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +24 -14
  2. app.py +200 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -1,14 +1,24 @@
1
- ---
2
- title: Asr
3
- emoji: 🐨
4
- colorFrom: indigo
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 5.49.1
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- short_description: Automatic Speech Recognition using Whsiper-Cat
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: veureu-asr
3
+ emoji: 🗣️
4
+ colorFrom: pink
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: "4.44.1"
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ # 🗣️ veureu-asr (Aina faster-whisper · Català · ZeroGPU)
13
+
14
+ Reconocimiento de voz en catalán basado en **faster-whisper** (CTranslate2) con el modelo de **projecte-aina**.
15
+
16
+ ## Endpoints (Gradio)
17
+ - **`/api/predict`** — entrada: `[ <audio_file>, "ca", true, true ]` → salida:
18
+ ```json
19
+ {
20
+ "text": "…",
21
+ "segments": [{"start": 0.1, "end": 1.9, "text": "…"}],
22
+ "language": "ca",
23
+ "info": {"duration": 12.3, "device": "cuda", "compute_type": "float16"}
24
+ }
app.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py — veureu/asr (Aina faster-whisper Catalan · ZeroGPU) — compatible con ENGINE
2
+ from __future__ import annotations
3
+ import os, json, tempfile
4
+ from typing import Dict, Any, List, Tuple, Optional
5
+
6
+ import gradio as gr
7
+ import spaces
8
+
9
+ # faster-whisper (CTranslate2)
10
+ from faster_whisper import WhisperModel
11
+
12
# ===========================
# Configuration and lazy load
# ===========================
# Defaults to the Catalan fine-tune from projecte-aina on the Hugging Face Hub.
# Set the MODEL_ID environment variable to the exact repo you use
# (e.g. "projecte-aina/faster-whisper-large-v3-ca-3catparla").
MODEL_ID = os.environ.get(
    "MODEL_ID",
    "projecte-aina/faster-whisper-large-v3-ca-3catparla",
)

# GPU detection is based only on CUDA_VISIBLE_DEVICES: unset, empty or "-1"
# means CPU. NOTE(review): a GPU host that leaves the variable unset is
# therefore treated as CPU — confirm this matches the deployment target.
HAS_CUDA = os.environ.get("CUDA_VISIBLE_DEVICES") not in {None, "", "-1"}

# float16 on GPU, int8 on CPU ("int8_float16" is also an option on low-end GPUs).
DEVICE, COMPUTE_TYPE = ("cuda", "float16") if HAS_CUDA else ("cpu", "int8")

# Process-wide model instance; stays None until the model is first requested.
_model: Optional[WhisperModel] = None
25
+
26
def _lazy_model() -> WhisperModel:
    """Return the shared WhisperModel, building it on the first call.

    The instance is stored in the module-level ``_model`` so later calls
    reuse it instead of constructing the model again.
    """
    global _model
    if _model is not None:
        return _model

    # An unset or empty HF_HOME is passed through as None (optional setting).
    cache_dir = os.environ.get("HF_HOME") or None
    _model = WhisperModel(
        MODEL_ID,
        device=DEVICE,
        compute_type=COMPUTE_TYPE,
        download_root=cache_dir,
    )
    return _model
36
+
37
+ # ==================================
38
+ # Núcleo de transcripción (Catalán)
39
+ # ==================================
40
def _transcribe_core(
    audio_path: str,
    language: str = "ca",
    task: str = "transcribe",
    vad_filter: bool = True,
    beam_size: int = 5,
    temperature: float = 0.0,
    word_timestamps: bool = False,
) -> Dict[str, Any]:
    """Transcribe ``audio_path`` and return a JSON-serialisable result.

    Args:
        audio_path: Path of the audio file handed to the model.
        language: Language code; an empty value falls back to ``"ca"``.
        task: Passed to the model as-is (``"transcribe"`` or ``"translate"``).
        vad_filter: Passed to the model as-is.
        beam_size: Coerced to ``int`` before the call.
        temperature: Coerced to ``float`` before the call.
        word_timestamps: When true, word timings are requested from the model
            and each segment gains a ``"words"`` list.

    Returns:
        A dict shaped like::

            {
              "text": "transcripció…",
              "segments": [
                {"start": 0.10, "end": 1.92, "text": "…"},
                ...
              ],
              "language": "ca",
              "info": {"duration": ..., "device": "cuda/cpu",
                       "compute_type": "float16/int8"}
            }

        With ``word_timestamps`` enabled every segment additionally carries
        ``"words": [{"start": ..., "end": ..., "word": "…"}, ...]``.
    """

    def _stamp(value: Any) -> Optional[float]:
        # Timestamps are rounded to milliseconds; missing values stay None.
        return None if value is None else round(float(value), 3)

    lang = language or "ca"
    want_words = bool(word_timestamps)

    # faster-whisper yields a generator of segments plus an info object.
    segments, info = _lazy_model().transcribe(
        audio_path,
        language=lang,
        task=task,
        vad_filter=vad_filter,
        beam_size=int(beam_size),
        temperature=float(temperature),
        word_timestamps=want_words,
    )

    segs: List[Dict[str, Any]] = []
    for seg in segments:
        entry: Dict[str, Any] = {
            "start": _stamp(seg.start),
            "end": _stamp(seg.end),
            "text": (seg.text or "").strip(),
        }
        if want_words:
            # Previously the word timings were computed and then dropped;
            # expose them so the option has an observable effect.
            entry["words"] = [
                {
                    "start": _stamp(word.start),
                    "end": _stamp(word.end),
                    "word": (word.word or "").strip(),
                }
                for word in (getattr(seg, "words", None) or [])
            ]
        segs.append(entry)

    return {
        # Empty segments are skipped so the joined text has no double spaces.
        "text": " ".join(s["text"] for s in segs if s["text"]),
        "segments": segs,
        "language": lang,
        "info": {
            "duration": getattr(info, "duration", None),
            "device": DEVICE,
            "compute_type": COMPUTE_TYPE,
        },
    }
96
+
97
+ # ==========================
98
+ # Endpoints Gradio (API/UI)
99
+ # ==========================
100
+
101
# NOTE(review): `spaces` is imported at the top of the file, but neither
# endpoint below is wrapped with `spaces.GPU` — confirm whether the target
# hardware (ZeroGPU) requires that decorator.


def _resolve_audio_path(audio_file: Any) -> Optional[str]:
    """Return the file path carried by a Gradio audio payload, or None."""
    if isinstance(audio_file, str):
        return audio_file or None
    if isinstance(audio_file, dict):
        # Gradio 4 file payloads use "path"; older payloads used "name".
        return audio_file.get("path") or audio_file.get("name") or None
    # File-like objects (e.g. temp-file wrappers) expose the path as `.name`.
    return getattr(audio_file, "name", None) or None


def _no_audio_result(language: Optional[str]) -> Dict[str, Any]:
    """Build the response returned when no usable audio path was supplied."""
    return {
        "text": "",
        "segments": [],
        # Same "ca" fallback as the success path, so both agree.
        "language": language or "ca",
        "info": {"error": "no_audio"},
    }


# 1) /predict — the endpoint the ENGINE uses through gradio_client.
#    Minimal signature: only the audio is required; everything else defaults.
def predict_for_engine(
    audio_file,  # gr.Audio or gr.File payload
    language: str = "ca",
    timestamps: bool = True,
    vad_filter: bool = True,
) -> Dict[str, Any]:
    """Transcribe an upload with fixed decoding settings.

    The ENGINE normally calls ``client.predict(<audio_path>, api_name="/predict")``.

    Args:
        audio_file: A path string, a dict payload, or an object with ``.name``.
        language: Language code; empty falls back to ``"ca"``.
        timestamps: Forwarded to the core as ``word_timestamps``.
        vad_filter: Forwarded to the core.

    Returns:
        The dict produced by ``_transcribe_core`` (with ``"text"`` and
        ``"segments"``), or a ``{"info": {"error": "no_audio"}}`` response
        when no path can be resolved.
    """
    path = _resolve_audio_path(audio_file)
    if not path:
        return _no_audio_result(language)

    return _transcribe_core(
        path,
        language=language or "ca",
        task="transcribe",
        vad_filter=bool(vad_filter),
        beam_size=5,
        temperature=0.0,
        word_timestamps=bool(timestamps),
    )


# 2) /transcribe — alternative endpoint with more controls (handy for manual/HTTP tests).
def transcribe_advanced(
    audio_file,
    language: str = "ca",
    task: str = "transcribe",  # "transcribe" | "translate"
    vad_filter: bool = True,
    beam_size: int = 5,
    temperature: float = 0.0,
    word_timestamps: bool = False,
) -> Dict[str, Any]:
    """Transcribe an upload, exposing the decoding parameters to the caller.

    Accepts the same ``audio_file`` shapes as ``predict_for_engine`` and
    returns the same result structure; ``task``, ``beam_size``,
    ``temperature`` and ``word_timestamps`` are coerced and forwarded to
    ``_transcribe_core``.
    """
    path = _resolve_audio_path(audio_file)
    if not path:
        return _no_audio_result(language)

    return _transcribe_core(
        path,
        language=language or "ca",
        task=task or "transcribe",
        vad_filter=bool(vad_filter),
        beam_size=int(beam_size),
        temperature=float(temperature),
        word_timestamps=bool(word_timestamps),
    )
164
+
165
+ # =================
166
+ # UI de demostración
167
+ # =================
168
# Two-section demo page: the upper form drives /predict (the ENGINE endpoint),
# the lower form drives /transcribe with the extra decoding controls.
with gr.Blocks(title="Aina faster-whisper (Català) · ZeroGPU") as demo:
    gr.Markdown("## Aina faster-whisper (Català) · ZeroGPU\nReconocimiento de voz en catalán finetune projecte-aina.")

    with gr.Row():
        with gr.Column():
            # type="filepath" makes Gradio pass the handler a path string.
            inp = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio (WAV/MP3/MP4, etc.)")
            lang = gr.Textbox(label="language", value="ca")
            ts = gr.Checkbox(label="timestamps", value=True)
            vad = gr.Checkbox(label="VAD filter", value=True)
            btn = gr.Button("Transcribir (ENGINE /predict)", variant="primary")
        with gr.Column():
            out = gr.JSON(label="Salida /predict")

    btn.click(predict_for_engine, [inp, lang, ts, vad], out, api_name="predict")

    # Advanced section
    gr.Markdown("---\n### Avanzado (/transcribe)")
    with gr.Row():
        with gr.Column():
            inp2 = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio")
            lang2 = gr.Textbox(label="language", value="ca")
            task2 = gr.Dropdown(["transcribe", "translate"], value="transcribe", label="task")
            vad2 = gr.Checkbox(label="VAD filter", value=True)
            beam2 = gr.Slider(1, 10, value=5, step=1, label="beam_size")
            temp2 = gr.Slider(0.0, 1.5, value=0.0, step=0.1, label="temperature")
            wts2 = gr.Checkbox(label="word_timestamps", value=False)
            btn2 = gr.Button("Transcribir (avanzado)")
        with gr.Column():
            out2 = gr.JSON(label="Salida /transcribe")

    btn2.click(transcribe_advanced, [inp2, lang2, task2, vad2, beam2, temp2, wts2], out2, api_name="transcribe")

# `concurrency_count` is rejected by Gradio 4.x and absent from 5.x;
# `default_concurrency_limit` is the replacement and keeps one job at a time.
demo.queue(default_concurrency_limit=1, max_size=8).launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio>=4.44.1
2
+ spaces>=0.25.0
3
+ faster-whisper>=1.0
4
+ ctranslate2>=4.3
5
+ numpy<2.0 # estabilidad general con libs de audio
6
+ soundfile>=0.12 # lectura de WAV/OGG/FLAC, etc.