Update app.py
app.py CHANGED

@@ -1,4 +1,4 @@
-# app.py — veureu/asr (Aina faster-whisper Catalan · ZeroGPU) — compatible
 from __future__ import annotations
 import os, json, tempfile
 from typing import Dict, Any, List, Tuple, Optional
@@ -10,20 +10,17 @@ import torch
 # faster-whisper (CTranslate2)
 from faster_whisper import WhisperModel

-
-
-
 # =========================
-# Config
 # =========================
-#
-#
 MODEL_ID = os.environ.get("MODEL_ID", "projecte-aina/faster-whisper-large-v3-ca-3catparla")

-#
 HAS_CUDA = os.environ.get("CUDA_VISIBLE_DEVICES") not in (None, "", "-1")
 DEVICE = "cuda" if HAS_CUDA else "cpu"
-COMPUTE_TYPE = "float16" if HAS_CUDA else "int8"  # "int8_float16"

 _model: Optional[WhisperModel] = None

@@ -34,7 +31,7 @@ def _lazy_model() -> WhisperModel:
             MODEL_ID,
             device=DEVICE,
             compute_type=COMPUTE_TYPE,
-            download_root=os.environ.get("HF_HOME") or None,  #
         )
     return _model

@@ -49,11 +46,11 @@ def _lazy_load_whisper():
     global _model_whis, _processor_whis
     if _model_whis is None or _processor_whis is None:
         model_name = "projecte-aina/whisper-large-v3-ca-3catparla"
-
-        #
         _processor_whis = WhisperProcessor.from_pretrained(model_name)

-        #
         m = WhisperForConditionalGeneration.from_pretrained(
             model_name,
             low_cpu_mem_usage=True,
@@ -67,7 +64,7 @@ def _lazy_load_whisper():
     return _processor_whis, _model_whis

 # ==================================
-#
 # ==================================
 @spaces.GPU
 def _transcribe_core(
@@ -80,9 +77,9 @@ def _transcribe_core(
     word_timestamps: bool = False,
 ) -> Dict[str, Any]:
     """
-
     {
-      "text": "
       "segments": [
         {"start": 0.10, "end": 1.92, "text": "…"},
         ...
@@ -93,7 +90,7 @@ def _transcribe_core(
     """
     model = _lazy_model()

-    # faster-whisper
    segments, info = model.transcribe(
        audio_path,
        language=language or "ca",
@@ -204,33 +201,33 @@ import base64
 import soundfile as sf

 def diarize_audio(
-
     min_segment_duration: float = 0.5,
     max_segment_duration: float = 50.0,
 ) -> Tuple[List[str], List[Dict[str, Any]]]:
     """
-
-    -
-    -
-    -
     """
-    #
-    audio = AudioSegment.from_wav(
     duration = len(audio) / 1000.0

-    #
     pipeline = Pipeline.from_pretrained(
         "pyannote/speaker-diarization-3.1",
         use_auth_token=os.getenv('HF_TOKEN')
     )
-    diarization = pipeline(

     clip_buffers: List[Tuple[str, BytesIO]] = []
     segments: List[Dict[str, Any]] = []
     spk_map: Dict[str, int] = {}
     prev_end = 0.0

-    #
     for i, (turn, _, speaker) in enumerate(diarization.itertracks(yield_label=True)):
         start, end = max(0.0, float(turn.start)), min(duration, float(turn.end))

@@ -245,7 +242,7 @@ def diarize_audio(
         if seg_dur < min_segment_duration:
             continue

-        #
         if seg_dur > max_segment_duration:
             n = int(math.ceil(seg_dur / max_segment_duration))
             sub_d = seg_dur / n
@@ -276,27 +273,27 @@ def diarize_audio(
         segments.append({"start": start, "end": end, "speaker": f"SPEAKER_{spk_map[speaker]:02d}"})
         prev_end = end

-    #
     if not segments:
         buf = BytesIO()
         audio.export(buf, format="wav")
         buf.seek(0)
         return [{"name": "segment_000.wav", "data": base64.b64encode(buf.read()).decode("utf-8")}], [{"start": 0.0, "end": duration, "speaker": "SPEAKER_00"}]

-    #
     print("Clip buffers:")
     print(clip_buffers)

     gr_clips = []
     for i, (name, buf) in enumerate(clip_buffers, start=1):
         buf.seek(0)
-        #
         tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
         tmp_file.write(buf.read())
         tmp_file.close()

-        #
-        new_name = f"
         new_path = os.path.join(tempfile.gettempdir(), new_name)
         os.rename(tmp_file.name, new_path)

@@ -313,7 +310,7 @@ from typing import List
 import torchaudio
 import torch

-def voice_embedder(
     print("======================================================")
     model = SpeakerRecognition.from_hparams(
         source="pretrained_models/spkrec-ecapa-voxceleb",
@@ -321,23 +318,26 @@ def voice_embedder(wav_archivo: str) -> List[float]:
     )
     model.eval()
     print("======================================================")
-
-
-    waveform, sr = torchaudio.load(
     target_sr = 16000

     if sr != target_sr:
         waveform = T.Resample(orig_freq=sr, new_freq=target_sr)(waveform)
     if waveform.shape[0] > 1:
         waveform = waveform.mean(dim=0, keepdim=True)

-    #
-
     min_samples = int(0.2 * target_sr)
     if waveform.shape[1] < min_samples:
         pad = min_samples - waveform.shape[1]
         waveform = torch.cat([waveform, torch.zeros((1, pad))], dim=1)

     with torch.no_grad():
         emb = (
             model.encode_batch(waveform)
@@ -347,13 +347,14 @@ def voice_embedder(wav_archivo: str) -> List[float]:
             .astype(float)
         )

     emb = emb / np.linalg.norm(emb)
     print(len(emb))
     print(emb.tolist())
-    return

-def identify_speaker(
-    voice_embedding = voice_embedder(
     voice_col = json.loads(voice_col)

     identity = "Desconegut"
@@ -374,7 +375,7 @@ def identify_speaker(wav_archivo: str, voice_col: List[Dict[str, Any]]) -> Dict[

         distances_embedding = []

-        # Compute Euclidean distance between the detected
         for voice_base_datos in voice_col:
             voice_base_datos_embedding = np.array(voice_base_datos["embedding"])
             distance = np.linalg.norm(voice_embedding - voice_base_datos_embedding)
@@ -387,18 +388,14 @@ def identify_speaker(wav_archivo: str, voice_col: List[Dict[str, Any]]) -> Dict[
         distances_embedding = sorted(distances_embedding, key=lambda x: x["distance"])
         knn = distances_embedding[:n_results]

-        # Assign identity if closest match
-
-        identity = knn[0]["identity"]
-        else:
-        identity = "Desconegut"'''
-        if knn:  # the maximum distance can reach 1.414
             identity = knn[0]["identity"]
         else:
             identity = "Desconegut"

     except Exception as e:
-        print(f"
         knn = []
         identity = "Desconegut"

@@ -411,22 +408,22 @@ import os
 import shutil
 import tempfile

-def
     """
-
-
     """
     if not os.path.exists(original_file):
-        raise FileNotFoundError(f"{original_file}

-    #
     temp_fd, temp_path = tempfile.mkstemp(suffix=os.path.splitext(original_file)[1])
-    os.close(temp_fd)  #

-    #
     shutil.copy2(original_file, temp_path)

-    #
     os.remove(original_file)

     return temp_path
@@ -450,17 +447,23 @@ def extract_audio_ffmpeg(video_file, sr: int = 16000, mono: bool = True):
     str
         Filepath to the extracted WAV audio file.
     """
-    if video_file is None:
-
     base_name = os.path.splitext(os.path.basename(video_file))[0]
-
     audio_out = f"./{base_name}.wav"
-
-
-
     extract_audio(input_path=video_file, output_path=audio_out)
-
-

 import torch
 import torchaudio
@@ -480,32 +483,36 @@ def transcribe_wav(wav_path: str) -> str:
     dev = device
     if dev == "cuda" and not torch.cuda.is_available():
         dev = "cpu"
     processor, model = _lazy_load_whisper()
     device = dev
-
     waveform, sr = torchaudio.load(wav_path)

     target_sr = 16000
     if sr != target_sr:
         waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(waveform)
         sr = target_sr
-
     inputs = processor(
         waveform.numpy(), sampling_rate=sr, return_tensors="pt"
     ).input_features.to(model.device)

-    #
     with torch.no_grad():
         ids = model.generate(inputs, max_new_tokens=440)[0]

-    #
     txt = processor.decode(ids)

-    #
     norm = getattr(processor.tokenizer, "_normalize", None)
     return norm(txt) if callable(norm) else txt

-
 def transcribe_long_audio(
     wav_path: str,
     chunk_length_s: int = 20,
@@ -516,17 +523,21 @@ def transcribe_long_audio(
     dev = device
     if dev == "cuda" and not torch.cuda.is_available():
         dev = "cpu"
     processor, model = _lazy_load_whisper()
     device = dev
-
     waveform, sr = torchaudio.load(wav_path)
     target_sr = 16000
     if sr != target_sr:
         waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(waveform)
         sr = target_sr
     total_samples = waveform.shape[1]

-    #
     chunk_size = chunk_length_s * sr
     overlap_size = overlap_s * sr

@@ -535,14 +546,16 @@ def transcribe_long_audio(

     while start < total_samples:
         end = min(start + chunk_size, total_samples)
-        chunk = waveform[:, start:end]

         input_features = processor(
             chunk.numpy(),
             sampling_rate=sr,
             return_tensors="pt"
         ).input_features.to(model.device)

         with torch.no_grad():
             predicted_ids = model.generate(
                 input_features,
@@ -550,15 +563,16 @@ def transcribe_long_audio(
                 num_beams=1,
             )[0]

         text = processor.decode(predicted_ids, skip_special_tokens=True)
         transcriptions.append(text.strip())

-        #
         start += chunk_size - overlap_size

     return " ".join(transcriptions).strip()

-
 """
 # ==============================================================================
 # UI & Endpoints
@@ -593,45 +607,20 @@ h2 {
 }
 """
 with gr.Blocks(title="Aina faster-whisper (Català) · ZeroGPU", css=custom_css,theme=gr.themes.Soft()) as demo:
-    # Main transcription section
-    gr.Markdown('<h2 style="text-align:center">Aina faster-whisper (Català) · ZeroGPU - Reconeixement de veu en català finetune projecte-aina</h2>')
-    with gr.Row():
-        with gr.Column():
-            inp = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Àudio (WAV/MP3/MP4, etc.)")
-            lang = gr.Textbox(label="Idioma", value="ca")
-            ts = gr.Checkbox(label="Marques de temps", value=True)
-            vad = gr.Checkbox(label="Filtre VAD", value=True)
-        with gr.Column():
-            out = gr.JSON(label="Sortida /predict")
-    with gr.Row():
-        btn = gr.Button("Transcriure (ENGINE /predict)", variant="primary")

-    #
-
-
-    # Advanced transcription section
-    gr.Markdown('<h2 style="text-align:center">Avançat (/transcribe)</h2>')
     with gr.Row():
-
-        inp2 = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Àudio")
-        lang2 = gr.Textbox(label="Idioma", value="ca")
-        task2 = gr.Dropdown(["transcribe", "translate"], value="transcribe", label="Tasques")
-        vad2 = gr.Checkbox(label="Filtre VAD", value=True)
-        beam2 = gr.Slider(1, 10, value=5, step=1, label="Mida del feix")
-        temp2 = gr.Slider(0.0, 1.5, value=0.0, step=0.1, label="Temperatura")
-        wts2 = gr.Checkbox(label="Marques de temps per paraula", value=False)
-        with gr.Column():
-            out2 = gr.JSON(label="Sortida /transcribe")
     with gr.Row():
-

-
-
-
-
-        out2,
-        api_name="transcribe",
-        concurrency_limit=1
     )

     # Diarization section
@@ -692,21 +681,6 @@ with gr.Blocks(title="Aina faster-whisper (Català) · ZeroGPU", css=custom_css,
         concurrency_limit=1
     )

-    # Extract audio from video
-    gr.Markdown('<h2 style="text-align:center">Extreure àudio d\'un vídeo</h2>')
-    with gr.Row():
-        video_input = gr.Video(label="Puja un vídeo")
-    with gr.Row():
-        extract_btn = gr.Button("Extreure àudio", variant="primary")
-    with gr.Row():
-        audio_output = gr.Audio(label="Àudio extret (WAV)", type="filepath")
-
-    extract_btn.click(
-        fn=extract_audio_ffmpeg,
-        inputs=video_input,
-        outputs=audio_output
-    )
-
     # Short audio transcription
     gr.Markdown('<h2 style="text-align:center">Aina faster-whisper (Català) Àudio curt → text</h2>')
     with gr.Row():
@@ -736,5 +710,46 @@ with gr.Blocks(title="Aina faster-whisper (Català) · ZeroGPU", css=custom_css,
         inputs=audio_input,
         outputs=output_text
     )

 demo.queue(max_size=8).launch(share=True,show_error=True)
Updated app.py (lines added by this commit are marked with +):

+# app.py — veureu/asr (Aina faster-whisper Catalan · ZeroGPU) — compatible with ENGINE
 from __future__ import annotations
 import os, json, tempfile
 from typing import Dict, Any, List, Tuple, Optional

 # faster-whisper (CTranslate2)
 from faster_whisper import WhisperModel

 # =========================
+# Config and lazy loading
 # =========================
+# By default we use the Catalan finetune from projecte-aina on HF.
+# Change MODEL_ID to the exact repo you are using (e.g.: "projecte-aina/faster-whisper-large-v3-ca-3catparla")
 MODEL_ID = os.environ.get("MODEL_ID", "projecte-aina/faster-whisper-large-v3-ca-3catparla")

+# Detect if there is a GPU (ZeroGPU) -> fp16, otherwise INT8
 HAS_CUDA = os.environ.get("CUDA_VISIBLE_DEVICES") not in (None, "", "-1")
 DEVICE = "cuda" if HAS_CUDA else "cpu"
+COMPUTE_TYPE = "float16" if HAS_CUDA else "int8"  # "int8_float16" also works on low-end GPUs

 _model: Optional[WhisperModel] = None

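For context, a minimal sketch of how this configuration block is typically driven from the environment when running the Space locally. Only the variable names (MODEL_ID, HF_HOME, CUDA_VISIBLE_DEVICES) come from the code above; the values and the module name are illustrative.

    import os
    # Prepare the environment before app.py is imported (the config block runs at import time).
    os.environ.setdefault("MODEL_ID", "projecte-aina/faster-whisper-large-v3-ca-3catparla")
    os.environ.setdefault("HF_HOME", "/data/hf-cache")    # illustrative cache dir, later passed as download_root
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "-1")   # unset or "-1" -> CPU device and int8 compute type
    import app  # hypothetical module name for app.py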
             MODEL_ID,
             device=DEVICE,
             compute_type=COMPUTE_TYPE,
+            download_root=os.environ.get("HF_HOME") or None,  # optional
         )
     return _model

     global _model_whis, _processor_whis
     if _model_whis is None or _processor_whis is None:
         model_name = "projecte-aina/whisper-large-v3-ca-3catparla"
+
+        # processor
         _processor_whis = WhisperProcessor.from_pretrained(model_name)

+        # model
         m = WhisperForConditionalGeneration.from_pretrained(
             model_name,
             low_cpu_mem_usage=True,
     return _processor_whis, _model_whis

 # ==================================
+# Transcription core (Catalan)
 # ==================================
 @spaces.GPU
 def _transcribe_core(
     word_timestamps: bool = False,
 ) -> Dict[str, Any]:
     """
+    Returns:
     {
+      "text": "transcription…",
       "segments": [
         {"start": 0.10, "end": 1.92, "text": "…"},
         ...

     """
     model = _lazy_model()

+    # faster-whisper produces a generator of segments + info
     segments, info = model.transcribe(
         audio_path,
         language=language or "ca",
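For reference, a minimal sketch of how the faster-whisper segment generator is usually folded into the {"text", "segments"} shape documented in the docstring above; the rest of _transcribe_core is not shown in this hunk, so the exact extra keys are assumptions.

    segs = []
    parts = []
    for s in segments:  # iterating the generator performs the actual decoding
        segs.append({"start": round(s.start, 2), "end": round(s.end, 2), "text": s.text.strip()})
        parts.append(s.text.strip())

    result = {
        "text": " ".join(parts),
        "segments": segs,
        "language": info.language,   # TranscriptionInfo fields provided by faster-whisper
        "duration": info.duration,
    }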
 import soundfile as sf

 def diarize_audio(
+    wav_file: str,
     min_segment_duration: float = 0.5,
     max_segment_duration: float = 50.0,
 ) -> Tuple[List[str], List[Dict[str, Any]]]:
     """
+    Audio diarization that:
+    - Reads a WAV file
+    - Returns clips in memory as dicts for Gradio (without saving files)
+    - Returns the list of segments [{'start','end','speaker'}]
     """
+    # Load audio and calculate duration
+    audio = AudioSegment.from_wav(wav_file)
     duration = len(audio) / 1000.0

+    # Diarization pipeline
     pipeline = Pipeline.from_pretrained(
         "pyannote/speaker-diarization-3.1",
         use_auth_token=os.getenv('HF_TOKEN')
     )
+    diarization = pipeline(wav_file)

     clip_buffers: List[Tuple[str, BytesIO]] = []
     segments: List[Dict[str, Any]] = []
     spk_map: Dict[str, int] = {}
     prev_end = 0.0

+    # Process each segment
     for i, (turn, _, speaker) in enumerate(diarization.itertracks(yield_label=True)):
         start, end = max(0.0, float(turn.start)), min(duration, float(turn.end))

         if seg_dur < min_segment_duration:
             continue

+        # Split very long segments
         if seg_dur > max_segment_duration:
             n = int(math.ceil(seg_dur / max_segment_duration))
             sub_d = seg_dur / n

         segments.append({"start": start, "end": end, "speaker": f"SPEAKER_{spk_map[speaker]:02d}"})
         prev_end = end

+    # If no segments, use the entire audio
     if not segments:
         buf = BytesIO()
         audio.export(buf, format="wav")
         buf.seek(0)
         return [{"name": "segment_000.wav", "data": base64.b64encode(buf.read()).decode("utf-8")}], [{"start": 0.0, "end": duration, "speaker": "SPEAKER_00"}]

+    # Convert all clips to dicts for Gradio
     print("Clip buffers:")
     print(clip_buffers)

     gr_clips = []
     for i, (name, buf) in enumerate(clip_buffers, start=1):
         buf.seek(0)
+        # Create temporary file but with friendly name
         tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
         tmp_file.write(buf.read())
         tmp_file.close()

+        # Rename to something like "clip1.wav", "clip2.wav", ...
+        new_name = f"clip{i}.wav"
         new_path = os.path.join(tempfile.gettempdir(), new_name)
         os.rename(tmp_file.name, new_path)

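A usage sketch for the diarization helper above; the file name and the pairing with _transcribe_core are illustrative. In the normal path the first return value holds temporary WAV clip paths, while the no-segment fallback returns base64 dicts instead.

    clips, segs = diarize_audio("meeting.wav")
    for clip_path, seg in zip(clips, segs):
        # One clip per diarized segment, e.g. "clip1.wav" in the temp dir
        text = _transcribe_core(clip_path, language="ca")["text"]
        print(f'{seg["speaker"]} [{seg["start"]:.2f}-{seg["end"]:.2f}]: {text}')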
 import torchaudio
 import torch

+def voice_embedder(wav_file: str) -> List[float]:
     print("======================================================")
     model = SpeakerRecognition.from_hparams(
         source="pretrained_models/spkrec-ecapa-voxceleb",
     )
     model.eval()
     print("======================================================")
+
+    # Audio preprocessing
+    waveform, sr = torchaudio.load(wav_file)
     target_sr = 16000

+    # Resample if needed
     if sr != target_sr:
         waveform = T.Resample(orig_freq=sr, new_freq=target_sr)(waveform)
+
+    # Convert to mono if stereo
     if waveform.shape[0] > 1:
         waveform = waveform.mean(dim=0, keepdim=True)

+    # Minimum duration of 0.2 seconds
     min_samples = int(0.2 * target_sr)
     if waveform.shape[1] < min_samples:
         pad = min_samples - waveform.shape[1]
         waveform = torch.cat([waveform, torch.zeros((1, pad))], dim=1)

+    # Compute speaker embedding
     with torch.no_grad():
         emb = (
             model.encode_batch(waveform)
             .astype(float)
         )

+    # Normalize embedding
     emb = emb / np.linalg.norm(emb)
     print(len(emb))
     print(emb.tolist())
+    return emb.tolist()

+def identify_speaker(wav_file: str, voice_col: List[Dict[str, Any]]) -> Dict[str, Any]:
+    voice_embedding = voice_embedder(wav_file)
     voice_col = json.loads(voice_col)

     identity = "Desconegut"

         distances_embedding = []

+        # Compute Euclidean distance between the detected voice and each stored embedding
         for voice_base_datos in voice_col:
             voice_base_datos_embedding = np.array(voice_base_datos["embedding"])
             distance = np.linalg.norm(voice_embedding - voice_base_datos_embedding)

         distances_embedding = sorted(distances_embedding, key=lambda x: x["distance"])
         knn = distances_embedding[:n_results]

+        # Assign identity if closest match exists
+        if knn:
             identity = knn[0]["identity"]
         else:
             identity = "Desconegut"

     except Exception as e:
+        print(f"Voice KNN failed: {e}")
         knn = []
         identity = "Desconegut"

|
| 408 |
import shutil
|
| 409 |
import tempfile
|
| 410 |
|
| 411 |
+
def convert_to_temporary(original_file):
|
| 412 |
"""
|
| 413 |
+
Converts a file to a temporary file, deletes the original, and returns
|
| 414 |
+
the path of the temporary file.
|
| 415 |
"""
|
| 416 |
if not os.path.exists(original_file):
|
| 417 |
+
raise FileNotFoundError(f"{original_file} does not exist")
|
| 418 |
|
| 419 |
+
# Create a temporary file in persistent mode
|
| 420 |
temp_fd, temp_path = tempfile.mkstemp(suffix=os.path.splitext(original_file)[1])
|
| 421 |
+
os.close(temp_fd) # Close the file descriptor; we'll use it as a normal file
|
| 422 |
|
| 423 |
+
# Copy the content to the temporary file
|
| 424 |
shutil.copy2(original_file, temp_path)
|
| 425 |
|
| 426 |
+
# Delete the original file
|
| 427 |
os.remove(original_file)
|
| 428 |
|
| 429 |
return temp_path
|
|
|
|
| 447 |
str
|
| 448 |
Filepath to the extracted WAV audio file.
|
| 449 |
"""
|
| 450 |
+
if video_file is None:
|
| 451 |
+
return None
|
| 452 |
+
|
| 453 |
+
# Extract the file name without extension
|
| 454 |
base_name = os.path.splitext(os.path.basename(video_file))[0]
|
| 455 |
+
|
| 456 |
+
# Build the output path with .wav extension
|
| 457 |
audio_out = f"./{base_name}.wav"
|
| 458 |
+
|
| 459 |
+
# If the file already exists, return it directly
|
| 460 |
+
if os.path.exists(audio_out+".mp3"):
|
| 461 |
+
return audio_out
|
| 462 |
+
|
| 463 |
+
# Call the function that performs the extraction
|
| 464 |
extract_audio(input_path=video_file, output_path=audio_out)
|
| 465 |
+
|
| 466 |
+
return convert_to_temporary(audio_out+".mp3")
|
| 467 |
|
| 468 |
import torch
|
| 469 |
import torchaudio
|
|
|
|
| 483 |
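extract_audio_ffmpeg above delegates the conversion to an extract_audio helper that is not shown in this diff. Purely as an illustration (not the project's helper), an equivalent call to the ffmpeg binary could look like this, assuming ffmpeg is installed on the Space:

    import subprocess

    def extract_audio_sketch(input_path: str, output_path: str, sr: int = 16000, mono: bool = True) -> str:
        # Hypothetical stand-in for the external extract_audio() used above
        cmd = ["ffmpeg", "-y", "-i", input_path, "-vn", "-ar", str(sr)]
        if mono:
            cmd += ["-ac", "1"]
        cmd.append(output_path)
        subprocess.run(cmd, check=True)
        return output_path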
     dev = device
     if dev == "cuda" and not torch.cuda.is_available():
         dev = "cpu"
+
+    # Lazy-load the Whisper processor and model
     processor, model = _lazy_load_whisper()
     device = dev
+
+    # Load the WAV file
     waveform, sr = torchaudio.load(wav_path)

     target_sr = 16000
     if sr != target_sr:
+        # Resample audio if sample rate differs
         waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(waveform)
         sr = target_sr
+
+    # Preprocess the audio
     inputs = processor(
         waveform.numpy(), sampling_rate=sr, return_tensors="pt"
     ).input_features.to(model.device)

+    # Generate transcription with the model
     with torch.no_grad():
         ids = model.generate(inputs, max_new_tokens=440)[0]

+    # Decode the transcription
     txt = processor.decode(ids)

+    # Normalize text if necessary
     norm = getattr(processor.tokenizer, "_normalize", None)
     return norm(txt) if callable(norm) else txt

 def transcribe_long_audio(
     wav_path: str,
     chunk_length_s: int = 20,

     dev = device
     if dev == "cuda" and not torch.cuda.is_available():
         dev = "cpu"
+
+    # Lazy-load the Whisper processor and model
     processor, model = _lazy_load_whisper()
     device = dev
+
+    # Load the full WAV file
     waveform, sr = torchaudio.load(wav_path)
     target_sr = 16000
     if sr != target_sr:
+        # Resample if sample rate differs
         waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(waveform)
         sr = target_sr
     total_samples = waveform.shape[1]

+    # Calculate chunk size and overlap in samples
     chunk_size = chunk_length_s * sr
     overlap_size = overlap_s * sr

     while start < total_samples:
         end = min(start + chunk_size, total_samples)
+        chunk = waveform[:, start:end]  # Transcribe in small fragments

+        # Preprocess the chunk
         input_features = processor(
             chunk.numpy(),
             sampling_rate=sr,
             return_tensors="pt"
         ).input_features.to(model.device)

+        # Generate transcription for the chunk
         with torch.no_grad():
             predicted_ids = model.generate(
                 input_features,
                 num_beams=1,
             )[0]

+        # Decode and store the chunk transcription
         text = processor.decode(predicted_ids, skip_special_tokens=True)
         transcriptions.append(text.strip())

+        # Move to the next chunk with overlap
         start += chunk_size - overlap_size

+    # Join all chunks into a single string
     return " ".join(transcriptions).strip()

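To make the chunking arithmetic above concrete (only chunk_length_s = 20 is visible in the diff; the overlap_s value below is an assumed example): at 16 kHz each window is 320 000 samples and the start index advances by chunk_size - overlap_size, i.e. a new window every 15 s.

    sr = 16000
    chunk_length_s, overlap_s = 20, 5                  # overlap_s = 5 is an assumed example value
    chunk_size = chunk_length_s * sr                   # 320_000 samples per window
    step = chunk_size - overlap_s * sr                 # 240_000 samples between window starts
    starts = [s / sr for s in range(0, 60 * sr, step)]
    print(starts)                                      # [0.0, 15.0, 30.0, 45.0] for a 60 s file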
"""
|
| 577 |
# ==============================================================================
|
| 578 |
# UI & Endpoints
|
|
|
|
| 607 |
}
|
| 608 |
"""
|
| 609 |
with gr.Blocks(title="Aina faster-whisper (Català) · ZeroGPU", css=custom_css,theme=gr.themes.Soft()) as demo:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 610 |
|
| 611 |
+
# Extract audio from video
|
| 612 |
+
gr.Markdown('<h2 style="text-align:center">Extreure àudio d\'un vídeo</h2>')
|
|
|
|
|
|
|
|
|
|
| 613 |
with gr.Row():
|
| 614 |
+
video_input = gr.Video(label="Puja un vídeo")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 615 |
with gr.Row():
|
| 616 |
+
extract_btn = gr.Button("Extreure àudio", variant="primary")
|
| 617 |
+
with gr.Row():
|
| 618 |
+
audio_output = gr.Audio(label="Àudio extret (WAV)", type="filepath")
|
| 619 |
|
| 620 |
+
extract_btn.click(
|
| 621 |
+
fn=extract_audio_ffmpeg,
|
| 622 |
+
inputs=video_input,
|
| 623 |
+
outputs=audio_output
|
|
|
|
|
|
|
|
|
|
| 624 |
)
|
| 625 |
|
| 626 |
# Diarization section
|
|
|
|
| 681 |
concurrency_limit=1
|
| 682 |
)
|
| 683 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 684 |
# Short audio transcription
|
| 685 |
gr.Markdown('<h2 style="text-align:center">Aina faster-whisper (Català) Àudio curt → text</h2>')
|
| 686 |
with gr.Row():
|
|
|
|
         inputs=audio_input,
         outputs=output_text
     )
+
+    # Main transcription section
+    gr.Markdown('<h2 style="text-align:center">Aina faster-whisper (Català) · ZeroGPU - Reconeixement de veu en català finetune projecte-aina</h2>')
+    with gr.Row():
+        with gr.Column():
+            inp = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Àudio (WAV/MP3/MP4, etc.)")
+            lang = gr.Textbox(label="Idioma", value="ca")
+            ts = gr.Checkbox(label="Marques de temps", value=True)
+            vad = gr.Checkbox(label="Filtre VAD", value=True)
+        with gr.Column():
+            out = gr.JSON(label="Sortida /predict")
+    with gr.Row():
+        btn = gr.Button("Transcriure (ENGINE /predict)", variant="primary")
+
+    # Button callback
+    btn.click(predict_for_engine, [inp, lang, ts, vad], out, api_name="predict", concurrency_limit=1)
+
+    # Advanced transcription section
+    gr.Markdown('<h2 style="text-align:center">Avançat (/transcribe)</h2>')
+    with gr.Row():
+        with gr.Column():
+            inp2 = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Àudio")
+            lang2 = gr.Textbox(label="Idioma", value="ca")
+            task2 = gr.Dropdown(["transcribe", "translate"], value="transcribe", label="Tasques")
+            vad2 = gr.Checkbox(label="Filtre VAD", value=True)
+            beam2 = gr.Slider(1, 10, value=5, step=1, label="Mida del feix")
+            temp2 = gr.Slider(0.0, 1.5, value=0.0, step=0.1, label="Temperatura")
+            wts2 = gr.Checkbox(label="Marques de temps per paraula", value=False)
+        with gr.Column():
+            out2 = gr.JSON(label="Sortida /transcribe")
+    with gr.Row():
+        btn2 = gr.Button("Transcriure (avançat)", variant="primary")
+
+    # Button callback advanced
+    btn2.click(
+        transcribe_advanced,
+        [inp2, lang2, task2, vad2, beam2, temp2, wts2],
+        out2,
+        api_name="transcribe",
+        concurrency_limit=1
+    )

 demo.queue(max_size=8).launch(share=True,show_error=True)
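Because the callbacks are registered with api_name="predict" and api_name="transcribe", the Space can also be called programmatically. A sketch with gradio_client follows; the Space id and file name are illustrative, and the positional arguments mirror the [inp, lang, ts, vad] inputs wired above.

    from gradio_client import Client, handle_file

    client = Client("veureu/asr")          # illustrative Space id
    result = client.predict(
        handle_file("sample_ca.wav"),      # inp: audio file
        "ca",                              # lang
        True,                              # ts: timestamps
        True,                              # vad: VAD filter
        api_name="/predict",
    )
    print(result)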