Spaces:

RobotsMali
/

RobotsMali_Video_captionning

Running

App Files Files Community

binaryMao commited on Nov 1, 2025

Commit

e18b5e6

verified ·

1 Parent(s): 5738fbf

Update app.py

Browse files

Files changed (1) hide show

app.py +144 -447

app.py CHANGED Viewed

@@ -1,8 +1,6 @@
 # -*- coding: utf-8 -*-
-"""Video_Captioning_Space_V8_0_MINIMALIST_BLUE.ipynb
-Architecture NeMo + ctc-segmentation pour l'alignement sur tous les modèles.
-Design Minimalist Blue.
-"""
 import gradio as gr
 import numpy as np
 import torch
@@ -11,493 +9,192 @@ import os
 import tempfile
 import warnings
 from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
-from typing import List, Tuple, Union
-# --- Installation des dépendances pour Google Colab (À exécuter avant ce script) ---
-# !pip install gradio moviepy numpy torch soundfile
-# !pip install nemo_toolkit['asr']
-# !pip install ctc-segmentation huggingface-hub
 try:
     from nemo.collections import asr as nemo_asr
-    from huggingface_hub import hf_hub_download, snapshot_download
     from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
     NEMO_LOADED = True
-except ImportError as e:
     NEMO_LOADED = False
-    print(f"Erreur d'importation des dépendances NeMo/CTC : {e}")
-    # Classes/Fonctions de substitution pour éviter le crash au lancement
-    class DummyASRModel:
-        def from_pretrained(self, *args, **kwargs):
-            raise RuntimeError("Dépendances ASR manquantes. Veuillez exécuter la cellule d'installation.")
-    nemo_asr = type('nemo_asr', (object,), {'models': type('models', (object,), {'EncDecHybridRNNTCTCBPEModel': DummyASRModel, 'EncDecCTCModelBPE': DummyASRModel})})
-    hf_hub_download = lambda *args, **kwargs: None
-    snapshot_download = lambda *args, **kwargs: None
-# --- CONFIGURATION DES MODÈLES (Utilisation de votre liste complète) ---
 MODELS = {
     "Soloni V1 (RNnT - Précis)": ("RobotsMali/soloni-114m-tdt-ctc-V1", "soloni-114m-tdt-ctc-V1.nemo", "rnnt"),
     "Soloba V1 (CTC - Équilibré)": ("RobotsMali/soloba-ctc-0.6b-V1", None, "ctc"),
     "QuartzNet V1 (CTC - Rapide)": ("RobotsMali/stt-bm-quartznet15x5-V1", None, "ctc"),
-    # Anciennes versions (Gardées pour la compatibilité, mais V1 recommandées)
-    "Soloni V0 (RNnT)": ("RobotsMali/soloni-114m-tdt-ctc-V0", "soloni-114m-tdt-ctc-V0.nemo", "rnnt"),
-    "Soloba V0 (CTC)": ("RobotsMali/soloba-ctc-0.6b-V0", None, "ctc"),
-    "QuartzNet V0 (CTC)": ("RobotsMali/stt-bm-quartznet15x5-V0", None, "ctc"),
 }
-asr_pipeline = {}
-# --- CSS : ROBOTSMALI MINIMALIST BLUE ---
-CUSTOM_CSS = """
-    @import url('https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;700&display=swap');
-    /* Couleurs */
-    :root {
-        --primary-color: #007bff; /* Bleu de base */
-        --accent-color: #00BFFF; /* Bleu Cyan Électrique */
-        --background-light: #F8F9FA; /* Gris très clair */
-        --surface-color: #FFFFFF; /* Blanc */
-        --text-color: #212529; /* Gris très foncé */
-        --border-color: #E9ECEF;
-    }
-    body {
-        background-color: var(--background-light) !important;
-        font-family: 'Roboto', sans-serif !important;
-        color: var(--text-color) !important;
-    }
-    .gradio-container {
-        max-width: 1200px;
-        margin: 0 auto;
-        padding: 20px 10px;
-        background-color: var(--background-light) !important;
-        border-radius: 0 !important;
-    }
-    /* Conteneurs et cartes (Blocs) */
-    .block {
-        border: 1px solid var(--border-color);
-        border-radius: 8px;
-        background-color: var(--surface-color);
-        box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
-        padding: 20px;
-    }
-    /* Titres */
-    h1 {
-        color: var(--accent-color) !important;
-        text-align: center;
-        margin-bottom: 5px;
-        font-weight: 700;
-    }
-    h3 {
-        color: var(--primary-color) !important;
-        font-weight: 500;
-        border-bottom: 1px solid var(--border-color);
-        padding-bottom: 5px;
-        margin-bottom: 15px;
-    }
-    /* Boutons d'action : Bleu Primair */
-    .primary {
-        background-color: var(--primary-color) !important;
-        border: none !important;
-        color: white !important;
-        font-weight: 700;
-        text-transform: uppercase;
-        transition: background-color 0.2s;
-    }
-    .primary:hover {
-        background-color: #0056b3 !important; /* Bleu foncé au survol */
-        box-shadow: 0 0 8px rgba(0, 123, 255, 0.4);
-    }
-    /* Inputs et Dropdowns */
-    .gr-input, .gr-dropdown {
-        background-color: #FFFFFF !important;
-        border: 1px solid #CED4DA !important;
-        color: var(--text-color) !important;
-        border-radius: 4px;
-    }
-    .gr-file-input {
-        border: 2px dashed var(--primary-color) !important;
-        background-color: #F0F5FF !important;
-    }
-    /* Statut d'exécution */
-    .gr-status {
-        background-color: #E6F0FF !important;
-        border-left: 5px solid var(--primary-color);
-        color: var(--text-color) !important;
-        padding: 10px;
-    }
-"""
-# ----------------------------------------------------------------------
-# FONCTIONS DE CHARGEMENT ET D'ALIGEMENT
-# ----------------------------------------------------------------------
 def load_ctc_model_safe(repo_id):
-    """Charge les modèles CTC de manière robuste (votre fonction)"""
-    # Votre logique de chargement stable est conservée
     try:
-        # Essai 1: Chargement standard
         return nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name=repo_id)
-    except Exception as e:
-        # Essai 2: Téléchargement manuel via snapshot si l'essai 1 échoue
-        print(f"Erreur lors du chargement standard du CTC: {e}. Tentative de téléchargement manuel...")
         with tempfile.TemporaryDirectory() as tmpdir:
-            try:
-                model_path = snapshot_download(
-                    repo_id=repo_id,
-                    cache_dir=tmpdir,
-                    local_dir_use_symlinks=False
-                )
-                # Chercher le fichier .nemo
-                nemo_file = None
-                for file in os.listdir(model_path):
-                    if file.endswith('.nemo'):
-                        nemo_file = os.path.join(model_path, file)
-                        break
-                if nemo_file and os.path.exists(nemo_file):
-                    print(f"Chargement réussi depuis: {nemo_file}")
-                    return nemo_asr.models.EncDecCTCModelBPE.restore_from(nemo_file)
-                else:
-                    raise FileNotFoundError("Fichier .nemo non trouvé dans le repo téléchargé.")
-            except Exception as e2:
-                raise Exception(f"Échec du téléchargement/chargement manuel du modèle CTC: {e2}")
-def load_asr_model(model_name: str):
-    """Gestion centralisée du chargement de modèles (RNNT et CTC)"""
-    global asr_pipeline
     repo_id, nemo_file, mode = MODELS[model_name]
     if model_name not in asr_pipeline:
-        print(f"-> Chargement initial du modèle : {model_name} (Mode: {mode})")
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         if mode == "rnnt":
-            # RNNT (Soloni) : Téléchargement du fichier .nemo spécifique
-            if not nemo_file: raise ValueError("Nom de fichier .nemo manquant pour le modèle RNNT.")
             nemo_path = hf_hub_download(repo_id, filename=nemo_file)
-            model_instance = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo_path)
         else:
-            # CTC (Soloba, QuartzNet) : Utilisation de la fonction sécurisée
-            model_instance = load_ctc_model_safe(repo_id)
-        model_instance = model_instance.to(device)
-        model_instance.eval()
-        asr_pipeline[model_name] = model_instance
-        print(f"-> Modèle {model_name} chargé sur {device}.")
     return asr_pipeline[model_name]
-# --- Logique de Segmentation et d'Optimisation des Lignes ---
-MAX_SUBTITLE_WORDS = 4
-MAX_SUBTITLE_CHARS = 45
-MAX_SUBTITLE_DURATION = 3.5 # Durée maximale en secondes pour une ligne de sous-titre
-def group_words_to_subtitles(words_with_timestamps: List[Tuple[float, float, str]]) -> List[Tuple[float, float, str]]:
-    """
-    Formate la liste de mots horodatés en lignes de sous-titres optimisées
-    selon les règles de mots, caractères et durée maximum.
-    Cette fonction assure l'optimisation pour les 6 modèles.
-    """
-    subtitles = []
-    if not words_with_timestamps: return []
-    current_group = []
-    def finalize_group(group):
-        if not group: return
-        start_time = group[0][0]
-        end_time = group[-1][1]
-        line_text = " ".join([w[2] for w in group])
-        subtitles.append((start_time, end_time, line_text))
-    for word_data in words_with_timestamps:
-        # Tentative d'ajouter le mot au groupe actuel
-        test_group = current_group + [word_data]
-        test_text = " ".join([w[2] for w in test_group])
-        # Calcul de la durée du groupe test
-        test_duration = test_group[-1][1] - test_group[0][0] if test_group else 0
-        should_cut = False
-        # Règle 1: Dépasser la limite de mots
-        if len(current_group) >= MAX_SUBTITLE_WORDS:
-            should_cut = True
-        # Règle 2: Dépasser la limite de caractères (avant l'ajout)
-        elif len(test_text) > MAX_SUBTITLE_CHARS and current_group:
-            should_cut = True
-        # Règle 3: Dépasser la durée maximum (avant l'ajout)
-        # On coupe si la durée est trop longue, mais seulement si le groupe a
-        # déjà une taille raisonnable (>= 2 mots) pour éviter des coupures trop courtes.
-        elif len(current_group) >= 2 and test_duration > MAX_SUBTITLE_DURATION:
-            should_cut = True
-        if should_cut:
-            finalize_group(current_group)
-            current_group = [word_data]
         else:
-            # Si aucune règle de coupure n'est déclenchée, on ajoute le mot au groupe
-            current_group.append(word_data)
-    # Finalisation du dernier groupe
-    finalize_group(current_group)
-    return subtitles
-def transcribe(model, device, wav, model_name):
-    """Transcrit l'audio et génère des horodatages de LIGNES (start, end, text)"""
-    # Lecture de l'audio
-    audio, sr = sf.read(wav)
-    if audio.ndim == 2:
-        audio = np.mean(audio, axis=1)
     x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
     ln = torch.tensor([x.shape[1]]).to(device)
     total_s = len(audio) / sr
-    # --- Mode RNNT (Soloni) : Utilisation de l'alignement natif ---
     if "Soloni" in model_name:
-        with torch.no_grad():
-            proc, plen = model.preprocessor(input_signal=x, input_signal_length=ln)
-            # Utilisation du decode_and_align natif pour les word-timestamps
-            hyps = model.decode_and_align(encoder_output=proc, encoded_lengths=plen)
-        if not hyps or not hyps[0]: return []
-        hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
-        word_timestamps = [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
-        # Application de la logique d'optimisation
-        return group_words_to_subtitles(word_timestamps)
-    # --- Mode CTC (Soloba, QuartzNet) : Utilisation de ctc-segmentation ---
-    text = model.transcribe([wav])[0].strip()
-    if not text: return []
-    with torch.no_grad():
-        logits, logit_len = model.forward(input_signal=x, input_signal_length=ln)
-    words = text.split()
-    if not words: return []
-    # CTC Segmentation
-    config = CtcSegmentationParameters()
-    config.char_list = list(model.tokenizer.vocab.keys())
-    gt, _ = prepare_text(config, words)
-    # Suppression des avertissements de ctc_segmentation
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        timings, _, _ = ctc_segmentation(config, logits.cpu().numpy()[0], gt)
-    tps = total_s / logit_len.cpu().numpy()[0]
-    # Alignement des mots
-    aligned = [(timings[i] * tps,
-                timings[i+1] * tps if i+1 < len(timings) else total_s,
                 words[i]) for i in range(len(words))]
-    # Application de la logique d'optimisation
-    return group_words_to_subtitles(aligned)
-# --- Fonction d'Extraction Audio (Optimisée) ---
-def extract_audio(video_path, wav_path):
-    """Extrait l'audio de la vidéo avec gestion des ressources"""
-    try:
-        video = VideoFileClip(video_path)
-        video.audio.write_audiofile(
-            wav_path, fps=16000, codec="pcm_s16le", verbose=False, logger=None
-        )
-        video.close()
-    except Exception as e:
-        raise Exception(f"Erreur lors de l'extraction audio: {e}")
-# --- Fonction d'Incrustation Vidéo (Simplifiée et Stabilisée) ---
 def burn(video, subs):
-    """Ajoute les sous-titres à la vidéo en utilisant TextClip (plus stable)"""
-    out_path = "RobotsMali_Subtitled.mp4"
-    if os.path.exists(out_path): os.remove(out_path)
     clip = VideoFileClip(video)
     W, H = clip.size
-    subtitle_clips = []
-    for start, end, text in subs:
-        # Utilisation de TextClip pour la stabilité, le style et l'alignement
-        # Fond sombre semi-transparent pour la lisibilité sur TOUS fonds vidéo
-        txt_clip = TextClip(
-            text.upper(),
-            fontsize=H // 20,
-            color='white',
-            font='Roboto-Bold', # Utilisation d'une police web standard pour éviter les erreurs Colab
-            bg_color='rgba(0, 0, 0, 0.7)',
-            method='caption',
-            size=(W * 0.9, None) # 90% de la largeur pour le wrap
-        )
-        duration = max(0.1, end - start) # Durée minimale de 0.1s
-        txt_clip = txt_clip.set_pos(('center', H * 0.85)).set_duration(duration).set_start(start)
-        subtitle_clips.append(txt_clip)
-    # Composition finale
-    final = CompositeVideoClip([clip] + subtitle_clips)
-    # Écriture de la vidéo finale
-    final.write_videofile(
-        out_path,
-        codec="libx264",
-        audio_codec="aac",
-        fps=clip.fps,
-        bitrate="4000k", # Bitrate fixé à 4000k pour une qualité HD standard
-        preset="medium",
-        verbose=False,
-        logger=None,
-        temp_audiofile="temp-audio.m4a",
-        remove_temp=True
-    )
-    # Nettoyage
-    clip.close()
-    final.close()
-    for layer in subtitle_clips:
-        layer.close()
-    return out_path
-# --- Pipeline Principal ---
 def pipeline(video_file, model_name):
-    """Pipeline principal de traitement"""
-    if not NEMO_LOADED:
-        return "❌ ERREUR FATALE : NeMo/CTC Segmentation n'a pas été importé. Exécutez la cellule d'installation.", None
     if video_file is None:
-        return "Veuillez importer une vidéo.", None
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    yield f"🧠 Chargement du modèle {model_name} sur {device}..."
     try:
         model = load_asr_model(model_name)
-        yield "🎶 Extraction audio en cours..."
-        wav_path = os.path.join(tempfile.gettempdir(), "audio.wav")
-        extract_audio(video_file, wav_path)
-        yield "📝 Transcription et alignement des mots en cours..."
-        subs = transcribe(model, device, wav_path, model_name)
-        if not subs:
-            return "⚠️ ALERTE : Aucune parole détectée ou alignement échoué. Vérifiez la qualité audio.", None
-        yield "🎬 Incrustation des sous-titres sur la vidéo..."
-        final_video = burn(video_file, subs)
-        # Nettoyage des fichiers temporaires
-        if os.path.exists(wav_path):
-            os.remove(wav_path)
-        return "✅ PRODUCTION TERMINÉE avec succès!", final_video
     except Exception as e:
-        print(f"Erreur dans le pipeline: {e}")
-        # Nettoyage en cas d'erreur
-        if 'wav_path' in locals() and os.path.exists(wav_path): os.remove(wav_path)
-        return f"❌ ERREUR FATALE : {str(e)}", None
-# ----------------------------------------------------------------------
-# INTERFACE GRADIO - "ROBOTSMALI V8.0 : MINIMALIST BLUE"
-# ----------------------------------------------------------------------
-# Statut de l'application
-if NEMO_LOADED:
-    APP_STATUS = "✨ SYSTÈME PRÊT : Toutes les dépendances (NeMo/CTC) sont chargées."
-else:
-    APP_STATUS = "❌ DÉPENDANCES MANQUANTES : Veuillez exécuter la commande d'installation."
-with gr.Blocks(theme=gr.themes.Default(), title="RobotsMali V8.0", css=CUSTOM_CSS) as demo:
-    gr.Markdown(
-        f"""
-        # ⚡ **ROBOTSMALI V8.0 : MINIMALIST BLUE** ⚡
-        ### Sous-titrage et alignement de haute précision (RNNT & CTC).
-        *Statut : {APP_STATUS}*
-        ---
-        """
-    )
-    with gr.Row(equal_height=True):
-        with gr.Column(scale=1):
-            with gr.Group(elem_classes=["block"]):
-                gr.Markdown("### 1. Source & Configuration")
-                video = gr.Video(
-                    label="Vidéo d'entrée (MP4, MOV, AVI)",
-                    height=300,
-                    elem_classes=["gr-file-input"]
-                )
-                model = gr.Dropdown(
-                    list(MODELS.keys()),
-                    value="Soloni V1 (RNnT - Précis)",
-                    label="Modèle de Reconnaissance Vocale",
-                    info="RNnT (Soloni): meilleur alignement. CTC (Soloba/QuartzNet): plus rapide.",
-                    interactive=NEMO_LOADED,
-                )
-                btn = gr.Button("▶️ **INITIER LA PRODUCTION**", variant="primary", interactive=NEMO_LOADED)
-        with gr.Column(scale=2):
-            with gr.Group(elem_classes=["block"]):
-                gr.Markdown("### 2. Flux de Production & Résultat")
-                status = gr.Markdown(
-                    value="En attente du fichier source...",
-                    label="Journal de Bord",
-                    elem_classes=["gr-status"]
-                )
-                out = gr.Video(
-                    label="Vidéo sous-titrée",
-                    height=300,
-                    interactive=False
-                )
-    # Explication de la correction :
-    gr.Markdown(
-        """
-        ---
-        **Note de l'Expert :** La logique d'alignement a été optimisée et unifiée pour les 6 modèles:
-        -   **Optimisation:** Chaque ligne de sous-titre respecte désormais simultanément les limites de **4 mots**, **45 caractères** et une durée maximale de **3.5 secondes**, assurant un rythme de lecture optimal.
-        -   **Unification:** La même fonction d'optimisation est appliquée à la sortie de tous les modèles (RNNT et CTC).
-        """
-    )
-    # L'utilisation de 'fn' dans gr.Examples est dépréciée. Le clic est le standard.
-    btn.click(
-        fn=pipeline,
-        inputs=[video, model],
-        outputs=[status, out]
-    )
-if __name__ == "__main__":
-    demo.launch(share=True)

 # -*- coding: utf-8 -*-
+"""ROBOTSMALI VIDEO CAPTIONING V8 - MINIMALIST BLUE (STABLE VERSION)"""
 import gradio as gr
 import numpy as np
 import torch
 import tempfile
 import warnings
 from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
+from typing import List, Tuple
+from huggingface_hub import hf_hub_download, snapshot_download
+# ------------------------------------------------------------
+# Import NeMo
+# ------------------------------------------------------------
 try:
     from nemo.collections import asr as nemo_asr
     from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
     NEMO_LOADED = True
+except Exception as e:
+    print("❌ ERREUR : NeMo ou ctc-segmentation non installé.")
     NEMO_LOADED = False
+# ------------------------------------------------------------
+# Modèles RobotsMali
+# ------------------------------------------------------------
 MODELS = {
     "Soloni V1 (RNnT - Précis)": ("RobotsMali/soloni-114m-tdt-ctc-V1", "soloni-114m-tdt-ctc-V1.nemo", "rnnt"),
     "Soloba V1 (CTC - Équilibré)": ("RobotsMali/soloba-ctc-0.6b-V1", None, "ctc"),
     "QuartzNet V1 (CTC - Rapide)": ("RobotsMali/stt-bm-quartznet15x5-V1", None, "ctc"),
 }
+asr_pipeline = {}
+# ------------------------------------------------------------
+# Chargement modèle robuste
+# ------------------------------------------------------------
 def load_ctc_model_safe(repo_id):
     try:
         return nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name=repo_id)
+    except:
         with tempfile.TemporaryDirectory() as tmpdir:
+            path = snapshot_download(repo_id, cache_dir=tmpdir)
+            for f in os.listdir(path):
+                if f.endswith(".nemo"):
+                    return nemo_asr.models.EncDecCTCModelBPE.restore_from(os.path.join(path, f))
+        raise RuntimeError("Impossible de charger le modèle CTC.")
+def load_asr_model(model_name):
     repo_id, nemo_file, mode = MODELS[model_name]
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     if model_name not in asr_pipeline:
         if mode == "rnnt":
             nemo_path = hf_hub_download(repo_id, filename=nemo_file)
+            model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo_path)
         else:
+            model = load_ctc_model_safe(repo_id)
+        model.to(device).eval()
+        asr_pipeline[model_name] = model
     return asr_pipeline[model_name]
+# ------------------------------------------------------------
+# Groupage des mots en sous-titres
+# ------------------------------------------------------------
+MAX_WORDS = 4
+MAX_CHARS = 45
+MAX_DURATION = 3.5
+def group_words(words):
+    subs, group = [], []
+    def commit(g):
+        if g:
+            subs.append((g[0][0], g[-1][1], " ".join([w[2] for w in g])))
+    for w in words:
+        test = group + [w]
+        text = " ".join([t[2] for t in test])
+        duration = test[-1][1] - test[0][0]
+        if len(test) > MAX_WORDS or len(text) > MAX_CHARS or duration > MAX_DURATION:
+            commit(group)
+            group = [w]
         else:
+            group.append(w)
+    commit(group)
+    return subs
+# ------------------------------------------------------------
+# Transcription + Alignement
+# ------------------------------------------------------------
+def transcribe(model, device, wavfile, model_name):
+    audio, sr = sf.read(wavfile)
+    if audio.ndim == 2: audio = np.mean(audio, axis=1)
     x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
     ln = torch.tensor([x.shape[1]]).to(device)
     total_s = len(audio) / sr
+    # RNNT direct timestamps
     if "Soloni" in model_name:
+        hyps = model.decode_and_align(*model.preprocessor(input_signal=x, input_signal_length=ln))
+        words = [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyps[0][0].words]
+        return group_words(words)
+    # CTC + segmentation
+    text = model.transcribe([wavfile])[0]
+    if not text.strip(): return []
+    with torch.no_grad(): logits, loglen = model(x, ln)
+    words = text.strip().split()
+    cfg = CtcSegmentationParameters()
+    cfg.char_list = list(model.tokenizer.vocab.keys())
+    gt, _ = prepare_text(cfg, words)
+    timings, _, _ = ctc_segmentation(cfg, logits.cpu().numpy()[0], gt)
+    tps = total_s / loglen.cpu().numpy()[0]
+    aligned = [(timings[i]*tps,
+                timings[i+1]*tps if i+1 < len(timings) else total_s,
                 words[i]) for i in range(len(words))]
+    return group_words(aligned)
+# ------------------------------------------------------------
+# Extraction audio
+# ------------------------------------------------------------
+def extract_audio(video, wav):
+    v = VideoFileClip(video)
+    v.audio.write_audiofile(wav, fps=16000, codec="pcm_s16le", logger=None)
+    v.close()
+# ------------------------------------------------------------
+# Burn subtitles
+# ------------------------------------------------------------
 def burn(video, subs):
+    output = "RobotsMali_Subtitled.mp4"
     clip = VideoFileClip(video)
     W, H = clip.size
+    layers = []
+    for start, end, text in subs:
+        txt = TextClip(
+            text.upper(), fontsize=H//20, color='white', bg_color='rgba(0,0,0,0.7)',
+            method='caption', size=(W*0.9, None)
+        ).set_pos(("center", H*0.85)).set_duration(end-start).set_start(start)
+        layers.append(txt)
+    final = CompositeVideoClip([clip] + layers)
+    final.write_videofile(output, codec="libx264", audio_codec="aac", fps=clip.fps, logger=None)
+    clip.close(); final.close()
+    return output
+# ------------------------------------------------------------
+# PIPELINE STABLE (PAS DE YIELD)
+# ------------------------------------------------------------
 def pipeline(video_file, model_name):
     if video_file is None:
+        return "⚠️ Importez une vidéo.", None
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    status = f"🧠 Chargement du modèle {model_name}..."
     try:
         model = load_asr_model(model_name)
+        status += "\n🎶 Extraction audio..."
+        wav = os.path.join(tempfile.gettempdir(), "audio.wav")
+        extract_audio(video_file, wav)
+        status += "\n📝 Transcription..."
+        subs = transcribe(model, device, wav, model_name)
+        if not subs: return "⚠️ Aucun mot détecté.", None
+        status += "\n🎬 Sous-titrage..."
+        out = burn(video_file, subs)
+        if os.path.exists(wav): os.remove(wav)
+        status += "\n✅ Terminé !"
+        return status, out
     except Exception as e:
+        return f"❌ ERREUR : {e}", None
+# ------------------------------------------------------------
+# Interface
+# ------------------------------------------------------------
+with gr.Blocks() as demo:
+    gr.Markdown("# ⚡ ROBOTSMALI V8 — MINIMALIST BLUE")
+    video = gr.Video(label="Importer une vidéo")
+    model = gr.Dropdown(list(MODELS.keys()), value="Soloni V1 (RNnT - Précis)")
+    run = gr.Button("▶️ PRODUIRE")
+    status = gr.Markdown()
+    out = gr.Video()
+    run.click(pipeline, inputs=[video, model], outputs=[status, out])
+demo.launch(share=True)