binaryMao committed on
Commit 49bfd29 · verified · 1 Parent(s): bfcc59d

Update app.py

Files changed (1): app.py (+269, -140)
app.py CHANGED
@@ -1,10 +1,20 @@
  # -*- coding: utf-8 -*-
  """
- ROBOTSMALI V38 FINAL BAMBARA SUBTITLING (NETFLIX STYLE)
- V38.1 fix: fixed FFmpeg (no -c copy), exact duration, working QuartzNet
  """

- import os, tempfile, traceback, random, textwrap
  import numpy as np
  import torch
  import soundfile as sf
@@ -12,7 +22,6 @@ import librosa
  from huggingface_hub import snapshot_download
  from nemo.collections import asr as nemo_asr
  import gradio as gr
- from moviepy.editor import VideoFileClip

  # ----------------------------
  # CONFIG
@@ -25,211 +34,331 @@ torch.manual_seed(1234)
  MODELS = {
      "Soloni V1 (RNNT)": ("RobotsMali/soloni-114m-tdt-ctc-v1", "rnnt"),
      "Soloni V0 (RNNT)": ("RobotsMali/soloni-114m-tdt-ctc-v0", "rnnt"),
-     "Soloba V1 (CTC)": ("RobotsMali/soloba-ctc-0.6b-v1", "ctc"),
-     "Soloba V0 (CTC)": ("RobotsMali/soloba-ctc-0.6b-v0", "ctc"),
-     "QuartzNet V1 (CTC-char)": ("RobotsMali/stt-bm-quartznet15x5-v1", "ctc_char"),
-     "QuartzNet V0 (CTC-char)": ("RobotsMali/stt-bm-quartznet15x5-v0", "ctc_char"),
  }

  _cache = {}

  # ----------------------------
- # MODEL LOADING
  # ----------------------------
  def load_model(name):
-     if name in _cache: return _cache[name]
      repo, mode = MODELS[name]
      folder = snapshot_download(repo, local_dir_use_symlinks=False)
      nemo_file = next((os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nemo")), None)
      if not nemo_file:
-         raise FileNotFoundError(f"Aucun .nemo trouvé pour {name}")
-     model = (
-         nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo_file)
-         if mode == "rnnt"
-         else nemo_asr.models.EncDecCTCModelBPE.restore_from(nemo_file)
-     )
      model.to(DEVICE).eval()
      _cache[name] = model
      return model

  # ----------------------------
  # AUDIO EXTRACTION & CLEANING
  # ----------------------------
- def extract_audio(video, wav):
-     os.system(f'ffmpeg -y -i "{video}" -ar 16000 -ac 1 -vn "{wav}"')

- def clean_audio(wav, top_db=35):
-     audio, sr = sf.read(wav)
      if audio.ndim == 2:
-         audio = audio.mean(1)
-     max_val = np.max(np.abs(audio)) if audio.size > 0 else 0
      if max_val > 1e-6:
          audio = audio / max_val * 0.9
-     clean = wav.replace(".wav", "_clean.wav")
-     sf.write(clean, audio, sr)
-     return clean, audio, sr

  # ----------------------------
  # TRANSCRIPTION
  # ----------------------------
- def transcribe(model, wav):
-     out = model.transcribe([wav])
      if isinstance(out, list):
-         if out and hasattr(out[0], "text"):
-             return out[0].text.strip()
-         if out and isinstance(out[0], str):
-             return out[0].strip()
      if hasattr(out, "text"):
          return out.text.strip()
      return str(out).strip()

  # ----------------------------
- # UTILITIES
  # ----------------------------
  def keep_bambara(words):
-     res=[]
      for w in words:
-         wl=w.lower()
-         if any(c in wl for c in ["ɛ","ɔ","ŋ"]) or sum(c in "aeiou" for c in wl)>=2:
              res.append(w)
      return res

- MAX_CHARS=45; MIN_DUR=0.3; MAX_DUR=3.2; MAX_WORDS=8

  def wrap2(txt):
-     parts=textwrap.wrap(txt,MAX_CHARS)
-     if len(parts)<=1: return txt
-     mid=len(txt)//2
-     left=txt.rfind(" ",0,mid)
-     right=txt.find(" ",mid)
-     cut=left if (mid-left)<=(right-mid if right!=-1 else 1e9) else right
-     l1=txt[:cut].strip(); l2=txt[cut:].strip()
-     return l1+"\n"+l2 if l2 else l1
-
- def pack(spans,total):
-     tmp=[]
-     for s,e,t in spans:
-         s=max(0,min(s,total)); e=max(0,min(e,total))
-         if e<=s or not t.strip(): continue
-         tmp.append((s,e,t.strip()))
-     merged=[]
      for seg in tmp:
-         if not merged: merged.append(seg); continue
-         ps,pe,pt=merged[-1]; s,e,t=seg
-         if (e-s)<MIN_DUR or (s-pe)<0.1:
-             merged[-1]=(ps,max(pe,e),(pt+" "+t).strip())
-         else: merged.append(seg)
-     out=[]; last_end=0
-     for s,e,t in merged:
-         dur=e-s; words=t.split()
-         blocks=[" ".join(words[i:i+MAX_WORDS]) for i in range(0,len(words),MAX_WORDS)]
-         step=dur/max(1,len(blocks)); base=s
          for b in blocks:
-             st=base; en=min(base+step,e); base=en
-             if en<=st: en=min(st+0.05,total)
-             txt=wrap2(b)
-             if st<last_end: st=last_end+1e-3; en=max(en,st+0.05)
-             out.append((st,en,txt)); last_end=en
      return out

  # ----------------------------
- # SIMPLE ALIGNMENT (VAD)
  # ----------------------------
- def align_vad(text,audio,sr,total_dur,top_db=28):
-     words=keep_bambara(text.split())
-     total=total_dur
-     iv=librosa.effects.split(audio,top_db=top_db)
-     if len(iv)==0 or not words:
-         return pack([(0,total," ".join(words[:MAX_WORDS]))],total)
-     spans=[]; L=sum(e-s for s,e in iv); idx=0
-     for s,e in iv:
-         seg=e-s; segt=seg/sr; k=max(1,int(round(len(words)*(seg/L))))
-         chunk=words[idx:idx+k]; idx+=k
          if not chunk: continue
-         lines=[chunk[i:i+MAX_WORDS] for i in range(0,len(chunk),MAX_WORDS)]
-         step=max(MIN_DUR,min(MAX_DUR,segt/len(lines))); base=s/sr
-         for j,ln in enumerate(lines):
-             st=base+j*step; en=base+(j+1)*step
-             spans.append((st,en," ".join(ln)))
-     return pack(spans,total)

  # ----------------------------
- # SRT SUBTITLES + FFmpeg
  # ----------------------------
- def burn(video, subs):
-     tmp_srt = tempfile.mktemp(suffix=".srt")
-     out_file = "RobotsMali_Subtitled.mp4"
-
      def sec_to_srt(t):
-         h = int(t // 3600)
-         m = int((t % 3600) // 60)
-         s = int(t % 60)
-         ms = int((t - int(t)) * 1000)
          return f"{h:02}:{m:02}:{s:02},{ms:03}"
-
-     # Write the SRT file
      with open(tmp_srt, "w", encoding="utf-8") as f:
          for i, (start, end, text) in enumerate(subs, 1):
              f.write(f"{i}\n{sec_to_srt(start)} --> {sec_to_srt(end)}\n{text}\n\n")

-     # Burn in with re-encoding (fixed)
-     cmd = (
-         f'ffmpeg -y -i "{video}" '
-         f'-vf "subtitles={tmp_srt}:force_style=\'Fontsize=24,PrimaryColour=&HFFFFFF&,OutlineColour=&H000000&\'" '
-         f'-c:v libx264 -preset ultrafast -crf 23 -c:a copy "{out_file}"'
-     )
-     os.system(cmd)
-
-     if os.path.exists(tmp_srt):
-         os.remove(tmp_srt)
-     return out_file

  # ----------------------------
- # MAIN PIPELINE
  # ----------------------------
- def pipeline(video, model_name):
      try:
-         wav=tempfile.mktemp(suffix=".wav")
-         extract_audio(video,wav)
-         clean,audio,sr=clean_audio(wav)
-         model=load_model(model_name)
-         text=transcribe(model,clean)
-         mode=MODELS[model_name][1]
-         if mode=="rnnt":
-             from ctc_segmentation import ctc_segmentation,CtcSegmentationParameters,prepare_text
-             words=keep_bambara(text.split())
-             if not words: return "⚠️ Aucun sous-titre utilisable",None
-             x=torch.tensor(audio).float().unsqueeze(0).to(DEVICE)
-             ln=torch.tensor([x.shape[1]]).to(DEVICE)
-             with torch.no_grad(): logits=model(input_signal=x,input_signal_length=ln)[0]
-             tps=VideoFileClip(video).duration/logits.shape[1]
-             raw=model.tokenizer.vocab
-             vocab=list(raw.keys()) if isinstance(raw,dict) else list(raw)
-             cfg=CtcSegmentationParameters(); cfg.char_list=vocab
-             gt=prepare_text(cfg,words)[0]
-             timing,_,_=ctc_segmentation(cfg,logits.detach().cpu().numpy()[0],gt)
-             spans=[(timing[i]*tps,timing[i+1]*tps,words[i]) for i in range(len(words))]
-             subs=pack(spans,VideoFileClip(video).duration)
          else:
-             subs=align_vad(text,audio,sr,VideoFileClip(video).duration)
-         if not subs: return "⚠️ Aucun sous-titre utilisable",None
-         out=burn(video,subs)
-         return "✅ Terminé avec succès",out
-     except Exception:
          traceback.print_exc()
-         return "❌ Erreur — voir logs ci-dessus",None

  # ----------------------------
- # GRADIO INTERFACE
  # ----------------------------
- with gr.Blocks(title="RobotsMali V38.1 Final") as demo:
-     gr.Markdown("## ⚡ RobotsMali V38.1 — Sous-titrage Style Netflix (QuartzNet & RNNT stable)")
      v = gr.Video(label="Vidéo à sous-titrer")
      m = gr.Dropdown(list(MODELS.keys()), value="Soloba V1 (CTC)", label="Modèle ASR")
      b = gr.Button("▶️ Générer")
      s = gr.Markdown()
      o = gr.Video(label="Vidéo sous-titrée")
-
      b.click(pipeline, [v, m], [s, o])

- demo.launch(share=True, debug=False)
  # -*- coding: utf-8 -*-
  """
+ ROBOTSMALI V41 - Bambara subtitling (QuartzNet fix + robust RNNT/CTC)
+ - Loading: handles RNNT, CTC-BPE (Soloba) and CTC-char (QuartzNet) correctly
+ - Segmentation: guarded ctc_segmentation -> VAD fallback
+ - Subtitle burn-in: re-encode (libx264) whenever a subtitles filter is applied
  """

+ import os
+ import shlex
+ import subprocess
+ import tempfile
+ import traceback
+ import random
+ import textwrap
+ from pathlib import Path
+
  import numpy as np
  import torch
  import soundfile as sf
  import librosa
  from huggingface_hub import snapshot_download
  from nemo.collections import asr as nemo_asr
  import gradio as gr

  # ----------------------------
  # CONFIG
  ...
  MODELS = {
      "Soloni V1 (RNNT)": ("RobotsMali/soloni-114m-tdt-ctc-v1", "rnnt"),
      "Soloni V0 (RNNT)": ("RobotsMali/soloni-114m-tdt-ctc-v0", "rnnt"),
+     "Soloba V1 (CTC)": ("RobotsMali/soloba-ctc-0.6b-v1", "ctc"),  # BPE
+     "Soloba V0 (CTC)": ("RobotsMali/soloba-ctc-0.6b-v0", "ctc"),  # BPE
+     "QuartzNet V1 (CTC-char)": ("RobotsMali/stt-bm-quartznet15x5-v1", "ctc_char"),  # char
+     "QuartzNet V0 (CTC-char)": ("RobotsMali/stt-bm-quartznet15x5-v0", "ctc_char"),  # char
  }

  _cache = {}

  # ----------------------------
+ # UTIL: run_cmd, ffprobe_duration
+ # ----------------------------
+ def run_cmd(cmd):
+     """Execute a shell command and raise on non-zero exit."""
+     print("RUN:", cmd)
+     res = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+     if res.returncode != 0:
+         raise RuntimeError(f"Commande échouée [{cmd}]\nOutput:\n{res.stdout}")
+     return res.stdout
+
+ def ffprobe_duration(path):
+     cmd = f'ffprobe -v error -select_streams v:0 -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 {shlex.quote(path)}'
+     out = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+     if out.returncode != 0:
+         print("ffprobe erreur:", out.stderr)
+         return None
+     try:
+         return float(out.stdout.strip())
+     except Exception:
+         return None
+
+ # ----------------------------
+ # LOAD MODEL (robust)
  # ----------------------------
  def load_model(name):
+     """Load the right NeMo model class for the given type (rnnt / ctc / ctc_char)."""
+     if name in _cache:
+         return _cache[name]
+
      repo, mode = MODELS[name]
+     print(f"[LOAD] snapshot_download {repo} ...")
      folder = snapshot_download(repo, local_dir_use_symlinks=False)
      nemo_file = next((os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nemo")), None)
      if not nemo_file:
+         raise FileNotFoundError(f"Aucun .nemo trouvé pour {name} dans {folder}")
+
+     print(f"[LOAD] .nemo trouvé: {nemo_file}; mode={mode}")
+
+     # Select the NeMo class according to the mode
+     if mode == "rnnt":
+         model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo_file)
+     elif mode == "ctc_char":
+         # QuartzNet (char): no BPE tokenizer in cfg -> use EncDecCTCModel
+         model = nemo_asr.models.EncDecCTCModel.restore_from(nemo_file)
+     else:  # mode == "ctc" (BPE)
+         try:
+             model = nemo_asr.models.EncDecCTCModelBPE.restore_from(nemo_file)
+         except Exception as e:
+             # cautious fallback to EncDecCTCModel if the BPE restore fails
+             print(f"[WARN] EncDecCTCModelBPE failed ({e}), fallback EncDecCTCModel")
+             model = nemo_asr.models.EncDecCTCModel.restore_from(nemo_file)
+
      model.to(DEVICE).eval()
      _cache[name] = model
+     print(f"[OK] Modèle {name} chargé sur {DEVICE}")
      return model

  # ----------------------------
  # AUDIO EXTRACTION & CLEANING
  # ----------------------------
+ def extract_audio(video_path, out_wav):
+     """Extract mono 16 kHz WAV using ffmpeg."""
+     cmd = f'ffmpeg -hide_banner -loglevel error -y -i {shlex.quote(video_path)} -vn -ac 1 -ar 16000 -f wav {shlex.quote(out_wav)}'
+     run_cmd(cmd)

+ def clean_audio(wav_path, target_sr=16000):
+     """Load audio, ensure mono, resample to target_sr, normalize, write cleaned wav."""
+     audio, sr = sf.read(wav_path)
      if audio.ndim == 2:
+         audio = audio.mean(axis=1)
+     if sr != target_sr:
+         audio = librosa.resample(audio.astype(float), orig_sr=sr, target_sr=target_sr)
+         sr = target_sr
+     max_val = np.max(np.abs(audio)) if audio.size > 0 else 0.0
      if max_val > 1e-6:
          audio = audio / max_val * 0.9
+     clean_path = str(Path(wav_path).with_name(Path(wav_path).stem + "_clean.wav"))
+     sf.write(clean_path, audio, sr)
+     return clean_path, audio, sr

  # ----------------------------
  # TRANSCRIPTION
  # ----------------------------
+ def transcribe(model, wav_path):
+     """Robust: call model.transcribe and normalize the output."""
+     if not hasattr(model, "transcribe"):
+         raise RuntimeError("Le modèle ne supporte pas model.transcribe()")
+     out = model.transcribe([wav_path])
+     # The output can take several shapes
      if isinstance(out, list):
+         if len(out) == 0:
+             return ""
+         first = out[0]
+         if isinstance(first, str):
+             return first.strip()
+         if hasattr(first, "text"):
+             return first.text.strip()
+         return str(first).strip()
      if hasattr(out, "text"):
          return out.text.strip()
      return str(out).strip()

  # ----------------------------
+ # Subtitle / packing utilities
  # ----------------------------
  def keep_bambara(words):
+     res = []
      for w in words:
+         wl = w.lower()
+         if any(c in wl for c in ["ɛ","ɔ","ŋ"]) or sum(1 for c in wl if c in "aeiou") >= 2:
              res.append(w)
      return res

+ MAX_CHARS = 45; MIN_DUR = 0.3; MAX_DUR = 3.2; MAX_WORDS = 8

  def wrap2(txt):
+     parts = textwrap.wrap(txt, MAX_CHARS)
+     if len(parts) <= 1:
+         return txt
+     mid = len(txt) // 2
+     left = txt.rfind(" ", 0, mid)
+     right = txt.find(" ", mid)
+     cut = left if (mid - left) <= ((right - mid) if right != -1 else 1e9) else right
+     l1 = txt[:cut].strip(); l2 = txt[cut:].strip()
+     return l1 + "\n" + l2 if l2 else l1
+
+ def pack(spans, total):
+     tmp = []
+     for s, e, t in spans:
+         s = max(0, min(s, total)); e = max(0, min(e, total))
+         if e <= s or not t.strip(): continue
+         tmp.append((s, e, t.strip()))
+     merged = []
      for seg in tmp:
+         if not merged:
+             merged.append(seg); continue
+         ps, pe, pt = merged[-1]; s, e, t = seg
+         if (e - s) < MIN_DUR or (s - pe) < 0.1:
+             merged[-1] = (ps, max(pe, e), (pt + " " + t).strip())
+         else:
+             merged.append(seg)
+     out = []; last_end = 0
+     for s, e, t in merged:
+         dur = e - s; words = t.split()
+         blocks = [" ".join(words[i:i+MAX_WORDS]) for i in range(0, len(words), MAX_WORDS)]
+         step = dur / max(1, len(blocks))
+         base = s
          for b in blocks:
+             st = base; en = min(base + step, e); base = en
+             if en <= st: en = min(st + 0.05, total)
+             txt = wrap2(b)
+             if st < last_end:
+                 st = last_end + 1e-3; en = max(en, st + 0.05)
+             out.append((st, en, txt)); last_end = en
      return out

  # ----------------------------
+ # VAD ALIGN (fallback alignment)
  # ----------------------------
+ def align_vad(text, audio, sr, total_dur, top_db=28):
+     words = keep_bambara(text.split())
+     total = total_dur
+     if audio is None or len(audio) == 0 or not words:
+         return pack([(0, total, " ".join(words[:MAX_WORDS]))], total)
+     iv = librosa.effects.split(audio, top_db=top_db)
+     if len(iv) == 0:
+         return pack([(0, total, " ".join(words[:MAX_WORDS]))], total)
+     spans = []
+     L = sum(e - s for s, e in iv)
+     idx = 0
+     for s, e in iv:
+         seg = e - s; segt = seg / sr
+         k = max(1, int(round(len(words) * (seg / L))))
+         chunk = words[idx:idx+k]; idx += k
          if not chunk: continue
+         lines = [chunk[i:i+MAX_WORDS] for i in range(0, len(chunk), MAX_WORDS)]
+         step = max(MIN_DUR, min(MAX_DUR, segt / max(1, len(lines))))
+         base = s / sr
+         for j, ln in enumerate(lines):
+             st = base + j * step; en = base + (j + 1) * step
+             spans.append((st, en, " ".join(ln)))
+     return pack(spans, total)

  # ----------------------------
+ # SRT writing + burn-in (re-encode)
  # ----------------------------
+ def burn(video_path, subs, output_path=None):
+     if output_path is None:
+         output_path = "RobotsMali_Subtitled.mp4"
+     tmp_fd, tmp_srt = tempfile.mkstemp(suffix=".srt")
+     os.close(tmp_fd)
      def sec_to_srt(t):
+         h = int(t // 3600); m = int((t % 3600) // 60); s = int(t % 60); ms = int((t - int(t)) * 1000)
          return f"{h:02}:{m:02}:{s:02},{ms:03}"
      with open(tmp_srt, "w", encoding="utf-8") as f:
          for i, (start, end, text) in enumerate(subs, 1):
              f.write(f"{i}\n{sec_to_srt(start)} --> {sec_to_srt(end)}\n{text}\n\n")

+     # Re-encode (libx264) because a subtitles filter is applied
+     vf = f"subtitles={shlex.quote(tmp_srt)}:force_style='Fontsize=22,PrimaryColour=&HFFFFFF&,OutlineColour=&H000000&'"
+     cmd = f'ffmpeg -hide_banner -loglevel error -y -i {shlex.quote(video_path)} -vf {shlex.quote(vf)} -c:v libx264 -preset fast -crf 23 -c:a aac -b:a 192k {shlex.quote(output_path)}'
+     try:
+         run_cmd(cmd)
+     finally:
+         if os.path.exists(tmp_srt):
+             os.remove(tmp_srt)
+     return output_path

  # ----------------------------
+ # MAIN PIPELINE (V41)
  # ----------------------------
+ def pipeline(video_input, model_name):
+     """
+     video_input : path or Gradio dict (tmp_path)
+     model_name  : key into MODELS
+     """
      try:
+         # support Gradio dict (tmp_path)
+         if isinstance(video_input, dict) and "tmp_path" in video_input:
+             video_path = video_input["tmp_path"]
          else:
+             video_path = video_input
+
+         duration = ffprobe_duration(video_path)
+         if duration is None:
+             raise RuntimeError("Impossible d'obtenir la durée de la vidéo via ffprobe")
+
+         # temporary files
+         tmp_fd, tmp_wav = tempfile.mkstemp(suffix=".wav")
+         os.close(tmp_fd)
+
+         # extraction + cleaning
+         extract_audio(video_path, tmp_wav)
+         clean_wav, audio, sr = clean_audio(tmp_wav)
+
+         # load the model and transcribe
+         model = load_model(model_name)
+         text = transcribe(model, clean_wav)
+         mode = MODELS[model_name][1]
+
+         # segmentation / alignment
+         subs = None
+         if mode == "rnnt":
+             # RNNT: try segmentation via logits + ctc_segmentation when available
+             try:
+                 from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
+                 words = keep_bambara(text.split())
+                 if not words:
+                     return ("⚠️ Aucun sous-titre utilisable (texte vide après filtrage)", None)
+                 x = torch.tensor(audio).float().unsqueeze(0).to(DEVICE)
+                 ln = torch.tensor([x.shape[1]]).to(DEVICE)
+                 with torch.no_grad():
+                     logits = model(input_signal=x, input_signal_length=ln)[0]
+                 # heuristic frames -> seconds mapping
+                 time_per_frame = duration / max(1, logits.shape[1])
+                 # build the char list
+                 try:
+                     raw = model.tokenizer.vocab
+                     vocab = list(raw.keys()) if isinstance(raw, dict) else list(raw)
+                 except Exception:
+                     vocab = None
+                 cfg = CtcSegmentationParameters()
+                 if vocab:
+                     cfg.char_list = vocab
+                 gt = prepare_text(cfg, words)[0]
+                 try:
+                     timing, _, _ = ctc_segmentation(cfg, logits.detach().cpu().numpy()[0], gt)
+                     spans = [(timing[i] * time_per_frame, timing[i+1] * time_per_frame, words[i]) for i in range(len(words) - 1)]
+                     subs = pack(spans, duration)
+                 except AssertionError:
+                     print("[WARN] Audio shorter than text -> fallback to VAD alignment")
+                     subs = align_vad(text, audio, sr, duration)
+             except Exception as e:
+                 print(f"[WARN] ctc_segmentation not available or failed ({e}) -> fallback VAD")
+                 subs = align_vad(text, audio, sr, duration)
+
+         elif mode == "ctc_char":
+             # QuartzNet: no BPE tokenizer, so align with VAD (the model rarely exposes timestamps)
+             subs = align_vad(text, audio, sr, duration)
+
+         else:  # ctc (BPE)
+             # For CTC-BPE models, VAD alignment remains a reasonable option when no segmentation is available
+             subs = align_vad(text, audio, sr, duration)
+
+         if not subs:
+             return ("⚠️ Aucun sous-titre utilisable (sub list vide)", None)
+
+         out_video = burn(video_path, subs)
+         return ("✅ Terminé avec succès", out_video)
+
+     except Exception as e:
          traceback.print_exc()
+         return (f"❌ Erreur — {str(e)}", None)

  # ----------------------------
+ # GRADIO INTERFACE (optional)
  # ----------------------------
+ with gr.Blocks(title="RobotsMali V41 - Sous-titrage") as demo:
+     gr.Markdown("## ⚡ RobotsMali V41 — Sous-titrage (QuartzNet fix + RNNT/CTC robust)")
      v = gr.Video(label="Vidéo à sous-titrer")
      m = gr.Dropdown(list(MODELS.keys()), value="Soloba V1 (CTC)", label="Modèle ASR")
      b = gr.Button("▶️ Générer")
      s = gr.Markdown()
      o = gr.Video(label="Vidéo sous-titrée")
      b.click(pipeline, [v, m], [s, o])

+ # To run the interface without debug output:
+ # demo.launch(share=True, debug=False)
+ demo.launch(share=True, debug=True)
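
Usage note: a minimal sketch of driving the new pipeline headlessly, outside the Gradio UI. The file name sample.mp4 is illustrative and not part of the commit; also note that app.py as committed calls demo.launch() at module level, so importing it launches the UI unless that line is disabled first.

# Hypothetical smoke test (not part of the commit).
# Assumes ffmpeg/ffprobe on PATH, NeMo and the models downloadable,
# a local "sample.mp4", and the module-level demo.launch(...) commented out.
from app import pipeline

status, out_video = pipeline("sample.mp4", "Soloba V1 (CTC)")
print(status)  # "✅ Terminé avec succès" on success, "❌ Erreur — ..." otherwise
if out_video:
    print("Subtitled video:", out_video)  # RobotsMali_Subtitled.mp4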