Spaces:

RobotsMali
/

RobotsMali_Video_captionning

Running

App Files Files Community

binaryMao committed on Oct 30, 2025

Commit

27e6201

verified ·

1 Parent(s): 1a890b5

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -34

app.py CHANGED Viewed

@@ -15,10 +15,12 @@ import soundfile as sf
 from moviepy.editor import VideoFileClip, CompositeVideoClip, TextClip
 from nemo.collections import asr as nemo_asr
 # ---------------- CONFIG ---------------- #
 SR = 16000
-MAX_VIDEO_BYTES = 200_000_000
 ASR_MODELS = {
     "Soloba CTC 0.6B V0": "RobotsMali/soloba-ctc-0.6b-v0",
@@ -31,6 +33,7 @@ ASR_MODELS = {
 _CACHE = {}
 # ---------------- LOAD MODEL ---------------- #
 def load_model(name):
@@ -43,55 +46,73 @@ def load_model(name):
     _CACHE[name] = (model, device)
     return model, device
-# ---------------- EXTRACT AUDIO (FORCE MONO) ---------------- #
 def extract_audio(video_path, wav_path):
     if os.path.getsize(video_path) > MAX_VIDEO_BYTES:
         raise RuntimeError("⚠️ Vidéo trop lourde (>200MB). Compressez puis réessayez.")
-    # Force audio mono + 16k (100% fiable)
     os.system(f"ffmpeg -y -i '{video_path}' -ac 1 -ar {SR} -vn '{wav_path}' >/dev/null 2>&1")
     audio, sr = sf.read(wav_path)
     if sr == 0 or len(audio) == 0:
         raise RuntimeError("⚠️ Audio introuvable ou illisible.")
     return len(audio) / sr
-# ---------------- TRANSCRIBE ---------------- #
 def transcribe(model, device, wav_path, model_key):
     audio, sr = sf.read(wav_path)
-    # Force mono propre + normalisation
     if audio.ndim == 2:
         audio = np.mean(audio, axis=1).astype(np.float32)
     if np.max(np.abs(audio)) > 1:
         audio = audio / np.max(np.abs(audio))
-    total_s = len(audio)/sr if sr else 0
     x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
     ln = torch.tensor([x.shape[1]]).to(device)
-    # ---- Soloni : timestamps réels ---- #
     if "Soloni" in model_key and hasattr(model, "decode_and_align"):
-        with torch.no_grad():
-            proc, plen = model.preprocessor(
-                input_signal=x,
-                input_signal_length=ln
-            )
-            hyps = model.decode_and_align(
-                encoder_output=proc,
-                encoded_lengths=plen
-            )
-        hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
-        if hasattr(hyp, "words") and hyp.words:
-            return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
-    # ---- Soloba & QuartzNet fallback alignement fluide ---- #
-    text = model.transcribe([wav_path])[0]
     words = text.split()
-    if not words or total_s <= 0:
         return []
     wps = max(2.0, len(words) / total_s)
@@ -104,18 +125,19 @@ def transcribe(model, device, wav_path, model_key):
             break
     return subs
 # ---------------- BURN SUBTITLES ---------------- #
 def burn(video_path, subs):
-    clip = None
-    final = None
     try:
         clip = VideoFileClip(video_path)
         W, H = clip.size
-        layers = []
         for s, e, w in subs:
-            if e <= s: continue
             txt = TextClip(
                 w.upper(),
                 fontsize=int(H/20),
@@ -138,6 +160,7 @@ def burn(video_path, subs):
         try: clip.close()
         except: pass
 # ---------------- PIPELINE ---------------- #
 def pipeline(video, model_name, progress=gr.Progress()):
@@ -150,7 +173,7 @@ def pipeline(video, model_name, progress=gr.Progress()):
         progress(0.5, "Extraction audio…")
         duration = extract_audio(video, wav)
-        progress(0.75, "Transcription…")
         subs = transcribe(model, device, wav, model_name)
         if not subs:
             return "⚠️ Aucun mot détecté.", None
@@ -161,19 +184,20 @@ def pipeline(video, model_name, progress=gr.Progress()):
     progress(1.0, "✅ Terminé")
     return f"✅ Sous-titrage terminé avec **{model_name}**", out
 # ---------------- UI ---------------- #
 CSS = """
-body { background:#F7FAFF; font-family:Inter, sans-serif; }
-h1 { text-align:center; font-weight:800; color:#005BFF; }
 .gr-button { background:#005BFF !important; color:white !important; border-radius:8px; font-weight:700; }
 """
-with gr.Blocks(css=CSS, title="RobotsMali Caption Studio") as demo:
-    gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Transcription & Sous-titres Automatiques en Bambara</p>")
     video = gr.File(label="🎥 Importer une vidéo (max 200MB)", type="filepath")
     model = gr.Dropdown(list(ASR_MODELS.keys()), value="Soloni 114M TDT CTC V1", label="🧠 Modèle ASR")
-    run = gr.Button("🚀 Générer")
     status = gr.Markdown()
     output = gr.Video()

 from moviepy.editor import VideoFileClip, CompositeVideoClip, TextClip
 from nemo.collections import asr as nemo_asr
 # ---------------- CONFIG ---------------- #
 SR = 16000
+MAX_VIDEO_BYTES = 200_000_000  # 200MB limite
+TITLE = "RobotsMali Caption Studio — Sous-titrage Automatique en Bambara"
 ASR_MODELS = {
     "Soloba CTC 0.6B V0": "RobotsMali/soloba-ctc-0.6b-v0",
 _CACHE = {}
 # ---------------- LOAD MODEL ---------------- #
 def load_model(name):
     _CACHE[name] = (model, device)
     return model, device
+# ---------------- AUDIO EXTRACTION (FORCE MONO) ---------------- #
 def extract_audio(video_path, wav_path):
     """Extract the audio track of a video into a mono WAV file sampled at SR.

     Args:
         video_path: Path of the input video file.
         wav_path: Path of the WAV file to write (overwritten if it exists).

     Returns:
         Duration of the extracted audio, in seconds.

     Raises:
         RuntimeError: If the video is larger than MAX_VIDEO_BYTES, if ffmpeg
             cannot be run or exits with an error, or if the resulting audio
             is empty.
     """
     # Function-scope import so this block does not depend on the file's
     # import section being edited.
     import subprocess

     if os.path.getsize(video_path) > MAX_VIDEO_BYTES:
         raise RuntimeError("⚠️ Vidéo trop lourde (>200MB). Compressez puis réessayez.")

     # Pass an argument list and no shell: the video path comes from a user
     # upload, so quotes or shell metacharacters in the file name must never
     # be interpreted as part of a command line.
     cmd = [
         "ffmpeg", "-y",
         "-i", video_path,
         "-ac", "1",        # force mono
         "-ar", str(SR),    # resample to the rate expected by the ASR models
         "-vn",             # drop the video stream
         wav_path,
     ]
     try:
         result = subprocess.run(
             cmd,
             stdout=subprocess.DEVNULL,
             stderr=subprocess.DEVNULL,
             check=False,
         )
     except OSError as err:
         # ffmpeg binary missing or not executable.
         raise RuntimeError("⚠️ Audio introuvable ou illisible.") from err

     # Without this check a failed conversion could go unnoticed and a stale
     # WAV left at wav_path by a previous run would be read instead.
     if result.returncode != 0:
         raise RuntimeError("⚠️ Audio introuvable ou illisible.")

     audio, sr = sf.read(wav_path)
     if sr == 0 or len(audio) == 0:
         raise RuntimeError("⚠️ Audio introuvable ou illisible.")
     return len(audio) / sr
+# ---------------- TRANSCRIBE (UNIFIÉ + SÛR) ---------------- #
 def transcribe(model, device, wav_path, model_key):
     audio, sr = sf.read(wav_path)
     if audio.ndim == 2:
         audio = np.mean(audio, axis=1).astype(np.float32)
     if np.max(np.abs(audio)) > 1:
         audio = audio / np.max(np.abs(audio))
+    total_s = len(audio) / sr if sr else 0
+    if total_s <= 0:
+        return []
     x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
     ln = torch.tensor([x.shape[1]]).to(device)
+    # ---- Priority 1: Soloni precise timestamps ---- #
     if "Soloni" in model_key and hasattr(model, "decode_and_align"):
+        try:
+            with torch.no_grad():
+                proc, plen = model.preprocessor(
+                    input_signal=x,
+                    input_signal_length=ln
+                )
+                hyps = model.decode_and_align(
+                    encoder_output=proc,
+                    encoded_lengths=plen
+                )
+            hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
+            if hasattr(hyp, "words") and hyp.words:
+                return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
+        except:
+            pass  # fallback auto
+    # ---- Priority 2: Universal fallback ---- #
+    out = model.transcribe([wav_path])[0]
+    if hasattr(out, "text"):
+        text = out.text.strip()
+    else:
+        text = str(out).strip()
+    if not text:
+        return []
     words = text.split()
+    if not words:
         return []
     wps = max(2.0, len(words) / total_s)
             break
     return subs
 # ---------------- BURN SUBTITLES ---------------- #
 def burn(video_path, subs):
+    clip, final = None, None
     try:
         clip = VideoFileClip(video_path)
         W, H = clip.size
+        layers = []
         for s, e, w in subs:
+            if e <= s:
+                continue
             txt = TextClip(
                 w.upper(),
                 fontsize=int(H/20),
         try: clip.close()
         except: pass
 # ---------------- PIPELINE ---------------- #
 def pipeline(video, model_name, progress=gr.Progress()):
         progress(0.5, "Extraction audio…")
         duration = extract_audio(video, wav)
+        progress(0.75, "Transcription en Bambara…")
         subs = transcribe(model, device, wav, model_name)
         if not subs:
             return "⚠️ Aucun mot détecté.", None
     progress(1.0, "✅ Terminé")
     return f"✅ Sous-titrage terminé avec **{model_name}**", out
 # ---------------- UI ---------------- #
 CSS = """
+body { background:#F5F8FF; font-family:Inter, sans-serif; }
+h1 { text-align:center; font-weight:800; color:#005BFF; margin-bottom:6px; }
 .gr-button { background:#005BFF !important; color:white !important; border-radius:8px; font-weight:700; }
 """
+with gr.Blocks(css=CSS, title=TITLE) as demo:
+    gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Génération automatique de sous-titres en Bambara</p>")
     video = gr.File(label="🎥 Importer une vidéo (max 200MB)", type="filepath")
     model = gr.Dropdown(list(ASR_MODELS.keys()), value="Soloni 114M TDT CTC V1", label="🧠 Modèle ASR")
+    run = gr.Button("🚀 Générer les sous-titres")
     status = gr.Markdown()
     output = gr.Video()