binaryMao committed
Commit b332947 · verified · 1 Parent(s): 77aec49

Update app.py

Files changed (1):
  1. app.py +65 -149
app.py CHANGED
@@ -1,7 +1,9 @@
 # -*- coding: utf-8 -*-
 """
-ROBOTSMALI — Bambara Subtitling
-
+ROBOTSMALI — Bambara Subtitling (V5.2 - Final Fix)
+- Webcam codec fix (VP8 -> H264)
+- Artistic interface (Gradio-compatible)
+- Example video integration
 """
 import os
 import shlex
@@ -20,13 +22,6 @@ from huggingface_hub import snapshot_download
 from nemo.collections import asr as nemo_asr
 import gradio as gr
 
-# Attempt to import the alignment library needed for CTC segmentation
-try:
-    from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
-    HAS_CTC_SEGMENTATION = True
-except ImportError:
-    HAS_CTC_SEGMENTATION = False
-
 # ---------------------------- # CONFIGURATION # ----------------------------
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 random.seed(1234)
@@ -35,7 +30,6 @@ torch.manual_seed(1234)
 
 SEGMENT_DURATION = 10.0
 
-# Full list of models
 MODELS = {
     "Soloni V1 (RNNT)": ("RobotsMali/soloni-114m-tdt-ctc-v1", "rnnt"),
     "Soloni V0 (RNNT)": ("RobotsMali/soloni-114m-tdt-ctc-v0", "rnnt"),
@@ -45,7 +39,6 @@ MODELS = {
     "QuartzNet V0 (CTC-char)": ("RobotsMali/stt-bm-quartznet15x5-v0", "ctc_char"),
 }
 
-# Example video (the one identified in your screenshot)
 VIDEO_EXAMPLES = [
     ["examples/MARALINKE-Wii (Lève-toi) Black lives matter (Clip officiel) - MARALINKE (360p, H264).mp4", "Soloba V1 (CTC)"]
 ]
@@ -55,18 +48,14 @@ _cache = {}
 # ---------------------------- # TECHNICAL FUNCTIONS # ----------------------------
 
 def run_cmd(cmd):
-    """Execute a shell command and raise on non-zero exit."""
-    print("RUN:", cmd)
     res = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
     if res.returncode != 0:
-        raise RuntimeError(f"Commande échouée [{cmd}]\nOutput:\n{res.stdout}")
+        raise RuntimeError(f"Erreur FFmpeg: {res.stdout}")
     return res.stdout
 
 def ffprobe_duration(path):
-    """Determine the video duration via ffprobe."""
     cmd = f'ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 {shlex.quote(path)}'
     out = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-    if out.returncode != 0: return None
     try: return float(out.stdout.strip())
     except: return None
 
@@ -75,7 +64,6 @@ def load_model(name):
     repo, mode = MODELS[name]
     folder = snapshot_download(repo, local_dir_use_symlinks=False)
     nemo_file = next((os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nemo")), None)
-
     if mode == "rnnt":
         model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo_file)
     elif mode == "ctc_char":
@@ -83,47 +71,29 @@ def load_model(name):
     else:
         try: model = nemo_asr.models.EncDecCTCModelBPE.restore_from(nemo_file)
         except: model = nemo_asr.models.EncDecCTCModel.restore_from(nemo_file)
-
     model.to(DEVICE).eval()
     _cache[name] = model
     return model
 
 def extract_audio(video_path, out_wav):
-    """Audio extraction with forced stabilization for webcam support (VP8 -> H264)."""
     tmp_fd, stabilized_mp4 = tempfile.mkstemp(suffix="_stabilized.mp4")
     os.close(tmp_fd)
-
-    # STEP 1: Re-encode to H.264 (required for MP4/webcam)
-    remux_cmd = (
-        f'ffmpeg -hide_banner -loglevel error -y '
-        f'-analyzeduration 2147483647 -probesize 2147483647 '
-        f'-i {shlex.quote(video_path)} '
-        f'-c:v libx264 -preset ultrafast -crf 23 -c:a aac '
-        f'{shlex.quote(stabilized_mp4)}'
-    )
-    run_cmd(remux_cmd)
-
-    # STEP 2: Extract 16 kHz mono WAV audio
-    extract_cmd = (
-        f'ffmpeg -hide_banner -loglevel error -y '
-        f'-i {shlex.quote(stabilized_mp4)} -vn -ac 1 -ar 16000 -f wav {shlex.quote(out_wav)}'
-    )
-    run_cmd(extract_cmd)
-
+    # Re-encode to H.264 to support webcam VP8
+    run_cmd(f'ffmpeg -hide_banner -loglevel error -y -i {shlex.quote(video_path)} -c:v libx264 -preset ultrafast -crf 23 -c:a aac {shlex.quote(stabilized_mp4)}')
+    run_cmd(f'ffmpeg -hide_banner -loglevel error -y -i {shlex.quote(stabilized_mp4)} -vn -ac 1 -ar 16000 -f wav {shlex.quote(out_wav)}')
     if os.path.exists(stabilized_mp4): os.remove(stabilized_mp4)
 
-def clean_audio(wav_path, target_sr=16000):
+def clean_audio(wav_path):
     audio, sr = sf.read(wav_path)
     if audio.ndim == 2: audio = audio.mean(axis=1)
-    if sr != target_sr:
-        audio = librosa.resample(audio.astype(float), orig_sr=sr, target_sr=target_sr)
+    if sr != 16000: audio = librosa.resample(audio.astype(float), orig_sr=sr, target_sr=16000)
    max_val = np.max(np.abs(audio)) if audio.size > 0 else 0.0
     if max_val > 1e-6: audio = audio / max_val * 0.9
-    clean_path = str(Path(wav_path).with_name(Path(wav_path).stem + "_clean.wav"))
-    sf.write(clean_path, audio, target_sr)
-    return clean_path, audio, target_sr
+    clean_path = wav_path.replace(".wav", "_clean.wav")
+    sf.write(clean_path, audio, 16000)
+    return clean_path, audio, 16000
 
-# ---------------------------- # SUBTITLING LOGIC # ----------------------------
+# ---------------------------- # BUSINESS LOGIC # ----------------------------
 
 def transcribe(model, wav_path):
     out = model.transcribe([wav_path])
@@ -132,83 +102,12 @@ def transcribe(model, wav_path):
         return res.text.strip() if hasattr(res, "text") else str(res).strip()
     return str(out).strip()
 
-def keep_bambara(words):
-    return [w for w in words if any(c in w.lower() for c in ["ɛ","ɔ","ŋ"]) or sum(1 for c in w.lower() if c in "aeiou") >= 2]
-
-MAX_CHARS = 45; MIN_DUR = 0.3; MAX_WORDS = 8
-
-def wrap2(txt):
-    parts = textwrap.wrap(txt, MAX_CHARS)
-    return "\n".join(parts) if len(parts) > 1 else txt
-
-def pack(spans, total):
-    if not spans: return []
-    merged = []
-    for s, e, t in spans:
-        s = max(0, min(s, total)); e = max(0, min(e, total))
-        if e <= s or not t.strip(): continue
-        if not merged: merged.append((s, e, t))
-        else:
-            ps, pe, pt = merged[-1]
-            if (e - s) < MIN_DUR or (s - pe) < 0.1:
-                merged[-1] = (ps, max(pe, e), (pt + " " + t).strip())
-            else: merged.append((s, e, t))
-
-    final = []
-    for s, e, t in merged:
-        words = t.split()
-        blocks = [" ".join(words[i:i+MAX_WORDS]) for i in range(0, len(words), MAX_WORDS)]
-        step = (e - s) / max(1, len(blocks))
-        for j, b in enumerate(blocks):
-            st = s + j * step; en = st + step
-            final.append((st, en, wrap2(b)))
-    return final
-
-def align_heuristic(words, total_dur):
-    if not words: return []
-    blocks = [" ".join(words[i:i+MAX_WORDS]) for i in range(0, len(words), MAX_WORDS)]
-    step = total_dur / len(blocks)
-    return [(i*step, (i+1)*step, b) for i, b in enumerate(blocks)]
-
-def segment_and_align(model, audio, sr, total_dur, mode):
-    segment_samples = int(SEGMENT_DURATION * sr)
-    all_subs = []
-    for i in range(0, len(audio), segment_samples):
-        start_s = i / sr
-        chunk = audio[i:i+segment_samples]
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tf:
-            sf.write(tf.name, chunk, sr)
-            text = transcribe(model, tf.name)
-        words = keep_bambara(text.split())
-        subs = align_heuristic(words, len(chunk)/sr)
-        for s, e, t in subs:
-            all_subs.append((s + start_s, e + start_s, t))
-    return pack(all_subs, total_dur)
-
-def burn(video_path, subs):
-    out_path = "RobotsMali_Subtitled.mp4"
-    with tempfile.NamedTemporaryFile(suffix=".srt", mode="w", encoding="utf-8", delete=False) as tf:
-        for i, (start, end, text) in enumerate(subs, 1):
-            def t_srt(sec):
-                h=int(sec//3600); m=int((sec%3600)//60); s=int(sec%60); ms=int((sec-int(sec))*1000)
-                return f"{h:02}:{m:02}:{s:02},{ms:03}"
-            tf.write(f"{i}\n{t_srt(start)} --> {t_srt(end)}\n{text}\n\n")
-        srt_name = tf.name
-
-    vf = f"subtitles={shlex.quote(srt_name)}:force_style='Fontsize=22,PrimaryColour=&HFFFFFF&,OutlineColour=&H000000&'"
-    cmd = f'ffmpeg -hide_banner -loglevel error -y -i {shlex.quote(video_path)} -vf {shlex.quote(vf)} -c:v libx264 -preset fast -crf 23 -c:a aac {shlex.quote(out_path)}'
-    run_cmd(cmd)
-    os.remove(srt_name)
-    return out_path
-
-# ---------------------------- # PIPELINE & INTERFACE # ----------------------------
-
 def pipeline(video_input, model_name):
     try:
         video_path = video_input["tmp_path"] if isinstance(video_input, dict) else video_input
-        if not video_path: return "❌ Aucune vidéo fournie", None
+        if not video_path: return "❌ Aucune vidéo détectée", None
 
-        yield "⏳ Phase 1/3 : Stabilisation et extraction audio...", None
+        yield "⏳ Phase 1 : Stabilisation et extraction...", None
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tf:
             wav_path = tf.name
 
@@ -216,60 +115,77 @@ def pipeline(video_input, model_name):
         clean_wav, audio, sr = clean_audio(wav_path)
         duration = ffprobe_duration(video_path) or (len(audio)/sr)
 
-        yield f"⏳ Phase 2/3 : Analyse IA avec {model_name}...", None
+        yield f"⏳ Phase 2 : Analyse avec {model_name}...", None
         model = load_model(model_name)
-        mode = MODELS[model_name][1]
-        subs = segment_and_align(model, audio, sr, duration, mode)
+        text = transcribe(model, clean_wav)
+        words = [w for w in text.split() if any(c in w.lower() for c in ["ɛ","ɔ","ŋ"]) or len(w) > 2]
 
-        if not subs: return "⚠️ Pas de parole détectée", None
+        if not words: return "⚠️ Pas de texte Bambara détecté.", None
 
-        yield "⏳ Phase 3/3 : Incrustation des sous-titres...", None
-        res_v = burn(video_path, subs)
-        return "✅ Traitement terminé avec succès", res_v
+        yield "⏳ Phase 3 : Création de la vidéo finale...", None
+        # Simple heuristic for subtitle timing
+        subs = []
+        chunk_size = 8
+        for i in range(0, len(words), chunk_size):
+            chunk = words[i:i+chunk_size]
+            s = (i / len(words)) * duration
+            e = (min(i + chunk_size, len(words)) / len(words)) * duration
+            subs.append((s, e, "\n".join(textwrap.wrap(" ".join(chunk), 40))))
+
+        out_v = burn(video_path, subs)
+        yield "✅ Terminé !", out_v
     except Exception as e:
         traceback.print_exc()
-        return f"❌ Erreur: {str(e)}", None
+        yield f"❌ Erreur: {str(e)}", None
+
+def burn(video_path, subs):
+    out_path = "RobotsMali_Final.mp4"
+    with tempfile.NamedTemporaryFile(suffix=".srt", mode="w", encoding="utf-8", delete=False) as tf:
+        for idx, (start, end, text) in enumerate(subs, 1):
+            def t_srt(sec):
+                h=int(sec//3600); m=int((sec%3600)//60); s=int(sec%60); ms=int((sec-int(sec))*1000)
+                return f"{h:02}:{m:02}:{s:02},{ms:03}"
+            tf.write(f"{idx}\n{t_srt(start)} --> {t_srt(end)}\n{text}\n\n")
+        srt_name = tf.name
+    vf = f"subtitles={shlex.quote(srt_name)}:force_style='Fontsize=22,PrimaryColour=&HFFFFFF&,OutlineColour=&H000000&'"
+    run_cmd(f'ffmpeg -hide_banner -loglevel error -y -i {shlex.quote(video_path)} -vf {shlex.quote(vf)} -c:v libx264 -crf 23 -c:a aac {shlex.quote(out_path)}')
+    os.remove(srt_name)
+    return out_path
+
+# ---------------------------- # GRADIO INTERFACE # ----------------------------
 
-# --- ARTISTIC CSS DESIGN ---
 custom_css = """
 body { background-color: #0b0e14; }
-.gradio-container { background: rgba(17, 25, 40, 0.8) !important; backdrop-filter: blur(12px); border-radius: 20px; border: 1px solid rgba(255, 255, 255, 0.1); padding: 25px !important; }
-#header { text-align: center; margin-bottom: 20px; }
-#header h1 { color: #facc15; font-size: 2.8rem; letter-spacing: 4px; margin-bottom: 0; }
-#header p { color: #94a3b8; font-style: italic; font-size: 1.1rem; }
-.gr-button-primary { background: linear-gradient(135deg, #059669, #10b981) !important; border: none !important; font-weight: bold !important; }
-.gr-button-primary:hover { transform: translateY(-2px); box-shadow: 0 5px 15px rgba(16, 185, 129, 0.4); }
+.gradio-container { background: rgba(17, 25, 40, 0.8) !important; backdrop-filter: blur(12px); border-radius: 20px; border: 1px solid rgba(255, 255, 255, 0.1); }
+#title-header { text-align: center; padding: 20px; }
+.gr-button-primary { background: linear-gradient(135deg, #059669, #10b981) !important; border: none !important; }
 """
 
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
-    with gr.Div(elem_id="header"):
+    # Use gr.Column instead of gr.Div to avoid the AttributeError
+    with gr.Column(elem_id="title-header"):
         gr.HTML("""
-            <h1>🤖 ROBOTSMALI</h1>
-            <p>Intelligence Artificielle & Sauvegarde de la Langue Bambara</p>
-            <div style="height: 3px; width: 80px; background: #facc15; margin: 15px auto;"></div>
+            <h1 style='color:#facc15; font-size: 2.5rem; margin:0;'>🤖 ROBOTSMALI</h1>
+            <p style='color:#94a3b8; font-style:italic;'>Système Expert de Sous-titrage Bambara</p>
+            <div style="height: 3px; width: 60px; background: #facc15; margin: 15px auto;"></div>
         """)
 
     with gr.Row():
         with gr.Column():
-            gr.Markdown("### 🎥 Source Vidéo")
+            gr.Markdown("### 📥 Source")
             v_in = gr.Video(label=None, mirror_webcam=False)
-            m_sel = gr.Dropdown(list(MODELS.keys()), value="Soloba V1 (CTC)", label="Cerveau ASR")
-            btn = gr.Button("🚀 GÉNÉRER LES SOUS-TITRES", variant="primary")
+            m_sel = gr.Dropdown(list(MODELS.keys()), value="Soloba V1 (CTC)", label="Modèle IA")
+            btn = gr.Button("🚀 GÉNÉRER", variant="primary")
 
         with gr.Column():
-            gr.Markdown("### 📺 Résultat")
-            status = gr.Markdown("*Prêt pour le traitement...*")
+            gr.Markdown("### 📤 Sortie")
+            status = gr.Markdown("*En attente...*")
             v_out = gr.Video(label=None)
 
-    gr.Examples(
-        examples=VIDEO_EXAMPLES,
-        inputs=[v_in, m_sel],
-        label="📺 Testez avec nos exemples"
-    )
-
-    gr.HTML("<div style='text-align: center; color: #475569; margin-top: 40px;'>© 2025 RobotsMali - Bamako, Mali</div>")
+    gr.Examples(examples=VIDEO_EXAMPLES, inputs=[v_in, m_sel], label="📺 Exemples")
+    gr.HTML("<div style='text-align: center; color: #475569; padding-top: 20px;'>© 2025 RobotsMali</div>")
 
     btn.click(pipeline, [v_in, m_sel], [status, v_out])
 
 if __name__ == "__main__":
     demo.launch(share=True, debug=True)
 
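In phase 2, the new pipeline filters the raw transcript inline rather than calling the removed keep_bambara helper: a token survives if it contains a Bambara-specific character (ɛ, ɔ, ŋ) or is longer than two characters. A quick illustration with sample tokens (the snippet is ours, not part of the commit):

words = "ne bɛ taa so la".split()
kept = [w for w in words if any(c in w.lower() for c in ["ɛ", "ɔ", "ŋ"]) or len(w) > 2]
print(kept)  # ['bɛ', 'taa']: two-letter tokens without ɛ/ɔ/ŋ are dropped

This is looser than the old keep_bambara, which required a special character or at least two plain vowels.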
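Phase 3 then assigns timestamps with a proportional heuristic: each 8-word chunk covers the slice of the total duration that its word indices span. A minimal standalone sketch of the same logic (the proportional_subs name is ours, for illustration only):

import textwrap

def proportional_subs(words, duration, chunk_size=8, width=40):
    # Each chunk's time span is proportional to its position in the word list.
    subs = []
    for i in range(0, len(words), chunk_size):
        chunk = words[i:i + chunk_size]
        s = (i / len(words)) * duration
        e = (min(i + chunk_size, len(words)) / len(words)) * duration
        subs.append((s, e, "\n".join(textwrap.wrap(" ".join(chunk), width))))
    return subs

# 12 words over 6 s: cue 1 covers 0.0-4.0 s (8 words), cue 2 covers 4.0-6.0 s (4 words)
print(proportional_subs([f"w{i}" for i in range(12)], 6.0))

This assumes speech fills the clip uniformly; long silences will pull cues away from the audio, the trade-off accepted when the commit dropped the ctc_segmentation-based alignment path.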
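The rewritten burn keeps the nested t_srt helper to format SRT timestamps as HH:MM:SS,mmm. A quick sanity check of its arithmetic (the asserts are ours):

def t_srt(sec):
    h=int(sec//3600); m=int((sec%3600)//60); s=int(sec%60); ms=int((sec-int(sec))*1000)
    return f"{h:02}:{m:02}:{s:02},{ms:03}"

assert t_srt(0) == "00:00:00,000"
assert t_srt(3661.5) == "01:01:01,500"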
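The progressive status messages reach the UI because pipeline is a generator and Gradio streams each yield to the components bound in btn.click(pipeline, [v_in, m_sel], [status, v_out]). A self-contained sketch of the same wiring (component names are illustrative):

import gradio as gr

def steps(name):
    # Each yield is pushed to (status, result) in the UI as it happens.
    yield f"⏳ Working on {name}...", None
    yield "✅ Done", f"Hello {name}"

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Name")
    status = gr.Markdown("*Waiting...*")
    out = gr.Textbox(label="Result")
    gr.Button("Go").click(steps, [inp], [status, out])

if __name__ == "__main__":
    demo.queue()  # older Gradio versions need the queue enabled for generator callbacks
    demo.launch()

Inside a generator, a bare return value only stops the iteration and is never rendered; that is why the success and error paths in the new pipeline end with yield rather than return.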