Spaces:

RobotsMali
/

RobotsMali_Video_captionning

Running

App Files Files Community

binaryMao committed on Dec 16, 2025

Commit

77aec49

verified ·

1 Parent(s): e685733

Update app.py

Browse files

Files changed (1) hide show

app.py +156 -76

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """
-ROBOTSMALI — Sous-titrage Bambara (V5.0 - Intégration Exemples & Design)
-Compatible: Webcam, Fichiers locaux et Exemples Hugging Face
 """
 import os
 import shlex
@@ -20,37 +20,53 @@ from huggingface_hub import snapshot_download
 from nemo.collections import asr as nemo_asr
 import gradio as gr
-# ---------------------------- # CONFIG & MODÈLES # ----------------------------
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 random.seed(1234)
 np.random.seed(1234)
 torch.manual_seed(1234)
 SEGMENT_DURATION = 10.0
 MODELS = {
     "Soloni V1 (RNNT)":        ("RobotsMali/soloni-114m-tdt-ctc-v1", "rnnt"),
     "Soloba V1 (CTC)":         ("RobotsMali/soloba-ctc-0.6b-v1", "ctc"),
     "QuartzNet V1 (CTC-char)": ("RobotsMali/stt-bm-quartznet15x5-v1", "ctc_char"),
 }
-# Liste des exemples basée sur votre capture d'écran Hugging Face
 VIDEO_EXAMPLES = [
     ["examples/MARALINKE-Wii (Lève-toi) Black lives matter (Clip officiel) - MARALINKE (360p, H264).mp4", "Soloba V1 (CTC)"]
 ]
 _cache = {}
-# ---------------------------- # LOGIQUE TECHNIQUE # ----------------------------
 def run_cmd(cmd):
     res = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
     if res.returncode != 0:
-        raise RuntimeError(f"Erreur FFmpeg: {res.stdout}")
     return res.stdout
 def ffprobe_duration(path):
     cmd = f'ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 {shlex.quote(path)}'
     out = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
     try: return float(out.stdout.strip())
     except: return None
@@ -59,35 +75,55 @@ def load_model(name):
     repo, mode = MODELS[name]
     folder = snapshot_download(repo, local_dir_use_symlinks=False)
     nemo_file = next((os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nemo")), None)
     if mode == "rnnt":
         model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo_file)
     else:
         try: model = nemo_asr.models.EncDecCTCModelBPE.restore_from(nemo_file)
         except: model = nemo_asr.models.EncDecCTCModel.restore_from(nemo_file)
     model.to(DEVICE).eval()
     _cache[name] = model
     return model
 def extract_audio(video_path, out_wav):
     tmp_fd, stabilized_mp4 = tempfile.mkstemp(suffix="_stabilized.mp4")
     os.close(tmp_fd)
-    # Re-encodage H.264 pour garantir la compatibilité (indispensable pour les sorties webcam)
-    run_cmd(f'ffmpeg -hide_banner -loglevel error -y -i {shlex.quote(video_path)} -c:v libx264 -preset ultrafast -crf 23 -c:a aac {shlex.quote(stabilized_mp4)}')
-    run_cmd(f'ffmpeg -hide_banner -loglevel error -y -i {shlex.quote(stabilized_mp4)} -vn -ac 1 -ar 16000 -f wav {shlex.quote(out_wav)}')
     if os.path.exists(stabilized_mp4): os.remove(stabilized_mp4)
-def clean_audio(wav_path):
     audio, sr = sf.read(wav_path)
     if audio.ndim == 2: audio = audio.mean(axis=1)
-    if sr != 16000:
-        audio = librosa.resample(audio.astype(float), orig_sr=sr, target_sr=16000)
     max_val = np.max(np.abs(audio)) if audio.size > 0 else 0.0
     if max_val > 1e-6: audio = audio / max_val * 0.9
-    clean_path = wav_path.replace(".wav", "_clean.wav")
-    sf.write(clean_path, audio, 16000)
-    return clean_path, audio, 16000
-# ---------------------------- # TRANSCRIPTION & SOUS-TITRES # ----------------------------
 def transcribe(model, wav_path):
     out = model.transcribe([wav_path])
@@ -96,14 +132,83 @@ def transcribe(model, wav_path):
         return res.text.strip() if hasattr(res, "text") else str(res).strip()
     return str(out).strip()
 def pipeline(video_input, model_name):
     try:
-        if not video_input: return "❌ Veuillez charger une vidéo", None
-        video_path = video_input
-        # Statut initial
-        yield "⏳ Extraction de l'audio et stabilisation...", None
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tf:
             wav_path = tf.name
@@ -111,85 +216,60 @@ def pipeline(video_input, model_name):
         clean_wav, audio, sr = clean_audio(wav_path)
         duration = ffprobe_duration(video_path) or (len(audio)/sr)
-        yield f"⏳ Chargement du modèle {model_name}...", None
         model = load_model(model_name)
-        yield "⏳ Transcription et alignement en cours...", None
-        # (Logique simplifiée pour l'exemple)
-        text = transcribe(model, clean_wav)
-        words = [w for w in text.split() if len(w) > 1] # Filtre basique
-        if not words:
-            yield "⚠️ Aucun discours détecté en Bambara.", None
-            return
-        # Création des segments (Heuristique)
-        total_words = len(words)
-        chunk_size = 8
-        subs = []
-        for i in range(0, total_words, chunk_size):
-            chunk = words[i:i+chunk_size]
-            s = (i / total_words) * duration
-            e = (min(i + chunk_size, total_words) / total_words) * duration
-            txt = "\n".join(textwrap.wrap(" ".join(chunk), 40))
-            subs.append((s, e, txt))
-        yield "⏳ Incrustation des sous-titres dans la vidéo...", None
-        # Burn subtitles
-        out_v = "RobotsMali_Final.mp4"
-        with tempfile.NamedTemporaryFile(suffix=".srt", mode="w", encoding="utf-8", delete=False) as srt_f:
-            for idx, (start, end, text) in enumerate(subs, 1):
-                def t(sec):
-                    h=int(sec//3600); m=int((sec%3600)//60); s=int(sec%60); ms=int((sec-int(sec))*1000)
-                    return f"{h:02}:{m:02}:{s:02},{ms:03}"
-                srt_f.write(f"{idx}\n{t(start)} --> {t(end)}\n{text}\n\n")
-            srt_name = srt_f.name
-        vf = f"subtitles={shlex.quote(srt_name)}:force_style='Fontsize=22,PrimaryColour=&HFFFFFF&,OutlineColour=&H000000&'"
-        run_cmd(f'ffmpeg -hide_banner -loglevel error -y -i {shlex.quote(video_path)} -vf {shlex.quote(vf)} -c:v libx264 -crf 23 -c:a aac {shlex.quote(out_v)}')
-        os.remove(srt_name)
-        yield "✅ Sous-titrage terminé !", out_v
     except Exception as e:
-        yield f"❌ Erreur: {str(e)}", None
-# ---------------------------- # INTERFACE GRADIO STYLISÉE # ----------------------------
 custom_css = """
 body { background-color: #0b0e14; }
-.gradio-container { background: rgba(17, 25, 40, 0.8) !important; backdrop-filter: blur(12px); border-radius: 20px; border: 1px solid rgba(255, 255, 255, 0.1); }
-#header { text-align: center; padding: 20px; }
-#header h1 { color: #facc15; font-size: 2.5rem; margin-bottom: 0; }
-.gr-button-primary { background: linear-gradient(135deg, #059669, #10b981) !important; border: none !important; }
 """
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
     with gr.Div(elem_id="header"):
-        gr.HTML("<h1>🤖 ROBOTSMALI</h1><p style='color:#94a3b8'>Sous-titrage Automatique en Bambara (V5.0)</p>")
-        gr.HTML("<div style='height:2px; width:100px; background:#facc15; margin:10px auto;'></div>")
     with gr.Row():
         with gr.Column():
-            v_in = gr.Video(label="Vidéo (Webcam ou Fichier)", mirror_webcam=False)
-            m_sel = gr.Dropdown(list(MODELS.keys()), value="Soloba V1 (CTC)", label="Modèle ASR")
             btn = gr.Button("🚀 GÉNÉRER LES SOUS-TITRES", variant="primary")
         with gr.Column():
-            status = gr.Markdown("### État du traitement\n*Prêt...*")
-            v_out = gr.Video(label="Résultat final")
-    # Section des exemples (Intégration de votre fichier MARALINKE)
     gr.Examples(
         examples=VIDEO_EXAMPLES,
         inputs=[v_in, m_sel],
-        label="📺 Vidéos d'exemple (Hugging Face)"
     )
-    gr.HTML("<div style='text-align:center; color:#475569; padding:20px'>© 2024 RobotsMali - Intelligence Artificielle pour le Mali</div>")
     btn.click(pipeline, [v_in, m_sel], [status, v_out])
 if __name__ == "__main__":
-    demo.launch()

 # -*- coding: utf-8 -*-
 """
+ROBOTSMALI — Sous-titrage Bambara
 """
 import os
 import shlex
 from nemo.collections import asr as nemo_asr
 import gradio as gr
+# Tente l'importation de la librairie d'alignement nécessaire
+try:
+    from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
+    HAS_CTC_SEGMENTATION = True
+except ImportError:
+    HAS_CTC_SEGMENTATION = False
+# ---------------------------- # CONFIGURATION # ----------------------------
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 random.seed(1234)
 np.random.seed(1234)
 torch.manual_seed(1234)
 SEGMENT_DURATION = 10.0
+# Liste complète des modèles
 MODELS = {
     "Soloni V1 (RNNT)":        ("RobotsMali/soloni-114m-tdt-ctc-v1", "rnnt"),
+    "Soloni V0 (RNNT)":        ("RobotsMali/soloni-114m-tdt-ctc-v0", "rnnt"),
     "Soloba V1 (CTC)":         ("RobotsMali/soloba-ctc-0.6b-v1", "ctc"),
+    "Soloba V0 (CTC)":         ("RobotsMali/soloba-ctc-0.6b-v0", "ctc"),
     "QuartzNet V1 (CTC-char)": ("RobotsMali/stt-bm-quartznet15x5-v1", "ctc_char"),
+    "QuartzNet V0 (CTC-char)": ("RobotsMali/stt-bm-quartznet15x5-v0", "ctc_char"),
 }
+# Vidéo d'exemple (identifiée sur votre capture d'écran)
 VIDEO_EXAMPLES = [
     ["examples/MARALINKE-Wii (Lève-toi) Black lives matter (Clip officiel) - MARALINKE (360p, H264).mp4", "Soloba V1 (CTC)"]
 ]
 _cache = {}
+# ---------------------------- # FONCTIONS TECHNIQUES # ----------------------------
 def run_cmd(cmd):
     """Run an external command and return its combined stdout/stderr text.

     Args:
         cmd: Either a shell command line (``str``), executed through the
             shell exactly as before, or a sequence of arguments such as
             ``["ffmpeg", "-i", path]``, executed directly without a shell
             so that no quoting is required.

     Returns:
         The text written by the command (stderr is merged into stdout).

     Raises:
         RuntimeError: If the command cannot be started or exits with a
             non-zero status. The message contains the command line and
             whatever output was captured.
     """
     if isinstance(cmd, str):
         args, use_shell, shown = cmd, True, cmd
     else:
         args = [str(part) for part in cmd]
         use_shell = False
         # Only used for logging / error text: the equivalent shell line.
         shown = shlex.join(args)

     print("RUN:", shown)
     try:
         res = subprocess.run(
             args,
             shell=use_shell,
             stdout=subprocess.PIPE,
             stderr=subprocess.STDOUT,
             text=True,
         )
     except OSError as err:
         # Without a shell a missing executable raises instead of returning
         # an exit status; surface it the same way as any other failure.
         raise RuntimeError(f"Commande échouée [{shown}]\nOutput:\n{err}") from err
     if res.returncode != 0:
         raise RuntimeError(f"Commande échouée [{shown}]\nOutput:\n{res.stdout}")
     return res.stdout
 def ffprobe_duration(path):
     """Return the container duration of a media file in seconds.

     Args:
         path: Path of the media file to probe.

     Returns:
         The duration reported by ``ffprobe`` as a ``float``, or ``None``
         when the path is empty, ffprobe cannot be started, ffprobe exits
         with an error, or its output is not a number.
     """
     if not path:
         return None
     argv = [
         "ffprobe", "-v", "error",
         "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1",
         str(path),
     ]
     try:
         # An argument list needs no shell, so the path needs no quoting.
         out = subprocess.run(argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
     except OSError:
         # ffprobe is not installed or not executable.
         return None
     if out.returncode != 0:
         return None
     try:
         return float(out.stdout.strip())
     except ValueError:
         # Empty output or a non-numeric value such as "N/A".
         return None
     repo, mode = MODELS[name]
     folder = snapshot_download(repo, local_dir_use_symlinks=False)
     nemo_file = next((os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nemo")), None)
     if mode == "rnnt":
         model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo_file)
+    elif mode == "ctc_char":
+        model = nemo_asr.models.EncDecCTCModel.restore_from(nemo_file)
     else:
         try: model = nemo_asr.models.EncDecCTCModelBPE.restore_from(nemo_file)
         except: model = nemo_asr.models.EncDecCTCModel.restore_from(nemo_file)
     model.to(DEVICE).eval()
     _cache[name] = model
     return model
 def extract_audio(video_path, out_wav):
     """Write the audio track of a video to a 16 kHz mono WAV file.

     The video is first re-encoded to H.264/AAC in a temporary MP4 (the
     original author notes this is required for webcam recordings), and the
     audio is then extracted from that stabilised copy.

     Args:
         video_path: Path of the input video.
         out_wav: Path of the WAV file to create (overwritten if present).

     Raises:
         RuntimeError: Propagated from ``run_cmd`` when an ffmpeg step fails.
     """
     tmp_fd, stabilized_mp4 = tempfile.mkstemp(suffix="_stabilized.mp4")
     os.close(tmp_fd)
     try:
         # Step 1: re-encode to H.264 with generous probing limits.
         remux_cmd = (
             f'ffmpeg -hide_banner -loglevel error -y '
             f'-analyzeduration 2147483647 -probesize 2147483647 '
             f'-i {shlex.quote(video_path)} '
             f'-c:v libx264 -preset ultrafast -crf 23 -c:a aac '
             f'{shlex.quote(stabilized_mp4)}'
         )
         run_cmd(remux_cmd)
         # Step 2: extract the audio as 16 kHz mono WAV.
         extract_cmd = (
             f'ffmpeg -hide_banner -loglevel error -y '
             f'-i {shlex.quote(stabilized_mp4)} -vn -ac 1 -ar 16000 -f wav {shlex.quote(out_wav)}'
         )
         run_cmd(extract_cmd)
     finally:
         # Remove the intermediate file even when an ffmpeg step raised.
         if os.path.exists(stabilized_mp4):
             os.remove(stabilized_mp4)
def clean_audio(wav_path, target_sr=16000):
    """Load a WAV file, downmix, resample and peak-normalise it.

    Args:
        wav_path: Path of the WAV file to read.
        target_sr: Sample rate of the cleaned output.

    Returns:
        A ``(clean_path, samples, target_sr)`` tuple, where ``clean_path``
        is a sibling of ``wav_path`` named ``<stem>_clean.wav`` and
        ``samples`` is the processed signal that was written there.
    """
    samples, source_sr = sf.read(wav_path)

    # Two-dimensional data is averaged over its second axis.
    if samples.ndim == 2:
        samples = samples.mean(axis=1)

    if source_sr != target_sr:
        samples = librosa.resample(samples.astype(float), orig_sr=source_sr, target_sr=target_sr)

    # Scale the peak to 0.9 unless the signal is empty or effectively silent.
    peak = np.max(np.abs(samples)) if samples.size > 0 else 0.0
    if peak > 1e-6:
        samples = samples / peak * 0.9

    source = Path(wav_path)
    clean_path = str(source.with_name(source.stem + "_clean.wav"))
    sf.write(clean_path, samples, target_sr)
    return clean_path, samples, target_sr
+# ---------------------------- # LOGIQUE SOUS-TITRAGE # ----------------------------
 def transcribe(model, wav_path):
     out = model.transcribe([wav_path])
         return res.text.strip() if hasattr(res, "text") else str(res).strip()
     return str(out).strip()
def keep_bambara(words):
    """Filter a word list with a simple spelling heuristic.

    A word is kept when its lower-cased form contains one of the letters
    ``ɛ``, ``ɔ`` or ``ŋ``, or when it contains at least two of the vowels
    ``a e i o u`` (repeats count).

    Args:
        words: Iterable of word strings.

    Returns:
        A list with the kept words, in their original order and casing.
    """
    marker_letters = ("ɛ", "ɔ", "ŋ")
    kept = []
    for word in words:
        lowered = word.lower()
        has_marker = any(letter in lowered for letter in marker_letters)
        if has_marker:
            kept.append(word)
            continue
        vowel_count = sum(1 for ch in lowered if ch in "aeiou")
        if vowel_count >= 2:
            kept.append(word)
    return kept
# Subtitle layout limits.
MAX_CHARS = 45  # wrap width passed to textwrap.wrap in wrap2
MIN_DUR = 0.3   # spans shorter than this (seconds) are merged in pack
MAX_WORDS = 8   # words per subtitle block in pack / align_heuristic
def wrap2(txt):
    """Wrap ``txt`` at ``MAX_CHARS`` columns.

    Returns the wrapped lines joined with newlines when wrapping produces
    more than one line; otherwise returns ``txt`` unchanged.
    """
    lines = textwrap.wrap(txt, MAX_CHARS)
    if len(lines) <= 1:
        return txt
    return "\n".join(lines)
def pack(spans, total):
    """Clamp, merge and re-split raw subtitle spans into display cues.

    Args:
        spans: Sequence of ``(start, end, text)`` tuples, times in seconds.
        total: Upper bound in seconds; every start and end is clamped to
            the range ``[0, total]``.

    Returns:
        A list of ``(start, end, text)`` tuples. Spans that are empty or
        have no positive duration after clamping are dropped. A span is
        merged into the previous one when it is shorter than ``MIN_DUR`` or
        starts less than 0.1 s after the previous span ends. Each merged
        span is then cut into blocks of at most ``MAX_WORDS`` words that
        share its duration equally, and each block is wrapped by ``wrap2``.
    """
    if not spans:
        return []

    merged = []
    for start, end, text in spans:
        start = max(0, min(start, total))
        end = max(0, min(end, total))
        if end <= start or not text.strip():
            continue
        if not merged:
            merged.append((start, end, text))
            continue
        prev_start, prev_end, prev_text = merged[-1]
        if (end - start) < MIN_DUR or (start - prev_end) < 0.1:
            # Too short or (nearly) touching the previous span: fold it in.
            merged[-1] = (prev_start, max(prev_end, end), (prev_text + " " + text).strip())
        else:
            merged.append((start, end, text))

    final = []
    for start, end, text in merged:
        words = text.split()
        blocks = [" ".join(words[i:i + MAX_WORDS]) for i in range(0, len(words), MAX_WORDS)]
        # Every block of a merged span gets an equal share of its duration.
        step = (end - start) / max(1, len(blocks))
        for index, block in enumerate(blocks):
            cue_start = start + index * step
            final.append((cue_start, cue_start + step, wrap2(block)))
    return final
def align_heuristic(words, total_dur):
    """Spread words evenly over a duration, ``MAX_WORDS`` words per cue.

    Args:
        words: List of word strings.
        total_dur: Duration in seconds to distribute the cues over.

    Returns:
        A list of ``(start, end, text)`` tuples of equal length in time,
        starting at 0; an empty list when ``words`` is empty.
    """
    if not words:
        return []

    blocks = []
    for offset in range(0, len(words), MAX_WORDS):
        blocks.append(" ".join(words[offset:offset + MAX_WORDS]))

    step = total_dur / len(blocks)
    cues = []
    for index, block in enumerate(blocks):
        cues.append((index * step, (index + 1) * step, block))
    return cues
def segment_and_align(model, audio, sr, total_dur, mode):
    """Transcribe audio in fixed-length segments and build subtitle cues.

    The signal is cut into pieces of ``SEGMENT_DURATION`` seconds. Each
    piece is written to a temporary WAV file, transcribed, filtered with
    ``keep_bambara`` and timed with ``align_heuristic``; the resulting cue
    times are shifted by the piece's start offset.

    Args:
        model: ASR model handed to ``transcribe``.
        audio: Sliceable sample sequence supporting ``len``.
        sr: Sample rate of ``audio``.
        total_dur: Total duration in seconds, forwarded to ``pack``.
        mode: Accepted for the caller's benefit; not used in this function.

    Returns:
        The list of ``(start, end, text)`` cues produced by ``pack``.
    """
    samples_per_segment = int(SEGMENT_DURATION * sr)
    collected = []
    for offset in range(0, len(audio), samples_per_segment):
        segment = audio[offset:offset + samples_per_segment]
        segment_start = offset / sr
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmp:
            sf.write(tmp.name, segment, sr)
            text = transcribe(model, tmp.name)
            words = keep_bambara(text.split())
            local_cues = align_heuristic(words, len(segment) / sr)
            collected.extend(
                (begin + segment_start, finish + segment_start, caption)
                for begin, finish, caption in local_cues
            )
    return pack(collected, total_dur)
def _srt_timestamp(sec):
    """Format a time in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Work in whole milliseconds so float error cannot drop a millisecond.
    total_ms = int(round(sec * 1000))
    hours, rest = divmod(total_ms, 3_600_000)
    minutes, rest = divmod(rest, 60_000)
    seconds, millis = divmod(rest, 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"


def burn(video_path, subs):
    """Burn subtitles into a video with ffmpeg.

    Args:
        video_path: Path of the input video.
        subs: Iterable of ``(start, end, text)`` cues, times in seconds.

    Returns:
        The output path, ``"RobotsMali_Subtitled.mp4"`` (relative to the
        current working directory).

    Raises:
        RuntimeError: Propagated from ``run_cmd`` when ffmpeg fails.
    """
    out_path = "RobotsMali_Subtitled.mp4"
    with tempfile.NamedTemporaryFile(suffix=".srt", mode="w", encoding="utf-8", delete=False) as tf:
        for i, (start, end, text) in enumerate(subs, 1):
            tf.write(f"{i}\n{_srt_timestamp(start)} --> {_srt_timestamp(end)}\n{text}\n\n")
        srt_name = tf.name
    try:
        vf = f"subtitles={shlex.quote(srt_name)}:force_style='Fontsize=22,PrimaryColour=&HFFFFFF&,OutlineColour=&H000000&'"
        cmd = f'ffmpeg -hide_banner -loglevel error -y -i {shlex.quote(video_path)} -vf {shlex.quote(vf)} -c:v libx264 -preset fast -crf 23 -c:a aac {shlex.quote(out_path)}'
        run_cmd(cmd)
    finally:
        # The SRT file is only an ffmpeg input; drop it even on failure.
        os.remove(srt_name)
    return out_path
+# ---------------------------- # PIPELINE & INTERFACE # ----------------------------
 def pipeline(video_input, model_name):
     """Run the captioning job, yielding progress for the UI.

     This is a generator: every result, including the final one and any
     error, is yielded as a ``(status_message, video_path_or_None)`` pair.
     A value passed to ``return`` inside a generator is not delivered to
     the consumer, so ``return`` is only used bare, to stop.

     Args:
         video_input: Path of the input video, or a dict carrying that
             path under the ``"tmp_path"`` key.
         model_name: Key of ``MODELS`` selecting the ASR model.

     Yields:
         ``(status_message, None)`` while working and on failure, then
         ``(status_message, output_video_path)`` on success.
     """
     wav_path = None
     clean_wav = None
     try:
         video_path = video_input["tmp_path"] if isinstance(video_input, dict) else video_input
         if not video_path:
             yield "❌ Aucune vidéo fournie", None
             return

         yield "⏳ Phase 1/3 : Stabilisation et extraction audio...", None
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tf:
             wav_path = tf.name
         # Fill the reserved WAV path with the video's audio track.
         extract_audio(video_path, wav_path)
         clean_wav, audio, sr = clean_audio(wav_path)
         # Fall back to the sample count when ffprobe gives no duration.
         duration = ffprobe_duration(video_path) or (len(audio)/sr)

         yield f"⏳ Phase 2/3 : Analyse IA avec {model_name}...", None
         model = load_model(model_name)
         mode = MODELS[model_name][1]
         subs = segment_and_align(model, audio, sr, duration, mode)
         if not subs:
             yield "⚠️ Pas de parole détectée", None
             return

         yield "⏳ Phase 3/3 : Incrustation des sous-titres...", None
         res_v = burn(video_path, subs)
         yield "✅ Traitement terminé avec succès", res_v
     except Exception as e:
         traceback.print_exc()
         yield f"❌ Erreur: {str(e)}", None
     finally:
         # Both WAV files are intermediates created with delete=False.
         for leftover in (wav_path, clean_wav):
             if leftover and os.path.exists(leftover):
                 os.remove(leftover)
+# --- DESIGN CSS ARTISTIQUE ---
 custom_css = """
 body { background-color: #0b0e14; }
+.gradio-container { background: rgba(17, 25, 40, 0.8) !important; backdrop-filter: blur(12px); border-radius: 20px; border: 1px solid rgba(255, 255, 255, 0.1); padding: 25px !important; }
+#header { text-align: center; margin-bottom: 20px; }
+#header h1 { color: #facc15; font-size: 2.8rem; letter-spacing: 4px; margin-bottom: 0; }
+#header p { color: #94a3b8; font-style: italic; font-size: 1.1rem; }
+.gr-button-primary { background: linear-gradient(135deg, #059669, #10b981) !important; border: none !important; font-weight: bold !important; }
+.gr-button-primary:hover { transform: translateY(-2px); box-shadow: 0 5px 15px rgba(16, 185, 129, 0.4); }
 """
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
     with gr.Div(elem_id="header"):
+        gr.HTML("""
+            <h1>🤖 ROBOTSMALI</h1>
+            <p>Intelligence Artificielle & Sauvegarde de la Langue Bambara</p>
+            <div style="height: 3px; width: 80px; background: #facc15; margin: 15px auto;"></div>
+        """)
     with gr.Row():
         with gr.Column():
+            gr.Markdown("### 🎥 Source Vidéo")
+            v_in = gr.Video(label=None, mirror_webcam=False)
+            m_sel = gr.Dropdown(list(MODELS.keys()), value="Soloba V1 (CTC)", label="Cerveau ASR")
             btn = gr.Button("🚀 GÉNÉRER LES SOUS-TITRES", variant="primary")
         with gr.Column():
+            gr.Markdown("### 📺 Résultat")
+            status = gr.Markdown("*Prêt pour le traitement...*")
+            v_out = gr.Video(label=None)
     gr.Examples(
         examples=VIDEO_EXAMPLES,
         inputs=[v_in, m_sel],
+        label="📺 Testez avec nos exemples"
     )
+    gr.HTML("<div style='text-align: center; color: #475569; margin-top: 40px;'>© 2025 RobotsMali - Bamako, Mali</div>")
     btn.click(pipeline, [v_in, m_sel], [status, v_out])
 if __name__ == "__main__":
+            demo.launch(share=True, debug=True)