Spaces:

RobotsMali
/

RobotsMali_Video_captionning

Running

App Files Files Community

binaryMao committed 28 days ago

Commit

bde1ae6

verified ·

1 Parent(s): 33e6f73

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -91

app.py CHANGED Viewed

@@ -1,26 +1,13 @@
 # -*- coding: utf-8 -*-
-"""
-ROBOTSMALI — Sous-titrage Bambara (VERSION 7.7 - INTÉGRALE)
--=
-"""
-import os
-import shlex
-import subprocess
-import tempfile
-import traceback
-import textwrap
-import time
-from pathlib import Path
 import torch
 from huggingface_hub import snapshot_download
 from nemo.collections import asr as nemo_asr
 import gradio as gr
-# ---------------------------- # CONFIGURATION DES MODÈLES # ----------------------------
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# La liste complète et correcte des modèles RobotsMali
 MODELS = {
     "Soloba V1 (CTC)":         ("RobotsMali/soloba-ctc-0.6b-v1", "ctc"),
     "Soloni V1 (RNNT)":        ("RobotsMali/soloni-114m-tdt-ctc-v1", "rnnt"),
@@ -30,7 +17,7 @@ MODELS = {
     "QuartzNet V0 (CTC-char)": ("RobotsMali/stt-bm-quartznet15x5-v0", "ctc_char"),
 }
-# Détection du chemin absolu pour la vidéo d'exemple
 def get_absolute_example():
     paths = [
         os.path.abspath("MARALINKE.mp4"),
@@ -45,8 +32,7 @@ def get_absolute_example():
 EXAMPLE_PATH = get_absolute_example()
 _cache = {}
-# ---------------------------- # MOTEUR IA & VIDÉO # ----------------------------
 def load_model(name):
     if name in _cache: return _cache[name]
     _cache.clear()
@@ -61,109 +47,109 @@ def load_model(name):
     elif mode == "ctc_char":
         model = nemo_asr.models.EncDecCTCModel.restore_from(nemo_file)
     else:
-        try:
-            model = nemo_asr.models.EncDecCTCModelBPE.restore_from(nemo_file)
-        except:
-            model = nemo_asr.models.EncDecCTCModel.restore_from(nemo_file)
     model.to(DEVICE).eval()
     _cache[name] = model
     return model
-def burn_subtitles(video_path, words, duration):
-    out_name = f"robotsmali_output_{int(time.time())}.mp4"
-    out_path = os.path.abspath(out_name)
-    # Génération du SRT
-    chunk_size = 7
-    with tempfile.NamedTemporaryFile(suffix=".srt", mode="w", encoding="utf-8", delete=False) as tf:
-        for i, idx in enumerate(range(0, len(words), chunk_size)):
-            chunk = words[idx : idx + chunk_size]
-            start = (idx / len(words)) * duration
-            end = (min(idx + chunk_size, len(words)) / len(words)) * duration
-            def t_srt(sec):
-                h=int(sec//3600); m=int((sec%3600)//60); s=int(sec%60); ms=int((sec-int(sec))*1000)
-                return f"{h:02}:{m:02}:{s:02},{ms:03}"
-            txt = "\n".join(textwrap.wrap(" ".join(chunk), 40))
-            tf.write(f"{i+1}\n{t_srt(start)} --> {t_srt(end)}\n{txt}\n\n")
-        srt_name = tf.name
-    # Rendu FFmpeg avec optimisation FastStart pour corriger la durée web
-    vf = f"subtitles={shlex.quote(srt_name)}:force_style='Fontsize=22,PrimaryColour=&HFFFFFF&,OutlineColour=&H000000&'"
-    cmd = (
-        f'ffmpeg -hide_banner -loglevel error -y -i {shlex.quote(video_path)} '
-        f'-vf {shlex.quote(vf)} -c:v libx264 -pix_fmt yuv420p -preset ultrafast -crf 28 '
-        f'-c:a aac -b:a 128k -movflags +faststart {shlex.quote(out_path)}'
-    )
-    subprocess.run(cmd, shell=True, check=True)
-    if os.path.exists(srt_name): os.remove(srt_name)
-    return out_path
-# ---------------------------- # PIPELINE # ----------------------------
-def pipeline(video_input, model_name):
     try:
-        if not video_input:
-            yield "### ❌ Erreur : Vidéo manquante.", None
-            return
-        yield "### ⏳ Étape 1/3 : Extraction Audio...", None
-        wav_path = os.path.abspath("temp_audio.wav")
-        subprocess.run(f"ffmpeg -y -i {shlex.quote(video_input)} -vn -ac 1 -ar 16000 {wav_path}", shell=True, check=True)
-        # Détection précise de la durée
-        dur_out = subprocess.run(f'ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 {shlex.quote(video_input)}',
-                                 shell=True, stdout=subprocess.PIPE, text=True).stdout
-        duration = float(dur_out.strip()) if dur_out.strip() else 10.0
-        yield f"### ⏳ Étape 2/3 : Transcription avec {model_name}...", None
         model = load_model(model_name)
-        res = model.transcribe([wav_path])[0]
-        words = (res.text if hasattr(res, 'text') else str(res)).split()
-        if not words:
-            yield "### ⚠️ Aucune parole détectée.", None
-            return
-        yield "### ⏳ Étape 3/3 : Finalisation Vidéo...", None
-        final_video = burn_subtitles(video_input, words, duration)
-        if os.path.exists(wav_path): os.remove(wav_path)
-        yield "### ✅ Succès !", final_video
     except Exception as e:
         traceback.print_exc()
-        yield f"### ❌ Erreur : {str(e)}", None
-def force_load_demo():
-    return EXAMPLE_PATH
-# ---------------------------- # INTERFACE # ----------------------------
-with gr.Blocks(theme=gr.themes.Soft(), css="body { background-color: #0b0e14; }") as demo:
-    gr.HTML("<h1 style='text-align:center; color:#facc15;'>🤖 ROBOTSMALI V7.7</h1>")
     with gr.Row():
         with gr.Column():
-            gr.Markdown("### 📥 CHARGEMENT")
-            v_in = gr.Video(label="Vidéo source", interactive=True)
             if EXAMPLE_PATH:
-                btn_demo = gr.Button("📂 CHARGER LA DÉMO (MARALINKE)", variant="secondary")
             m_sel = gr.Dropdown(list(MODELS.keys()), value="Soloba V1 (CTC)", label="Modèle IA")
             btn_run = gr.Button("🚀 GÉNÉRER", variant="primary")
         with gr.Column():
             gr.Markdown("### 📤 RÉSULTAT")
             status = gr.Markdown("### État\nPrêt")
-            v_out = gr.Video(label="Vidéo finale")
     # Actions
     if EXAMPLE_PATH:
-        btn_demo.click(fn=force_load_demo, outputs=v_in)
-        gr.Examples(examples=[[EXAMPLE_PATH, "Soloba V1 (CTC)"]], inputs=[v_in, m_sel], cache_examples=False)
     btn_run.click(pipeline, [v_in, m_sel], [status, v_out])
 if __name__ == "__main__":

 # -*- coding: utf-8 -*-
+import os, shlex, subprocess, tempfile, traceback, textwrap, time
 import torch
 from huggingface_hub import snapshot_download
 from nemo.collections import asr as nemo_asr
 import gradio as gr
+# 1. CONFIGURATION DU MATÉRIEL ET DES MODÈLES
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODELS = {
     "Soloba V1 (CTC)":         ("RobotsMali/soloba-ctc-0.6b-v1", "ctc"),
     "Soloni V1 (RNNT)":        ("RobotsMali/soloni-114m-tdt-ctc-v1", "rnnt"),
     "QuartzNet V0 (CTC-char)": ("RobotsMali/stt-bm-quartznet15x5-v0", "ctc_char"),
 }
+# 2. GESTION DES CHEMINS (Correction du bug de chargement exemple)
 def get_absolute_example():
     paths = [
         os.path.abspath("MARALINKE.mp4"),
 EXAMPLE_PATH = get_absolute_example()
 _cache = {}
+# 3. MOTEUR IA NEMO
 def load_model(name):
     if name in _cache: return _cache[name]
     _cache.clear()
     elif mode == "ctc_char":
         model = nemo_asr.models.EncDecCTCModel.restore_from(nemo_file)
     else:
+        model = nemo_asr.models.EncDecCTCModelBPE.restore_from(nemo_file)
     model.to(DEVICE).eval()
     _cache[name] = model
     return model
+# 4. UTILITAIRES DE SYNCHRONISATION
def format_ts(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Offset from the start of the media, in seconds (int or float).
            Negative values are clamped to zero because SRT has no notion of
            a negative timestamp.

    Returns:
        The timestamp string, e.g. ``"00:01:02,500"``. Hours are not wrapped
        at 24, so very long media still gets monotonically increasing stamps.
    """
    # Convert to whole milliseconds once, with rounding. Computing the
    # fractional part separately (seconds - int(seconds)) and truncating it
    # loses a millisecond to float error (1.001 -> ",000").
    total_ms = max(0, int(round(seconds * 1000)))
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"
def get_real_duration(file_path):
    """Return the duration of a media file in seconds, as reported by ffprobe.

    Args:
        file_path: Path of the media file to probe.

    Returns:
        The container duration as a float, or ``0.0`` when ffprobe cannot be
        started or its output is not a parseable number (missing file,
        unreadable container, ``N/A`` duration).
    """
    # Argument list instead of a shell string: the path is passed verbatim,
    # so no quoting is needed and nothing is interpreted by a shell.
    cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        file_path,
    ]
    try:
        res = subprocess.run(cmd, capture_output=True, text=True)
        return float(res.stdout.strip())
    except (OSError, ValueError):
        # OSError: ffprobe binary missing / not executable.
        # ValueError: empty or non-numeric output (ffprobe reported an error).
        return 0.0
+# 5. PIPELINE DE TRAITEMENT
def _word_timings(hypothesis, words, duration, frame_seconds=0.02, word_seconds=0.4):
    """Build ``{"word", "start", "end"}`` entries (times in seconds) for *words*.

    Uses ``hypothesis.word_offsets`` when it is present and covers every word;
    otherwise spreads the words evenly over *duration*.
    """
    raw_offsets = getattr(hypothesis, "word_offsets", None)
    if raw_offsets and len(raw_offsets) >= len(words):
        starts = []
        for entry in list(raw_offsets)[:len(words)]:
            # NOTE(review): the entry format is not visible here. Plain frame
            # indices are used as-is; dict entries are assumed to carry a
            # "start_offset" key - confirm against the installed NeMo version.
            if isinstance(entry, dict):
                entry = entry.get("start_offset")
            try:
                # NOTE(review): 0.02 s per frame is the value the original
                # code used; the real frame duration may depend on the model.
                starts.append(float(entry) * frame_seconds)
            except (TypeError, ValueError):
                starts = None
                break
        if starts is not None:
            return [
                {"word": word, "start": start, "end": start + word_seconds}
                for word, start in zip(words, starts)
            ]

    # Linear fallback: every word gets an equal share of the clip duration.
    total = len(words)
    return [
        {"word": word, "start": (i / total) * duration, "end": ((i + 1) / total) * duration}
        for i, word in enumerate(words)
    ]


def _write_srt(srt_path, words_with_ts, words_per_line=6):
    """Write *words_with_ts* to *srt_path* as SRT cues of *words_per_line* words."""
    with open(srt_path, "w", encoding="utf-8") as f:
        for i in range(0, len(words_with_ts), words_per_line):
            chunk = words_with_ts[i:i + words_per_line]
            start_time = chunk[0]['start']
            # Keep the cue on screen slightly past the last word.
            end_time = chunk[-1]['end'] + 0.5
            f.write(f"{(i // words_per_line) + 1}\n{format_ts(start_time)} --> {format_ts(end_time)}\n")
            f.write(" ".join(w['word'] for w in chunk) + "\n\n")


def pipeline(video_in, model_name):
    """Transcribe a video and burn the transcript in as subtitles.

    This is a generator: it yields ``(status_message, output_video_path)``
    tuples so the UI can show progress. ``output_video_path`` is ``None``
    until the final step succeeds.

    Args:
        video_in: Path of the input video (falsy when nothing was provided).
        model_name: Key of the model to load via ``load_model``.

    Yields:
        Progress messages, then either the success message with the path of
        the rendered video, or an error message with ``None``. Exceptions are
        not propagated: they are printed and reported as an error message.
    """
    try:
        if not video_in:
            # Inside a generator a `return value` is never delivered to the
            # consumer, so the message must be yielded before stopping.
            yield "❌ Erreur : Aucune vidéo détectée.", None
            return

        # Intermediate files live in a per-call temporary directory so that
        # concurrent requests do not overwrite each other and nothing is
        # left behind afterwards.
        with tempfile.TemporaryDirectory() as work_dir:
            wav_path = os.path.join(work_dir, "temp.wav")
            srt_path = os.path.join(work_dir, "output.srt")

            # Step A: extract mono 16 kHz audio.
            yield "⏳ Extraction de l'audio...", None
            subprocess.run(
                ["ffmpeg", "-y", "-i", video_in, "-vn", "-ac", "1", "-ar", "16000", wav_path],
                check=True,
            )
            duration = get_real_duration(video_in)

            # Step B: transcription, with word offsets when available.
            yield f"⏳ Transcription IA ({model_name}) avec alignement...", None
            model = load_model(model_name)
            hypothesis = model.transcribe([wav_path], return_hypotheses=True)[0]
            text = hypothesis.text if hasattr(hypothesis, 'text') else str(hypothesis)
            words = text.split()
            if not words:
                # Nothing to render: avoid handing ffmpeg an empty SRT file.
                yield "⚠️ Aucune parole détectée.", None
                return
            words_with_ts = _word_timings(hypothesis, words, duration)

            # Step C: write the segmented SRT file.
            yield "⏳ Génération des segments synchronisés...", None
            _write_srt(srt_path, words_with_ts)

            # Step D: re-encode with the subtitles burned in.
            yield "⏳ Incrustation des sous-titres (FastStart)...", None
            out_path = os.path.abspath(f"resultat_{int(time.time())}.mp4")
            subtitle_filter = (
                f"subtitles={shlex.quote(srt_path)}:"
                "force_style='Alignment=2,FontSize=20,PrimaryColour=&H00FFFF&'"
            )
            subprocess.run(
                [
                    "ffmpeg", "-y", "-i", video_in,
                    "-vf", subtitle_filter,
                    "-c:v", "libx264", "-pix_fmt", "yuv420p",
                    "-movflags", "+faststart", "-c:a", "aac",
                    out_path,
                ],
                check=True,
            )

        yield "✅ Terminé avec succès !", out_path
    except Exception as e:
        traceback.print_exc()
        yield f"❌ Erreur : {str(e)}", None
+# 6. INTERFACE GRADIO (Webcam + Example Fix)
+with gr.Blocks(theme=gr.themes.Soft(), css="body {background-color: #0b1120;}") as demo:
+    gr.HTML("<h1 style='text-align:center; color:#facc15;'>🤖 ROBOTSMALI V10.5</h1>")
     with gr.Row():
         with gr.Column():
+            gr.Markdown("### 📥 SOURCE")
+            # Accepts both an uploaded file AND a webcam recording
+            v_in = gr.Video(label="Webcam ou Fichier", sources=["upload", "webcam"], interactive=True)
             if EXAMPLE_PATH:
+                btn_demo = gr.Button("📂 CHARGER LA VIDÉO D'EXEMPLE", variant="secondary")
             m_sel = gr.Dropdown(list(MODELS.keys()), value="Soloba V1 (CTC)", label="Modèle IA")
             btn_run = gr.Button("🚀 GÉNÉRER", variant="primary")
         with gr.Column():
             gr.Markdown("### 📤 RÉSULTAT")
             status = gr.Markdown("### État\nPrêt")
+            v_out = gr.Video(label="Vidéo finale synchronisée")
     # Event wiring (the demo button only exists when EXAMPLE_PATH is truthy)
     if EXAMPLE_PATH:
+        btn_demo.click(fn=lambda: EXAMPLE_PATH, outputs=v_in)  # put the example path into the input video
     btn_run.click(pipeline, [v_in, m_sel], [status, v_out])  # pipeline output goes to the status text and result video
 if __name__ == "__main__":