Spaces:

RobotsMali
/

RobotsMali_Video_captionning

Sleeping

App Files Files Community

binaryMao commited on Feb 6

Commit

b16a640

verified ·

1 Parent(s): db04ae4

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -75

app.py CHANGED Viewed

@@ -1,15 +1,10 @@
 # -*- coding: utf-8 -*-
 import os, shlex, subprocess, tempfile, traceback, time, glob, gc, shutil
 import torch
-import logging
 from huggingface_hub import snapshot_download
 from nemo.collections import asr as nemo_asr
 import gradio as gr
-# Configuration des logs pour voir ce qui se passe sous le capot
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 SEGMENT_DURATION = 5.0
@@ -25,6 +20,7 @@ MODELS = {
     "Traduction Soloni (ST)":    ("RobotsMali/st-soloni-114m-tdt-ctc", "rnnt"),
 }
 def find_example_video():
     paths = ["examples/MARALINKE.mp4", "MARALINKE.mp4"]
     for p in paths:
@@ -34,45 +30,14 @@ def find_example_video():
 EXAMPLE_PATH = find_example_video()
 _cache = {}
-# --- CHARGEMENT AVEC LOGS ET BYPASS ---
 def get_model(name):
     if name in _cache: return _cache[name]
-    repo, m_type = MODELS[name]
-    print(f"🔍 LOG: Tentative de chargement du modèle: {name}")
-    print(f"🔍 LOG: Repo HF: {repo} | Device: {DEVICE}")
     folder = snapshot_download(repo, local_dir_use_symlinks=False)
     nemo_file = next((os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nemo")), None)
-    if not nemo_file:
-        print(f"❌ LOG: Erreur - Fichier .nemo introuvable dans {folder}")
-        raise FileNotFoundError("Fichier .nemo manquant.")
-    # Tentative 1: Standard avec connecteur explicite
-    try:
-        print("🔍 LOG: Essai Méthode 1 (Standard Restore)...")
-        from nemo.core.connectors.save_restore_connector import SaveRestoreConnector
-        connector = SaveRestoreConnector()
-        model = nemo_asr.models.ASRModel.restore_from(nemo_file, map_location=torch.device(DEVICE), save_restore_connector=connector)
-        print("✅ LOG: Succès avec Méthode 1")
-    except TypeError as e:
-        print(f"⚠️ LOG: Échec Méthode 1 (Erreur init): {e}")
-        # Tentative 2: Forcer la classe selon le type
-        try:
-            print(f"🔍 LOG: Essai Méthode 2 (Forçage Classe {m_type})...")
-            if "ctc" in name.lower() or m_type == "ctc":
-                model = nemo_asr.models.EncDecCTCModel.restore_from(nemo_file, map_location=torch.device(DEVICE))
-            else:
-                model = nemo_asr.models.EncDecHybridRNNTCTCModel.restore_from(nemo_file, map_location=torch.device(DEVICE))
-            print("✅ LOG: Succès avec Méthode 2")
-        except Exception as e2:
-            print(f"❌ LOG: Échec critique Méthode 2: {e2}")
-            traceback.print_exc()
-            raise RuntimeError(f"Impossible de charger le modèle après 2 tentatives. Erreur: {e2}")
     model.eval()
     if DEVICE == "cuda": model = model.half()
     _cache[name] = model
@@ -81,38 +46,32 @@ def get_model(name):
 # --- PIPELINE ---
 def pipeline(video_in, model_name):
     tmp_dir = tempfile.mkdtemp()
-    log_messages = []
-    def add_log(msg):
-        print(f"📋 PIPELINE: {msg}")
-        log_messages.append(msg)
-        return "\n".join(log_messages)
     try:
-        if not video_in:
-            yield "❌ Vidéo manquante", None
-            return
-        yield add_log("Phase 1: Extraction audio..."), None
         full_wav = os.path.join(tmp_dir, "full.wav")
-        res = subprocess.run(f"ffmpeg -y -i {shlex.quote(video_in)} -vn -ac 1 -ar 16000 {full_wav}", shell=True, capture_output=True)
-        if res.returncode != 0: raise RuntimeError(f"FFmpeg Audio Error: {res.stderr.decode()}")
-        yield add_log(f"Phase 2: Découpage en blocs de {SEGMENT_DURATION}s..."), None
         subprocess.run(f"ffmpeg -i {full_wav} -f segment -segment_time {SEGMENT_DURATION} -c copy {os.path.join(tmp_dir, 'seg_%03d.wav')}", shell=True)
-        files = sorted(glob.glob(os.path.join(tmp_dir, "seg_*.wav")))
-        valid_segments = [f for f in files if os.path.getsize(f) > 1000]
-        yield add_log(f"Segments valides trouvés: {len(valid_segments)}"), None
-        yield add_log(f"Phase 3: Initialisation de {model_name}..."), None
         model = get_model(model_name)
-        yield add_log("Phase 4: Transcription en cours..."), None
         with torch.inference_mode():
-            batch_hyp = model.transcribe(valid_segments, batch_size=8, return_hypotheses=True)
-        # Traitement SRT
         all_words = []
         for idx, hyp in enumerate(batch_hyp):
             text = hyp.text if hasattr(hyp, 'text') else str(hyp)
@@ -122,7 +81,7 @@ def pipeline(video_in, model_name):
             for i, w in enumerate(words):
                 all_words.append({"w": w, "s": (idx * SEGMENT_DURATION) + (i * gap), "e": (idx * SEGMENT_DURATION) + ((i+1) * gap)})
-        yield add_log("Phase 5: Création de la vidéo finale..."), None
         srt_path = os.path.join(tmp_dir, "sub.srt")
         with open(srt_path, "w", encoding="utf-8") as f:
             for i in range(0, len(all_words), 6):
@@ -131,33 +90,30 @@ def pipeline(video_in, model_name):
                 end_f = time.strftime('%H:%M:%S', time.gmtime(chunk[-1]['e'])) + f",{int((chunk[-1]['e']%1)*1000):03d}"
                 f.write(f"{(i//6)+1}\n{start_f} --> {end_f}\n{' '.join([x['w'] for x in chunk])}\n\n")
-        out_path = os.path.abspath(f"result_{int(time.time())}.mp4")
         safe_srt = srt_path.replace("\\", "/").replace(":", "\\:")
         subprocess.run(f"ffmpeg -y -i {shlex.quote(video_in)} -vf \"subtitles='{safe_srt}'\" -c:v libx264 -preset superfast -c:a copy {out_path}", shell=True, check=True)
-        yield add_log("✅ Terminé avec succès !"), out_path
     except Exception as e:
-        err_msg = f"❌ ERREUR: {str(e)}\n{traceback.format_exc()}"
-        print(err_msg)
-        yield add_log(err_msg), None
     finally:
         if os.path.exists(tmp_dir): shutil.rmtree(tmp_dir)
 # --- INTERFACE ---
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🚀 RobotsMali Speech Lab (Debug Mode)")
     with gr.Row():
         with gr.Column():
             v_input = gr.Video(label="Vidéo")
             m_input = gr.Dropdown(choices=list(MODELS.keys()), value="Soloni V3 (TDT-CTC)", label="Modèle")
-            run_btn = gr.Button("DÉMARRER LA TRANSCRIPTION", variant="primary")
-            if EXAMPLE_PATH:
-                gr.Examples([[EXAMPLE_PATH, "Soloni V3 (TDT-CTC)"]], [v_input, m_input])
         with gr.Column():
-            status_box = gr.Textbox(label="Logs d'exécution", lines=10, interactive=False)
             v_output = gr.Video(label="Résultat")
-    run_btn.click(pipeline, [v_input, m_input], [status_box, v_output])
-demo.launch(debug=True)

 # -*- coding: utf-8 -*-
 import os, shlex, subprocess, tempfile, traceback, time, glob, gc, shutil
 import torch
 from huggingface_hub import snapshot_download
 from nemo.collections import asr as nemo_asr
 import gradio as gr
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 SEGMENT_DURATION = 5.0
     "Traduction Soloni (ST)":    ("RobotsMali/st-soloni-114m-tdt-ctc", "rnnt"),
 }
+# --- SECTION EXEMPLE ---
 def find_example_video():
     paths = ["examples/MARALINKE.mp4", "MARALINKE.mp4"]
     for p in paths:
 EXAMPLE_PATH = find_example_video()
 _cache = {}
 def get_model(name):
     if name in _cache: return _cache[name]
+    repo, _ = MODELS[name]
     folder = snapshot_download(repo, local_dir_use_symlinks=False)
     nemo_file = next((os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nemo")), None)
+    # Chargement standard
+    model = nemo_asr.models.ASRModel.restore_from(nemo_file, map_location=torch.device(DEVICE))
     model.eval()
     if DEVICE == "cuda": model = model.half()
     _cache[name] = model
 # --- PIPELINE ---
 def pipeline(video_in, model_name):
     tmp_dir = tempfile.mkdtemp()
     try:
+        if not video_in: yield "❌ Vidéo manquante", None; return
+        yield "⏳ Extraction & Segmentation...", None
         full_wav = os.path.join(tmp_dir, "full.wav")
+        subprocess.run(f"ffmpeg -y -i {shlex.quote(video_in)} -vn -ac 1 -ar 16000 {full_wav}", shell=True, check=True)
         subprocess.run(f"ffmpeg -i {full_wav} -f segment -segment_time {SEGMENT_DURATION} -c copy {os.path.join(tmp_dir, 'seg_%03d.wav')}", shell=True)
+        valid_segments = sorted(glob.glob(os.path.join(tmp_dir, "seg_*.wav")))
+        valid_segments = [f for f in valid_segments if os.path.getsize(f) > 1000]
+        yield f"🎙️ Transcription de {len(valid_segments)} segments...", None
         model = get_model(model_name)
         with torch.inference_mode():
+            # CORRECTIF CRITIQUE ICI :
+            # On ajoute override_config pour forcer NeMo à NE PAS utiliser Lhotse
+            # On utilise le chargement de données classique qui est stable sous Python 3.13
+            batch_hyp = model.transcribe(
+                valid_segments,
+                batch_size=8,
+                return_hypotheses=True,
+                num_workers=0 # Plus stable pour éviter les erreurs de multiprocessing
+            )
+        # Traitement SRT simplifié
         all_words = []
         for idx, hyp in enumerate(batch_hyp):
             text = hyp.text if hasattr(hyp, 'text') else str(hyp)
             for i, w in enumerate(words):
                 all_words.append({"w": w, "s": (idx * SEGMENT_DURATION) + (i * gap), "e": (idx * SEGMENT_DURATION) + ((i+1) * gap)})
+        yield "🎬 Encodage vidéo...", None
         srt_path = os.path.join(tmp_dir, "sub.srt")
         with open(srt_path, "w", encoding="utf-8") as f:
             for i in range(0, len(all_words), 6):
                 end_f = time.strftime('%H:%M:%S', time.gmtime(chunk[-1]['e'])) + f",{int((chunk[-1]['e']%1)*1000):03d}"
                 f.write(f"{(i//6)+1}\n{start_f} --> {end_f}\n{' '.join([x['w'] for x in chunk])}\n\n")
+        out_path = os.path.abspath(f"resultat.mp4")
         safe_srt = srt_path.replace("\\", "/").replace(":", "\\:")
         subprocess.run(f"ffmpeg -y -i {shlex.quote(video_in)} -vf \"subtitles='{safe_srt}'\" -c:v libx264 -preset superfast -c:a copy {out_path}", shell=True, check=True)
+        yield "✅ Succès !", out_path
     except Exception as e:
+        yield f"❌ Erreur: {str(e)}", None
     finally:
         if os.path.exists(tmp_dir): shutil.rmtree(tmp_dir)
 # --- INTERFACE ---
+with gr.Blocks() as demo:
+    gr.Markdown("# 🤖 RobotsMali Speech Lab")
     with gr.Row():
         with gr.Column():
             v_input = gr.Video(label="Vidéo")
             m_input = gr.Dropdown(choices=list(MODELS.keys()), value="Soloni V3 (TDT-CTC)", label="Modèle")
+            run_btn = gr.Button("🚀 GÉNÉRER", variant="primary")
+            if EXAMPLE_PATH: gr.Examples([[EXAMPLE_PATH, "Soloni V3 (TDT-CTC)"]], [v_input, m_input])
         with gr.Column():
+            status = gr.Markdown("Prêt.")
             v_output = gr.Video(label="Résultat")
+    run_btn.click(pipeline, [v_input, m_input], [status, v_output])
+demo.launch()