Update app.py

app.py CHANGED
@@ -5,8 +5,9 @@ from huggingface_hub import snapshot_download
 from nemo.collections import asr as nemo_asr
 import gradio as gr
 
-#
+# --- CONFIGURATION ---
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+SEGMENT_DURATION = 5.0  # Your preferred duration for Soloni
 
 MODELS = {
     "Soloba V3 (CTC)": ("RobotsMali/soloba-ctc-0.6b-v3", "ctc"),
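Note: hoisting the segment length into SEGMENT_DURATION is more than cosmetic. The constant now drives both the ffmpeg -segment_time flag and the word-timestamp arithmetic further down in pipeline(), which previously hard-coded 5 in two separate places, so the two values can no longer drift apart.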
@@ -20,27 +21,9 @@ MODELS = {
     "Traduction Soloni (ST)": ("RobotsMali/st-soloni-114m-tdt-ctc", "rnnt"),
 }
 
-def find_example_video():
-    paths = ["examples/MARALINKE_FIXED.mp4", "examples/MARALINKE.mp4", "MARALINKE.mp4"]
-    for p in paths:
-        if os.path.exists(p): return p
-
-    # If no local file is found, download an example
-    print("⬇️ Téléchargement de la vidéo d'exemple...")
-    example_url = "https://huggingface.co/spaces/RobotsMali/Soloni-Demo/resolve/main/examples/MARALINKE.mp4"
-    target_path = "examples/MARALINKE.mp4"
-    os.makedirs("examples", exist_ok=True)
-    try:
-        subprocess.run(f"wget {example_url} -O {target_path}", shell=True, check=True)
-        return target_path
-    except Exception as e:
-        print(f"⚠️ Impossible de télécharger l'exemple : {e}")
-        return None
-
-EXAMPLE_PATH = find_example_video()
 _cache = {}
 
-#
+# --- MEMORY MANAGEMENT AND LOADING ---
 def clear_memory():
     _cache.clear()
     gc.collect()
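Aside on the deleted block: find_example_video shelled out to wget, which is not guaranteed to exist in a Space container. Purely for reference (this is not part of the commit), the same file could be fetched without a subprocess through huggingface_hub, which the app already imports. A minimal sketch, assuming the example still lives at examples/MARALINKE.mp4 in the RobotsMali/Soloni-Demo Space:

    from huggingface_hub import hf_hub_download

    # Pull the example video from the Space repo into the local HF cache
    # and get back a filesystem path, with no wget or shell involved.
    example_path = hf_hub_download(
        repo_id="RobotsMali/Soloni-Demo",
        repo_type="space",
        filename="examples/MARALINKE.mp4",
    )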
@@ -54,105 +37,34 @@ def get_model(name):
 
     folder = snapshot_download(repo, local_dir_use_symlinks=False)
     nemo_file = next((os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nemo")), None)
-
     if not nemo_file: raise FileNotFoundError("Fichier .nemo introuvable.")
 
+    # SAFE FIX FOR THE INITIALIZATION ERROR
+    from nemo.core.connectors.save_restore_connector import SaveRestoreConnector
+
+    # Force a standard connector to avoid the __init__() bug
+    connector = SaveRestoreConnector()
+
+    model = nemo_asr.models.ASRModel.restore_from(
+        nemo_file,
+        map_location=torch.device(DEVICE),
+        save_restore_connector=connector  # Pass the instance created above
+    )
+
+    model.eval()
     if DEVICE == "cuda":
-        try:
-            model = model.half()
-        except Exception as e:
-            print(f"⚠️ Impossible de convertir en half precision: {e}")
+        model = model.half()
 
     _cache[name] = model
     return model
 
-
-# 3. UTILITAIRES
+# --- UTILITIES ---
 def format_srt_time(sec):
-    td = time.gmtime(sec)
+    td = time.gmtime(max(0, sec))
     ms = int((sec - int(sec)) * 1000)
     return f"{time.strftime('%H:%M:%S', td)},{ms:03}"
 
-#
-def detect_silences(path, min_silence_len=0.3, silence_thresh=-35):
-    """Detects silence intervals using ffmpeg"""
-    cmd = (
-        f"ffmpeg -i {shlex.quote(path)} -af "
-        f"silencedetect=noise={silence_thresh}dB:d={min_silence_len} "
-        f"-f null -"
-    )
-    result = subprocess.run(cmd, shell=True, stderr=subprocess.PIPE, text=True)
-    silences = []
-    for line in result.stderr.splitlines():
-        if "silence_start" in line:
-            start = float(line.split("silence_start: ")[1])
-            silences.append({"start": start, "end": None})
-        elif "silence_end" in line and silences:
-            end = float(line.split("silence_end: ")[1].split(" ")[0])
-            silences[-1]["end"] = end
-    return [s for s in silences if s["end"] is not None]
-
-def smart_segment_audio(audio_path, target_duration=5.0):
-    """Segments audio at silence points closest to target_duration"""
-    silences = detect_silences(audio_path)
-    segments_cuts = [0.0]
-    last_cut = 0.0
-
-    # If no silence is detected, fall back to regular segmentation
-    if not silences:
-        return None
-
-    # Look for the best cut point
-    duration = float(subprocess.check_output(
-        f"ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 {shlex.quote(audio_path)}",
-        shell=True
-    ).strip())
-
-    current_pos = 0.0
-    while current_pos < duration:
-        target_pos = current_pos + target_duration
-        if target_pos >= duration:
-            break
-
-        # Find the silence closest to target_pos
-        best_cut = None
-        min_dist = float('inf')
-
-        for s in silences:
-            # Cut in the middle of the silence
-            mid_silence = (s["start"] + s["end"]) / 2
-            if mid_silence <= current_pos: continue
-
-            dist = abs(mid_silence - target_pos)
-            if dist < min_dist:
-                min_dist = dist
-                best_cut = mid_silence
-
-            # Optimization: no point searching further ahead
-            if mid_silence > target_pos + 10: break
-
-        if best_cut and abs(best_cut - current_pos) > 1.0:  # Avoid overly short segments
-            segments_cuts.append(best_cut)
-            current_pos = best_cut
-        else:
-            # No nearby silence, force the cut (fallback)
-            current_pos += target_duration
-            segments_cuts.append(current_pos)
-
-    segments_cuts.append(duration)
-    return segments_cuts
-
+# --- PIPELINE ---
 def pipeline(video_in, model_name):
     tmp_dir = tempfile.mkdtemp()
     try:
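The hunk above is the heart of the commit: instead of letting restore_from() build its own save/restore connector, which was tripping the __init__() error named in the comment, the code constructs a plain SaveRestoreConnector up front and hands it in. A self-contained sketch of the same pattern, with "model.nemo" standing in as a hypothetical checkpoint path:

    import torch
    from nemo.collections import asr as nemo_asr
    from nemo.core.connectors.save_restore_connector import SaveRestoreConnector

    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # Passing a ready-made connector means restore_from() never has to
    # instantiate one itself; "model.nemo" is a placeholder path.
    model = nemo_asr.models.ASRModel.restore_from(
        "model.nemo",
        map_location=torch.device(DEVICE),
        save_restore_connector=SaveRestoreConnector(),
    )
    model.eval()

The max(0, sec) clamp added to format_srt_time guards against negative timestamps; note that time.gmtime also wraps at 24 hours, so inputs of 86400 s or more would fold over, which is harmless for clips this short. For example, format_srt_time(3725.5) returns "01:02:05,500".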
@@ -160,86 +72,86 @@ def pipeline(video_in, model_name):
             yield "❌ Aucune vidéo sélectionnée.", None
             return
 
+        # Phase 1: Audio
         yield "⏳ Phase 1/4 : Extraction audio...", None
         full_wav = os.path.join(tmp_dir, "full.wav")
-        subprocess.run(f"ffmpeg -y -
+        subprocess.run(f"ffmpeg -y -i {shlex.quote(video_in)} -vn -ac 1 -ar 16000 {full_wav}", shell=True, check=True)
 
-        # Fixed 5 s segmentation (optimal for Soloni V2/V3)
-        subprocess.run(f"ffmpeg -i {full_wav} -f segment -segment_time 5 -c copy {os.path.join(tmp_dir, 'seg_%03d.wav')}", shell=True, check=True)
+        # Phase 2: Segmentation
+        yield f"⏳ Phase 2/4 : Segmentation ({SEGMENT_DURATION}s)...", None
+        subprocess.run(f"ffmpeg -i {full_wav} -f segment -segment_time {SEGMENT_DURATION} -c copy {os.path.join(tmp_dir, 'seg_%03d.wav')}", shell=True, check=True)
+
         files = sorted(glob.glob(os.path.join(tmp_dir, "seg_*.wav")))
+        # Safety: skip corrupted or overly small files (<1.5 KB)
+        valid_segments = [f for f in files if os.path.getsize(f) > 1500]
 
+        if not valid_segments:
+            yield "❌ Erreur : Audio trop court ou invalide.", None
+            return
 
+        # Phase 3: Transcription
         yield f"⏳ Phase 3/4 : Chargement de {model_name}...", None
         model = get_model(model_name)
 
-        yield f"🎙️ Transcription de {len(segment_files)} segments...", None
-        # Batch size tuned for Colab (often T4/V100)
+        yield f"🎙️ Transcription de {len(valid_segments)} segments...", None
         b_size = 16 if DEVICE == "cuda" else 2
 
-        audio_paths = [s["file"] for s in segment_files]
-
-        # Use torch.inference_mode for a performance gain
         with torch.inference_mode():
-            batch_hypotheses = model.transcribe(audio_paths, batch_size=b_size, return_hypotheses=True)
+            batch_hypotheses = model.transcribe(valid_segments, batch_size=b_size, return_hypotheses=True)
 
         all_words_ts = []
         for idx, hyp in enumerate(batch_hypotheses):
-            base_time = segment_files[idx]["start_offset"]
-
-            if isinstance(hyp, list): hyp = hyp[0]
+            base_time = idx * SEGMENT_DURATION
             text = hyp.text if hasattr(hyp, 'text') else str(hyp)
             words = text.split()
-
-            segment_duration = segment_files[idx+1]["start_offset"] - base_time if idx < len(segment_files)-1 else 5.0
+            if not words: continue
 
+            # Distribute the words evenly across the 5 seconds
+            gap = SEGMENT_DURATION / len(words)
             for i, w in enumerate(words):
-                all_words_ts.append({
+                all_words_ts.append({
+                    "word": w,
+                    "start": base_time + (i * gap),
+                    "end": base_time + ((i+1) * gap)
+                })
+
+        # Phase 4: Video encoding
+        yield "⏳ Phase 4/4 : Encodage final...", None
         srt_path = os.path.join(tmp_dir, "final.srt")
         with open(srt_path, "w", encoding="utf-8") as f:
-            for i in range(0, len(all_words_ts), 6):
+            for i in range(0, len(all_words_ts), 6):  # max 6 words per line
                 chunk = all_words_ts[i:i+6]
                 f.write(f"{(i//6)+1}\n{format_srt_time(chunk[0]['start'])} --> {format_srt_time(chunk[-1]['end'])}\n")
                 f.write(" ".join([c['word'] for c in chunk]) + "\n\n")
 
         out_path = os.path.abspath(f"resultat_{int(time.time())}.mp4")
+        # Fix for the SRT path (Windows/Linux)
         safe_srt = srt_path.replace("\\", "/").replace(":", "\\:")
 
+        # Style: cyan colour for readability
+        cmd = f"ffmpeg -y -i {shlex.quote(video_in)} -vf \"subtitles='{safe_srt}':force_style='Alignment=2,FontSize=18,PrimaryColour=&H00FFFF'\" -c:v libx264 -preset ultrafast -c:a copy {out_path}"
         subprocess.run(cmd, shell=True, check=True)
 
         yield "✅ Terminé !", out_path
 
     except Exception as e:
+        traceback.print_exc()
         yield f"❌ Erreur : {str(e)}", None
     finally:
         if os.path.exists(tmp_dir): shutil.rmtree(tmp_dir)
 
-#
+# --- INTERFACE ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.HTML("<div style='text-align:center;'><h1>🤖 RobotsMali Speech Lab</h1></div>")
-
     with gr.Row():
         with gr.Column():
             v_input = gr.Video(label="Vidéo Source")
             m_input = gr.Dropdown(choices=list(MODELS.keys()), value="Soloni V3 (TDT-CTC)", label="Modèle")
-            run_btn = gr.Button("🚀 GÉNÉRER", variant="primary")
-
-            if EXAMPLE_PATH:
-                gr.Examples(examples=[[EXAMPLE_PATH, "Soloni V3 (TDT-CTC)"]], inputs=[v_input, m_input])
-
+            run_btn = gr.Button("🚀 GÉNÉRER SOUS-TITRES", variant="primary")
         with gr.Column():
-            status = gr.Markdown("### État\nPrêt.")
-            v_output = gr.Video(label="Résultat")
+            status = gr.Markdown("### État\nPrêt.")
+            v_output = gr.Video(label="Résultat")
 
     run_btn.click(pipeline, [v_input, m_input], [status, v_output])
 
-demo.queue().launch()
+demo.queue().launch()
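Two review notes on this last hunk. First, base_time = idx * SEGMENT_DURATION treats every segment as exactly SEGMENT_DURATION long. The cuts on a mono 16 kHz WAV are close to exact, but the final segment is almost always shorter, so its words get stretched over a full five seconds and can outlast the audio. If that matters, the real per-segment offsets can be measured with the same ffprobe invocation the deleted smart_segment_audio used; a sketch with a hypothetical helper, not part of the commit:

    import shlex
    import subprocess

    def segment_offsets(paths):
        # Cumulative start time of each segment, probed with ffprobe so the
        # offsets track the real (possibly uneven) segment durations.
        offsets, t = [], 0.0
        for p in paths:
            offsets.append(t)
            t += float(subprocess.check_output(
                f"ffprobe -v error -show_entries format=duration "
                f"-of default=noprint_wrappers=1:nokey=1 {shlex.quote(p)}",
                shell=True, text=True,
            ).strip())
        return offsets

    # Usage: offsets = segment_offsets(valid_segments); base_time = offsets[idx]

Second, a nit on the styling comment: ASS colours are &HBBGGRR (blue-green-red order), so PrimaryColour=&H00FFFF renders yellow, not cyan; cyan would be &HFFFF00.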