Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
"""
|
| 3 |
ROBOTSMALI — Sous-titrage Bambara
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import os
|
|
@@ -51,13 +52,21 @@ def run_cmd(cmd):
|
|
| 51 |
return res.stdout
|
| 52 |
|
| 53 |
def ffprobe_duration(path):
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
out = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
|
|
|
| 56 |
if out.returncode != 0:
|
| 57 |
-
print("ffprobe
|
| 58 |
return None
|
| 59 |
try:
|
| 60 |
-
|
|
|
|
|
|
|
| 61 |
except:
|
| 62 |
return None
|
| 63 |
|
|
@@ -249,7 +258,7 @@ def burn(video_path, subs, output_path=None):
|
|
| 249 |
return output_path
|
| 250 |
|
| 251 |
# ----------------------------
|
| 252 |
-
# PIPELINE PRINCIPAL (
|
| 253 |
# ----------------------------
|
| 254 |
def pipeline(video_input, model_name):
|
| 255 |
"""
|
|
@@ -257,30 +266,40 @@ def pipeline(video_input, model_name):
|
|
| 257 |
model_name : clé dans MODELS
|
| 258 |
"""
|
| 259 |
try:
|
| 260 |
-
#
|
| 261 |
if isinstance(video_input, dict) and "tmp_path" in video_input:
|
| 262 |
video_path = video_input["tmp_path"]
|
| 263 |
else:
|
| 264 |
video_path = video_input
|
| 265 |
|
|
|
|
| 266 |
duration = ffprobe_duration(video_path)
|
| 267 |
-
if duration is None:
|
| 268 |
-
raise RuntimeError("Impossible d'obtenir la durée de la vidéo via ffprobe")
|
| 269 |
|
| 270 |
-
#
|
| 271 |
tmp_fd, tmp_wav = tempfile.mkstemp(suffix=".wav")
|
| 272 |
os.close(tmp_fd)
|
| 273 |
|
| 274 |
-
# extraction + nettoyage
|
| 275 |
extract_audio(video_path, tmp_wav)
|
| 276 |
clean_wav, audio, sr = clean_audio(tmp_wav)
|
| 277 |
|
| 278 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
model = load_model(model_name)
|
| 280 |
text = transcribe(model, clean_wav)
|
| 281 |
mode = MODELS[model_name][1]
|
| 282 |
|
| 283 |
-
#
|
| 284 |
subs = None
|
| 285 |
if mode == "rnnt":
|
| 286 |
# RNNT : tentative de segmentation via logits + ctc_segmentation si dispo
|
|
@@ -293,17 +312,20 @@ def pipeline(video_input, model_name):
|
|
| 293 |
ln = torch.tensor([x.shape[1]]).to(DEVICE)
|
| 294 |
with torch.no_grad():
|
| 295 |
logits = model(input_signal=x, input_signal_length=ln)[0]
|
| 296 |
-
|
| 297 |
time_per_frame = duration / max(1, logits.shape[1])
|
|
|
|
| 298 |
# build char list
|
| 299 |
try:
|
| 300 |
raw = model.tokenizer.vocab
|
| 301 |
vocab = list(raw.keys()) if isinstance(raw, dict) else list(raw)
|
| 302 |
except Exception:
|
| 303 |
vocab = None
|
|
|
|
| 304 |
cfg = CtcSegmentationParameters()
|
| 305 |
if vocab:
|
| 306 |
cfg.char_list = vocab
|
|
|
|
| 307 |
gt = prepare_text(cfg, words)[0]
|
| 308 |
try:
|
| 309 |
timing, _, _ = ctc_segmentation(cfg, logits.detach().cpu().numpy()[0], gt)
|
|
@@ -317,8 +339,7 @@ def pipeline(video_input, model_name):
|
|
| 317 |
subs = align_vad(text, audio, sr, duration)
|
| 318 |
|
| 319 |
elif mode == "ctc_char":
|
| 320 |
-
# QuartzNet : pas de tokenizer BPE,
|
| 321 |
-
# On essaie d'obtenir timestamps via model.transcribe() si disponible (mais souvent non)
|
| 322 |
try:
|
| 323 |
subs = align_vad(text, audio, sr, duration)
|
| 324 |
except Exception as e:
|
|
@@ -326,7 +347,7 @@ def pipeline(video_input, model_name):
|
|
| 326 |
subs = align_vad(text, audio, sr, duration)
|
| 327 |
|
| 328 |
else: # ctc (BPE)
|
| 329 |
-
#
|
| 330 |
try:
|
| 331 |
subs = align_vad(text, audio, sr, duration)
|
| 332 |
except Exception as e:
|
|
@@ -344,18 +365,21 @@ def pipeline(video_input, model_name):
|
|
| 344 |
return (f"❌ Erreur — {str(e)}", None)
|
| 345 |
|
| 346 |
# ----------------------------
|
| 347 |
-
# INTERFACE GRADIO
|
| 348 |
# ----------------------------
|
| 349 |
with gr.Blocks(title="RobotsMali - Sous-titrage") as demo:
|
| 350 |
-
gr.Markdown(" RobotsMali — Sous-titrage")
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
|
|
|
|
|
|
|
|
|
| 357 |
|
| 358 |
-
|
| 359 |
-
# demo.launch(share=True, debug=False)
|
| 360 |
-
demo.launch(share=True, debug=False)
|
| 361 |
|
|
|
|
|
|
|
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
"""
|
| 3 |
ROBOTSMALI — Sous-titrage Bambara
|
| 4 |
+
Correctif: Durée vidéo robuste (FFprobe + Fallback Audio)
|
| 5 |
"""
|
| 6 |
|
| 7 |
import os
|
|
|
|
| 52 |
return res.stdout
|
| 53 |
|
| 54 |
def ffprobe_duration(path):
    """Return the media duration of *path* in seconds, or None on failure.

    Reads the container-level metadata via ffprobe (deliberately no
    '-select_streams v:0', so web/webcam containers and audio-only files
    still report a duration).

    Parameters
    ----------
    path : str
        Path to the media file to probe.

    Returns
    -------
    float or None
        Duration in seconds, or None if ffprobe is unavailable, exits
        non-zero, or prints something that cannot be parsed as a float.
    """
    # List-argv form (no shell): immune to quoting/injection issues that a
    # string command built from `path` would have, even with shlex.quote.
    cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        path,
    ]
    try:
        out = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except OSError:
        # ffprobe binary missing or not executable — same contract as any
        # other probe failure: the caller falls back to the audio length.
        return None
    if out.returncode != 0:
        print(f"ffprobe warning: {out.stderr}")
        return None
    try:
        # ffprobe can emit several lines; the first one carries the duration.
        return float(out.stdout.strip().split("\n")[0])
    except (ValueError, IndexError):
        # Empty or non-numeric output (e.g. "N/A" for some live streams).
        return None
| 72 |
|
|
|
|
| 258 |
return output_path
|
| 259 |
|
| 260 |
# ----------------------------
|
| 261 |
+
# PIPELINE PRINCIPAL (FIXED)
|
| 262 |
# ----------------------------
|
| 263 |
def pipeline(video_input, model_name):
|
| 264 |
"""
|
|
|
|
| 266 |
model_name : clé dans MODELS
|
| 267 |
"""
|
| 268 |
try:
|
| 269 |
+
# Support Gradio dict (tmp_path)
|
| 270 |
if isinstance(video_input, dict) and "tmp_path" in video_input:
|
| 271 |
video_path = video_input["tmp_path"]
|
| 272 |
else:
|
| 273 |
video_path = video_input
|
| 274 |
|
| 275 |
+
# 1. Tentative d'obtention de durée via FFPROBE
|
| 276 |
duration = ffprobe_duration(video_path)
|
|
|
|
|
|
|
| 277 |
|
| 278 |
+
# 2. Extraction & Nettoyage Audio
|
| 279 |
tmp_fd, tmp_wav = tempfile.mkstemp(suffix=".wav")
|
| 280 |
os.close(tmp_fd)
|
| 281 |
|
|
|
|
| 282 |
extract_audio(video_path, tmp_wav)
|
| 283 |
clean_wav, audio, sr = clean_audio(tmp_wav)
|
| 284 |
|
| 285 |
+
# 3. FALLBACK: Si FFprobe a échoué (None), on calcule depuis l'audio
|
| 286 |
+
if duration is None:
|
| 287 |
+
print("[INFO] ffprobe duration failed, calculating from audio...")
|
| 288 |
+
if sr and sr > 0:
|
| 289 |
+
duration = len(audio) / sr
|
| 290 |
+
|
| 291 |
+
# Vérification finale
|
| 292 |
+
if not duration or duration <= 0:
|
| 293 |
+
raise RuntimeError("Impossible de déterminer la durée de la vidéo (fichier corrompu ?)")
|
| 294 |
+
|
| 295 |
+
print(f"[INFO] Durée détectée: {duration:.2f}s")
|
| 296 |
+
|
| 297 |
+
# 4. Chargement modèle + Transcription
|
| 298 |
model = load_model(model_name)
|
| 299 |
text = transcribe(model, clean_wav)
|
| 300 |
mode = MODELS[model_name][1]
|
| 301 |
|
| 302 |
+
# 5. Segmentation / Alignement
|
| 303 |
subs = None
|
| 304 |
if mode == "rnnt":
|
| 305 |
# RNNT : tentative de segmentation via logits + ctc_segmentation si dispo
|
|
|
|
| 312 |
ln = torch.tensor([x.shape[1]]).to(DEVICE)
|
| 313 |
with torch.no_grad():
|
| 314 |
logits = model(input_signal=x, input_signal_length=ln)[0]
|
| 315 |
+
|
| 316 |
time_per_frame = duration / max(1, logits.shape[1])
|
| 317 |
+
|
| 318 |
# build char list
|
| 319 |
try:
|
| 320 |
raw = model.tokenizer.vocab
|
| 321 |
vocab = list(raw.keys()) if isinstance(raw, dict) else list(raw)
|
| 322 |
except Exception:
|
| 323 |
vocab = None
|
| 324 |
+
|
| 325 |
cfg = CtcSegmentationParameters()
|
| 326 |
if vocab:
|
| 327 |
cfg.char_list = vocab
|
| 328 |
+
|
| 329 |
gt = prepare_text(cfg, words)[0]
|
| 330 |
try:
|
| 331 |
timing, _, _ = ctc_segmentation(cfg, logits.detach().cpu().numpy()[0], gt)
|
|
|
|
| 339 |
subs = align_vad(text, audio, sr, duration)
|
| 340 |
|
| 341 |
elif mode == "ctc_char":
|
| 342 |
+
# QuartzNet (char) : pas de tokenizer BPE, VAD fallback
|
|
|
|
| 343 |
try:
|
| 344 |
subs = align_vad(text, audio, sr, duration)
|
| 345 |
except Exception as e:
|
|
|
|
| 347 |
subs = align_vad(text, audio, sr, duration)
|
| 348 |
|
| 349 |
else: # ctc (BPE)
|
| 350 |
+
# Soloba CTC : VAD fallback
|
| 351 |
try:
|
| 352 |
subs = align_vad(text, audio, sr, duration)
|
| 353 |
except Exception as e:
|
|
|
|
| 365 |
return (f"❌ Erreur — {str(e)}", None)
|
| 366 |
|
| 367 |
# ----------------------------
|
| 368 |
+
# INTERFACE GRADIO
|
| 369 |
# ----------------------------
|
| 370 |
# Gradio front-end: upload or record a video, pick an ASR model, and get
# back a status message plus the subtitled video produced by `pipeline`.
with gr.Blocks(title="RobotsMali - Sous-titrage") as demo:
    gr.Markdown("## RobotsMali — Sous-titrage Bambara")

    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column():
            video_in = gr.Video(label="Vidéo à sous-titrer", sources=["upload", "webcam"])
            model_choice = gr.Dropdown(list(MODELS.keys()), value="Soloba V1 (CTC)", label="Modèle ASR")
            run_btn = gr.Button("▶️ Générer les sous-titres", variant="primary")
        # Right column: outputs written by the pipeline.
        with gr.Column():
            status_out = gr.Markdown(label="Statut")
            video_out = gr.Video(label="Vidéo sous-titrée")

    # Wire the button to the processing pipeline.
    run_btn.click(pipeline, [video_in, model_choice], [status_out, video_out])

if __name__ == "__main__":
    demo.launch(share=True, debug=True)
|