Spaces:

RobotsMali
/

RobotsMali_Video_captionning

Running

App Files Files Community

binaryMao committed 28 days ago

Commit

93438d8

verified ·

1 Parent(s): bde1ae6

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -75

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-import os, shlex, subprocess, tempfile, traceback, textwrap, time
 import torch
 from huggingface_hub import snapshot_download
 from nemo.collections import asr as nemo_asr
@@ -17,22 +17,20 @@ MODELS = {
     "QuartzNet V0 (CTC-char)": ("RobotsMali/stt-bm-quartznet15x5-v0", "ctc_char"),
 }
# 2. PATH HANDLING (fix for the example-loading bug)
def get_absolute_example():
    """Return the first existing location of the demo video, or None.

    The candidates are checked in order: two paths resolved against the
    current working directory, then two fixed paths under /home/user/app.

    Returns:
        The first candidate for which ``os.path.exists`` is true, or
        ``None`` when none of them exists.
    """
    candidates = (
        os.path.abspath("MARALINKE.mp4"),
        os.path.abspath("examples/MARALINKE.mp4"),
        "/home/user/app/MARALINKE.mp4",
        "/home/user/app/examples/MARALINKE.mp4",
    )
    # next() with a default stops at the first hit and yields None otherwise.
    return next((path for path in candidates if os.path.exists(path)), None)
 EXAMPLE_PATH = get_absolute_example()
 _cache = {}
-# 3. MOTEUR IA NEMO
 def load_model(name):
     if name in _cache: return _cache[name]
     _cache.clear()
@@ -53,65 +51,74 @@ def load_model(name):
     _cache[name] = model
     return model
# 4. SYNCHRONISATION UTILITIES
def format_ts(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is converted to whole milliseconds once (rounded to the
    nearest millisecond) and then split with integer arithmetic, so binary
    floating-point error cannot drop a millisecond (e.g. 1.15 -> ``,150``).

    Args:
        seconds: Duration in seconds (int or float). Negative values are
            clamped to zero because SRT has no negative timestamps.

    Returns:
        The formatted timestamp. Hours are not wrapped at 24, so durations
        of a day or more keep counting up (``25:00:00,000``).
    """
    total_ms = max(0, round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
def get_real_duration(file_path):
    """Return the media duration in seconds as reported by ffprobe.

    This is a best-effort probe: any failure to run ffprobe or to parse
    its output yields 0.0 instead of raising.

    Args:
        file_path: Path of the media file to inspect.

    Returns:
        The duration as a float, or 0.0 when ffprobe cannot be started or
        its output is not a number (for example a missing file).
    """
    # An argument list with the default shell=False passes the path to
    # ffprobe verbatim, so no shell quoting is needed.
    cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        file_path,
    ]
    try:
        res = subprocess.run(cmd, capture_output=True, text=True)
        return float(res.stdout.strip())
    except (OSError, ValueError):
        # OSError: ffprobe is missing or not executable.
        # ValueError: empty or non-numeric output.
        return 0.0
-# 5. PIPELINE DE TRAITEMENT
 def pipeline(video_in, model_name):
     try:
-        if not video_in: return "❌ Erreur : Aucune vidéo détectée.", None
-        # Étape A : Extraction Audio
-        yield "⏳ Extraction de l'audio...", None
-        wav_path = os.path.abspath("temp.wav")
-        subprocess.run(f"ffmpeg -y -i {shlex.quote(video_in)} -vn -ac 1 -ar 16000 {wav_path}", shell=True, check=True)
-        duration = get_real_duration(video_in)
-        # Étape B : Transcription avec Offsets (Alignement Natif)
-        yield f"⏳ Transcription IA ({model_name}) avec alignement...", None
         model = load_model(model_name)
-        # Utilisation de return_hypotheses pour récupérer les timestamps CTC
-        hypotheses = model.transcribe([wav_path], return_hypotheses=True)[0]
-        words_with_ts = []
-        if hasattr(hypotheses, 'word_offsets') and hypotheses.word_offsets:
-            offsets = hypotheses.word_offsets
-            words = hypotheses.text.split()
-            # Facteur 0.02 (Stride de NeMo) pour convertir frames en secondes
-            for i, word in enumerate(words):
-                t_start = offsets[i] * 0.02
-                words_with_ts.append({"word": word, "start": t_start, "end": t_start + 0.4})
-        else:
-            # Fallback temporel linéaire si les offsets ne sont pas disponibles (RNNT)
-            words = (hypotheses.text if hasattr(hypotheses, 'text') else str(hypotheses)).split()
-            for i, w in enumerate(words):
-                words_with_ts.append({"word": w, "start": (i/len(words))*duration, "end": ((i+1)/len(words))*duration})
-        # Étape C : Création du SRT segmenté
-        yield "⏳ Génération des segments synchronisés...", None
-        srt_path = os.path.abspath("output.srt")
         words_per_line = 6
         with open(srt_path, "w", encoding="utf-8") as f:
-            for i in range(0, len(words_with_ts), words_per_line):
-                chunk = words_with_ts[i:i+words_per_line]
-                start_time = chunk[0]['start']
-                end_time = chunk[-1]['end'] + 0.5
-                f.write(f"{(i//words_per_line)+1}\n{format_ts(start_time)} --> {format_ts(end_time)}\n")
-                f.write(" ".join([w['word'] for w in chunk]) + "\n\n")
-        # Étape D : Encodage et "Burn-in"
-        yield "⏳ Incrustation des sous-titres (FastStart)...", None
-        out_path = os.path.abspath(f"resultat_{int(time.time())}.mp4")
         cmd_ffmpeg = (
             f"ffmpeg -y -i {shlex.quote(video_in)} "
             f"-vf \"subtitles={shlex.quote(srt_path)}:force_style='Alignment=2,FontSize=20,PrimaryColour=&H00FFFF&'\" "
@@ -119,38 +126,33 @@ def pipeline(video_in, model_name):
         )
         subprocess.run(cmd_ffmpeg, shell=True, check=True)
-        yield "✅ Terminé avec succès !", out_path
     except Exception as e:
         traceback.print_exc()
         yield f"❌ Erreur : {str(e)}", None
# 6. GRADIO INTERFACE (webcam support + example-loading fix)
with gr.Blocks(
    theme=gr.themes.Soft(),
    css="body {background-color: #0b1120;}",
) as demo:
    # Page banner.
    gr.HTML(value="<h1 style='text-align:center; color:#facc15;'>🤖 ROBOTSMALI V10.5</h1>")

    with gr.Row():
        # Left column: input video, optional demo loader, model choice, run button.
        with gr.Column():
            gr.Markdown(value="### 📥 SOURCE")
            # Accepts both file upload and webcam capture.
            v_in = gr.Video(
                label="Webcam ou Fichier",
                sources=["upload", "webcam"],
                interactive=True,
            )
            # The demo button is only created when an example video was found.
            if EXAMPLE_PATH:
                btn_demo = gr.Button(
                    value="📂 CHARGER LA VIDÉO D'EXEMPLE",
                    variant="secondary",
                )
            m_sel = gr.Dropdown(
                choices=list(MODELS),
                value="Soloba V1 (CTC)",
                label="Modèle IA",
            )
            btn_run = gr.Button(value="🚀 GÉNÉRER", variant="primary")

        # Right column: status text and the resulting video.
        with gr.Column():
            gr.Markdown(value="### 📤 RÉSULTAT")
            status = gr.Markdown(value="### État\nPrêt")
            v_out = gr.Video(label="Vidéo finale synchronisée")

    # Event wiring.
    if EXAMPLE_PATH:
        # Clicking the demo button puts the example path into the input video.
        btn_demo.click(fn=lambda: EXAMPLE_PATH, outputs=v_in)
    btn_run.click(fn=pipeline, inputs=[v_in, m_sel], outputs=[status, v_out])

if __name__ == "__main__":
    demo.launch(share=True, debug=True)

 # -*- coding: utf-8 -*-
+import os, shlex, subprocess, tempfile, traceback, textwrap, time, glob
 import torch
 from huggingface_hub import snapshot_download
 from nemo.collections import asr as nemo_asr
     "QuartzNet V0 (CTC-char)": ("RobotsMali/stt-bm-quartznet15x5-v0", "ctc_char"),
 }
# 2. LOCATING THE EXAMPLE VIDEO
def get_absolute_example():
    """Return the absolute path of the first example video found, or None.

    Every known file name is tried inside every known directory. The
    directories form the outer loop, so an earlier directory always wins
    over a later one whatever the file name.

    Returns:
        ``os.path.abspath`` of the first existing candidate, or ``None``
        when no candidate exists.
    """
    file_names = ("MARALINKE.mp4", "maralinke.mp4", "example.mp4")
    search_dirs = (".", "examples", "/home/user/app", "/home/user/app/examples")
    # Lazy generator: the filesystem is only probed until the first hit.
    matches = (
        os.path.abspath(os.path.join(folder, file_name))
        for folder in search_dirs
        for file_name in file_names
        if os.path.exists(os.path.join(folder, file_name))
    )
    return next(matches, None)
 EXAMPLE_PATH = get_absolute_example()
 _cache = {}
+# 3. CHARGEMENT DES MODÈLES IA
 def load_model(name):
     if name in _cache: return _cache[name]
     _cache.clear()
     _cache[name] = model
     return model
# 4. SRT TIMESTAMP FORMATTING UTILITY
def format_srt_time(sec):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is converted to whole milliseconds once (rounded to the
    nearest millisecond) and then split with integer arithmetic, so binary
    floating-point error cannot drop a millisecond (e.g. 1.15 -> ``,150``).

    Args:
        sec: Duration in seconds (int or float). Negative values are
            clamped to zero because SRT has no negative timestamps.

    Returns:
        The formatted timestamp. Hours are not wrapped at 24, so durations
        of a day or more keep counting up (``25:00:00,000``).
    """
    total_ms = max(0, round(sec * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{millis:03d}"
+# 5. PIPELINE DE TRAITEMENT (SEGMENTATION 10S + OFFSETS)
 def pipeline(video_in, model_name):
+    tmp_dir = tempfile.mkdtemp()
     try:
+        if not video_in: return "❌ Erreur : Source vide", None
+        # Étape A : Extraction et Segmentation Audio
+        yield "⏳ Découpage de l'audio en segments de 10s...", None
+        full_wav = os.path.join(tmp_dir, "full.wav")
+        subprocess.run(f"ffmpeg -y -i {shlex.quote(video_in)} -vn -ac 1 -ar 16000 {full_wav}", shell=True, check=True)
+        segment_pattern = os.path.join(tmp_dir, "seg_%03d.wav")
+        subprocess.run(f"ffmpeg -i {full_wav} -f segment -segment_time 10 -c copy {segment_pattern}", shell=True, check=True)
+        audio_segments = sorted(glob.glob(os.path.join(tmp_dir, "seg_*.wav")))
         model = load_model(model_name)
+        # Étape B : Transcription segmentée avec Offsets natifs
+        all_words_ts = []
+        for idx, seg_path in enumerate(audio_segments):
+            base_time = idx * 10.0
+            yield f"⏳ IA : Transcription segment {idx+1}/{len(audio_segments)}...", None
+            # Utilisation de return_hypotheses pour les timestamps
+            hyp = model.transcribe([seg_path], return_hypotheses=True)[0]
+            if hasattr(hyp, 'word_offsets') and hyp.word_offsets:
+                words = hyp.text.split()
+                for i, word in enumerate(words):
+                    # Facteur de conversion frame->seconde (standard NeMo 0.02)
+                    rel_start = hyp.word_offsets[i] * 0.02
+                    all_words_ts.append({
+                        "word": word,
+                        "start": base_time + rel_start,
+                        "end": base_time + rel_start + 0.45
+                    })
+            else:
+                # Fallback temporel si offsets non dispos
+                words = (hyp.text if hasattr(hyp, 'text') else str(hyp)).split()
+                if words:
+                    gap = 10.0 / len(words)
+                    for i, w in enumerate(words):
+                        all_words_ts.append({
+                            "word": w,
+                            "start": base_time + (i * gap),
+                            "end": base_time + ((i+1) * gap)
+                        })
+        # Étape C : Génération du SRT optimisé
+        yield "⏳ Création du fichier de sous-titres...", None
+        srt_path = os.path.join(tmp_dir, "final.srt")
         words_per_line = 6
         with open(srt_path, "w", encoding="utf-8") as f:
+            for i in range(0, len(all_words_ts), words_per_line):
+                chunk = all_words_ts[i:i+words_per_line]
+                f.write(f"{(i//words_per_line)+1}\n")
+                f.write(f"{format_srt_time(chunk[0]['start'])} --> {format_srt_time(chunk[-1]['end'])}\n")
+                f.write(" ".join([c['word'] for c in chunk]) + "\n\n")
+        # Étape D : Incrustation Finale (Burn-in)
+        yield "⏳ Rendu vidéo final...", None
+        out_path = os.path.abspath(f"robotsmali_final_{int(time.time())}.mp4")
         cmd_ffmpeg = (
             f"ffmpeg -y -i {shlex.quote(video_in)} "
             f"-vf \"subtitles={shlex.quote(srt_path)}:force_style='Alignment=2,FontSize=20,PrimaryColour=&H00FFFF&'\" "
         )
         subprocess.run(cmd_ffmpeg, shell=True, check=True)
+        yield "✅ Synchronisation parfaite terminée !", out_path
     except Exception as e:
         traceback.print_exc()
         yield f"❌ Erreur : {str(e)}", None
# 6. GRADIO USER INTERFACE
with gr.Blocks(
    theme=gr.themes.Soft(),
    css="body {background-color: #0b1120;}",
) as demo:
    # Page banner and subtitle.
    gr.HTML(value="<h1 style='text-align:center; color:#facc15;'>🤖 ROBOTSMALI V12.5</h1>")
    gr.Markdown(value="<p style='text-align:center; color:white;'>Segmentation 10s + Offsets Natifs NeMo</p>")

    with gr.Row():
        # Left column: input video, model choice, run button, optional example.
        with gr.Column():
            v_in = gr.Video(
                label="Source (Webcam ou Fichier)",
                sources=["upload", "webcam"],
                interactive=True,
            )
            m_sel = gr.Dropdown(
                choices=list(MODELS),
                value="Soloba V1 (CTC)",
                label="Modèle IA",
            )
            btn_run = gr.Button(value="🚀 GÉNÉRER SOUS-TITRES", variant="primary")
            # The example section is only shown when an example video was found.
            if EXAMPLE_PATH:
                gr.Markdown(value="### 💡 Exemple")
                gr.Examples(
                    examples=[[EXAMPLE_PATH, "Soloba V1 (CTC)"]],
                    inputs=[v_in, m_sel],
                )

        # Right column: status text and the resulting video.
        with gr.Column():
            status = gr.Markdown(value="### État\nPrêt")
            v_out = gr.Video(label="Résultat Final")

    # Run the pipeline on click; its outputs go to the status and result widgets.
    btn_run.click(fn=pipeline, inputs=[v_in, m_sel], outputs=[status, v_out])

if __name__ == "__main__":
    demo.launch(debug=True)