Spaces:

RobotsMali
/

RobotsMali_Video_captionning

Running

App Files Community

binaryMao committed on Oct 31, 2025

Commit

0456de7

verified ·

1 Parent(s): 605a27b

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -8

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import gradio as gr
-import os
 import numpy as np
 import torch
 import soundfile as sf
@@ -26,7 +25,7 @@ MODELS = {
 # =============================
-# EXTRACTION AUDIO (FIABLE + COMPATIBLE HF & COLAB)
 # =============================
 def extract_audio(video_path, wav_path):
@@ -65,7 +64,7 @@ def transcribe(model, device, wav, model_name):
         hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
         return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
-    # === Soloba / QuartzNet → Forced Alignment CTC ===
     text = model.transcribe([wav])[0].strip()
     if not text:
         return []
@@ -85,6 +84,7 @@ def transcribe(model, device, wav, model_name):
                 timings[i+1] * tps if i+1 < len(timings) else total_s,
                 words[i]) for i in range(len(words))]
     grouped, temp = [], []
     for w in aligned:
         temp.append(w)
@@ -122,13 +122,14 @@ def burn(video, subs):
     final = CompositeVideoClip([clip] + layers)
     out = "RobotsMali_Subtitled.mp4"
     final.write_videofile(out, codec="libx264", audio_codec="aac", fps=clip.fps, verbose=False, logger=None)
     clip.close()
     final.close()
     return out
 # =============================
-# PIPELINE
 # =============================
 def pipeline(video_file, model_name):
@@ -136,7 +137,15 @@ def pipeline(video_file, model_name):
         return "Veuillez importer une vidéo.", None
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model = nemo_asr.models.ASRModel.from_pretrained(MODELS[model_name]).to(device)
     wav = "audio.wav"
     extract_audio(video_file, wav)
@@ -147,17 +156,17 @@ def pipeline(video_file, model_name):
 # =============================
-# INTERFACE (inchangée)
 # =============================
 with gr.Blocks() as demo:
-    gr.Markdown("# 🎙️ **RobotsMali - Sous-titrage Bambara Automatique**")
     video = gr.Video(label="Vidéo")
     model = gr.Dropdown(list(MODELS.keys()), value="Soloni V1", label="Modèle")
     btn = gr.Button("⚡ Générer les sous-titres")
     status = gr.Markdown()
-    out = gr.Video(label="Résultat")
     btn.click(pipeline, inputs=[video, model], outputs=[status, out])

 import gradio as gr
 import numpy as np
 import torch
 import soundfile as sf
 # =============================
+# EXTRACTION AUDIO (SOLIDE & COMPATIBLE HF)
 # =============================
 def extract_audio(video_path, wav_path):
         hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
         return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
+    # === Soloba & QuartzNet → CTC Forced Alignment ===
     text = model.transcribe([wav])[0].strip()
     if not text:
         return []
                 timings[i+1] * tps if i+1 < len(timings) else total_s,
                 words[i]) for i in range(len(words))]
+    # Groupage lisible (max 4 mots par sous-titre)
     grouped, temp = [], []
     for w in aligned:
         temp.append(w)
     final = CompositeVideoClip([clip] + layers)
     out = "RobotsMali_Subtitled.mp4"
     final.write_videofile(out, codec="libx264", audio_codec="aac", fps=clip.fps, verbose=False, logger=None)
     clip.close()
     final.close()
     return out
 # =============================
+# PIPELINE PRINCIPAL
 # =============================
 def pipeline(video_file, model_name):
         return "Veuillez importer une vidéo.", None
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    # Chargement correct selon le modèle
+    if "Soloni" in model_name:
+        model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(model_name=MODELS[model_name])
+    else:
+        model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name=MODELS[model_name])
+    model = model.to(device)
+    model.eval()
     wav = "audio.wav"
     extract_audio(video_file, wav)
 # =============================
+# INTERFACE (DESIGN CONSERVÉ)
 # =============================
 with gr.Blocks() as demo:
+    gr.Markdown("# 🎙️ **RobotsMali — Sous-titrage automatique Bambara**")
     video = gr.Video(label="Vidéo")
     model = gr.Dropdown(list(MODELS.keys()), value="Soloni V1", label="Modèle")
     btn = gr.Button("⚡ Générer les sous-titres")
     status = gr.Markdown()
+    out = gr.Video(label="Résultat (avec sous-titres)")
     btn.click(pipeline, inputs=[video, model], outputs=[status, out])