Update app.py
app.py CHANGED
@@ -2,32 +2,67 @@ import gradio as gr
 import numpy as np
 import torch
 import soundfile as sf
+import os
+import tempfile
 from moviepy.editor import VideoFileClip, CompositeVideoClip, ImageClip
 from PIL import Image, ImageDraw, ImageFont
 from nemo.collections import asr as nemo_asr
-from huggingface_hub import hf_hub_download
+from huggingface_hub import hf_hub_download, snapshot_download
 from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
 
 MODELS = {
     "Soloni V0": ("RobotsMali/soloni-114m-tdt-ctc-V0", "soloni-114m-tdt-ctc-V0.nemo", "rnnt"),
     "Soloni V1": ("RobotsMali/soloni-114m-tdt-ctc-V1", "soloni-114m-tdt-ctc-V1.nemo", "rnnt"),
     "Soloba V0": ("RobotsMali/soloba-ctc-0.6b-V0", None, "ctc"),
     "Soloba V1": ("RobotsMali/soloba-ctc-0.6b-V1", None, "ctc"),
     "QuartzNet V0": ("RobotsMali/stt-bm-quartznet15x5-V0", None, "ctc"),
     "QuartzNet V1": ("RobotsMali/stt-bm-quartznet15x5-V1", None, "ctc"),
 }
 
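Each MODELS entry maps a display name to a (repo id, optional .nemo filename, decoder family) triple, and pipeline() below unpacks it in that order. A minimal sketch of how the table is consumed (the print is illustrative only):

for name, (repo, nemo_file, mode) in MODELS.items():
    assert mode in ("rnnt", "ctc")
    print(f"{name}: repo={repo}, file={nemo_file}, mode={mode}")
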
+def load_ctc_model_safe(repo_id):
+    """Load CTC models robustly."""
+    try:
+        # Attempt 1: standard loading
+        return nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name=repo_id)
+    except Exception as e:
+        print(f"Erreur lors du chargement standard: {e}")
+
+    # Attempt 2: manual download via snapshot
+    try:
+        print("Tentative de téléchargement manuel...")
+        model_path = snapshot_download(
+            repo_id=repo_id,
+            cache_dir=tempfile.mkdtemp(),
+            local_dir_use_symlinks=False
+        )
+
+        # Look for the .nemo file
+        nemo_file = None
+        for file in os.listdir(model_path):
+            if file.endswith('.nemo'):
+                nemo_file = os.path.join(model_path, file)
+                break
+
+        if nemo_file and os.path.exists(nemo_file):
+            print(f"Chargement depuis: {nemo_file}")
+            return nemo_asr.models.EncDecCTCModelBPE.restore_from(nemo_file)
+        else:
+            raise FileNotFoundError("Fichier .nemo non trouvé dans le repo")
+
+    except Exception as e2:
+        print(f"Échec du téléchargement manuel: {e2}")
+        raise
 
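A usage sketch for the fallback loader; whether the first from_pretrained attempt succeeds depends on how each checkpoint was published, which is exactly the case the snapshot fallback covers. The repo id below is taken from MODELS above:

model = load_ctc_model_safe("RobotsMali/soloba-ctc-0.6b-V1")
model.eval()
print(type(model).__name__)
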
 def extract_audio(video_path, wav_path):
+    """Extract the audio track from the video."""
+    video = VideoFileClip(video_path)
+    video.audio.write_audiofile(
         wav_path, fps=16000, codec="pcm_s16le", verbose=False, logger=None
+    )
+    video.close()
 
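A quick smoke test for the extraction step (hypothetical input file; assumes ffmpeg is available to moviepy), confirming the 16 kHz rate that transcribe() relies on:

extract_audio("sample.mp4", "sample.wav")  # "sample.mp4" is a placeholder path
audio, sr = sf.read("sample.wav")
print(sr, audio.shape)  # expect sr == 16000
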
 def transcribe(model, device, wav, model_name):
+    """Transcribe the audio with word-level time alignment."""
     audio, sr = sf.read(wav)
     if audio.ndim == 2:
         audio = np.mean(audio, axis=1)
@@ -35,6 +70,7 @@ def transcribe(model, device, wav, model_name):
     ln = torch.tensor([x.shape[1]]).to(device)
     total_s = len(audio) / sr
 
+    # RNNT models (Soloni)
     if "Soloni" in model_name:
         with torch.no_grad():
             proc, plen = model.preprocessor(input_signal=x, input_signal_length=ln)
@@ -42,6 +78,7 @@ def transcribe(model, device, wav, model_name):
         hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
         return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
 
+    # CTC models (Soloba, QuartzNet)
     text = model.transcribe([wav])[0].strip()
     if not text:
         return []
@@ -50,6 +87,9 @@ def transcribe(model, device, wav, model_name):
     logits, logit_len = model.forward(input_signal=x, input_signal_length=ln)
 
     words = text.split()
+    if not words:
+        return []
+
     config = CtcSegmentationParameters()
     config.char_list = list(model.tokenizer.vocab.keys())
     gt, _ = prepare_text(config, words)
@@ -61,69 +101,144 @@ def transcribe(model, device, wav, model_name):
                timings[i+1] * tps if i+1 < len(timings) else total_s,
                words[i]) for i in range(len(words))]
 
+    # Group consecutive words into subtitle chunks
     grouped, temp = [], []
     for w in aligned:
         temp.append(w)
-        if len(temp) >= 4:
-            grouped.append(temp)
+        if len(temp) >= 4:  # groups of 4 words
+            grouped.append(temp)
+            temp = []
+    if temp:
+        grouped.append(temp)
 
     return [(g[0][0], g[-1][1], " ".join([w[2] for w in g])) for g in grouped]
 
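The grouping pass turns per-word spans into subtitle spans of at most four words, keeping any leftover tail as a final group. A self-contained worked example with hypothetical timings:

aligned = [(0.0, 0.3, "a"), (0.3, 0.6, "b"), (0.6, 0.9, "c"),
           (0.9, 1.2, "d"), (1.2, 1.5, "e")]  # hypothetical (start_s, end_s, word)
grouped, temp = [], []
for w in aligned:
    temp.append(w)
    if len(temp) >= 4:
        grouped.append(temp)
        temp = []
if temp:
    grouped.append(temp)
print([(g[0][0], g[-1][1], " ".join(w[2] for w in g)) for g in grouped])
# [(0.0, 1.2, 'a b c d'), (1.2, 1.5, 'e')]
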
 def burn(video, subs):
+    """Burn the subtitles into the video."""
     clip = VideoFileClip(video)
     W, H = clip.size
+
+    # Try to load a font
     try:
+        font_size = max(int(H/20), 20)  # minimum size
+        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", font_size)
     except:
+        try:
+            font = ImageFont.load_default()
+        except:
+            font = None
 
     layers = []
+    for start, end, text in subs:
+        # Build the subtitle image
+        img_height = int(H * 0.12)
+        img = Image.new("RGBA", (W, img_height), (0, 0, 0, 140))
         draw = ImageDraw.Draw(img)
+
+        if font:
+            bbox = draw.textbbox((0, 0), text, font=font)
+            tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
+            draw.text(((W - tw) // 2, (img_height - th) // 2), text, font=font, fill="white")
+        else:
+            # Fallback when no font is available
+            draw.text((W//2, img_height//2), text, fill="white", anchor="mm")
+
+        # Build the subtitle clip
+        subtitle_clip = ImageClip(np.array(img)).set_start(start).set_duration(end - start)
+        subtitle_clip = subtitle_clip.set_position(("center", int(H * 0.85)))
+        layers.append(subtitle_clip)
+
+    # Final composition
     final = CompositeVideoClip([clip] + layers)
+    out_path = "RobotsMali_Subtitled.mp4"
+
+    # Write out the final video
+    final.write_videofile(
+        out_path,
+        codec="libx264",
+        audio_codec="aac",
+        fps=clip.fps,
+        verbose=False,
+        logger=None,
+        temp_audiofile="temp-audio.m4a",
+        remove_temp=True
+    )
+
+    # Cleanup
+    clip.close()
+    final.close()
+    for layer in layers:
+        layer.close()
+
+    return out_path
 
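The subtitle banner is a semi-transparent RGBA strip 12% of the frame height, composited at 85% of the height. A PIL-only sketch of one banner, assuming a hypothetical 1280x720 frame and Pillow's default font (no DejaVu path):

from PIL import Image, ImageDraw

W, H = 1280, 720                 # hypothetical frame size
banner_h = int(H * 0.12)         # same 12% strip as burn()
img = Image.new("RGBA", (W, banner_h), (0, 0, 0, 140))  # semi-transparent black
draw = ImageDraw.Draw(img)
text = "i ni ce"                 # sample text
bbox = draw.textbbox((0, 0), text)
tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
draw.text(((W - tw) // 2, (banner_h - th) // 2), text, fill="white")
img.save("banner_preview.png")   # illustrative output path
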
 def pipeline(video_file, model_name):
+    """Main processing pipeline."""
     if video_file is None:
         return "Veuillez importer une vidéo.", None
 
     repo, nemo_file, mode = MODELS[model_name]
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+    try:
+        # Load the model
+        if mode == "rnnt":
+            nemo_path = hf_hub_download(repo, filename=nemo_file)
+            model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo_path)
+        else:
+            model = load_ctc_model_safe(repo)  # use the robust loader
+
+        model = model.to(device)
+        model.eval()
+
+        # Processing
+        wav_path = "audio.wav"
+        extract_audio(video_file, wav_path)
+        subs = transcribe(model, device, wav_path, model_name)
+        final_video = burn(video_file, subs)
+
+        # Clean up temporary files
+        if os.path.exists(wav_path):
+            os.remove(wav_path)
+
+        return "✅ Sous-titres générés avec succès!", final_video
+
+    except Exception as e:
+        print(f"Erreur dans le pipeline: {e}")
+        return f"❌ Erreur: {str(e)}", None
+
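For debugging outside the UI, the pipeline can be driven directly; "sample.mp4" is a placeholder path:

status, out_path = pipeline("sample.mp4", "Soloni V1")  # hypothetical local file
print(status, out_path)
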
+# Gradio interface
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🎙️ **RobotsMali — Sous-titrage automatique Bambara**
+    *Générez automatiquement des sous-titres en Bambara pour vos vidéos*
+    """)
+
+    with gr.Row():
+        with gr.Column():
+            video = gr.Video(label="Vidéo d'entrée", height=300)
+            model = gr.Dropdown(
+                list(MODELS.keys()),
+                value="Soloni V1",
+                label="Modèle de reconnaissance vocale",
+                info="Soloni: plus précis • Soloba/QuartzNet: plus rapide"
+            )
+            btn = gr.Button("⚡ Générer les sous-titres", variant="primary")
+
+        with gr.Column():
+            status = gr.Markdown("Prêt à traiter...")
+            out = gr.Video(label="Vidéo sous-titrée", height=300)
+
+    # Examples
+    gr.Examples(
+        examples=[],
+        inputs=[video, model],
+        outputs=[status, out],
+        fn=pipeline,
+        cache_examples=False,
+    )
+
     btn.click(pipeline, inputs=[video, model], outputs=[status, out])
 
+if __name__ == "__main__":
+    demo.launch(share=True, server_port=7860)