Spaces:

RobotsMali
/

RobotsMali_Video_captionning

Running

App Files Community

binaryMao committed on Nov 2, 2025

Commit

f6e735c

verified ·

1 Parent(s): e18b5e6

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -77

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-"""ROBOTSMALI VIDEO CAPTIONING V8 - MINIMALIST BLUE (STABLE VERSION)"""
 import gradio as gr
 import numpy as np
@@ -7,57 +7,51 @@ import torch
 import soundfile as sf
 import os
 import tempfile
-import warnings
 from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
 from typing import List, Tuple
-from huggingface_hub import hf_hub_download, snapshot_download
-# ------------------------------------------------------------
-# Import NeMo
-# ------------------------------------------------------------
 try:
     from nemo.collections import asr as nemo_asr
     from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
     NEMO_LOADED = True
-except Exception as e:
-    print("❌ ERREUR : NeMo ou ctc-segmentation non installé.")
     NEMO_LOADED = False
 # ------------------------------------------------------------
-# Modèles RobotsMali
 # ------------------------------------------------------------
 MODELS = {
-    "Soloni V1 (RNnT - Précis)": ("RobotsMali/soloni-114m-tdt-ctc-V1", "soloni-114m-tdt-ctc-V1.nemo", "rnnt"),
-    "Soloba V1 (CTC - Équilibré)": ("RobotsMali/soloba-ctc-0.6b-V1", None, "ctc"),
-    "QuartzNet V1 (CTC - Rapide)": ("RobotsMali/stt-bm-quartznet15x5-V1", None, "ctc"),
 }
 asr_pipeline = {}
 # ------------------------------------------------------------
-# Chargement modèle robuste
 # ------------------------------------------------------------
-def load_ctc_model_safe(repo_id):
-    try:
-        return nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name=repo_id)
-    except:
-        with tempfile.TemporaryDirectory() as tmpdir:
-            path = snapshot_download(repo_id, cache_dir=tmpdir)
-            for f in os.listdir(path):
-                if f.endswith(".nemo"):
-                    return nemo_asr.models.EncDecCTCModelBPE.restore_from(os.path.join(path, f))
-        raise RuntimeError("Impossible de charger le modèle CTC.")
 def load_asr_model(model_name):
-    repo_id, nemo_file, mode = MODELS[model_name]
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     if model_name not in asr_pipeline:
-        if mode == "rnnt":
-            nemo_path = hf_hub_download(repo_id, filename=nemo_file)
             model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo_path)
-        else:
-            model = load_ctc_model_safe(repo_id)
         model.to(device).eval()
         asr_pipeline[model_name] = model
@@ -65,7 +59,7 @@ def load_asr_model(model_name):
     return asr_pipeline[model_name]
 # ------------------------------------------------------------
-# Groupage des mots en sous-titres
 # ------------------------------------------------------------
 MAX_WORDS = 4
 MAX_CHARS = 45
@@ -73,59 +67,49 @@ MAX_DURATION = 3.5
 def group_words(words):
     subs, group = [], []
-    def commit(g):
-        if g:
-            subs.append((g[0][0], g[-1][1], " ".join([w[2] for w in g])))
     for w in words:
         test = group + [w]
-        text = " ".join([t[2] for t in test])
         duration = test[-1][1] - test[0][0]
         if len(test) > MAX_WORDS or len(text) > MAX_CHARS or duration > MAX_DURATION:
-            commit(group)
-            group = [w]
         else:
-            group.append(w)
-    commit(group)
     return subs
 # ------------------------------------------------------------
-# Transcription + Alignement
 # ------------------------------------------------------------
-def transcribe(model, device, wavfile, model_name):
-    audio, sr = sf.read(wavfile)
-    if audio.ndim == 2: audio = np.mean(audio, axis=1)
     x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
     ln = torch.tensor([x.shape[1]]).to(device)
     total_s = len(audio) / sr
-    # RNNT direct timestamps
     if "Soloni" in model_name:
         hyps = model.decode_and_align(*model.preprocessor(input_signal=x, input_signal_length=ln))
         words = [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyps[0][0].words]
         return group_words(words)
-    # CTC + segmentation
-    text = model.transcribe([wavfile])[0]
-    if not text.strip(): return []
     with torch.no_grad(): logits, loglen = model(x, ln)
-    words = text.strip().split()
-    cfg = CtcSegmentationParameters()
-    cfg.char_list = list(model.tokenizer.vocab.keys())
     gt, _ = prepare_text(cfg, words)
-    timings, _, _ = ctc_segmentation(cfg, logits.cpu().numpy()[0], gt)
     tps = total_s / loglen.cpu().numpy()[0]
-    aligned = [(timings[i]*tps,
-                timings[i+1]*tps if i+1 < len(timings) else total_s,
-                words[i]) for i in range(len(words))]
     return group_words(aligned)
 # ------------------------------------------------------------
-# Extraction audio
 # ------------------------------------------------------------
 def extract_audio(video, wav):
     v = VideoFileClip(video)
@@ -133,19 +117,25 @@ def extract_audio(video, wav):
     v.close()
 # ------------------------------------------------------------
-# Burn subtitles
 # ------------------------------------------------------------
 def burn(video, subs):
     output = "RobotsMali_Subtitled.mp4"
     clip = VideoFileClip(video)
     W, H = clip.size
-    layers = []
-    for start, end, text in subs:
         txt = TextClip(
-            text.upper(), fontsize=H//20, color='white', bg_color='rgba(0,0,0,0.7)',
-            method='caption', size=(W*0.9, None)
-        ).set_pos(("center", H*0.85)).set_duration(end-start).set_start(start)
         layers.append(txt)
     final = CompositeVideoClip([clip] + layers)
@@ -154,17 +144,15 @@ def burn(video, subs):
     return output
 # ------------------------------------------------------------
-# PIPELINE STABLE (PAS DE YIELD)
 # ------------------------------------------------------------
 def pipeline(video_file, model_name):
-    if video_file is None:
-        return "⚠️ Importez une vidéo.", None
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    status = f"🧠 Chargement du modèle {model_name}..."
     try:
         model = load_asr_model(model_name)
         status += "\n🎶 Extraction audio..."
         wav = os.path.join(tempfile.gettempdir(), "audio.wav")
         extract_audio(video_file, wav)
@@ -173,12 +161,11 @@ def pipeline(video_file, model_name):
         subs = transcribe(model, device, wav, model_name)
         if not subs: return "⚠️ Aucun mot détecté.", None
-        status += "\n🎬 Sous-titrage..."
         out = burn(video_file, subs)
         if os.path.exists(wav): os.remove(wav)
         status += "\n✅ Terminé !"
         return status, out
     except Exception as e:
@@ -187,14 +174,13 @@ def pipeline(video_file, model_name):
 # ------------------------------------------------------------
 # Interface
 # ------------------------------------------------------------
-with gr.Blocks() as demo:
-    gr.Markdown("# ⚡ ROBOTSMALI V8 — MINIMALIST BLUE")
-    video = gr.Video(label="Importer une vidéo")
     model = gr.Dropdown(list(MODELS.keys()), value="Soloni V1 (RNnT - Précis)")
     run = gr.Button("▶️ PRODUIRE")
     status = gr.Markdown()
-    out = gr.Video()
-    run.click(pipeline, inputs=[video, model], outputs=[status, out])
 demo.launch(share=True)

 # -*- coding: utf-8 -*-
+"""ROBOTSMALI VIDEO CAPTIONING V8 — MINIMALIST BLUE + NETFLIX SUBTITLES"""
 import gradio as gr
 import numpy as np
 import soundfile as sf
 import os
 import tempfile
 from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
+from huggingface_hub import snapshot_download
 from typing import List, Tuple
 try:
     from nemo.collections import asr as nemo_asr
     from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
     NEMO_LOADED = True
+except:
     NEMO_LOADED = False
 # ------------------------------------------------------------
+# MODELS (corrigés)
 # ------------------------------------------------------------
 MODELS = {
+    "Soloni V1 (RNnT - Précis)": ("RobotsMali/soloni-114m-tdt-ctc-v1", "rnnt"),
+    "Soloba V1 (CTC - Équilibré)": ("RobotsMali/soloba-ctc-0.6b-v1", "ctc"),
+    "QuartzNet V1 (CTC - Rapide)": ("RobotsMali/stt-bm-quartznet15x5-v1", "ctc"),
 }
 asr_pipeline = {}
 # ------------------------------------------------------------
+# Chargement automatique du modèle (.nemo auto-detect)
 # ------------------------------------------------------------
 def load_asr_model(model_name):
+    repo_id, mode = MODELS[model_name]
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     if model_name not in asr_pipeline:
+        repo_path = snapshot_download(repo_id, local_dir_use_symlinks=False)
+        nemo_path = None
+        for f in os.listdir(repo_path):
+            if f.endswith(".nemo"):
+                nemo_path = os.path.join(repo_path, f)
+                break
+        if nemo_path is None:
+            raise FileNotFoundError(f"Aucun .nemo trouvé dans {repo_id}")
+        try:
             model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo_path)
+        except:
+            model = nemo_asr.models.EncDecCTCModelBPE.restore_from(nemo_path)
         model.to(device).eval()
         asr_pipeline[model_name] = model
     return asr_pipeline[model_name]
 # ------------------------------------------------------------
+# Paramètres de découpage
 # ------------------------------------------------------------
 MAX_WORDS = 4
 MAX_CHARS = 45
 def group_words(words):
     subs, group = [], []
+    def push(g):
+        if g: subs.append((g[0][0], g[-1][1], " ".join([w[2] for w in g])))
     for w in words:
         test = group + [w]
+        text = " ".join([x[2] for x in test])
         duration = test[-1][1] - test[0][0]
         if len(test) > MAX_WORDS or len(text) > MAX_CHARS or duration > MAX_DURATION:
+            push(group); group = [w]
         else:
+            group = test
+    push(group)
     return subs
 # ------------------------------------------------------------
+# Transcription + alignement
 # ------------------------------------------------------------
+def transcribe(model, device, wav, model_name):
+    audio, sr = sf.read(wav)
+    if audio.ndim == 2: audio = audio.mean(1)
     x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
     ln = torch.tensor([x.shape[1]]).to(device)
     total_s = len(audio) / sr
     if "Soloni" in model_name:
         hyps = model.decode_and_align(*model.preprocessor(input_signal=x, input_signal_length=ln))
         words = [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyps[0][0].words]
         return group_words(words)
+    text = model.transcribe([wav])[0].strip()
+    if not text: return []
     with torch.no_grad(): logits, loglen = model(x, ln)
+    words = text.split()
+    cfg = CtcSegmentationParameters(); cfg.char_list = list(model.tokenizer.vocab.keys())
     gt, _ = prepare_text(cfg, words)
+    timing, _, _ = ctc_segmentation(cfg, logits.cpu().numpy()[0], gt)
     tps = total_s / loglen.cpu().numpy()[0]
+    aligned = [(timing[i]*tps, timing[i+1]*tps if i+1<len(timing) else total_s, words[i]) for i in range(len(words))]
     return group_words(aligned)
 # ------------------------------------------------------------
+# Extraction Audio
 # ------------------------------------------------------------
 def extract_audio(video, wav):
     v = VideoFileClip(video)
     v.close()
 # ------------------------------------------------------------
+# Sous-titres Style Netflix
 # ------------------------------------------------------------
 def burn(video, subs):
     output = "RobotsMali_Subtitled.mp4"
     clip = VideoFileClip(video)
     W, H = clip.size
+    layers = []
+    for s, e, t in subs:
         txt = TextClip(
+            t.upper(),
+            fontsize=H//18,
+            stroke_width=3,
+            stroke_color="black",
+            color="white",
+            method="caption",
+            size=(W*0.85, None),
+            bg_color="rgba(0,0,0,0.45)"
+        ).set_start(s).set_duration(e-s).set_pos(("center", H*0.82))
         layers.append(txt)
     final = CompositeVideoClip([clip] + layers)
     return output
 # ------------------------------------------------------------
+# Pipeline
 # ------------------------------------------------------------
 def pipeline(video_file, model_name):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    status = f"🧠 Chargement modèle sur {device}..."
     try:
         model = load_asr_model(model_name)
         status += "\n🎶 Extraction audio..."
         wav = os.path.join(tempfile.gettempdir(), "audio.wav")
         extract_audio(video_file, wav)
         subs = transcribe(model, device, wav, model_name)
         if not subs: return "⚠️ Aucun mot détecté.", None
+        status += "\n🎬 Sous-titres Netflix..."
         out = burn(video_file, subs)
         if os.path.exists(wav): os.remove(wav)
         status += "\n✅ Terminé !"
         return status, out
     except Exception as e:
 # ------------------------------------------------------------
 # Interface
 # ------------------------------------------------------------
+with gr.Blocks(title="RobotsMali V8") as demo:
+    gr.Markdown("# ⚡ ROBOTSMALI V8 — Minimalist Blue + Netflix Subtitles")
+    video = gr.Video()
     model = gr.Dropdown(list(MODELS.keys()), value="Soloni V1 (RNnT - Précis)")
     run = gr.Button("▶️ PRODUIRE")
     status = gr.Markdown()
+    result = gr.Video()
+    run.click(pipeline, inputs=[video, model], outputs=[status, result])
 demo.launch(share=True)