Update app.py
app.py (CHANGED)
--- a/app.py
@@ -1,134 +1,112 @@
-import os, warnings, logging, tempfile
-warnings.filterwarnings("ignore")
-logging.getLogger("nemo_logger").setLevel(logging.ERROR)
-
-import torch
-torch.set_grad_enabled(False)
-
 import gradio as gr
 import numpy as np
 import soundfile as sf
-
 from moviepy.editor import VideoFileClip, CompositeVideoClip, ImageClip
 from PIL import Image, ImageDraw, ImageFont

 from nemo.collections import asr as nemo_asr
 from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text

-#
-SR = 16000
-MAX_VIDEO_BYTES = 200_000_000

-ASR_MODELS = {
-    "Soloni 114M TDT CTC V0": "RobotsMali/soloni-114m-tdt-ctc-V0",
-    "Soloni 114M TDT CTC V1": "RobotsMali/soloni-114m-tdt-ctc-v1",
-    "QuartzNet BM V0": "RobotsMali/stt-bm-quartznet15x5-V0",
-    "QuartzNet BM V1": "RobotsMali/stt-bm-quartznet15x5-V1"
-}
-_CACHE = {}

-# ---------------- LOAD MODEL ---------------- #
-def load_model(name):
-    if name in _CACHE:
-        return _CACHE[name]
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    model = nemo_asr.models.ASRModel.from_pretrained(
-        model_name=ASR_MODELS[name]
-    ).to(device).eval()
-    _CACHE[name] = (model, device)
-    return model, device

-# ---------------- EXTRACT AUDIO ---------------- #
 def extract_audio(video_path, wav_path):

-    audio, sr = sf.read(wav_path)
     if audio.ndim == 2:
         audio = np.mean(audio, axis=1)
-    total_s = len(audio) / sr

     x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
     ln = torch.tensor([x.shape[1]]).to(device)

-    #
-    if "Soloni" in
     with torch.no_grad():
-        logits,

-    text = model.transcribe([wav_path])[0].text.strip()
     words = text.split()
-    if not words:
-        return []

     config = CtcSegmentationParameters()
     config.char_list = list(model.tokenizer.vocab.keys())

-        config,
-        logits.cpu().numpy()[0],
-        ground_truth_mat
-    )

-    time_per_step = total_s / logits_len.cpu().numpy()[0]

     word_times = []
     for i, w in enumerate(words):
-        word_times.append((

-    #
-    grouped = []
-    segment = []
     for w in word_times:
-        if len(
-            grouped.append(
-        if
-            grouped.append(

-    for
-        e = seg[-1][1]
-        text = " ".join([w[2] for w in seg])
-        subtitles.append((s, e, text))

-    return

-    #
     W, H = clip.size

     try:
@@ -140,21 +118,15 @@ def burn(video_path, subs):
     for s, e, text in subs:
         img = Image.new("RGBA", (W, int(H*0.12)), (0,0,0,140))
         draw = ImageDraw.Draw(img)
-        except:
-            tw, th = draw.textsize(text, font=font)
-        y = (int(H*0.12)-th)//2
-        draw.text((x,y), text, font=font, fill=(255,255,255))
-        img_clip = ImageClip(np.array(img)).set_start(s).set_duration(e-s).set_position(("center", int(H*0.85)))
-        layers.append(img_clip)
-    final = CompositeVideoClip([clip] + layers)
     out = "RobotsMali_Subtitled.mp4"
     final.write_videofile(out, codec="libx264", audio_codec="aac", fps=clip.fps, verbose=False, logger=None)
     clip.close()
@@ -162,39 +134,37 @@ def burn(video_path, subs):
     return out

-#
-    progress(0.5, "🔊 Extraction audio…")
-    extract_audio(video, wav)

-    progress(0.95, "🎞️ Incrustation des sous-titres…")
-    out = burn(video, subs)
-    return f"✅ Sous-titrage généré avec **{model_name}**", out

-body { background:#F5F8FF; font-family:Inter, sans-serif; }
-h1 { text-align:center; font-weight:800; color:#005BFF; margin-bottom:6px; }
-.gr-button { background:#005BFF !important; color:white !important; border-radius:8px; font-weight:700; }
-"""
-gr.
-    model = gr.Dropdown(list(ASR_MODELS.keys()), value="Soloni 114M TDT CTC V1", label="🧠 Modèle ASR")
-    run = gr.Button("🚀 Générer les sous-titres")
     status = gr.Markdown()
-demo.launch(
+++ b/app.py
 import gradio as gr
+import os
 import numpy as np
+import torch
 import soundfile as sf
 from moviepy.editor import VideoFileClip, CompositeVideoClip, ImageClip
 from PIL import Image, ImageDraw, ImageFont

 from nemo.collections import asr as nemo_asr
+from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis
 from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text

+# =============================
+# ROBOTSMALI MODEL LIST
+# =============================

+MODELS = {
+    "Soloni 114M TDT CTC v1": "RobotsMali/soloni-114m-tdt-ctc-v1",
+    "Soloni 350M TDT CTC v1": "RobotsMali/soloni-350m-tdt-ctc-v1",
+    "Soloba CTC 0.6B v0": "RobotsMali/soloba-ctc-0.6b-v0",
+    "Soloba CTC 0.6B v1": "RobotsMali/soloba-ctc-0.6b-v1",
+    "QuartzNet Bambara v1": "RobotsMali/stt-bm-quartznet15x5-v1",
+    "QuartzNet Bambara v2": "RobotsMali/stt-bm-quartznet15x5-v2"
+}

+# =============================
+# FUNCTION: EXTRACT AUDIO
+# =============================

 def extract_audio(video_path, wav_path):
+    clip = VideoFileClip(video_path)
+    audio = clip.audio.to_soundarray(fps=16000)
+    if audio.ndim == 2:
+        audio = np.mean(audio, axis=1)
+    sf.write(wav_path, audio, 16000)
+    clip.close()
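
A note on extract_audio: it decodes the full track into a NumPy array at 16 kHz and downmixes to mono in Python. An equivalent sketch, assuming the moviepy 1.x API, pushes the resampling and downmix into ffmpeg at write time; extract_audio_ffmpeg is an illustrative name, not part of this commit:

    from moviepy.editor import VideoFileClip

    def extract_audio_ffmpeg(video_path, wav_path):
        clip = VideoFileClip(video_path)
        # fps=16000 resamples on write; "-ac 1" asks ffmpeg for a mono track.
        clip.audio.write_audiofile(wav_path, fps=16000, ffmpeg_params=["-ac", "1"])
        clip.close()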
|
| 43 |
+
# =============================
|
| 44 |
+
# FONCTION : TRANSCRIPTION + TIMESTAMP
|
| 45 |
+
# =============================
|
| 46 |
|
| 47 |
+
def transcribe(model, device, wav, model_name):
|
| 48 |
+
audio, sr = sf.read(wav)
|
|
|
|
| 49 |
if audio.ndim == 2:
|
| 50 |
audio = np.mean(audio, axis=1)
|
|
|
|
| 51 |
|
| 52 |
x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
|
| 53 |
ln = torch.tensor([x.shape[1]]).to(device)
|
| 54 |
|
| 55 |
+
# === Cas 1 : Soloni → timestamps natifs ===
|
| 56 |
+
if "Soloni" in model_name and hasattr(model, "decode_and_align"):
|
| 57 |
+
with torch.no_grad():
|
| 58 |
+
proc, plen = model.preprocessor(input_signal=x, input_signal_length=ln)
|
| 59 |
+
hyps = model.decode_and_align(encoder_output=proc, encoded_lengths=plen)
|
| 60 |
+
hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
|
| 61 |
+
return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
|
| 62 |
+
|
| 63 |
+
# === Cas 2 : Soloba & QuartzNet → Forced Alignment CTC ===
|
| 64 |
+
text = model.transcribe([wav])[0]
|
| 65 |
+
text = text.strip()
|
| 66 |
+
if not text:
|
| 67 |
+
return []
|
| 68 |
+
|
| 69 |
with torch.no_grad():
|
| 70 |
+
logits, logit_len = model.forward(input_signal=x, input_signal_length=ln)
|
| 71 |
|
|
|
|
| 72 |
words = text.split()
|
|
|
|
|
|
|
|
|
|
| 73 |
config = CtcSegmentationParameters()
|
| 74 |
config.char_list = list(model.tokenizer.vocab.keys())
|
| 75 |
+
gt, utt = prepare_text(config, words)
|
| 76 |
|
| 77 |
+
timings, _, _ = ctc_segmentation(config, logits.cpu().numpy()[0], gt)
|
| 78 |
+
total_s = len(audio) / sr
|
| 79 |
+
tps = total_s / logit_len.cpu().numpy()[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
word_times = []
|
| 82 |
for i, w in enumerate(words):
|
| 83 |
+
s = timings[i] * tps
|
| 84 |
+
e = timings[i+1] * tps if i+1 < len(timings) else total_s
|
| 85 |
+
word_times.append((s, e, w))
|
| 86 |
|
| 87 |
+
# Groupage lisible : 3-5 mots par ligne
|
| 88 |
+
grouped, block = [], []
|
|
|
|
| 89 |
for w in word_times:
|
| 90 |
+
block.append(w)
|
| 91 |
+
if len(block) >= 4:
|
| 92 |
+
grouped.append(block)
|
| 93 |
+
block = []
|
| 94 |
+
if block:
|
| 95 |
+
grouped.append(block)
|
| 96 |
|
| 97 |
+
subs = []
|
| 98 |
+
for g in grouped:
|
| 99 |
+
subs.append((g[0][0], g[-1][1], " ".join([w[2] for w in g])))
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
+
return subs
|
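
A note on the forced-alignment branch: it reads word boundaries by indexing timings with the word position. The ctc_segmentation package's own recipe instead derives per-word segments from utt_begin_indices via determine_utterance_segments. A minimal sketch of that upstream usage, assuming log_probs is the (frames, vocab) log-probability matrix for one file and seconds_per_frame its frame duration (both parameter names are illustrative):

    from ctc_segmentation import (CtcSegmentationParameters, ctc_segmentation,
                                  determine_utterance_segments, prepare_text)

    def align_words(log_probs, words, char_list, seconds_per_frame):
        config = CtcSegmentationParameters()
        config.char_list = char_list
        config.index_duration = seconds_per_frame  # seconds covered by one CTC frame
        ground_truth_mat, utt_begin_indices = prepare_text(config, words)
        timings, char_probs, _ = ctc_segmentation(config, log_probs, ground_truth_mat)
        # One (start_s, end_s, confidence) segment per word.
        segments = determine_utterance_segments(config, utt_begin_indices, char_probs, timings, words)
        return [(start, end, word) for (start, end, _), word in zip(segments, words)]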

+# =============================
+# FUNCTION: BURN IN SUBTITLES
+# =============================

+def burn(video, subs):
+    clip = VideoFileClip(video)
     W, H = clip.size

     try:
@@ -140,21 +118,15 @@ def burn(video_path, subs):
     for s, e, text in subs:
         img = Image.new("RGBA", (W, int(H*0.12)), (0,0,0,140))
         draw = ImageDraw.Draw(img)
+        bbox = draw.textbbox((0,0), text, font=font)
+        tw, th = bbox[2]-bbox[0], bbox[3]-bbox[1]
+        draw.text(((W-tw)//2, (int(H*0.12)-th)//2), text, font=font, fill="white")

+        layers.append(ImageClip(np.array(img))
+                      .set_start(s).set_duration(e-s)
+                      .set_position(("center", int(H*0.85))))

+    final = CompositeVideoClip([clip] + layers)
     out = "RobotsMali_Subtitled.mp4"
     final.write_videofile(out, codec="libx264", audio_codec="aac", fps=clip.fps, verbose=False, logger=None)
     clip.close()
@@ -162,39 +134,37 @@ def burn(video_path, subs):
     return out
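
A note on burn: it re-encodes the whole video. When a soft subtitle file is enough, the same (start, end, text) triples can be serialized to SubRip instead; a minimal sketch, with to_srt and the default output path as illustrative names:

    def to_srt(subs, path="subtitles.srt"):
        def fmt(t):
            h, rem = divmod(int(t * 1000), 3600000)
            m, rem = divmod(rem, 60000)
            s, ms = divmod(rem, 1000)
            return f"{h:02}:{m:02}:{s:02},{ms:03}"
        with open(path, "w", encoding="utf-8") as f:
            for i, (s, e, text) in enumerate(subs, 1):
                f.write(f"{i}\n{fmt(s)} --> {fmt(e)}\n{text}\n\n")
        return path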

+# =============================
+# PIPELINE
+# =============================

+def pipeline(video_file, model_name):
+    if video_file is None:
+        return "Veuillez importer une vidéo.", None

+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    # .eval() keeps dropout/batch norm in inference mode.
+    model = nemo_asr.models.ASRModel.from_pretrained(MODELS[model_name]).to(device).eval()

+    wav = "temp.wav"
+    extract_audio(video_file, wav)
+    subs = transcribe(model, device, wav, model_name)
+    out = burn(video_file, subs)
+    return "✅ Sous-titres générés avec succès.", out
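
For a quick smoke test without the UI, pipeline can be called directly; the file name below is illustrative:

    status, out_path = pipeline("sample.mp4", "Soloni 114M TDT CTC v1")
    print(status, out_path)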

+# =============================
+# GRADIO INTERFACE
+# =============================

+with gr.Blocks() as demo:
+    gr.Markdown("# 🎙️ RobotsMali Subtitle Generator")

+    video = gr.Video(label="Importer une vidéo")
+    model = gr.Dropdown(list(MODELS.keys()), value="Soloni 114M TDT CTC v1", label="Sélection du modèle")
+    btn = gr.Button("⚡ Générer les sous-titres")
     status = gr.Markdown()
+    out = gr.Video(label="Résultat")

+    btn.click(pipeline, inputs=[video, model], outputs=[status, out])

+demo.launch()
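
One behavioral change worth noting: the commit removes the _CACHE-based load_model helper, so pipeline reloads the checkpoint from the Hub on every click. If that proves slow, a small cache along the old helper's lines can be reintroduced; a sketch with get_model as an illustrative name:

    import torch
    from functools import lru_cache
    from nemo.collections import asr as nemo_asr

    @lru_cache(maxsize=2)
    def get_model(repo_id):
        # Load once per repo id, reuse across clicks.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        return nemo_asr.models.ASRModel.from_pretrained(repo_id).to(device).eval()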