Spaces:

RobotsMali
/

RobotsMali_Video_captionning

Running

App Files Files Community

binaryMao committed on Nov 10, 2025

Commit

64e18a4

verified ·

1 Parent(s): 900f511

Update app.py

Browse files

Files changed (1) hide show

app.py +111 -161

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """
-ROBOTSMALI V37 FINAL — SOUS-TITRAGE BAMBARA (STYLE NETFLIX)
-Correction V12 : Alignement stable (RNNT = CTC precise, CTC/BPE = VAD)
 """
 import os, tempfile, traceback, random, textwrap
@@ -9,19 +9,18 @@ import numpy as np
 import torch
 import soundfile as sf
 import librosa
-from PIL import Image, ImageDraw, ImageFont
-import gradio as gr
 from huggingface_hub import snapshot_download
-from moviepy.editor import VideoFileClip, CompositeVideoClip, ImageClip
 from nemo.collections import asr as nemo_asr
-from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
 # ----------------------------
 # CONFIG
 # ----------------------------
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-random.seed(1234); np.random.seed(1234); torch.manual_seed(1234)
 MODELS = {
     "Soloni V1 (RNNT)":        ("RobotsMali/soloni-114m-tdt-ctc-v1", "rnnt"),
@@ -38,13 +37,12 @@ _cache = {}
 # MODEL LOADING
 # ----------------------------
 def load_model(name):
-    if name in _cache:
-        return _cache[name]
     repo, mode = MODELS[name]
     folder = snapshot_download(repo, local_dir_use_symlinks=False)
     nemo_file = next((os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nemo")), None)
     if not nemo_file:
-        raise FileNotFoundError("Aucun fichier .nemo trouvé dans le repo du modèle.")
     model = (
         nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo_file)
         if mode == "rnnt"
@@ -58,9 +56,7 @@ def load_model(name):
 # AUDIO EXTRACTION & CLEANING
 # ----------------------------
 def extract_audio(video, wav):
-    VideoFileClip(video).audio.write_audiofile(
-        wav, fps=16000, codec="pcm_s16le", ffmpeg_params=["-ac", "1"], logger=None
-    )
 def clean_audio(wav, top_db=35):
     audio, sr = sf.read(wav)
@@ -91,183 +87,137 @@ def transcribe(model, wav):
 # UTILITAIRES
 # ----------------------------
 def keep_bambara(words):
-    res = []
     for w in words:
-        wl = w.lower()
-        if any(c in wl for c in ["ɛ", "ɔ", "ŋ"]) or sum(c in "aeiou" for c in wl) >= 2:
             res.append(w)
     return res
-MAX_CHARS = 45
-MIN_DUR = 0.3
-MAX_DUR = 3.2
-MAX_WORDS = 8
 def wrap2(txt):
-    parts = textwrap.wrap(txt, MAX_CHARS)
-    if len(parts) <= 1:
-        return txt
-    mid = len(txt) // 2
-    left = txt.rfind(" ", 0, mid)
-    right = txt.find(" ", mid)
-    cut = left if (mid - left) <= (right - mid if right != -1 else 1e9) else right
-    l1 = txt[:cut].strip()
-    l2 = txt[cut:].strip()
-    return l1 + "\n" + l2 if l2 else l1
-def pack(spans, total):
-    tmp = []
-    for s, e, t in spans:
-        s = max(0, min(s, total))
-        e = max(0, min(e, total))
-        if e <= s or not t.strip():
-            continue
-        tmp.append((s, e, t.strip()))
-    merged = []
     for seg in tmp:
-        if not merged:
-            merged.append(seg)
-            continue
-        ps, pe, pt = merged[-1]
-        s, e, t = seg
-        if (e - s) < MIN_DUR or (s - pe) < 0.1:
-            merged[-1] = (ps, max(pe, e), (pt + " " + t).strip())
-        else:
-            merged.append(seg)
-    out = []
-    last_end = 0
-    for s, e, t in merged:
-        dur = e - s
-        words = t.split()
-        blocks = [" ".join(words[i:i + MAX_WORDS]) for i in range(0, len(words), MAX_WORDS)]
-        step = dur / max(1, len(blocks))
-        base = s
         for b in blocks:
-            st = base
-            en = min(base + step, e)
-            base = en
-            if en <= st:
-                en = min(st + 0.05, total)
-            txt = wrap2(b)
-            if st < last_end:
-                st = last_end + 1e-3
-                en = max(en, st + 0.05)
-            out.append((st, en, txt))
-            last_end = en
     return out
 # ----------------------------
 # ALIGNEMENT SIMPLE (VAD)
 # ----------------------------
-def align_vad(text, audio, sr, total_dur, top_db=28):
-    words = keep_bambara(text.split())
-    total = total_dur
-    iv = librosa.effects.split(audio, top_db=top_db)
-    if len(iv) == 0 or not words:
-        return pack([(0, total, " ".join(words[:MAX_WORDS]))], total)
-    spans = []
-    L = sum(e - s for s, e in iv)
-    idx = 0
-    for s, e in iv:
-        seg = e - s
-        segt = seg / sr
-        k = max(1, int(round(len(words) * (seg / L))))
-        chunk = words[idx:idx + k]
-        idx += k
-        if not chunk:
-            continue
-        lines = [chunk[i:i + MAX_WORDS] for i in range(0, len(chunk), MAX_WORDS)]
-        step = max(MIN_DUR, min(MAX_DUR, segt / len(lines)))
-        base = s / sr
-        for j, ln in enumerate(lines):
-            st = base + j * step
-            en = base + (j + 1) * step
-            spans.append((st, en, " ".join(ln)))
-    return pack(spans, total)
-# ----------------------------
-# DESSIN SOUS-TITRES
-# ----------------------------
-def draw(text, W, H):
-    band = int(H * 0.18)
-    img = Image.new("RGBA", (W, band), (0, 0, 0, 170))
-    d = ImageDraw.Draw(img)
-    try:
-        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", max(20, H // 22))
-    except:
-        font = ImageFont.load_default()
-    lines = text.split("\n")
-    for i, line in enumerate(lines):
-        bbox = d.textbbox((0, 0), line, font=font)
-        w = bbox[2] - bbox[0]
-        h = bbox[3] - bbox[1]
-        d.text(((W - w) // 2, (band - (h * len(lines))) // 2 + i * h),
-               line, fill="white", font=font, stroke_width=2, stroke_fill="black")
-    return np.array(img)
-# ----------------------------
-# FUSION FINALE (FFmpeg)
 # ----------------------------
 def burn(video, subs):
-    tmp_final = "temp_noaudio.mp4"
-    out = "RobotsMali_Subtitled.mp4"
-    base = VideoFileClip(video)
-    dur = base.duration
-    fps = base.fps
-    W, H = base.size
-    layers = [ImageClip(draw(t, W, H)).set_start(s).set_duration(e - s).set_pos(("center", "bottom")) for s, e, t in subs]
-    final = CompositeVideoClip([base] + layers).set_duration(dur)
-    final.write_videofile(tmp_final, codec="libx264", audio=False, fps=fps, logger=None)
-    os.system(f'ffmpeg -y -i "{tmp_final}" -i "{video}" -map 0:v -map 1:a -c:v libx264 -crf 23 -c:a aac -b:a 192k -r {fps} -t {dur} "{out}"')
-    if os.path.exists(tmp_final):
-        os.remove(tmp_final)
-    return out
 # ----------------------------
 # PIPELINE PRINCIPAL
 # ----------------------------
 def pipeline(video, model_name):
     try:
-        wav = tempfile.gettempdir() + "/asr.wav"
-        base = VideoFileClip(video)
-        dur = base.duration
-        extract_audio(video, wav)
-        clean, audio, sr = clean_audio(wav)
-        print(f"DEBUG: Durée vidéo = {dur:.2f}s, Audio = {len(audio)/sr:.2f}s")
-        model = load_model(model_name)
-        text = transcribe(model, clean)
-        mode = MODELS[model_name][1]
-        # RNNT → alignement CTC réel | sinon alignement simple (VAD)
-        if mode == "rnnt":
-            subs = align_ctc(model, audio, sr, text, dur)
         else:
-            subs = align_vad(text, audio, sr, dur)
-        if not subs:
-            return "⚠️ Aucun sous-titre utilisable", None
-        out = burn(video, subs)
-        return "✅ Terminé avec succès", out
     except Exception:
         traceback.print_exc()
-        return "❌ Erreur — voir logs ci-dessus", None
 # ----------------------------
 # INTERFACE GRADIO
 # ----------------------------
-with gr.Blocks(title="RobotsMali V37 Final") as demo:
-    gr.Markdown("## ⚡ RobotsMali V37 — Sous-titrage Style Netflix (alignement stable)")
-    v = gr.Video()
-    m = gr.Dropdown(list(MODELS.keys()), value="Soloba V1 (CTC)")
     b = gr.Button("▶️ Générer")
     s = gr.Markdown()
-    o = gr.Video()
     b.click(pipeline, [v, m], [s, o])
 demo.launch(share=True, debug=False)

 # -*- coding: utf-8 -*-
 """
+ROBOTSMALI V38 FINAL — SOUS-TITRAGE BAMBARA (STYLE NETFLIX)
+Correction V38 : Durée exacte, QuartzNet fonctionnel, pipeline simplifiée
 """
 import os, tempfile, traceback, random, textwrap
 import torch
 import soundfile as sf
 import librosa
 from huggingface_hub import snapshot_download
 from nemo.collections import asr as nemo_asr
+import gradio as gr
+from moviepy.editor import VideoFileClip
 # ----------------------------
 # CONFIG
 # ----------------------------
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+random.seed(1234)
+np.random.seed(1234)
+torch.manual_seed(1234)
 MODELS = {
     "Soloni V1 (RNNT)":        ("RobotsMali/soloni-114m-tdt-ctc-v1", "rnnt"),
 # MODEL LOADING
 # ----------------------------
 def load_model(name):
+    if name in _cache: return _cache[name]
     repo, mode = MODELS[name]
     folder = snapshot_download(repo, local_dir_use_symlinks=False)
     nemo_file = next((os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nemo")), None)
     if not nemo_file:
+        raise FileNotFoundError(f"Aucun .nemo trouvé pour {name}")
     model = (
         nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo_file)
         if mode == "rnnt"
 # AUDIO EXTRACTION & CLEANING
 # ----------------------------
 def extract_audio(video, wav):
     """Extract the audio track of ``video`` into ``wav`` with ffmpeg.

     The output is resampled to 16 kHz (``-ar 16000``), down-mixed to mono
     (``-ac 1``) and carries no video stream (``-vn``). An existing ``wav``
     file is overwritten (``-y``).

     Args:
         video: Path of the input video file.
         wav: Path of the audio file to write.

     Raises:
         FileNotFoundError: If the ``ffmpeg`` executable is not on PATH.
         subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
     """
     import subprocess

     # An argv list (no shell) keeps quotes, spaces or `$(...)` in an uploaded
     # file name from being interpreted by a shell, and check=True surfaces
     # ffmpeg failures instead of silently leaving a missing/empty wav behind.
     subprocess.run(
         ["ffmpeg", "-y", "-i", video, "-ar", "16000", "-ac", "1", "-vn", wav],
         check=True,
     )
 def clean_audio(wav, top_db=35):
     audio, sr = sf.read(wav)
 # UTILITAIRES
 # ----------------------------
 def keep_bambara(words):
     """Return the words that pass a simple Bambara-likeness heuristic.

     A word is kept when its lower-cased form contains one of the letters
     ``ɛ``, ``ɔ`` or ``ŋ``, or when at least two of its characters are among
     ``a e i o u``. Input order is preserved.

     Args:
         words: Iterable of word strings.

     Returns:
         A new list with the accepted words (original casing untouched).
     """
     special_letters = ["ɛ", "ɔ", "ŋ"]

     def _accept(word):
         lowered = word.lower()
         if any(letter in lowered for letter in special_letters):
             return True
         vowel_count = sum(1 for ch in lowered if ch in "aeiou")
         return vowel_count >= 2

     return [word for word in words if _accept(word)]
# Subtitle layout limits shared by wrap2, pack and align_vad.
MAX_CHARS = 45  # width handed to textwrap.wrap when deciding whether to split a caption
MIN_DUR = 0.3   # seconds: merge threshold in pack, lower bound of a line slot in align_vad
MAX_DUR = 3.2   # seconds: upper bound of a line slot in align_vad
MAX_WORDS = 8   # maximum number of words per subtitle block
 def wrap2(txt, max_chars=None):
     """Split a caption into at most two lines, breaking near its middle.

     When ``txt`` fits on one line of ``max_chars`` characters (as decided by
     ``textwrap.wrap``) it is returned unchanged. Otherwise it is cut at the
     space closest to the middle of the string and the two halves are joined
     with a newline.

     Args:
         txt: Caption text.
         max_chars: Wrap width; defaults to the module-level ``MAX_CHARS``.

     Returns:
         ``txt`` unchanged, or the two stripped halves joined by ``"\\n"``.
     """
     width = MAX_CHARS if max_chars is None else max_chars
     if len(textwrap.wrap(txt, width)) <= 1:
         return txt

     mid = len(txt) // 2
     left = txt.rfind(" ", 0, mid)   # nearest space before the middle
     right = txt.find(" ", mid)      # nearest space at/after the middle

     if left == -1 and right == -1:
         # One unbroken token: there is no sensible break point. (Using -1 as
         # the cut index would push only the last character onto line 2.)
         return txt
     if right == -1:
         cut = left
     elif left == -1:
         cut = right
     else:
         # Ties go to the left space.
         cut = left if (mid - left) <= (right - mid) else right

     halves = (txt[:cut].strip(), txt[cut:].strip())
     # Skip an empty half so the result never starts or ends with a newline.
     return "\n".join(part for part in halves if part)
def pack(spans, total):
    """Turn raw (start, end, text) spans into display-ready subtitle cues.

    Three passes are applied in order:

    1. Clamp every span to ``[0, total]``; drop spans that are empty in time
       or whose text is blank.
    2. Merge a span into the previous one when it is shorter than ``MIN_DUR``
       or starts less than 0.1 after the previous span ends.
    3. Split each merged span into blocks of at most ``MAX_WORDS`` words that
       share the span's duration equally, wrap each block with ``wrap2`` and
       push a cue's start past the previous cue's end when they would overlap.

    Args:
        spans: Iterable of ``(start, end, text)`` tuples.
        total: Upper bound used when clamping times.

    Returns:
        List of ``(start, end, text)`` tuples in processing order.
    """
    # Pass 1: clamp and filter.
    cleaned = []
    for start, end, text in spans:
        start = max(0, min(start, total))
        end = max(0, min(end, total))
        stripped = text.strip()
        if end <= start or not stripped:
            continue
        cleaned.append((start, end, stripped))

    # Pass 2: merge very short spans and spans that follow too closely.
    merged = []
    for start, end, text in cleaned:
        if merged:
            prev_start, prev_end, prev_text = merged[-1]
            too_short = (end - start) < MIN_DUR
            too_close = (start - prev_end) < 0.1
            if too_short or too_close:
                joined = (prev_text + " " + text).strip()
                merged[-1] = (prev_start, max(prev_end, end), joined)
                continue
        merged.append((start, end, text))

    # Pass 3: split into word blocks and lay them out without overlap.
    cues = []
    previous_end = 0
    for start, end, text in merged:
        tokens = text.split()
        chunks = [
            " ".join(tokens[i:i + MAX_WORDS])
            for i in range(0, len(tokens), MAX_WORDS)
        ]
        slot = (end - start) / max(1, len(chunks))
        cursor = start
        for chunk in chunks:
            cue_start = cursor
            cue_end = min(cursor + slot, end)
            # The cursor advances before any correction below is applied.
            cursor = cue_end
            if cue_end <= cue_start:
                cue_end = min(cue_start + 0.05, total)
            caption = wrap2(chunk)
            if cue_start < previous_end:
                cue_start = previous_end + 1e-3
                cue_end = max(cue_end, cue_start + 0.05)
            cues.append((cue_start, cue_end, caption))
            previous_end = cue_end
    return cues
 # ----------------------------
 # ALIGNEMENT SIMPLE (VAD)
 # ----------------------------
def align_vad(text, audio, sr, total_dur, top_db=28):
    """Spread transcript words over the non-silent intervals of the audio.

    The transcript is filtered with ``keep_bambara``; non-silent intervals
    come from ``librosa.effects.split``. Each interval receives a share of
    the words proportional to its length, laid out as lines of at most
    ``MAX_WORDS`` words whose slot length is clamped to
    ``[MIN_DUR, MAX_DUR]``. The resulting spans are post-processed by
    ``pack``.

    Args:
        text: Full transcript.
        audio: Audio samples passed to ``librosa.effects.split``.
        sr: Sample rate used to convert interval bounds to seconds.
        total_dur: Total duration handed to ``pack`` for clamping.
        top_db: Silence threshold forwarded to ``librosa.effects.split``.

    Returns:
        The list of ``(start, end, text)`` cues produced by ``pack``.
    """
    words = keep_bambara(text.split())
    intervals = librosa.effects.split(audio, top_db=top_db)
    if len(intervals) == 0 or not words:
        # No usable interval: hand every word to pack, which splits the text
        # into MAX_WORDS blocks itself (truncating here would lose words).
        return pack([(0, total_dur, " ".join(words))], total_dur)

    voiced = sum(end - start for start, end in intervals)
    last = len(intervals) - 1
    spans = []
    idx = 0
    for n, (start, end) in enumerate(intervals):
        seg = end - start
        if n == last:
            # Per-interval rounding can leave words unassigned; the final
            # interval takes whatever remains so the transcript tail is kept.
            chunk = words[idx:]
        else:
            share = max(1, int(round(len(words) * (seg / voiced))))
            chunk = words[idx:idx + share]
        idx += len(chunk)
        if not chunk:
            continue
        lines = [chunk[i:i + MAX_WORDS] for i in range(0, len(chunk), MAX_WORDS)]
        step = max(MIN_DUR, min(MAX_DUR, (seg / sr) / len(lines)))
        base = start / sr
        for j, line in enumerate(lines):
            spans.append((base + j * step, base + (j + 1) * step, " ".join(line)))
    return pack(spans, total_dur)
+# ----------------------------
+# SOUS-TITRES SRT + FFmpeg
 # ----------------------------
 def burn(video, subs):
     """Burn subtitle cues into ``video`` with ffmpeg and return the output path.

     The cues are written to a temporary UTF-8 SRT file, rendered onto the
     video with ffmpeg's ``subtitles`` filter, and the temporary file is
     removed afterwards (also on failure).

     Args:
         video: Path of the input video.
         subs: Iterable of ``(start, end, text)`` cues, times in seconds.

     Returns:
         ``"RobotsMali_Subtitled.mp4"``, the path of the rendered file.

     Raises:
         FileNotFoundError: If the ``ffmpeg`` executable is not on PATH.
         subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
     """
     import subprocess

     out_file = "RobotsMali_Subtitled.mp4"

     def sec_to_srt(t):
         # Work in whole milliseconds: truncating the float fraction directly
         # can lose a millisecond (e.g. 1.001 -> ",000").
         total_ms = max(0, int(round(t * 1000)))
         hours, rest = divmod(total_ms, 3600000)
         minutes, rest = divmod(rest, 60000)
         seconds, millis = divmod(rest, 1000)
         return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"

     # mkstemp creates the file atomically; mktemp only returns a guessable name.
     fd, tmp_srt = tempfile.mkstemp(suffix=".srt")
     try:
         with os.fdopen(fd, "w", encoding="utf-8") as f:
             for i, (start, end, text) in enumerate(subs, 1):
                 f.write(f"{i}\n{sec_to_srt(start)} --> {sec_to_srt(end)}\n{text}\n\n")

         # A video filter requires re-encoding: ffmpeg refuses "-vf" together
         # with "-c:v copy", so the video stream is encoded with libx264.
         # NOTE(review): the temp path is passed to the filter unescaped;
         # assumes it contains no filtergraph-special characters (':', '\\',
         # "'") — confirm for non-POSIX temp directories.
         subprocess.run(
             [
                 "ffmpeg", "-y",
                 "-i", video,
                 "-vf", f"subtitles={tmp_srt}",
                 "-c:v", "libx264", "-crf", "23",
                 "-c:a", "aac", "-b:a", "192k",
                 out_file,
             ],
             check=True,
         )
     finally:
         if os.path.exists(tmp_srt):
             os.remove(tmp_srt)
     return out_file
 # ----------------------------
 # PIPELINE PRINCIPAL
 # ----------------------------
 def pipeline(video, model_name):
     """Run the whole subtitling flow for one video; never raises.

     Steps: extract the audio to a temporary wav, clean it, transcribe it
     with the selected model, align the words (CTC segmentation for models
     whose mode is ``"rnnt"``, ``align_vad`` otherwise) and burn the cues
     into the video.

     Args:
         video: Path of the input video.
         model_name: Key of ``MODELS`` selecting the ASR model.

     Returns:
         A ``(status_message, output_path)`` tuple; ``output_path`` is
         ``None`` when no subtitles could be produced or an error occurred
         (the traceback is printed in that case).
     """
     wav = None
     try:
         # mkstemp creates the file atomically; mktemp only returns a name.
         fd, wav = tempfile.mkstemp(suffix=".wav")
         os.close(fd)

         # Audio extraction
         extract_audio(video, wav)
         clean, audio, sr = clean_audio(wav)
         model = load_model(model_name)
         text = transcribe(model, clean)
         mode = MODELS[model_name][1]

         def video_duration():
             # Open the clip once and release its reader right away.
             clip = VideoFileClip(video)
             try:
                 return clip.duration
             finally:
                 clip.close()

         if mode == "rnnt":
             from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
             words = keep_bambara(text.split())
             if not words:
                 return "⚠️ Aucun sous-titre utilisable", None
             x = torch.tensor(audio).float().unsqueeze(0).to(DEVICE)
             ln = torch.tensor([x.shape[1]]).to(DEVICE)
             with torch.no_grad():
                 logits = model(input_signal=x, input_signal_length=ln)[0]
             dur = video_duration()
             # Seconds per model output frame, derived from the video length.
             tps = dur / logits.shape[1]
             raw = model.tokenizer.vocab
             vocab = list(raw.keys()) if isinstance(raw, dict) else list(raw)
             cfg = CtcSegmentationParameters()
             cfg.char_list = vocab
             gt = prepare_text(cfg, words)[0]
             timing, _, _ = ctc_segmentation(cfg, logits.detach().cpu().numpy()[0], gt)
             # NOTE(review): assumes `timing` holds at least len(words) + 1
             # entries, one boundary per word, in frame units — TODO confirm
             # against the ctc_segmentation API.
             spans = [(timing[i] * tps, timing[i + 1] * tps, word) for i, word in enumerate(words)]
             subs = pack(spans, dur)
         else:
             subs = align_vad(text, audio, sr, video_duration())
         if not subs:
             return "⚠️ Aucun sous-titre utilisable", None
         out = burn(video, subs)
         return "✅ Terminé avec succès", out
     except Exception:
         traceback.print_exc()
         return "❌ Erreur — voir logs ci-dessus", None
     finally:
         # The extracted wav is only needed during this call.
         if wav is not None and os.path.exists(wav):
             os.remove(wav)
 # ----------------------------
 # INTERFACE GRADIO
 # ----------------------------
# Gradio front end: one input video, a model selector, a run button, a status
# line and the rendered output video. The button feeds (video, model) to
# pipeline and shows its (status, output) result.
with gr.Blocks(title="RobotsMali V38 Final") as demo:
    gr.Markdown("## ⚡ RobotsMali V38 — Sous-titrage Style Netflix (QuartzNet & RNNT stable)")
    video_input = gr.Video(label="Vidéo à sous-titrer")
    model_choice = gr.Dropdown(
        list(MODELS.keys()),
        value="Soloba V1 (CTC)",
        label="Modèle ASR",
    )
    run_button = gr.Button("▶️ Générer")
    status_box = gr.Markdown()
    video_output = gr.Video(label="Vidéo sous-titrée")
    run_button.click(
        fn=pipeline,
        inputs=[video_input, model_choice],
        outputs=[status_box, video_output],
    )

demo.launch(share=True, debug=False)