Spaces:

RobotsMali
/

RobotsMali_Video_captionning

Running

App Files Community

binaryMao committed on Nov 7, 2025

Commit

224a0d9

verified ·

1 Parent(s): 857e7cb

Update app.py

Browse files

Files changed (1) hide show

app.py +185 -141

app.py CHANGED Viewed

@@ -1,16 +1,10 @@
 # -*- coding: utf-8 -*-
 """
-ROBOTSMALI VIDEO CAPTIONING — V21 (Stable)
-- Alignement parfait pour Soloba (CTC)
-- Découpage fluide pour Soloni (RNNT)
-- QuartzNet supporté sans crash
-- Filtrage Bambara phonétique (retire français)
-- Sous-titres style Netflix
-- Durée vidéo exacte (plus d'allongement)
-- Compatible Google Colab + Kali + Linux
 """
-import os, tempfile
 import numpy as np
 import torch
 import soundfile as sf
@@ -24,29 +18,28 @@ from nemo.collections import asr as nemo_asr
 from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODELS = {
-    "Soloni V1 (RNNT)": ("RobotsMali/soloni-114m-tdt-ctc-v1", "rnnt"),
-    "Soloni V0 (RNNT)": ("RobotsMali/soloni-114m-tdt-ctc-v0", "rnnt"),
-    "Soloba V1 (CTC)":  ("RobotsMali/soloba-ctc-0.6b-v1", "ctc"),
-    "Soloba V0 (CTC)":  ("RobotsMali/soloba-ctc-0.6b-v0", "ctc"),
-    "QuartzNet V1 (CTC)": ("RobotsMali/stt-bm-quartznet15x5-v1", "ctc"),
-    "QuartzNet V0 (CTC)": ("RobotsMali/stt-bm-quartznet15x5-v0", "ctc"),
 }
-_model_cache = {}
-_vocab_cache = {}
 def load_model(name):
-    if name in _model_cache:
-        return _model_cache[name]
-    repo, mode = MODELS[name]
-    path = snapshot_download(repo, local_dir_use_symlinks=False)
-    nemo = [os.path.join(path, f) for f in os.listdir(path) if f.endswith(".nemo")][0]
-    model = nemo_asr.models.EncDecCTCModelBPE.restore_from(nemo) if mode=="ctc" \
-        else nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo)
     model.to(DEVICE).eval()
-    _model_cache[name] = model
     return model
 def extract_audio(video, wav):
@@ -54,133 +47,184 @@ def extract_audio(video, wav):
         wav, fps=16000, codec="pcm_s16le", ffmpeg_params=["-ac","1"], logger=None
     )
-def clean_audio(wav):
     audio, sr = sf.read(wav)
-    if audio.ndim == 2: audio = audio.mean(1)
-    audio,_ = librosa.effects.trim(audio, top_db=35)
-    out = wav.replace(".wav","_clean.wav")
-    sf.write(out, audio, sr)
-    return out, audio, sr
-def transcribe(model, wav):
-    o = model.transcribe([wav])[0]
-    return o.text.strip() if hasattr(o,"text") else str(o).strip()
-# ---------- FILTRAGE BAMBARA ---------- #
-def keep_bambara_words(words):
-    filtered=[]
     for w in words:
-        w2=w.lower()
-        if any(ch in w2 for ch in ["ɛ","ɔ","ŋ"]) or sum(c in "aeiou" for c in w2)>=2:
-            filtered.append(w)
-    return filtered
-MAX_WORDS=4; MAX_CHARS=45; MAX_DURATION=3.4
-def group(spans):
-    subs=[]; buf=[]
-    def push(b):
-        if b: subs.append((b[0][0], b[-1][1], " ".join(x[2] for x in b)))
-    for w in spans:
-        test=buf+[w]; txt=" ".join(x[2] for x in test)
-        dur=test[-1][1]-test[0][0]
-        if len(test)>MAX_WORDS or len(txt)>MAX_CHARS or dur>MAX_DURATION:
-            push(buf); buf=[w]
-        else:
-            buf=test
-    push(buf); return subs
-# ---------- ALIGNEMENT CTC (Soloba + QuartzNet) ---------- #
-def align_ctc(model, audio, sr, text):
-    words = keep_bambara_words(text.split())
     if not words: return []
-    x = torch.tensor(audio).float().unsqueeze(0).to(DEVICE)
-    ln = torch.tensor([x.shape[1]]).to(DEVICE)
-    total = len(audio)/sr
-    with torch.no_grad():
-        logits, _ = model(input_signal=x, input_signal_length=ln)
-    frames = logits.shape[1]
-    if frames <= 2: return []
-    vocab = list(model.tokenizer.vocab.keys())
-    cfg = CtcSegmentationParameters(); cfg.char_list=vocab
-    out = prepare_text(cfg, words)
-    gt = out[0] if isinstance(out, (list,tuple)) else out
-    timing, _, _ = ctc_segmentation(cfg, logits.cpu().numpy()[0], gt)
-    tps = total / float(frames)
-    spans=[]
-    for i in range(len(words)):
-        st=float(timing[i])*tps
-        en=float(timing[i+1])*tps if i+1<len(timing) else total
-        spans.append((st,en,words[i]))
-    return group(spans)
-# ---------- ALIGNEMENT RNNT (Soloni) ---------- #
-def rnnt_vad(text, audio, sr):
-    intervals = librosa.effects.split(audio, top_db=28)
-    words = keep_bambara_words(text.split())
-    if not intervals or not words:
-        return [(0,len(audio)/sr,text)]
-    spans=[]; idx=0
-    total_audio=sum(e-s for s,e in intervals)
-    for s,e in intervals:
-        seg_d=(e-s)/sr
-        k=max(1,int(len(words)*((e-s)/total_audio)))
         chunk=words[idx:idx+k]; idx+=k
         if not chunk: continue
-        parts=[chunk[i:i+MAX_WORDS] for i in range(0,len(chunk),MAX_WORDS)]
-        step=seg_d/len(parts); base=s/sr
-        for j,p in enumerate(parts):
             st=base+j*step; en=base+(j+1)*step
-            spans.append((st,en," ".join(p)))
-    return group(spans)
-# ---------- RENDER SUBTITLES ---------- #
-def draw_sub(text,W,H):
-    bg=Image.new("RGBA",(W,int(H*0.12)),(0,0,0,180))
-    d=ImageDraw.Draw(bg)
-    try: font=ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",size=max(18,H//18))
     except: font=ImageFont.load_default()
-    box=d.textbbox((0,0),text,font)
-    tw=box[2]-box[0]; th=box[3]-box[1]
-    d.text(((W-tw)//2,(H*0.12-th)//2),text,font=font,fill="white")
-    return bg
 def burn(video,subs):
-    out="RobotsMali_Subtitled.mp4"
-    base=VideoFileClip(video); W,H=base.size; dur=base.duration
-    layers=[]
-    for s,e,t in subs:
-        s=max(0,min(s,dur)); e=max(0,min(e,dur))
-        if e<=s: continue
-        img=draw_sub(t.upper(),W,H)
-        layers.append(ImageClip(np.array(img)).set_start(s).set_duration(e-s).set_pos(("center","bottom")))
-    CompositeVideoClip([base]+layers).set_duration(dur).write_videofile(out,codec="libx264",audio_codec="aac",fps=base.fps)
     return out
-# ---------- PIPELINE ---------- #
-def pipeline(video, model_name):
     try:
-        tmp=os.path.join(tempfile.gettempdir(),"audio.wav")
-        extract_audio(video,tmp)
-        clean,audio,sr=clean_audio(tmp)
         model=load_model(model_name)
         text=transcribe(model,clean)
         mode=MODELS[model_name][1]
-        subs = align_ctc(model,audio,sr,text) if mode=="ctc" else rnnt_vad(text,audio,sr)
-        if not subs: return "⚠️ Aucun sous-titre utilisable.",None
-        return "✅ Terminé !", burn(video,subs)
-    except Exception as e:
-        return f"❌ ERREUR : {e}",None
-with gr.Blocks(title="RobotsMali V21 — Bambara Aligné") as demo:
-    gr.Markdown("# ⚡ RobotsMali V21 — Sous-titrage Bambara Stable")
-    video=gr.Video()
-    model=gr.Dropdown(list(MODELS.keys()),value="Soloba V1 (CTC)")
-    run=gr.Button("▶️ Générer")
-    status=gr.Markdown(); out=gr.Video()
-    run.click(pipeline,[video,model],[status,out])
-demo.launch(share=True)

 # -*- coding: utf-8 -*-
 """
+ROBOTSMALI V37 FINAL — SOUS-TITRAGE BAMBARA (STYLE NETFLIX)
+Correction V9 : Forçage du FPS du clip source pour stabiliser la durée.
 """
+import os, tempfile, traceback, random, math, textwrap
 import numpy as np
 import torch
 import soundfile as sf
 from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+random.seed(1234); np.random.seed(1234); torch.manual_seed(1234)
 MODELS = {
+    "Soloni V1 (RNNT)":      ("RobotsMali/soloni-114m-tdt-ctc-v1", "rnnt"),
+    "Soloni V0 (RNNT)":      ("RobotsMali/soloni-114m-tdt-ctc-v0", "rnnt"),
+    "Soloba V1 (CTC)":       ("RobotsMali/soloba-ctc-0.6b-v1",       "ctc"),
+    "Soloba V0 (CTC)":       ("RobotsMali/soloba-ctc-0.6b-v0",       "ctc"),
+    "QuartzNet V1 (CTC-char)": ("RobotsMali/stt-bm-quartznet15x5-v1", "ctc_char"),
+    "QuartzNet V0 (CTC-char)": ("RobotsMali/stt-bm-quartznet15x5-v0", "ctc_char"),
 }
+_cache = {}
 def load_model(name):
+    if name in _cache: return _cache[name]
+    repo,mode = MODELS[name]
+    folder = snapshot_download(repo, local_dir_use_symlinks=False)
+    nemo_file = next((os.path.join(folder,f) for f in os.listdir(folder) if f.endswith(".nemo")),None)
+    if not nemo_file: raise FileNotFoundError("Aucun .nemo trouvé")
+    model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo_file) if mode=="rnnt" \
+        else nemo_asr.models.EncDecCTCModelBPE.restore_from(nemo_file)
     model.to(DEVICE).eval()
+    _cache[name]=model
     return model
 def extract_audio(video, wav):
         wav, fps=16000, codec="pcm_s16le", ffmpeg_params=["-ac","1"], logger=None
     )
def clean_audio(wav, top_db=35):
    """Down-mix and peak-normalise a WAV file, writing the result next to it.

    Args:
        wav: path of the audio file to read.
        top_db: unused; kept so existing callers that pass it keep working.

    Returns:
        ``(clean_path, audio, sr)`` where ``clean_path`` is the file that was
        written, ``audio`` the processed samples and ``sr`` the sample rate
        reported by ``sf.read``.
    """
    audio, sr = sf.read(wav)
    if audio.ndim == 2:
        # Average the channels into a single track.
        audio = audio.mean(1)

    # Scale so the loudest sample sits at 0.9; skip (near-)silent input to
    # avoid dividing by ~0.
    peak = np.max(np.abs(audio)) if audio.size > 0 else 0
    if peak > 1e-6:
        audio = audio / peak * 0.9

    # Build the output name from the real extension. A plain
    # str.replace(".wav", ...) rewrites every ".wav" in the path and, for any
    # other extension, returns the input path itself so the source file would
    # be overwritten.
    root, ext = os.path.splitext(wav)
    clean = f"{root}_clean{ext or '.wav'}"
    sf.write(clean, audio, sr)
    return clean, audio, sr
def transcribe(model, wav):
    """Transcribe one audio file and return the recognised text.

    ``model.transcribe([wav])`` is called and its result is unwrapped:
    the first element of a list or tuple is taken (repeatedly, so a
    ``(best, all)`` tuple of lists also works), then the ``.text`` attribute
    is used when present, otherwise the value itself.

    Args:
        model: object exposing ``transcribe(list_of_paths)``.
        wav: path of the audio file.

    Returns:
        The stripped transcription, or ``""`` when the model returned nothing.
    """
    out = model.transcribe([wav])

    # Peel batch/tuple wrappers down to the first hypothesis. An empty
    # container means "no transcription", not the literal text "[]".
    while isinstance(out, (list, tuple)):
        if not out:
            return ""
        out = out[0]
    if out is None:
        return ""

    text = getattr(out, "text", out)
    return "" if text is None else str(text).strip()
def keep_bambara(words):
    """Return the words that pass the Bambara heuristic, in their original order.

    A word is kept when its lower-cased form contains one of the letters
    ``ɛ``, ``ɔ``, ``ŋ``, or when it contains at least two characters from
    ``aeiou``. The returned words keep their original casing.
    """
    marker_letters = ("ɛ", "ɔ", "ŋ")

    def passes(word):
        lowered = word.lower()
        if any(letter in lowered for letter in marker_letters):
            return True
        vowel_count = sum(1 for ch in lowered if ch in "aeiou")
        return vowel_count >= 2

    return [word for word in words if passes(word)]
# Caption layout limits used by wrap2(), pack() and the aligners.
MAX_CHARS = 45   # longest caption line before wrap2() splits it in two
MIN_DUR = 0.3    # spans shorter than this (seconds) are folded into the previous one
MAX_DUR = 3.2    # longest duration (seconds) a merged caption may grow to
MAX_WORDS = 8    # most words shown in a single caption


def wrap2(txt):
    """Split *txt* into two lines at the space closest to its middle.

    Text that ``textwrap.wrap`` fits on one ``MAX_CHARS`` line is returned
    unchanged, and so is text with no space to break at (a single over-long
    word is never cut).
    """
    if len(textwrap.wrap(txt, MAX_CHARS)) <= 1:
        return txt

    mid = len(txt) // 2
    left = txt.rfind(" ", 0, mid)
    right = txt.find(" ", mid)
    candidates = [pos for pos in (left, right) if pos != -1]
    if not candidates:
        # No space anywhere: cutting at index -1 would split off the last
        # character, so keep the text whole.
        return txt

    # min() returns the first minimum, so a tie goes to the left-hand space.
    cut = min(candidates, key=lambda pos: abs(mid - pos))
    first = txt[:cut].strip()
    second = txt[cut:].strip()
    if not first:
        return second
    return first + "\n" + second if second else first


def pack(spans, total):
    """Turn raw ``(start, end, text)`` spans into display-ready captions.

    Steps:
      1. Clamp every span to ``[0, total]`` and drop empty or blank ones.
      2. Merge a span into the previous caption when it is shorter than
         ``MIN_DUR``, or when it starts less than 0.1 s after the previous one
         ends and the merged caption stays within ``MAX_WORDS`` words and
         ``MAX_DUR`` seconds.
      3. Split each caption into blocks of at most ``MAX_WORDS`` words that
         share its duration evenly, wrap each block with ``wrap2`` and make
         the cues strictly non-overlapping.

    Args:
        spans: iterable of ``(start, end, text)`` with times in seconds.
        total: duration of the media in seconds; no cue ends after it.

    Returns:
        A list of ``(start, end, text)`` tuples with ``start < end <= total``.
    """
    cleaned = []
    for s, e, t in spans:
        s = max(0, min(s, total))
        e = max(0, min(e, total))
        t = t.strip()
        if e > s and t:
            cleaned.append((s, e, t))

    merged = []
    for s, e, t in cleaned:
        if not merged:
            merged.append((s, e, t))
            continue
        ps, pe, pt = merged[-1]
        joined = pt + " " + t
        too_short = (e - s) < MIN_DUR
        adjacent = (s - pe) < 0.1
        # Without this bound, back-to-back spans (e.g. one per word) all
        # collapse into a single caption and their timing is lost.
        fits = len(joined.split()) <= MAX_WORDS and (max(pe, e) - ps) <= MAX_DUR
        if too_short or (adjacent and fits):
            merged[-1] = (ps, max(pe, e), joined)
        else:
            merged.append((s, e, t))

    out = []
    last_end = 0
    for s, e, t in merged:
        words = t.split()
        blocks = [" ".join(words[i:i + MAX_WORDS]) for i in range(0, len(words), MAX_WORDS)]
        step = (e - s) / max(1, len(blocks))
        base = s
        for block in blocks:
            st = base
            en = min(base + step, e)
            base = en
            if en <= st:
                en = st + 0.05
            if st < last_end:
                # Push the cue just past the previous one.
                st = last_end + 1e-3
                en = max(en, st + 0.05)
            # Never run past the media, and never emit an empty cue.
            en = min(en, total)
            if en <= st:
                continue
            out.append((st, en, wrap2(block)))
            last_end = en
    return out
def align_ctc(model, audio, sr, text, total_dur):
    """Align the filtered words of *text* to *audio* with CTC segmentation.

    The words kept by ``keep_bambara`` are aligned against the first output of
    a forward pass of *model*, converted to ``(start, end, word)`` spans and
    handed to ``pack``.

    Args:
        model: callable accepting ``input_signal`` / ``input_signal_length``
            and exposing ``tokenizer.vocab``.
        audio: samples of the recording.
        sr: sample rate; not used by this function.
        text: transcription to align.
        total_dur: duration in seconds used to scale frames to time.

    Returns:
        The captions produced by ``pack``, or ``[]`` when no word survives the
        filter.
    """
    kept = keep_bambara(text.split())
    if not kept:
        return []

    signal = torch.tensor(audio).float().unsqueeze(0).to(DEVICE)
    signal_len = torch.tensor([signal.shape[1]]).to(DEVICE)
    with torch.no_grad():
        outputs = model(input_signal=signal, input_signal_length=signal_len)
    logits = outputs[0]

    print(f"DEBUG TIME 3: Logits frames = {logits.shape[1]}")
    # Seconds represented by one output frame.
    tps = total_dur / logits.shape[1]
    print(f"DEBUG TIME 4: Time per logit frame (tps) = {tps:.6f}s")

    raw_vocab = model.tokenizer.vocab
    if isinstance(raw_vocab, dict):
        vocab = list(raw_vocab.keys())
    else:
        vocab = list(raw_vocab)

    cfg = CtcSegmentationParameters()
    cfg.char_list = vocab
    ground_truth = prepare_text(cfg, kept)[0]
    frame_scores = logits.detach().cpu().numpy()[0]
    timing, _, _ = ctc_segmentation(cfg, frame_scores, ground_truth)

    # NOTE(review): `timing` is indexed by word position and scaled by `tps`.
    # Confirm that ctc_segmentation returns one entry per word (not per
    # ground-truth token) and that its values are frame indices rather than
    # seconds. Also confirm timing[i + 1] always exists for the last word.
    spans = []
    for i, word in enumerate(kept):
        spans.append((timing[i] * tps, timing[i + 1] * tps, word))
    return pack(spans, total_dur)
def align_vad(text, audio, sr, total_dur, top_db=28):
    """Spread the filtered words of *text* over the non-silent parts of *audio*.

    ``librosa.effects.split`` gives the non-silent intervals. Each interval
    receives a share of the words proportional to its length (at least one),
    and the last interval takes whatever is left so no word is lost. Inside an
    interval the words are cut into lines of ``MAX_WORDS`` whose duration is
    clamped to ``[MIN_DUR, MAX_DUR]``. The spans are then passed to ``pack``.

    Args:
        text: transcription to distribute.
        audio: samples of the recording.
        sr: sample rate, used to convert sample offsets to seconds.
        total_dur: duration in seconds passed to ``pack`` as the upper bound.
        top_db: silence threshold forwarded to ``librosa.effects.split``.

    Returns:
        The captions produced by ``pack``; ``[]`` when no word survives the
        filter.
    """
    words = keep_bambara(text.split())
    if not words:
        return []

    intervals = librosa.effects.split(audio, top_db=top_db)
    if len(intervals) == 0:
        # No voiced region detected: show every word across the whole clip
        # and let pack() cut it into MAX_WORDS blocks.
        return pack([(0, total_dur, " ".join(words))], total_dur)

    voiced = sum(end - start for start, end in intervals)
    last_pos = len(intervals) - 1
    spans = []
    idx = 0
    for pos, (start, end) in enumerate(intervals):
        if idx >= len(words):
            break
        seg = end - start
        if pos == last_pos:
            # Rounding of the shares can leave words unassigned; the final
            # interval takes all of them.
            chunk = words[idx:]
        else:
            share = max(1, int(round(len(words) * (seg / voiced))))
            chunk = words[idx:idx + share]
        idx += len(chunk)

        lines = [chunk[i:i + MAX_WORDS] for i in range(0, len(chunk), MAX_WORDS)]
        step = max(MIN_DUR, min(MAX_DUR, (seg / sr) / len(lines)))
        base = start / sr
        for j, line_words in enumerate(lines):
            spans.append((base + j * step, base + (j + 1) * step, " ".join(line_words)))
    return pack(spans, total_dur)
def draw(text, W, H):
    """Render *text* as a caption band and return it as an RGBA array.

    The band is ``W`` pixels wide and 18 % of ``H`` tall, filled with
    translucent black. Each ``"\\n"``-separated line is centred horizontally
    and the group of lines is centred vertically, drawn in white with a black
    outline.

    Args:
        text: caption, optionally containing ``"\\n"`` line breaks.
        W: frame width in pixels.
        H: frame height in pixels.

    Returns:
        ``np.array`` of the rendered image.
    """
    band = int(H * 0.18)
    img = Image.new("RGBA", (W, band), (0, 0, 0, 170))
    d = ImageDraw.Draw(img)
    try:
        font = ImageFont.truetype(
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", max(20, H // 22)
        )
    except (OSError, ImportError):
        # Font file missing/unreadable or FreeType unavailable.
        font = ImageFont.load_default()

    lines = text.split("\n")
    boxes = [d.textbbox((0, 0), line, font=font) for line in lines]
    # One shared line height: per-line bbox heights differ with the glyphs
    # present, which made stacked lines unevenly spaced.
    line_h = max(box[3] - box[1] for box in boxes)
    top = (band - line_h * len(lines)) // 2

    for i, (line, box) in enumerate(zip(lines, boxes)):
        w = box[2] - box[0]
        d.text(
            ((W - w) // 2, top + i * line_h),
            line,
            fill="white",
            font=font,
            stroke_width=2,
            stroke_fill="black",
        )
    return np.array(img)
 def burn(video,subs):
+    tmp="noaudio.mp4"; out="RobotsMali_Subtitled.mp4"
+    # Correction V9: Forcer le FPS dès la lecture du clip
+    base=VideoFileClip(video, fps=None)
+    dur=base.duration
+    fps=base.fps
+    base = base.set_fps(fps)
+    W,H=base.size;
+    layers=[ImageClip(draw(t,W,H)).set_start(s).set_duration(e-s).set_pos(("center","bottom"))
+            for s,e,t in subs]
+    final=CompositeVideoClip([base]+layers).set_duration(dur)
+    final.write_videofile(tmp,codec="libx264",audio=False,fps=base.fps)
+    # Correction V8: Réencodage forcé + réinitialisation du timestamp (-ss 0)
+    os.system(f'ffmpeg -y -i "{tmp}" -i "{video}" -map 0:v -map 1:a -c:v libx264 -crf 23 -c:a aac -b:a 192k -r {fps} -t {dur} -ss 0 "{out}"')
     return out
def pipeline(video, model_name):
    """Run the full captioning flow for one video.

    Steps: read the video duration, extract and normalise the audio,
    transcribe it with the selected model, align the text (``align_ctc`` when
    the model's mode is ``"ctc"``, ``align_vad`` for every other mode) and
    burn the captions into the video.

    Args:
        video: path of the uploaded video.
        model_name: key of ``MODELS``.

    Returns:
        ``(status_message, output_path)``; ``output_path`` is ``None`` when no
        caption could be produced or an error occurred.
    """
    try:
        # Only the duration is needed here; close the clip right away.
        clip = VideoFileClip(video)
        try:
            dur = clip.duration
        finally:
            clip.close()

        # A private working directory per call: a fixed path in the shared
        # temp folder would be overwritten by a concurrent request, and the
        # intermediate WAV files are removed when the block exits.
        with tempfile.TemporaryDirectory() as workdir:
            wav = os.path.join(workdir, "asr.wav")
            extract_audio(video, wav)
            clean, audio, sr = clean_audio(wav)
            print(f"DEBUG TIME 1: Video duration (dur) = {dur:.4f}s")
            print(f"DEBUG TIME 2: Audio length (len(audio)/sr) = {len(audio)/sr:.4f}s")
            model = load_model(model_name)
            text = transcribe(model, clean)

        mode = MODELS[model_name][1]
        if mode == "ctc":
            subs = align_ctc(model, audio, sr, text, dur)
        else:
            subs = align_vad(text, audio, sr, dur)
        if not subs:
            return "⚠️ Aucun sous-titre utilisable", None
        out = burn(video, subs)
        return "✅ Terminé", out
    except Exception:
        # Top-level boundary for the UI callback: log the traceback and
        # report a generic failure to the user.
        traceback.print_exc()
        return "❌ Erreur — logs ci-dessus", None
# Gradio interface: a video input, a model selector and a button that runs
# pipeline() and shows its status message and output video.
with gr.Blocks(title="RobotsMali V37 Final") as demo:
    gr.Markdown("## ⚡ RobotsMali V37 — Sous-titrage Style Netflix (Production)")
    video_input = gr.Video()
    model_choice = gr.Dropdown(list(MODELS.keys()), value="Soloba V1 (CTC)")
    run_button = gr.Button("▶️ Générer")
    status_box = gr.Markdown()
    video_output = gr.Video()
    run_button.click(pipeline, [video_input, model_choice], [status_box, video_output])

# Start the server only when the file is executed as a script, so importing
# the module (tests, reload tooling) does not launch it.
if __name__ == "__main__":
    demo.launch(share=True, debug=False)