binaryMao committed on
Commit
224a0d9
·
verified ·
1 Parent(s): 857e7cb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +185 -141
app.py CHANGED
@@ -1,16 +1,10 @@
1
  # -*- coding: utf-8 -*-
2
  """
3
- ROBOTSMALI VIDEO CAPTIONING V21 (Stable)
4
- - Alignement parfait pour Soloba (CTC)
5
- - Découpage fluide pour Soloni (RNNT)
6
- - QuartzNet supporté sans crash
7
- - Filtrage Bambara phonétique (retire français)
8
- - Sous-titres style Netflix
9
- - Durée vidéo exacte (plus d'allongement)
10
- - Compatible Google Colab + Kali + Linux
11
  """
12
 
13
- import os, tempfile
14
  import numpy as np
15
  import torch
16
  import soundfile as sf
@@ -24,29 +18,28 @@ from nemo.collections import asr as nemo_asr
24
  from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
25
 
26
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
27
 
28
  MODELS = {
29
- "Soloni V1 (RNNT)": ("RobotsMali/soloni-114m-tdt-ctc-v1", "rnnt"),
30
- "Soloni V0 (RNNT)": ("RobotsMali/soloni-114m-tdt-ctc-v0", "rnnt"),
31
- "Soloba V1 (CTC)": ("RobotsMali/soloba-ctc-0.6b-v1", "ctc"),
32
- "Soloba V0 (CTC)": ("RobotsMali/soloba-ctc-0.6b-v0", "ctc"),
33
- "QuartzNet V1 (CTC)": ("RobotsMali/stt-bm-quartznet15x5-v1", "ctc"),
34
- "QuartzNet V0 (CTC)": ("RobotsMali/stt-bm-quartznet15x5-v0", "ctc"),
35
  }
36
-
37
- _model_cache = {}
38
- _vocab_cache = {}
39
 
40
  def load_model(name):
41
- if name in _model_cache:
42
- return _model_cache[name]
43
- repo, mode = MODELS[name]
44
- path = snapshot_download(repo, local_dir_use_symlinks=False)
45
- nemo = [os.path.join(path, f) for f in os.listdir(path) if f.endswith(".nemo")][0]
46
- model = nemo_asr.models.EncDecCTCModelBPE.restore_from(nemo) if mode=="ctc" \
47
- else nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo)
48
  model.to(DEVICE).eval()
49
- _model_cache[name] = model
50
  return model
51
 
52
  def extract_audio(video, wav):
@@ -54,133 +47,184 @@ def extract_audio(video, wav):
54
  wav, fps=16000, codec="pcm_s16le", ffmpeg_params=["-ac","1"], logger=None
55
  )
56
 
57
- def clean_audio(wav):
58
  audio, sr = sf.read(wav)
59
- if audio.ndim == 2: audio = audio.mean(1)
60
- audio,_ = librosa.effects.trim(audio, top_db=35)
61
- out = wav.replace(".wav","_clean.wav")
62
- sf.write(out, audio, sr)
63
- return out, audio, sr
64
-
65
- def transcribe(model, wav):
66
- o = model.transcribe([wav])[0]
67
- return o.text.strip() if hasattr(o,"text") else str(o).strip()
68
-
69
- # ---------- FILTRAGE BAMBARA ---------- #
70
- def keep_bambara_words(words):
71
- filtered=[]
 
 
 
72
  for w in words:
73
- w2=w.lower()
74
- if any(ch in w2 for ch in ["ɛ","ɔ","ŋ"]) or sum(c in "aeiou" for c in w2)>=2:
75
- filtered.append(w)
76
- return filtered
77
-
78
- MAX_WORDS=4; MAX_CHARS=45; MAX_DURATION=3.4
79
-
80
- def group(spans):
81
- subs=[]; buf=[]
82
- def push(b):
83
- if b: subs.append((b[0][0], b[-1][1], " ".join(x[2] for x in b)))
84
- for w in spans:
85
- test=buf+[w]; txt=" ".join(x[2] for x in test)
86
- dur=test[-1][1]-test[0][0]
87
- if len(test)>MAX_WORDS or len(txt)>MAX_CHARS or dur>MAX_DURATION:
88
- push(buf); buf=[w]
89
- else:
90
- buf=test
91
- push(buf); return subs
92
-
93
- # ---------- ALIGNEMENT CTC (Soloba + QuartzNet) ---------- #
94
- def align_ctc(model, audio, sr, text):
95
- words = keep_bambara_words(text.split())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  if not words: return []
97
- x = torch.tensor(audio).float().unsqueeze(0).to(DEVICE)
98
- ln = torch.tensor([x.shape[1]]).to(DEVICE)
99
- total = len(audio)/sr
100
- with torch.no_grad():
101
- logits, _ = model(input_signal=x, input_signal_length=ln)
102
- frames = logits.shape[1]
103
- if frames <= 2: return []
104
- vocab = list(model.tokenizer.vocab.keys())
105
- cfg = CtcSegmentationParameters(); cfg.char_list=vocab
106
-
107
- out = prepare_text(cfg, words)
108
- gt = out[0] if isinstance(out, (list,tuple)) else out
109
-
110
- timing, _, _ = ctc_segmentation(cfg, logits.cpu().numpy()[0], gt)
111
- tps = total / float(frames)
112
-
113
- spans=[]
114
- for i in range(len(words)):
115
- st=float(timing[i])*tps
116
- en=float(timing[i+1])*tps if i+1<len(timing) else total
117
- spans.append((st,en,words[i]))
118
- return group(spans)
119
-
120
- # ---------- ALIGNEMENT RNNT (Soloni) ---------- #
121
- def rnnt_vad(text, audio, sr):
122
- intervals = librosa.effects.split(audio, top_db=28)
123
- words = keep_bambara_words(text.split())
124
- if not intervals or not words:
125
- return [(0,len(audio)/sr,text)]
126
- spans=[]; idx=0
127
- total_audio=sum(e-s for s,e in intervals)
128
- for s,e in intervals:
129
- seg_d=(e-s)/sr
130
- k=max(1,int(len(words)*((e-s)/total_audio)))
131
  chunk=words[idx:idx+k]; idx+=k
132
  if not chunk: continue
133
- parts=[chunk[i:i+MAX_WORDS] for i in range(0,len(chunk),MAX_WORDS)]
134
- step=seg_d/len(parts); base=s/sr
135
- for j,p in enumerate(parts):
136
  st=base+j*step; en=base+(j+1)*step
137
- spans.append((st,en," ".join(p)))
138
- return group(spans)
139
-
140
- # ---------- RENDER SUBTITLES ---------- #
141
- def draw_sub(text,W,H):
142
- bg=Image.new("RGBA",(W,int(H*0.12)),(0,0,0,180))
143
- d=ImageDraw.Draw(bg)
144
- try: font=ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",size=max(18,H//18))
145
  except: font=ImageFont.load_default()
146
- box=d.textbbox((0,0),text,font)
147
- tw=box[2]-box[0]; th=box[3]-box[1]
148
- d.text(((W-tw)//2,(H*0.12-th)//2),text,font=font,fill="white")
149
- return bg
 
 
150
 
151
  def burn(video,subs):
152
- out="RobotsMali_Subtitled.mp4"
153
- base=VideoFileClip(video); W,H=base.size; dur=base.duration
154
- layers=[]
155
- for s,e,t in subs:
156
- s=max(0,min(s,dur)); e=max(0,min(e,dur))
157
- if e<=s: continue
158
- img=draw_sub(t.upper(),W,H)
159
- layers.append(ImageClip(np.array(img)).set_start(s).set_duration(e-s).set_pos(("center","bottom")))
160
- CompositeVideoClip([base]+layers).set_duration(dur).write_videofile(out,codec="libx264",audio_codec="aac",fps=base.fps)
 
 
 
 
 
 
 
 
 
161
  return out
162
 
163
- # ---------- PIPELINE ---------- #
164
- def pipeline(video, model_name):
165
  try:
166
- tmp=os.path.join(tempfile.gettempdir(),"audio.wav")
167
- extract_audio(video,tmp)
168
- clean,audio,sr=clean_audio(tmp)
 
 
 
 
 
 
 
 
169
  model=load_model(model_name)
170
  text=transcribe(model,clean)
171
  mode=MODELS[model_name][1]
172
- subs = align_ctc(model,audio,sr,text) if mode=="ctc" else rnnt_vad(text,audio,sr)
173
- if not subs: return "⚠️ Aucun sous-titre utilisable.",None
174
- return "✅ Terminé !", burn(video,subs)
175
- except Exception as e:
176
- return f"❌ ERREUR : {e}",None
177
-
178
- with gr.Blocks(title="RobotsMali V21 — Bambara Aligné") as demo:
179
- gr.Markdown("# ⚡ RobotsMali V21 — Sous-titrage Bambara Stable")
180
- video=gr.Video()
181
- model=gr.Dropdown(list(MODELS.keys()),value="Soloba V1 (CTC)")
182
- run=gr.Button("▶️ Générer")
183
- status=gr.Markdown(); out=gr.Video()
184
- run.click(pipeline,[video,model],[status,out])
185
-
186
- demo.launch(share=True)
 
 
 
 
 
 
 
1
  # -*- coding: utf-8 -*-
2
  """
3
+ ROBOTSMALI V37 FINAL — SOUS-TITRAGE BAMBARA (STYLE NETFLIX)
4
+ Correction V9 : Forçage du FPS du clip source pour stabiliser la durée.
 
 
 
 
 
 
5
  """
6
 
7
+ import os, tempfile, traceback, random, math, textwrap
8
  import numpy as np
9
  import torch
10
  import soundfile as sf
 
18
  from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
19
 
20
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
21
+ random.seed(1234); np.random.seed(1234); torch.manual_seed(1234)
22
 
23
  MODELS = {
24
+ "Soloni V1 (RNNT)": ("RobotsMali/soloni-114m-tdt-ctc-v1", "rnnt"),
25
+ "Soloni V0 (RNNT)": ("RobotsMali/soloni-114m-tdt-ctc-v0", "rnnt"),
26
+ "Soloba V1 (CTC)": ("RobotsMali/soloba-ctc-0.6b-v1", "ctc"),
27
+ "Soloba V0 (CTC)": ("RobotsMali/soloba-ctc-0.6b-v0", "ctc"),
28
+ "QuartzNet V1 (CTC-char)": ("RobotsMali/stt-bm-quartznet15x5-v1", "ctc_char"),
29
+ "QuartzNet V0 (CTC-char)": ("RobotsMali/stt-bm-quartznet15x5-v0", "ctc_char"),
30
  }
31
# Restored models keyed by their display name in MODELS, so each checkpoint
# is downloaded and deserialized at most once per process.
_cache = {}


def load_model(name):
    """Return the NeMo ASR model registered under ``name``, loading it once.

    Args:
        name: A key of ``MODELS``; its value is a ``(repo_id, mode)`` pair.

    Returns:
        The restored model, moved to ``DEVICE`` and switched to eval mode.

    Raises:
        KeyError: If ``name`` is not a key of ``MODELS``.
        FileNotFoundError: If the downloaded snapshot contains no ``.nemo`` file.
    """
    if name in _cache:
        return _cache[name]

    repo, mode = MODELS[name]
    folder = snapshot_download(repo, local_dir_use_symlinks=False)

    # Sort the listing so the same checkpoint is picked on every run even if
    # the snapshot happens to hold more than one .nemo archive.
    nemo_file = next(
        (os.path.join(folder, f) for f in sorted(os.listdir(folder)) if f.endswith(".nemo")),
        None,
    )
    if not nemo_file:
        raise FileNotFoundError("Aucun .nemo trouvé")

    # Pick the restore class from the declared mode. Character-level CTC
    # checkpoints ("ctc_char") carry no tokenizer section, so they must not
    # be restored through the BPE class.
    if mode == "rnnt":
        model_cls = nemo_asr.models.EncDecHybridRNNTCTCBPEModel
    elif mode == "ctc_char":
        model_cls = nemo_asr.models.EncDecCTCModel
    else:
        model_cls = nemo_asr.models.EncDecCTCModelBPE

    model = model_cls.restore_from(nemo_file)
    model.to(DEVICE).eval()
    _cache[name] = model
    return model
44
 
45
  def extract_audio(video, wav):
 
47
  wav, fps=16000, codec="pcm_s16le", ffmpeg_params=["-ac","1"], logger=None
48
  )
49
 
50
def clean_audio(wav, top_db=35):
    """Down-mix and peak-normalize a WAV file, writing the result next to it.

    Args:
        wav: Path of the audio file to read.
        top_db: Unused by this implementation; kept so existing callers that
            pass it keep working.

    Returns:
        A ``(clean_path, audio, sr)`` tuple: the path of the written file,
        the processed samples, and the sample rate reported by the reader.
    """
    audio, sr = sf.read(wav)

    # Two-dimensional data is averaged over axis 1 to get a single channel.
    if audio.ndim == 2:
        audio = audio.mean(1)

    # Scale so the loudest sample sits at 0.9; leave (near-)silent or empty
    # audio untouched to avoid dividing by ~0.
    max_val = np.max(np.abs(audio)) if audio.size > 0 else 0
    if max_val > 1e-6:
        audio = audio / max_val * 0.9

    # Build the output name from the real extension only. A plain
    # str.replace(".wav", ...) would also rewrite ".wav" inside directory
    # names and would overwrite the input when the suffix is absent.
    root, ext = os.path.splitext(wav)
    clean = f"{root}_clean{ext or '.wav'}"

    sf.write(clean, audio, sr)
    return clean, audio, sr
58
+
59
def transcribe(model, wav):
    """Run ``model.transcribe`` on one file and return the stripped text.

    The result is unwrapped until a single item remains: tuples and lists
    are reduced to their first element (some model families return a
    ``(best, all)`` tuple of lists rather than a flat list). The item's
    ``.text`` attribute is used when present, otherwise its ``str()``.

    Args:
        model: Object exposing ``transcribe(list_of_paths)``.
        wav: Path of the audio file to transcribe.

    Returns:
        The transcript with surrounding whitespace removed, or ``""`` when
        the model returned an empty container.
    """
    out = model.transcribe([wav])

    # Peel nested containers; an empty one means "no transcript", which must
    # not leak into the subtitles as the literal string "[]" or "()".
    while isinstance(out, (list, tuple)):
        if not out:
            return ""
        out = out[0]

    if hasattr(out, "text"):
        return out.text.strip()
    return str(out).strip()
65
+
66
def keep_bambara(words):
    """Filter ``words`` down to those that pass a simple spelling heuristic.

    A word is kept when its lower-cased form contains one of the letters
    ``ɛ``, ``ɔ``, ``ŋ``, or when at least two of its characters are among
    ``a e i o u``. Order and original casing are preserved.

    Args:
        words: Iterable of word strings.

    Returns:
        A new list with the words that passed the test.
    """
    special_letters = ["ɛ", "ɔ", "ŋ"]

    def passes(word):
        lowered = word.lower()
        if any(letter in lowered for letter in special_letters):
            return True
        vowel_count = sum(ch in "aeiou" for ch in lowered)
        return vowel_count >= 2

    return [word for word in words if passes(word)]
73
+
74
# Subtitle layout limits shared by the packing and alignment helpers.
MAX_CHARS = 45   # wrap width (characters) used to decide whether to split a cue
MIN_DUR = 0.3    # seconds
MAX_DUR = 3.2    # seconds
MAX_WORDS = 8    # words per subtitle block


def wrap2(txt):
    """Split ``txt`` into at most two lines at the space nearest its middle.

    Text that already fits within ``MAX_CHARS`` is returned unchanged. Text
    with no space at all is also returned unchanged, since there is no word
    boundary to break on.

    Args:
        txt: Subtitle text for one cue.

    Returns:
        ``txt`` itself, or two stripped halves joined by a newline.
    """
    if len(textwrap.wrap(txt, MAX_CHARS)) <= 1:
        return txt

    mid = len(txt) // 2
    left = txt.rfind(" ", 0, mid)   # closest space before the midpoint
    right = txt.find(" ", mid)      # closest space at/after the midpoint

    # Both searches return -1 when nothing is found. Using -1 as a slice
    # index would chop the final character onto its own line, so bail out.
    if left == -1 and right == -1:
        return txt

    # Prefer the nearer space; ties go to the left one.
    if right == -1 or (left != -1 and (mid - left) <= (right - mid)):
        cut = left
    else:
        cut = right

    first = txt[:cut].strip()
    second = txt[cut:].strip()
    if not first:
        return second
    if not second:
        return first
    return first + "\n" + second
85
+
86
def pack(spans, total):
    """Turn raw ``(start, end, text)`` spans into displayable subtitle cues.

    Three passes are applied:
      1. clamp every span to ``[0, total]`` and drop empty or zero-length ones;
      2. merge a span into its predecessor when it is shorter than
         ``MIN_DUR`` or starts less than 0.1 s after the predecessor ends;
      3. split each merged span into blocks of at most ``MAX_WORDS`` words,
         share its duration evenly between the blocks, wrap the text with
         ``wrap2`` and push start times forward so cues never overlap.

    Args:
        spans: Iterable of ``(start, end, text)`` tuples, times in seconds.
        total: Upper bound for every timestamp, in seconds.

    Returns:
        A list of ``(start, end, text)`` cues with ``0 <= start < end <= total``.
    """
    # Pass 1: clamp and discard unusable spans.
    cleaned = []
    for s, e, t in spans:
        s = max(0, min(s, total))
        e = max(0, min(e, total))
        t = t.strip()
        if e <= s or not t:
            continue
        cleaned.append((s, e, t))

    # Pass 2: glue very short or near-contiguous spans onto the previous one.
    merged = []
    for s, e, t in cleaned:
        if merged:
            ps, pe, pt = merged[-1]
            if (e - s) < MIN_DUR or (s - pe) < 0.1:
                merged[-1] = (ps, max(pe, e), (pt + " " + t).strip())
                continue
        merged.append((s, e, t))

    # Pass 3: split into word blocks and lay them out without overlap.
    out = []
    last_end = 0
    for s, e, t in merged:
        words = t.split()
        blocks = [" ".join(words[i:i + MAX_WORDS]) for i in range(0, len(words), MAX_WORDS)]
        step = (e - s) / max(1, len(blocks))

        base = s
        for b in blocks:
            st = base
            en = min(base + step, e)
            base = en

            # Give degenerate blocks a minimal visible duration.
            if en <= st:
                en = min(st + 0.05, total)

            # Never start before the previous cue has ended.
            if st < last_end:
                st = last_end + 1e-3
                en = max(en, st + 0.05)

            # The adjustments above can push a cue past the end of the video
            # or leave it empty; clamp it and skip it if nothing remains.
            en = min(en, total)
            if en <= st:
                continue

            out.append((st, en, wrap2(b)))
            last_end = en
    return out
121
+
122
def align_ctc(model, audio, sr, text, total_dur):
    """Align the filtered transcript to the model's CTC output.

    Args:
        model: ASR model; called with ``input_signal`` / ``input_signal_length``
            and expected to expose ``tokenizer.vocab``.
        audio: One-dimensional sequence of samples.
        sr: Sample rate (not used by this function; kept for call
            compatibility with the other aligner).
        text: Transcript to align.
        total_dur: Duration in seconds that the model's output frames are
            mapped onto.

    Returns:
        A list of ``(start, end, text)`` cues from ``pack``, or ``[]`` when no
        word survives filtering or the model yields too few frames.
    """
    words = keep_bambara(text.split())
    if not words:
        return []

    x = torch.tensor(audio).float().unsqueeze(0).to(DEVICE)
    ln = torch.tensor([x.shape[1]]).to(DEVICE)
    total = total_dur
    with torch.no_grad():
        logits = model(input_signal=x, input_signal_length=ln)[0]

    frames = logits.shape[1]
    print(f"DEBUG TIME 3: Logits frames = {frames}")
    # Too few frames cannot be segmented, and zero frames would divide by 0.
    if frames <= 2:
        return []
    tps = total / frames
    print(f"DEBUG TIME 4: Time per logit frame (tps) = {tps:.6f}s")

    raw = model.tokenizer.vocab
    vocab = list(raw.keys()) if isinstance(raw, dict) else list(raw)
    cfg = CtcSegmentationParameters()
    cfg.char_list = vocab
    gt = prepare_text(cfg, words)[0]
    timing, _, _ = ctc_segmentation(cfg, logits.detach().cpu().numpy()[0], gt)

    # NOTE(review): timing is indexed per word and scaled by tps here, as the
    # code always did; confirm against ctc_segmentation's return contract
    # (index granularity and units).
    count = len(timing)
    spans = []
    for i, word in enumerate(words):
        if i >= count:
            break
        start = float(timing[i]) * tps
        # The last word has no successor timestamp: let it run to the end
        # instead of indexing past the array.
        end = float(timing[i + 1]) * tps if i + 1 < count else total
        spans.append((start, end, word))
    return pack(spans, total)
141
+
142
def align_vad(text, audio, sr, total_dur, top_db=28):
    """Spread the filtered transcript over the non-silent parts of the audio.

    Words are handed out to the intervals returned by
    ``librosa.effects.split`` in proportion to each interval's length; the
    final interval receives every word not yet assigned so none is lost to
    rounding.

    Args:
        text: Transcript to place.
        audio: One-dimensional sequence of samples.
        sr: Sample rate, used to convert sample indices to seconds.
        total_dur: Upper bound for every timestamp, in seconds.
        top_db: Threshold forwarded to ``librosa.effects.split``.

    Returns:
        A list of ``(start, end, text)`` cues from ``pack``.
    """
    words = keep_bambara(text.split())
    total = total_dur
    iv = librosa.effects.split(audio, top_db=top_db)
    voiced = sum(e - s for s, e in iv)

    # No usable interval (or no words): one span over the whole duration.
    # All words are passed on; pack() does the MAX_WORDS splitting itself.
    if len(iv) == 0 or voiced <= 0 or not words:
        return pack([(0, total, " ".join(words))], total)

    spans = []
    idx = 0
    last = len(iv) - 1
    for pos, (s, e) in enumerate(iv):
        seg = e - s
        segt = seg / sr
        if pos == last:
            # Whatever is left goes to the final interval.
            k = len(words) - idx
        else:
            k = max(1, int(round(len(words) * (seg / voiced))))
        chunk = words[idx:idx + k]
        idx += k
        if not chunk:
            continue

        lines = [chunk[i:i + MAX_WORDS] for i in range(0, len(chunk), MAX_WORDS)]
        step = max(MIN_DUR, min(MAX_DUR, segt / len(lines)))
        base = s / sr
        for j, line_words in enumerate(lines):
            st = base + j * step
            en = base + (j + 1) * step
            spans.append((st, en, " ".join(line_words)))
    return pack(spans, total)
159
+
160
def draw(text, W, H):
    """Render ``text`` centered on a translucent band and return it as an array.

    Args:
        text: Cue text; ``"\\n"`` separates lines.
        W: Band width in pixels.
        H: Reference height in pixels; the band is 18 % of it and the font
            size is derived from it.

    Returns:
        The RGBA image converted with ``np.array``.
    """
    band = int(H * 0.18)
    img = Image.new("RGBA", (W, band), (0, 0, 0, 170))
    d = ImageDraw.Draw(img)

    # Fall back to Pillow's built-in font only when the font file cannot be
    # opened; any other error should surface instead of being hidden.
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", max(20, H // 22))
    except OSError:
        font = ImageFont.load_default()

    lines = text.split("\n")
    boxes = [d.textbbox((0, 0), line, font=font) for line in lines]

    # Use one line height for the whole cue so rows are evenly spaced even
    # when individual lines measure differently.
    line_h = max(box[3] - box[1] for box in boxes)
    top = (band - line_h * len(lines)) // 2

    for i, (line, box) in enumerate(zip(lines, boxes)):
        w = box[2] - box[0]
        d.text(
            ((W - w) // 2, top + i * line_h),
            line, fill="white", font=font, stroke_width=2, stroke_fill="black",
        )
    return np.array(img)
172
 
def burn(video, subs):
    """Overlay the subtitle cues on ``video`` and mux the original audio back.

    The composited picture is first written without sound to a temporary
    file, then ffmpeg re-encodes it together with the audio stream of the
    source, forcing the source frame rate and duration.

    Args:
        video: Path of the source video.
        subs: Iterable of ``(start, end, text)`` cues, times in seconds.

    Returns:
        The path of the subtitled file, ``"RobotsMali_Subtitled.mp4"``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Local import: the file's top-level import block is not touched here.
    import subprocess

    tmp = "noaudio.mp4"
    out = "RobotsMali_Subtitled.mp4"

    # VideoFileClip takes no ``fps`` keyword; the frame rate is read from the
    # clip and re-applied explicitly below.
    source = VideoFileClip(video)
    try:
        dur = source.duration
        fps = source.fps
        base = source.set_fps(fps)
        W, H = base.size

        layers = [
            ImageClip(draw(t, W, H)).set_start(s).set_duration(e - s).set_pos(("center", "bottom"))
            for s, e, t in subs
        ]
        final = CompositeVideoClip([base] + layers).set_duration(dur)
        final.write_videofile(tmp, codec="libx264", audio=False, fps=fps)
    finally:
        # Release the reader held on the source file.
        source.close()

    # Forced re-encode with the source frame rate and duration. The command
    # is an argument list (no shell), so a path containing quotes or shell
    # metacharacters cannot alter it. "1:a?" makes the audio mapping
    # optional, so a silent source does not abort the mux.
    cmd = [
        "ffmpeg", "-y",
        "-i", tmp,
        "-i", video,
        "-map", "0:v", "-map", "1:a?",
        "-c:v", "libx264", "-crf", "23",
        "-c:a", "aac", "-b:a", "192k",
        "-r", str(fps),
        "-t", str(dur),
        "-ss", "0",
        out,
    ]
    try:
        # check=True surfaces an ffmpeg failure instead of returning the path
        # of a file that was never written.
        subprocess.run(cmd, check=True)
    finally:
        if os.path.exists(tmp):
            os.remove(tmp)
    return out
193
 
194
def pipeline(video, model_name):
    """Run the whole subtitling flow for one uploaded video.

    Steps: read the video duration, extract and normalize the audio,
    transcribe it with the selected model, align the text, then burn the
    cues into the video.

    Args:
        video: Path of the input video.
        model_name: A key of ``MODELS``.

    Returns:
        A ``(status_message, output_path)`` pair; ``output_path`` is ``None``
        when no cue was produced or when any step raised.
    """
    try:
        # Only the duration is needed here; close the clip right away so its
        # reader is not left open.
        clip = VideoFileClip(video)
        try:
            dur = clip.duration
        finally:
            clip.close()

        # A private working directory per call keeps simultaneous requests
        # from overwriting each other's audio files.
        with tempfile.TemporaryDirectory() as workdir:
            wav = os.path.join(workdir, "asr.wav")

            extract_audio(video, wav)
            clean, audio, sr = clean_audio(wav)

            print(f"DEBUG TIME 1: Video duration (dur) = {dur:.4f}s")
            print(f"DEBUG TIME 2: Audio length (len(audio)/sr) = {len(audio)/sr:.4f}s")

            model = load_model(model_name)
            text = transcribe(model, clean)

        mode = MODELS[model_name][1]

        # Only the "ctc" mode goes through CTC alignment; every other mode
        # uses the silence-based aligner.
        if mode == "ctc":
            subs = align_ctc(model, audio, sr, text, dur)
        else:
            subs = align_vad(text, audio, sr, dur)

        if not subs:
            return "⚠️ Aucun sous-titre utilisable", None
        out = burn(video, subs)
        return "✅ Terminé", out
    except Exception:
        # UI boundary: log the traceback and report a generic failure.
        traceback.print_exc()
        return "❌ Erreur — logs ci-dessus", None
219
+
220
# Gradio front end: one video input, a model selector, a run button, and a
# status line plus the resulting video as outputs.
with gr.Blocks(title="RobotsMali V37 Final") as demo:
    gr.Markdown("## ⚡ RobotsMali V37 — Sous-titrage Style Netflix (Production)")

    video_in = gr.Video()
    model_choice = gr.Dropdown(list(MODELS), value="Soloba V1 (CTC)")
    run_button = gr.Button("▶️ Générer")
    status_box = gr.Markdown()
    video_out = gr.Video()

    # pipeline(video, model_name) -> (status message, output video path)
    run_button.click(
        pipeline,
        [video_in, model_choice],
        [status_box, video_out],
    )

demo.launch(share=True, debug=False)