binaryMao committed on
Commit
64e18a4
·
verified ·
1 Parent(s): 900f511

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +111 -161
app.py CHANGED
@@ -1,7 +1,7 @@
1
  # -*- coding: utf-8 -*-
2
  """
3
- ROBOTSMALI V37 FINAL — SOUS-TITRAGE BAMBARA (STYLE NETFLIX)
4
- Correction V12 : Alignement stable (RNNT = CTC precise, CTC/BPE = VAD)
5
  """
6
 
7
  import os, tempfile, traceback, random, textwrap
@@ -9,19 +9,18 @@ import numpy as np
9
  import torch
10
  import soundfile as sf
11
  import librosa
12
- from PIL import Image, ImageDraw, ImageFont
13
- import gradio as gr
14
  from huggingface_hub import snapshot_download
15
- from moviepy.editor import VideoFileClip, CompositeVideoClip, ImageClip
16
-
17
  from nemo.collections import asr as nemo_asr
18
- from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
 
19
 
20
  # ----------------------------
21
  # CONFIG
22
  # ----------------------------
23
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
24
- random.seed(1234); np.random.seed(1234); torch.manual_seed(1234)
 
 
25
 
26
  MODELS = {
27
  "Soloni V1 (RNNT)": ("RobotsMali/soloni-114m-tdt-ctc-v1", "rnnt"),
@@ -38,13 +37,12 @@ _cache = {}
38
  # MODEL LOADING
39
  # ----------------------------
40
  def load_model(name):
41
- if name in _cache:
42
- return _cache[name]
43
  repo, mode = MODELS[name]
44
  folder = snapshot_download(repo, local_dir_use_symlinks=False)
45
  nemo_file = next((os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nemo")), None)
46
  if not nemo_file:
47
- raise FileNotFoundError("Aucun fichier .nemo trouvé dans le repo du modèle.")
48
  model = (
49
  nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo_file)
50
  if mode == "rnnt"
@@ -58,9 +56,7 @@ def load_model(name):
58
  # AUDIO EXTRACTION & CLEANING
59
  # ----------------------------
60
  def extract_audio(video, wav):
61
- VideoFileClip(video).audio.write_audiofile(
62
- wav, fps=16000, codec="pcm_s16le", ffmpeg_params=["-ac", "1"], logger=None
63
- )
64
 
65
  def clean_audio(wav, top_db=35):
66
  audio, sr = sf.read(wav)
@@ -91,183 +87,137 @@ def transcribe(model, wav):
91
  # UTILITAIRES
92
  # ----------------------------
93
  def keep_bambara(words):
94
- res = []
95
  for w in words:
96
- wl = w.lower()
97
- if any(c in wl for c in ["ɛ", "ɔ", "ŋ"]) or sum(c in "aeiou" for c in wl) >= 2:
98
  res.append(w)
99
  return res
100
 
101
- MAX_CHARS = 45
102
- MIN_DUR = 0.3
103
- MAX_DUR = 3.2
104
- MAX_WORDS = 8
105
 
106
  def wrap2(txt):
107
- parts = textwrap.wrap(txt, MAX_CHARS)
108
- if len(parts) <= 1:
109
- return txt
110
- mid = len(txt) // 2
111
- left = txt.rfind(" ", 0, mid)
112
- right = txt.find(" ", mid)
113
- cut = left if (mid - left) <= (right - mid if right != -1 else 1e9) else right
114
- l1 = txt[:cut].strip()
115
- l2 = txt[cut:].strip()
116
- return l1 + "\n" + l2 if l2 else l1
117
-
118
- def pack(spans, total):
119
- tmp = []
120
- for s, e, t in spans:
121
- s = max(0, min(s, total))
122
- e = max(0, min(e, total))
123
- if e <= s or not t.strip():
124
- continue
125
- tmp.append((s, e, t.strip()))
126
-
127
- merged = []
128
  for seg in tmp:
129
- if not merged:
130
- merged.append(seg)
131
- continue
132
- ps, pe, pt = merged[-1]
133
- s, e, t = seg
134
- if (e - s) < MIN_DUR or (s - pe) < 0.1:
135
- merged[-1] = (ps, max(pe, e), (pt + " " + t).strip())
136
- else:
137
- merged.append(seg)
138
-
139
- out = []
140
- last_end = 0
141
- for s, e, t in merged:
142
- dur = e - s
143
- words = t.split()
144
- blocks = [" ".join(words[i:i + MAX_WORDS]) for i in range(0, len(words), MAX_WORDS)]
145
- step = dur / max(1, len(blocks))
146
- base = s
147
  for b in blocks:
148
- st = base
149
- en = min(base + step, e)
150
- base = en
151
- if en <= st:
152
- en = min(st + 0.05, total)
153
- txt = wrap2(b)
154
- if st < last_end:
155
- st = last_end + 1e-3
156
- en = max(en, st + 0.05)
157
- out.append((st, en, txt))
158
- last_end = en
159
  return out
160
 
161
  # ----------------------------
162
  # ALIGNEMENT SIMPLE (VAD)
163
  # ----------------------------
164
- def align_vad(text, audio, sr, total_dur, top_db=28):
165
- words = keep_bambara(text.split())
166
- total = total_dur
167
- iv = librosa.effects.split(audio, top_db=top_db)
168
- if len(iv) == 0 or not words:
169
- return pack([(0, total, " ".join(words[:MAX_WORDS]))], total)
170
- spans = []
171
- L = sum(e - s for s, e in iv)
172
- idx = 0
173
- for s, e in iv:
174
- seg = e - s
175
- segt = seg / sr
176
- k = max(1, int(round(len(words) * (seg / L))))
177
- chunk = words[idx:idx + k]
178
- idx += k
179
- if not chunk:
180
- continue
181
- lines = [chunk[i:i + MAX_WORDS] for i in range(0, len(chunk), MAX_WORDS)]
182
- step = max(MIN_DUR, min(MAX_DUR, segt / len(lines)))
183
- base = s / sr
184
- for j, ln in enumerate(lines):
185
- st = base + j * step
186
- en = base + (j + 1) * step
187
- spans.append((st, en, " ".join(ln)))
188
- return pack(spans, total)
189
-
190
- # ----------------------------
191
- # DESSIN SOUS-TITRES
192
- # ----------------------------
193
- def draw(text, W, H):
194
- band = int(H * 0.18)
195
- img = Image.new("RGBA", (W, band), (0, 0, 0, 170))
196
- d = ImageDraw.Draw(img)
197
- try:
198
- font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", max(20, H // 22))
199
- except:
200
- font = ImageFont.load_default()
201
- lines = text.split("\n")
202
- for i, line in enumerate(lines):
203
- bbox = d.textbbox((0, 0), line, font=font)
204
- w = bbox[2] - bbox[0]
205
- h = bbox[3] - bbox[1]
206
- d.text(((W - w) // 2, (band - (h * len(lines))) // 2 + i * h),
207
- line, fill="white", font=font, stroke_width=2, stroke_fill="black")
208
- return np.array(img)
209
-
210
- # ----------------------------
211
- # FUSION FINALE (FFmpeg)
212
  # ----------------------------
213
  def burn(video, subs):
214
- tmp_final = "temp_noaudio.mp4"
215
- out = "RobotsMali_Subtitled.mp4"
216
- base = VideoFileClip(video)
217
- dur = base.duration
218
- fps = base.fps
219
- W, H = base.size
220
- layers = [ImageClip(draw(t, W, H)).set_start(s).set_duration(e - s).set_pos(("center", "bottom")) for s, e, t in subs]
221
- final = CompositeVideoClip([base] + layers).set_duration(dur)
222
- final.write_videofile(tmp_final, codec="libx264", audio=False, fps=fps, logger=None)
223
- os.system(f'ffmpeg -y -i "{tmp_final}" -i "{video}" -map 0:v -map 1:a -c:v libx264 -crf 23 -c:a aac -b:a 192k -r {fps} -t {dur} "{out}"')
224
- if os.path.exists(tmp_final):
225
- os.remove(tmp_final)
226
- return out
227
 
228
  # ----------------------------
229
  # PIPELINE PRINCIPAL
230
  # ----------------------------
231
  def pipeline(video, model_name):
232
  try:
233
- wav = tempfile.gettempdir() + "/asr.wav"
234
- base = VideoFileClip(video)
235
- dur = base.duration
236
- extract_audio(video, wav)
237
- clean, audio, sr = clean_audio(wav)
238
-
239
- print(f"DEBUG: Durée vidéo = {dur:.2f}s, Audio = {len(audio)/sr:.2f}s")
240
-
241
- model = load_model(model_name)
242
- text = transcribe(model, clean)
243
- mode = MODELS[model_name][1]
244
-
245
- # RNNT → alignement CTC réel | sinon alignement simple (VAD)
246
- if mode == "rnnt":
247
- subs = align_ctc(model, audio, sr, text, dur)
 
 
 
 
 
 
 
248
  else:
249
- subs = align_vad(text, audio, sr, dur)
250
-
251
- if not subs:
252
- return "⚠️ Aucun sous-titre utilisable", None
253
-
254
- out = burn(video, subs)
255
- return "✅ Terminé avec succès", out
256
-
257
  except Exception:
258
  traceback.print_exc()
259
- return "❌ Erreur — voir logs ci-dessus", None
260
 
261
  # ----------------------------
262
  # INTERFACE GRADIO
263
  # ----------------------------
264
- with gr.Blocks(title="RobotsMali V37 Final") as demo:
265
- gr.Markdown("## ⚡ RobotsMali V37 — Sous-titrage Style Netflix (alignement stable)")
266
- v = gr.Video()
267
- m = gr.Dropdown(list(MODELS.keys()), value="Soloba V1 (CTC)")
268
  b = gr.Button("▶️ Générer")
269
  s = gr.Markdown()
270
- o = gr.Video()
 
271
  b.click(pipeline, [v, m], [s, o])
272
 
273
  demo.launch(share=True, debug=False)
 
1
  # -*- coding: utf-8 -*-
2
  """
3
+ ROBOTSMALI V38 FINAL — SOUS-TITRAGE BAMBARA (STYLE NETFLIX)
4
+ Correction V38 : Durée exacte, QuartzNet fonctionnel, pipeline simplifiée
5
  """
6
 
7
  import os, tempfile, traceback, random, textwrap
 
9
  import torch
10
  import soundfile as sf
11
  import librosa
 
 
12
  from huggingface_hub import snapshot_download
 
 
13
  from nemo.collections import asr as nemo_asr
14
+ import gradio as gr
15
+ from moviepy.editor import VideoFileClip
16
 
17
  # ----------------------------
18
  # CONFIG
19
  # ----------------------------
# Run on the GPU when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Seed the three random number generators with the same fixed value.
torch.manual_seed(1234)
np.random.seed(1234)
random.seed(1234)
24
 
25
  MODELS = {
26
  "Soloni V1 (RNNT)": ("RobotsMali/soloni-114m-tdt-ctc-v1", "rnnt"),
 
37
  # MODEL LOADING
38
  # ----------------------------
39
  def load_model(name):
40
+ if name in _cache: return _cache[name]
 
41
  repo, mode = MODELS[name]
42
  folder = snapshot_download(repo, local_dir_use_symlinks=False)
43
  nemo_file = next((os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nemo")), None)
44
  if not nemo_file:
45
+ raise FileNotFoundError(f"Aucun .nemo trouvé pour {name}")
46
  model = (
47
  nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo_file)
48
  if mode == "rnnt"
 
56
  # AUDIO EXTRACTION & CLEANING
57
  # ----------------------------
def extract_audio(video, wav):
    """Extract the audio track of ``video`` into ``wav`` as 16 kHz mono.

    Args:
        video: Path of the input video file.
        wav: Path of the audio file to write (overwritten if it exists).

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    import subprocess  # local import keeps this block self-contained

    # An argument list (no shell) prevents quotes, spaces or shell
    # metacharacters in the paths from being interpreted, and check=True
    # surfaces an ffmpeg failure here instead of later on a missing file.
    subprocess.run(
        ["ffmpeg", "-y", "-i", video, "-ar", "16000", "-ac", "1", "-vn", wav],
        check=True,
    )
 
 
60
 
61
  def clean_audio(wav, top_db=35):
62
  audio, sr = sf.read(wav)
 
87
  # UTILITAIRES
88
  # ----------------------------
def keep_bambara(words):
    """Keep only the words that pass the Bambara-likeness heuristic.

    A word is kept when its lowercase form contains one of the letters
    ``ɛ``, ``ɔ`` or ``ŋ``, or when it contains at least two characters
    from ``aeiou`` (repeated vowels count each time).

    Args:
        words: Iterable of word strings.

    Returns:
        A new list holding the kept words, in input order and with their
        original casing.
    """
    special_letters = ("ɛ", "ɔ", "ŋ")

    def looks_bambara(word):
        lowered = word.lower()
        if any(letter in lowered for letter in special_letters):
            return True
        return sum(ch in "aeiou" for ch in lowered) >= 2

    return [word for word in words if looks_bambara(word)]
96
 
# Subtitle layout limits used by wrap2, pack and align_vad.
MAX_CHARS = 45  # width handed to textwrap.wrap when deciding whether to split
MIN_DUR = 0.3  # seconds; spans shorter than this are merged into the previous one
MAX_DUR = 3.2  # seconds; upper clamp for the duration of one VAD subtitle line
MAX_WORDS = 8  # maximum number of words in one subtitle block
 
 
 
98
 
def wrap2(txt, max_chars=None):
    """Split a subtitle text into at most two lines of similar length.

    The text is returned unchanged when ``textwrap.wrap`` fits it on a
    single line of ``max_chars`` characters. Otherwise it is cut at the
    space closest to its middle (the left one wins a tie) and the two
    halves are joined with a newline.

    Args:
        txt: Subtitle text.
        max_chars: Line width used for the "fits on one line" decision.
            Defaults to the module-level ``MAX_CHARS``.

    Returns:
        ``txt`` itself, or the two stripped halves joined by ``"\\n"``.
    """
    limit = MAX_CHARS if max_chars is None else max_chars
    if len(textwrap.wrap(txt, limit)) <= 1:
        return txt

    mid = len(txt) // 2
    left = txt.rfind(" ", 0, mid)
    right = txt.find(" ", mid)
    candidates = [pos for pos in (left, right) if pos != -1]
    if not candidates:
        # No space to cut at (one very long token). Cutting at index -1
        # would split off the last character, so leave the text alone.
        return txt

    # min() keeps the first minimal element, so `left` wins a tie.
    cut = min(candidates, key=lambda pos: abs(pos - mid))
    first, second = txt[:cut].strip(), txt[cut:].strip()
    if first and second:
        return first + "\n" + second
    # Never emit a dangling newline when one half is empty.
    return first or second
108
+
def pack(spans, total):
    """Convert raw ``(start, end, text)`` spans into subtitle entries.

    Three passes are applied:

    1. Clamp each span to ``[0, total]`` and drop spans that are empty
       in time or text.
    2. Merge a span into the previous one when it is shorter than
       ``MIN_DUR`` or starts less than 0.1 s after the previous end.
    3. Split each merged span into blocks of at most ``MAX_WORDS`` words
       sharing its duration equally, wrap each block with ``wrap2`` and
       push a block forward when it would start before the previous end.

    Args:
        spans: Iterable of ``(start, end, text)`` tuples, times in seconds.
        total: Upper bound in seconds used for clamping.

    Returns:
        List of ``(start, end, text)`` tuples.
    """
    # Pass 1: clamp and filter.
    cleaned = []
    for start, end, text in spans:
        start = max(0, min(start, total))
        end = max(0, min(end, total))
        stripped = text.strip()
        if end <= start or not stripped:
            continue
        cleaned.append((start, end, stripped))

    # Pass 2: merge short or nearly contiguous spans into their predecessor.
    merged = []
    for start, end, text in cleaned:
        if merged:
            prev_start, prev_end, prev_text = merged[-1]
            too_short = (end - start) < MIN_DUR
            too_close = (start - prev_end) < 0.1
            if too_short or too_close:
                joined = (prev_text + " " + text).strip()
                merged[-1] = (prev_start, max(prev_end, end), joined)
                continue
        merged.append((start, end, text))

    # Pass 3: cut into word blocks and lay them out without overlap.
    result = []
    last_end = 0
    for start, end, text in merged:
        tokens = text.split()
        chunks = [
            " ".join(tokens[i:i + MAX_WORDS])
            for i in range(0, len(tokens), MAX_WORDS)
        ]
        step = (end - start) / max(1, len(chunks))
        cursor = start
        for chunk in chunks:
            begin = cursor
            finish = min(cursor + step, end)
            cursor = finish
            if finish <= begin:
                finish = min(begin + 0.05, total)
            wrapped = wrap2(chunk)
            if begin < last_end:
                begin = last_end + 1e-3
                finish = max(finish, begin + 0.05)
            result.append((begin, finish, wrapped))
            last_end = finish
    return result
134
 
135
  # ----------------------------
136
  # ALIGNEMENT SIMPLE (VAD)
137
  # ----------------------------
def align_vad(text, audio, sr, total_dur, top_db=28):
    """Spread the transcribed words over the non-silent parts of the audio.

    The words kept by ``keep_bambara`` are distributed over the intervals
    returned by ``librosa.effects.split`` in proportion to each interval's
    length. Within an interval the words are grouped into lines of at most
    ``MAX_WORDS`` words, each lasting between ``MIN_DUR`` and ``MAX_DUR``
    seconds. The result is post-processed by ``pack``.

    Args:
        text: Transcription as a single string.
        audio: Audio samples passed to ``librosa.effects.split``.
        sr: Sample rate, used to convert interval bounds to seconds.
        total_dur: Total duration in seconds, forwarded to ``pack``.
        top_db: Forwarded to ``librosa.effects.split``.

    Returns:
        The list of ``(start, end, text)`` tuples produced by ``pack``.
    """
    words = keep_bambara(text.split())
    total = total_dur
    intervals = librosa.effects.split(audio, top_db=top_db)
    if len(intervals) == 0 or not words:
        return pack([(0, total, " ".join(words[:MAX_WORDS]))], total)

    spans = []
    voiced = sum(end - start for start, end in intervals)
    cursor = 0
    last = len(intervals) - 1
    for pos, (start, end) in enumerate(intervals):
        if cursor >= len(words):
            break  # every word has been placed
        seg = end - start
        seg_seconds = seg / sr
        share = max(1, int(round(len(words) * (seg / voiced))))
        if pos == last:
            # Rounding the shares can leave words unassigned; give them to
            # the final interval instead of silently dropping them.
            chunk = words[cursor:]
        else:
            chunk = words[cursor:cursor + share]
        cursor += len(chunk)

        lines = [chunk[i:i + MAX_WORDS] for i in range(0, len(chunk), MAX_WORDS)]
        step = max(MIN_DUR, min(MAX_DUR, seg_seconds / len(lines)))
        base = start / sr
        for j, line in enumerate(lines):
            spans.append((base + j * step, base + (j + 1) * step, " ".join(line)))
    return pack(spans, total)
155
+
156
+ # ----------------------------
157
+ # SOUS-TITRES SRT + FFmpeg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  # ----------------------------
def sec_to_srt(t):
    """Format a time in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds first, so 1.001 becomes
    ``00:00:01,001`` rather than being truncated to ``,000`` by float
    error. Negative values are clamped to zero.
    """
    total_ms = max(0, int(round(float(t) * 1000)))
    hours, rest = divmod(total_ms, 3600000)
    minutes, rest = divmod(rest, 60000)
    seconds, millis = divmod(rest, 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"


def burn(video, subs):
    """Burn subtitles into ``video`` with ffmpeg and return the output path.

    The subtitles are written to a temporary SRT file, rendered onto the
    video with ffmpeg's ``subtitles`` filter, and the temporary file is
    removed afterwards, including when ffmpeg fails.

    Args:
        video: Path of the input video.
        subs: Iterable of ``(start, end, text)`` tuples, times in seconds.

    Returns:
        The output file name, ``"RobotsMali_Subtitled.mp4"``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    import subprocess  # local import keeps this block self-contained

    out_file = "RobotsMali_Subtitled.mp4"
    # mkstemp creates the file atomically; mktemp only returns a name that
    # another process could claim first.
    fd, tmp_srt = tempfile.mkstemp(suffix=".srt")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            for i, (start, end, text) in enumerate(subs, 1):
                f.write(f"{i}\n{sec_to_srt(start)} --> {sec_to_srt(end)}\n{text}\n\n")

        # A video filter requires re-encoding: ffmpeg refuses to combine
        # -vf with "-c:v copy", so the video stream is encoded with libx264.
        # NOTE(review): the SRT path is inserted into the filter string
        # unescaped; paths containing ':' or quotes (e.g. Windows drive
        # letters) would need filter-level escaping.
        subprocess.run(
            [
                "ffmpeg", "-y", "-i", video,
                "-vf", f"subtitles={tmp_srt}",
                "-c:v", "libx264", "-crf", "23",
                "-c:a", "aac", "-b:a", "192k",
                out_file,
            ],
            check=True,
        )
    finally:
        if os.path.exists(tmp_srt):
            os.remove(tmp_srt)
    return out_file
173
 
174
  # ----------------------------
175
  # PIPELINE PRINCIPAL
176
  # ----------------------------
def pipeline(video, model_name):
    """Run the subtitling pipeline on one video.

    Steps: extract the audio, clean it, transcribe it with the selected
    model, align the words in time (CTC segmentation for models whose mode
    is ``"rnnt"``, VAD-based alignment otherwise) and burn the subtitles
    into the video.

    Args:
        video: Path of the input video.
        model_name: Key of ``MODELS`` selecting the ASR model.

    Returns:
        A ``(status_message, output_path)`` tuple; ``output_path`` is
        ``None`` when no subtitle could be produced or an error occurred.
        Exceptions are not propagated: the traceback is printed and an
        error status is returned.
    """
    wav = None
    try:
        # mkstemp reserves the name atomically; ffmpeg overwrites it (-y).
        fd, wav = tempfile.mkstemp(suffix=".wav")
        os.close(fd)

        # Read the duration once and close the clip so its reader is
        # released, instead of opening the video again for every use.
        clip = VideoFileClip(video)
        try:
            duration = clip.duration
        finally:
            clip.close()

        extract_audio(video, wav)
        clean, audio, sr = clean_audio(wav)
        model = load_model(model_name)
        text = transcribe(model, clean)
        mode = MODELS[model_name][1]

        if mode == "rnnt":
            # Imported lazily so the dependency is only needed on this path.
            from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text

            words = keep_bambara(text.split())
            if not words:
                return "⚠️ Aucun sous-titre utilisable", None

            x = torch.tensor(audio).float().unsqueeze(0).to(DEVICE)
            ln = torch.tensor([x.shape[1]]).to(DEVICE)
            with torch.no_grad():
                # NOTE(review): assumes the first value returned by the
                # model call is a (batch, time, vocab) tensor usable by
                # ctc_segmentation — confirm for the hybrid RNNT model.
                logits = model(input_signal=x, input_signal_length=ln)[0]
            tps = duration / logits.shape[1]  # seconds per output frame

            raw = model.tokenizer.vocab
            vocab = list(raw.keys()) if isinstance(raw, dict) else list(raw)
            cfg = CtcSegmentationParameters()
            cfg.char_list = vocab
            gt = prepare_text(cfg, words)[0]
            timing, _, _ = ctc_segmentation(cfg, logits.detach().cpu().numpy()[0], gt)

            spans = []
            for i, word in enumerate(words):
                if i >= len(timing):
                    break  # no timing available for the remaining words
                # Fall back to the video end when there is no following
                # boundary, instead of indexing past the end of `timing`.
                end = timing[i + 1] * tps if i + 1 < len(timing) else duration
                spans.append((timing[i] * tps, end, word))
            subs = pack(spans, duration)
        else:
            subs = align_vad(text, audio, sr, duration)

        if not subs:
            return "⚠️ Aucun sous-titre utilisable", None
        out = burn(video, subs)
        return "✅ Terminé avec succès", out
    except Exception:
        # Top-level boundary for the UI callback: log and report.
        traceback.print_exc()
        return "❌ Erreur — voir logs ci-dessus", None
    finally:
        # The extracted WAV is only needed during this call.
        if wav and os.path.exists(wav):
            os.remove(wav)
209
 
210
  # ----------------------------
211
  # INTERFACE GRADIO
212
  # ----------------------------
# Build the Gradio page: a video and a model choice go in, a status message
# and the subtitled video come out.
with gr.Blocks(title="RobotsMali V38 Final") as demo:
    gr.Markdown("## ⚡ RobotsMali V38 — Sous-titrage Style Netflix (QuartzNet & RNNT stable)")

    # Inputs
    v = gr.Video(label="Vidéo à sous-titrer")
    m = gr.Dropdown(choices=list(MODELS), value="Soloba V1 (CTC)", label="Modèle ASR")
    b = gr.Button("▶️ Générer")

    # Outputs
    s = gr.Markdown()
    o = gr.Video(label="Vidéo sous-titrée")

    # Clicking the button runs the pipeline on (video, model).
    b.click(fn=pipeline, inputs=[v, m], outputs=[s, o])

demo.launch(share=True, debug=False)