binaryMao committed
Commit b332947 · verified · 1 Parent(s): 77aec49

Update app.py

Files changed (1):
  1. app.py +65 -149
app.py CHANGED
@@ -1,7 +1,9 @@
 # -*- coding: utf-8 -*-
 """
-ROBOTSMALI — Bambara Subtitling
-
+ROBOTSMALI — Bambara Subtitling (V5.2 - Final Fix)
+- Webcam codec fix (VP8 -> H264)
+- Artistic interface (Gradio-compatible)
+- Example video integration
 """
 import os
 import shlex
@@ -20,13 +22,6 @@ from huggingface_hub import snapshot_download
 from nemo.collections import asr as nemo_asr
 import gradio as gr
 
-# Attempt to import the alignment library needed for CTC segmentation
-try:
-    from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
-    HAS_CTC_SEGMENTATION = True
-except ImportError:
-    HAS_CTC_SEGMENTATION = False
-
 # ---------------------------- # CONFIGURATION # ----------------------------
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 random.seed(1234)
@@ -35,7 +30,6 @@ torch.manual_seed(1234)
 
 SEGMENT_DURATION = 10.0
 
-# Full list of models
 MODELS = {
     "Soloni V1 (RNNT)": ("RobotsMali/soloni-114m-tdt-ctc-v1", "rnnt"),
     "Soloni V0 (RNNT)": ("RobotsMali/soloni-114m-tdt-ctc-v0", "rnnt"),
@@ -45,7 +39,6 @@ MODELS = {
     "QuartzNet V0 (CTC-char)": ("RobotsMali/stt-bm-quartznet15x5-v0", "ctc_char"),
 }
 
-# Example video (the one identified in your screenshot)
 VIDEO_EXAMPLES = [
     ["examples/MARALINKE-Wii (Lève-toi) Black lives matter (Clip officiel) - MARALINKE (360p, H264).mp4", "Soloba V1 (CTC)"]
 ]
@@ -55,18 +48,14 @@ _cache = {}
 # ---------------------------- # TECHNICAL FUNCTIONS # ----------------------------
 
 def run_cmd(cmd):
-    """Execute a shell command and raise on non-zero exit."""
-    print("RUN:", cmd)
     res = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
     if res.returncode != 0:
-        raise RuntimeError(f"Commande échouée [{cmd}]\nOutput:\n{res.stdout}")
+        raise RuntimeError(f"Erreur FFmpeg: {res.stdout}")
     return res.stdout
 
 def ffprobe_duration(path):
-    """Determine the video duration via ffprobe."""
     cmd = f'ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 {shlex.quote(path)}'
     out = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-    if out.returncode != 0: return None
     try: return float(out.stdout.strip())
     except: return None
 
@@ -75,7 +64,6 @@ def load_model(name):
     repo, mode = MODELS[name]
     folder = snapshot_download(repo, local_dir_use_symlinks=False)
     nemo_file = next((os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nemo")), None)
-
     if mode == "rnnt":
         model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo_file)
     elif mode == "ctc_char":
@@ -83,47 +71,29 @@ def load_model(name):
     else:
         try: model = nemo_asr.models.EncDecCTCModelBPE.restore_from(nemo_file)
         except: model = nemo_asr.models.EncDecCTCModel.restore_from(nemo_file)
-
     model.to(DEVICE).eval()
     _cache[name] = model
     return model
 
 def extract_audio(video_path, out_wav):
-    """Audio extraction with forced stabilization for webcam support (VP8 -> H264)."""
     tmp_fd, stabilized_mp4 = tempfile.mkstemp(suffix="_stabilized.mp4")
     os.close(tmp_fd)
-
-    # STEP 1: Re-encode to H.264 (required for MP4/webcam)
-    remux_cmd = (
-        f'ffmpeg -hide_banner -loglevel error -y '
-        f'-analyzeduration 2147483647 -probesize 2147483647 '
-        f'-i {shlex.quote(video_path)} '
-        f'-c:v libx264 -preset ultrafast -crf 23 -c:a aac '
-        f'{shlex.quote(stabilized_mp4)}'
-    )
-    run_cmd(remux_cmd)
-
-    # STEP 2: Extract 16 kHz mono WAV audio
-    extract_cmd = (
-        f'ffmpeg -hide_banner -loglevel error -y '
-        f'-i {shlex.quote(stabilized_mp4)} -vn -ac 1 -ar 16000 -f wav {shlex.quote(out_wav)}'
-    )
-    run_cmd(extract_cmd)
-
+    # Re-encode to H.264 to support webcam VP8
+    run_cmd(f'ffmpeg -hide_banner -loglevel error -y -i {shlex.quote(video_path)} -c:v libx264 -preset ultrafast -crf 23 -c:a aac {shlex.quote(stabilized_mp4)}')
+    run_cmd(f'ffmpeg -hide_banner -loglevel error -y -i {shlex.quote(stabilized_mp4)} -vn -ac 1 -ar 16000 -f wav {shlex.quote(out_wav)}')
     if os.path.exists(stabilized_mp4): os.remove(stabilized_mp4)
 
-def clean_audio(wav_path, target_sr=16000):
+def clean_audio(wav_path):
     audio, sr = sf.read(wav_path)
     if audio.ndim == 2: audio = audio.mean(axis=1)
-    if sr != target_sr:
-        audio = librosa.resample(audio.astype(float), orig_sr=sr, target_sr=target_sr)
+    if sr != 16000: audio = librosa.resample(audio.astype(float), orig_sr=sr, target_sr=16000)
    max_val = np.max(np.abs(audio)) if audio.size > 0 else 0.0
     if max_val > 1e-6: audio = audio / max_val * 0.9
-    clean_path = str(Path(wav_path).with_name(Path(wav_path).stem + "_clean.wav"))
-    sf.write(clean_path, audio, target_sr)
-    return clean_path, audio, target_sr
+    clean_path = wav_path.replace(".wav", "_clean.wav")
+    sf.write(clean_path, audio, 16000)
+    return clean_path, audio, 16000
 
-# ---------------------------- # SUBTITLING LOGIC # ----------------------------
+# ---------------------------- # BUSINESS LOGIC # ----------------------------
 
 def transcribe(model, wav_path):
     out = model.transcribe([wav_path])
@@ -132,83 +102,12 @@ def transcribe(model, wav_path):
         return res.text.strip() if hasattr(res, "text") else str(res).strip()
     return str(out).strip()
 
-def keep_bambara(words):
-    return [w for w in words if any(c in w.lower() for c in ["ɛ","ɔ","ŋ"]) or sum(1 for c in w.lower() if c in "aeiou") >= 2]
-
-MAX_CHARS = 45; MIN_DUR = 0.3; MAX_WORDS = 8
-
-def wrap2(txt):
-    parts = textwrap.wrap(txt, MAX_CHARS)
-    return "\n".join(parts) if len(parts) > 1 else txt
-
-def pack(spans, total):
-    if not spans: return []
-    merged = []
-    for s, e, t in spans:
-        s = max(0, min(s, total)); e = max(0, min(e, total))
-        if e <= s or not t.strip(): continue
-        if not merged: merged.append((s, e, t))
-        else:
-            ps, pe, pt = merged[-1]
-            if (e - s) < MIN_DUR or (s - pe) < 0.1:
-                merged[-1] = (ps, max(pe, e), (pt + " " + t).strip())
-            else: merged.append((s, e, t))
-
-    final = []
-    for s, e, t in merged:
-        words = t.split()
-        blocks = [" ".join(words[i:i+MAX_WORDS]) for i in range(0, len(words), MAX_WORDS)]
-        step = (e - s) / max(1, len(blocks))
-        for j, b in enumerate(blocks):
-            st = s + j * step; en = st + step
-            final.append((st, en, wrap2(b)))
-    return final
-
-def align_heuristic(words, total_dur):
-    if not words: return []
-    blocks = [" ".join(words[i:i+MAX_WORDS]) for i in range(0, len(words), MAX_WORDS)]
-    step = total_dur / len(blocks)
-    return [(i*step, (i+1)*step, b) for i, b in enumerate(blocks)]
-
-def segment_and_align(model, audio, sr, total_dur, mode):
-    segment_samples = int(SEGMENT_DURATION * sr)
-    all_subs = []
-    for i in range(0, len(audio), segment_samples):
-        start_s = i / sr
-        chunk = audio[i:i+segment_samples]
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tf:
-            sf.write(tf.name, chunk, sr)
-            text = transcribe(model, tf.name)
-        words = keep_bambara(text.split())
-        subs = align_heuristic(words, len(chunk)/sr)
-        for s, e, t in subs:
-            all_subs.append((s + start_s, e + start_s, t))
-    return pack(all_subs, total_dur)
-
-def burn(video_path, subs):
-    out_path = "RobotsMali_Subtitled.mp4"
-    with tempfile.NamedTemporaryFile(suffix=".srt", mode="w", encoding="utf-8", delete=False) as tf:
-        for i, (start, end, text) in enumerate(subs, 1):
-            def t_srt(sec):
-                h=int(sec//3600); m=int((sec%3600)//60); s=int(sec%60); ms=int((sec-int(sec))*1000)
-                return f"{h:02}:{m:02}:{s:02},{ms:03}"
-            tf.write(f"{i}\n{t_srt(start)} --> {t_srt(end)}\n{text}\n\n")
-        srt_name = tf.name
-
-    vf = f"subtitles={shlex.quote(srt_name)}:force_style='Fontsize=22,PrimaryColour=&HFFFFFF&,OutlineColour=&H000000&'"
-    cmd = f'ffmpeg -hide_banner -loglevel error -y -i {shlex.quote(video_path)} -vf {shlex.quote(vf)} -c:v libx264 -preset fast -crf 23 -c:a aac {shlex.quote(out_path)}'
-    run_cmd(cmd)
-    os.remove(srt_name)
-    return out_path
-
-# ---------------------------- # PIPELINE & INTERFACE # ----------------------------
-
 def pipeline(video_input, model_name):
     try:
         video_path = video_input["tmp_path"] if isinstance(video_input, dict) else video_input
-        if not video_path: return "❌ Aucune vidéo fournie", None
+        if not video_path: return "❌ Aucune vidéo détectée", None
 
-        yield "⏳ Phase 1/3 : Stabilisation et extraction audio...", None
+        yield "⏳ Phase 1 : Stabilisation et extraction...", None
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tf:
             wav_path = tf.name
 
@@ -216,60 +115,77 @@ def pipeline(video_input, model_name):
         clean_wav, audio, sr = clean_audio(wav_path)
         duration = ffprobe_duration(video_path) or (len(audio)/sr)
 
-        yield f"⏳ Phase 2/3 : Analyse IA avec {model_name}...", None
+        yield f"⏳ Phase 2 : Analyse avec {model_name}...", None
         model = load_model(model_name)
-        mode = MODELS[model_name][1]
-        subs = segment_and_align(model, audio, sr, duration, mode)
+        text = transcribe(model, clean_wav)
+        words = [w for w in text.split() if any(c in w.lower() for c in ["ɛ","ɔ","ŋ"]) or len(w) > 2]
 
-        if not subs: return "⚠️ Pas de parole détectée", None
+        if not words: return "⚠️ Pas de texte Bambara détecté.", None
 
-        yield "⏳ Phase 3/3 : Incrustation des sous-titres...", None
-        res_v = burn(video_path, subs)
-        return "✅ Traitement terminé avec succès", res_v
+        yield "⏳ Phase 3 : Création de la vidéo finale...", None
+        # Simple heuristic for subtitle timing
+        subs = []
+        chunk_size = 8
+        for i in range(0, len(words), chunk_size):
+            chunk = words[i:i+chunk_size]
+            s = (i / len(words)) * duration
+            e = (min(i + chunk_size, len(words)) / len(words)) * duration
+            subs.append((s, e, "\n".join(textwrap.wrap(" ".join(chunk), 40))))
+
+        out_v = burn(video_path, subs)
+        yield "✅ Terminé !", out_v
     except Exception as e:
         traceback.print_exc()
-        return f"❌ Erreur: {str(e)}", None
+        yield f"❌ Erreur: {str(e)}", None
+
+def burn(video_path, subs):
+    out_path = "RobotsMali_Final.mp4"
+    with tempfile.NamedTemporaryFile(suffix=".srt", mode="w", encoding="utf-8", delete=False) as tf:
+        for idx, (start, end, text) in enumerate(subs, 1):
+            def t_srt(sec):
+                h=int(sec//3600); m=int((sec%3600)//60); s=int(sec%60); ms=int((sec-int(sec))*1000)
+                return f"{h:02}:{m:02}:{s:02},{ms:03}"
+            tf.write(f"{idx}\n{t_srt(start)} --> {t_srt(end)}\n{text}\n\n")
+        srt_name = tf.name
+    vf = f"subtitles={shlex.quote(srt_name)}:force_style='Fontsize=22,PrimaryColour=&HFFFFFF&,OutlineColour=&H000000&'"
+    run_cmd(f'ffmpeg -hide_banner -loglevel error -y -i {shlex.quote(video_path)} -vf {shlex.quote(vf)} -c:v libx264 -crf 23 -c:a aac {shlex.quote(out_path)}')
+    os.remove(srt_name)
+    return out_path
+
+# ---------------------------- # GRADIO INTERFACE # ----------------------------
 
-# --- ARTISTIC CSS DESIGN ---
 custom_css = """
 body { background-color: #0b0e14; }
-.gradio-container { background: rgba(17, 25, 40, 0.8) !important; backdrop-filter: blur(12px); border-radius: 20px; border: 1px solid rgba(255, 255, 255, 0.1); padding: 25px !important; }
-#header { text-align: center; margin-bottom: 20px; }
-#header h1 { color: #facc15; font-size: 2.8rem; letter-spacing: 4px; margin-bottom: 0; }
-#header p { color: #94a3b8; font-style: italic; font-size: 1.1rem; }
-.gr-button-primary { background: linear-gradient(135deg, #059669, #10b981) !important; border: none !important; font-weight: bold !important; }
-.gr-button-primary:hover { transform: translateY(-2px); box-shadow: 0 5px 15px rgba(16, 185, 129, 0.4); }
+.gradio-container { background: rgba(17, 25, 40, 0.8) !important; backdrop-filter: blur(12px); border-radius: 20px; border: 1px solid rgba(255, 255, 255, 0.1); }
+#title-header { text-align: center; padding: 20px; }
+.gr-button-primary { background: linear-gradient(135deg, #059669, #10b981) !important; border: none !important; }
 """
 
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
-    with gr.Div(elem_id="header"):
+    # Use gr.Column instead of gr.Div to avoid the AttributeError
+    with gr.Column(elem_id="title-header"):
         gr.HTML("""
-            <h1>🤖 ROBOTSMALI</h1>
-            <p>Intelligence Artificielle & Sauvegarde de la Langue Bambara</p>
-            <div style="height: 3px; width: 80px; background: #facc15; margin: 15px auto;"></div>
+            <h1 style='color:#facc15; font-size: 2.5rem; margin:0;'>🤖 ROBOTSMALI</h1>
+            <p style='color:#94a3b8; font-style:italic;'>Système Expert de Sous-titrage Bambara</p>
+            <div style="height: 3px; width: 60px; background: #facc15; margin: 15px auto;"></div>
         """)
 
     with gr.Row():
         with gr.Column():
-            gr.Markdown("### 🎥 Source Vidéo")
+            gr.Markdown("### 📥 Source")
             v_in = gr.Video(label=None, mirror_webcam=False)
-            m_sel = gr.Dropdown(list(MODELS.keys()), value="Soloba V1 (CTC)", label="Cerveau ASR")
-            btn = gr.Button("🚀 GÉNÉRER LES SOUS-TITRES", variant="primary")
+            m_sel = gr.Dropdown(list(MODELS.keys()), value="Soloba V1 (CTC)", label="Modèle IA")
+            btn = gr.Button("🚀 GÉNÉRER", variant="primary")
 
         with gr.Column():
-            gr.Markdown("### 📺 Résultat")
-            status = gr.Markdown("*Prêt pour le traitement...*")
+            gr.Markdown("### 📤 Sortie")
+            status = gr.Markdown("*En attente...*")
             v_out = gr.Video(label=None)
 
-    gr.Examples(
-        examples=VIDEO_EXAMPLES,
-        inputs=[v_in, m_sel],
-        label="📺 Testez avec nos exemples"
-    )
-
-    gr.HTML("<div style='text-align: center; color: #475569; margin-top: 40px;'>© 2025 RobotsMali - Bamako, Mali</div>")
+    gr.Examples(examples=VIDEO_EXAMPLES, inputs=[v_in, m_sel], label="📺 Exemples")
+    gr.HTML("<div style='text-align: center; color: #475569; padding-top: 20px;'>© 2025 RobotsMali</div>")
 
     btn.click(pipeline, [v_in, m_sel], [status, v_out])
 
 if __name__ == "__main__":
     demo.launch(share=True, debug=True)
 
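In phase 2, the new pipeline filters the raw transcript inline rather than calling the removed keep_bambara helper: a token survives if it contains a Bambara-specific character (ɛ, ɔ, ŋ) or is longer than two characters. A quick illustration with sample tokens (the snippet is ours, not part of the commit):

words = "ne bɛ taa so la".split()
kept = [w for w in words if any(c in w.lower() for c in ["ɛ", "ɔ", "ŋ"]) or len(w) > 2]
print(kept)  # ['bɛ', 'taa']: two-letter tokens without ɛ/ɔ/ŋ are dropped

This is looser than the old keep_bambara, which required a special character or at least two plain vowels.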
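Phase 3 then assigns timestamps with a proportional heuristic: each 8-word chunk covers the slice of the total duration that its word indices span. A minimal standalone sketch of the same logic (the proportional_subs name is ours, for illustration only):

import textwrap

def proportional_subs(words, duration, chunk_size=8, width=40):
    # Each chunk's time span is proportional to its position in the word list.
    subs = []
    for i in range(0, len(words), chunk_size):
        chunk = words[i:i + chunk_size]
        s = (i / len(words)) * duration
        e = (min(i + chunk_size, len(words)) / len(words)) * duration
        subs.append((s, e, "\n".join(textwrap.wrap(" ".join(chunk), width))))
    return subs

# 12 words over 6 s: cue 1 covers 0.0-4.0 s (8 words), cue 2 covers 4.0-6.0 s (4 words)
print(proportional_subs([f"w{i}" for i in range(12)], 6.0))

This assumes speech fills the clip uniformly; long silences will pull cues away from the audio, the trade-off accepted when the commit dropped the ctc_segmentation-based alignment path.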
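The rewritten burn keeps the nested t_srt helper to format SRT timestamps as HH:MM:SS,mmm. A quick sanity check of its arithmetic (the asserts are ours):

def t_srt(sec):
    h=int(sec//3600); m=int((sec%3600)//60); s=int(sec%60); ms=int((sec-int(sec))*1000)
    return f"{h:02}:{m:02}:{s:02},{ms:03}"

assert t_srt(0) == "00:00:00,000"
assert t_srt(3661.5) == "01:01:01,500"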
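The progressive status messages reach the UI because pipeline is a generator and Gradio streams each yield to the components bound in btn.click(pipeline, [v_in, m_sel], [status, v_out]). A self-contained sketch of the same wiring (component names are illustrative):

import gradio as gr

def steps(name):
    # Each yield is pushed to (status, result) in the UI as it happens.
    yield f"⏳ Working on {name}...", None
    yield "✅ Done", f"Hello {name}"

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Name")
    status = gr.Markdown("*Waiting...*")
    out = gr.Textbox(label="Result")
    gr.Button("Go").click(steps, [inp], [status, out])

if __name__ == "__main__":
    demo.queue()  # older Gradio versions need the queue enabled for generator callbacks
    demo.launch()

Inside a generator, a bare return value only stops the iteration and is never rendered; that is why the success and error paths in the new pipeline end with yield rather than return.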