binaryMao committed on
Commit
4281210
·
verified ·
1 Parent(s): d7541cf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -39
app.py CHANGED
@@ -1,10 +1,10 @@
1
  import os, warnings, logging, tempfile
2
 
3
- # --- Suppression warnings inutile ---
4
  warnings.filterwarnings("ignore")
5
  logging.getLogger("nemo_logger").setLevel(logging.ERROR)
6
 
7
- # --- CPU compatibility for HF Spaces ---
8
  os.environ["NEMO_FORCE_CPU"] = "1"
9
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
10
 
@@ -14,22 +14,33 @@ torch.set_grad_enabled(False)
14
  import gradio as gr
15
  import numpy as np
16
  import soundfile as sf
 
 
 
 
 
17
  from moviepy.editor import VideoFileClip, CompositeVideoClip, TextClip
18
  from nemo.collections import asr as nemo_asr
19
 
20
- # --- UNLOCK IMAGEMAGICK POLICY (Required on HF) ---
21
- for p in ("/etc/ImageMagick/policy.xml", "/etc/ImageMagick-6/policy.xml"):
22
- if os.path.exists(p):
23
- os.system(
24
- 'sed -i "s/rights=\\"none\\"/rights=\\"read|write\\"/g" "{}"'.format(p)
25
- )
 
 
 
 
 
 
 
26
 
27
 
28
  # ---------------- CONFIG ---------------- #
29
 
30
  SR = 16000
31
- MAX_VIDEO_BYTES = 200_000_000 # 200MB limite
32
- TITLE = "RobotsMali Caption Studio — Sous-titrage Bambara Automatique"
33
 
34
  ASR_MODELS = {
35
  "Soloba CTC 0.6B V0": "RobotsMali/soloba-ctc-0.6b-v0",
@@ -56,22 +67,23 @@ def load_model(name):
56
  return model, device
57
 
58
 
59
- # ---------------- AUDIO EXTRACTION (FORCE MONO) ---------------- #
60
 
61
  def extract_audio(video_path, wav_path):
62
  if os.path.getsize(video_path) > MAX_VIDEO_BYTES:
63
- raise RuntimeError("⚠️ Vidéo trop lourde (>200MB). Compressez puis réessayez.")
64
 
 
65
  os.system(f"ffmpeg -y -i '{video_path}' -ac 1 -ar {SR} -vn '{wav_path}' >/dev/null 2>&1")
66
  audio, sr = sf.read(wav_path)
67
 
68
  if sr == 0 or len(audio) == 0:
69
- raise RuntimeError("⚠️ Audio introuvable ou illisible.")
70
 
71
- return len(audio) / sr
72
 
73
 
74
- # ---------------- TRANSCRIBE (UNIFIÉ + SAFE) ---------------- #
75
 
76
  def transcribe(model, device, wav_path, model_key):
77
  audio, sr = sf.read(wav_path)
@@ -81,14 +93,14 @@ def transcribe(model, device, wav_path, model_key):
81
  if np.max(np.abs(audio)) > 1:
82
  audio = audio / np.max(np.abs(audio))
83
 
84
- total_s = len(audio) / sr if sr else 0
85
  if total_s <= 0:
86
  return []
87
 
88
  x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
89
  ln = torch.tensor([x.shape[1]]).to(device)
90
 
91
- # ---- Soloni: timestamps réels ---- #
92
  if "Soloni" in model_key and hasattr(model, "decode_and_align"):
93
  try:
94
  with torch.no_grad():
@@ -101,19 +113,16 @@ def transcribe(model, device, wav_path, model_key):
101
  encoded_lengths=plen
102
  )
103
  hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
 
104
  if hasattr(hyp, "words") and hyp.words:
105
  return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
106
  except:
107
- pass
108
 
109
- # ---- Fallback universel (Soloba + QuartzNet + backup Soloni) ---- #
110
  out = model.transcribe([wav_path])[0]
111
 
112
- if hasattr(out, "text"):
113
- text = out.text.strip()
114
- else:
115
- text = str(out).strip()
116
-
117
  if not text:
118
  return []
119
 
@@ -123,12 +132,12 @@ def transcribe(model, device, wav_path, model_key):
123
 
124
  wps = max(2.0, len(words) / total_s)
125
  subs, t = [], 0
 
126
  for w in words:
127
  d = 1 / wps
128
- subs.append((t, min(total_s, t + d), w))
129
  t += d
130
- if t >= total_s:
131
- break
132
 
133
  return subs
134
 
@@ -143,18 +152,17 @@ def burn(video_path, subs):
143
 
144
  layers = []
145
  for s, e, w in subs:
146
- if e <= s:
147
- continue
148
  txt = TextClip(
149
  w.upper(),
150
  fontsize=int(H/20),
151
- font="DejaVu-Sans", # ✅ Police stable
152
  color="white",
153
  stroke_color="black",
154
  stroke_width=2,
155
  method="caption",
156
  size=(int(W*0.9), None)
157
- ).set_start(s).set_duration(e - s).set_position(("center", int(H*0.88)))
158
  layers.append(txt)
159
 
160
  final = CompositeVideoClip([clip] + layers)
@@ -172,24 +180,24 @@ def burn(video_path, subs):
172
  # ---------------- PIPELINE ---------------- #
173
 
174
  def pipeline(video, model_name, progress=gr.Progress()):
175
- progress(0.3, "Chargement du modèle…")
176
  model, device = load_model(model_name)
177
 
178
  with tempfile.TemporaryDirectory() as td:
179
  wav = f"{td}/audio.wav"
180
- progress(0.5, "Extraction audio…")
181
  extract_audio(video, wav)
182
 
183
- progress(0.75, "Transcription…")
184
  subs = transcribe(model, device, wav, model_name)
185
  if not subs:
186
  return "⚠️ Aucun mot détecté.", None
187
 
188
- progress(0.95, "Incrustation…")
189
  out = burn(video, subs)
190
 
191
- progress(1.0, "✅ Terminé")
192
- return f"✅ Sous-titrage terminé ({model_name})", out
193
 
194
 
195
  # ---------------- UI ---------------- #
@@ -200,9 +208,9 @@ h1 { text-align:center; font-weight:800; color:#005BFF; margin-bottom:6px; }
200
  .gr-button { background:#005BFF !important; color:white !important; border-radius:8px; font-weight:700; }
201
  """
202
 
203
- with gr.Blocks(css=CSS, title=TITLE) as demo:
204
  gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Sous-titrage Automatique en Bambara</p>")
205
- video = gr.File(label="🎥 Importer une vidéo (max 200MB)", type="filepath")
206
  model = gr.Dropdown(list(ASR_MODELS.keys()), value="Soloni 114M TDT CTC V1", label="🧠 Modèle ASR")
207
  run = gr.Button("🚀 Générer les sous-titres")
208
  status = gr.Markdown()
 
1
  import os, warnings, logging, tempfile
2
 
3
+ # === STOP useless warnings ===
4
  warnings.filterwarnings("ignore")
5
  logging.getLogger("nemo_logger").setLevel(logging.ERROR)
6
 
7
+ # === CPU fallback for HuggingFace ===
8
  os.environ["NEMO_FORCE_CPU"] = "1"
9
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
10
 
 
14
  import gradio as gr
15
  import numpy as np
16
  import soundfile as sf
17
+
18
+ # === Force MoviePy to use ImageMagick ===
19
+ import moviepy.config as mpconf
20
+ mpconf.change_settings({"IMAGEMAGICK_BINARY": "/usr/bin/convert"})
21
+
22
  from moviepy.editor import VideoFileClip, CompositeVideoClip, TextClip
23
  from nemo.collections import asr as nemo_asr
24
 
25
+
26
+ # === FIX IMAGEMAGICK POLICY (Required on HF Spaces) ===
27
def unlock_imagemagick(policies=None):
    """Relax ImageMagick policy files so that blocked coders become usable.

    Every occurrence of ``rights="none"`` in each existing policy file is
    rewritten to ``rights="read|write"``. The operation is best-effort: files
    that are missing, unreadable or not writable are skipped.

    Args:
        policies: Iterable of policy file paths to patch. Defaults to the two
            standard locations under ``/etc``.

    Returns:
        List of the paths that were actually modified.
    """
    if policies is None:
        policies = (
            "/etc/ImageMagick/policy.xml",
            "/etc/ImageMagick-6/policy.xml",
        )

    patched = []
    for path in policies:
        if not os.path.exists(path):
            continue
        print(f"⚙️ Patching ImageMagick security: {path}")
        try:
            # Pure-Python rewrite: no dependency on a `sed` binary, no shell
            # quoting issues, and failures are reported instead of ignored.
            with open(path, encoding="utf-8") as fh:
                original = fh.read()
            relaxed = original.replace('rights="none"', 'rights="read|write"')
            if relaxed == original:
                continue  # nothing to unlock in this file
            with open(path, "w", encoding="utf-8") as fh:
                fh.write(relaxed)
        except (OSError, UnicodeError) as err:
            # Keep the best-effort contract: never abort start-up on this.
            print(f"⚠️ Could not patch {path}: {err}")
            continue
        patched.append(path)
    return patched
36
+
37
+ unlock_imagemagick()
38
 
39
 
40
  # ---------------- CONFIG ---------------- #
41
 
42
  SR = 16000
43
+ MAX_VIDEO_BYTES = 200_000_000 # Max 200MB video upload
 
44
 
45
  ASR_MODELS = {
46
  "Soloba CTC 0.6B V0": "RobotsMali/soloba-ctc-0.6b-v0",
 
67
  return model, device
68
 
69
 
70
+ # ---------------- EXTRACT AUDIO (FORCE MONO) ---------------- #
71
 
72
  def extract_audio(video_path, wav_path):
73
  if os.path.getsize(video_path) > MAX_VIDEO_BYTES:
74
+ raise RuntimeError("⚠️ Vidéo trop lourde (>200MB). Compressez avant l’upload.")
75
 
76
+ # Force mono + 16kHz → prevents all ASR crashes
77
  os.system(f"ffmpeg -y -i '{video_path}' -ac 1 -ar {SR} -vn '{wav_path}' >/dev/null 2>&1")
78
  audio, sr = sf.read(wav_path)
79
 
80
  if sr == 0 or len(audio) == 0:
81
+ raise RuntimeError("⚠️ Impossible de lire l’audio.")
82
 
83
+ return len(audio)/sr
84
 
85
 
86
+ # ---------------- TRANSCRIBE (UNIFIED & SAFE) ---------------- #
87
 
88
  def transcribe(model, device, wav_path, model_key):
89
  audio, sr = sf.read(wav_path)
 
93
  if np.max(np.abs(audio)) > 1:
94
  audio = audio / np.max(np.abs(audio))
95
 
96
+ total_s = len(audio)/sr if sr else 0
97
  if total_s <= 0:
98
  return []
99
 
100
  x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
101
  ln = torch.tensor([x.shape[1]]).to(device)
102
 
103
+ # === SOLONI true timestamps ===
104
  if "Soloni" in model_key and hasattr(model, "decode_and_align"):
105
  try:
106
  with torch.no_grad():
 
113
  encoded_lengths=plen
114
  )
115
  hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
116
+
117
  if hasattr(hyp, "words") and hyp.words:
118
  return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
119
  except:
120
+ pass # fallback auto
121
 
122
+ # === UNIVERSAL FALLBACK (Soloba + QuartzNet + backup Soloni) ===
123
  out = model.transcribe([wav_path])[0]
124
 
125
+ text = out.text.strip() if hasattr(out, "text") else str(out).strip()
 
 
 
 
126
  if not text:
127
  return []
128
 
 
132
 
133
  wps = max(2.0, len(words) / total_s)
134
  subs, t = [], 0
135
+
136
  for w in words:
137
  d = 1 / wps
138
+ subs.append((t, min(total_s, t+d), w))
139
  t += d
140
+ if t >= total_s: break
 
141
 
142
  return subs
143
 
 
152
 
153
  layers = []
154
  for s, e, w in subs:
155
+ if e <= s: continue
 
156
  txt = TextClip(
157
  w.upper(),
158
  fontsize=int(H/20),
159
+ font="DejaVu-Sans", # ✅ Stable Linux font
160
  color="white",
161
  stroke_color="black",
162
  stroke_width=2,
163
  method="caption",
164
  size=(int(W*0.9), None)
165
+ ).set_start(s).set_duration(e-s).set_position(("center", int(H*0.88)))
166
  layers.append(txt)
167
 
168
  final = CompositeVideoClip([clip] + layers)
 
180
  # ---------------- PIPELINE ---------------- #
181
 
182
  def pipeline(video, model_name, progress=gr.Progress()):
183
+ progress(0.3, "📦 Chargement du modèle…")
184
  model, device = load_model(model_name)
185
 
186
  with tempfile.TemporaryDirectory() as td:
187
  wav = f"{td}/audio.wav"
188
+ progress(0.5, "🔊 Extraction audio…")
189
  extract_audio(video, wav)
190
 
191
+ progress(0.75, "🧠 Transcription en cours…")
192
  subs = transcribe(model, device, wav, model_name)
193
  if not subs:
194
  return "⚠️ Aucun mot détecté.", None
195
 
196
+ progress(0.95, "🎞️ Incrustation des sous-titres…")
197
  out = burn(video, subs)
198
 
199
+ progress(1.0, "✅ Terminé.")
200
+ return f"✅ Sous-titrage généré avec **{model_name}**", out
201
 
202
 
203
  # ---------------- UI ---------------- #
 
208
  .gr-button { background:#005BFF !important; color:white !important; border-radius:8px; font-weight:700; }
209
  """
210
 
211
+ with gr.Blocks(css=CSS, title="RobotsMali Caption Studio") as demo:
212
  gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Sous-titrage Automatique en Bambara</p>")
213
+ video = gr.File(label="🎥 Importer une vidéo")
214
  model = gr.Dropdown(list(ASR_MODELS.keys()), value="Soloni 114M TDT CTC V1", label="🧠 Modèle ASR")
215
  run = gr.Button("🚀 Générer les sous-titres")
216
  status = gr.Markdown()