binaryMao committed on
Commit
8f9582a
·
verified ·
1 Parent(s): 4281210

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -102
app.py CHANGED
@@ -1,46 +1,24 @@
1
  import os, warnings, logging, tempfile
2
-
3
- # === STOP useless warnings ===
4
  warnings.filterwarnings("ignore")
5
  logging.getLogger("nemo_logger").setLevel(logging.ERROR)
6
 
7
- # === CPU fallback for HuggingFace ===
8
- os.environ["NEMO_FORCE_CPU"] = "1"
9
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
10
-
11
  import torch
12
  torch.set_grad_enabled(False)
13
 
14
  import gradio as gr
15
  import numpy as np
16
  import soundfile as sf
17
-
18
- # === Force MoviePy to use ImageMagick ===
19
- import moviepy.config as mpconf
20
- mpconf.change_settings({"IMAGEMAGICK_BINARY": "/usr/bin/convert"})
21
-
22
- from moviepy.editor import VideoFileClip, CompositeVideoClip, TextClip
23
  from nemo.collections import asr as nemo_asr
24
 
25
 
26
- # === FIX IMAGEMAGICK POLICY (Required on HF Spaces) ===
27
- def unlock_imagemagick():
28
- POLICIES = [
29
- "/etc/ImageMagick/policy.xml",
30
- "/etc/ImageMagick-6/policy.xml"
31
- ]
32
- for p in POLICIES:
33
- if os.path.exists(p):
34
- print(f"⚙️ Patching ImageMagick security: {p}")
35
- os.system(f"sed -i 's/rights=\"none\"/rights=\"read|write\"/g' {p}")
36
-
37
- unlock_imagemagick()
38
-
39
-
40
- # ---------------- CONFIG ---------------- #
41
 
42
  SR = 16000
43
- MAX_VIDEO_BYTES = 200_000_000 # Max 200MB video upload
44
 
45
  ASR_MODELS = {
46
  "Soloba CTC 0.6B V0": "RobotsMali/soloba-ctc-0.6b-v0",
@@ -55,7 +33,6 @@ _CACHE = {}
55
 
56
 
57
  # ---------------- LOAD MODEL ---------------- #
58
-
59
  def load_model(name):
60
  if name in _CACHE:
61
  return _CACHE[name]
@@ -68,140 +45,111 @@ def load_model(name):
68
 
69
 
70
  # ---------------- EXTRACT AUDIO (FORCE MONO) ---------------- #
71
-
72
  def extract_audio(video_path, wav_path):
73
  if os.path.getsize(video_path) > MAX_VIDEO_BYTES:
74
  raise RuntimeError("⚠️ Vidéo trop lourde (>200MB). Compressez avant l’upload.")
75
-
76
- # Force mono + 16kHz → prevents all ASR crashes
77
  os.system(f"ffmpeg -y -i '{video_path}' -ac 1 -ar {SR} -vn '{wav_path}' >/dev/null 2>&1")
78
  audio, sr = sf.read(wav_path)
79
-
80
- if sr == 0 or len(audio) == 0:
81
- raise RuntimeError("⚠️ Impossible de lire l’audio.")
82
-
83
  return len(audio)/sr
84
 
85
 
86
- # ---------------- TRANSCRIBE (UNIFIED & SAFE) ---------------- #
87
-
88
  def transcribe(model, device, wav_path, model_key):
89
  audio, sr = sf.read(wav_path)
90
-
91
  if audio.ndim == 2:
92
  audio = np.mean(audio, axis=1).astype(np.float32)
93
  if np.max(np.abs(audio)) > 1:
94
  audio = audio / np.max(np.abs(audio))
95
 
96
- total_s = len(audio)/sr if sr else 0
97
- if total_s <= 0:
98
- return []
99
-
100
  x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
101
  ln = torch.tensor([x.shape[1]]).to(device)
102
 
103
- # === SOLONI true timestamps ===
104
  if "Soloni" in model_key and hasattr(model, "decode_and_align"):
105
  try:
106
  with torch.no_grad():
107
- proc, plen = model.preprocessor(
108
- input_signal=x,
109
- input_signal_length=ln
110
- )
111
- hyps = model.decode_and_align(
112
- encoder_output=proc,
113
- encoded_lengths=plen
114
- )
115
  hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
116
-
117
  if hasattr(hyp, "words") and hyp.words:
118
  return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
119
  except:
120
- pass # fallback auto
121
 
122
- # === UNIVERSAL FALLBACK (Soloba + QuartzNet + backup Soloni) ===
123
  out = model.transcribe([wav_path])[0]
124
-
125
  text = out.text.strip() if hasattr(out, "text") else str(out).strip()
126
- if not text:
127
- return []
128
-
129
  words = text.split()
130
  if not words:
131
  return []
132
 
133
- wps = max(2.0, len(words) / total_s)
134
  subs, t = [], 0
135
-
136
  for w in words:
137
  d = 1 / wps
138
  subs.append((t, min(total_s, t+d), w))
139
  t += d
140
  if t >= total_s: break
141
-
142
  return subs
143
 
144
 
145
- # ---------------- BURN SUBTITLES ---------------- #
146
-
147
  def burn(video_path, subs):
148
- clip, final = None, None
 
 
149
  try:
150
- clip = VideoFileClip(video_path)
151
- W, H = clip.size
152
-
153
- layers = []
154
- for s, e, w in subs:
155
- if e <= s: continue
156
- txt = TextClip(
157
- w.upper(),
158
- fontsize=int(H/20),
159
- font="DejaVu-Sans", # ✅ Stable Linux font
160
- color="white",
161
- stroke_color="black",
162
- stroke_width=2,
163
- method="caption",
164
- size=(int(W*0.9), None)
165
- ).set_start(s).set_duration(e-s).set_position(("center", int(H*0.88)))
166
- layers.append(txt)
167
-
168
- final = CompositeVideoClip([clip] + layers)
169
- out = "RobotsMali_Subtitled.mp4"
170
- final.write_videofile(out, codec="libx264", audio_codec="aac", verbose=False, logger=None)
171
- return out
172
-
173
- finally:
174
- try: final.close()
175
- except: pass
176
- try: clip.close()
177
- except: pass
178
 
 
 
 
 
179
 
180
- # ---------------- PIPELINE ---------------- #
 
 
 
 
 
 
 
 
 
 
 
 
181
 
 
 
 
 
 
 
182
  def pipeline(video, model_name, progress=gr.Progress()):
183
- progress(0.3, "📦 Chargement du modèle…")
184
  model, device = load_model(model_name)
185
 
186
  with tempfile.TemporaryDirectory() as td:
187
  wav = f"{td}/audio.wav"
188
- progress(0.5, "🔊 Extraction audio…")
189
  extract_audio(video, wav)
190
 
191
- progress(0.75, "🧠 Transcription en cours…")
192
  subs = transcribe(model, device, wav, model_name)
193
  if not subs:
194
- return "⚠️ Aucun mot détecté.", None
195
 
196
- progress(0.95, "🎞️ Incrustation des sous-titres…")
197
  out = burn(video, subs)
198
 
199
- progress(1.0, "✅ Terminé.")
200
- return f"✅ Sous-titrage généré avec **{model_name}**", out
201
 
202
 
203
  # ---------------- UI ---------------- #
204
-
205
  CSS = """
206
  body { background:#F5F8FF; font-family:Inter, sans-serif; }
207
  h1 { text-align:center; font-weight:800; color:#005BFF; margin-bottom:6px; }
 
1
  import os, warnings, logging, tempfile
 
 
2
  warnings.filterwarnings("ignore")
3
  logging.getLogger("nemo_logger").setLevel(logging.ERROR)
4
 
 
 
 
 
5
  import torch
6
  torch.set_grad_enabled(False)
7
 
8
  import gradio as gr
9
  import numpy as np
10
  import soundfile as sf
11
+ from moviepy.editor import VideoFileClip, CompositeVideoClip, ImageClip
12
+ from PIL import Image, ImageDraw, ImageFont
 
 
 
 
13
  from nemo.collections import asr as nemo_asr
14
 
15
 
16
+ # ---------------- GLOBAL CONFIG ---------------- #
17
+ os.environ["NEMO_FORCE_CPU"] = "1"
18
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  SR = 16000
21
+ MAX_VIDEO_BYTES = 200_000_000
22
 
23
  ASR_MODELS = {
24
  "Soloba CTC 0.6B V0": "RobotsMali/soloba-ctc-0.6b-v0",
 
33
 
34
 
35
  # ---------------- LOAD MODEL ---------------- #
 
36
  def load_model(name):
37
  if name in _CACHE:
38
  return _CACHE[name]
 
45
 
46
 
47
  # ---------------- EXTRACT AUDIO (FORCE MONO) ---------------- #
 
48
  def extract_audio(video_path, wav_path):
49
  if os.path.getsize(video_path) > MAX_VIDEO_BYTES:
50
  raise RuntimeError("⚠️ Vidéo trop lourde (>200MB). Compressez avant l’upload.")
 
 
51
  os.system(f"ffmpeg -y -i '{video_path}' -ac 1 -ar {SR} -vn '{wav_path}' >/dev/null 2>&1")
52
  audio, sr = sf.read(wav_path)
 
 
 
 
53
  return len(audio)/sr
54
 
55
 
56
+ # ---------------- TRANSCRIBE ---------------- #
 
57
  def transcribe(model, device, wav_path, model_key):
58
  audio, sr = sf.read(wav_path)
 
59
  if audio.ndim == 2:
60
  audio = np.mean(audio, axis=1).astype(np.float32)
61
  if np.max(np.abs(audio)) > 1:
62
  audio = audio / np.max(np.abs(audio))
63
 
64
+ total_s = len(audio)/sr
 
 
 
65
  x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
66
  ln = torch.tensor([x.shape[1]]).to(device)
67
 
68
+ # Real timestamps for Soloni
69
  if "Soloni" in model_key and hasattr(model, "decode_and_align"):
70
  try:
71
  with torch.no_grad():
72
+ proc, plen = model.preprocessor(input_signal=x, input_signal_length=ln)
73
+ hyps = model.decode_and_align(encoder_output=proc, encoded_lengths=plen)
 
 
 
 
 
 
74
  hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
 
75
  if hasattr(hyp, "words") and hyp.words:
76
  return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
77
  except:
78
+ pass
79
 
80
+ # Universal fallback (Soloba + QuartzNet + backup Soloni)
81
  out = model.transcribe([wav_path])[0]
 
82
  text = out.text.strip() if hasattr(out, "text") else str(out).strip()
 
 
 
83
  words = text.split()
84
  if not words:
85
  return []
86
 
87
+ wps = max(2.0, len(words) / total_s) # words per second
88
  subs, t = [], 0
 
89
  for w in words:
90
  d = 1 / wps
91
  subs.append((t, min(total_s, t+d), w))
92
  t += d
93
  if t >= total_s: break
 
94
  return subs
95
 
96
 
97
+ # ---------------- BURN SUBTITLES (NO IMAGEMAGICK) ---------------- #
 
98
  def burn(video_path, subs):
99
+ clip = VideoFileClip(video_path)
100
+ W, H = clip.size
101
+
102
  try:
103
+ font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", int(H/20))
104
+ except:
105
+ font = ImageFont.load_default()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
+ layers = []
108
+ for s, e, w in subs:
109
+ if e <= s:
110
+ continue
111
 
112
+ img = Image.new("RGBA", (W, int(H*0.12)), (0, 0, 0, 140))
113
+ draw = ImageDraw.Draw(img)
114
+
115
+ text = w.upper()
116
+ tw, th = draw.textsize(text, font=font)
117
+ draw.text(((W-tw)//2, (H*0.12-th)//2), text, font=font, fill=(255,255,255))
118
+
119
+ img_clip = ImageClip(np.array(img)).set_start(s).set_duration(e-s).set_position(("center", int(H*0.85)))
120
+ layers.append(img_clip)
121
+
122
+ final = CompositeVideoClip([clip] + layers)
123
+ out = "RobotsMali_Subtitled.mp4"
124
+ final.write_videofile(out, codec="libx264", audio_codec="aac", fps=clip.fps, verbose=False, logger=None)
125
 
126
+ clip.close()
127
+ final.close()
128
+ return out
129
+
130
+
131
+ # ---------------- PIPELINE ---------------- #
132
  def pipeline(video, model_name, progress=gr.Progress()):
133
+ progress(0.2, "📦 Chargement du modèle…")
134
  model, device = load_model(model_name)
135
 
136
  with tempfile.TemporaryDirectory() as td:
137
  wav = f"{td}/audio.wav"
138
+ progress(0.4, "🔊 Extraction audio…")
139
  extract_audio(video, wav)
140
 
141
+ progress(0.7, "🧠 Transcription…")
142
  subs = transcribe(model, device, wav, model_name)
143
  if not subs:
144
+ return "⚠️ Aucun mot reconnu.", None
145
 
146
+ progress(0.95, "🎞️ Incrustation…")
147
  out = burn(video, subs)
148
 
149
+ return f"✅ Sous-titres générés avec **{model_name}**", out
 
150
 
151
 
152
  # ---------------- UI ---------------- #
 
153
  CSS = """
154
  body { background:#F5F8FF; font-family:Inter, sans-serif; }
155
  h1 { text-align:center; font-weight:800; color:#005BFF; margin-bottom:6px; }