binaryMao committed on
Commit
2bff7cb
·
verified ·
1 Parent(s): 3564a78

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -42
app.py CHANGED
@@ -1,19 +1,27 @@
1
- import os, warnings, tempfile
 
 
2
  warnings.filterwarnings("ignore")
 
 
 
 
 
3
 
4
- # Débloquer ImageMagick (compatible tous environnements)
5
- for p in ["/etc/ImageMagick/policy.xml", "/etc/ImageMagick-6/policy.xml"]:
6
  if os.path.exists(p):
7
  os.system(f'sed -i "s/rights=\\"none\\"/rights=\\"read|write\\"/g" "{p}"')
8
 
9
  import gradio as gr
10
  import numpy as np
11
  import soundfile as sf
12
- import torch
13
  from moviepy.editor import VideoFileClip, CompositeVideoClip, TextClip
14
  from nemo.collections import asr as nemo_asr
15
 
 
16
  SR = 16000
 
17
 
18
  ASR_MODELS = {
19
  "Soloba CTC 0.6B V0": "RobotsMali/soloba-ctc-0.6b-v0",
@@ -26,50 +34,79 @@ ASR_MODELS = {
26
 
27
  _CACHE = {}
28
 
 
 
29
  def load_model(name):
30
  if name in _CACHE:
31
  return _CACHE[name]
32
  device = "cuda" if torch.cuda.is_available() else "cpu"
33
- model = nemo_asr.models.ASRModel.from_pretrained(model_name=ASR_MODELS[name]).to(device).eval()
 
 
34
  _CACHE[name] = (model, device)
35
  return model, device
36
 
 
 
37
  def extract_audio(video_path, wav_path):
38
- # Extraction audio stable (pas to_soundarray)
 
 
 
 
 
 
 
39
  try:
40
  clip = VideoFileClip(video_path)
 
 
 
41
  clip.audio.write_audiofile(
42
  wav_path, fps=SR, codec="pcm_s16le", verbose=False, logger=None
43
  )
 
44
  clip.close()
 
45
  except:
46
- os.system(f"ffmpeg -i '{video_path}' -ac 1 -ar {SR} -vn -y '{wav_path}' >/dev/null 2>&1")
47
- audio, sr = sf.read(wav_path)
48
- return len(audio)/sr
 
 
49
 
50
  def transcribe(model, device, wav_path, model_key):
51
  audio, sr = sf.read(wav_path)
52
  if audio.ndim > 1: audio = audio.mean(axis=1)
53
- total_s = len(audio)/sr
54
 
55
  x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
56
  ln = torch.tensor([x.shape[1]]).to(device)
57
 
58
- # Timestamps Soloni (decode_and_align → hyp.words)
59
- if "Soloni" in model_key:
60
  with torch.no_grad():
61
- proc, plen = model.preprocessor(x, ln)
62
- hyps = model.decode_and_align(proc, plen)
 
 
 
 
 
 
 
63
  hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
64
- if hasattr(hyp, "words"):
 
65
  return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
66
 
67
- # CTC models → alignement dynamique
68
  text = model.transcribe([wav_path])[0]
69
  words = text.split()
70
- if not words: return []
 
71
 
72
- wps = max(2.2, len(words)/total_s)
73
  subs, t = [], 0
74
  for w in words:
75
  d = 1/wps
@@ -78,41 +115,63 @@ def transcribe(model, device, wav_path, model_key):
78
  if t >= total_s: break
79
  return subs
80
 
 
 
81
  def burn(video_path, subs):
82
- clip = VideoFileClip(video_path)
83
- W, H = clip.size
84
-
85
- layers = []
86
- for s,e,w in subs:
87
- txt = TextClip(w.upper(), fontsize=int(H/20), color="white",
88
- stroke_color="black", stroke_width=2,
89
- method="caption", size=(int(W*0.9), None)
90
- ).set_start(s).set_duration(e-s).set_position(("center", int(H*0.88)))
91
- layers.append(txt)
92
-
93
- final = CompositeVideoClip([clip] + layers)
94
- out = "RobotsMali_Subtitled.mp4"
95
- final.write_videofile(out, codec="libx264", audio_codec="aac", verbose=False, logger=None)
96
- return out
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
  def pipeline(video, model_name, progress=gr.Progress()):
99
- progress(0.2, "Chargement du modèle…")
100
  model, device = load_model(model_name)
101
 
102
  with tempfile.TemporaryDirectory() as td:
103
  wav = f"{td}/audio.wav"
104
- progress(0.45, "Extraction audio…")
105
- extract_audio(video, wav)
106
 
107
- progress(0.7, "Transcription en Bambara…")
 
 
 
 
 
108
  subs = transcribe(model, device, wav, model_name)
 
 
109
 
110
- progress(0.9, "Incrustation des sous-titres…")
111
  out = burn(video, subs)
112
 
113
  progress(1.0, "✅ Terminé")
114
  return f"✅ Sous-titré avec **{model_name}**", out
115
 
 
 
116
  CSS = """
117
  body { background:#F6F9FF; font-family:Inter, sans-serif; }
118
  h1 { text-align:center; font-weight:800; color:#006CFF; }
@@ -121,11 +180,12 @@ h1 { text-align:center; font-weight:800; color:#006CFF; }
121
 
122
  with gr.Blocks(css=CSS, title="RobotsMali Caption Studio") as demo:
123
  gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Sous-titrage Automatique en Bambara</p>")
124
- video = gr.File(label="🎥 Importer une vidéo", type="filepath")
125
- model = gr.Dropdown(list(ASR_MODELS.keys()), value="Soloni 114M TDT CTC V1")
126
- run = gr.Button("🚀 Générer les sous-titres")
127
  status = gr.Markdown()
128
  output = gr.Video()
 
129
  run.click(pipeline, inputs=[video, model], outputs=[status, output])
130
 
131
- demo.launch()
 
1
+ import os, warnings, tempfile, logging
2
+
3
+ # --- Sécurité & compatibilité environnement ---
4
  warnings.filterwarnings("ignore")
5
+ logging.getLogger("nemo_logger").setLevel(logging.ERROR)
6
+ os.environ["NEMO_FORCE_CPU"] = "1"
7
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
8
+ import torch
9
+ torch.set_grad_enabled(False)
10
 
11
+ # Débloquer ImageMagick (HF varie selon build)
12
+ for p in ("/etc/ImageMagick/policy.xml", "/etc/ImageMagick-6/policy.xml"):
13
  if os.path.exists(p):
14
  os.system(f'sed -i "s/rights=\\"none\\"/rights=\\"read|write\\"/g" "{p}"')
15
 
16
  import gradio as gr
17
  import numpy as np
18
  import soundfile as sf
 
19
  from moviepy.editor import VideoFileClip, CompositeVideoClip, TextClip
20
  from nemo.collections import asr as nemo_asr
21
 
22
+ # --- Configuration ---
23
  SR = 16000
24
+ MAX_VIDEO_BYTES = 200_000_000 # ≈200MB limite stable HF Space
25
 
26
  ASR_MODELS = {
27
  "Soloba CTC 0.6B V0": "RobotsMali/soloba-ctc-0.6b-v0",
 
34
 
35
  _CACHE = {}
36
 
37
+ # ---------------- LOAD MODEL ---------------- #
38
+
39
  def load_model(name):
40
  if name in _CACHE:
41
  return _CACHE[name]
42
  device = "cuda" if torch.cuda.is_available() else "cpu"
43
+ model = nemo_asr.models.ASRModel.from_pretrained(
44
+ model_name=ASR_MODELS[name]
45
+ ).to(device).eval()
46
  _CACHE[name] = (model, device)
47
  return model, device
48
 
49
+ # ---------------- AUDIO EXTRACTION ---------------- #
50
+
51
  def extract_audio(video_path, wav_path):
52
+ # Protéger contre fichiers trop lourds
53
+ try:
54
+ if os.path.getsize(video_path) > MAX_VIDEO_BYTES:
55
+ raise RuntimeError("⚠️ Vidéo trop lourde (>200MB). Compressez puis réessayez.")
56
+ except:
57
+ pass
58
+
59
+ # Extraction fiable (ffmpeg / MoviePy)
60
  try:
61
  clip = VideoFileClip(video_path)
62
+ if clip.audio is None:
63
+ clip.close()
64
+ raise RuntimeError("⚠️ Aucun audio détecté dans la vidéo.")
65
  clip.audio.write_audiofile(
66
  wav_path, fps=SR, codec="pcm_s16le", verbose=False, logger=None
67
  )
68
+ duration = clip.duration or 0
69
  clip.close()
70
+ return duration
71
  except:
72
+ os.system(f"ffmpeg -y -i '{video_path}' -ac 1 -ar {SR} -vn '{wav_path}' >/dev/null 2>&1")
73
+ audio, sr = sf.read(wav_path)
74
+ return len(audio)/sr if sr else 0.0
75
+
76
+ # ---------------- TRANSCRIPTION ---------------- #
77
 
78
  def transcribe(model, device, wav_path, model_key):
79
  audio, sr = sf.read(wav_path)
80
  if audio.ndim > 1: audio = audio.mean(axis=1)
81
+ total_s = len(audio)/sr if sr else 0
82
 
83
  x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
84
  ln = torch.tensor([x.shape[1]]).to(device)
85
 
86
+ # --- Soloni → timestamps réels via decode_and_align ---
87
+ if "Soloni" in model_key and hasattr(model, "decode_and_align"):
88
  with torch.no_grad():
89
+ proc, plen = model.preprocessor(
90
+ input_signal=x,
91
+ input_signal_length=ln
92
+ )
93
+ hyps = model.decode_and_align(
94
+ encoder_output=proc,
95
+ encoded_lengths=plen
96
+ )
97
+
98
  hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
99
+
100
+ if hasattr(hyp, "words") and hyp.words:
101
  return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
102
 
103
+ # --- Soloba + QuartzNet → alignement fluide stable ---
104
  text = model.transcribe([wav_path])[0]
105
  words = text.split()
106
+ if not words or total_s <= 0:
107
+ return []
108
 
109
+ wps = max(2.0, len(words)/total_s)
110
  subs, t = [], 0
111
  for w in words:
112
  d = 1/wps
 
115
  if t >= total_s: break
116
  return subs
117
 
118
+ # ---------------- HARD SUBTITLE ---------------- #
119
+
120
  def burn(video_path, subs):
121
+ clip = None
122
+ final = None
123
+ try:
124
+ clip = VideoFileClip(video_path)
125
+ W, H = clip.size
126
+
127
+ layers = []
128
+ for s,e,w in subs:
129
+ if e <= s: continue
130
+ tc = TextClip(
131
+ w.upper(), fontsize=int(H/20), color="white",
132
+ stroke_color="black", stroke_width=2,
133
+ method="caption", size=(int(W*0.9), None)
134
+ ).set_start(s).set_duration(e - s).set_position(("center", int(H*0.88)))
135
+ layers.append(tc)
136
+
137
+ final = CompositeVideoClip([clip] + layers)
138
+ out = "RobotsMali_Subtitled.mp4"
139
+ final.write_videofile(out, codec="libx264", audio_codec="aac", verbose=False, logger=None)
140
+ return out
141
+
142
+ finally:
143
+ try: final.close()
144
+ except: pass
145
+ try: clip.close()
146
+ except: pass
147
+
148
+ # ---------------- PIPELINE ---------------- #
149
 
150
  def pipeline(video, model_name, progress=gr.Progress()):
151
+ progress(0.25, "Chargement du modèle…")
152
  model, device = load_model(model_name)
153
 
154
  with tempfile.TemporaryDirectory() as td:
155
  wav = f"{td}/audio.wav"
 
 
156
 
157
+ progress(0.5, "Extraction audio…")
158
+ duration = extract_audio(video, wav)
159
+ if duration <= 0:
160
+ return "❌ Audio introuvable ou illisible.", None
161
+
162
+ progress(0.75, "Transcription en Bambara…")
163
  subs = transcribe(model, device, wav, model_name)
164
+ if not subs:
165
+ return "⚠️ Aucun mot détecté.", None
166
 
167
+ progress(0.95, "Incrustation des sous-titres…")
168
  out = burn(video, subs)
169
 
170
  progress(1.0, "✅ Terminé")
171
  return f"✅ Sous-titré avec **{model_name}**", out
172
 
173
+ # ---------------- UI ---------------- #
174
+
175
  CSS = """
176
  body { background:#F6F9FF; font-family:Inter, sans-serif; }
177
  h1 { text-align:center; font-weight:800; color:#006CFF; }
 
180
 
181
  with gr.Blocks(css=CSS, title="RobotsMali Caption Studio") as demo:
182
  gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Sous-titrage Automatique en Bambara</p>")
183
+ video = gr.File(label="🎥 Importer une vidéo (max 200MB)", type="filepath")
184
+ model = gr.Dropdown(list(ASR_MODELS.keys()), value="Soloni 114M TDT CTC V1", label="🧠 Sélection du modèle ASR")
185
+ run = gr.Button("🚀 Générer")
186
  status = gr.Markdown()
187
  output = gr.Video()
188
+
189
  run.click(pipeline, inputs=[video, model], outputs=[status, output])
190
 
191
+ demo.launch(server_name="0.0.0.0", server_port=7860, share=False, ssr_mode=False)