binaryMao committed on
Commit
27e6201
·
verified ·
1 Parent(s): 1a890b5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -34
app.py CHANGED
@@ -15,10 +15,12 @@ import soundfile as sf
15
  from moviepy.editor import VideoFileClip, CompositeVideoClip, TextClip
16
  from nemo.collections import asr as nemo_asr
17
 
 
18
  # ---------------- CONFIG ---------------- #
19
 
20
  SR = 16000
21
- MAX_VIDEO_BYTES = 200_000_000
 
22
 
23
  ASR_MODELS = {
24
  "Soloba CTC 0.6B V0": "RobotsMali/soloba-ctc-0.6b-v0",
@@ -31,6 +33,7 @@ ASR_MODELS = {
31
 
32
  _CACHE = {}
33
 
 
34
  # ---------------- LOAD MODEL ---------------- #
35
 
36
  def load_model(name):
@@ -43,55 +46,73 @@ def load_model(name):
43
  _CACHE[name] = (model, device)
44
  return model, device
45
 
46
- # ---------------- EXTRACT AUDIO (FORCE MONO) ---------------- #
 
47
 
48
  def extract_audio(video_path, wav_path):
49
  if os.path.getsize(video_path) > MAX_VIDEO_BYTES:
50
  raise RuntimeError("⚠️ Vidéo trop lourde (>200MB). Compressez puis réessayez.")
51
 
52
- # Force audio mono + 16k (100% fiable)
53
  os.system(f"ffmpeg -y -i '{video_path}' -ac 1 -ar {SR} -vn '{wav_path}' >/dev/null 2>&1")
54
 
55
  audio, sr = sf.read(wav_path)
56
  if sr == 0 or len(audio) == 0:
57
  raise RuntimeError("⚠️ Audio introuvable ou illisible.")
 
58
  return len(audio) / sr
59
 
60
- # ---------------- TRANSCRIBE ---------------- #
 
61
 
62
  def transcribe(model, device, wav_path, model_key):
63
  audio, sr = sf.read(wav_path)
64
 
65
- # Force mono propre + normalisation
66
  if audio.ndim == 2:
67
  audio = np.mean(audio, axis=1).astype(np.float32)
68
  if np.max(np.abs(audio)) > 1:
69
  audio = audio / np.max(np.abs(audio))
70
 
71
- total_s = len(audio)/sr if sr else 0
 
 
 
72
  x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
73
  ln = torch.tensor([x.shape[1]]).to(device)
74
 
75
- # ---- Soloni : timestamps réels ---- #
76
  if "Soloni" in model_key and hasattr(model, "decode_and_align"):
77
- with torch.no_grad():
78
- proc, plen = model.preprocessor(
79
- input_signal=x,
80
- input_signal_length=ln
81
- )
82
- hyps = model.decode_and_align(
83
- encoder_output=proc,
84
- encoded_lengths=plen
85
- )
86
-
87
- hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
88
- if hasattr(hyp, "words") and hyp.words:
89
- return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
90
-
91
- # ---- Soloba & QuartzNet fallback alignement fluide ---- #
92
- text = model.transcribe([wav_path])[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  words = text.split()
94
- if not words or total_s <= 0:
95
  return []
96
 
97
  wps = max(2.0, len(words) / total_s)
@@ -104,18 +125,19 @@ def transcribe(model, device, wav_path, model_key):
104
  break
105
  return subs
106
 
 
107
  # ---------------- BURN SUBTITLES ---------------- #
108
 
109
  def burn(video_path, subs):
110
- clip = None
111
- final = None
112
  try:
113
  clip = VideoFileClip(video_path)
114
  W, H = clip.size
115
- layers = []
116
 
 
117
  for s, e, w in subs:
118
- if e <= s: continue
 
119
  txt = TextClip(
120
  w.upper(),
121
  fontsize=int(H/20),
@@ -138,6 +160,7 @@ def burn(video_path, subs):
138
  try: clip.close()
139
  except: pass
140
 
 
141
  # ---------------- PIPELINE ---------------- #
142
 
143
  def pipeline(video, model_name, progress=gr.Progress()):
@@ -150,7 +173,7 @@ def pipeline(video, model_name, progress=gr.Progress()):
150
  progress(0.5, "Extraction audio…")
151
  duration = extract_audio(video, wav)
152
 
153
- progress(0.75, "Transcription…")
154
  subs = transcribe(model, device, wav, model_name)
155
  if not subs:
156
  return "⚠️ Aucun mot détecté.", None
@@ -161,19 +184,20 @@ def pipeline(video, model_name, progress=gr.Progress()):
161
  progress(1.0, "✅ Terminé")
162
  return f"✅ Sous-titrage terminé avec **{model_name}**", out
163
 
 
164
  # ---------------- UI ---------------- #
165
 
166
  CSS = """
167
- body { background:#F7FAFF; font-family:Inter, sans-serif; }
168
- h1 { text-align:center; font-weight:800; color:#005BFF; }
169
  .gr-button { background:#005BFF !important; color:white !important; border-radius:8px; font-weight:700; }
170
  """
171
 
172
- with gr.Blocks(css=CSS, title="RobotsMali Caption Studio") as demo:
173
- gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Transcription & Sous-titres Automatiques en Bambara</p>")
174
  video = gr.File(label="🎥 Importer une vidéo (max 200MB)", type="filepath")
175
  model = gr.Dropdown(list(ASR_MODELS.keys()), value="Soloni 114M TDT CTC V1", label="🧠 Modèle ASR")
176
- run = gr.Button("🚀 Générer")
177
  status = gr.Markdown()
178
  output = gr.Video()
179
 
 
15
  from moviepy.editor import VideoFileClip, CompositeVideoClip, TextClip
16
  from nemo.collections import asr as nemo_asr
17
 
18
+
19
  # ---------------- CONFIG ---------------- #
20
 
21
  SR = 16000
22
+ MAX_VIDEO_BYTES = 200_000_000 # 200MB limite
23
+ TITLE = "RobotsMali Caption Studio — Sous-titrage Automatique en Bambara"
24
 
25
  ASR_MODELS = {
26
  "Soloba CTC 0.6B V0": "RobotsMali/soloba-ctc-0.6b-v0",
 
33
 
34
  _CACHE = {}
35
 
36
+
37
  # ---------------- LOAD MODEL ---------------- #
38
 
39
  def load_model(name):
 
46
  _CACHE[name] = (model, device)
47
  return model, device
48
 
49
+
50
+ # ---------------- AUDIO EXTRACTION (FORCE MONO) ---------------- #
51
 
52
  def extract_audio(video_path, wav_path):
53
  if os.path.getsize(video_path) > MAX_VIDEO_BYTES:
54
  raise RuntimeError("⚠️ Vidéo trop lourde (>200MB). Compressez puis réessayez.")
55
 
 
56
  os.system(f"ffmpeg -y -i '{video_path}' -ac 1 -ar {SR} -vn '{wav_path}' >/dev/null 2>&1")
57
 
58
  audio, sr = sf.read(wav_path)
59
  if sr == 0 or len(audio) == 0:
60
  raise RuntimeError("⚠️ Audio introuvable ou illisible.")
61
+
62
  return len(audio) / sr
63
 
64
+
65
+ # ---------------- TRANSCRIBE (UNIFIÉ + SÛR) ---------------- #
66
 
67
  def transcribe(model, device, wav_path, model_key):
68
  audio, sr = sf.read(wav_path)
69
 
 
70
  if audio.ndim == 2:
71
  audio = np.mean(audio, axis=1).astype(np.float32)
72
  if np.max(np.abs(audio)) > 1:
73
  audio = audio / np.max(np.abs(audio))
74
 
75
+ total_s = len(audio) / sr if sr else 0
76
+ if total_s <= 0:
77
+ return []
78
+
79
  x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
80
  ln = torch.tensor([x.shape[1]]).to(device)
81
 
82
+ # ---- Priority 1: Soloni precise timestamps ---- #
83
  if "Soloni" in model_key and hasattr(model, "decode_and_align"):
84
+ try:
85
+ with torch.no_grad():
86
+ proc, plen = model.preprocessor(
87
+ input_signal=x,
88
+ input_signal_length=ln
89
+ )
90
+ hyps = model.decode_and_align(
91
+ encoder_output=proc,
92
+ encoded_lengths=plen
93
+ )
94
+
95
+ hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
96
+
97
+ if hasattr(hyp, "words") and hyp.words:
98
+ return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
99
+
100
+ except:
101
+ pass # fallback auto
102
+
103
+ # ---- Priority 2: Universal fallback ---- #
104
+ out = model.transcribe([wav_path])[0]
105
+
106
+ if hasattr(out, "text"):
107
+ text = out.text.strip()
108
+ else:
109
+ text = str(out).strip()
110
+
111
+ if not text:
112
+ return []
113
+
114
  words = text.split()
115
+ if not words:
116
  return []
117
 
118
  wps = max(2.0, len(words) / total_s)
 
125
  break
126
  return subs
127
 
128
+
129
  # ---------------- BURN SUBTITLES ---------------- #
130
 
131
  def burn(video_path, subs):
132
+ clip, final = None, None
 
133
  try:
134
  clip = VideoFileClip(video_path)
135
  W, H = clip.size
 
136
 
137
+ layers = []
138
  for s, e, w in subs:
139
+ if e <= s:
140
+ continue
141
  txt = TextClip(
142
  w.upper(),
143
  fontsize=int(H/20),
 
160
  try: clip.close()
161
  except: pass
162
 
163
+
164
  # ---------------- PIPELINE ---------------- #
165
 
166
  def pipeline(video, model_name, progress=gr.Progress()):
 
173
  progress(0.5, "Extraction audio…")
174
  duration = extract_audio(video, wav)
175
 
176
+ progress(0.75, "Transcription en Bambara…")
177
  subs = transcribe(model, device, wav, model_name)
178
  if not subs:
179
  return "⚠️ Aucun mot détecté.", None
 
184
  progress(1.0, "✅ Terminé")
185
  return f"✅ Sous-titrage terminé avec **{model_name}**", out
186
 
187
+
188
  # ---------------- UI ---------------- #
189
 
190
  CSS = """
191
+ body { background:#F5F8FF; font-family:Inter, sans-serif; }
192
+ h1 { text-align:center; font-weight:800; color:#005BFF; margin-bottom:6px; }
193
  .gr-button { background:#005BFF !important; color:white !important; border-radius:8px; font-weight:700; }
194
  """
195
 
196
+ with gr.Blocks(css=CSS, title=TITLE) as demo:
197
+ gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Génération automatique de sous-titres en Bambara</p>")
198
  video = gr.File(label="🎥 Importer une vidéo (max 200MB)", type="filepath")
199
  model = gr.Dropdown(list(ASR_MODELS.keys()), value="Soloni 114M TDT CTC V1", label="🧠 Modèle ASR")
200
+ run = gr.Button("🚀 Générer les sous-titres")
201
  status = gr.Markdown()
202
  output = gr.Video()
203