Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
"""
|
| 3 |
ROBOTSMALI — Sous-titrage Bambara
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import os
|
|
@@ -51,13 +52,21 @@ def run_cmd(cmd):
|
|
| 51 |
return res.stdout
|
| 52 |
|
| 53 |
def ffprobe_duration(path):
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
out = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
|
|
|
| 56 |
if out.returncode != 0:
|
| 57 |
-
print("ffprobe
|
| 58 |
return None
|
| 59 |
try:
|
| 60 |
-
|
|
|
|
|
|
|
| 61 |
except:
|
| 62 |
return None
|
| 63 |
|
|
@@ -249,7 +258,7 @@ def burn(video_path, subs, output_path=None):
|
|
| 249 |
return output_path
|
| 250 |
|
| 251 |
# ----------------------------
|
| 252 |
-
# PIPELINE PRINCIPAL (
|
| 253 |
# ----------------------------
|
| 254 |
def pipeline(video_input, model_name):
|
| 255 |
"""
|
|
@@ -257,30 +266,40 @@ def pipeline(video_input, model_name):
|
|
| 257 |
model_name : clé dans MODELS
|
| 258 |
"""
|
| 259 |
try:
|
| 260 |
-
#
|
| 261 |
if isinstance(video_input, dict) and "tmp_path" in video_input:
|
| 262 |
video_path = video_input["tmp_path"]
|
| 263 |
else:
|
| 264 |
video_path = video_input
|
| 265 |
|
|
|
|
| 266 |
duration = ffprobe_duration(video_path)
|
| 267 |
-
if duration is None:
|
| 268 |
-
raise RuntimeError("Impossible d'obtenir la durée de la vidéo via ffprobe")
|
| 269 |
|
| 270 |
-
#
|
| 271 |
tmp_fd, tmp_wav = tempfile.mkstemp(suffix=".wav")
|
| 272 |
os.close(tmp_fd)
|
| 273 |
|
| 274 |
-
# extraction + nettoyage
|
| 275 |
extract_audio(video_path, tmp_wav)
|
| 276 |
clean_wav, audio, sr = clean_audio(tmp_wav)
|
| 277 |
|
| 278 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
model = load_model(model_name)
|
| 280 |
text = transcribe(model, clean_wav)
|
| 281 |
mode = MODELS[model_name][1]
|
| 282 |
|
| 283 |
-
#
|
| 284 |
subs = None
|
| 285 |
if mode == "rnnt":
|
| 286 |
# RNNT : tentative de segmentation via logits + ctc_segmentation si dispo
|
|
@@ -293,17 +312,20 @@ def pipeline(video_input, model_name):
|
|
| 293 |
ln = torch.tensor([x.shape[1]]).to(DEVICE)
|
| 294 |
with torch.no_grad():
|
| 295 |
logits = model(input_signal=x, input_signal_length=ln)[0]
|
| 296 |
-
|
| 297 |
time_per_frame = duration / max(1, logits.shape[1])
|
|
|
|
| 298 |
# build char list
|
| 299 |
try:
|
| 300 |
raw = model.tokenizer.vocab
|
| 301 |
vocab = list(raw.keys()) if isinstance(raw, dict) else list(raw)
|
| 302 |
except Exception:
|
| 303 |
vocab = None
|
|
|
|
| 304 |
cfg = CtcSegmentationParameters()
|
| 305 |
if vocab:
|
| 306 |
cfg.char_list = vocab
|
|
|
|
| 307 |
gt = prepare_text(cfg, words)[0]
|
| 308 |
try:
|
| 309 |
timing, _, _ = ctc_segmentation(cfg, logits.detach().cpu().numpy()[0], gt)
|
|
@@ -317,8 +339,7 @@ def pipeline(video_input, model_name):
|
|
| 317 |
subs = align_vad(text, audio, sr, duration)
|
| 318 |
|
| 319 |
elif mode == "ctc_char":
|
| 320 |
-
# QuartzNet : pas de tokenizer BPE,
|
| 321 |
-
# On essaie d'obtenir timestamps via model.transcribe() si disponible (mais souvent non)
|
| 322 |
try:
|
| 323 |
subs = align_vad(text, audio, sr, duration)
|
| 324 |
except Exception as e:
|
|
@@ -326,7 +347,7 @@ def pipeline(video_input, model_name):
|
|
| 326 |
subs = align_vad(text, audio, sr, duration)
|
| 327 |
|
| 328 |
else: # ctc (BPE)
|
| 329 |
-
#
|
| 330 |
try:
|
| 331 |
subs = align_vad(text, audio, sr, duration)
|
| 332 |
except Exception as e:
|
|
@@ -344,18 +365,21 @@ def pipeline(video_input, model_name):
|
|
| 344 |
return (f"❌ Erreur — {str(e)}", None)
|
| 345 |
|
| 346 |
# ----------------------------
|
| 347 |
-
# INTERFACE GRADIO
|
| 348 |
# ----------------------------
|
| 349 |
with gr.Blocks(title="RobotsMali - Sous-titrage") as demo:
|
| 350 |
-
gr.Markdown(" RobotsMali — Sous-titrage")
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
|
|
|
|
|
|
|
|
|
| 357 |
|
| 358 |
-
|
| 359 |
-
# demo.launch(share=True, debug=False)
|
| 360 |
-
demo.launch(share=True, debug=False)
|
| 361 |
|
|
|
|
|
|
|
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
"""
|
| 3 |
ROBOTSMALI — Sous-titrage Bambara
|
| 4 |
+
Correctif: Durée vidéo robuste (FFprobe + Fallback Audio)
|
| 5 |
"""
|
| 6 |
|
| 7 |
import os
|
|
|
|
| 52 |
return res.stdout
|
| 53 |
|
| 54 |
def ffprobe_duration(path):
    """Return the media duration of *path* in seconds, or None on failure.

    Reads the container-level metadata via ffprobe (deliberately no
    '-select_streams v:0', so web/webcam containers and audio-only files
    still report a duration).

    Parameters
    ----------
    path : str
        Path to the media file to probe.

    Returns
    -------
    float or None
        Duration in seconds, or None if ffprobe is unavailable, exits
        non-zero, or prints something that cannot be parsed as a float.
    """
    # List-argv form (no shell): immune to quoting/injection issues that a
    # string command built from `path` would have, even with shlex.quote.
    cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        path,
    ]
    try:
        out = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except OSError:
        # ffprobe binary missing or not executable — same contract as any
        # other probe failure: the caller falls back to the audio length.
        return None
    if out.returncode != 0:
        print(f"ffprobe warning: {out.stderr}")
        return None
    try:
        # ffprobe can emit several lines; the first one carries the duration.
        return float(out.stdout.strip().split("\n")[0])
    except (ValueError, IndexError):
        # Empty or non-numeric output (e.g. "N/A" for some live streams).
        return None
| 72 |
|
|
|
|
| 258 |
return output_path
|
| 259 |
|
| 260 |
# ----------------------------
|
| 261 |
+
# PIPELINE PRINCIPAL (FIXED)
|
| 262 |
# ----------------------------
|
| 263 |
def pipeline(video_input, model_name):
|
| 264 |
"""
|
|
|
|
| 266 |
model_name : clé dans MODELS
|
| 267 |
"""
|
| 268 |
try:
|
| 269 |
+
# Support Gradio dict (tmp_path)
|
| 270 |
if isinstance(video_input, dict) and "tmp_path" in video_input:
|
| 271 |
video_path = video_input["tmp_path"]
|
| 272 |
else:
|
| 273 |
video_path = video_input
|
| 274 |
|
| 275 |
+
# 1. Tentative d'obtention de durée via FFPROBE
|
| 276 |
duration = ffprobe_duration(video_path)
|
|
|
|
|
|
|
| 277 |
|
| 278 |
+
# 2. Extraction & Nettoyage Audio
|
| 279 |
tmp_fd, tmp_wav = tempfile.mkstemp(suffix=".wav")
|
| 280 |
os.close(tmp_fd)
|
| 281 |
|
|
|
|
| 282 |
extract_audio(video_path, tmp_wav)
|
| 283 |
clean_wav, audio, sr = clean_audio(tmp_wav)
|
| 284 |
|
| 285 |
+
# 3. FALLBACK: Si FFprobe a échoué (None), on calcule depuis l'audio
|
| 286 |
+
if duration is None:
|
| 287 |
+
print("[INFO] ffprobe duration failed, calculating from audio...")
|
| 288 |
+
if sr and sr > 0:
|
| 289 |
+
duration = len(audio) / sr
|
| 290 |
+
|
| 291 |
+
# Vérification finale
|
| 292 |
+
if not duration or duration <= 0:
|
| 293 |
+
raise RuntimeError("Impossible de déterminer la durée de la vidéo (fichier corrompu ?)")
|
| 294 |
+
|
| 295 |
+
print(f"[INFO] Durée détectée: {duration:.2f}s")
|
| 296 |
+
|
| 297 |
+
# 4. Chargement modèle + Transcription
|
| 298 |
model = load_model(model_name)
|
| 299 |
text = transcribe(model, clean_wav)
|
| 300 |
mode = MODELS[model_name][1]
|
| 301 |
|
| 302 |
+
# 5. Segmentation / Alignement
|
| 303 |
subs = None
|
| 304 |
if mode == "rnnt":
|
| 305 |
# RNNT : tentative de segmentation via logits + ctc_segmentation si dispo
|
|
|
|
| 312 |
ln = torch.tensor([x.shape[1]]).to(DEVICE)
|
| 313 |
with torch.no_grad():
|
| 314 |
logits = model(input_signal=x, input_signal_length=ln)[0]
|
| 315 |
+
|
| 316 |
time_per_frame = duration / max(1, logits.shape[1])
|
| 317 |
+
|
| 318 |
# build char list
|
| 319 |
try:
|
| 320 |
raw = model.tokenizer.vocab
|
| 321 |
vocab = list(raw.keys()) if isinstance(raw, dict) else list(raw)
|
| 322 |
except Exception:
|
| 323 |
vocab = None
|
| 324 |
+
|
| 325 |
cfg = CtcSegmentationParameters()
|
| 326 |
if vocab:
|
| 327 |
cfg.char_list = vocab
|
| 328 |
+
|
| 329 |
gt = prepare_text(cfg, words)[0]
|
| 330 |
try:
|
| 331 |
timing, _, _ = ctc_segmentation(cfg, logits.detach().cpu().numpy()[0], gt)
|
|
|
|
| 339 |
subs = align_vad(text, audio, sr, duration)
|
| 340 |
|
| 341 |
elif mode == "ctc_char":
|
| 342 |
+
# QuartzNet (char) : pas de tokenizer BPE, VAD fallback
|
|
|
|
| 343 |
try:
|
| 344 |
subs = align_vad(text, audio, sr, duration)
|
| 345 |
except Exception as e:
|
|
|
|
| 347 |
subs = align_vad(text, audio, sr, duration)
|
| 348 |
|
| 349 |
else: # ctc (BPE)
|
| 350 |
+
# Soloba CTC : VAD fallback
|
| 351 |
try:
|
| 352 |
subs = align_vad(text, audio, sr, duration)
|
| 353 |
except Exception as e:
|
|
|
|
| 365 |
return (f"❌ Erreur — {str(e)}", None)
|
| 366 |
|
| 367 |
# ----------------------------
|
| 368 |
+
# INTERFACE GRADIO
|
| 369 |
# ----------------------------
|
| 370 |
# Gradio front-end: upload or record a video, pick an ASR model, and get
# back a status message plus the subtitled video produced by `pipeline`.
with gr.Blocks(title="RobotsMali - Sous-titrage") as demo:
    gr.Markdown("## RobotsMali — Sous-titrage Bambara")

    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column():
            video_in = gr.Video(label="Vidéo à sous-titrer", sources=["upload", "webcam"])
            model_choice = gr.Dropdown(list(MODELS.keys()), value="Soloba V1 (CTC)", label="Modèle ASR")
            run_btn = gr.Button("▶️ Générer les sous-titres", variant="primary")
        # Right column: outputs written by the pipeline.
        with gr.Column():
            status_out = gr.Markdown(label="Statut")
            video_out = gr.Video(label="Vidéo sous-titrée")

    # Wire the button to the processing pipeline.
    run_btn.click(pipeline, [video_in, model_choice], [status_out, video_out])

if __name__ == "__main__":
    demo.launch(share=True, debug=True)
|