Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
import os
|
| 3 |
import numpy as np
|
| 4 |
import torch
|
| 5 |
import soundfile as sf
|
|
@@ -26,7 +25,7 @@ MODELS = {
|
|
| 26 |
|
| 27 |
|
| 28 |
# =============================
|
| 29 |
-
# EXTRACTION AUDIO (
|
| 30 |
# =============================
|
| 31 |
|
| 32 |
def extract_audio(video_path, wav_path):
|
|
@@ -65,7 +64,7 @@ def transcribe(model, device, wav, model_name):
|
|
| 65 |
hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
|
| 66 |
return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
|
| 67 |
|
| 68 |
-
# === Soloba
|
| 69 |
text = model.transcribe([wav])[0].strip()
|
| 70 |
if not text:
|
| 71 |
return []
|
|
@@ -85,6 +84,7 @@ def transcribe(model, device, wav, model_name):
|
|
| 85 |
timings[i+1] * tps if i+1 < len(timings) else total_s,
|
| 86 |
words[i]) for i in range(len(words))]
|
| 87 |
|
|
|
|
| 88 |
grouped, temp = [], []
|
| 89 |
for w in aligned:
|
| 90 |
temp.append(w)
|
|
@@ -122,13 +122,14 @@ def burn(video, subs):
|
|
| 122 |
final = CompositeVideoClip([clip] + layers)
|
| 123 |
out = "RobotsMali_Subtitled.mp4"
|
| 124 |
final.write_videofile(out, codec="libx264", audio_codec="aac", fps=clip.fps, verbose=False, logger=None)
|
|
|
|
| 125 |
clip.close()
|
| 126 |
final.close()
|
| 127 |
return out
|
| 128 |
|
| 129 |
|
| 130 |
# =============================
|
| 131 |
-
# PIPELINE
|
| 132 |
# =============================
|
| 133 |
|
| 134 |
def pipeline(video_file, model_name):
|
|
@@ -136,7 +137,15 @@ def pipeline(video_file, model_name):
|
|
| 136 |
return "Veuillez importer une vidéo.", None
|
| 137 |
|
| 138 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
wav = "audio.wav"
|
| 142 |
extract_audio(video_file, wav)
|
|
@@ -147,17 +156,17 @@ def pipeline(video_file, model_name):
|
|
| 147 |
|
| 148 |
|
| 149 |
# =============================
|
| 150 |
-
# INTERFACE (
|
| 151 |
# =============================
|
| 152 |
|
| 153 |
with gr.Blocks() as demo:
|
| 154 |
-
gr.Markdown("# 🎙️ **RobotsMali
|
| 155 |
|
| 156 |
video = gr.Video(label="Vidéo")
|
| 157 |
model = gr.Dropdown(list(MODELS.keys()), value="Soloni V1", label="Modèle")
|
| 158 |
btn = gr.Button("⚡ Générer les sous-titres")
|
| 159 |
status = gr.Markdown()
|
| 160 |
-
out = gr.Video(label="Résultat")
|
| 161 |
|
| 162 |
btn.click(pipeline, inputs=[video, model], outputs=[status, out])
|
| 163 |
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
import torch
|
| 4 |
import soundfile as sf
|
|
|
|
| 25 |
|
| 26 |
|
| 27 |
# =============================
|
| 28 |
+
# EXTRACTION AUDIO (SOLIDE & COMPATIBLE HF)
|
| 29 |
# =============================
|
| 30 |
|
| 31 |
def extract_audio(video_path, wav_path):
|
|
|
|
| 64 |
hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
|
| 65 |
return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
|
| 66 |
|
| 67 |
+
# === Soloba & QuartzNet → CTC Forced Alignment ===
|
| 68 |
text = model.transcribe([wav])[0].strip()
|
| 69 |
if not text:
|
| 70 |
return []
|
|
|
|
| 84 |
timings[i+1] * tps if i+1 < len(timings) else total_s,
|
| 85 |
words[i]) for i in range(len(words))]
|
| 86 |
|
| 87 |
+
# Groupage lisible (max 4 mots par sous-titre)
|
| 88 |
grouped, temp = [], []
|
| 89 |
for w in aligned:
|
| 90 |
temp.append(w)
|
|
|
|
| 122 |
final = CompositeVideoClip([clip] + layers)
|
| 123 |
out = "RobotsMali_Subtitled.mp4"
|
| 124 |
final.write_videofile(out, codec="libx264", audio_codec="aac", fps=clip.fps, verbose=False, logger=None)
|
| 125 |
+
|
| 126 |
clip.close()
|
| 127 |
final.close()
|
| 128 |
return out
|
| 129 |
|
| 130 |
|
| 131 |
# =============================
|
| 132 |
+
# PIPELINE PRINCIPAL
|
| 133 |
# =============================
|
| 134 |
|
| 135 |
def pipeline(video_file, model_name):
|
|
|
|
| 137 |
return "Veuillez importer une vidéo.", None
|
| 138 |
|
| 139 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 140 |
+
|
| 141 |
+
# Chargement correct selon le modèle
|
| 142 |
+
if "Soloni" in model_name:
|
| 143 |
+
model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(model_name=MODELS[model_name])
|
| 144 |
+
else:
|
| 145 |
+
model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name=MODELS[model_name])
|
| 146 |
+
|
| 147 |
+
model = model.to(device)
|
| 148 |
+
model.eval()
|
| 149 |
|
| 150 |
wav = "audio.wav"
|
| 151 |
extract_audio(video_file, wav)
|
|
|
|
| 156 |
|
| 157 |
|
| 158 |
# =============================
|
| 159 |
+
# INTERFACE (DESIGN CONSERVÉ)
|
| 160 |
# =============================
|
| 161 |
|
| 162 |
with gr.Blocks() as demo:
|
| 163 |
+
gr.Markdown("# 🎙️ **RobotsMali — Sous-titrage automatique Bambara**")
|
| 164 |
|
| 165 |
video = gr.Video(label="Vidéo")
|
| 166 |
model = gr.Dropdown(list(MODELS.keys()), value="Soloni V1", label="Modèle")
|
| 167 |
btn = gr.Button("⚡ Générer les sous-titres")
|
| 168 |
status = gr.Markdown()
|
| 169 |
+
out = gr.Video(label="Résultat (avec sous-titres)")
|
| 170 |
|
| 171 |
btn.click(pipeline, inputs=[video, model], outputs=[status, out])
|
| 172 |
|