Update app.py
app.py
CHANGED
@@ -8,12 +8,14 @@ torch.set_grad_enabled(False)
 import gradio as gr
 import numpy as np
 import soundfile as sf
+
 from moviepy.editor import VideoFileClip, CompositeVideoClip, ImageClip
 from PIL import Image, ImageDraw, ImageFont
+
 from nemo.collections import asr as nemo_asr


-# ----------------
+# ---------------- CONFIG ---------------- #
 os.environ["NEMO_FORCE_CPU"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "false"

@@ -44,10 +46,10 @@ def load_model(name):
     return model, device


-# ---------------- EXTRACT AUDIO
+# ---------------- EXTRACT AUDIO ---------------- #
 def extract_audio(video_path, wav_path):
     if os.path.getsize(video_path) > MAX_VIDEO_BYTES:
-        raise RuntimeError("⚠️ Vidéo
+        raise RuntimeError("⚠️ Vidéo > 200MB. Compressez avant l’upload.")
     os.system(f"ffmpeg -y -i '{video_path}' -ac 1 -ar {SR} -vn '{wav_path}' >/dev/null 2>&1")
     audio, sr = sf.read(wav_path)
     return len(audio)/sr
@@ -65,7 +67,7 @@ def transcribe(model, device, wav_path, model_key):
     x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
     ln = torch.tensor([x.shape[1]]).to(device)

-    # ✅
+    # ✅ Soloni real timestamps
     if "Soloni" in model_key and hasattr(model, "decode_and_align"):
         try:
             with torch.no_grad():
@@ -77,14 +79,14 @@ def transcribe(model, device, wav_path, model_key):
         except:
             pass

-    # ✅ Universal fallback
+    # ✅ Universal fallback for Soloba + QuartzNet
     out = model.transcribe([wav_path])[0]
     text = out.text.strip() if hasattr(out, "text") else str(out).strip()
     words = text.split()
     if not words:
         return []

-    wps = max(2.0, len(words) / total_s)
+    wps = max(2.0, len(words) / total_s)
     subs, t = [], 0
     for w in words:
         d = 1 / wps
@@ -106,15 +108,23 @@ def burn(video_path, subs):

     layers = []
     for s, e, w in subs:
-        if e <= s:
-            continue
+        if e <= s: continue

-        img = Image.new("RGBA", (W, int(H*0.12)), (0,
+        img = Image.new("RGBA", (W, int(H*0.12)), (0,0,0,140))
         draw = ImageDraw.Draw(img)

         text = w.upper()
-        tw, th = draw.textsize(text, font=font)
-        draw.text(((W - tw) // 2, (int(H*0.12) - th) // 2), text, font=font, fill=(255,255,255))
+
+        # ✅ Pillow 10+ compatible text size
+        try:
+            bbox = draw.textbbox((0,0), text, font=font)
+            tw, th = bbox[2]-bbox[0], bbox[3]-bbox[1]
+        except:
+            tw, th = draw.textsize(text, font=font)
+
+        x = (W - tw) // 2
+        y = (int(H*0.12) - th) // 2
+        draw.text((x, y), text, font=font, fill=(255,255,255))

         img_clip = ImageClip(np.array(img)).set_start(s).set_duration(e-s).set_position(("center", int(H*0.85)))
         layers.append(img_clip)
@@ -122,7 +132,6 @@ def burn(video_path, subs):
     final = CompositeVideoClip([clip] + layers)
     out = "RobotsMali_Subtitled.mp4"
     final.write_videofile(out, codec="libx264", audio_codec="aac", fps=clip.fps, verbose=False, logger=None)
-
     clip.close()
     final.close()
     return out
@@ -135,18 +144,15 @@ def pipeline(video, model_name, progress=gr.Progress()):

     with tempfile.TemporaryDirectory() as td:
         wav = f"{td}/audio.wav"
-        progress(0.
+        progress(0.5, "🔊 Extraction audio…")
         extract_audio(video, wav)

-        progress(0.
+        progress(0.75, "🧠 Transcription…")
         subs = transcribe(model, device, wav, model_name)
-        if not subs:
-            return "⚠️ Aucun mot reconnu.", None

         progress(0.95, "🎞️ Incrustation…")
         out = burn(video, subs)
-
-        return f"✅ Sous-titres générés avec **{model_name}**", out
+        return f"✅ Sous-titrage généré avec **{model_name}**", out


 # ---------------- UI ---------------- #
@@ -157,7 +163,7 @@ h1 { text-align:center; font-weight:800; color:#005BFF; margin-bottom:6px; }
 """

 with gr.Blocks(css=CSS, title="RobotsMali Caption Studio") as demo:
-    gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Sous-titrage
+    gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Sous-titrage automatique en Bambara</p>")
     video = gr.File(label="🎥 Importer une vidéo")
     model = gr.Dropdown(list(ASR_MODELS.keys()), value="Soloni 114M TDT CTC V1", label="🧠 Modèle ASR")
     run = gr.Button("🚀 Générer les sous-titres")
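A note on the headline fix: Pillow deprecated ImageDraw.textsize() in 9.2 and removed it in 10.0, which is why burn() now measures text with textbbox() and keeps textsize() only as a fallback for older Pillow. A minimal, self-contained sketch of the same measurement (the 640x80 canvas and "HELLO" text are illustrative, not from app.py):

from PIL import Image, ImageDraw, ImageFont

img = Image.new("RGBA", (640, 80), (0, 0, 0, 140))
draw = ImageDraw.Draw(img)
font = ImageFont.load_default()

# textbbox() returns (left, top, right, bottom) of the rendered text
bbox = draw.textbbox((0, 0), "HELLO", font=font)
tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
draw.text(((640 - tw) // 2, (80 - th) // 2), "HELLO", font=font, fill=(255, 255, 255))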
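The universal fallback in transcribe() has no real word timestamps, so it spreads the recognized words evenly across the clip, clamped to at least 2 words per second. A standalone sketch of that timing logic (the function name is hypothetical):

def even_word_timings(words, total_s):
    # Clamp so a short transcript never stretches one word over many seconds
    wps = max(2.0, len(words) / total_s)
    subs, t = [], 0.0
    for w in words:
        d = 1 / wps
        subs.append((t, t + d, w))
        t += d
    return subs

print(even_word_timings(["aw", "ni", "ce"], 3.0))
# [(0.0, 0.5, 'aw'), (0.5, 1.0, 'ni'), (1.0, 1.5, 'ce')]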
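One caveat left untouched by this commit: extract_audio() builds its ffmpeg command with os.system and single-quoted interpolation, which breaks on file names containing quotes. A quoting-safe sketch of the same call (the function name and the 16 kHz default are assumptions; app.py takes the rate from SR):

import subprocess

def extract_audio_safe(video_path, wav_path, sr=16000):
    # Same flags as app.py: -ac 1 = mono, -ar = sample rate, -vn = drop video
    subprocess.run(
        ["ffmpeg", "-y", "-i", video_path, "-ac", "1", "-ar", str(sr), "-vn", wav_path],
        check=True, capture_output=True,
    )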