Spaces:

RobotsMali
/

RobotsMali_Video_captionning

Running

App Files Files Community

binaryMao committed on Oct 30, 2025

Commit

8f9582a

verified ·

1 Parent(s): 4281210

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -102

app.py CHANGED Viewed

@@ -1,46 +1,24 @@
 import os, warnings, logging, tempfile
-# === STOP useless warnings ===
 warnings.filterwarnings("ignore")
 logging.getLogger("nemo_logger").setLevel(logging.ERROR)
-# === CPU fallback for HuggingFace ===
-os.environ["NEMO_FORCE_CPU"] = "1"
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
 import torch
 torch.set_grad_enabled(False)
 import gradio as gr
 import numpy as np
 import soundfile as sf
-# === Force MoviePy to use ImageMagick ===
-import moviepy.config as mpconf
-mpconf.change_settings({"IMAGEMAGICK_BINARY": "/usr/bin/convert"})
-from moviepy.editor import VideoFileClip, CompositeVideoClip, TextClip
 from nemo.collections import asr as nemo_asr
-# === FIX IMAGEMAGICK POLICY (Required on HF Spaces) ===
-def unlock_imagemagick():
-    POLICIES = [
-        "/etc/ImageMagick/policy.xml",
-        "/etc/ImageMagick-6/policy.xml"
-    ]
-    for p in POLICIES:
-        if os.path.exists(p):
-            print(f"⚙️ Patching ImageMagick security: {p}")
-            os.system(f"sed -i 's/rights=\"none\"/rights=\"read|write\"/g' {p}")
-unlock_imagemagick()
-# ---------------- CONFIG ---------------- #
 SR = 16000
-MAX_VIDEO_BYTES = 200_000_000  # Max 200MB video upload
 ASR_MODELS = {
     "Soloba CTC 0.6B V0": "RobotsMali/soloba-ctc-0.6b-v0",
@@ -55,7 +33,6 @@ _CACHE = {}
 # ---------------- LOAD MODEL ---------------- #
 def load_model(name):
     if name in _CACHE:
         return _CACHE[name]
@@ -68,140 +45,111 @@ def load_model(name):
 # ---------------- EXTRACT AUDIO (FORCE MONO) ---------------- #
 def extract_audio(video_path, wav_path):
     if os.path.getsize(video_path) > MAX_VIDEO_BYTES:
         raise RuntimeError("⚠️ Vidéo trop lourde (>200MB). Compressez avant l’upload.")
-    # Force mono + 16kHz → prevents all ASR crashes
     os.system(f"ffmpeg -y -i '{video_path}' -ac 1 -ar {SR} -vn '{wav_path}' >/dev/null 2>&1")
     audio, sr = sf.read(wav_path)
-    if sr == 0 or len(audio) == 0:
-        raise RuntimeError("⚠️ Impossible de lire l’audio.")
     return len(audio)/sr
-# ---------------- TRANSCRIBE (UNIFIED & SAFE) ---------------- #
 def transcribe(model, device, wav_path, model_key):
     audio, sr = sf.read(wav_path)
     if audio.ndim == 2:
         audio = np.mean(audio, axis=1).astype(np.float32)
     if np.max(np.abs(audio)) > 1:
         audio = audio / np.max(np.abs(audio))
-    total_s = len(audio)/sr if sr else 0
-    if total_s <= 0:
-        return []
     x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
     ln = torch.tensor([x.shape[1]]).to(device)
-    # === SOLONI → true timestamps ===
     if "Soloni" in model_key and hasattr(model, "decode_and_align"):
         try:
             with torch.no_grad():
-                proc, plen = model.preprocessor(
-                    input_signal=x,
-                    input_signal_length=ln
-                )
-                hyps = model.decode_and_align(
-                    encoder_output=proc,
-                    encoded_lengths=plen
-                )
             hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
             if hasattr(hyp, "words") and hyp.words:
                 return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
         except:
-            pass  # fallback auto
-    # === UNIVERSAL FALLBACK (Soloba + QuartzNet + backup Soloni) ===
     out = model.transcribe([wav_path])[0]
     text = out.text.strip() if hasattr(out, "text") else str(out).strip()
-    if not text:
-        return []
     words = text.split()
     if not words:
         return []
-    wps = max(2.0, len(words) / total_s)
     subs, t = [], 0
     for w in words:
         d = 1 / wps
         subs.append((t, min(total_s, t+d), w))
         t += d
         if t >= total_s: break
     return subs
-# ---------------- BURN SUBTITLES ---------------- #
 def burn(video_path, subs):
-    clip, final = None, None
     try:
-        clip = VideoFileClip(video_path)
-        W, H = clip.size
-        layers = []
-        for s, e, w in subs:
-            if e <= s: continue
-            txt = TextClip(
-                w.upper(),
-                fontsize=int(H/20),
-                font="DejaVu-Sans",        # ✅ Stable Linux font
-                color="white",
-                stroke_color="black",
-                stroke_width=2,
-                method="caption",
-                size=(int(W*0.9), None)
-            ).set_start(s).set_duration(e-s).set_position(("center", int(H*0.88)))
-            layers.append(txt)
-        final = CompositeVideoClip([clip] + layers)
-        out = "RobotsMali_Subtitled.mp4"
-        final.write_videofile(out, codec="libx264", audio_codec="aac", verbose=False, logger=None)
-        return out
-    finally:
-        try: final.close()
-        except: pass
-        try: clip.close()
-        except: pass
-# ---------------- PIPELINE ---------------- #
 def pipeline(video, model_name, progress=gr.Progress()):
-    progress(0.3, "📦 Chargement du modèle…")
     model, device = load_model(model_name)
     with tempfile.TemporaryDirectory() as td:
         wav = f"{td}/audio.wav"
-        progress(0.5, "🔊 Extraction audio…")
         extract_audio(video, wav)
-        progress(0.75, "🧠 Transcription en cours…")
         subs = transcribe(model, device, wav, model_name)
         if not subs:
-            return "⚠️ Aucun mot détecté.", None
-    progress(0.95, "🎞️ Incrustation des sous-titres…")
     out = burn(video, subs)
-    progress(1.0, "✅ Terminé.")
-    return f"✅ Sous-titrage généré avec **{model_name}**", out
 # ---------------- UI ---------------- #
 CSS = """
 body { background:#F5F8FF; font-family:Inter, sans-serif; }
 h1 { text-align:center; font-weight:800; color:#005BFF; margin-bottom:6px; }

 import os, warnings, logging, tempfile
 warnings.filterwarnings("ignore")
 logging.getLogger("nemo_logger").setLevel(logging.ERROR)
 import torch
 torch.set_grad_enabled(False)
 import gradio as gr
 import numpy as np
 import soundfile as sf
+from moviepy.editor import VideoFileClip, CompositeVideoClip, ImageClip
+from PIL import Image, ImageDraw, ImageFont
 from nemo.collections import asr as nemo_asr
+# ---------------- GLOBAL CONFIG ---------------- #
+os.environ["NEMO_FORCE_CPU"] = "1"
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 SR = 16000
+MAX_VIDEO_BYTES = 200_000_000
 ASR_MODELS = {
     "Soloba CTC 0.6B V0": "RobotsMali/soloba-ctc-0.6b-v0",
 # ---------------- LOAD MODEL ---------------- #
 def load_model(name):
     if name in _CACHE:
         return _CACHE[name]
 # ---------------- EXTRACT AUDIO (FORCE MONO) ---------------- #
 def extract_audio(video_path, wav_path):
     """Extract the audio track of a video into a mono WAV file sampled at ``SR`` Hz.

     Args:
         video_path: Path of the input video file.
         wav_path: Destination path of the WAV file (overwritten if it exists).

     Returns:
         Duration of the extracted audio, in seconds.

     Raises:
         RuntimeError: If the video is larger than ``MAX_VIDEO_BYTES``, if ffmpeg
             cannot be started or exits with an error, or if the extracted
             audio is empty.
     """
     import subprocess  # function-scope import so the block brings its own dependency

     if os.path.getsize(video_path) > MAX_VIDEO_BYTES:
         raise RuntimeError("⚠️ Vidéo trop lourde (>200MB). Compressez avant l’upload.")
     # Pass an argument list (no shell): a quote or shell metacharacter in the
     # uploaded file's path can neither break the command nor inject another one.
     command = [
         "ffmpeg", "-y", "-i", video_path,
         "-ac", "1", "-ar", str(SR), "-vn", wav_path,
     ]
     try:
         completed = subprocess.run(
             command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
         )
     except OSError as err:
         # ffmpeg binary missing or not executable.
         raise RuntimeError("⚠️ Impossible de lire l’audio.") from err
     if completed.returncode != 0:
         # Fail here with a clear message instead of on a missing/partial WAV below.
         raise RuntimeError("⚠️ Impossible de lire l’audio.")
     audio, sr = sf.read(wav_path)
     if not sr or len(audio) == 0:
         raise RuntimeError("⚠️ Impossible de lire l’audio.")
     return len(audio) / sr
+# ---------------- TRANSCRIBE ---------------- #
 def transcribe(model, device, wav_path, model_key):
     """Transcribe a WAV file into a list of timed words.

     For model keys containing ``"Soloni"`` whose model exposes
     ``decode_and_align``, the word offsets reported by the model are used.
     Otherwise (or if that path fails or yields no words) the text returned by
     ``model.transcribe`` is split on whitespace and the words are spread evenly
     over the audio at a rate of at least two words per second.

     Args:
         model: ASR model object providing ``transcribe`` (and optionally
             ``preprocessor`` / ``decode_and_align``).
         device: Torch device the input tensors are moved to.
         wav_path: Path of the WAV file to transcribe.
         model_key: Display name of the model; selects the alignment path.

     Returns:
         A list of ``(start_s, end_s, word)`` tuples; empty when the audio is
         empty or no word was recognised.
     """
     audio, sr = sf.read(wav_path)
     if audio.ndim == 2:
         audio = np.mean(audio, axis=1).astype(np.float32)
     # Empty audio would make np.max raise and total_s a zero divisor below.
     if not sr or len(audio) == 0:
         return []
     total_s = len(audio) / sr
     peak = np.max(np.abs(audio))
     if peak > 1:
         audio = audio / peak
     if "Soloni" in model_key and hasattr(model, "decode_and_align"):
         # Tensors are only needed on this path, so build them here.
         x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
         ln = torch.tensor([x.shape[1]]).to(device)
         try:
             with torch.no_grad():
                 proc, plen = model.preprocessor(input_signal=x, input_signal_length=ln)
                 hyps = model.decode_and_align(encoder_output=proc, encoded_lengths=plen)
             hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
             if hasattr(hyp, "words") and hyp.words:
                 return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
         except Exception:
             # Best effort: keep the fallback below, but leave a trace and let
             # KeyboardInterrupt / SystemExit propagate.
             logging.getLogger(__name__).warning(
                 "Alignment failed for %s; using evenly spaced timestamps",
                 model_key,
                 exc_info=True,
             )
     # Universal fallback: plain transcription with synthetic, evenly spaced timings.
     out = model.transcribe([wav_path])[0]
     text = out.text.strip() if hasattr(out, "text") else str(out).strip()
     words = text.split()
     if not words:
         return []
     wps = max(2.0, len(words) / total_s)  # words per second
     step = 1 / wps  # display time of one word; constant for the whole loop
     subs, t = [], 0
     for w in words:
         subs.append((t, min(total_s, t + step), w))
         t += step
         if t >= total_s:
             break
     return subs
+# ---------------- BURN SUBTITLES (NO IMAGEMAGICK) ---------------- #
 def _text_box(draw, text, font):
     """Return ``(left, top, width, height)`` of ``text`` drawn with ``font``."""
     if hasattr(draw, "textbbox"):
         try:
             left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
             return left, top, right - left, bottom - top
         except ValueError:
             # NOTE(review): some Pillow releases appear to reject non-TrueType
             # fonts in textbbox — confirm; fall through to the legacy call.
             pass
     # Legacy Pillow API (no longer present in Pillow 10+).
     width, height = draw.textsize(text, font=font)
     return 0, 0, width, height

 def burn(video_path, subs):
     """Render word-by-word subtitles onto a video and write the result to disk.

     Each subtitle is drawn with Pillow on a semi-transparent black band and
     overlaid on the video for its ``(start, end)`` interval.

     Args:
         video_path: Path of the source video.
         subs: Iterable of ``(start_s, end_s, word)`` tuples; entries whose end
             is not after their start are skipped.

     Returns:
         Path of the written file (``"RobotsMali_Subtitled.mp4"``).
     """
     clip = VideoFileClip(video_path)
     final = None
     try:
         W, H = clip.size
         band_h = int(H*0.12)
         try:
             font = ImageFont.truetype(
                 "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
                 max(1, int(H/20)),  # a zero font size is invalid on tiny videos
             )
         except OSError:
             # Font file missing or unreadable: use Pillow's built-in font.
             font = ImageFont.load_default()
         layers = []
         for s, e, w in subs:
             if e <= s:
                 continue
             img = Image.new("RGBA", (W, band_h), (0, 0, 0, 140))
             draw = ImageDraw.Draw(img)
             text = w.upper()
             left, top, tw, th = _text_box(draw, text, font)
             # Subtract the box origin so the visible glyphs are centred in the band.
             draw.text(((W-tw)//2 - left, (band_h-th)//2 - top), text, font=font, fill=(255,255,255))
             img_clip = ImageClip(np.array(img)).set_start(s).set_duration(e-s).set_position(("center", int(H*0.85)))
             layers.append(img_clip)
         final = CompositeVideoClip([clip] + layers)
         out = "RobotsMali_Subtitled.mp4"
         final.write_videofile(out, codec="libx264", audio_codec="aac", fps=clip.fps, verbose=False, logger=None)
         return out
     finally:
         # Release the clips even when rendering fails part-way.
         clip.close()
         if final is not None:
             final.close()
+# ---------------- PIPELINE ---------------- #
 def pipeline(video, model_name, progress=gr.Progress()):
     """Run the full captioning flow: load model, extract audio, transcribe, burn.

     Args:
         video: Path of the uploaded video.
         model_name: Key of the ASR model to use.
         progress: Gradio progress callback, called with a fraction and a label.

     Returns:
         A ``(status_message, output_video_path)`` tuple; the path is ``None``
         when no word was recognised.
     """
     progress(0.2, "📦 Chargement du modèle…")
     asr_model, target_device = load_model(model_name)
     # The WAV file is only needed for transcription, so it lives in a
     # temporary directory that is removed as soon as the captions exist.
     with tempfile.TemporaryDirectory() as workdir:
         wav_file = f"{workdir}/audio.wav"
         progress(0.4, "🔊 Extraction audio…")
         extract_audio(video, wav_file)
         progress(0.7, "🧠 Transcription…")
         captions = transcribe(asr_model, target_device, wav_file, model_name)
     if not captions:
         return "⚠️ Aucun mot reconnu.", None
     progress(0.95, "🎞️ Incrustation…")
     subtitled_path = burn(video, captions)
     return f"✅ Sous-titres générés avec **{model_name}**", subtitled_path
 # ---------------- UI ---------------- #
 CSS = """
 body { background:#F5F8FF; font-family:Inter, sans-serif; }
 h1 { text-align:center; font-weight:800; color:#005BFF; margin-bottom:6px; }