binaryMao commited on
Commit
63cfe96
·
verified ·
1 Parent(s): e9f0a14

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -174
app.py CHANGED
@@ -45,256 +45,128 @@ _cache = {}
45
  # UTIL: run_cmd, ffprobe_duration
46
  # ----------------------------
47
  def run_cmd(cmd):
48
- """Execute a shell command and raise on non-zero exit."""
49
- print("RUN:", cmd)
50
  res = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
51
  if res.returncode != 0:
52
  raise RuntimeError(f"Commande échouée [{cmd}]\nOutput:\n{res.stdout}")
53
  return res.stdout
54
 
55
  def ffprobe_duration(path):
56
- """Tente d'obtenir la durée via ffprobe."""
57
  cmd = f'ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 {shlex.quote(path)}'
58
  out = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
59
  if out.returncode != 0: return None
60
  try:
61
- output = out.stdout.strip().split('\n')[0]
62
  return float(output)
63
  except: return None
64
 
65
  # ----------------------------
66
- # LOAD MODEL (robust)
67
  # ----------------------------
68
  def load_model(name):
69
- """Charge le modèle NeMo correct."""
70
  if name in _cache: return _cache[name]
71
  repo, mode = MODELS[name]
72
- print(f"[LOAD] snapshot_download {repo} ...")
73
  folder = snapshot_download(repo, local_dir_use_symlinks=False)
74
  nemo_file = next((os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nemo")), None)
75
- if not nemo_file: raise FileNotFoundError(f"Aucun .nemo trouvé pour {name} dans {folder}")
76
-
77
- if mode == "rnnt": model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo_file)
78
- elif mode == "ctc_char": model = nemo_asr.models.EncDecCTCModel.restore_from(nemo_file)
 
 
79
  else:
80
- try: model = nemo_asr.models.EncDecCTCModelBPE.restore_from(nemo_file)
81
- except Exception: model = nemo_asr.models.EncDecCTCModel.restore_from(nemo_file)
82
-
 
83
  model.to(DEVICE).eval()
84
  _cache[name] = model
85
- print(f"[OK] Modèle {name} chargé sur {DEVICE}")
86
  return model
87
 
88
  # ----------------------------
89
- # AUDIO EXTRACTION & CLEANING
90
  # ----------------------------
91
  def extract_audio(video_path, out_wav):
92
- """Extract mono 16k WAV using ffmpeg."""
93
  cmd = f'ffmpeg -hide_banner -loglevel error -y -i {shlex.quote(video_path)} -vn -ac 1 -ar 16000 -f wav {shlex.quote(out_wav)}'
94
  run_cmd(cmd)
95
 
96
  def clean_audio(wav_path, target_sr=16000):
97
- """Load audio, apply noise reduction, resample, normalize, write cleaned wav."""
98
  audio, sr = sf.read(wav_path)
99
  if audio.ndim == 2: audio = audio.mean(axis=1)
100
  if sr != target_sr:
101
  audio = librosa.resample(audio.astype(float), orig_sr=sr, target_sr=target_sr)
102
  sr = target_sr
103
-
104
  try:
105
- print("[INFO] Application de la réduction de bruit (noisereduce)...")
106
  audio = nr.reduce_noise(y=audio, sr=sr, stationary=True, prop_decrease=0.75)
107
- except Exception as e:
108
- print(f"[WARN] Echec noisereduce: {e}")
109
-
110
- max_val = np.max(np.abs(audio)) if audio.size > 0 else 0.0
111
- if max_val > 1e-6: audio = audio / max_val * 0.95
112
-
113
  clean_path = str(Path(wav_path).with_name(Path(wav_path).stem + "_clean.wav"))
114
  sf.write(clean_path, audio, sr)
115
  return clean_path, audio, sr
116
 
117
  # ----------------------------
118
- # TRANSCRIPTION & ALIGNMENT UTILS
119
  # ----------------------------
120
  def transcribe(model, wav_path):
121
- """Robuste: essaie model.transcribe et nettoie la sortie."""
122
- if not hasattr(model, "transcribe"): raise RuntimeError("Le modèle ne supporte pas model.transcribe()")
123
  out = model.transcribe([wav_path])
124
- if isinstance(out, list) and len(out) > 0: out = out[0]
125
  if hasattr(out, "text"): return out.text.strip()
126
  return str(out).strip()
127
 
128
# Subtitle layout limits: max characters per rendered line, minimum/maximum
# cue duration (seconds), and maximum words per cue.
MAX_CHARS = 45; MIN_DUR = 0.3; MAX_DUR = 3.2; MAX_WORDS = 8

def pack(spans, total):
    """Normalize raw (start, end, text) spans into clean subtitle cues.

    Clamps spans to [0, total], merges cues that are too short or nearly
    touching, splits long texts into MAX_WORDS-word blocks spread evenly
    over the merged span, wraps each block to at most two MAX_CHARS-wide
    display lines, and enforces strictly increasing, non-overlapping
    timestamps.  Returns a list of (start, end, text) tuples.
    """
    # Complex grouping/repacking logic (unchanged).
    # Pass 1: clamp to the video duration, drop empty or inverted spans.
    tmp = []
    for s, e, t in spans:
        s = max(0, min(s, total)); e = max(0, min(e, total))
        if e <= s or not t.strip(): continue
        tmp.append((s, e, t.strip()))
    # Pass 2: merge a span into its predecessor when it is shorter than
    # MIN_DUR or starts less than 0.1 s after the previous span ends.
    merged = []
    for seg in tmp:
        if not merged:
            merged.append(seg); continue
        ps, pe, pt = merged[-1]; s, e, t = seg
        if (e - s) < MIN_DUR or (s - pe) < 0.1:
            merged[-1] = (ps, max(pe, e), (pt + " " + t).strip())
        else:
            merged.append(seg)
    # Pass 3: split each merged span into MAX_WORDS-word blocks, give each
    # block an equal time slice, and wrap the text for on-screen display.
    out = []; last_end = 0
    for s, e, t in merged:
        dur = e - s; words = t.split()
        blocks = [" ".join(words[i:i+MAX_WORDS]) for i in range(0, len(words), MAX_WORDS)]
        step = dur / max(1, len(blocks))
        base = s
        for b in blocks:
            st = base; en = min(base + step, e); base = en
            if en <= st: en = min(st + 0.05, total)  # guarantee a positive duration
            txt = textwrap.wrap(b, MAX_CHARS)
            # Keep at most two display lines per cue (extra wrap lines are dropped).
            txt = txt[0] + "\n" + txt[1] if len(txt) > 1 else txt[0]
            if st < last_end:
                # Nudge forward so cues never overlap the previous one.
                st = last_end + 1e-3; en = max(en, st + 0.05)
            out.append((st, en, txt)); last_end = en
    return out
161
-
162
def align_vad(text, audio, sr, total_dur, top_db=28):
    """Distribute the transcript over voiced regions found by energy-based VAD.

    Fallback alignment used when CTC segmentation is unavailable: librosa
    splits the waveform into non-silent intervals, and words are assigned
    to each interval proportionally to its length.  Returns pack()-ed cues.
    """
    # VAD logic (unchanged).
    # Keep only "real" words: Bambara-specific characters (ɛ/ɔ/ŋ) or >= 2 vowels.
    words = [w for w in text.split() if any(c in w.lower() for c in ["ɛ","ɔ","ŋ"]) or sum(1 for c in w.lower() if c in "aeiou") >= 2]
    total = total_dur
    if audio is None or len(audio) == 0 or not words:
        # No usable audio or words: emit a single cue covering the whole video.
        return pack([(0, total, " ".join(words[:MAX_WORDS]))], total)
    # Non-silent intervals as (start_sample, end_sample) pairs.
    iv = librosa.effects.split(audio, top_db=top_db)
    if len(iv) == 0:
        return pack([(0, total, " ".join(words[:MAX_WORDS]))], total)
    spans = []; L = sum(e - s for s, e in iv); idx = 0
    for s, e in iv:
        seg = e - s; segt = seg / sr
        # Allot words proportionally to this interval's share of voiced samples.
        k = max(1, int(round(len(words) * (seg / L)))); chunk = words[idx:idx+k]; idx += k
        if not chunk: continue
        lines = [chunk[i:i+MAX_WORDS] for i in range(0, len(chunk), MAX_WORDS)]
        # Per-line duration clamped to [MIN_DUR, MAX_DUR]; base is the interval start (s).
        step = max(MIN_DUR, min(MAX_DUR, segt / max(1, len(lines)))); base = s / sr
        for j, ln in enumerate(lines):
            st = base + j * step; en = base + (j + 1) * step
            spans.append((st, en, " ".join(ln)))
    return pack(spans, total)
182
-
183
- # ----------------------------
184
- # Écriture SRT + Burn (réencode)
185
- # ----------------------------
186
def burn(video_path, subs, output_path=None):
    """Write *subs* to a temporary SRT file and burn them into the video.

    subs is a list of (start_sec, end_sec, text) cues.  The video is
    re-encoded with libx264/aac; the SRT file is always removed afterwards.
    Returns the output video path.
    """
    if output_path is None:
        output_path = "RobotsMali_Subtitled.mp4"
    tmp_fd, tmp_srt = tempfile.mkstemp(suffix=".srt")
    os.close(tmp_fd)

    def sec_to_srt(t):
        # SRT timestamp format: HH:MM:SS,mmm
        h = int(t // 3600)
        m = int((t % 3600) // 60)
        s = int(t % 60)
        ms = int((t - int(t)) * 1000)
        return f"{h:02}:{m:02}:{s:02},{ms:03}"

    with open(tmp_srt, "w", encoding="utf-8") as srt:
        for idx, (start, end, text) in enumerate(subs, 1):
            srt.write(f"{idx}\n{sec_to_srt(start)} --> {sec_to_srt(end)}\n{text}\n\n")

    vf = f"subtitles={shlex.quote(tmp_srt)}:force_style='Fontsize=22,PrimaryColour=&HFFFFFF&,OutlineColour=&H000000&'"
    cmd = f'ffmpeg -hide_banner -loglevel error -y -i {shlex.quote(video_path)} -vf {shlex.quote(vf)} -c:v libx264 -preset fast -crf 23 -c:a aac -b:a 192k {shlex.quote(output_path)}'
    try:
        run_cmd(cmd)
    finally:
        # Clean up the temporary SRT even when ffmpeg fails.
        if os.path.exists(tmp_srt):
            os.remove(tmp_srt)
    return output_path
205
 
206
  # ----------------------------
207
- # PIPELINE PRINCIPAL (Robuste)
208
  # ----------------------------
209
def pipeline(video_input, model_name):
    """Run the full subtitling flow: extract, clean, transcribe, align, burn.

    video_input may be a Gradio dict (with "tmp_path") or a plain path.
    Returns a (status_message, output_video_path_or_None) tuple; all
    failures are caught and reported as an error status instead of raising.
    """
    try:
        # Gradio may hand us either a dict payload or a bare file path.
        if isinstance(video_input, dict) and "tmp_path" in video_input: video_path = video_input["tmp_path"]
        else: video_path = video_input

        duration = ffprobe_duration(video_path)
        tmp_fd, tmp_wav = tempfile.mkstemp(suffix=".wav"); os.close(tmp_fd)
        extract_audio(video_path, tmp_wav)
        clean_wav, audio, sr = clean_audio(tmp_wav)

        # Fallback: derive the duration from the decoded audio length.
        if duration is None:
            print("[INFO] ffprobe duration failed, calculating from audio...")
            if sr and sr > 0: duration = len(audio) / sr

        if not duration or duration <= 0:
            raise RuntimeError("Impossible de déterminer la durée de la vidéo (fichier corrompu ?)")

        model = load_model(model_name)
        text = transcribe(model, clean_wav)
        mode = MODELS[model_name][1]

        # Alignment: CTC segmentation for RNNT models, VAD fallback otherwise
        # (or whenever CTC segmentation fails for any reason).
        if mode == "rnnt":
            try:
                from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
                # Same word filter as align_vad: Bambara chars or >= 2 vowels.
                words = [w for w in text.split() if any(c in w.lower() for c in ["ɛ","ɔ","ŋ"]) or sum(1 for c in w.lower() if c in "aeiou") >= 2]
                if not words: return ("⚠️ Aucun sous-titre utilisable (texte vide après filtrage)", None)
                x = torch.tensor(audio).float().unsqueeze(0).to(DEVICE); ln = torch.tensor([x.shape[1]]).to(DEVICE)
                with torch.no_grad(): logits = model(input_signal=x, input_signal_length=ln)[0]
                # Seconds of audio represented by one logits frame.
                time_per_frame = duration / max(1, logits.shape[1])
                cfg = CtcSegmentationParameters(); cfg.char_list = list(model.tokenizer.vocab.keys())
                gt = prepare_text(cfg, words)[0]
                timing, _, _ = ctc_segmentation(cfg, logits.detach().cpu().numpy()[0], gt)
                spans = [(timing[i] * time_per_frame, timing[i+1] * time_per_frame, words[i]) for i in range(len(words) - 1)]
                subs = pack(spans, duration)
            except Exception:
                # Any CTC-segmentation failure falls back to the VAD aligner.
                subs = align_vad(text, audio, sr, duration)
        else:
            subs = align_vad(text, audio, sr, duration)

        if not subs: return ("⚠️ Aucun sous-titre utilisable (sub list vide)", None)
        out_video = burn(video_path, subs)
        return ("✅ Terminé avec succès", out_video)

    except Exception as e:
        # Report the error in the UI instead of crashing the Space.
        traceback.print_exc()
        return (f"❌ Erreur — {str(e)}", None)
257
 
258
  # ----------------------------
259
- # INTERFACE GRADIO (Version Finale Stabilité)
260
  # ----------------------------
261
  with gr.Blocks(title="RobotsMali - Sous-titrage") as demo:
262
- gr.Markdown("## 🤖 RobotsMali — Sous-titrage Bambara (Amélioration Audio)")
263
-
264
- # 1. Définir toutes les sorties AVANT leur utilisation.
265
  s = gr.Markdown(label="Statut de la tâche")
266
  o = gr.Video(label="Vidéo sous-titrée")
267
-
268
  with gr.Row():
269
  with gr.Column():
270
- # 2. Définition des inputs
271
  v = gr.Video(label="Vidéo à sous-titrer", sources=["upload", "webcam"])
272
  m = gr.Dropdown(list(MODELS.keys()), value="Soloba V1 (CTC)", label="Modèle ASR")
273
-
274
- # 3. gr.Examples (avec cache_examples=False et nom de fichier corrigé)
275
  gr.Examples(
276
  examples=[
277
- # Utiliser le nom de fichier exact du dépôt
278
- ["examples/Upload MARALINKE-WILI (Lève-toi) Black lives matter (Clip officiel) - MARALINKE (360p, h264).mp4", "Soloba V1 (CTC)"]
279
  ],
280
  inputs=[v, m],
281
- fn=pipeline,
282
  outputs=[s, o],
283
- label="▶️ Utiliser un exemple (Vidéo stockée dans le Space)",
284
  run_on_click=True,
285
- cache_examples=False
286
  )
287
-
288
- b = gr.Button("▶️ Générer les sous-titres", variant="primary")
289
-
290
  with gr.Column():
291
- # 4. Affichage des sorties
292
- gr.Markdown("### Résultats:")
293
- s
294
  o
295
 
296
- # 5. L'action du bouton
297
  b.click(pipeline, [v, m], [s, o])
298
 
299
  if __name__ == "__main__":
300
- demo.launch(share=True)
 
45
  # UTIL: run_cmd, ffprobe_duration
46
  # ----------------------------
47
def run_cmd(cmd):
    """Run *cmd* through the shell and return its combined stdout/stderr.

    Raises RuntimeError (including the captured output) when the command
    exits with a non-zero status.
    """
    proc = subprocess.run(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    if proc.returncode:
        raise RuntimeError(f"Commande échouée [{cmd}]\nOutput:\n{proc.stdout}")
    return proc.stdout
52
 
53
def ffprobe_duration(path):
    """Return the media duration of *path* in seconds via ffprobe, or None.

    None is returned when ffprobe fails (missing binary, unreadable file)
    or when its output cannot be parsed as a float.
    """
    cmd = f'ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 {shlex.quote(path)}'
    out = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if out.returncode != 0:
        return None
    try:
        # ffprobe prints the duration alone on the first output line.
        return float(out.stdout.strip().split("\n")[0])
    except (ValueError, IndexError):
        # Narrowed from a bare `except:` — only parse failures mean "unknown";
        # KeyboardInterrupt/SystemExit must still propagate.
        return None
61
 
62
  # ----------------------------
63
+ # LOAD MODEL
64
  # ----------------------------
65
def load_model(name):
    """Load (and cache) the NeMo ASR model registered under *name* in MODELS.

    Downloads the HF snapshot, locates the .nemo checkpoint, restores the
    model class matching the registered mode ("rnnt", "ctc_char", or BPE
    with a char-CTC fallback), moves it to DEVICE in eval mode, and caches
    it so subsequent calls are free.

    Raises FileNotFoundError when the snapshot contains no .nemo file.
    """
    if name in _cache:
        return _cache[name]
    repo, mode = MODELS[name]
    folder = snapshot_download(repo, local_dir_use_symlinks=False)
    nemo_file = next((os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nemo")), None)
    if not nemo_file:
        raise FileNotFoundError(f"Aucun .nemo trouvé pour {name} dans {folder}")
    if mode == "rnnt":
        model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo_file)
    elif mode == "ctc_char":
        model = nemo_asr.models.EncDecCTCModel.restore_from(nemo_file)
    else:
        # BPE checkpoint expected; fall back to the character-CTC loader when
        # restore fails.  Narrowed from a bare `except:` so Ctrl-C still works.
        try:
            model = nemo_asr.models.EncDecCTCModelBPE.restore_from(nemo_file)
        except Exception:
            model = nemo_asr.models.EncDecCTCModel.restore_from(nemo_file)
    model.to(DEVICE).eval()
    _cache[name] = model
    return model
84
 
85
  # ----------------------------
86
+ # AUDIO EXTRACTION & CLEAN
87
  # ----------------------------
88
def extract_audio(video_path, out_wav):
    """Extract the audio of *video_path* as a 16 kHz mono WAV at *out_wav*."""
    src = shlex.quote(video_path)
    dst = shlex.quote(out_wav)
    run_cmd(f'ffmpeg -hide_banner -loglevel error -y -i {src} -vn -ac 1 -ar 16000 -f wav {dst}')
91
 
92
def clean_audio(wav_path, target_sr=16000):
    """Load *wav_path*, denoise, resample to *target_sr*, peak-normalize.

    Writes the cleaned audio next to the input as ``<stem>_clean.wav``.
    Noise reduction is best-effort and silently skipped on failure.

    Returns (clean_path, audio_array, sample_rate).
    """
    audio, sr = sf.read(wav_path)
    # Downmix stereo to mono by averaging channels.
    if audio.ndim == 2:
        audio = audio.mean(axis=1)
    if sr != target_sr:
        audio = librosa.resample(audio.astype(float), orig_sr=sr, target_sr=target_sr)
        sr = target_sr
    try:
        audio = nr.reduce_noise(y=audio, sr=sr, stationary=True, prop_decrease=0.75)
    except Exception:
        # Narrowed from a bare `except:` — keep best-effort semantics for
        # denoise failures, but let KeyboardInterrupt/SystemExit propagate.
        pass
    # Peak-normalize to 0.95 full scale; skip silent/empty audio to avoid
    # division by (near-)zero.
    max_val = np.max(np.abs(audio)) if audio.size > 0 else 0
    if max_val > 1e-6:
        audio = audio / max_val * 0.95
    clean_path = str(Path(wav_path).with_name(Path(wav_path).stem + "_clean.wav"))
    sf.write(clean_path, audio, sr)
    return clean_path, audio, sr
107
 
108
  # ----------------------------
109
+ # TRANSCRIPTION
110
  # ----------------------------
111
def transcribe(model, wav_path):
    """Transcribe *wav_path* with a NeMo ASR model and return the text."""
    result = model.transcribe([wav_path])
    # A batch call returns a list of hypotheses; keep the first one.
    if isinstance(result, list) and result:
        result = result[0]
    # Newer NeMo versions wrap the transcript in a Hypothesis object with .text.
    text = result.text if hasattr(result, "text") else str(result)
    return text.strip()
116
 
117
+ # (pack, align_vad, burn, pipeline restent identiques)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  # ----------------------------
120
+ # COPIE VIDÉO EXEMPLE → /tmp
121
  # ----------------------------
122
def get_example_video():
    """Copy the demo video from the Space's /examples folder into /tmp.

    The copy is done once; later calls reuse the cached /tmp file.
    Returns the destination path.

    Raises FileNotFoundError with an explicit message when the example
    video is missing from the repository.
    """
    import shutil  # local import kept: only needed on the first call

    repo_dir = "/home/user/app/examples"
    filename = "MARALINKE-WiIi (Lève-toi) Black lives matter (Clip officiel) - MARALINKE (360p, h264).mp4"

    src = os.path.join(repo_dir, filename)
    dst = "/tmp/example_video.mp4"

    if not os.path.exists(dst):
        # Fail with a clear message instead of shutil's generic one, so a
        # renamed/missing example file is easy to diagnose in the Space logs.
        if not os.path.exists(src):
            raise FileNotFoundError(f"Example video not found: {src}")
        shutil.copy(src, dst)

    return dst
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
  # ----------------------------
137
+ # INTERFACE GRADIO
138
  # ----------------------------
139
# Build the Gradio UI.  Output components (s, o) are created before the
# layout so gr.Examples can reference them; they are then re-rendered in
# the right-hand column by naming them as bare expressions.
with gr.Blocks(title="RobotsMali - Sous-titrage") as demo:
    gr.Markdown("## 🤖 RobotsMali — Sous-titrage Bambara")

    # Outputs (defined up-front for gr.Examples / b.click wiring).
    s = gr.Markdown(label="Statut de la tâche")
    o = gr.Video(label="Vidéo sous-titrée")

    with gr.Row():
        with gr.Column():
            # Inputs: the video to subtitle and the ASR model choice.
            v = gr.Video(label="Vidéo à sous-titrer", sources=["upload", "webcam"])
            m = gr.Dropdown(list(MODELS.keys()), value="Soloba V1 (CTC)", label="Modèle ASR")

            # NOTE(review): get_example_video() runs at import time — if the
            # example file is missing the whole app fails to start; confirm
            # the file exists in the Space repo.
            gr.Examples(
                examples=[
                    [get_example_video(), "Soloba V1 (CTC)"]
                ],
                inputs=[v, m],
                fn=pipeline,
                outputs=[s, o],
                label="▶️ Vidéo d’exemple du Space",
                run_on_click=True,
                # Disabled so the (slow) pipeline is not executed at build time.
                cache_examples=False
            )

            b = gr.Button("▶️ Générer les sous-titres")

        with gr.Column():
            gr.Markdown("### Résultats :")
            # Bare references re-render the pre-declared output components here.
            s
            o

    # Wire the button to the full subtitling pipeline.
    b.click(pipeline, [v, m], [s, o])
170
 
171
if __name__ == "__main__":
    # share=True exposes a public Gradio link; debug=True surfaces tracebacks.
    demo.launch(share=True, debug=True)