Update app.py
Browse files

app.py CHANGED
@@ -13,6 +13,7 @@ from moviepy.editor import VideoFileClip, CompositeVideoClip, ImageClip
 from PIL import Image, ImageDraw, ImageFont
 
 from nemo.collections import asr as nemo_asr
+from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
 
 
 # ---------------- CONFIG ---------------- #
@@ -55,45 +56,74 @@ def extract_audio(video_path, wav_path):
     return len(audio)/sr
 
 
-# ---------------- TRANSCRIBE ---------------- #
+# ---------------- TRANSCRIBE (with forced alignment) ---------------- #
 def transcribe(model, device, wav_path, model_key):
     audio, sr = sf.read(wav_path)
     if audio.ndim == 2:
-        audio = np.mean(audio, axis=1)
-
-    audio = audio / np.max(np.abs(audio))
+        audio = np.mean(audio, axis=1)
+    total_s = len(audio) / sr
 
-    total_s = len(audio)/sr
     x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
     ln = torch.tensor([x.shape[1]]).to(device)
 
-    #
+    # --- Case 1 : Soloni → true word timestamps ---
     if "Soloni" in model_key and hasattr(model, "decode_and_align"):
         try:
             with torch.no_grad():
                 proc, plen = model.preprocessor(input_signal=x, input_signal_length=ln)
                 hyps = model.decode_and_align(encoder_output=proc, encoded_lengths=plen)
                 hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
-
-            return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
+            return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
         except:
             pass
 
-    #
-
-
+    # --- Case 2 : Soloba / QuartzNet → forced alignment CTC ---
+    with torch.no_grad():
+        logits, logits_len = model.forward(input_signal=x, input_signal_length=ln)
+
+    text = model.transcribe([wav_path])[0].text.strip()
     words = text.split()
     if not words:
         return []
 
-
-
-
-
-
-
-
-
+    config = CtcSegmentationParameters()
+    config.char_list = list(model.tokenizer.vocab.keys())
+
+    ground_truth_mat, _ = prepare_text(config, words)
+
+    timings, _, _ = ctc_segmentation(
+        config,
+        logits.cpu().numpy()[0],
+        ground_truth_mat
+    )
+
+    time_per_step = total_s / logits_len.cpu().numpy()[0]
+
+    word_times = []
+    for i, w in enumerate(words):
+        start = timings[i] * time_per_step
+        end = timings[i+1] * time_per_step if i+1 < len(timings) else total_s
+        word_times.append((start, end, w))
+
+    # --- Segment mode B (max 4 words per subtitle line) ---
+    grouped = []
+    segment = []
+    for w in word_times:
+        segment.append(w)
+        if len(segment) >= 4:  # max words per line
+            grouped.append(segment)
+            segment = []
+    if segment:
+        grouped.append(segment)
+
+    subtitles = []
+    for seg in grouped:
+        s = seg[0][0]
+        e = seg[-1][1]
+        text = " ".join([w[2] for w in seg])
+        subtitles.append((s, e, text))
+
+    return subtitles
 
 
 # ---------------- BURN SUBTITLES (NO IMAGEMAGICK) ---------------- #
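Note on the Case 2 path: it converts CTC frame indices to seconds by hand via `time_per_step = total_s / logits_len`, then indexes `timings` per word. For comparison, the `ctc-segmentation` package also ships `determine_utterance_segments`, which performs the index-to-time conversion and per-word confidences itself. A minimal sketch of that upstream flow, assuming a recent release where `CtcSegmentationParameters` exposes `index_duration`; here `log_probs` is a `(T, V)` NumPy array of CTC log-probabilities, `vocab` the model's character list, and the helper name `align_words` is illustrative:

```python
import numpy as np
from ctc_segmentation import (
    CtcSegmentationParameters,
    ctc_segmentation,
    determine_utterance_segments,
    prepare_text,
)

def align_words(log_probs: np.ndarray, vocab: list, words: list, total_s: float):
    config = CtcSegmentationParameters()
    config.char_list = vocab
    # Seconds of audio covered by one CTC output frame.
    config.index_duration = total_s / log_probs.shape[0]
    ground_truth_mat, utt_begin_indices = prepare_text(config, words)
    timings, char_probs, _ = ctc_segmentation(config, log_probs, ground_truth_mat)
    # One (start_s, end_s, confidence) triple per entry in `words`.
    segments = determine_utterance_segments(
        config, utt_begin_indices, char_probs, timings, words
    )
    return [(start, end, word) for (start, end, _), word in zip(segments, words)]
```

Indexing `timings` by word position, as the diff does, assumes one timing row per word; `utt_begin_indices` encodes the actual word boundaries, so the upstream helper is the safer route.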
@@ -107,29 +137,24 @@ def burn(video_path, subs):
     font = ImageFont.load_default()
 
     layers = []
-    for s, e, w in subs:
-        if e <= s: continue
-
+    for s, e, text in subs:
         img = Image.new("RGBA", (W, int(H*0.12)), (0,0,0,140))
         draw = ImageDraw.Draw(img)
 
-        text = w.upper()
-
-        # ✅ Pillow 10+ compatible text size
         try:
             bbox = draw.textbbox((0,0), text, font=font)
             tw, th = bbox[2]-bbox[0], bbox[3]-bbox[1]
         except:
             tw, th = draw.textsize(text, font=font)
 
-        x = (W-tw)//2
-        y = (int(H*0.12)-th)//2
-        draw.text((x,y), text, font=font, fill=(255,255,255))
+        x = (W-tw)//2
+        y = (int(H*0.12)-th)//2
+        draw.text((x,y), text, font=font, fill=(255,255,255))
 
         img_clip = ImageClip(np.array(img)).set_start(s).set_duration(e-s).set_position(("center", int(H*0.85)))
         layers.append(img_clip)
 
-    final = CompositeVideoClip([clip] + layers)
+    final = CompositeVideoClip([clip] + layers)
     out = "RobotsMali_Subtitled.mp4"
     final.write_videofile(out, codec="libx264", audio_codec="aac", fps=clip.fps, verbose=False, logger=None)
     clip.close()
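The try/except around `draw.textbbox` exists because `ImageDraw.textsize` was removed in Pillow 10, while `textbbox` has been available since Pillow 8.0. A feature test reads a little clearer than catching the exception; a minimal self-contained sketch, with the hypothetical helper name `measure_text`:

```python
from PIL import Image, ImageDraw, ImageFont

def measure_text(draw, text, font):
    # Pillow >= 8.0: textbbox returns (left, top, right, bottom).
    if hasattr(draw, "textbbox"):
        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        return right - left, bottom - top
    # Legacy Pillow only: textsize was removed in Pillow 10.
    return draw.textsize(text, font=font)

img = Image.new("RGBA", (1280, 86), (0, 0, 0, 140))
draw = ImageDraw.Draw(img)
tw, th = measure_text(draw, "I NI CE", ImageFont.load_default())
```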
@@ -139,7 +164,7 @@ def burn(video_path, subs):
 
 # ---------------- PIPELINE ---------------- #
 def pipeline(video, model_name, progress=gr.Progress()):
-    progress(0.
+    progress(0.3, "📦 Chargement du modèle…")
     model, device = load_model(model_name)
 
     with tempfile.TemporaryDirectory() as td:
@@ -147,10 +172,10 @@ def pipeline(video, model_name, progress=gr.Progress()):
         progress(0.5, "🔊 Extraction audio…")
         extract_audio(video, wav)
 
-        progress(0.75, "🧠
+        progress(0.75, "🧠 Alignement temporel…")
         subs = transcribe(model, device, wav, model_name)
 
-        progress(0.95, "🎞️ Incrustation…")
+        progress(0.95, "🎞️ Incrustation des sous-titres…")
         out = burn(video, subs)
         return f"✅ Sous-titrage généré avec **{model_name}**", out
 
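`gr.Progress` objects are callable with a completion fraction and a description string, which is how each stage reports above. A stripped-down, runnable sketch of the same pattern; `fake_pipeline` and the sleeps are stand-ins for the real `load_model`/`transcribe`/`burn` calls:

```python
import time
import gradio as gr

def fake_pipeline(name, progress=gr.Progress()):
    progress(0.3, "Loading model…")    # fraction first, description second
    time.sleep(0.5)                    # stand-in for load_model()
    progress(0.75, "Aligning…")
    time.sleep(0.5)                    # stand-in for transcribe() + burn()
    return f"Done: {name}"

demo = gr.Interface(fn=fake_pipeline, inputs=gr.Textbox(), outputs=gr.Textbox())

if __name__ == "__main__":
    demo.launch()
```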
@@ -163,7 +188,7 @@ h1 { text-align:center; font-weight:800; color:#005BFF; margin-bottom:6px; }
 """
 
 with gr.Blocks(css=CSS, title="RobotsMali Caption Studio") as demo:
-    gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Sous-titrage automatique en Bambara</p>")
+    gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Sous-titrage automatique en Bambara (Alignement Professionnel)</p>")
     video = gr.File(label="🎥 Importer une vidéo")
     model = gr.Dropdown(list(ASR_MODELS.keys()), value="Soloni 114M TDT CTC V1", label="🧠 Modèle ASR")
     run = gr.Button("🚀 Générer les sous-titres")
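The click handler sits outside the lines shown in this diff. Presumably the components are wired along these lines; `status` and `player` are hypothetical output components, and `CSS`, `ASR_MODELS`, and `pipeline` come from the surrounding app.py:

```python
with gr.Blocks(css=CSS, title="RobotsMali Caption Studio") as demo:
    gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Sous-titrage automatique en Bambara (Alignement Professionnel)</p>")
    video = gr.File(label="🎥 Importer une vidéo")
    model = gr.Dropdown(list(ASR_MODELS.keys()), value="Soloni 114M TDT CTC V1", label="🧠 Modèle ASR")
    run = gr.Button("🚀 Générer les sous-titres")
    status = gr.Markdown()                  # hypothetical: not shown in the diff
    player = gr.Video(label="🎬 Résultat")  # hypothetical: not shown in the diff
    run.click(fn=pipeline, inputs=[video, model], outputs=[status, player])
```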