binaryMao committed on
Commit
5839b85
·
verified ·
1 Parent(s): 6d5ada0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +131 -12
app.py CHANGED
@@ -1,4 +1,10 @@
1
  # -*- coding: utf-8 -*-
 
 
 
 
 
 
2
  import os, shlex, subprocess, tempfile, traceback, time, glob, gc, shutil
3
  import torch
4
  from huggingface_hub import snapshot_download
@@ -24,7 +30,18 @@ def find_example_video():
24
  paths = ["examples/MARALINKE_FIXED.mp4", "examples/MARALINKE.mp4", "MARALINKE.mp4"]
25
  for p in paths:
26
  if os.path.exists(p): return p
27
- return None
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  EXAMPLE_PATH = find_example_video()
30
  _cache = {}
@@ -69,7 +86,75 @@ def format_srt_time(sec):
69
  ms = int((sec - int(sec)) * 1000)
70
  return f"{time.strftime('%H:%M:%S', td)},{ms:03}"
71
 
72
- # 4. PIPELINE DE TRANSCRIPTION
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  def pipeline(video_in, model_name):
74
  tmp_dir = tempfile.mkdtemp()
75
  try:
@@ -81,25 +166,59 @@ def pipeline(video_in, model_name):
81
  full_wav = os.path.join(tmp_dir, "full.wav")
82
  subprocess.run(f"ffmpeg -y -threads 0 -i {shlex.quote(video_in)} -vn -ac 1 -ar 16000 {full_wav}", shell=True, check=True)
83
 
84
- yield "⏳ Phase 2/4 : Segmentation...", None
85
- subprocess.run(f"ffmpeg -i {full_wav} -f segment -segment_time 20 -c copy {os.path.join(tmp_dir, 'seg_%03d.wav')}", shell=True, check=True)
86
- audio_segments = sorted(glob.glob(os.path.join(tmp_dir, "seg_*.wav")))
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  yield f"⏳ Phase 3/4 : Chargement de {model_name}...", None
89
  model = get_model(model_name)
90
 
91
- yield f"🎙️ Transcription de {len(audio_segments)} segments...", None
92
- b_size = 2 if DEVICE == "cpu" else 4
93
- batch_hypotheses = model.transcribe(audio_segments, batch_size=b_size, return_hypotheses=True)
 
 
 
 
 
 
94
 
95
  all_words_ts = []
96
  for idx, hyp in enumerate(batch_hypotheses):
97
- yield f"📝 Traitement : {idx+1}/{len(audio_segments)}...", None
98
- base_time = idx * 20
 
99
  if isinstance(hyp, list): hyp = hyp[0]
100
  text = hyp.text if hasattr(hyp, 'text') else str(hyp)
101
  words = text.split()
102
- gap = 20.0 / max(len(words), 1)
 
 
 
103
  for i, w in enumerate(words):
104
  all_words_ts.append({"word": w, "start": base_time + (i * gap), "end": base_time + ((i+1) * gap)})
105
 
@@ -143,4 +262,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
143
 
144
  run_btn.click(pipeline, [v_input, m_input], [status, v_output])
145
 
146
- demo.queue().launch()
 
1
  # -*- coding: utf-8 -*-
2
+ # POUR GOOGLE COLAB, EXÉCUTEZ CES CELLULES AVANT DE LANCER LE SCRIPT :
3
+ # !apt-get install -y ffmpeg
4
+ # !pip install gradio huggingface_hub torch
5
+ # !pip install git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[all]
6
+ #
7
+
8
  import os, shlex, subprocess, tempfile, traceback, time, glob, gc, shutil
9
  import torch
10
  from huggingface_hub import snapshot_download
 
def find_example_video():
    """Return the path to a local example video, downloading one if needed.

    Checks a few known local paths first; if none exists, downloads the demo
    clip from the Hugging Face Space into ./examples/ (best effort).

    Returns:
        str | None: path to the example video, or None if the download fails.
    """
    paths = ["examples/MARALINKE_FIXED.mp4", "examples/MARALINKE.mp4", "MARALINKE.mp4"]
    for p in paths:
        if os.path.exists(p):
            return p

    # No local file found: download the example video.
    print("⬇️ Téléchargement de la vidéo d'exemple...")
    example_url = "https://huggingface.co/spaces/RobotsMali/Soloni-Demo/resolve/main/examples/MARALINKE.mp4"
    target_path = "examples/MARALINKE.mp4"
    os.makedirs("examples", exist_ok=True)
    try:
        # List-form argv (shell=False) avoids shell quoting/injection issues
        # with the interpolated URL and path.
        subprocess.run(["wget", example_url, "-O", target_path], check=True)
        return target_path
    except Exception as e:
        # Best effort only — the app can still run without the example video.
        print(f"⚠️ Impossible de télécharger l'exemple : {e}")
        return None
45
 
46
  EXAMPLE_PATH = find_example_video()
47
  _cache = {}
 
86
  ms = int((sec - int(sec)) * 1000)
87
  return f"{time.strftime('%H:%M:%S', td)},{ms:03}"
88
 
89
+ # 4. PIPELINE DE TRANSCRIPTION (OPTIMISÉ)
90
+ def detect_silences(path, min_silence_len=0.3, silence_thresh=-35):
91
+ """Detects silence intervals using ffmpeg"""
92
+ cmd = (
93
+ f"ffmpeg -i {shlex.quote(path)} -af "
94
+ f"silencedetect=noise={silence_thresh}dB:d={min_silence_len} "
95
+ f"-f null -"
96
+ )
97
+ result = subprocess.run(cmd, shell=True, stderr=subprocess.PIPE, text=True)
98
+ silences = []
99
+ for line in result.stderr.splitlines():
100
+ if "silence_start" in line:
101
+ start = float(line.split("silence_start: ")[1])
102
+ silences.append({"start": start, "end": None})
103
+ elif "silence_end" in line and silences:
104
+ end = float(line.split("silence_end: ")[1].split(" ")[0])
105
+ silences[-1]["end"] = end
106
+ return [s for s in silences if s["end"] is not None]
107
+
108
def smart_segment_audio(audio_path, target_duration=5.0):
    """Compute cut points (in seconds) for *audio_path*, preferring silences.

    Uses detect_silences() to find quiet intervals, then places each cut at
    the middle of the silence closest to a multiple of *target_duration*.
    When no usable silence is near the ideal position, forces a fixed-length
    cut instead.

    Args:
        audio_path: audio file to segment.
        target_duration: ideal segment length in seconds.

    Returns:
        list[float] | None: sorted cut times [0.0, ..., total_duration], or
        None when no silence at all was detected (caller falls back to plain
        fixed-length segmentation).
    """
    silences = detect_silences(audio_path)

    # No silence detected: signal the caller to use regular slicing.
    if not silences:
        return None

    # Total duration via ffprobe. text=True is essential: without it,
    # check_output returns bytes and float(bytes) raises TypeError, which
    # made smart segmentation always fail and silently fall back.
    duration = float(subprocess.check_output(
        f"ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 {shlex.quote(audio_path)}",
        shell=True, text=True
    ).strip())

    segments_cuts = [0.0]
    current_pos = 0.0
    while current_pos < duration:
        target_pos = current_pos + target_duration
        if target_pos >= duration:
            break

        # Find the silence whose midpoint is closest to the ideal cut position.
        best_cut = None
        min_dist = float('inf')
        for s in silences:
            # Cut in the middle of the silence.
            mid_silence = (s["start"] + s["end"]) / 2
            if mid_silence <= current_pos:
                continue

            dist = abs(mid_silence - target_pos)
            if dist < min_dist:
                min_dist = dist
                best_cut = mid_silence

            # Silences arrive in chronological order; stop searching far past.
            if mid_silence > target_pos + 10:
                break

        if best_cut is not None and abs(best_cut - current_pos) > 1.0:  # avoid too-short segments
            segments_cuts.append(best_cut)
            current_pos = best_cut
        else:
            # No usable silence nearby: force a fixed-length cut (fallback).
            current_pos += target_duration
            segments_cuts.append(current_pos)

    segments_cuts.append(duration)
    return segments_cuts
157
+
158
  def pipeline(video_in, model_name):
159
  tmp_dir = tempfile.mkdtemp()
160
  try:
 
166
  full_wav = os.path.join(tmp_dir, "full.wav")
167
  subprocess.run(f"ffmpeg -y -threads 0 -i {shlex.quote(video_in)} -vn -ac 1 -ar 16000 {full_wav}", shell=True, check=True)
168
 
169
+ yield "⏳ Phase 2/4 : Segmentation Intelligente...", None
 
 
170
 
171
+ # Tentative de segmentation intelligente
172
+ try:
173
+ cut_points = smart_segment_audio(full_wav, target_duration=5.0)
174
+ except Exception as e:
175
+ print(f"Warning smart segment: {e}")
176
+ cut_points = None
177
+
178
+ segment_files = []
179
+ if cut_points:
180
+ # Découpage selon les points calculés
181
+ for i in range(len(cut_points)-1):
182
+ start = cut_points[i]
183
+ duration = cut_points[i+1] - start
184
+ out_name = os.path.join(tmp_dir, f"seg_{i:03d}.wav")
185
+ subprocess.run(
186
+ f"ffmpeg -y -ss {start:.3f} -t {duration:.3f} -i {full_wav} -c copy {out_name}",
187
+ shell=True, check=True
188
+ )
189
+ segment_files.append({"file": out_name, "start_offset": start})
190
+ else:
191
+ # Fallback méthode brute (moins précis mais robuste)
192
+ subprocess.run(f"ffmpeg -i {full_wav} -f segment -segment_time 5 -c copy {os.path.join(tmp_dir, 'seg_%03d.wav')}", shell=True, check=True)
193
+ files = sorted(glob.glob(os.path.join(tmp_dir, "seg_*.wav")))
194
+ for i, f in enumerate(files):
195
+ segment_files.append({"file": f, "start_offset": i * 5.0})
196
+
197
  yield f"⏳ Phase 3/4 : Chargement de {model_name}...", None
198
  model = get_model(model_name)
199
 
200
+ yield f"🎙️ Transcription de {len(segment_files)} segments...", None
201
+ # Optimisation batch size pour Colab (souvent T4/V100)
202
+ b_size = 16 if DEVICE == "cuda" else 2
203
+
204
+ audio_paths = [s["file"] for s in segment_files]
205
+
206
+ # Utilisation de torch.inference_mode pour gain perf
207
+ with torch.inference_mode():
208
+ batch_hypotheses = model.transcribe(audio_paths, batch_size=b_size, return_hypotheses=True)
209
 
210
  all_words_ts = []
211
  for idx, hyp in enumerate(batch_hypotheses):
212
+ yield f"📝 Traitement : {idx+1}/{len(segment_files)}...", None
213
+ base_time = segment_files[idx]["start_offset"]
214
+
215
  if isinstance(hyp, list): hyp = hyp[0]
216
  text = hyp.text if hasattr(hyp, 'text') else str(hyp)
217
  words = text.split()
218
+ # Ajustement temporel plus précis
219
+ segment_duration = segment_files[idx+1]["start_offset"] - base_time if idx < len(segment_files)-1 else 5.0
220
+
221
+ gap = segment_duration / max(len(words), 1)
222
  for i, w in enumerate(words):
223
  all_words_ts.append({"word": w, "start": base_time + (i * gap), "end": base_time + ((i+1) * gap)})
224
 
 
262
 
263
  run_btn.click(pipeline, [v_input, m_input], [status, v_output])
264
 
265
+ demo.queue().launch()