binaryMao committed on
Commit
5839b85
·
verified ·
1 Parent(s): 6d5ada0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +131 -12
app.py CHANGED
@@ -1,4 +1,10 @@
1
  # -*- coding: utf-8 -*-
 
 
 
 
 
 
2
  import os, shlex, subprocess, tempfile, traceback, time, glob, gc, shutil
3
  import torch
4
  from huggingface_hub import snapshot_download
@@ -24,7 +30,18 @@ def find_example_video():
24
  paths = ["examples/MARALINKE_FIXED.mp4", "examples/MARALINKE.mp4", "MARALINKE.mp4"]
25
  for p in paths:
26
  if os.path.exists(p): return p
27
- return None
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  EXAMPLE_PATH = find_example_video()
30
  _cache = {}
@@ -69,7 +86,75 @@ def format_srt_time(sec):
69
  ms = int((sec - int(sec)) * 1000)
70
  return f"{time.strftime('%H:%M:%S', td)},{ms:03}"
71
 
72
- # 4. PIPELINE DE TRANSCRIPTION
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  def pipeline(video_in, model_name):
74
  tmp_dir = tempfile.mkdtemp()
75
  try:
@@ -81,25 +166,59 @@ def pipeline(video_in, model_name):
81
  full_wav = os.path.join(tmp_dir, "full.wav")
82
  subprocess.run(f"ffmpeg -y -threads 0 -i {shlex.quote(video_in)} -vn -ac 1 -ar 16000 {full_wav}", shell=True, check=True)
83
 
84
- yield "⏳ Phase 2/4 : Segmentation...", None
85
- subprocess.run(f"ffmpeg -i {full_wav} -f segment -segment_time 20 -c copy {os.path.join(tmp_dir, 'seg_%03d.wav')}", shell=True, check=True)
86
- audio_segments = sorted(glob.glob(os.path.join(tmp_dir, "seg_*.wav")))
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  yield f"⏳ Phase 3/4 : Chargement de {model_name}...", None
89
  model = get_model(model_name)
90
 
91
- yield f"🎙️ Transcription de {len(audio_segments)} segments...", None
92
- b_size = 2 if DEVICE == "cpu" else 4
93
- batch_hypotheses = model.transcribe(audio_segments, batch_size=b_size, return_hypotheses=True)
 
 
 
 
 
 
94
 
95
  all_words_ts = []
96
  for idx, hyp in enumerate(batch_hypotheses):
97
- yield f"📝 Traitement : {idx+1}/{len(audio_segments)}...", None
98
- base_time = idx * 20
 
99
  if isinstance(hyp, list): hyp = hyp[0]
100
  text = hyp.text if hasattr(hyp, 'text') else str(hyp)
101
  words = text.split()
102
- gap = 20.0 / max(len(words), 1)
 
 
 
103
  for i, w in enumerate(words):
104
  all_words_ts.append({"word": w, "start": base_time + (i * gap), "end": base_time + ((i+1) * gap)})
105
 
@@ -143,4 +262,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
143
 
144
  run_btn.click(pipeline, [v_input, m_input], [status, v_output])
145
 
146
- demo.queue().launch()
 
1
  # -*- coding: utf-8 -*-
2
+ # POUR GOOGLE COLAB, EXÉCUTEZ CES CELLULES AVANT DE LANCER LE SCRIPT :
3
+ # !apt-get install -y ffmpeg
4
+ # !pip install gradio huggingface_hub torch
5
+ # !pip install git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[all]
6
+ #
7
+
8
  import os, shlex, subprocess, tempfile, traceback, time, glob, gc, shutil
9
  import torch
10
  from huggingface_hub import snapshot_download
 
def find_example_video():
    """Return the path to a local example video, downloading one if needed.

    Checks a few known local paths first; if none exists, downloads the demo
    clip from the Hugging Face Space into ./examples/ (best effort).

    Returns:
        str | None: path to the example video, or None if the download fails.
    """
    paths = ["examples/MARALINKE_FIXED.mp4", "examples/MARALINKE.mp4", "MARALINKE.mp4"]
    for p in paths:
        if os.path.exists(p):
            return p

    # No local file found: download the example video.
    print("⬇️ Téléchargement de la vidéo d'exemple...")
    example_url = "https://huggingface.co/spaces/RobotsMali/Soloni-Demo/resolve/main/examples/MARALINKE.mp4"
    target_path = "examples/MARALINKE.mp4"
    os.makedirs("examples", exist_ok=True)
    try:
        # List-form argv (shell=False) avoids shell quoting/injection issues
        # with the interpolated URL and path.
        subprocess.run(["wget", example_url, "-O", target_path], check=True)
        return target_path
    except Exception as e:
        # Best effort only — the app can still run without the example video.
        print(f"⚠️ Impossible de télécharger l'exemple : {e}")
        return None
45
 
46
  EXAMPLE_PATH = find_example_video()
47
  _cache = {}
 
86
  ms = int((sec - int(sec)) * 1000)
87
  return f"{time.strftime('%H:%M:%S', td)},{ms:03}"
88
 
89
+ # 4. PIPELINE DE TRANSCRIPTION (OPTIMISÉ)
90
+ def detect_silences(path, min_silence_len=0.3, silence_thresh=-35):
91
+ """Detects silence intervals using ffmpeg"""
92
+ cmd = (
93
+ f"ffmpeg -i {shlex.quote(path)} -af "
94
+ f"silencedetect=noise={silence_thresh}dB:d={min_silence_len} "
95
+ f"-f null -"
96
+ )
97
+ result = subprocess.run(cmd, shell=True, stderr=subprocess.PIPE, text=True)
98
+ silences = []
99
+ for line in result.stderr.splitlines():
100
+ if "silence_start" in line:
101
+ start = float(line.split("silence_start: ")[1])
102
+ silences.append({"start": start, "end": None})
103
+ elif "silence_end" in line and silences:
104
+ end = float(line.split("silence_end: ")[1].split(" ")[0])
105
+ silences[-1]["end"] = end
106
+ return [s for s in silences if s["end"] is not None]
107
+
108
def smart_segment_audio(audio_path, target_duration=5.0):
    """Compute cut points (in seconds) for *audio_path*, preferring silences.

    Uses detect_silences() to find quiet intervals, then places each cut at
    the middle of the silence closest to a multiple of *target_duration*.
    When no usable silence is near the ideal position, forces a fixed-length
    cut instead.

    Args:
        audio_path: audio file to segment.
        target_duration: ideal segment length in seconds.

    Returns:
        list[float] | None: sorted cut times [0.0, ..., total_duration], or
        None when no silence at all was detected (caller falls back to plain
        fixed-length segmentation).
    """
    silences = detect_silences(audio_path)

    # No silence detected: signal the caller to use regular slicing.
    if not silences:
        return None

    # Total duration via ffprobe. text=True is essential: without it,
    # check_output returns bytes and float(bytes) raises TypeError, which
    # made smart segmentation always fail and silently fall back.
    duration = float(subprocess.check_output(
        f"ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 {shlex.quote(audio_path)}",
        shell=True, text=True
    ).strip())

    segments_cuts = [0.0]
    current_pos = 0.0
    while current_pos < duration:
        target_pos = current_pos + target_duration
        if target_pos >= duration:
            break

        # Find the silence whose midpoint is closest to the ideal cut position.
        best_cut = None
        min_dist = float('inf')
        for s in silences:
            # Cut in the middle of the silence.
            mid_silence = (s["start"] + s["end"]) / 2
            if mid_silence <= current_pos:
                continue

            dist = abs(mid_silence - target_pos)
            if dist < min_dist:
                min_dist = dist
                best_cut = mid_silence

            # Silences arrive in chronological order; stop searching far past.
            if mid_silence > target_pos + 10:
                break

        if best_cut is not None and abs(best_cut - current_pos) > 1.0:  # avoid too-short segments
            segments_cuts.append(best_cut)
            current_pos = best_cut
        else:
            # No usable silence nearby: force a fixed-length cut (fallback).
            current_pos += target_duration
            segments_cuts.append(current_pos)

    segments_cuts.append(duration)
    return segments_cuts
157
+
158
  def pipeline(video_in, model_name):
159
  tmp_dir = tempfile.mkdtemp()
160
  try:
 
166
  full_wav = os.path.join(tmp_dir, "full.wav")
167
  subprocess.run(f"ffmpeg -y -threads 0 -i {shlex.quote(video_in)} -vn -ac 1 -ar 16000 {full_wav}", shell=True, check=True)
168
 
169
+ yield "⏳ Phase 2/4 : Segmentation Intelligente...", None
 
 
170
 
171
+ # Tentative de segmentation intelligente
172
+ try:
173
+ cut_points = smart_segment_audio(full_wav, target_duration=5.0)
174
+ except Exception as e:
175
+ print(f"Warning smart segment: {e}")
176
+ cut_points = None
177
+
178
+ segment_files = []
179
+ if cut_points:
180
+ # Découpage selon les points calculés
181
+ for i in range(len(cut_points)-1):
182
+ start = cut_points[i]
183
+ duration = cut_points[i+1] - start
184
+ out_name = os.path.join(tmp_dir, f"seg_{i:03d}.wav")
185
+ subprocess.run(
186
+ f"ffmpeg -y -ss {start:.3f} -t {duration:.3f} -i {full_wav} -c copy {out_name}",
187
+ shell=True, check=True
188
+ )
189
+ segment_files.append({"file": out_name, "start_offset": start})
190
+ else:
191
+ # Fallback méthode brute (moins précis mais robuste)
192
+ subprocess.run(f"ffmpeg -i {full_wav} -f segment -segment_time 5 -c copy {os.path.join(tmp_dir, 'seg_%03d.wav')}", shell=True, check=True)
193
+ files = sorted(glob.glob(os.path.join(tmp_dir, "seg_*.wav")))
194
+ for i, f in enumerate(files):
195
+ segment_files.append({"file": f, "start_offset": i * 5.0})
196
+
197
  yield f"⏳ Phase 3/4 : Chargement de {model_name}...", None
198
  model = get_model(model_name)
199
 
200
+ yield f"🎙️ Transcription de {len(segment_files)} segments...", None
201
+ # Optimisation batch size pour Colab (souvent T4/V100)
202
+ b_size = 16 if DEVICE == "cuda" else 2
203
+
204
+ audio_paths = [s["file"] for s in segment_files]
205
+
206
+ # Utilisation de torch.inference_mode pour gain perf
207
+ with torch.inference_mode():
208
+ batch_hypotheses = model.transcribe(audio_paths, batch_size=b_size, return_hypotheses=True)
209
 
210
  all_words_ts = []
211
  for idx, hyp in enumerate(batch_hypotheses):
212
+ yield f"📝 Traitement : {idx+1}/{len(segment_files)}...", None
213
+ base_time = segment_files[idx]["start_offset"]
214
+
215
  if isinstance(hyp, list): hyp = hyp[0]
216
  text = hyp.text if hasattr(hyp, 'text') else str(hyp)
217
  words = text.split()
218
+ # Ajustement temporel plus précis
219
+ segment_duration = segment_files[idx+1]["start_offset"] - base_time if idx < len(segment_files)-1 else 5.0
220
+
221
+ gap = segment_duration / max(len(words), 1)
222
  for i, w in enumerate(words):
223
  all_words_ts.append({"word": w, "start": base_time + (i * gap), "end": base_time + ((i+1) * gap)})
224
 
 
262
 
263
  run_btn.click(pipeline, [v_input, m_input], [status, v_output])
264
 
265
+ demo.queue().launch()