binaryMao committed on
Commit
f6e735c
·
verified ·
1 Parent(s): e18b5e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -77
app.py CHANGED
@@ -1,5 +1,5 @@
1
  # -*- coding: utf-8 -*-
2
- """ROBOTSMALI VIDEO CAPTIONING V8 - MINIMALIST BLUE (STABLE VERSION)"""
3
 
4
  import gradio as gr
5
  import numpy as np
@@ -7,57 +7,51 @@ import torch
7
  import soundfile as sf
8
  import os
9
  import tempfile
10
- import warnings
11
  from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
 
12
  from typing import List, Tuple
13
- from huggingface_hub import hf_hub_download, snapshot_download
14
 
15
- # ------------------------------------------------------------
16
- # Import NeMo
17
- # ------------------------------------------------------------
# Optional speech dependencies. Importing them is allowed to fail so the
# module can still be loaded; NEMO_LOADED records the outcome.
try:
    from nemo.collections import asr as nemo_asr
    from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
    NEMO_LOADED = True
except Exception as exc:
    # Deliberately broad: any failure while importing these packages is
    # treated as "not available" rather than crashing the module.
    print("❌ ERREUR : NeMo ou ctc-segmentation non installé.")
    # Show the underlying cause so a broken install can be told apart from
    # a missing one.
    print(f"   ↳ {exc!r}")
    NEMO_LOADED = False
25
 
26
  # ------------------------------------------------------------
27
- # Modèles RobotsMali
28
  # ------------------------------------------------------------
# Menu label -> (Hugging Face repo id, .nemo file name or None, decoder mode).
# load_asr_model unpacks each entry in exactly this order.
MODELS = {
    "Soloni V1 (RNnT - Précis)": (
        "RobotsMali/soloni-114m-tdt-ctc-V1",
        "soloni-114m-tdt-ctc-V1.nemo",
        "rnnt",
    ),
    "Soloba V1 (CTC - Équilibré)": (
        "RobotsMali/soloba-ctc-0.6b-V1",
        None,
        "ctc",
    ),
    "QuartzNet V1 (CTC - Rapide)": (
        "RobotsMali/stt-bm-quartznet15x5-V1",
        None,
        "ctc",
    ),
}

# Already-loaded models, keyed by the same menu label.
asr_pipeline = dict()
36
 
37
  # ------------------------------------------------------------
38
- # Chargement modèle robuste
39
  # ------------------------------------------------------------
def load_ctc_model_safe(repo_id):
    """Load a CTC-BPE NeMo model, with a manual-download fallback.

    ``EncDecCTCModelBPE.from_pretrained`` is tried first. If it raises, the
    repository snapshot is downloaded into a temporary directory and the
    first ``.nemo`` file (sorted by name) found at the top level of the
    snapshot is restored.

    Args:
        repo_id: Hugging Face repository id of the model.

    Returns:
        The model instance returned by NeMo.

    Raises:
        RuntimeError: if the snapshot contains no top-level ``.nemo`` file.
            Errors raised by the download or by ``restore_from`` propagate
            unchanged.
    """
    ctc_cls = nemo_asr.models.EncDecCTCModelBPE
    try:
        return ctc_cls.from_pretrained(model_name=repo_id)
    except Exception as first_error:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # interpreter shutdown are not mistaken for a loading failure.
        with tempfile.TemporaryDirectory() as tmpdir:
            path = snapshot_download(repo_id, cache_dir=tmpdir)
            # Sorted so the chosen checkpoint does not depend on directory
            # listing order.
            for name in sorted(os.listdir(path)):
                if name.endswith(".nemo"):
                    # The restore completes before the ``with`` block exits
                    # and removes the temporary directory.
                    return ctc_cls.restore_from(os.path.join(path, name))
        # Keep the first failure attached for easier debugging.
        raise RuntimeError("Impossible de charger le modèle CTC.") from first_error
50
-
def load_asr_model(model_name):
    """Return the NeMo model for a menu label, loading it on first use.

    Loaded models are kept in ``asr_pipeline`` so each one is restored once.

    Args:
        model_name: a key of ``MODELS``.

    Returns:
        The model, moved to CUDA when available (CPU otherwise) and set to
        eval mode.

    Raises:
        KeyError: if ``model_name`` is not a key of ``MODELS``.
        RuntimeError: if the NeMo imports failed at module load.
    """
    repo_id, nemo_file, mode = MODELS[model_name]

    # Cache hit: nothing else to compute.
    if model_name in asr_pipeline:
        return asr_pipeline[model_name]

    # Fail with an explicit message instead of a NameError on ``nemo_asr``.
    if not NEMO_LOADED:
        raise RuntimeError("NeMo ou ctc-segmentation non installé.")

    if mode == "rnnt":
        nemo_path = hf_hub_download(repo_id, filename=nemo_file)
        model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(nemo_path)
    else:
        model = load_ctc_model_safe(repo_id)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device).eval()
    asr_pipeline[model_name] = model

    return asr_pipeline[model_name]
66
 
67
  # ------------------------------------------------------------
68
- # Groupage des mots en sous-titres
69
  # ------------------------------------------------------------
70
  MAX_WORDS = 4
71
  MAX_CHARS = 45
@@ -73,59 +67,49 @@ MAX_DURATION = 3.5
73
 
def group_words(words, *, max_words=None, max_chars=None, max_duration=None):
    """Pack timed words into short subtitle entries.

    Words are consumed in order. A new subtitle is started whenever adding
    the next word would push the current one past the word-count, character
    or duration limit. A single word that exceeds a limit on its own still
    becomes a one-word subtitle.

    Args:
        words: iterable of ``(start, end, text)`` tuples in playback order.
        max_words: maximum words per subtitle; ``None`` uses ``MAX_WORDS``.
        max_chars: maximum characters of the space-joined text; ``None``
            uses ``MAX_CHARS``.
        max_duration: maximum span between the first word's start and the
            last word's end; ``None`` uses ``MAX_DURATION``.

    Returns:
        A list of ``(start, end, text)`` tuples: start of the first word,
        end of the last word, and the words joined with single spaces.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth when no override is given.
    if max_words is None:
        max_words = MAX_WORDS
    if max_chars is None:
        max_chars = MAX_CHARS
    if max_duration is None:
        max_duration = MAX_DURATION

    subtitles = []
    current = []

    def flush(chunk):
        # Empty chunks happen on the very first word; they produce nothing.
        if chunk:
            line = " ".join(item[2] for item in chunk)
            subtitles.append((chunk[0][0], chunk[-1][1], line))

    for word in words:
        candidate = current + [word]
        candidate_text = " ".join(item[2] for item in candidate)
        candidate_span = candidate[-1][1] - candidate[0][0]

        too_big = (
            len(candidate) > max_words
            or len(candidate_text) > max_chars
            or candidate_span > max_duration
        )
        if too_big:
            flush(current)
            current = [word]
        else:
            current = candidate

    flush(current)
    return subtitles
94
 
95
  # ------------------------------------------------------------
96
- # Transcription + Alignement
97
  # ------------------------------------------------------------
def transcribe(model, device, wavfile, model_name):
    """Transcribe a WAV file and return grouped, timed subtitles.

    Args:
        model: loaded NeMo ASR model.
        device: torch device the input tensors are moved to.
        wavfile: path of the audio file to read.
        model_name: menu label; labels containing ``"Soloni"`` use the
            model's own word timestamps, all others use CTC segmentation.

    Returns:
        The list produced by ``group_words``; an empty list when the CTC
        transcription is blank.
    """
    audio, sr = sf.read(wavfile)
    if audio.ndim == 2:
        # Collapse a 2-D signal to 1-D by averaging over axis 1
        # (presumably the channel axis of soundfile's output; verify).
        audio = np.mean(audio, axis=1)
    x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
    ln = torch.tensor([x.shape[1]]).to(device)
    total_s = len(audio) / sr

    # Models that expose word timestamps directly.
    if "Soloni" in model_name:
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            features = model.preprocessor(input_signal=x, input_signal_length=ln)
            hyps = model.decode_and_align(*features)
        # Offsets are in milliseconds; subtitles use seconds.
        words = [
            (w.start_offset_ms / 1000, w.end_offset_ms / 1000, w.word)
            for w in hyps[0][0].words
        ]
        return group_words(words)

    # CTC models: transcribe, then align the words against the logits.
    text = model.transcribe([wavfile])[0]
    if not text.strip():
        return []

    with torch.no_grad():
        # NeMo's typed forward accepts keyword arguments only and, for CTC
        # models, returns a third element besides the log-probs and their
        # lengths. NOTE(review): confirm against the installed NeMo version.
        outputs = model(input_signal=x, input_signal_length=ln)
    logits, loglen = outputs[0], outputs[1]

    words = text.strip().split()
    cfg = CtcSegmentationParameters()
    cfg.char_list = list(model.tokenizer.vocab.keys())
    gt, _ = prepare_text(cfg, words)
    timings, _, _ = ctc_segmentation(cfg, logits.cpu().numpy()[0], gt)
    # Seconds of audio represented by one output frame.
    tps = total_s / loglen.cpu().numpy()[0]

    # NOTE(review): timings[i] is used as the start of word i; verify this
    # matches what ctc_segmentation returns for this ground-truth layout.
    aligned = [
        (
            timings[i] * tps,
            timings[i + 1] * tps if i + 1 < len(timings) else total_s,
            word,
        )
        for i, word in enumerate(words)
    ]
    return group_words(aligned)
126
 
127
  # ------------------------------------------------------------
128
- # Extraction audio
129
  # ------------------------------------------------------------
130
  def extract_audio(video, wav):
131
  v = VideoFileClip(video)
@@ -133,19 +117,25 @@ def extract_audio(video, wav):
133
  v.close()
134
 
135
  # ------------------------------------------------------------
136
- # Burn subtitles
137
  # ------------------------------------------------------------
138
  def burn(video, subs):
139
  output = "RobotsMali_Subtitled.mp4"
140
  clip = VideoFileClip(video)
141
  W, H = clip.size
142
- layers = []
143
 
144
- for start, end, text in subs:
 
145
  txt = TextClip(
146
- text.upper(), fontsize=H//20, color='white', bg_color='rgba(0,0,0,0.7)',
147
- method='caption', size=(W*0.9, None)
148
- ).set_pos(("center", H*0.85)).set_duration(end-start).set_start(start)
 
 
 
 
 
 
149
  layers.append(txt)
150
 
151
  final = CompositeVideoClip([clip] + layers)
@@ -154,17 +144,15 @@ def burn(video, subs):
154
  return output
155
 
156
  # ------------------------------------------------------------
157
- # PIPELINE STABLE (PAS DE YIELD)
158
  # ------------------------------------------------------------
159
  def pipeline(video_file, model_name):
160
- if video_file is None:
161
- return "⚠️ Importez une vidéo.", None
162
-
163
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
164
- status = f"🧠 Chargement du modèle {model_name}..."
165
 
166
  try:
167
  model = load_asr_model(model_name)
 
168
  status += "\n🎶 Extraction audio..."
169
  wav = os.path.join(tempfile.gettempdir(), "audio.wav")
170
  extract_audio(video_file, wav)
@@ -173,12 +161,11 @@ def pipeline(video_file, model_name):
173
  subs = transcribe(model, device, wav, model_name)
174
  if not subs: return "⚠️ Aucun mot détecté.", None
175
 
176
- status += "\n🎬 Sous-titrage..."
177
  out = burn(video_file, subs)
178
 
179
  if os.path.exists(wav): os.remove(wav)
180
  status += "\n✅ Terminé !"
181
-
182
  return status, out
183
 
184
  except Exception as e:
@@ -187,14 +174,13 @@ def pipeline(video_file, model_name):
187
  # ------------------------------------------------------------
188
  # Interface
189
  # ------------------------------------------------------------
# Gradio front-end: an input video, a model picker, a run button, and the
# status / result widgets that ``pipeline`` fills in.
with gr.Blocks() as demo:
    gr.Markdown(value="# ⚡ ROBOTSMALI V8 — MINIMALIST BLUE")
    video = gr.Video(label="Importer une vidéo")
    model = gr.Dropdown(choices=list(MODELS), value="Soloni V1 (RNnT - Précis)")
    run = gr.Button(value="▶️ PRODUIRE")
    status = gr.Markdown()
    out = gr.Video()

    # The two outputs receive the two values returned by ``pipeline``.
    run.click(fn=pipeline, inputs=[video, model], outputs=[status, out])

# Start the web server when the module runs.
demo.launch(share=True)
 
1
  # -*- coding: utf-8 -*-
2
+ """ROBOTSMALI VIDEO CAPTIONING V8 MINIMALIST BLUE + NETFLIX SUBTITLES"""
3
 
4
  import gradio as gr
5
  import numpy as np
 
7
  import soundfile as sf
8
  import os
9
  import tempfile
 
10
  from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
11
+ from huggingface_hub import snapshot_download
12
  from typing import List, Tuple
 
13
 
 
 
 
# Optional speech dependencies. Importing them is allowed to fail so the
# module can still be loaded; NEMO_LOADED records the outcome.
try:
    from nemo.collections import asr as nemo_asr
    from ctc_segmentation import ctc_segmentation, CtcSegmentationParameters, prepare_text
    NEMO_LOADED = True
except Exception as exc:
    # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt and
    # SystemExit are not swallowed; the cause is printed instead of being
    # silently discarded.
    print(f"❌ NeMo ou ctc-segmentation indisponible : {exc!r}")
    NEMO_LOADED = False
20
 
21
  # ------------------------------------------------------------
22
+ # MODELS (corrigés)
23
  # ------------------------------------------------------------
# Menu label -> (Hugging Face repo id, decoder mode).
# load_asr_model unpacks each entry in exactly this order.
MODELS = {
    "Soloni V1 (RNnT - Précis)": (
        "RobotsMali/soloni-114m-tdt-ctc-v1",
        "rnnt",
    ),
    "Soloba V1 (CTC - Équilibré)": (
        "RobotsMali/soloba-ctc-0.6b-v1",
        "ctc",
    ),
    "QuartzNet V1 (CTC - Rapide)": (
        "RobotsMali/stt-bm-quartznet15x5-v1",
        "ctc",
    ),
}

# Already-loaded models, keyed by the same menu label.
asr_pipeline = dict()
31
 
32
  # ------------------------------------------------------------
33
+ # Chargement automatique du modèle (.nemo auto-detect)
34
  # ------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
def load_asr_model(model_name):
    """Return the NeMo model for a menu label, loading it on first use.

    The repository snapshot is downloaded, the first top-level ``.nemo``
    file (sorted by name) is located, and the checkpoint is restored. The
    result is kept in ``asr_pipeline`` so later calls reuse it.

    Args:
        model_name: a key of ``MODELS``.

    Returns:
        The model, moved to CUDA when available (CPU otherwise) and set to
        eval mode.

    Raises:
        KeyError: if ``model_name`` is not a key of ``MODELS``.
        RuntimeError: if the NeMo imports failed at module load.
        FileNotFoundError: if the snapshot has no top-level ``.nemo`` file.
    """
    repo_id, mode = MODELS[model_name]

    # Cache hit: nothing else to compute.
    if model_name in asr_pipeline:
        return asr_pipeline[model_name]

    # Fail with an explicit message instead of a NameError on ``nemo_asr``.
    if not NEMO_LOADED:
        raise RuntimeError("NeMo ou ctc-segmentation non installé.")

    # ``local_dir_use_symlinks`` is not passed: no ``local_dir`` is given,
    # and the option is deprecated in recent huggingface_hub releases.
    repo_path = snapshot_download(repo_id)

    # Sorted so the chosen checkpoint does not depend on listing order.
    nemo_files = sorted(f for f in os.listdir(repo_path) if f.endswith(".nemo"))
    if not nemo_files:
        raise FileNotFoundError(f"Aucun .nemo trouvé dans {repo_id}")
    nemo_path = os.path.join(repo_path, nemo_files[0])

    # Try the class announced by ``mode`` first, then the other one, so any
    # checkpoint that either class can restore still loads.
    hybrid_cls = nemo_asr.models.EncDecHybridRNNTCTCBPEModel
    ctc_cls = nemo_asr.models.EncDecCTCModelBPE
    first, second = (hybrid_cls, ctc_cls) if mode == "rnnt" else (ctc_cls, hybrid_cls)
    try:
        model = first.restore_from(nemo_path)
    except Exception:
        # Not a bare ``except``: Ctrl-C must not trigger the fallback. If
        # this second attempt fails too, its error propagates with the
        # first failure attached as context.
        model = second.restore_from(nemo_path)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device).eval()
    asr_pipeline[model_name] = model

    return asr_pipeline[model_name]
60
 
61
  # ------------------------------------------------------------
62
+ # Paramètres de découpage
63
  # ------------------------------------------------------------
64
  MAX_WORDS = 4
65
  MAX_CHARS = 45
 
67
 
def group_words(words, *, max_words=None, max_chars=None, max_duration=None):
    """Pack timed words into short subtitle entries.

    Words are consumed in order. A new subtitle is started whenever adding
    the next word would push the current one past the word-count, character
    or duration limit. A single word that exceeds a limit on its own still
    becomes a one-word subtitle.

    Args:
        words: iterable of ``(start, end, text)`` tuples in playback order.
        max_words: maximum words per subtitle; ``None`` uses ``MAX_WORDS``.
        max_chars: maximum characters of the space-joined text; ``None``
            uses ``MAX_CHARS``.
        max_duration: maximum span between the first word's start and the
            last word's end; ``None`` uses ``MAX_DURATION``.

    Returns:
        A list of ``(start, end, text)`` tuples: start of the first word,
        end of the last word, and the words joined with single spaces.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth when no override is given.
    if max_words is None:
        max_words = MAX_WORDS
    if max_chars is None:
        max_chars = MAX_CHARS
    if max_duration is None:
        max_duration = MAX_DURATION

    subtitles = []
    pending = []

    def emit(chunk):
        # Empty chunks happen on the very first word; they produce nothing.
        if not chunk:
            return
        joined = " ".join(entry[2] for entry in chunk)
        subtitles.append((chunk[0][0], chunk[-1][1], joined))

    for word in words:
        trial = pending + [word]
        trial_text = " ".join(entry[2] for entry in trial)
        trial_span = trial[-1][1] - trial[0][0]

        fits = (
            len(trial) <= max_words
            and len(trial_text) <= max_chars
            and trial_span <= max_duration
        )
        if fits:
            pending = trial
        else:
            emit(pending)
            pending = [word]

    emit(pending)
    return subtitles
82
 
83
  # ------------------------------------------------------------
84
+ # Transcription + alignement
85
  # ------------------------------------------------------------
def transcribe(model, device, wav, model_name):
    """Transcribe a WAV file and return grouped, timed subtitles.

    Args:
        model: loaded NeMo ASR model.
        device: torch device the input tensors are moved to.
        wav: path of the audio file to read.
        model_name: menu label; labels containing ``"Soloni"`` use the
            model's own word timestamps, all others use CTC segmentation.

    Returns:
        The list produced by ``group_words``; an empty list when the CTC
        transcription is blank.
    """
    audio, sr = sf.read(wav)
    if audio.ndim == 2:
        # Collapse a 2-D signal to 1-D by averaging over axis 1
        # (presumably the channel axis of soundfile's output; verify).
        audio = audio.mean(1)
    x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
    ln = torch.tensor([x.shape[1]]).to(device)
    total_s = len(audio) / sr

    # Models that expose word timestamps directly.
    if "Soloni" in model_name:
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            features = model.preprocessor(input_signal=x, input_signal_length=ln)
            hyps = model.decode_and_align(*features)
        # Offsets are in milliseconds; subtitles use seconds.
        words = [
            (w.start_offset_ms / 1000, w.end_offset_ms / 1000, w.word)
            for w in hyps[0][0].words
        ]
        return group_words(words)

    # CTC models: transcribe, then align the words against the logits.
    text = model.transcribe([wav])[0].strip()
    if not text:
        return []

    with torch.no_grad():
        # NeMo's typed forward accepts keyword arguments only and, for CTC
        # models, returns a third element besides the log-probs and their
        # lengths. NOTE(review): confirm against the installed NeMo version.
        outputs = model(input_signal=x, input_signal_length=ln)
    logits, loglen = outputs[0], outputs[1]

    words = text.split()
    cfg = CtcSegmentationParameters()
    cfg.char_list = list(model.tokenizer.vocab.keys())
    gt, _ = prepare_text(cfg, words)
    timing, _, _ = ctc_segmentation(cfg, logits.cpu().numpy()[0], gt)
    # Seconds of audio represented by one output frame.
    tps = total_s / loglen.cpu().numpy()[0]

    # NOTE(review): timing[i] is used as the start of word i; verify this
    # matches what ctc_segmentation returns for this ground-truth layout.
    aligned = [
        (
            timing[i] * tps,
            timing[i + 1] * tps if i + 1 < len(timing) else total_s,
            word,
        )
        for i, word in enumerate(words)
    ]
    return group_words(aligned)
110
 
111
  # ------------------------------------------------------------
112
+ # Extraction Audio
113
  # ------------------------------------------------------------
114
  def extract_audio(video, wav):
115
  v = VideoFileClip(video)
 
117
  v.close()
118
 
119
  # ------------------------------------------------------------
120
+ # Sous-titres Style Netflix
121
  # ------------------------------------------------------------
122
  def burn(video, subs):
123
  output = "RobotsMali_Subtitled.mp4"
124
  clip = VideoFileClip(video)
125
  W, H = clip.size
 
126
 
127
+ layers = []
128
+ for s, e, t in subs:
129
  txt = TextClip(
130
+ t.upper(),
131
+ fontsize=H//18,
132
+ stroke_width=3,
133
+ stroke_color="black",
134
+ color="white",
135
+ method="caption",
136
+ size=(W*0.85, None),
137
+ bg_color="rgba(0,0,0,0.45)"
138
+ ).set_start(s).set_duration(e-s).set_pos(("center", H*0.82))
139
  layers.append(txt)
140
 
141
  final = CompositeVideoClip([clip] + layers)
 
144
  return output
145
 
146
  # ------------------------------------------------------------
147
+ # Pipeline
148
  # ------------------------------------------------------------
149
  def pipeline(video_file, model_name):
 
 
 
150
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
151
+ status = f"🧠 Chargement modèle sur {device}..."
152
 
153
  try:
154
  model = load_asr_model(model_name)
155
+
156
  status += "\n🎶 Extraction audio..."
157
  wav = os.path.join(tempfile.gettempdir(), "audio.wav")
158
  extract_audio(video_file, wav)
 
161
  subs = transcribe(model, device, wav, model_name)
162
  if not subs: return "⚠️ Aucun mot détecté.", None
163
 
164
+ status += "\n🎬 Sous-titres Netflix..."
165
  out = burn(video_file, subs)
166
 
167
  if os.path.exists(wav): os.remove(wav)
168
  status += "\n✅ Terminé !"
 
169
  return status, out
170
 
171
  except Exception as e:
 
174
  # ------------------------------------------------------------
175
  # Interface
176
  # ------------------------------------------------------------
# Gradio front-end: an input video, a model picker, a run button, and the
# status / result widgets that ``pipeline`` fills in.
with gr.Blocks(title="RobotsMali V8") as demo:
    gr.Markdown(value="# ⚡ ROBOTSMALI V8 — Minimalist Blue + Netflix Subtitles")
    video = gr.Video()
    model = gr.Dropdown(choices=list(MODELS), value="Soloni V1 (RNnT - Précis)")
    run = gr.Button(value="▶️ PRODUIRE")
    status = gr.Markdown()
    result = gr.Video()
    # The two outputs receive the two values returned by ``pipeline``.
    run.click(fn=pipeline, inputs=[video, model], outputs=[status, result])

# Start the web server when the module runs.
demo.launch(share=True)