binaryMao committed
Commit 1a890b5 · verified · 1 Parent(s): 2bff7cb

Update app.py

Files changed (1)
  1. app.py +52 -61
app.py CHANGED
@@ -1,27 +1,24 @@
-import os, warnings, tempfile, logging
+import os, warnings, logging, tempfile
 
-# --- Sécurité & compatibilité environnement ---
 warnings.filterwarnings("ignore")
 logging.getLogger("nemo_logger").setLevel(logging.ERROR)
+
 os.environ["NEMO_FORCE_CPU"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
 import torch
 torch.set_grad_enabled(False)
 
-# Débloquer ImageMagick (HF varie selon build)
-for p in ("/etc/ImageMagick/policy.xml", "/etc/ImageMagick-6/policy.xml"):
-    if os.path.exists(p):
-        os.system(f'sed -i "s/rights=\\"none\\"/rights=\\"read|write\\"/g" "{p}"')
-
 import gradio as gr
 import numpy as np
 import soundfile as sf
 from moviepy.editor import VideoFileClip, CompositeVideoClip, TextClip
 from nemo.collections import asr as nemo_asr
 
-# --- Configuration ---
+# ---------------- CONFIG ---------------- #
+
 SR = 16000
-MAX_VIDEO_BYTES = 200_000_000 # ≈200MB limite stable HF Space
+MAX_VIDEO_BYTES = 200_000_000
 
 ASR_MODELS = {
     "Soloba CTC 0.6B V0": "RobotsMali/soloba-ctc-0.6b-v0",
@@ -46,44 +43,36 @@ def load_model(name):
     _CACHE[name] = (model, device)
     return model, device
 
-# ---------------- AUDIO EXTRACTION ---------------- #
+# ---------------- EXTRACT AUDIO (FORCE MONO) ---------------- #
 
 def extract_audio(video_path, wav_path):
-    # Protéger contre fichiers trop lourds
-    try:
-        if os.path.getsize(video_path) > MAX_VIDEO_BYTES:
-            raise RuntimeError("⚠️ Vidéo trop lourde (>200MB). Compressez puis réessayez.")
-    except:
-        pass
+    if os.path.getsize(video_path) > MAX_VIDEO_BYTES:
+        raise RuntimeError("⚠️ Vidéo trop lourde (>200MB). Compressez puis réessayez.")
 
-    # Extraction fiable (ffmpeg / MoviePy)
-    try:
-        clip = VideoFileClip(video_path)
-        if clip.audio is None:
-            clip.close()
-            raise RuntimeError("⚠️ Aucun audio détecté dans la vidéo.")
-        clip.audio.write_audiofile(
-            wav_path, fps=SR, codec="pcm_s16le", verbose=False, logger=None
-        )
-        duration = clip.duration or 0
-        clip.close()
-        return duration
-    except:
-        os.system(f"ffmpeg -y -i '{video_path}' -ac 1 -ar {SR} -vn '{wav_path}' >/dev/null 2>&1")
-        audio, sr = sf.read(wav_path)
-        return len(audio)/sr if sr else 0.0
-
-# ---------------- TRANSCRIPTION ---------------- #
+    # Force audio mono + 16k (100% fiable)
+    os.system(f"ffmpeg -y -i '{video_path}' -ac 1 -ar {SR} -vn '{wav_path}' >/dev/null 2>&1")
+
+    audio, sr = sf.read(wav_path)
+    if sr == 0 or len(audio) == 0:
+        raise RuntimeError("⚠️ Audio introuvable ou illisible.")
+    return len(audio) / sr
+
+# ---------------- TRANSCRIBE ---------------- #
 
 def transcribe(model, device, wav_path, model_key):
     audio, sr = sf.read(wav_path)
-    if audio.ndim > 1: audio = audio.mean(axis=1)
-    total_s = len(audio)/sr if sr else 0
 
+    # Force mono propre + normalisation
+    if audio.ndim == 2:
+        audio = np.mean(audio, axis=1).astype(np.float32)
+    if np.max(np.abs(audio)) > 1:
+        audio = audio / np.max(np.abs(audio))
+
+    total_s = len(audio)/sr if sr else 0
     x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
     ln = torch.tensor([x.shape[1]]).to(device)
 
-    # --- Soloni timestamps réels via decode_and_align ---
+    # ---- Soloni : timestamps réels ---- #
     if "Soloni" in model_key and hasattr(model, "decode_and_align"):
        with torch.no_grad():
            proc, plen = model.preprocessor(
@@ -96,26 +85,26 @@ def transcribe(model, device, wav_path, model_key):
             )
 
         hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
-
         if hasattr(hyp, "words") and hyp.words:
             return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
 
-    # --- Soloba + QuartzNet alignement fluide stable ---
+    # ---- Soloba & QuartzNet fallback alignement fluide ---- #
     text = model.transcribe([wav_path])[0]
     words = text.split()
     if not words or total_s <= 0:
         return []
 
-    wps = max(2.0, len(words)/total_s)
+    wps = max(2.0, len(words) / total_s)
     subs, t = [], 0
     for w in words:
-        d = 1/wps
-        subs.append((t, min(total_s, t+d), w))
+        d = 1 / wps
+        subs.append((t, min(total_s, t + d), w))
         t += d
-        if t >= total_s: break
+        if t >= total_s:
+            break
     return subs
 
-# ---------------- HARD SUBTITLE ---------------- #
+# ---------------- BURN SUBTITLES ---------------- #
 
 def burn(video_path, subs):
     clip = None
@@ -123,16 +112,20 @@ def burn(video_path, subs):
     try:
         clip = VideoFileClip(video_path)
         W, H = clip.size
-
         layers = []
-        for s,e,w in subs:
+
+        for s, e, w in subs:
             if e <= s: continue
-            tc = TextClip(
-                w.upper(), fontsize=int(H/20), color="white",
-                stroke_color="black", stroke_width=2,
-                method="caption", size=(int(W*0.9), None)
+            txt = TextClip(
+                w.upper(),
+                fontsize=int(H/20),
+                color="white",
+                stroke_color="black",
+                stroke_width=2,
+                method="caption",
+                size=(int(W*0.9), None)
             ).set_start(s).set_duration(e - s).set_position(("center", int(H*0.88)))
-            layers.append(tc)
+            layers.append(txt)
 
         final = CompositeVideoClip([clip] + layers)
         out = "RobotsMali_Subtitled.mp4"
@@ -148,7 +141,7 @@ def burn(video_path, subs):
 # ---------------- PIPELINE ---------------- #
 
 def pipeline(video, model_name, progress=gr.Progress()):
-    progress(0.25, "Chargement du modèle…")
+    progress(0.3, "Chargement du modèle…")
     model, device = load_model(model_name)
 
     with tempfile.TemporaryDirectory() as td:
@@ -156,10 +149,8 @@ def pipeline(video, model_name, progress=gr.Progress()):
 
         progress(0.5, "Extraction audio…")
         duration = extract_audio(video, wav)
-        if duration <= 0:
-            return "❌ Audio introuvable ou illisible.", None
 
-        progress(0.75, "Transcription en Bambara…")
+        progress(0.75, "Transcription…")
         subs = transcribe(model, device, wav, model_name)
         if not subs:
             return "⚠️ Aucun mot détecté.", None
@@ -168,20 +159,20 @@ def pipeline(video, model_name, progress=gr.Progress()):
         out = burn(video, subs)
 
         progress(1.0, "✅ Terminé")
-        return f"✅ Sous-titré avec **{model_name}**", out
+        return f"✅ Sous-titrage terminé avec **{model_name}**", out
 
 # ---------------- UI ---------------- #
 
 CSS = """
-body { background:#F6F9FF; font-family:Inter, sans-serif; }
-h1 { text-align:center; font-weight:800; color:#006CFF; }
-.gr-button { background:#007BFF !important; color:white !important; border-radius:8px; font-weight:700; }
+body { background:#F7FAFF; font-family:Inter, sans-serif; }
+h1 { text-align:center; font-weight:800; color:#005BFF; }
+.gr-button { background:#005BFF !important; color:white !important; border-radius:8px; font-weight:700; }
 """
 
 with gr.Blocks(css=CSS, title="RobotsMali Caption Studio") as demo:
-    gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Sous-titrage Automatique en Bambara</p>")
+    gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Transcription & Sous-titres Automatiques en Bambara</p>")
     video = gr.File(label="🎥 Importer une vidéo (max 200MB)", type="filepath")
-    model = gr.Dropdown(list(ASR_MODELS.keys()), value="Soloni 114M TDT CTC V1", label="🧠 Sélection du modèle ASR")
+    model = gr.Dropdown(list(ASR_MODELS.keys()), value="Soloni 114M TDT CTC V1", label="🧠 Modèle ASR")
     run = gr.Button("🚀 Générer")
     status = gr.Markdown()
     output = gr.Video()