binaryMao committed on
Commit
fcf99ec
·
verified ·
1 Parent(s): 8f9582a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -19
app.py CHANGED
@@ -8,12 +8,14 @@ torch.set_grad_enabled(False)
8
  import gradio as gr
9
  import numpy as np
10
  import soundfile as sf
 
11
  from moviepy.editor import VideoFileClip, CompositeVideoClip, ImageClip
12
  from PIL import Image, ImageDraw, ImageFont
 
13
  from nemo.collections import asr as nemo_asr
14
 
15
 
16
- # ---------------- GLOBAL CONFIG ---------------- #
17
  os.environ["NEMO_FORCE_CPU"] = "1"
18
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
19
 
@@ -44,10 +46,10 @@ def load_model(name):
44
  return model, device
45
 
46
 
47
- # ---------------- EXTRACT AUDIO (FORCE MONO) ---------------- #
48
  def extract_audio(video_path, wav_path):
49
  if os.path.getsize(video_path) > MAX_VIDEO_BYTES:
50
- raise RuntimeError("⚠️ Vidéo trop lourde (>200MB). Compressez avant l’upload.")
51
  os.system(f"ffmpeg -y -i '{video_path}' -ac 1 -ar {SR} -vn '{wav_path}' >/dev/null 2>&1")
52
  audio, sr = sf.read(wav_path)
53
  return len(audio)/sr
@@ -65,7 +67,7 @@ def transcribe(model, device, wav_path, model_key):
65
  x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
66
  ln = torch.tensor([x.shape[1]]).to(device)
67
 
68
- # ✅ Real timestamps for Soloni
69
  if "Soloni" in model_key and hasattr(model, "decode_and_align"):
70
  try:
71
  with torch.no_grad():
@@ -77,14 +79,14 @@ def transcribe(model, device, wav_path, model_key):
77
  except:
78
  pass
79
 
80
- # ✅ Universal fallback (Soloba + QuartzNet + backup Soloni)
81
  out = model.transcribe([wav_path])[0]
82
  text = out.text.strip() if hasattr(out, "text") else str(out).strip()
83
  words = text.split()
84
  if not words:
85
  return []
86
 
87
- wps = max(2.0, len(words) / total_s) # words per second
88
  subs, t = [], 0
89
  for w in words:
90
  d = 1 / wps
@@ -106,15 +108,23 @@ def burn(video_path, subs):
106
 
107
  layers = []
108
  for s, e, w in subs:
109
- if e <= s:
110
- continue
111
 
112
- img = Image.new("RGBA", (W, int(H*0.12)), (0, 0, 0, 140))
113
  draw = ImageDraw.Draw(img)
114
 
115
  text = w.upper()
116
- tw, th = draw.textsize(text, font=font)
117
- draw.text(((W-tw)//2, (H*0.12-th)//2), text, font=font, fill=(255,255,255))
 
 
 
 
 
 
 
 
 
118
 
119
  img_clip = ImageClip(np.array(img)).set_start(s).set_duration(e-s).set_position(("center", int(H*0.85)))
120
  layers.append(img_clip)
@@ -122,7 +132,6 @@ def burn(video_path, subs):
122
  final = CompositeVideoClip([clip] + layers)
123
  out = "RobotsMali_Subtitled.mp4"
124
  final.write_videofile(out, codec="libx264", audio_codec="aac", fps=clip.fps, verbose=False, logger=None)
125
-
126
  clip.close()
127
  final.close()
128
  return out
@@ -135,18 +144,15 @@ def pipeline(video, model_name, progress=gr.Progress()):
135
 
136
  with tempfile.TemporaryDirectory() as td:
137
  wav = f"{td}/audio.wav"
138
- progress(0.4, "🔊 Extraction audio…")
139
  extract_audio(video, wav)
140
 
141
- progress(0.7, "🧠 Transcription…")
142
  subs = transcribe(model, device, wav, model_name)
143
- if not subs:
144
- return "⚠️ Aucun mot reconnu.", None
145
 
146
  progress(0.95, "🎞️ Incrustation…")
147
  out = burn(video, subs)
148
-
149
- return f"✅ Sous-titres générés avec **{model_name}**", out
150
 
151
 
152
  # ---------------- UI ---------------- #
@@ -157,7 +163,7 @@ h1 { text-align:center; font-weight:800; color:#005BFF; margin-bottom:6px; }
157
  """
158
 
159
  with gr.Blocks(css=CSS, title="RobotsMali Caption Studio") as demo:
160
- gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Sous-titrage Automatique en Bambara</p>")
161
  video = gr.File(label="🎥 Importer une vidéo")
162
  model = gr.Dropdown(list(ASR_MODELS.keys()), value="Soloni 114M TDT CTC V1", label="🧠 Modèle ASR")
163
  run = gr.Button("🚀 Générer les sous-titres")
 
8
  import gradio as gr
9
  import numpy as np
10
  import soundfile as sf
11
+
12
  from moviepy.editor import VideoFileClip, CompositeVideoClip, ImageClip
13
  from PIL import Image, ImageDraw, ImageFont
14
+
15
  from nemo.collections import asr as nemo_asr
16
 
17
 
18
+ # ---------------- CONFIG ---------------- #
19
  os.environ["NEMO_FORCE_CPU"] = "1"
20
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
21
 
 
46
  return model, device
47
 
48
 
49
+ # ---------------- EXTRACT AUDIO ---------------- #
50
  def extract_audio(video_path, wav_path):
51
  if os.path.getsize(video_path) > MAX_VIDEO_BYTES:
52
+ raise RuntimeError("⚠️ Vidéo > 200MB. Compressez avant l’upload.")
53
  os.system(f"ffmpeg -y -i '{video_path}' -ac 1 -ar {SR} -vn '{wav_path}' >/dev/null 2>&1")
54
  audio, sr = sf.read(wav_path)
55
  return len(audio)/sr
 
67
  x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
68
  ln = torch.tensor([x.shape[1]]).to(device)
69
 
70
+ # ✅ Soloni real timestamps
71
  if "Soloni" in model_key and hasattr(model, "decode_and_align"):
72
  try:
73
  with torch.no_grad():
 
79
  except:
80
  pass
81
 
82
+ # ✅ Universal fallback for Soloba + QuartzNet
83
  out = model.transcribe([wav_path])[0]
84
  text = out.text.strip() if hasattr(out, "text") else str(out).strip()
85
  words = text.split()
86
  if not words:
87
  return []
88
 
89
+ wps = max(2.0, len(words) / total_s)
90
  subs, t = [], 0
91
  for w in words:
92
  d = 1 / wps
 
108
 
109
  layers = []
110
  for s, e, w in subs:
111
+ if e <= s: continue
 
112
 
113
+ img = Image.new("RGBA", (W, int(H*0.12)), (0,0,0,140))
114
  draw = ImageDraw.Draw(img)
115
 
116
  text = w.upper()
117
+
118
+ # Pillow 10+ compatible text size
119
+ try:
120
+ bbox = draw.textbbox((0,0), text, font=font)
121
+ tw, th = bbox[2]-bbox[0], bbox[3]-bbox[1]
122
+ except:
123
+ tw, th = draw.textsize(text, font=font)
124
+
125
+ x = (W - tw) // 2
126
+ y = (int(H*0.12) - th) // 2
127
+ draw.text((x, y), text, font=font, fill=(255,255,255))
128
 
129
  img_clip = ImageClip(np.array(img)).set_start(s).set_duration(e-s).set_position(("center", int(H*0.85)))
130
  layers.append(img_clip)
 
132
  final = CompositeVideoClip([clip] + layers)
133
  out = "RobotsMali_Subtitled.mp4"
134
  final.write_videofile(out, codec="libx264", audio_codec="aac", fps=clip.fps, verbose=False, logger=None)
 
135
  clip.close()
136
  final.close()
137
  return out
 
144
 
145
  with tempfile.TemporaryDirectory() as td:
146
  wav = f"{td}/audio.wav"
147
+ progress(0.5, "🔊 Extraction audio…")
148
  extract_audio(video, wav)
149
 
150
+ progress(0.75, "🧠 Transcription…")
151
  subs = transcribe(model, device, wav, model_name)
 
 
152
 
153
  progress(0.95, "🎞️ Incrustation…")
154
  out = burn(video, subs)
155
+ return f"✅ Sous-titrage généré avec **{model_name}**", out
 
156
 
157
 
158
  # ---------------- UI ---------------- #
 
163
  """
164
 
165
  with gr.Blocks(css=CSS, title="RobotsMali Caption Studio") as demo:
166
+ gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Sous-titrage automatique en Bambara</p>")
167
  video = gr.File(label="🎥 Importer une vidéo")
168
  model = gr.Dropdown(list(ASR_MODELS.keys()), value="Soloni 114M TDT CTC V1", label="🧠 Modèle ASR")
169
  run = gr.Button("🚀 Générer les sous-titres")