binaryMao committed on
Commit
61717c4
·
verified ·
1 Parent(s): 5254201

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -58
app.py CHANGED
@@ -1,16 +1,16 @@
1
  import os, warnings, tempfile
2
  warnings.filterwarnings("ignore")
3
 
4
- # Autoriser ImageMagick sur n'importe quelle distribution
5
- for path in ["/etc/ImageMagick/policy.xml", "/etc/ImageMagick-6/policy.xml"]:
6
- if os.path.exists(path):
7
- os.system(f'sed -i "s/rights=\\"none\\"/rights=\\"read|write\\"/g" {path}')
8
 
9
  import gradio as gr
10
  import numpy as np
11
  import soundfile as sf
12
  import torch
13
- from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
14
  from nemo.collections import asr as nemo_asr
15
 
16
  SR = 16000
@@ -26,98 +26,106 @@ ASR_MODELS = {
26
 
27
  _CACHE = {}
28
 
29
- def load_model(model_key):
30
- if model_key in _CACHE:
31
- return _CACHE[model_key]
32
  device = "cuda" if torch.cuda.is_available() else "cpu"
33
- model = nemo_asr.models.ASRModel.from_pretrained(model_name=ASR_MODELS[model_key]).to(device).eval()
34
- _CACHE[model_key] = (model, device)
35
  return model, device
36
 
37
  def extract_audio(video_path, wav_path):
38
- with VideoFileClip(video_path) as clip:
39
- audio = clip.audio.to_soundarray(fps=SR)
40
- if audio.ndim > 1:
41
- audio = audio.mean(axis=1)
42
- sf.write(wav_path, audio.astype(np.float32), SR)
43
- return len(audio) / SR
44
-
45
- def transcribe_with_timestamps(model, device, wav_path, model_key):
 
46
  audio, sr = sf.read(wav_path)
47
- if audio.ndim > 1:
48
- audio = audio.mean(axis=1)
49
- total_duration = len(audio) / sr
50
 
51
- audio_t = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
52
- length_t = torch.tensor([audio_t.shape[1]]).to(device)
 
 
 
 
 
53
 
54
- # === Soloni → timestamps exacts ===
55
  if "Soloni" in model_key:
56
  with torch.no_grad():
57
- proc, proc_len = model.preprocessor(audio_t, length_t)
58
- hyps = model.decode_and_align(encoder_output=proc, encoded_lengths=proc_len)
59
  hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
60
- if hasattr(hyp, "words") and hyp.words:
61
  return [(w.start_offset_ms/1000, w.end_offset_ms/1000, w.word) for w in hyp.words]
62
 
63
- # === Soloba + QuartzNet → alignement fluide ===
64
  text = model.transcribe([wav_path])[0]
65
  words = text.split()
66
- if not words:
67
- return []
68
 
69
- wps = max(2.0, len(words)/total_duration)
70
  subs, t = [], 0
71
  for w in words:
72
- dur = 1.0 / wps
73
- subs.append((t, min(total_duration, t+dur), w))
74
- t += dur
75
- if t >= total_duration: break
76
  return subs
77
 
78
- def burn_subtitles(video_path, subs):
79
  clip = VideoFileClip(video_path)
80
  W, H = clip.size
81
- overlays = [
82
- TextClip(w.upper(), fontsize=int(H/20), color="white",
83
- stroke_color="black", stroke_width=2,
84
- method="caption", size=(int(W*0.9), None)
 
 
85
  ).set_start(s).set_duration(e-s).set_position(("center", int(H*0.88)))
86
- for s, e, w in subs
87
- ]
88
- final = CompositeVideoClip([clip] + overlays)
89
- out = "output_captioned.mp4"
90
  final.write_videofile(out, codec="libx264", audio_codec="aac", verbose=False, logger=None)
91
  return out
92
 
93
- def pipeline(video, model_key, progress=gr.Progress()):
94
  progress(0.2, "Chargement du modèle…")
95
- model, device = load_model(model_key)
 
96
  with tempfile.TemporaryDirectory() as td:
97
  wav = f"{td}/audio.wav"
98
- progress(0.4, "Extraction audio…")
99
  extract_audio(video, wav)
 
100
  progress(0.7, "Transcription en Bambara…")
101
- subs = transcribe_with_timestamps(model, device, wav, model_key)
 
102
  progress(0.9, "Incrustation des sous-titres…")
103
- out = burn_subtitles(video, subs)
 
104
  progress(1.0, "✅ Terminé")
105
- return f"✅ Sous-titres générés avec **{model_key}**", out
106
 
107
  CSS = """
108
- body { background:#F5F8FC; font-family:Inter, sans-serif; }
109
- h1 { text-align:center; font-weight:800; color:#007BFF; }
110
- p { text-align:center; color:#5A6B85; margin-bottom:24px; }
111
- .gr-button { background:#007BFF !important; color:white !important; border-radius:8px; }
112
  """
113
 
114
- with gr.Blocks(css=CSS) as demo:
115
  gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Sous-titrage Automatique en Bambara</p>")
116
  video = gr.File(label="🎥 Importer une vidéo", type="filepath")
117
  model = gr.Dropdown(list(ASR_MODELS.keys()), value="Soloni 114M TDT CTC V1")
118
- btn = gr.Button("🚀 Générer")
119
  status = gr.Markdown()
120
- out_video = gr.Video()
121
- btn.click(pipeline, inputs=[video, model], outputs=[status, out_video])
122
 
123
  demo.launch()
 
1
  import os, warnings, tempfile
2
  warnings.filterwarnings("ignore")
3
 
# Unlock ImageMagick (works across environments): rewrite every
# rights="none" entry of the system policy files to rights="read|write".
# NOTE(review): presumably needed so that TextClip can render captions
# through ImageMagick — confirm against the deployment image.
for policy_file in ("/etc/ImageMagick/policy.xml", "/etc/ImageMagick-6/policy.xml"):
    if not os.path.exists(policy_file):
        # This distribution does not ship that policy file; nothing to patch.
        continue
    os.system(f'sed -i "s/rights=\\"none\\"/rights=\\"read|write\\"/g" "{policy_file}"')
8
 
9
  import gradio as gr
10
  import numpy as np
11
  import soundfile as sf
12
  import torch
13
+ from moviepy.editor import VideoFileClip, CompositeVideoClip, TextClip
14
  from nemo.collections import asr as nemo_asr
15
 
16
  SR = 16000
 
26
 
27
  _CACHE = {}
28
 
def load_model(name):
    """Return the ``(model, device)`` pair for the ASR model called ``name``.

    The pair is memoised in the module-level ``_CACHE`` so each checkpoint is
    instantiated at most once per process.

    Args:
        name: Key into ``ASR_MODELS`` selecting the pretrained checkpoint.

    Returns:
        A tuple ``(model, device)`` where ``device`` is ``"cuda"`` when
        ``torch.cuda.is_available()`` is true and ``"cpu"`` otherwise, and
        ``model`` has been moved to that device and put in eval mode.

    Raises:
        KeyError: If ``name`` is not a key of ``ASR_MODELS``.
    """
    cached = _CACHE.get(name)
    if cached is not None:
        return cached

    device = "cuda" if torch.cuda.is_available() else "cpu"
    checkpoint = ASR_MODELS[name]
    model = nemo_asr.models.ASRModel.from_pretrained(model_name=checkpoint)
    model = model.to(device).eval()

    _CACHE[name] = (model, device)
    return model, device
36
 
def extract_audio(video_path, wav_path):
    """Write the audio track of ``video_path`` to ``wav_path`` and return its length.

    The track is first exported with ``write_audiofile`` (rather than
    ``to_soundarray``) at ``SR`` Hz as 16-bit PCM. If that fails for any
    reason, the ``ffmpeg`` command-line tool is tried as a fallback
    (mono, ``SR`` Hz).

    Args:
        video_path: Path of the input video file.
        wav_path: Path of the WAV file to create (overwritten if present).

    Returns:
        Duration of the written audio in seconds, computed as the number of
        frames read back from ``wav_path`` divided by its sample rate.

    Raises:
        Whatever ``sf.read`` raises when neither export path produced a
        readable file at ``wav_path``.
    """
    import subprocess  # local import: only the fallback path needs it

    try:
        # The context manager releases the reader even if the export fails.
        with VideoFileClip(video_path) as clip:
            clip.audio.write_audiofile(
                wav_path, fps=SR, codec="pcm_s16le", verbose=False, logger=None
            )
    except Exception:
        # Argument list instead of a shell string: the uploaded file name is
        # untrusted and must never be interpreted by a shell.
        command = [
            "ffmpeg", "-i", video_path,
            "-ac", "1", "-ar", str(SR), "-vn", "-y", wav_path,
        ]
        try:
            subprocess.run(
                command,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=False,
            )
        except OSError:
            # ffmpeg is missing or not executable; the read below reports the
            # failure if no usable WAV file exists.
            pass

    audio, sr = sf.read(wav_path)
    return len(audio) / sr
 
 
49
 
def _spread_words(words, total_s):
    """Give each word a consecutive, equally long time slot within ``total_s`` seconds."""
    if not words or total_s <= 0:
        # No text, or empty audio (which would otherwise divide by zero).
        return []

    # At least 2.2 words per second, faster when the text is denser than that.
    slot = 1 / max(2.2, len(words) / total_s)
    subs, t = [], 0
    for w in words:
        subs.append((t, min(total_s, t + slot), w))
        t += slot
        if t >= total_s:
            break
    return subs


def transcribe(model, device, wav_path, model_key):
    """Transcribe ``wav_path`` and return word-level subtitle entries.

    For models whose key contains ``"Soloni"`` the word timestamps come from
    ``model.decode_and_align`` (``hyp.words``, offsets given in milliseconds).
    When that yields no words, and for every other model, the plain
    transcription is split on whitespace and the words are laid out evenly.

    Args:
        model: ASR model exposing ``transcribe`` (and, for Soloni models,
            ``preprocessor`` and ``decode_and_align``).
        device: Torch device string the input tensors are moved to.
        wav_path: Path of the WAV file to transcribe.
        model_key: Display name of the model; selects the timestamp strategy.

    Returns:
        A list of ``(start_seconds, end_seconds, word)`` tuples, possibly empty.
    """
    audio, sr = sf.read(wav_path)
    if audio.ndim > 1:
        # Down-mix multi-channel audio to mono.
        audio = audio.mean(axis=1)
    total_s = len(audio) / sr

    if "Soloni" in model_key:
        # Tensors are only needed on this path, so build them here.
        x = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
        ln = torch.tensor([x.shape[1]]).to(device)
        with torch.no_grad():
            # NOTE(review): arguments are passed positionally as in the
            # original; confirm the preprocessor accepts positional inputs.
            proc, plen = model.preprocessor(x, ln)
            hyps = model.decode_and_align(proc, plen)
        hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
        aligned = getattr(hyp, "words", None)
        if aligned:
            # Missing, None or empty alignments fall through to the even
            # layout below instead of returning nothing or raising.
            return [
                (w.start_offset_ms / 1000, w.end_offset_ms / 1000, w.word)
                for w in aligned
            ]

    # No usable alignment: spread the transcript evenly over the audio.
    text = model.transcribe([wav_path])[0]
    return _spread_words(text.split(), total_s)
80
 
def burn(video_path, subs):
    """Render ``subs`` onto the video and return the path of the new file.

    Each entry becomes an upper-cased white caption with a black outline,
    horizontally centred at 88% of the frame height, with a font size of
    1/20 of the frame height and a width of 90% of the frame width.

    Args:
        video_path: Path of the source video.
        subs: Iterable of ``(start_seconds, end_seconds, word)`` tuples.

    Returns:
        The output path, ``"RobotsMali_Subtitled.mp4"`` (relative to the
        current working directory).
    """
    # NOTE(review): the fixed output name means two simultaneous requests
    # write to the same file — confirm whether concurrent use is expected.
    out = "RobotsMali_Subtitled.mp4"
    clip = VideoFileClip(video_path)
    layers = []
    try:
        W, H = clip.size
        for s, e, w in subs:
            if e <= s:
                # A caption with no positive duration would never be visible.
                continue
            txt = TextClip(
                w.upper(), fontsize=int(H / 20), color="white",
                stroke_color="black", stroke_width=2,
                method="caption", size=(int(W * 0.9), None),
            ).set_start(s).set_duration(e - s).set_position(("center", int(H * 0.88)))
            layers.append(txt)

        final = CompositeVideoClip([clip] + layers)
        try:
            final.write_videofile(
                out, codec="libx264", audio_codec="aac", verbose=False, logger=None
            )
        finally:
            final.close()
    finally:
        # Release every clip even when rendering fails, so a long-running
        # app does not accumulate open readers.
        for layer in layers:
            layer.close()
        clip.close()
    return out
97
 
def pipeline(video, model_name, progress=gr.Progress()):
    """Run the whole captioning workflow for one uploaded video.

    Steps: load the model, extract the audio into a temporary directory,
    transcribe it, then burn the subtitles into the video. Progress is
    reported through ``progress`` before each step.

    Args:
        video: File path of the uploaded video; falsy when nothing was uploaded.
        model_name: Key of the ASR model to use.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        A tuple ``(status_markdown, output_video_path)``.

    Raises:
        gr.Error: If no video was provided.
    """
    if not video:
        # Fail with a readable message instead of an obscure error from the
        # video reader when the button is clicked without an upload.
        raise gr.Error("Veuillez importer une vidéo.")

    progress(0.2, "Chargement du modèle…")
    model, device = load_model(model_name)

    with tempfile.TemporaryDirectory() as td:
        wav = f"{td}/audio.wav"
        progress(0.45, "Extraction audio…")
        extract_audio(video, wav)

        progress(0.7, "Transcription en Bambara…")
        subs = transcribe(model, device, wav, model_name)

        progress(0.9, "Incrustation des sous-titres…")
        out = burn(video, subs)

        progress(1.0, "✅ Terminé")
        return f"✅ Sous-titré avec **{model_name}**", out
115
 
# Stylesheet injected into the Gradio page.
CSS = """
body { background:#F6F9FF; font-family:Inter, sans-serif; }
h1 { text-align:center; font-weight:800; color:#006CFF; }
.gr-button { background:#007BFF !important; color:white !important; border-radius:8px; font-weight:700; }
"""

# Build the interface: file upload + model selector -> status text + video.
demo = gr.Blocks(css=CSS, title="RobotsMali Caption Studio")
with demo:
    gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Sous-titrage Automatique en Bambara</p>")
    video_input = gr.File(label="🎥 Importer une vidéo", type="filepath")
    model_choice = gr.Dropdown(list(ASR_MODELS), value="Soloni 114M TDT CTC V1")
    run_button = gr.Button("🚀 Générer les sous-titres")
    status_md = gr.Markdown()
    result_video = gr.Video()
    # Clicking the button runs the whole pipeline on the current inputs.
    run_button.click(
        pipeline,
        inputs=[video_input, model_choice],
        outputs=[status_md, result_video],
    )

demo.launch()