binaryMao committed on
Commit
88d36f5
·
verified ·
1 Parent(s): b01f955

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -45
app.py CHANGED
@@ -5,10 +5,9 @@ from huggingface_hub import snapshot_download
5
  from nemo.collections import asr as nemo_asr
6
  import gradio as gr
7
 
8
- # 1. CONFIGURATION MATÉRIEL
9
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
10
 
11
- # 2. DICTIONNAIRE DES MODÈLES
12
  MODELS = {
13
  "Soloba V3 (CTC)": ("RobotsMali/soloba-ctc-0.6b-v3", "ctc"),
14
  "Soloba V1.5 (TDT)": ("RobotsMali/soloba-tdt-0.6b-v1.5", "rnnt"),
@@ -18,19 +17,30 @@ MODELS = {
18
  "Soloba V0.5 (TDT)": ("RobotsMali/soloba-tdt-0.6b-v0.5", "rnnt"),
19
  }
20
 
 
 
 
 
 
 
 
 
 
 
 
21
  _cache = {}
22
 
 
23
  def clear_memory():
24
- """Libère la VRAM et la RAM."""
25
  _cache.clear()
26
  gc.collect()
27
  if torch.cuda.is_available():
28
  torch.cuda.empty_cache()
29
 
30
  def get_model(name):
31
- """Charge le modèle et le retourne directement."""
32
- if name in _cache:
33
- return _cache[name]
34
 
35
  clear_memory()
36
  repo, mode = MODELS[name]
@@ -43,104 +53,113 @@ def get_model(name):
43
  model = nemo_asr.models.EncDecCTCModelBPE.restore_from(nemo_file)
44
 
45
  model.to(DEVICE).eval()
 
 
46
  if DEVICE == "cuda":
47
  model.half()
48
-
49
  _cache[name] = model
50
  return model
51
 
 
52
  def format_srt_time(sec):
53
  td = time.gmtime(sec)
54
  ms = int((sec - int(sec)) * 1000)
55
  return f"{time.strftime('%H:%M:%S', td)},{ms:03}"
56
 
 
57
  def pipeline(video_in, model_name):
58
  tmp_dir = tempfile.mkdtemp()
59
  try:
60
- if not video_in:
61
- return "❌ Source vide", None
62
 
63
- # A. Extraction Audio
64
- yield "⏳ Extraction de l'audio...", None
65
  full_wav = os.path.join(tmp_dir, "full.wav")
66
- subprocess.run(f"ffmpeg -y -i {shlex.quote(video_in)} -vn -ac 1 -ar 16000 {full_wav}", shell=True, check=True)
67
 
68
- # B. Segmentation (10s)
 
69
  segment_pattern = os.path.join(tmp_dir, "seg_%03d.wav")
70
- subprocess.run(f"ffmpeg -i {full_wav} -f segment -segment_time 10 -c copy {segment_pattern}", shell=True, check=True)
71
  audio_segments = sorted(glob.glob(os.path.join(tmp_dir, "seg_*.wav")))
72
 
73
- # C. Chargement du modèle
74
- yield f"⏳ Chargement du modèle {model_name}...", None
75
  model = get_model(model_name)
76
-
77
- # Facteur de temps dynamique
78
- stride = 0.02
79
  if hasattr(model, 'preprocessor') and hasattr(model.preprocessor, 'featurizer'):
80
- hop = model.preprocessor.featurizer.hop_length
81
- sr = model.preprocessor.featurizer.sample_rate
82
- stride = hop / sr
83
 
84
- # D. Transcription
85
  all_words_ts = []
86
  for idx, seg_path in enumerate(audio_segments):
87
- base_time = idx * 10.0
88
  yield f"⏳ IA : Transcription {idx+1}/{len(audio_segments)}...", None
89
 
 
90
  hyp = model.transcribe([seg_path], return_hypotheses=True)[0]
 
91
  offsets = getattr(hyp, 'word_offsets', None)
92
- words = hyp.text.split() if hasattr(hyp, 'text') else str(hyp).split()
93
 
94
  if offsets and len(offsets) == len(words):
95
  for i, word in enumerate(words):
96
- start_t = base_time + (offsets[i] * stride)
97
- all_words_ts.append({"word": word, "start": start_t, "end": start_t + 0.45})
98
  else:
99
- gap = 10.0 / max(len(words), 1)
 
100
  for i, w in enumerate(words):
101
  all_words_ts.append({"word": w, "start": base_time + (i * gap), "end": base_time + ((i+1) * gap)})
102
 
103
- # E. Génération SRT (Fichier temporaire interne)
104
  srt_path = os.path.join(tmp_dir, "final.srt")
105
- words_per_line = 6
106
  with open(srt_path, "w", encoding="utf-8") as f:
107
- for i in range(0, len(all_words_ts), words_per_line):
108
- chunk = all_words_ts[i:i+words_per_line]
109
- f.write(f"{(i//words_per_line)+1}\n")
110
- f.write(f"{format_srt_time(chunk[0]['start'])} --> {format_srt_time(chunk[-1]['end'])}\n")
111
  f.write(" ".join([c['word'] for c in chunk]) + "\n\n")
112
 
113
- # F. Encodage Vidéo
114
- yield "⏳ Rendu vidéo final...", None
115
  out_path = os.path.abspath(f"robotsmali_final_{int(time.time())}.mp4")
116
 
 
117
  safe_srt = srt_path.replace("\\", "/").replace(":", "\\:")
 
 
118
  cmd_ffmpeg = (
119
  f"ffmpeg -y -i {shlex.quote(video_in)} "
120
- f"-vf \"subtitles='{safe_srt}':force_style='Alignment=2,FontSize=18,OutlineColour=&H80000000,BorderStyle=4'\" "
121
- f"-c:v libx264 -preset fast -pix_fmt yuv420p -movflags +faststart -c:a aac {out_path}"
122
  )
123
  subprocess.run(cmd_ffmpeg, shell=True, check=True)
124
 
125
- yield "✅ Terminé !", out_path
126
 
127
  except Exception as e:
128
  traceback.print_exc()
129
  yield f"❌ Erreur : {str(e)}", None
130
 
131
- # 6. INTERFACE
132
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
133
- gr.HTML("<h1 style='text-align:center; color:#EAB308;'>🤖 ROBOTSMALI TRANSCRIPTION</h1>")
134
 
135
  with gr.Row():
136
  with gr.Column():
137
- v_in = gr.Video(label="Source")
138
  m_sel = gr.Dropdown(choices=list(MODELS.keys()), value="Soloba V3 (CTC)", label="Modèle IA")
139
- btn_run = gr.Button("🚀 GÉNÉRER", variant="primary")
140
 
 
 
 
 
141
  with gr.Column():
142
  status = gr.Markdown("### État\nPrêt.")
143
- v_out = gr.Video(label="Vidéo Finale")
144
 
145
  btn_run.click(pipeline, [v_in, m_sel], [status, v_out])
146
 
 
5
  from nemo.collections import asr as nemo_asr
6
  import gradio as gr
7
 
8
+ # 1. CONFIGURATION MATÉRIEL ET MODÈLES
9
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
10
 
 
11
  MODELS = {
12
  "Soloba V3 (CTC)": ("RobotsMali/soloba-ctc-0.6b-v3", "ctc"),
13
  "Soloba V1.5 (TDT)": ("RobotsMali/soloba-tdt-0.6b-v1.5", "rnnt"),
 
17
  "Soloba V0.5 (TDT)": ("RobotsMali/soloba-tdt-0.6b-v0.5", "rnnt"),
18
  }
19
 
20
+ # 2. LOCALISATION DE LA VIDÉO D'EXEMPLE
21
def get_absolute_example():
    """Locate the bundled example video.

    Tries each candidate file name in each candidate directory, in order,
    and stops at the first match.

    Returns:
        The absolute path of the first candidate that is a regular file,
        or None when no candidate exists.
    """
    names = ("MARALINKE.mp4", "maralinke.mp4", "example.mp4")
    dirs = (".", "examples", "/home/user/app", "/home/user/app/examples")
    for directory in dirs:
        for name in names:
            candidate = os.path.join(directory, name)
            # isfile (not exists): a directory that happens to carry one of
            # these names must not be handed to the UI as a video.
            if os.path.isfile(candidate):
                return os.path.abspath(candidate)
    return None
29
+
30
+ EXAMPLE_PATH = get_absolute_example()
31
  _cache = {}
32
 
33
+ # 3. GESTION DE LA MÉMOIRE ET CHARGEMENT
34
def clear_memory():
    """Drop every cached model, then release host RAM and GPU memory."""
    # Forget the cached models first so the collector can actually free them.
    _cache.clear()
    gc.collect()

    if not torch.cuda.is_available():
        return
    # Hand PyTorch's cached CUDA blocks back to the driver.
    torch.cuda.empty_cache()
40
 
41
  def get_model(name):
42
+ """Charge le modèle avec optimisation FP16 pour la vitesse."""
43
+ if name in _cache: return _cache[name]
 
44
 
45
  clear_memory()
46
  repo, mode = MODELS[name]
 
53
  model = nemo_asr.models.EncDecCTCModelBPE.restore_from(nemo_file)
54
 
55
  model.to(DEVICE).eval()
56
+
57
+ # OPTIMISATION : Inférence en demi-précision (FP16) sur GPU
58
  if DEVICE == "cuda":
59
  model.half()
60
+
61
  _cache[name] = model
62
  return model
63
 
64
+ # 4. UTILITAIRE TEMPOREL
65
def format_srt_time(sec):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        sec: Offset in seconds (int or float). Negative values are clamped
            to zero.

    Returns:
        The timestamp string. Hours are not wrapped at 24, and the
        millisecond field is rounded to the nearest millisecond.
    """
    # Work in whole milliseconds: subtracting the integer part of a float and
    # truncating loses a millisecond on values such as 1.001.
    total_ms = max(0, int(round(float(sec) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"
69
 
70
+ # 5. PIPELINE DE TRANSCRIPTION OPTIMISÉ
71
def pipeline(video_in, model_name):
    """Transcribe a video and burn the generated subtitles into a copy of it.

    This is a generator: it yields progress updates so the caller can show
    them as they happen.

    Args:
        video_in: Path of the source video. A falsy value produces a single
            error update and stops.
        model_name: Key of ``MODELS``; passed to ``get_model``.

    Yields:
        ``(status_message, output_path)`` tuples. ``output_path`` is None for
        every update except the final success update, where it is the
        absolute path of the rendered MP4. Any exception raised while
        processing is printed and reported as a final error update instead
        of propagating.
    """
    # Function-scope import: the file's import block is outside this section.
    import shutil

    # A generator's `return value` is discarded by the consumer, so the
    # message has to be yielded for it to reach the UI.
    if not video_in:
        yield "❌ Source vide", None
        return

    seg_time = 20        # seconds of audio per transcription call
    words_per_line = 6   # words per subtitle cue

    tmp_dir = tempfile.mkdtemp()
    try:
        # A. Extract a mono 16 kHz WAV track from the video.
        yield "⏳ Extraction audio...", None
        full_wav = os.path.join(tmp_dir, "full.wav")
        subprocess.run(
            ["ffmpeg", "-y", "-i", video_in, "-vn", "-ac", "1",
             "-ar", "16000", "-threads", "0", full_wav],
            check=True,
        )

        # B. Split the audio into fixed-length segments.
        segment_pattern = os.path.join(tmp_dir, "seg_%03d.wav")
        subprocess.run(
            ["ffmpeg", "-i", full_wav, "-f", "segment",
             "-segment_time", str(seg_time), "-c", "copy", segment_pattern],
            check=True,
        )
        audio_segments = sorted(glob.glob(os.path.join(tmp_dir, "seg_*.wav")))

        # C. Load the model.
        yield f"⏳ IA : Chargement de {model_name}...", None
        model = get_model(model_name)

        # Seconds per offset unit: featurizer hop length over sample rate
        # when the model exposes them, otherwise 0.02.
        stride = 0.02
        if hasattr(model, 'preprocessor') and hasattr(model.preprocessor, 'featurizer'):
            featurizer = model.preprocessor.featurizer
            stride = featurizer.hop_length / featurizer.sample_rate

        # D. Transcribe the segments one after another.
        all_words_ts = []
        for idx, seg_path in enumerate(audio_segments):
            base_time = idx * seg_time
            yield f"⏳ IA : Transcription {idx+1}/{len(audio_segments)}...", None

            hyp = model.transcribe([seg_path], return_hypotheses=True)[0]
            words = (hyp.text if hasattr(hyp, 'text') else str(hyp)).split()
            offsets = getattr(hyp, 'word_offsets', None)

            if offsets and len(offsets) == len(words):
                # NOTE(review): assumes each entry of word_offsets is a
                # numeric offset that scales by `stride` — confirm against
                # the hypothesis objects the model actually returns.
                for word, offset in zip(words, offsets):
                    t_start = base_time + (offset * stride)
                    all_words_ts.append({"word": word, "start": t_start, "end": t_start + 0.45})
            else:
                # No usable offsets: spread the words evenly over the segment.
                gap = float(seg_time) / max(len(words), 1)
                for i, w in enumerate(words):
                    all_words_ts.append({"word": w, "start": base_time + (i * gap), "end": base_time + ((i+1) * gap)})

        # E. Write the SRT file, `words_per_line` words per cue.
        srt_path = os.path.join(tmp_dir, "final.srt")
        with open(srt_path, "w", encoding="utf-8") as f:
            for cue_no, i in enumerate(range(0, len(all_words_ts), words_per_line), start=1):
                chunk = all_words_ts[i:i + words_per_line]
                f.write(f"{cue_no}\n{format_srt_time(chunk[0]['start'])} --> {format_srt_time(chunk[-1]['end'])}\n")
                f.write(" ".join(c['word'] for c in chunk) + "\n\n")

        # F. Re-encode the video with the subtitles burned in.
        yield "⏳ Rendu vidéo (Ultra-rapide)...", None
        out_path = os.path.abspath(f"robotsmali_final_{int(time.time())}.mp4")

        # The subtitles filter parses its own argument: use forward slashes
        # and escape colons so the path is not split into filter options.
        safe_srt = srt_path.replace("\\", "/").replace(":", "\\:")
        subtitle_filter = (
            f"subtitles='{safe_srt}':"
            "force_style='Alignment=2,FontSize=18,BorderStyle=4'"
        )
        subprocess.run(
            ["ffmpeg", "-y", "-i", video_in, "-vf", subtitle_filter,
             "-c:v", "libx264", "-preset", "ultrafast", "-pix_fmt", "yuv420p",
             "-movflags", "+faststart", "-c:a", "aac", out_path],
            check=True,
        )

        yield "✅ Terminé avec succès !", out_path

    except Exception as e:
        traceback.print_exc()
        yield f"❌ Erreur : {str(e)}", None
    finally:
        # The rendered video lives outside tmp_dir, so the working directory
        # (WAV segments, SRT) can always be removed.
        shutil.rmtree(tmp_dir, ignore_errors=True)
145
 
146
+ # 6. INTERFACE GRADIO
147
+ with gr.Blocks(theme=gr.themes.Soft(), css="body {background-color: #0f172a;}") as demo:
148
+ gr.HTML("<h1 style='text-align:center; color:#facc15;'>🤖 ROBOTSMALI TRANSCRIPTION</h1>")
149
 
150
  with gr.Row():
151
  with gr.Column():
152
+ v_in = gr.Video(label="Source Vidéo", sources=["upload", "webcam"])
153
  m_sel = gr.Dropdown(choices=list(MODELS.keys()), value="Soloba V3 (CTC)", label="Modèle IA")
154
+ btn_run = gr.Button("🚀 GÉNÉRER SOUS-TITRES", variant="primary")
155
 
156
+ if EXAMPLE_PATH:
157
+ gr.Markdown("### 💡 Exemple Rapide")
158
+ gr.Examples(examples=[[EXAMPLE_PATH, "Soloba V3 (CTC)"]], inputs=[v_in, m_sel])
159
+
160
  with gr.Column():
161
  status = gr.Markdown("### État\nPrêt.")
162
+ v_out = gr.Video(label="Résultat Final")
163
 
164
  btn_run.click(pipeline, [v_in, m_sel], [status, v_out])
165