binaryMao committed on
Commit
93438d8
·
verified ·
1 Parent(s): bde1ae6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -75
app.py CHANGED
@@ -1,5 +1,5 @@
1
  # -*- coding: utf-8 -*-
2
- import os, shlex, subprocess, tempfile, traceback, textwrap, time
3
  import torch
4
  from huggingface_hub import snapshot_download
5
  from nemo.collections import asr as nemo_asr
@@ -17,22 +17,20 @@ MODELS = {
17
  "QuartzNet V0 (CTC-char)": ("RobotsMali/stt-bm-quartznet15x5-v0", "ctc_char"),
18
  }
19
 
20
- # 2. GESTION DES CHEMINS (Correction du bug de chargement exemple)
21
  def get_absolute_example():
22
- paths = [
23
- os.path.abspath("MARALINKE.mp4"),
24
- os.path.abspath("examples/MARALINKE.mp4"),
25
- "/home/user/app/MARALINKE.mp4",
26
- "/home/user/app/examples/MARALINKE.mp4"
27
- ]
28
- for p in paths:
29
- if os.path.exists(p): return p
30
  return None
31
 
32
  EXAMPLE_PATH = get_absolute_example()
33
  _cache = {}
34
 
35
- # 3. MOTEUR IA NEMO
36
  def load_model(name):
37
  if name in _cache: return _cache[name]
38
  _cache.clear()
@@ -53,65 +51,74 @@ def load_model(name):
53
  _cache[name] = model
54
  return model
55
 
56
- # 4. UTILITAIRES DE SYNCHRONISATION
57
- def format_ts(seconds):
58
- td = time.gmtime(seconds)
59
- ms = int((seconds - int(seconds)) * 1000)
60
  return f"{time.strftime('%H:%M:%S', td)},{ms:03}"
61
 
62
- def get_real_duration(file_path):
63
- cmd = f"ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 {shlex.quote(file_path)}"
64
- res = subprocess.run(cmd, shell=True, capture_output=True, text=True)
65
- try: return float(res.stdout.strip())
66
- except: return 0.0
67
-
68
- # 5. PIPELINE DE TRAITEMENT
69
  def pipeline(video_in, model_name):
 
70
  try:
71
- if not video_in: return "❌ Erreur : Aucune vidéo détectée.", None
 
 
 
 
 
72
 
73
- # Étape A : Extraction Audio
74
- yield " Extraction de l'audio...", None
75
- wav_path = os.path.abspath("temp.wav")
76
- subprocess.run(f"ffmpeg -y -i {shlex.quote(video_in)} -vn -ac 1 -ar 16000 {wav_path}", shell=True, check=True)
77
- duration = get_real_duration(video_in)
78
 
79
- # Étape B : Transcription avec Offsets (Alignement Natif)
80
- yield f"⏳ Transcription IA ({model_name}) avec alignement...", None
81
  model = load_model(model_name)
82
 
83
- # Utilisation de return_hypotheses pour récupérer les timestamps CTC
84
- hypotheses = model.transcribe([wav_path], return_hypotheses=True)[0]
85
- words_with_ts = []
86
-
87
- if hasattr(hypotheses, 'word_offsets') and hypotheses.word_offsets:
88
- offsets = hypotheses.word_offsets
89
- words = hypotheses.text.split()
90
- # Facteur 0.02 (Stride de NeMo) pour convertir frames en secondes
91
- for i, word in enumerate(words):
92
- t_start = offsets[i] * 0.02
93
- words_with_ts.append({"word": word, "start": t_start, "end": t_start + 0.4})
94
- else:
95
- # Fallback temporel linéaire si les offsets ne sont pas disponibles (RNNT)
96
- words = (hypotheses.text if hasattr(hypotheses, 'text') else str(hypotheses)).split()
97
- for i, w in enumerate(words):
98
- words_with_ts.append({"word": w, "start": (i/len(words))*duration, "end": ((i+1)/len(words))*duration})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
- # Étape C : Création du SRT segmenté
101
- yield "⏳ Génération des segments synchronisés...", None
102
- srt_path = os.path.abspath("output.srt")
103
  words_per_line = 6
104
  with open(srt_path, "w", encoding="utf-8") as f:
105
- for i in range(0, len(words_with_ts), words_per_line):
106
- chunk = words_with_ts[i:i+words_per_line]
107
- start_time = chunk[0]['start']
108
- end_time = chunk[-1]['end'] + 0.5
109
- f.write(f"{(i//words_per_line)+1}\n{format_ts(start_time)} --> {format_ts(end_time)}\n")
110
- f.write(" ".join([w['word'] for w in chunk]) + "\n\n")
111
 
112
- # Étape D : Encodage et "Burn-in"
113
- yield "⏳ Incrustation des sous-titres (FastStart)...", None
114
- out_path = os.path.abspath(f"resultat_{int(time.time())}.mp4")
115
  cmd_ffmpeg = (
116
  f"ffmpeg -y -i {shlex.quote(video_in)} "
117
  f"-vf \"subtitles={shlex.quote(srt_path)}:force_style='Alignment=2,FontSize=20,PrimaryColour=&H00FFFF&'\" "
@@ -119,38 +126,33 @@ def pipeline(video_in, model_name):
119
  )
120
  subprocess.run(cmd_ffmpeg, shell=True, check=True)
121
 
122
- yield "✅ Terminé avec succès !", out_path
123
 
124
  except Exception as e:
125
  traceback.print_exc()
126
  yield f"❌ Erreur : {str(e)}", None
127
 
128
- # 6. INTERFACE GRADIO (Webcam + Example Fix)
129
  with gr.Blocks(theme=gr.themes.Soft(), css="body {background-color: #0b1120;}") as demo:
130
- gr.HTML("<h1 style='text-align:center; color:#facc15;'>🤖 ROBOTSMALI V10.5</h1>")
 
131
 
132
  with gr.Row():
133
  with gr.Column():
134
- gr.Markdown("### 📥 SOURCE")
135
- # Supporte l'upload ET la webcam
136
- v_in = gr.Video(label="Webcam ou Fichier", sources=["upload", "webcam"], interactive=True)
137
-
138
- if EXAMPLE_PATH:
139
- btn_demo = gr.Button("📂 CHARGER LA VIDÉO D'EXEMPLE", variant="secondary")
140
 
141
  m_sel = gr.Dropdown(list(MODELS.keys()), value="Soloba V1 (CTC)", label="Modèle IA")
142
- btn_run = gr.Button("🚀 GÉNÉRER", variant="primary")
143
 
 
 
 
 
144
  with gr.Column():
145
- gr.Markdown("### 📤 RÉSULTAT")
146
  status = gr.Markdown("### État\nPrêt")
147
- v_out = gr.Video(label="Vidéo finale synchronisée")
148
 
149
- # Actions
150
- if EXAMPLE_PATH:
151
- btn_demo.click(fn=lambda: EXAMPLE_PATH, outputs=v_in)
152
-
153
  btn_run.click(pipeline, [v_in, m_sel], [status, v_out])
154
 
155
  if __name__ == "__main__":
156
- demo.launch(share=True, debug=True)
 
1
  # -*- coding: utf-8 -*-
2
+ import os, shlex, subprocess, tempfile, traceback, textwrap, time, glob
3
  import torch
4
  from huggingface_hub import snapshot_download
5
  from nemo.collections import asr as nemo_asr
 
17
  "QuartzNet V0 (CTC-char)": ("RobotsMali/stt-bm-quartznet15x5-v0", "ctc_char"),
18
  }
19
 
20
# 2. Locate the example video
def get_absolute_example():
    """Return the absolute path of the bundled example video, or None.

    Each candidate directory is probed, in order, for each candidate file
    name, and the first match wins. Only regular files are accepted: a
    directory that happens to carry one of the candidate names is skipped,
    because the result is later used as a video file path.

    Returns:
        The absolute path of the first existing example file, or ``None``
        when no candidate exists.
    """
    names = ("MARALINKE.mp4", "maralinke.mp4", "example.mp4")
    dirs = (".", "examples", "/home/user/app", "/home/user/app/examples")
    for d in dirs:
        for n in names:
            p = os.path.join(d, n)
            # isfile (not exists): reject directories and other non-files.
            if os.path.isfile(p):
                return os.path.abspath(p)
    return None
29
 
30
  EXAMPLE_PATH = get_absolute_example()
31
  _cache = {}
32
 
33
+ # 3. CHARGEMENT DES MODÈLES IA
34
  def load_model(name):
35
  if name in _cache: return _cache[name]
36
  _cache.clear()
 
51
  _cache[name] = model
52
  return model
53
 
54
# 4. SRT timestamp formatting helper
def format_srt_time(sec):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is converted to whole milliseconds once (rounded to the
    nearest millisecond) and then split with integer arithmetic, so float
    representation error cannot drop a millisecond and the hour field keeps
    counting past 24 instead of wrapping around.

    Args:
        sec: Offset from the start of the media, in seconds (int or float).
            Negative values are clamped to zero.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"``.
    """
    total_ms = max(0, round(sec * 1000))
    total_s, ms = divmod(total_ms, 1000)
    total_min, s = divmod(total_s, 60)
    h, m = divmod(total_min, 60)
    return f"{h:02}:{m:02}:{s:02},{ms:03}"
59
 
60
+ # 5. PIPELINE DE TRAITEMENT (SEGMENTATION 10S + OFFSETS)
 
 
 
 
 
 
61
  def pipeline(video_in, model_name):
62
+ tmp_dir = tempfile.mkdtemp()
63
  try:
64
+ if not video_in: return "❌ Erreur : Source vide", None
65
+
66
+ # Étape A : Extraction et Segmentation Audio
67
+ yield "⏳ Découpage de l'audio en segments de 10s...", None
68
+ full_wav = os.path.join(tmp_dir, "full.wav")
69
+ subprocess.run(f"ffmpeg -y -i {shlex.quote(video_in)} -vn -ac 1 -ar 16000 {full_wav}", shell=True, check=True)
70
 
71
+ segment_pattern = os.path.join(tmp_dir, "seg_%03d.wav")
72
+ subprocess.run(f"ffmpeg -i {full_wav} -f segment -segment_time 10 -c copy {segment_pattern}", shell=True, check=True)
 
 
 
73
 
74
+ audio_segments = sorted(glob.glob(os.path.join(tmp_dir, "seg_*.wav")))
 
75
  model = load_model(model_name)
76
 
77
+ # Étape B : Transcription segmentée avec Offsets natifs
78
+ all_words_ts = []
79
+ for idx, seg_path in enumerate(audio_segments):
80
+ base_time = idx * 10.0
81
+ yield f"⏳ IA : Transcription segment {idx+1}/{len(audio_segments)}...", None
82
+
83
+ # Utilisation de return_hypotheses pour les timestamps
84
+ hyp = model.transcribe([seg_path], return_hypotheses=True)[0]
85
+
86
+ if hasattr(hyp, 'word_offsets') and hyp.word_offsets:
87
+ words = hyp.text.split()
88
+ for i, word in enumerate(words):
89
+ # Facteur de conversion frame->seconde (standard NeMo 0.02)
90
+ rel_start = hyp.word_offsets[i] * 0.02
91
+ all_words_ts.append({
92
+ "word": word,
93
+ "start": base_time + rel_start,
94
+ "end": base_time + rel_start + 0.45
95
+ })
96
+ else:
97
+ # Fallback temporel si offsets non dispos
98
+ words = (hyp.text if hasattr(hyp, 'text') else str(hyp)).split()
99
+ if words:
100
+ gap = 10.0 / len(words)
101
+ for i, w in enumerate(words):
102
+ all_words_ts.append({
103
+ "word": w,
104
+ "start": base_time + (i * gap),
105
+ "end": base_time + ((i+1) * gap)
106
+ })
107
 
108
+ # Étape C : Génération du SRT optimisé
109
+ yield "⏳ Création du fichier de sous-titres...", None
110
+ srt_path = os.path.join(tmp_dir, "final.srt")
111
  words_per_line = 6
112
  with open(srt_path, "w", encoding="utf-8") as f:
113
+ for i in range(0, len(all_words_ts), words_per_line):
114
+ chunk = all_words_ts[i:i+words_per_line]
115
+ f.write(f"{(i//words_per_line)+1}\n")
116
+ f.write(f"{format_srt_time(chunk[0]['start'])} --> {format_srt_time(chunk[-1]['end'])}\n")
117
+ f.write(" ".join([c['word'] for c in chunk]) + "\n\n")
 
118
 
119
+ # Étape D : Incrustation Finale (Burn-in)
120
+ yield "⏳ Rendu vidéo final...", None
121
+ out_path = os.path.abspath(f"robotsmali_final_{int(time.time())}.mp4")
122
  cmd_ffmpeg = (
123
  f"ffmpeg -y -i {shlex.quote(video_in)} "
124
  f"-vf \"subtitles={shlex.quote(srt_path)}:force_style='Alignment=2,FontSize=20,PrimaryColour=&H00FFFF&'\" "
 
126
  )
127
  subprocess.run(cmd_ffmpeg, shell=True, check=True)
128
 
129
+ yield "✅ Synchronisation parfaite terminée !", out_path
130
 
131
  except Exception as e:
132
  traceback.print_exc()
133
  yield f"❌ Erreur : {str(e)}", None
134
 
135
+ # 6. INTERFACE UTILISATEUR GRADIO
136
  with gr.Blocks(theme=gr.themes.Soft(), css="body {background-color: #0b1120;}") as demo:
137
+ gr.HTML("<h1 style='text-align:center; color:#facc15;'>🤖 ROBOTSMALI V12.5</h1>")
138
+ gr.Markdown("<p style='text-align:center; color:white;'>Segmentation 10s + Offsets Natifs NeMo</p>")
139
 
140
  with gr.Row():
141
  with gr.Column():
142
+ v_in = gr.Video(label="Source (Webcam ou Fichier)", sources=["upload", "webcam"], interactive=True)
 
 
 
 
 
143
 
144
  m_sel = gr.Dropdown(list(MODELS.keys()), value="Soloba V1 (CTC)", label="Modèle IA")
145
+ btn_run = gr.Button("🚀 GÉNÉRER SOUS-TITRES", variant="primary")
146
 
147
+ if EXAMPLE_PATH:
148
+ gr.Markdown("### 💡 Exemple")
149
+ gr.Examples(examples=[[EXAMPLE_PATH, "Soloba V1 (CTC)"]], inputs=[v_in, m_sel])
150
+
151
  with gr.Column():
 
152
  status = gr.Markdown("### État\nPrêt")
153
+ v_out = gr.Video(label="Résultat Final")
154
 
 
 
 
 
155
  btn_run.click(pipeline, [v_in, m_sel], [status, v_out])
156
 
157
  if __name__ == "__main__":
158
+ demo.launch(debug=True)