binaryMao committed on
Commit
4dcffa6
·
verified ·
1 Parent(s): 722dfed

Create app.py

Browse files

Initial release of the RobotsMali Caption Studio.
Adds a full Bambara speech-to-text video subtitling pipeline using the six official ASR models (V0 and V1 of Soloni, Soloba, and QuartzNet). Includes timestamp alignment, hard-subtitle rendering, and a Gradio web interface.

Files changed (1) hide show
  1. app.py +130 -0
app.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, warnings, tempfile
2
+ warnings.filterwarnings("ignore")
3
+
4
def allow_imagemagick_text_rendering(policy_path="/etc/ImageMagick-6/policy.xml"):
    """Lift the ``rights="none"`` restrictions in an ImageMagick policy file.

    Every ``rights="none"`` attribute in *policy_path* is rewritten to
    ``rights="read|write"``. The edit is done in-process instead of
    shelling out to ``sed``, so it needs neither a shell nor ``sed``.

    This is a best-effort step: a missing, unreadable or read-only policy
    file is not an error and simply leaves the file untouched.

    Args:
        policy_path: Location of the ImageMagick ``policy.xml`` to patch.

    Returns:
        True when the file was rewritten, False when nothing was changed
        (no restricted entry, or the file could not be read or written).
    """
    try:
        with open(policy_path, encoding="utf-8") as fh:
            original = fh.read()
        patched = original.replace('rights="none"', 'rights="read|write"')
        if patched == original:
            # Already permissive: avoid a pointless rewrite.
            return False
        with open(policy_path, "w", encoding="utf-8") as fh:
            fh.write(patched)
    except (OSError, UnicodeError):
        # Same best-effort contract as the former `sed ... || true`.
        return False
    return True


# Allow ImageMagick before the video libraries are imported.
allow_imagemagick_text_rendering()
6
+
7
+ import gradio as gr
8
+ import numpy as np
9
+ import soundfile as sf
10
+ import torch
11
+ from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
12
+ from nemo.collections import asr as nemo_asr
13
+ from nemo.collections.asr.parts.ds_cluster import convert_rnnt_to_word_timestamps
14
+
15
# Sample rate in Hz used when the audio track is extracted and written to disk.
SR = 16_000

# Label shown in the model dropdown -> pretrained model name handed to NeMo.
# Insertion order is the order in which the labels are listed in the UI.
ASR_MODELS = {
    # Soloba (CTC)
    "Soloba CTC 0.6B V0": "RobotsMali/soloba-ctc-0.6b-v0",
    "Soloba CTC 0.6B V1": "RobotsMali/soloba-ctc-0.6b-v1",
    # Soloni (TDT-CTC)
    "Soloni 114M TDT CTC V0": "RobotsMali/soloni-114m-tdt-ctc-V0",
    "Soloni 114M TDT CTC V1": "RobotsMali/soloni-114m-tdt-ctc-v1",
    # QuartzNet
    "QuartzNet BM V0": "RobotsMali/stt-bm-quartznet15x5-V0",
    "QuartzNet BM V1": "RobotsMali/stt-bm-quartznet15x5-V1",
}

# Model label -> (model, device) pairs that have already been loaded.
_MODEL_CACHE = {}
27
+
28
def load_asr(model_key):
    """Return the ``(model, device)`` pair for *model_key*, loading it on first use.

    Args:
        model_key: One of the labels in ``ASR_MODELS``.

    Returns:
        A tuple ``(model, device)`` where ``device`` is ``"cuda"`` when CUDA
        is available and ``"cpu"`` otherwise. The pair is memoised in
        ``_MODEL_CACHE`` so each model is loaded once per process.

    Raises:
        KeyError: If *model_key* is not a key of ``ASR_MODELS``.
    """
    cached = _MODEL_CACHE.get(model_key)
    if cached is not None:
        return cached

    device = "cuda" if torch.cuda.is_available() else "cpu"
    pretrained_name = ASR_MODELS[model_key]
    model = nemo_asr.models.ASRModel.from_pretrained(model_name=pretrained_name)
    # Move to the target device and switch to inference mode before caching.
    entry = (model.to(device).eval(), device)
    _MODEL_CACHE[model_key] = entry
    return entry
36
+
37
def extract_audio(video_path, wav_path):
    """Write the audio track of a video to *wav_path* as a mono signal.

    The track is resampled to ``SR`` Hz, multi-channel audio is averaged
    down to a single channel, and the samples are stored as float32.

    Args:
        video_path: Path of the input video.
        wav_path: Destination path of the audio file.

    Returns:
        Duration of the written audio in seconds.

    Raises:
        ValueError: If the video has no audio track.
    """
    with VideoFileClip(video_path) as clip:
        if clip.audio is None:
            # Without this check the failure is an AttributeError on None.
            raise ValueError(f"No audio track found in {video_path!r}")
        samples = clip.audio.to_soundarray(fps=SR)

    if samples.ndim > 1:
        # Down-mix all channels to mono.
        samples = samples.mean(axis=1)
    sf.write(wav_path, samples.astype(np.float32), SR)
    return len(samples) / SR
44
+
45
def transcribe_with_timestamps(model, device, wav_path, model_key):
    """Transcribe *wav_path* and return one subtitle cue per word.

    Two strategies are used depending on the selected model:

    * labels containing ``"Soloni"`` take word timestamps from the model's
      own alignment;
    * every other model is transcribed as plain text and the words are
      spread over the timeline at a fixed rate of
      ``max(1.9, words / duration)`` words per second.

    Args:
        model: ASR model as returned by ``load_asr``.
        device: Torch device string on which *model* lives.
        wav_path: Path of the audio file to transcribe.
        model_key: Label of the selected model (a key of ``ASR_MODELS``).

    Returns:
        A list of ``(start_seconds, end_seconds, word)`` tuples. The list is
        empty when the audio has no samples or no word was recognised.
    """
    audio, sr = sf.read(wav_path)
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    total_duration = len(audio) / sr
    if total_duration <= 0:
        # Nothing to transcribe; also protects the rate division below.
        return []

    # Soloni -> native alignment.
    if "Soloni" in model_key:
        # The waveform tensor is only needed on this branch, so it is built
        # here rather than being uploaded to the device for every model.
        audio_t = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
        audio_len = torch.tensor([audio_t.shape[1]]).to(device)
        with torch.no_grad():
            # NeMo typed modules only accept keyword arguments.
            proc, proc_len = model.preprocessor(input_signal=audio_t, length=audio_len)
            # NOTE(review): the preprocessor output is passed as
            # `encoder_output` without an encoder pass, and neither
            # `decode_and_align` nor `convert_rnnt_to_word_timestamps` could
            # be confirmed as NeMo APIs - verify against the installed version.
            hyps = model.decode_and_align(encoder_output=proc, encoded_lengths=proc_len)
        hyp = hyps[0][0] if isinstance(hyps[0], list) else hyps[0]
        words = convert_rnnt_to_word_timestamps(hyp, model.tokenizer)
        return [(w.start_time, w.end_time, w.word) for w in words]

    # Soloba / QuartzNet -> fixed-rate alignment.
    first = model.transcribe([wav_path], batch_size=1)[0]
    if isinstance(first, (list, tuple)):
        # Presumably a (best, all) hypotheses pair: keep the first best one.
        first = first[0]
    # Depending on the NeMo version the entry is a plain string or a
    # Hypothesis-like object exposing the transcript as `.text`.
    text = first if isinstance(first, str) else (getattr(first, "text", "") or "")
    words = text.split()
    if not words:
        return []

    wps = max(1.9, len(words) / total_duration)
    step = 1.0 / wps
    timeline = []
    for idx, word in enumerate(words):
        # Index-based start times avoid accumulating floating-point error.
        start = idx * step
        if start >= total_duration:
            break
        timeline.append((start, min(total_duration, start + step), word))
    return timeline
77
+
78
def burn_subtitles(video_path, subs, out_path="output_captioned.mp4"):
    """Render *subs* onto the video and write the captioned result.

    Each cue is drawn in upper case, white with a black outline, horizontally
    centred at 88 % of the frame height; the font size is 1/20 of the frame
    height and the text box spans 90 % of the frame width.

    Args:
        video_path: Path of the source video.
        subs: Iterable of ``(start_seconds, end_seconds, word)`` cues.
        out_path: Where to write the captioned video. Defaults to
            ``"output_captioned.mp4"`` in the current working directory.

    Returns:
        *out_path*, the path of the written video.
    """
    overlays = []
    final = None
    # The context manager releases the source clip's readers when done.
    with VideoFileClip(video_path) as clip:
        width, height = clip.size
        try:
            for start, end, word in subs:
                # Zero/negative-length or blank cues cannot be rendered.
                if end <= start or not word.strip():
                    continue
                overlay = (
                    TextClip(
                        word.upper(),
                        fontsize=int(height / 20),
                        color="white",
                        stroke_color="black",
                        stroke_width=2,
                        method="caption",
                        size=(int(width * 0.9), None),
                    )
                    .set_start(start)
                    .set_duration(end - start)
                    .set_position(("center", int(height * 0.88)))
                )
                overlays.append(overlay)

            final = CompositeVideoClip([clip] + overlays)
            final.write_videofile(
                out_path,
                codec="libx264",
                audio_codec="aac",
                verbose=False,
                logger=None,
            )
        finally:
            # Release the composite and every overlay even if rendering fails.
            if final is not None:
                final.close()
            for overlay in overlays:
                overlay.close()
    return out_path
94
+
95
def pipeline(video, model_key, progress=gr.Progress()):
    """Run the full captioning flow for one uploaded video.

    Steps: load the selected ASR model, extract the audio track to a
    temporary file, transcribe it into timed words, then burn those words
    into the video.

    Args:
        video: Path of the uploaded video (None when nothing was uploaded).
        model_key: Label of the selected model (a key of ``ASR_MODELS``).
        progress: Gradio progress tracker; the ``gr.Progress()`` default is
            how Gradio injects the tracker into the handler.

    Returns:
        A ``(status_markdown, output_video_path)`` tuple.

    Raises:
        gr.Error: If no video was uploaded.
    """
    if not video:
        # Fail with a readable message instead of crashing inside moviepy.
        raise gr.Error("Veuillez importer une vidéo avant de lancer le sous-titrage.")

    progress(0.2, "🧠 Chargement du modèle…")
    model, device = load_asr(model_key)

    # The extracted audio is only needed until transcription has finished.
    with tempfile.TemporaryDirectory() as td:
        wav_path = os.path.join(td, "audio.wav")
        progress(0.4, "🎧 Extraction audio…")
        extract_audio(video, wav_path)

        progress(0.65, "✍🏾 Transcription Bambara…")
        subs = transcribe_with_timestamps(model, device, wav_path, model_key)

    progress(0.85, "🎞️ Incrustation des sous-titres…")
    out = burn_subtitles(video, subs)

    progress(1.0, "✅ Terminé.")
    return f"✅ Sous-titres générés avec **{model_key}**", out
112
+
113
# Page styling: centred white card, blue headings and buttons.
CSS = """
body { background:#F5F8FC; font-family:Inter, sans-serif; color:#222; }
.gradio-container { max-width:880px; margin:auto; background:white; border-radius:16px; padding:28px; box-shadow:0 8px 24px rgba(0,0,0,0.08)}
h1 { text-align:center; color:#007BFF; font-weight:800; }
p { text-align:center; color:#5A6B85; }
.gr-button { background:#007BFF !important; color:white !important; font-weight:700; border-radius:8px; }
"""

# UI: a video upload and a model selector feed `pipeline`, which fills in
# the status line and the captioned video player.
with gr.Blocks(title="RobotsMali Caption Studio", css=CSS) as demo:
    gr.Markdown("<h1>RobotsMali Caption Studio</h1><p>Sous-titrage Bambara Automatique</p><hr>")
    video_in = gr.File(label="🎥 Importer une vidéo", type="filepath")
    model_sel = gr.Dropdown(
        list(ASR_MODELS.keys()),
        value="Soloni 114M TDT CTC V1",
        label="🧠 Modèle ASR",
    )
    btn = gr.Button("🚀 Générer les sous-titres")
    status = gr.Markdown()
    video_out = gr.Video()
    btn.click(pipeline, inputs=[video_in, model_sel], outputs=[status, video_out])

# Start the server only when run as a script, so importing this module
# (e.g. to reuse `demo`) does not launch the app as a side effect.
if __name__ == "__main__":
    demo.launch()