Nick021402 commited on
Commit
870bab8
Β·
verified Β·
1 Parent(s): 914c1d3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +223 -0
app.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import whisper
2
+ import yt_dlp
3
+ import moviepy.editor as mp
4
+ import os
5
+ import torch
6
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
7
+ from transformers import pipeline
8
+ import srt
9
+ from datetime import timedelta
10
+ import gradio as gr
11
+ import torchaudio
12
+ import whisper.tokenizer
13
+
14
+ # -----------------------------
15
+ # Helper Functions
16
+ # -----------------------------
17
+
18
def download_youtube_audio(url):
    """Download the audio track of a YouTube video as MP3.

    Args:
        url: YouTube video URL.

    Returns:
        Path to the downloaded file, always "audio.mp3" (fixed by the
        output template plus the FFmpeg MP3 postprocessor below).

    Raises:
        RuntimeError: if yt-dlp fails to download or convert the audio.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        # Fixed basename so the postprocessor always yields "audio.mp3".
        'outtmpl': 'audio.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # The original bound extract_info()'s return value but never used
            # it; download() is the direct way to just fetch the media.
            ydl.download([url])
        return "audio.mp3"
    except Exception as e:
        # Chain the cause so the original traceback stays visible in logs.
        raise RuntimeError(f"Error downloading audio: {str(e)}") from e
34
+
35
def extract_audio_from_video(video_path):
    """Extract the audio track of a local video file to "audio.mp3".

    Args:
        video_path: path to the uploaded video file.

    Returns:
        Path to the extracted audio file ("audio.mp3").

    Raises:
        RuntimeError: if moviepy fails to open the video or write the audio.
    """
    clip = None
    try:
        clip = mp.VideoFileClip(video_path)
        clip.audio.write_audiofile("audio.mp3")
        return "audio.mp3"
    except Exception as e:
        raise RuntimeError(f"Error extracting audio: {str(e)}") from e
    finally:
        # The original leaked the clip's ffmpeg reader processes/handles;
        # always release them, success or failure.
        if clip is not None:
            clip.close()
42
+
43
def generate_srt(segments):
    """Compose an SRT document from transcription segments.

    Args:
        segments: iterable of dicts with 'start'/'end' (seconds) and 'text'.

    Returns:
        The full SRT file contents as a string; segments whose stripped
        text is empty are skipped (their index positions are still counted).
    """
    subtitles = [
        srt.Subtitle(
            index=position,
            start=timedelta(seconds=segment['start']),
            end=timedelta(seconds=segment['end']),
            content=segment['text'].strip(),
        )
        for position, segment in enumerate(segments, start=1)
        if segment['text'].strip()
    ]
    return srt.compose(subtitles)
52
+
53
+ # -----------------------------
54
+ # Transcription Functions
55
+ # -----------------------------
56
+
57
def transcribe_kotani(audio_path):
    """Transcribe audio with OpenAI Whisper "small".

    Args:
        audio_path: path to the audio file.

    Returns:
        (segments, language_code) — Whisper's timestamped segment dicts and
        the auto-detected language code.
    """
    asr_model = whisper.load_model("small", download_root=".")
    # language=None lets Whisper auto-detect the spoken language.
    output = asr_model.transcribe(audio_path, language=None)
    return output["segments"], output["language"]
61
+
62
def transcribe_khaiii(audio_path):
    """Transcribe (assumed Korean) audio with a Wav2Vec2 CTC model.

    Args:
        audio_path: path to the audio file.

    Returns:
        ([{"start", "end", "text"}], "ko") — a single segment spanning the
        whole file; this model yields no word/segment timestamps.
    """
    processor = Wav2Vec2Processor.from_pretrained("khaiii/wav2vec2-xls1r-aishell-korean")
    model = Wav2Vec2ForCTC.from_pretrained("khaiii/wav2vec2-xls1r-aishell-korean")
    speech, sr = torchaudio.load(audio_path)  # speech shape: (channels, samples)
    # BUG FIX: len(speech) is the CHANNEL count of a (channels, samples)
    # tensor, so the original duration was ~1/sr seconds. Use the sample axis.
    duration = speech.shape[-1] / sr
    # Down-mix to mono and resample to the 16 kHz the processor is told to
    # expect; the original fed raw audio regardless of its true sample rate.
    if speech.dim() > 1 and speech.shape[0] > 1:
        speech = speech.mean(dim=0, keepdim=True)
    if sr != 16000:
        speech = torchaudio.functional.resample(speech, sr, 16000)
    input_values = processor(speech.squeeze(), return_tensors="pt", sampling_rate=16000).input_values
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return [{"start": 0, "end": duration, "text": transcription}], "ko"
72
+
73
+ # -----------------------------
74
+ # Translation Function
75
+ # -----------------------------
76
+
77
+ def translate_text(text, src_lang, tgt_lang="en"):
78
+ model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
79
+ try:
80
+ translator = pipeline("translation", model=model_name)
81
+ translated = translator(text, max_length=400)
82
+ return translated[0]['translation_text']
83
+ except Exception as e:
84
+ return f"[Translation error: {str(e)}]"
85
+
86
+ # -----------------------------
87
+ # Main Processing Function
88
+ # -----------------------------
89
+
90
def process_video(youtube_url, video_file, selected_model, translate, target_lang):
    """End-to-end subtitle pipeline, streamed to the UI as a generator.

    Args:
        youtube_url: YouTube URL (takes precedence when non-empty).
        video_file: uploaded file object (read via .name), or None.
        selected_model: "kotani" (Whisper) or anything else (Wav2Vec2).
        translate: whether to translate segments to target_lang.
        target_lang: ISO code of the translation target.

    Yields:
        (status message, subtitle preview text, path to .srt file or None).
        Any exception is reported as a final "❌ Error" status tuple.
    """
    status = "⏳ Starting..."
    yield status, "", None

    try:
        # Step 1: Extract audio
        if youtube_url:
            status = "πŸ“₯ Downloading YouTube audio..."
            yield status, "", None
            audio_path = download_youtube_audio(youtube_url)
        elif video_file:
            status = "πŸ“Ό Extracting audio from video..."
            yield status, "", None
            audio_path = extract_audio_from_video(video_file.name)
        else:
            yield "❌ Please provide a video or YouTube URL", "", None
            return

        # Step 2: Transcribe
        if selected_model == "kotani":
            status = "πŸŽ™οΈ Transcribing using Kotani Whisper Small..."
            yield status, "", None
            segments, lang = transcribe_kotani(audio_path)
        else:
            status = "πŸŽ™οΈ Transcribing using Khaiii Wav2Vec2..."
            yield status, "", None
            segments, lang = transcribe_khaiii(audio_path)

        # BUG FIX: the attribute is LANGUAGES, not LLANGUAGES — the typo
        # raised AttributeError and sent every single run to the error branch.
        lang_desc = whisper.tokenizer.LANGUAGES.get(lang, lang.upper())

        # Step 3: Translate if needed
        if translate:
            status = f"🌐 Translating {lang_desc} to {target_lang.upper()}..."
            yield status, "", None
            segments = [
                {**seg, "text": translate_text(seg['text'], lang, target_lang)}
                for seg in segments
            ]

        # Step 4: Generate SRT
        status = "πŸ“ Generating subtitle file..."
        yield status, "", None
        srt_content = generate_srt(segments)

        # UTF-8 explicitly, so non-ASCII (e.g. Korean) subtitles are written
        # correctly regardless of the platform's default encoding.
        with open("output.srt", "w", encoding="utf-8") as f:
            f.write(srt_content)

        preview = srt_content[:1000] + ("\n..." if len(srt_content) > 1000 else "")
        status = f"βœ… Done! ({lang_desc})"
        yield status, preview, "output.srt"

    except Exception as e:
        yield f"❌ Error: {str(e)}", "", None
144
+
145
# -----------------------------
# UI Layout
# -----------------------------

# Static HTML cards shown beside the two model-selection buttons
# (rendered below via gr.HTML in the Blocks layout).
model_desc_kotani = """
<div style="border:1px solid #ddd; padding: 10px; border-radius:8px;">
<strong>Kotani Whisper Small</strong><br>
β–ͺ Fast & multilingual<br>
β–ͺ Good for quick subtitles<br>
β–ͺ Moderate accuracy for Korean
</div>
"""

model_desc_khaiii = """
<div style="border:1px solid #ddd; padding: 10px; border-radius:8px;">
<strong>Khaiii Wav2Vec2</strong><br>
β–ͺ Best Korean speech recognition<br>
β–ͺ Slower but highly accurate<br>
β–ͺ Only supports Korean
</div>
"""
166
+
167
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🌍 Multilingual Subtitle Generator")
    gr.Markdown("Upload a video or paste a YouTube link. Automatically detect language and optionally translate subtitles.")

    # Single source of truth for the chosen ASR model ("kotani" | "khaiii").
    # The original created this State twice; only the second instance was
    # wired to the buttons and submit, so the duplicate is removed.
    selected_model = gr.State(value="kotani")  # default model

    gr.Markdown("### πŸ” Choose ASR Model")
    with gr.Row():
        with gr.Column():
            kotani_btn = gr.Button("βœ… Select Kotani Whisper Small")
            gr.HTML(model_desc_kotani)
        with gr.Column():
            khaiii_btn = gr.Button("βœ… Select Khaiii Wav2Vec2")
            gr.HTML(model_desc_khaiii)

    def select_kotani():
        return "kotani"

    def select_khaiii():
        return "khaiii"

    kotani_btn.click(fn=select_kotani, outputs=selected_model)
    khaiii_btn.click(fn=select_khaiii, outputs=selected_model)

    gr.Markdown("### πŸ“₯ Input Source")
    with gr.Row():
        youtube_url = gr.Textbox(label="YouTube URL", scale=2)
        # NOTE(review): type="file" is Gradio 3.x API; Gradio 4 renamed it
        # to "filepath" — confirm the pinned gradio version before upgrading.
        video_upload = gr.File(label="Upload Video", type="file", scale=1)

    gr.Markdown("### 🌍 Translation Options")
    with gr.Row():
        translate_checkbox = gr.Checkbox(label="Translate to another language?")
        target_lang = gr.Textbox(label="Target Language Code (e.g., 'en')", value="en", visible=False)

    def toggle_translate(checked):
        # Only show the target-language box when translation is requested.
        return gr.update(visible=checked)

    translate_checkbox.change(fn=toggle_translate, inputs=translate_checkbox, outputs=target_lang)

    status_box = gr.Textbox(label="Status", interactive=False)
    subtitle_preview = gr.Textbox(label="Generated Subtitles", lines=10)
    download_file = gr.File(label="Download .srt File")

    submit_btn = gr.Button("🎬 Generate Subtitles")

    # process_video is a generator, so status/preview stream into the UI.
    submit_btn.click(
        fn=process_video,
        inputs=[youtube_url, video_upload, selected_model, translate_checkbox, target_lang],
        outputs=[status_box, subtitle_preview, download_file]
    )

# Serialize jobs (transcription is heavy) and cap the waiting queue at 20.
# NOTE(review): concurrency_count is Gradio 3.x; Gradio 4 moved this to
# queue(default_concurrency_limit=...) — confirm before upgrading.
demo.queue(concurrency_count=1, max_size=20)

if __name__ == "__main__":
    demo.launch()