Spaces:
Sleeping
Sleeping
File size: 9,102 Bytes
845138a 870bab8 635d08b e6c3f01 870bab8 635d08b 870bab8 635d08b 4d96ed6 635d08b 870bab8 635d08b 870bab8 e6c3f01 4d96ed6 e6c3f01 4d96ed6 e6c3f01 4d96ed6 e6c3f01 4d96ed6 e6c3f01 870bab8 e6c3f01 870bab8 4d96ed6 870bab8 845138a 870bab8 e6c3f01 870bab8 e6c3f01 870bab8 e6c3f01 870bab8 e6c3f01 870bab8 4d96ed6 870bab8 0eeb335 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 |
from whisper.tokenizer import LANGUAGES as LLANGUAGES
import whisper
import yt_dlp
import os
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import pipeline
import srt
from datetime import timedelta
import gradio as gr
import torchaudio
import whisper.tokenizer
import ffmpeg
import time
# -----------------------------
# Helper Functions
# -----------------------------
def download_youtube_audio(url):
    """Download the audio track of *url* with yt-dlp and convert it to MP3.

    The output template is fixed, so the result always lands in
    ``audio.mp3`` in the current working directory and replaces whatever a
    previous call left there.

    Args:
        url: Address of the video to fetch.

    Returns:
        The string ``"audio.mp3"``.

    Raises:
        RuntimeError: If anything goes wrong while downloading or
            converting; the original exception is attached as ``__cause__``.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'audio.%(ext)s',
        # A watch URL carrying a "list=" parameter would otherwise fetch the
        # whole playlist, with every entry written to the same output file.
        'noplaylist': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info(url, download=True)
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise RuntimeError(f"Error downloading audio: {e}") from e
    return "audio.mp3"
def extract_audio_from_video(video_path):
    """Extract the audio track of a video file into ``audio.mp3``.

    The audio is re-encoded with libmp3lame as 16 kHz mono and written to
    ``audio.mp3`` in the current working directory, overwriting any
    existing file.

    Args:
        video_path: Path of the input video.

    Returns:
        The string ``"audio.mp3"``.

    Raises:
        RuntimeError: If ffmpeg fails; the original exception is attached
            as ``__cause__``.
    """
    output_audio = "audio.mp3"
    try:
        ffmpeg.input(video_path).output(
            output_audio,
            vn=None,              # Disable video
            acodec="libmp3lame",  # Use MP3 codec
            ar="16000",           # 16kHz sample rate
            ac="1",               # Mono channel
        ).run(quiet=True, overwrite_output=True)
    except Exception as e:
        # With quiet=True the ffmpeg output is captured rather than shown;
        # when the exception carries it as ``stderr`` bytes, report its last
        # lines, which are normally more useful than the bare str(e).
        stderr = getattr(e, "stderr", None)
        if isinstance(stderr, bytes) and stderr.strip():
            tail = stderr.decode("utf-8", errors="replace").strip().splitlines()[-3:]
            detail = " | ".join(line.strip() for line in tail)
        else:
            detail = str(e)
        raise RuntimeError(f"FFmpeg error: {detail}") from e
    print(f"Audio extracted to: {output_audio}")
    return output_audio
def generate_srt(segments):
    """Render transcription segments as the text of an SRT file.

    Args:
        segments: Iterable of mappings with ``'start'`` and ``'end'``
            (seconds) and ``'text'`` keys.

    Returns:
        The composed SRT document as a string. Segments whose text is empty
        after stripping whitespace are left out.
    """
    entries = []
    for number, segment in enumerate(segments, start=1):
        begins_at = timedelta(seconds=segment['start'])
        ends_at = timedelta(seconds=segment['end'])
        caption = segment['text'].strip()
        if not caption:
            # Nothing to display for this segment.
            continue
        entries.append(
            srt.Subtitle(index=number, start=begins_at, end=ends_at, content=caption)
        )
    return srt.compose(entries)
# -----------------------------
# Model Loading Functions
# -----------------------------
def load_kotani_model():
    """Load the Whisper "small" checkpoint once and report a status line.

    The loaded model object is discarded; the call only has the side effect
    of running ``whisper.load_model`` with ``download_root="."``.

    Returns:
        The status text that was printed before loading started.

    NOTE(review): the returned text still reads "Loading ..." although the
    load has already finished by the time it is returned - confirm this is
    the message the UI should show.
    """
    loading_message = "π₯ Loading Kotani Whisper Small model..."
    print(loading_message)
    whisper.load_model("small", download_root=".")
    print("Model loaded successfully.")
    return loading_message
def load_khaiii_model():
    """Load the Khaiii Wav2Vec2 processor and model once and report a status line.

    Both loaded objects are discarded; the call only has the side effect of
    running ``from_pretrained`` for the processor and then for the model.

    Returns:
        The status text that was printed before loading started.

    NOTE(review): the returned text still reads "Loading ..." although the
    load has already finished by the time it is returned - confirm this is
    the message the UI should show.
    """
    loading_message = "π₯ Loading Khaiii Wav2Vec2 model..."
    print(loading_message)
    checkpoint = "khaiii/wav2vec2-xls1r-aishell-korean"
    # Processor first, then the CTC model.
    for component in (Wav2Vec2Processor, Wav2Vec2ForCTC):
        component.from_pretrained(checkpoint)
    print("Model loaded successfully.")
    return loading_message
# -----------------------------
# Transcription Functions
# -----------------------------
def transcribe_kotani(audio_path):
    """Transcribe an audio file with the Whisper "small" checkpoint.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        A ``(segments, language)`` tuple taken from the ``"segments"`` and
        ``"language"`` entries of Whisper's result.
    """
    whisper_model = whisper.load_model("small", download_root=".")
    # language=None: let Whisper auto-detect the spoken language.
    outcome = whisper_model.transcribe(audio_path, language=None)
    segments = outcome["segments"]
    detected_language = outcome["language"]
    return segments, detected_language
def transcribe_khaiii(audio_path):
    """Transcribe an audio file with the Khaiii Wav2Vec2 CTC model.

    The audio is down-mixed to mono and resampled to 16 kHz before it is
    handed to the processor, so the ``sampling_rate`` passed to the
    processor matches the data.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        A ``(segments, language)`` tuple. ``segments`` holds one entry that
        spans the whole clip (``start`` 0, ``end`` the clip length in
        seconds); ``language`` is always ``"ko"``.
    """
    checkpoint = "khaiii/wav2vec2-xls1r-aishell-korean"
    target_sr = 16000

    processor = Wav2Vec2Processor.from_pretrained(checkpoint)
    model = Wav2Vec2ForCTC.from_pretrained(checkpoint)

    # torchaudio.load returns a (channels, samples) tensor plus the file's
    # own sample rate.
    speech, sr = torchaudio.load(audio_path)

    # Clip length comes from the sample axis; len(speech) would count
    # channels instead.
    duration = speech.shape[-1] / sr

    if speech.shape[0] > 1:
        # Average the channels so a stereo file becomes one waveform.
        speech = speech.mean(dim=0, keepdim=True)
    if sr != target_sr:
        speech = torchaudio.functional.resample(speech, sr, target_sr)

    input_values = processor(
        speech.squeeze(0).numpy(),
        return_tensors="pt",
        sampling_rate=target_sr,
    ).input_values

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    return [{"start": 0, "end": duration, "text": transcription}], "ko"
# -----------------------------
# Translation Function
# -----------------------------
# Translation pipelines already built, keyed by model name, so that a
# subtitle file with many segments constructs each pipeline only once.
_TRANSLATION_PIPELINES = {}


def translate_text(text, src_lang, tgt_lang="en"):
    """Translate *text* with the Helsinki-NLP opus-mt model for the language pair.

    Args:
        text: Text to translate.
        src_lang: Source language code used in the model name.
        tgt_lang: Target language code used in the model name.

    Returns:
        The translated text. Blank or whitespace-only strings are returned
        unchanged without touching the model. On any failure the function
        does not raise; it returns ``"[Translation error: ...]"`` instead.
    """
    if isinstance(text, str) and not text.strip():
        return text

    model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    try:
        translator = _TRANSLATION_PIPELINES.get(model_name)
        if translator is None:
            translator = pipeline("translation", model=model_name)
            # Only successful builds are remembered; failures are retried.
            _TRANSLATION_PIPELINES[model_name] = translator
        translated = translator(text, max_length=400)
        return translated[0]['translation_text']
    except Exception as e:
        return f"[Translation error: {str(e)}]"
# -----------------------------
# Main Processing Function
# -----------------------------
def process_video(youtube_url, video_file, selected_model, translate, target_lang):
    """Produce an SRT subtitle file from a YouTube URL or an uploaded video.

    This is a generator: each step yields a ``(status, preview, srt_path)``
    tuple. ``preview`` and ``srt_path`` are ``""`` and ``None`` until the
    last yield, which carries the first 1000 characters of the subtitles and
    the path ``"output.srt"``. Exceptions are not propagated; they are
    reported through a final error status.

    Args:
        youtube_url: URL to download audio from. Used when non-empty, in
            preference to ``video_file``.
        video_file: Uploaded video, either a filesystem path or an object
            that exposes the path as ``.name``.
        selected_model: ``"kotani"`` selects ``transcribe_kotani``; any
            other value selects ``transcribe_khaiii``.
        translate: When truthy, translate every segment.
        target_lang: Target language code for translation. It is stripped
            and lower-cased; a blank value falls back to ``"en"``.

    NOTE(review): the leading glyphs of the status strings look like
    mis-encoded emoji. They are left untouched, except for the final "Done"
    message whose literal was split across two lines in the source and has
    been rejoined - confirm the intended glyphs.
    """
    status = "β³ Starting..."
    yield status, "", None
    try:
        # Step 1: Extract audio
        if youtube_url:
            status = "π₯ Downloading YouTube audio..."
            yield status, "", None
            audio_path = download_youtube_audio(youtube_url)
        elif video_file:
            status = "πΌ Waiting for upload to complete..."
            yield status, "", None
            # The upload can arrive as a plain path (str / PathLike) or as a
            # file-like object carrying the path in ``.name``; accept both.
            if isinstance(video_file, (str, os.PathLike)):
                video_path = os.fspath(video_file)
            else:
                video_path = video_file.name
            # Wait until file exists
            start_time = time.time()
            while not os.path.exists(video_path):
                if time.time() - start_time > 30:
                    raise RuntimeError("Timeout: File upload took too long.")
                time.sleep(1)
            status = "πΌ Extracting audio from video..."
            yield status, "", None
            audio_path = extract_audio_from_video(video_path)
        else:
            yield "β Please provide a video or YouTube URL", "", None
            return

        # Debug: Confirm audio path
        print(f"Audio path: {audio_path}")

        # Step 2: Transcribe
        if selected_model == "kotani":
            status = "ποΈ Transcribing using Kotani Whisper Small..."
            yield status, "", None
            segments, lang = transcribe_kotani(audio_path)
        else:
            status = "ποΈ Transcribing using Khaiii Wav2Vec2..."
            yield status, "", None
            segments, lang = transcribe_khaiii(audio_path)
        lang_desc = LLANGUAGES.get(lang, lang.upper())

        # Step 3: Translate if needed
        target = (target_lang or "").strip().lower() or "en"
        # Translating a language into itself would only request a model
        # name with identical source and target codes, so skip that case.
        if translate and target != lang:
            status = f"π Translating {lang_desc} to {target.upper()}..."
            yield status, "", None
            segments = [
                {**seg, "text": translate_text(seg['text'], lang, target)}
                for seg in segments
            ]

        # Step 4: Generate SRT
        status = "π Generating subtitle file..."
        yield status, "", None
        srt_content = generate_srt(segments)
        # Explicit UTF-8 so non-ASCII subtitles are written the same way on
        # every platform, whatever the locale's default encoding is.
        with open("output.srt", "w", encoding="utf-8") as f:
            f.write(srt_content)
        preview = srt_content[:1000] + ("\n..." if len(srt_content) > 1000 else "")
        status = f"✅ Done! ({lang_desc})"
        yield status, preview, "output.srt"
    except Exception as e:
        yield f"β Error: {str(e)}", "", None
# -----------------------------
# UI Layout
# -----------------------------
# Inline CSS shared by both model description cards.
_CARD_STYLE = "border:1px solid #ddd; padding: 10px; border-radius:8px;"


def _model_card(title, bullet_points):
    """Return the HTML snippet describing one ASR model.

    The snippet is a bordered ``<div>`` with the title in bold followed by
    one bullet line per entry of ``bullet_points``.
    """
    bullet_html = "<br>\n".join(f"βͺ {point}" for point in bullet_points)
    return (
        f'\n<div style="{_CARD_STYLE}">\n'
        f"<strong>{title}</strong><br>\n"
        f"{bullet_html}\n"
        "</div>\n"
    )


# Description card for the Whisper-based model.
model_desc_kotani = _model_card(
    "Kotani Whisper Small",
    [
        "Fast & multilingual",
        "Good for quick subtitles",
        "Moderate accuracy for Korean",
    ],
)

# Description card for the Wav2Vec2-based model.
model_desc_khaiii = _model_card(
    "Khaiii Wav2Vec2",
    [
        "Best Korean speech recognition",
        "Slower but highly accurate",
        "Only supports Korean",
    ],
)
# NOTE(review): the source had lost its indentation and two button labels
# were split across lines; the nesting below is reconstructed from the
# statement order and the labels are rejoined - confirm layout and glyphs.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    status_box = gr.Textbox(label="Status", interactive=False)
    gr.Markdown("## π Multilingual Subtitle Generator")
    gr.Markdown("Upload a video or paste a YouTube link. Automatically detect language and optionally translate subtitles.")

    # Holds the key of the chosen ASR model between events.
    selected_model = gr.State(value="kotani")  # default model

    gr.Markdown("### π Choose ASR Model")
    with gr.Row():
        with gr.Column():
            kotani_btn = gr.Button("✅ Select Kotani Whisper Small")
            gr.HTML(model_desc_kotani)
        with gr.Column():
            khaiii_btn = gr.Button("✅ Select Khaiii Wav2Vec2")
            gr.HTML(model_desc_khaiii)

    def select_kotani():
        """Load the Kotani model and make it the selected one."""
        msg = load_kotani_model()
        return "kotani", msg

    def select_khaiii():
        """Load the Khaiii model and make it the selected one."""
        msg = load_khaiii_model()
        return "khaiii", msg

    kotani_btn.click(fn=select_kotani, outputs=[selected_model, status_box])
    khaiii_btn.click(fn=select_khaiii, outputs=[selected_model, status_box])

    gr.Markdown("### π₯ Input Source")
    with gr.Row():
        youtube_url = gr.Textbox(label="YouTube URL", scale=2)
        video_upload = gr.File(label="Upload Video", type="filepath", file_types=["video"], scale=1)

    gr.Markdown("### π Translation Options")
    with gr.Row():
        translate_checkbox = gr.Checkbox(label="Translate to another language?")
        target_lang = gr.Textbox(label="Target Language Code (e.g., 'en')", value="en", visible=False)

    def toggle_translate(checked):
        """Show the target-language box only while translation is ticked."""
        return gr.update(visible=checked)

    translate_checkbox.change(fn=toggle_translate, inputs=translate_checkbox, outputs=target_lang)

    subtitle_preview = gr.Textbox(label="Generated Subtitles", lines=10)
    download_file = gr.File(label="Download .srt File")
    submit_btn = gr.Button("π¬ Generate Subtitles")
    # process_video is a generator, so the three outputs are refreshed on
    # every yield.
    submit_btn.click(
        fn=process_video,
        inputs=[youtube_url, video_upload, selected_model, translate_checkbox, target_lang],
        outputs=[status_box, subtitle_preview, download_file]
    )

if __name__ == "__main__":
    demo.launch()
|