# DARKWICK's picture
# Upload 3 files
# a5861bb verified
import os
import tempfile
from pathlib import Path
import gradio as gr
import yt_dlp
from faster_whisper import WhisperModel
# -------- Tunable settings (each one can be overridden by an environment variable) --------
# Whisper model size loaded by default: small | medium | large-v3 (larger needs more RAM).
DEFAULT_MODEL = os.environ.get("WHISPER_MODEL", "small")
# Value handed to WhisperModel as compute_type: int8 | int8_float16 | float16 | float32.
COMPUTE_TYPE = os.environ.get("COMPUTE_TYPE", "int8")
# Longest accepted video, in seconds; the default of 1800 is a 30-minute cap that keeps
# processing time predictable.
MAX_DURATION_SEC = int(os.environ.get("MAX_DURATION_SEC", "1800"))
# -------------------------------------------------------------------------------------------
# Lazily loaded model, shared by every call made in this process.
_model = None


def get_model():
    """Return the shared Whisper model, loading it on first use.

    The model is created with ``DEFAULT_MODEL`` and ``COMPUTE_TYPE`` and cached
    in the module-level ``_model`` so later calls reuse it.

    Returns:
        The cached ``WhisperModel`` instance.
    """
    global _model
    if _model is None:
        _model = WhisperModel(DEFAULT_MODEL, compute_type=COMPUTE_TYPE)
        # transcribe() decides whether to reload by comparing this tag with the
        # requested size; without it a model loaded here would be thrown away
        # and loaded again on the first transcription.
        _model._model_size = DEFAULT_MODEL
    return _model
def _download_youtube_audio(url: str, workdir: str) -> str:
    """Download a video's audio track and convert it to mono 16 kHz WAV.

    The video's metadata is fetched first so that over-long videos are
    rejected *before* any media is downloaded or converted.

    Args:
        url: Address of the video to fetch.
        workdir: Existing directory that receives the downloaded/converted file.

    Returns:
        Path to the produced ``.wav`` file inside ``workdir``.

    Raises:
        gr.Error: If the reported duration exceeds ``MAX_DURATION_SEC`` or no
            WAV file was produced.
    """
    outtmpl = str(Path(workdir) / "%(id)s.%(ext)s")
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": outtmpl,
        "noplaylist": True,
        "quiet": True,
        "no_warnings": True,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "5",
            }
        ],
        # ensure mono @ 16 kHz
        "postprocessor_args": ["-ac", "1", "-ar", "16000"],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # Metadata-only pass: enforce the length cap without paying for the download.
        info = ydl.extract_info(url, download=False) or {}
        # A missing/None duration is treated as 0, i.e. it is not rejected.
        duration = info.get("duration") or 0
        if duration > MAX_DURATION_SEC:
            # int() keeps the message free of a trailing ".0" when the duration is a float.
            raise gr.Error(f"Video too long ({int(duration // 60)} min). Max allowed is {MAX_DURATION_SEC//60} min.")
        ydl.download([url])
    # Find the produced .wav in the work dir (name can vary); sort for a deterministic pick.
    wavs = sorted(Path(workdir).glob("*.wav"))
    if not wavs:
        raise gr.Error("Audio extraction failed. Try a different video.")
    return str(wavs[0])
def _write_srt(segments, path: str):
    """Write ``segments`` to ``path`` as a UTF-8 SubRip (.srt) file.

    Args:
        segments: Iterable of objects exposing ``start`` and ``end`` (seconds)
            and ``text`` attributes. Cues are numbered from 1 in iteration order.
        path: Destination file; it is created or overwritten.
    """

    def srt_timestamp(t):
        # t in seconds -> "HH:MM:SS,mmm".
        # Round once to whole milliseconds and split with integer arithmetic:
        # truncating the fractional part instead turns e.g. 2.3 s into ",299"
        # because of binary floating-point representation error.
        total_ms = int(round(t * 1000))
        h, rem = divmod(total_ms, 3_600_000)
        m, rem = divmod(rem, 60_000)
        s, ms = divmod(rem, 1000)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    with open(path, "w", encoding="utf-8") as f:
        for i, seg in enumerate(segments, start=1):
            f.write(f"{i}\n")
            f.write(f"{srt_timestamp(seg.start)} --> {srt_timestamp(seg.end)}\n")
            # A blank line terminates each cue.
            f.write(seg.text.strip() + "\n\n")
def transcribe(youtube_url, upload_file, model_size, language, translate_to_english):
    """Transcribe (or translate) a YouTube video or an uploaded media file.

    Args:
        youtube_url: Video URL from the UI; takes precedence over ``upload_file``
            when non-blank.
        upload_file: Value of the upload component. Either a filesystem path or
            an object whose ``name`` attribute is the path of the uploaded file.
        model_size: Whisper model size to use; the shared model is reloaded when
            this differs from the one currently loaded.
        language: Language code to force, or ``"auto"`` to let the model detect it.
        translate_to_english: When true, run the ``"translate"`` task instead of
            ``"transcribe"``.

    Returns:
        A ``(full_text, srt_path)`` tuple: the concatenated transcript and the
        path of a ``subtitles.srt`` file that still exists after this call returns.

    Raises:
        gr.Error: If neither a URL nor a file was provided, or the download fails
            one of the checks in ``_download_youtube_audio``.
    """
    global _model

    # Normalise first so a whitespace-only URL counts as "no URL".
    youtube_url = (youtube_url or "").strip()
    if not youtube_url and not upload_file:
        raise gr.Error("Provide a YouTube URL or upload a file.")

    # Update model on-the-fly if user changes it
    if _model is None or getattr(_model, "_model_size", None) != model_size:
        _model = WhisperModel(model_size, compute_type=COMPUTE_TYPE)
        _model._model_size = model_size  # tag for reuse

    with tempfile.TemporaryDirectory() as td:
        if youtube_url:
            audio_path = _download_youtube_audio(youtube_url, td)
        elif isinstance(upload_file, (str, os.PathLike)):
            # The upload component handed us a path directly.
            audio_path = os.fspath(upload_file)
        else:
            # File-like wrapper: the upload is already on disk at ``.name``,
            # so it can be decoded in place without copying it.
            audio_path = upload_file.name

        segments, _info = _model.transcribe(
            audio_path,
            language=None if language == "auto" else language,
            task="translate" if translate_to_english else "transcribe",
            vad_filter=True,
        )
        # Materialise the segments while the downloaded audio still exists.
        segs = list(segments)

    full_text = "".join(s.text for s in segs).strip()

    # The SRT must outlive this function so the UI can serve it, so it is written
    # to its own directory rather than the auto-deleted working directory above.
    out_dir = tempfile.mkdtemp(prefix="transcript_")
    srt_path = Path(out_dir) / "subtitles.srt"
    _write_srt(segs, str(srt_path))
    return full_text, str(srt_path)
# ---- Gradio UI ----
# Model sizes offered in the dropdown. DEFAULT_MODEL comes from the environment,
# so make sure it is selectable even when it is not one of the stock sizes.
_MODEL_CHOICES = ["small", "medium", "large-v3"]
if DEFAULT_MODEL not in _MODEL_CHOICES:
    _MODEL_CHOICES.insert(0, DEFAULT_MODEL)

with gr.Blocks(title="YouTube → Text (Whisper)") as demo:
    gr.Markdown("## 🎬 YouTube → 📝 Text\nPaste a YouTube link **or** upload a media file to get a transcript.")

    # Input sources: a URL or an uploaded file.
    with gr.Row():
        youtube_url = gr.Textbox(label="YouTube URL", placeholder="https://www.youtube.com/watch?v=...")
    with gr.Row():
        upload_file = gr.File(label="Or upload a video/audio file", file_count="single")

    # Transcription options.
    with gr.Row():
        model_size = gr.Dropdown(
            _MODEL_CHOICES,
            value=DEFAULT_MODEL,
            label="Model size (larger = more accurate, slower)",
        )
        language = gr.Dropdown(
            ["auto", "en", "ar", "fr", "de", "es", "hi", "ur", "fa", "ru", "zh"],
            value="auto",
            label="Language (auto-detect or force)",
        )
        translate_to_english = gr.Checkbox(value=False, label="Translate to English")

    run_btn = gr.Button("Transcribe", variant="primary")

    # Outputs: the plain transcript and a downloadable subtitle file.
    transcript = gr.Textbox(label="Transcript", lines=12)
    srt_file = gr.File(label="Download SRT (subtitles)")

    # Input order must match the parameter order of transcribe().
    run_btn.click(
        transcribe,
        inputs=[youtube_url, upload_file, model_size, language, translate_to_english],
        outputs=[transcript, srt_file],
    )

if __name__ == "__main__":
    # Listen on all interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)