# Page header (scrape residue): Spaces: Sleeping | File size: 5,423 Bytes | a5861bb
import os
import tempfile
from pathlib import Path
import gradio as gr
import yt_dlp
from faster_whisper import WhisperModel
# ---------------- Tunable settings (environment overrides) ----------------
# Whisper checkpoint to load: small | medium | large-v3 (bigger needs more RAM).
DEFAULT_MODEL = os.environ.get("WHISPER_MODEL", "small")
# Numeric precision handed to the model: int8 | int8_float16 | float16 | float32.
COMPUTE_TYPE = os.environ.get("COMPUTE_TYPE", "int8")
# Longest accepted video, in seconds (default 1800 = 30 min) to keep runs predictable.
MAX_DURATION_SEC = int(os.environ.get("MAX_DURATION_SEC", "1800"))
# ---------------------------------------------------------------------------
# Lazily created Whisper model, shared by every call made in this process.
_model = None


def get_model():
    """Return the shared Whisper model, loading it on first use.

    The model is built from DEFAULT_MODEL and COMPUTE_TYPE. The instance is
    tagged with ``_model_size`` -- the marker ``transcribe`` compares against
    the requested size -- so a model loaded here is recognised and reused
    there instead of being loaded a second time.

    Returns:
        The cached ``WhisperModel`` instance.
    """
    global _model
    if _model is None:
        _model = WhisperModel(DEFAULT_MODEL, compute_type=COMPUTE_TYPE)
        _model._model_size = DEFAULT_MODEL  # tag for reuse
    return _model
def _download_youtube_audio(url: str, workdir: str) -> str:
    """Download a video's audio track and convert it to a mono 16 kHz WAV.

    Metadata is fetched first so that over-long videos are rejected before
    any media is downloaded or converted.

    Args:
        url: Address of the video to fetch.
        workdir: Existing directory that receives the downloaded files.

    Returns:
        Path to the extracted WAV file inside ``workdir``.

    Raises:
        gr.Error: If the video exceeds MAX_DURATION_SEC, the download fails,
            or no WAV file was produced.
    """
    outtmpl = str(Path(workdir) / "%(id)s.%(ext)s")
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": outtmpl,
        "noplaylist": True,
        "quiet": True,
        "no_warnings": True,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "5",
            }
        ],
        # ensure mono @ 16 kHz
        "postprocessor_args": ["-ac", "1", "-ar", "16000"],
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # Probe metadata only; the duration cap must be enforced *before*
            # spending bandwidth and CPU on the actual download.
            info = ydl.extract_info(url, download=False) or {}
            duration = info.get("duration") or 0
            if duration > MAX_DURATION_SEC:
                raise gr.Error(
                    f"Video too long ({int(duration) // 60} min). "
                    f"Max allowed is {MAX_DURATION_SEC // 60} min."
                )
            ydl.download([url])
    except yt_dlp.utils.DownloadError as err:
        # Surface download problems the same way as the other failures here.
        raise gr.Error(f"Could not download audio: {err}") from err

    # Find the produced .wav in the work dir (name can vary); sort so the
    # choice is deterministic if more than one file is present.
    wavs = sorted(Path(workdir).glob("*.wav"))
    if not wavs:
        raise gr.Error("Audio extraction failed. Try a different video.")
    return str(wavs[0])
def _write_srt(segments, path: str):
    """Write transcription segments to ``path`` in SubRip (.srt) format.

    Each segment becomes a numbered cue (starting at 1) with a
    ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` time line followed by its stripped text.

    Args:
        segments: Iterable of objects exposing ``start`` and ``end`` (seconds)
            and ``text`` attributes.
        path: Destination file, opened for writing as UTF-8 (overwritten if
            it already exists).
    """

    def srt_timestamp(t):
        # t in seconds -> "HH:MM:SS,mmm".
        # Convert to whole milliseconds once, with rounding: taking the
        # fractional part via float subtraction and truncating loses a
        # millisecond on values such as 1.001 (0.99999... -> 0).
        total_ms = max(0, int(round(t * 1000)))
        total_s, ms = divmod(total_ms, 1000)
        total_m, s = divmod(total_s, 60)
        h, m = divmod(total_m, 60)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    with open(path, "w", encoding="utf-8") as f:
        for i, seg in enumerate(segments, start=1):
            f.write(f"{i}\n")
            f.write(f"{srt_timestamp(seg.start)} --> {srt_timestamp(seg.end)}\n")
            f.write(seg.text.strip() + "\n\n")
def transcribe(youtube_url, upload_file, model_size, language, translate_to_english):
    """Transcribe a YouTube video or an uploaded media file.

    Args:
        youtube_url: Video URL; takes precedence over ``upload_file`` when
            non-blank.
        upload_file: Value delivered by the ``gr.File`` input, used when no
            URL is given.
        model_size: Whisper model name to use; the cached model is replaced
            when this differs from the one currently loaded.
        language: Language code to force, or "auto" to let the model detect it.
        translate_to_english: When true, run the "translate" task instead of
            "transcribe".

    Returns:
        A ``(full_text, srt_path)`` tuple: the joined transcript and the path
        of the generated ``subtitles.srt`` file.

    Raises:
        gr.Error: If neither a URL nor a file was provided.
    """
    # Strip first so a whitespace-only URL counts as "not provided".
    url = (youtube_url or "").strip()
    if not url and not upload_file:
        raise gr.Error("Provide a YouTube URL or upload a file.")

    # Update model on-the-fly if user changes it
    global _model
    if _model is None or getattr(_model, "_model_size", None) != model_size:
        _model = WhisperModel(model_size, compute_type=COMPUTE_TYPE)
        _model._model_size = model_size  # tag for reuse

    with tempfile.TemporaryDirectory() as td:
        if url:
            audio_path = _download_youtube_audio(url, td)
        else:
            # Depending on the Gradio version/config the File input hands over
            # either a path string or a file-like object with ``.name``; both
            # resolve to a path on disk that faster-whisper/ffmpeg can decode
            # directly, so no copy is needed.
            audio_path = str(getattr(upload_file, "name", upload_file))

        # Transcribe
        segments, _info = _model.transcribe(
            audio_path,
            language=language if language and language != "auto" else None,
            task="translate" if translate_to_english else "transcribe",
            vad_filter=True,
        )
        # ``segments`` is consumed here, while the downloaded audio still exists.
        segs = list(segments)

    full_text = "".join(s.text for s in segs).strip()

    # The SRT must outlive this function: the caller reads the returned path
    # after we return, so it cannot live in the auto-deleted directory above.
    # The directory is intentionally left for the OS temp cleanup.
    srt_path = Path(tempfile.mkdtemp(prefix="transcript_")) / "subtitles.srt"
    _write_srt(segs, str(srt_path))
    return full_text, str(srt_path)
# ---- Gradio UI ----
# Offer the environment-configured default even when it is not one of the
# presets; otherwise the dropdown would start on a value it does not list.
_MODEL_CHOICES = ["small", "medium", "large-v3"]
if DEFAULT_MODEL not in _MODEL_CHOICES:
    _MODEL_CHOICES.insert(0, DEFAULT_MODEL)

# NOTE(review): row grouping below was reconstructed from a source whose
# indentation was lost -- confirm the intended layout.
with gr.Blocks(title="YouTube → Text (Whisper)") as demo:
    gr.Markdown(
        "## 🎬 YouTube → 📝 Text\n"
        "Paste a YouTube link **or** upload a media file to get a transcript."
    )

    # Inputs: either a URL or an uploaded file.
    with gr.Row():
        youtube_url = gr.Textbox(
            label="YouTube URL",
            placeholder="https://www.youtube.com/watch?v=...",
        )
    with gr.Row():
        upload_file = gr.File(label="Or upload a video/audio file", file_count="single")

    # Transcription options.
    with gr.Row():
        model_size = gr.Dropdown(
            _MODEL_CHOICES,
            value=DEFAULT_MODEL,
            label="Model size (larger = more accurate, slower)",
        )
        language = gr.Dropdown(
            ["auto", "en", "ar", "fr", "de", "es", "hi", "ur", "fa", "ru", "zh"],
            value="auto",
            label="Language (auto-detect or force)",
        )
        translate_to_english = gr.Checkbox(value=False, label="Translate to English")

    run_btn = gr.Button("Transcribe", variant="primary")

    # Outputs: plain transcript plus a downloadable subtitle file.
    transcript = gr.Textbox(label="Transcript", lines=12)
    srt_file = gr.File(label="Download SRT (subtitles)")

    run_btn.click(
        transcribe,
        inputs=[youtube_url, upload_file, model_size, language, translate_to_english],
        outputs=[transcript, srt_file],
    )

if __name__ == "__main__":
    # Bind on all interfaces so the app is reachable from outside a container.
    demo.launch(server_name="0.0.0.0", server_port=7860)