"""Gradio app: transcribe a YouTube video or an uploaded media file with faster-whisper.

The app downloads (or receives) audio, runs Whisper transcription or
translation on it, and returns the plain-text transcript plus an SRT file.
"""

import os
import tempfile
from pathlib import Path

import gradio as gr
import yt_dlp
from faster_whisper import WhisperModel

# -------- Settings you can tweak --------
DEFAULT_MODEL = os.getenv("WHISPER_MODEL", "small")  # small | medium | large-v3 (requires more RAM)
COMPUTE_TYPE = os.getenv("COMPUTE_TYPE", "int8")  # int8 | int8_float16 | float16 | float32
MAX_DURATION_SEC = int(os.getenv("MAX_DURATION_SEC", "1800"))  # 30 min cap to keep things predictable
# ---------------------------------------

# Model sizes offered in the UI; the env-configured default is added if it is
# not one of the stock choices so the dropdown's initial value is always valid.
_MODEL_CHOICES = ["small", "medium", "large-v3"]
if DEFAULT_MODEL not in _MODEL_CHOICES:
    _MODEL_CHOICES.insert(0, DEFAULT_MODEL)

# Lazy-load model once per container
_model = None


def get_model(model_size=None):
    """Return the cached Whisper model, (re)loading it when needed.

    Args:
        model_size: Model name to load. When ``None``, the currently cached
            model is returned as-is, or ``DEFAULT_MODEL`` is loaded if nothing
            is cached yet.

    Returns:
        The module-wide ``WhisperModel`` instance.
    """
    global _model
    if model_size is None:
        if _model is not None:
            return _model
        model_size = DEFAULT_MODEL
    if _model is None or getattr(_model, "_model_size", None) != model_size:
        _model = WhisperModel(model_size, compute_type=COMPUTE_TYPE)
        _model._model_size = model_size  # tag so later calls can detect a size change
    return _model


def _download_youtube_audio(url: str, workdir: str) -> str:
    """Download YouTube audio and convert it to WAV mono 16 kHz.

    The video's metadata is fetched first so that over-long videos are
    rejected before any media is downloaded.

    Args:
        url: Video URL to fetch.
        workdir: Directory in which the audio file is written.

    Returns:
        Path to the produced WAV file.

    Raises:
        gr.Error: If the video exceeds ``MAX_DURATION_SEC``, the download
            fails, or no WAV file was produced.
    """
    outtmpl = str(Path(workdir) / "%(id)s.%(ext)s")
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": outtmpl,
        "noplaylist": True,
        "quiet": True,
        "no_warnings": True,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "5",
            }
        ],
        # ensure mono @ 16 kHz
        "postprocessor_args": ["-ac", "1", "-ar", "16000"],
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # Metadata only: enforce the duration cap before spending
            # bandwidth and CPU on the download/conversion.
            info = ydl.extract_info(url, download=False) or {}
            duration = int(info.get("duration") or 0)
            if duration > MAX_DURATION_SEC:
                raise gr.Error(
                    f"Video too long ({duration // 60} min). "
                    f"Max allowed is {MAX_DURATION_SEC // 60} min."
                )
            ydl.download([url])
    except yt_dlp.utils.DownloadError as err:
        raise gr.Error(f"Could not download audio: {err}") from err

    # Find the produced .wav in the temp dir (name can vary)
    wav = next(iter(sorted(Path(workdir).glob("*.wav"))), None)
    if wav is None:
        raise gr.Error("Audio extraction failed. Try a different video.")
    return str(wav)


def _srt_timestamp(t):
    """Format *t* (seconds) as an SRT timestamp ``HH:MM:SS,mmm``.

    Works in whole milliseconds so float error cannot drop a millisecond
    (e.g. 1.001 s) and rounding carries correctly into seconds.
    """
    total_ms = max(0, int(round(t * 1000)))
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, millis = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def _write_srt(segments, path: str):
    """Write *segments* to *path* as a UTF-8 SubRip (.srt) file.

    Args:
        segments: Iterable of objects exposing ``start``, ``end`` (seconds)
            and ``text`` attributes.
        path: Destination file path.
    """
    with open(path, "w", encoding="utf-8") as f:
        for i, seg in enumerate(segments, start=1):
            f.write(f"{i}\n")
            f.write(f"{_srt_timestamp(seg.start)} --> {_srt_timestamp(seg.end)}\n")
            f.write(seg.text.strip() + "\n\n")


def _uploaded_file_path(upload_file):
    """Return the on-disk path of a Gradio upload.

    Accepts either a plain path (str / PathLike) or an object exposing the
    path as ``.name``.
    NOTE(review): which of the two Gradio passes depends on its version and
    the File component's ``type`` setting - confirm against the deployed version.
    """
    if isinstance(upload_file, (str, os.PathLike)):
        return os.fspath(upload_file)
    path = getattr(upload_file, "name", None)
    if not path:
        raise gr.Error("Could not read the uploaded file.")
    return str(path)


def transcribe(youtube_url, upload_file, model_size, language, translate_to_english):
    """Transcribe (or translate) a YouTube video or an uploaded media file.

    Args:
        youtube_url: Video URL; takes precedence over ``upload_file`` when
            non-blank.
        upload_file: Uploaded file from the Gradio File component.
        model_size: Whisper model name to use.
        language: Language code, or ``"auto"`` for auto-detection.
        translate_to_english: When true, run the ``translate`` task instead
            of ``transcribe``.

    Returns:
        Tuple ``(full_text, srt_path)``: the transcript and the path to the
        generated SRT file.

    Raises:
        gr.Error: If neither input is provided, or audio acquisition fails.
    """
    url = (youtube_url or "").strip()
    if not url and not upload_file:
        raise gr.Error("Provide a YouTube URL or upload a file.")

    # Update model on-the-fly if user changes it
    model = get_model(model_size)

    with tempfile.TemporaryDirectory() as td:
        if url:
            audio_path = _download_youtube_audio(url, td)
        else:
            # Let faster-whisper/ffmpeg handle decoding directly
            audio_path = _uploaded_file_path(upload_file)

        # Transcribe
        segments, _info = model.transcribe(
            audio_path,
            language=None if language == "auto" else language,
            task="translate" if translate_to_english else "transcribe",
            vad_filter=True,
        )
        # Materialize the segments while the downloaded audio still exists.
        segs = list(segments)

    # Collect text and also write SRT
    full_text = "".join(s.text for s in segs).strip()

    # The SRT must outlive this function so Gradio can serve it, so it goes in
    # its own directory rather than the TemporaryDirectory that was just
    # removed. These directories are not cleaned up by this code.
    srt_path = Path(tempfile.mkdtemp(prefix="srt_")) / "subtitles.srt"
    _write_srt(segs, str(srt_path))

    return full_text, str(srt_path)


# ---- Gradio UI ----
with gr.Blocks(title="YouTube → Text (Whisper)") as demo:
    gr.Markdown("## 🎬 YouTube → 📝 Text\nPaste a YouTube link **or** upload a media file to get a transcript.")

    with gr.Row():
        youtube_url = gr.Textbox(label="YouTube URL", placeholder="https://www.youtube.com/watch?v=...")
    with gr.Row():
        upload_file = gr.File(label="Or upload a video/audio file", file_count="single")

    with gr.Row():
        model_size = gr.Dropdown(
            _MODEL_CHOICES,
            value=DEFAULT_MODEL,
            label="Model size (larger = more accurate, slower)"
        )
        language = gr.Dropdown(
            ["auto", "en", "ar", "fr", "de", "es", "hi", "ur", "fa", "ru", "zh"],
            value="auto",
            label="Language (auto-detect or force)"
        )
        translate_to_english = gr.Checkbox(value=False, label="Translate to English")

    run_btn = gr.Button("Transcribe", variant="primary")
    transcript = gr.Textbox(label="Transcript", lines=12)
    srt_file = gr.File(label="Download SRT (subtitles)")

    run_btn.click(
        transcribe,
        inputs=[youtube_url, upload_file, model_size, language, translate_to_english],
        outputs=[transcript, srt_file]
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)