# Spaces: Sleeping  (status text captured from the hosting page; not part of the program)
import os
import tempfile
from pathlib import Path

import gradio as gr
import yt_dlp
from faster_whisper import WhisperModel
# -------- Settings you can tweak --------
# Whisper checkpoint used by default: small | medium | large-v3 (requires more RAM).
DEFAULT_MODEL = os.getenv("WHISPER_MODEL", "small")
# Precision passed to WhisperModel: int8 | int8_float16 | float16 | float32.
COMPUTE_TYPE = os.getenv("COMPUTE_TYPE", "int8")
# Longest accepted YouTube video in seconds; the default of 1800 is a
# 30 min cap to keep things predictable.
MAX_DURATION_SEC = int(os.getenv("MAX_DURATION_SEC", "1800"))
# ---------------------------------------
# Lazy-load model once per container
_model = None


def get_model():
    """Return the shared WhisperModel, creating it on first use.

    The instance is built from DEFAULT_MODEL and COMPUTE_TYPE and cached in
    the module-level ``_model`` so later calls reuse it.

    Returns:
        The cached WhisperModel instance.
    """
    global _model
    if _model is None:
        _model = WhisperModel(DEFAULT_MODEL, compute_type=COMPUTE_TYPE)
        # transcribe() decides whether to reload by comparing this tag with
        # the requested size; without it a model loaded here would always
        # be thrown away and rebuilt on the first request.
        _model._model_size = DEFAULT_MODEL
    return _model
def _download_youtube_audio(url: str, workdir: str) -> str:
    """Download a video's audio track and convert it to mono 16 kHz WAV.

    The video's metadata is fetched first so that a video longer than
    MAX_DURATION_SEC is rejected before anything is downloaded or converted.

    Args:
        url: Address of the video to fetch.
        workdir: Existing directory that receives the downloaded file.

    Returns:
        Path to the WAV file produced inside *workdir*.

    Raises:
        gr.Error: If the reported duration exceeds MAX_DURATION_SEC, or if
            no WAV file exists in *workdir* after the download.
    """
    outtmpl = str(Path(workdir) / "%(id)s.%(ext)s")
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": outtmpl,
        "noplaylist": True,
        "quiet": True,
        "no_warnings": True,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "5",
            }
        ],
        # ensure mono @ 16 kHz
        "postprocessor_args": ["-ac", "1", "-ar", "16000"],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # Metadata-only probe: enforce the length cap before paying for the
        # download and the ffmpeg conversion.
        info = ydl.extract_info(url, download=False) or {}
        # A missing duration is treated as 0, so such videos are let through.
        duration = info.get("duration") or 0
        if duration > MAX_DURATION_SEC:
            raise gr.Error(
                f"Video too long ({int(duration) // 60} min). "
                f"Max allowed is {MAX_DURATION_SEC // 60} min."
            )
        ydl.download([url])

    # Find the produced .wav in the temp dir (name can vary); sorting makes
    # the choice deterministic if more than one file is present.
    wavs = sorted(Path(workdir).glob("*.wav"))
    if not wavs:
        raise gr.Error("Audio extraction failed. Try a different video.")
    return str(wavs[0])
| def _write_srt(segments, path: str): | |
| def srt_timestamp(t): | |
| # t in seconds -> "HH:MM:SS,mmm" | |
| h = int(t // 3600) | |
| m = int((t % 3600) // 60) | |
| s = int(t % 60) | |
| ms = int((t - int(t)) * 1000) | |
| return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" | |
| with open(path, "w", encoding="utf-8") as f: | |
| for i, seg in enumerate(segments, start=1): | |
| f.write(f"{i}\n") | |
| f.write(f"{srt_timestamp(seg.start)} --> {srt_timestamp(seg.end)}\n") | |
| f.write(seg.text.strip() + "\n\n") | |
def transcribe(youtube_url, upload_file, model_size, language, translate_to_english):
    """Transcribe a YouTube video or an uploaded media file.

    Args:
        youtube_url: Video address; takes precedence over *upload_file* when
            both are supplied. Blank or whitespace-only counts as missing.
        upload_file: Value from the upload widget: either a filesystem path
            or an object whose ``name`` attribute is that path.
        model_size: Whisper model name; the cached model is replaced when
            this differs from the one currently loaded.
        language: Language code to force, or "auto" to let the model detect.
        translate_to_english: When true, run the "translate" task instead of
            "transcribe".

    Returns:
        A ``(full_text, srt_path)`` tuple: the joined transcript and the path
        of a ``subtitles.srt`` file containing the timed segments.

    Raises:
        gr.Error: If neither a URL nor an upload was provided (download
            problems raised by the YouTube helper propagate as well).
    """
    youtube_url = (youtube_url or "").strip()
    if not youtube_url and not upload_file:
        raise gr.Error("Provide a YouTube URL or upload a file.")

    # Update model on-the-fly if user changes it
    global _model
    if _model is None or getattr(_model, "_model_size", None) != model_size:
        _model = WhisperModel(model_size, compute_type=COMPUTE_TYPE)
        _model._model_size = model_size  # tag for reuse

    with tempfile.TemporaryDirectory() as td:
        if youtube_url:
            audio_path = _download_youtube_audio(youtube_url, td)
        elif isinstance(upload_file, (str, os.PathLike)):
            # The upload arrived as a plain path; decode it in place.
            audio_path = str(upload_file)
        else:
            # File-like upload: its ``name`` is the on-disk path, so no copy
            # into the scratch directory is needed.
            audio_path = str(upload_file.name)

        # Transcribe
        segments, _info = _model.transcribe(
            audio_path,
            language=None if language == "auto" else language,
            task="translate" if translate_to_english else "transcribe",
            vad_filter=True,
        )
        # Materialize the segments while the audio still exists: a downloaded
        # file disappears as soon as this ``with`` block exits.
        segs = list(segments)

    full_text = "".join(s.text for s in segs).strip()

    # The SRT must outlive this function so the UI can offer it for
    # download; writing it inside the scratch directory above would delete
    # it before it could be served.
    out_dir = tempfile.mkdtemp(prefix="transcript_")
    srt_path = Path(out_dir) / "subtitles.srt"
    _write_srt(segs, srt_path)
    return full_text, str(srt_path)
# ---- Gradio UI ----
# NOTE(review): the source's indentation was lost and its title/heading text
# was mis-decoded. The nesting below (three option widgets sharing one row)
# and the arrow / second emoji are best-effort reconstructions; confirm
# against the deployed app.
with gr.Blocks(title="YouTube → Text (Whisper)") as demo:
    gr.Markdown(
        "## 🎬 YouTube → 📝 Text\n"
        "Paste a YouTube link **or** upload a media file to get a transcript."
    )

    # Inputs: a URL box and, as an alternative, a single-file upload.
    with gr.Row():
        youtube_url = gr.Textbox(
            label="YouTube URL",
            placeholder="https://www.youtube.com/watch?v=...",
        )
    with gr.Row():
        upload_file = gr.File(label="Or upload a video/audio file", file_count="single")

    # Transcription options.
    with gr.Row():
        model_size = gr.Dropdown(
            ["small", "medium", "large-v3"],
            value=DEFAULT_MODEL,
            label="Model size (larger = more accurate, slower)",
        )
        language = gr.Dropdown(
            ["auto", "en", "ar", "fr", "de", "es", "hi", "ur", "fa", "ru", "zh"],
            value="auto",
            label="Language (auto-detect or force)",
        )
        translate_to_english = gr.Checkbox(value=False, label="Translate to English")

    run_btn = gr.Button("Transcribe", variant="primary")

    # Outputs: the plain transcript and a downloadable subtitle file.
    transcript = gr.Textbox(label="Transcript", lines=12)
    srt_file = gr.File(label="Download SRT (subtitles)")

    # The input order must match transcribe()'s positional parameters.
    run_btn.click(
        transcribe,
        inputs=[youtube_url, upload_file, model_size, language, translate_to_english],
        outputs=[transcript, srt_file],
    )
if __name__ == "__main__":
    # Start the web server only when run as a script: listen on every
    # network interface, port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)