File size: 5,423 Bytes
a5861bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import os
import tempfile
from pathlib import Path
import gradio as gr
import yt_dlp
from faster_whisper import WhisperModel

# -------- Settings you can tweak --------
# Whisper model size; larger checkpoints are more accurate but slower and
# need more RAM.
DEFAULT_MODEL = os.getenv("WHISPER_MODEL", "small")  # small | medium | large-v3 (requires more RAM)
# CTranslate2 compute/quantization mode passed to faster-whisper.
COMPUTE_TYPE = os.getenv("COMPUTE_TYPE", "int8")     # int8 | int8_float16 | float16 | float32
# Longest accepted video, in seconds; longer videos are rejected.
MAX_DURATION_SEC = int(os.getenv("MAX_DURATION_SEC", "1800"))  # 30 min cap to keep things predictable
# ---------------------------------------

# Lazy-load model once per container
# Cached WhisperModel shared by every request in this process; created on
# first use (see get_model / transcribe).
_model = None
def get_model():
    """Return the process-wide WhisperModel, creating it on first use.

    The instance is tagged with the size it was built from so the
    model-swap check in transcribe() (which compares ``_model_size``)
    recognizes it instead of needlessly reloading the same model.
    """
    global _model
    if _model is None:
        _model = WhisperModel(DEFAULT_MODEL, compute_type=COMPUTE_TYPE)
        _model._model_size = DEFAULT_MODEL  # tag for reuse, matching transcribe()
    return _model

def _download_youtube_audio(url: str, workdir: str) -> str:
    """Download YouTube audio and convert it to mono 16 kHz WAV.

    Args:
        url: Video URL.
        workdir: Directory the WAV file is written into.

    Returns:
        Path to the extracted ``.wav`` file.

    Raises:
        gr.Error: If the video exceeds MAX_DURATION_SEC or extraction fails.
    """
    outtmpl = str(Path(workdir) / "%(id)s.%(ext)s")
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": outtmpl,
        "noplaylist": True,
        "quiet": True,
        "no_warnings": True,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "5",
            }
        ],
        # ensure mono @ 16 kHz
        "postprocessor_args": ["-ac", "1", "-ar", "16000"],
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # BUG FIX: probe metadata first (download=False) so over-long videos
        # are rejected *before* the full download + transcode, not after the
        # entire file has already been fetched.
        info = ydl.extract_info(url, download=False)
        duration = info.get("duration") or 0
        if duration and duration > MAX_DURATION_SEC:
            raise gr.Error(f"Video too long ({duration//60} min). Max allowed is {MAX_DURATION_SEC//60} min.")
        ydl.download([url])

    # Find the produced .wav in the temp dir (name can vary)
    wavs = list(Path(workdir).glob("*.wav"))
    if not wavs:
        raise gr.Error("Audio extraction failed. Try a different video.")
    return str(wavs[0])


def _write_srt(segments, path: str):
    def srt_timestamp(t):
        # t in seconds -> "HH:MM:SS,mmm"
        h = int(t // 3600)
        m = int((t % 3600) // 60)
        s = int(t % 60)
        ms = int((t - int(t)) * 1000)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    with open(path, "w", encoding="utf-8") as f:
        for i, seg in enumerate(segments, start=1):
            f.write(f"{i}\n")
            f.write(f"{srt_timestamp(seg.start)} --> {srt_timestamp(seg.end)}\n")
            f.write(seg.text.strip() + "\n\n")

def _save_upload(upload_file, workdir: str) -> str:
    """Resolve a Gradio upload to a local file path inside *workdir*.

    Gradio hands us either a plain filepath string (newer versions with
    type="filepath") or a file-like object exposing .name/.read() (older
    versions) — accept both.
    """
    if isinstance(upload_file, (str, Path)):
        return str(upload_file)
    dst = Path(workdir) / Path(upload_file.name).name
    if hasattr(upload_file, "read"):
        with open(dst, "wb") as w:
            w.write(upload_file.read())
    else:
        # Object only carries a .name pointing at a temp file on disk.
        dst.write_bytes(Path(upload_file.name).read_bytes())
    return str(dst)


def transcribe(youtube_url, upload_file, model_size, language, translate_to_english):
    """Transcribe a YouTube video or an uploaded media file.

    Args:
        youtube_url: Video URL (takes precedence over the upload).
        upload_file: Gradio upload (filepath string or file-like object).
        model_size: Whisper model size ("small" | "medium" | "large-v3").
        language: "auto" for detection, or a forced language code.
        translate_to_english: If True, run Whisper's translate task.

    Returns:
        (full_text, srt_path) — transcript string and path to an SRT file
        that survives after this function returns.

    Raises:
        gr.Error: When neither input is provided, or download/extraction fails.
    """
    if not youtube_url and not upload_file:
        raise gr.Error("Provide a YouTube URL or upload a file.")

    # Update model on-the-fly if user changes it
    global _model
    if _model is None or getattr(_model, "_model_size", None) != model_size:
        _model = WhisperModel(model_size, compute_type=COMPUTE_TYPE)
        _model._model_size = model_size  # tag for reuse

    with tempfile.TemporaryDirectory() as td:
        if youtube_url:
            audio_path = _download_youtube_audio(youtube_url.strip(), td)
        else:
            audio_path = _save_upload(upload_file, td)

        # Transcribe
        segments, info = _model.transcribe(
            audio_path,
            language=None if language == "auto" else language,
            task="translate" if translate_to_english else "transcribe",
            vad_filter=True
        )
        # faster-whisper yields segments lazily; materialize them while the
        # audio file still exists (td is wiped when the 'with' exits).
        segs = list(segments)

    full_text = "".join(s.text for s in segs).strip()

    # BUG FIX: the SRT used to be written inside `td`, which is deleted the
    # moment this function returns — Gradio then served a dangling path.
    # Write it into a standalone temp dir that outlives the request.
    out_dir = tempfile.mkdtemp(prefix="transcribe_")
    srt_path = str(Path(out_dir) / "subtitles.srt")
    _write_srt(segs, srt_path)
    return full_text, srt_path

# ---- Gradio UI ----
# BUG FIX: the title/markdown strings were mojibake (UTF-8 bytes decoded as
# latin-1, e.g. "β†’" for "→"); restored the intended characters.
with gr.Blocks(title="YouTube → Text (Whisper)") as demo:
    gr.Markdown("## 🎬 YouTube → 📝 Text\nPaste a YouTube link **or** upload a media file to get a transcript.")
    # Input row 1: YouTube URL (takes precedence over the upload below).
    with gr.Row():
        youtube_url = gr.Textbox(label="YouTube URL", placeholder="https://www.youtube.com/watch?v=...")
    # Input row 2: alternative local media upload.
    with gr.Row():
        upload_file = gr.File(label="Or upload a video/audio file", file_count="single")
    # Input row 3: transcription options.
    with gr.Row():
        model_size = gr.Dropdown(
            ["small", "medium", "large-v3"],
            value=DEFAULT_MODEL,
            label="Model size (larger = more accurate, slower)"
        )
        language = gr.Dropdown(
            ["auto","en","ar","fr","de","es","hi","ur","fa","ru","zh"],
            value="auto",
            label="Language (auto-detect or force)"
        )
        translate_to_english = gr.Checkbox(value=False, label="Translate to English")

    run_btn = gr.Button("Transcribe", variant="primary")
    transcript = gr.Textbox(label="Transcript", lines=12)
    srt_file = gr.File(label="Download SRT (subtitles)")

    # Wire the button to transcribe(); outputs feed the transcript textbox
    # and the downloadable SRT file component.
    run_btn.click(
        transcribe,
        inputs=[youtube_url, upload_file, model_size, language, translate_to_english],
        outputs=[transcript, srt_file]
    )

if __name__ == "__main__":
    # Bind on all interfaces (container-friendly) on Gradio's default port.
    demo.launch(server_name="0.0.0.0", server_port=7860)