DARKWICK committed on
Commit
a5861bb
·
verified ·
1 Parent(s): b2dcd00

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile.txt +20 -0
  2. app.py +140 -0
  3. requirements.txt +3 -0
Dockerfile.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim Python base image keeps the final image small.
FROM python:3.11-slim

# ffmpeg is required by the FFmpegExtractAudio post-processor used in app.py;
# the apt package lists are removed afterwards to keep the layer small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends ffmpeg git build-essential \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install dependencies before copying the application so this layer stays
# cached when only app.py changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY app.py .

# Port the Gradio server is launched on.
ENV PORT=7860
EXPOSE 7860

# Defaults for the settings app.py reads from the environment.
ENV WHISPER_MODEL=small \
    COMPUTE_TYPE=int8 \
    MAX_DURATION_SEC=1800

CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ from pathlib import Path
4
+ import gradio as gr
5
+ import yt_dlp
6
+ from faster_whisper import WhisperModel
7
+
8
# -------- Settings (each can be overridden via an environment variable) --------
# Whisper checkpoint: small | medium | large-v3 (requires more RAM)
DEFAULT_MODEL = os.environ.get("WHISPER_MODEL", "small")
# Compute type: int8 | int8_float16 | float16 | float32
COMPUTE_TYPE = os.environ.get("COMPUTE_TYPE", "int8")
# Longest accepted video in seconds; 30 min cap to keep things predictable
MAX_DURATION_SEC = int(os.environ.get("MAX_DURATION_SEC", "1800"))
# -------------------------------------------------------------------------------
13
+
14
# Lazy-loaded model, shared by every request handled by this process.
_model = None


def get_model():
    """Return the shared Whisper model, creating it on first use.

    The model is built from ``DEFAULT_MODEL`` and ``COMPUTE_TYPE`` and cached
    in the module-level ``_model`` so later calls reuse the same instance.

    Returns:
        The cached ``WhisperModel`` instance.
    """
    global _model
    if _model is None:
        _model = WhisperModel(DEFAULT_MODEL, compute_type=COMPUTE_TYPE)
        # transcribe() decides whether to reload by comparing this tag with
        # the requested size; without it a model loaded here would always be
        # thrown away and loaded a second time.
        _model._model_size = DEFAULT_MODEL
    return _model
21
+
22
def _download_youtube_audio(url: str, workdir: str) -> str:
    """Download a video's audio and convert it to mono 16 kHz WAV.

    Metadata is fetched first, so a video longer than ``MAX_DURATION_SEC`` is
    rejected before any media is downloaded or converted.

    Args:
        url: Video URL handed to yt-dlp.
        workdir: Existing directory that receives the downloaded/converted file.

    Returns:
        Path to the produced ``.wav`` file inside ``workdir``.

    Raises:
        gr.Error: If the video is too long, the download fails, or no WAV
            file was produced.
    """
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": str(Path(workdir) / "%(id)s.%(ext)s"),
        "noplaylist": True,
        "quiet": True,
        "no_warnings": True,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "5",
            }
        ],
        # ensure mono @ 16 kHz
        "postprocessor_args": ["-ac", "1", "-ar", "16000"],
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # Probe metadata only, so the length cap is enforced *before*
            # spending bandwidth and CPU on the download and conversion.
            info = ydl.extract_info(url, download=False) or {}
            duration = info.get("duration") or 0
            if duration > MAX_DURATION_SEC:
                # int() because yt-dlp may report a float duration, which
                # would otherwise render as e.g. "31.0 min".
                raise gr.Error(
                    f"Video too long ({int(duration) // 60} min). "
                    f"Max allowed is {MAX_DURATION_SEC // 60} min."
                )
            ydl.extract_info(url, download=True)
    except yt_dlp.utils.DownloadError as exc:
        # Show a readable message in the UI instead of a raw traceback.
        raise gr.Error(f"Could not download audio: {exc}") from exc

    # Find the produced .wav in the work dir (its name depends on the video id).
    wav = next(Path(workdir).glob("*.wav"), None)
    if wav is None:
        raise gr.Error("Audio extraction failed. Try a different video.")
    return str(wav)
56
+
57
+
58
def _write_srt(segments, path: str):
    """Write transcription segments to ``path`` in SubRip (.srt) format.

    Args:
        segments: Iterable of objects exposing ``start`` and ``end`` (seconds)
            and ``text`` attributes.
        path: Destination file; it is created/overwritten as UTF-8 text.
    """

    def srt_timestamp(t):
        # Seconds -> "HH:MM:SS,mmm". Round to whole milliseconds first so
        # floating-point error cannot truncate e.g. 2.3 s to ",299", and so a
        # value like 59.9996 s carries correctly into the next second.
        total_ms = max(0, int(round(t * 1000)))
        total_s, ms = divmod(total_ms, 1000)
        total_m, s = divmod(total_s, 60)
        h, m = divmod(total_m, 60)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    with open(path, "w", encoding="utf-8") as f:
        for i, seg in enumerate(segments, start=1):
            # One cue: index, time range, text, then a blank separator line.
            f.write(
                f"{i}\n"
                f"{srt_timestamp(seg.start)} --> {srt_timestamp(seg.end)}\n"
                f"{seg.text.strip()}\n\n"
            )
72
+
73
def transcribe(youtube_url, upload_file, model_size, language, translate_to_english):
    """Transcribe a YouTube video or an uploaded media file.

    Args:
        youtube_url: Video URL from the textbox; takes precedence over the
            upload when both are given.
        upload_file: Value of the file component. Either a filesystem path or
            an object whose ``name`` attribute is the path of the uploaded file.
        model_size: Whisper model to use; the cached model is reloaded when
            this differs from the one currently loaded.
        language: Language code, or ``"auto"`` to auto-detect.
        translate_to_english: When true, run the "translate" task instead of
            "transcribe".

    Returns:
        Tuple ``(full_text, srt_path)``: the joined transcript and the path of
        a generated ``.srt`` subtitle file.

    Raises:
        gr.Error: If neither a URL nor a file was provided (and for download
            problems reported by ``_download_youtube_audio``).
    """
    global _model

    # Strip first so a whitespace-only URL counts as "no URL".
    youtube_url = (youtube_url or "").strip()
    if not youtube_url and not upload_file:
        raise gr.Error("Provide a YouTube URL or upload a file.")

    # Update model on-the-fly if the user changes it.
    if _model is None or getattr(_model, "_model_size", None) != model_size:
        _model = WhisperModel(model_size, compute_type=COMPUTE_TYPE)
        _model._model_size = model_size  # tag for reuse

    with tempfile.TemporaryDirectory() as td:
        if youtube_url:
            audio_path = _download_youtube_audio(youtube_url, td)
        elif isinstance(upload_file, (str, os.PathLike)):
            # The file component handed us a path; decode it in place.
            audio_path = os.fspath(upload_file)
        else:
            # File-like wrapper: its ``name`` is the on-disk path of the
            # upload, so no copy is needed.
            audio_path = upload_file.name

        segments, _info = _model.transcribe(
            audio_path,
            language=None if language == "auto" else language,
            task="translate" if translate_to_english else "transcribe",
            vad_filter=True,
        )
        # Materialise the segments while the audio file still exists: the
        # temporary directory is removed when this ``with`` block exits.
        segs = list(segments)

    full_text = "".join(s.text for s in segs).strip()

    # Write the subtitles OUTSIDE the temporary directory. A path inside it
    # would already be deleted by the time the caller tries to serve the file.
    fd, srt_path = tempfile.mkstemp(prefix="subtitles_", suffix=".srt")
    os.close(fd)
    _write_srt(segs, srt_path)
    return full_text, srt_path
108
+
109
# ---- Gradio UI ----
# DEFAULT_MODEL comes from the environment, so it may not be one of the stock
# options; add it to the choices so the dropdown's initial value is always valid.
_MODEL_CHOICES = ["small", "medium", "large-v3"]
if DEFAULT_MODEL not in _MODEL_CHOICES:
    _MODEL_CHOICES.insert(0, DEFAULT_MODEL)

with gr.Blocks(title="YouTube → Text (Whisper)") as demo:
    gr.Markdown("## 🎬 YouTube → 📝 Text\nPaste a YouTube link **or** upload a media file to get a transcript.")
    with gr.Row():
        youtube_url = gr.Textbox(label="YouTube URL", placeholder="https://www.youtube.com/watch?v=...")
    with gr.Row():
        upload_file = gr.File(label="Or upload a video/audio file", file_count="single")
    with gr.Row():
        model_size = gr.Dropdown(
            _MODEL_CHOICES,
            value=DEFAULT_MODEL,
            label="Model size (larger = more accurate, slower)",
        )
        language = gr.Dropdown(
            ["auto", "en", "ar", "fr", "de", "es", "hi", "ur", "fa", "ru", "zh"],
            value="auto",
            label="Language (auto-detect or force)",
        )
        translate_to_english = gr.Checkbox(value=False, label="Translate to English")

    run_btn = gr.Button("Transcribe", variant="primary")
    transcript = gr.Textbox(label="Transcript", lines=12)
    srt_file = gr.File(label="Download SRT (subtitles)")

    # Wire the button: inputs are passed positionally to transcribe(), and its
    # (text, srt path) return value fills the two output components.
    run_btn.click(
        transcribe,
        inputs=[youtube_url, upload_file, model_size, language, translate_to_english],
        outputs=[transcript, srt_file],
    )
138
+
139
if __name__ == "__main__":
    # Listen on all interfaces so the server is reachable from outside the
    # container, and honour the PORT environment variable (the Dockerfile sets
    # it to 7860) instead of hard-coding the port.
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio
2
+ yt-dlp
3
+ faster-whisper