PPloychor committed on
Commit
6c69ff9
·
verified ·
1 Parent(s): 6785e76

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +148 -0
app.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import torch
3
+ import gradio as gr
4
+ import yt_dlp
5
+ from transformers import pipeline
6
+ from transformers.pipelines.audio_utils import ffmpeg_read
7
+ import tempfile
8
+ import os
9
+ import time
10
+ import glob
11
+
12
+ # --------------------------------------------------
13
+ # CONFIG
14
+ # --------------------------------------------------
15
# Hugging Face model identifiers for the two pipelines built below.
ASR_MODEL = "openai/whisper-large-v3"
SUM_MODEL = "google/flan-t5-large"

# Batch size forwarded to the ASR pipeline on every transcription call.
BATCH_SIZE = 8

# Longest video duration (in seconds) accepted for download: 1 hour.
YT_LENGTH_LIMIT_S = 3600

# Use the first CUDA device with half precision when CUDA is available,
# otherwise fall back to CPU with full precision.
HAS_CUDA = torch.cuda.is_available()
if HAS_CUDA:
    DEVICE, DTYPE = 0, torch.float16
else:
    DEVICE, DTYPE = "cpu", torch.float32

# Speech-to-text pipeline; chunk_length_s=30 is passed so the pipeline
# processes audio in 30-second chunks.
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model=ASR_MODEL,
    chunk_length_s=30,
    torch_dtype=DTYPE,
    device=DEVICE,
)

# Summarization pipeline, placed on the same device as the ASR pipeline.
sum_pipe = pipeline(
    "summarization",
    model=SUM_MODEL,
    device=DEVICE,
)
35
+
36
+ # --------------------------------------------------
37
+ # HELPERS
38
+ # --------------------------------------------------
39
+ def _format_hms(sec: int) -> str:
40
+ return time.strftime("%H:%M:%S", time.gmtime(sec))
41
+
42
+ def _embed(video_id: str) -> str:
43
+ return (
44
+ f'<center><iframe width="500" height="320" '
45
+ f'src="https://www.youtube.com/embed/{video_id}" '
46
+ f'frameborder="0" allowfullscreen></iframe></center>'
47
+ )
48
+
49
def _download_audio(yt_url: str, out_dir: str) -> tuple[str, dict]:
    """Download the best-quality audio track of a single video into *out_dir*.

    The video's metadata is fetched first (without downloading) so that
    over-long videos are rejected before any media is transferred.

    Args:
        yt_url: URL of the video to fetch.
        out_dir: Existing directory in which ``audio.<ext>`` is written.

    Returns:
        A ``(path, info)`` tuple: the path of the downloaded audio file and
        the metadata dict returned by ``extract_info``.

    Raises:
        gr.Error: If the URL cannot be accessed, the reported duration
            exceeds ``YT_LENGTH_LIMIT_S``, or the download fails or
            produces no file.
    """
    # "noplaylist" makes yt-dlp fetch only the single video when the URL
    # could also be read as a playlist.
    base_opts = {"quiet": True, "noplaylist": True}

    try:
        with yt_dlp.YoutubeDL(base_opts) as ydl:
            info = ydl.extract_info(yt_url, download=False)
    except yt_dlp.utils.DownloadError as err:
        raise gr.Error(f"Cannot access YouTube URL: {err}") from err

    # A missing duration is treated as 0 and therefore passes the limit.
    duration = int(info.get("duration") or 0)
    if duration > YT_LENGTH_LIMIT_S:
        raise gr.Error(
            f"Video too long: {_format_hms(duration)} > {_format_hms(YT_LENGTH_LIMIT_S)}"
        )

    opts = {
        **base_opts,
        "format": "bestaudio/best",
        "outtmpl": os.path.join(out_dir, "audio.%(ext)s"),
        "noprogress": True,
    }
    try:
        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([yt_url])
    except yt_dlp.utils.DownloadError as err:
        # Surface download failures in the UI instead of a raw traceback.
        raise gr.Error(f"Failed to download audio track: {err}") from err

    # The extension is chosen by yt-dlp, so locate the file by pattern.
    # Skip partial-download leftovers and sort for a deterministic pick.
    matches = sorted(
        path
        for path in glob.glob(os.path.join(out_dir, "audio.*"))
        if not path.endswith((".part", ".ytdl"))
    )
    if not matches:
        raise gr.Error("Failed to download audio track.")
    return matches[0], info
72
+
73
+ # --------------------------------------------------
74
+ # MAIN FUNCTIONS
75
+ # --------------------------------------------------
76
@spaces.GPU
def transcribe_local(inputs, task):
    """Transcribe (or translate) an uploaded or recorded audio file.

    Args:
        inputs: Path of the audio file to read, or ``None`` when no audio
            was supplied.
        task: Value forwarded to the ASR pipeline as the generation
            ``task`` (the UI offers "transcribe" and "translate").

    Returns:
        The recognized text.

    Raises:
        gr.Error: If *inputs* is ``None``.
    """
    if inputs is None:
        raise gr.Error("Please upload or record an audio file.")

    sampling_rate = asr_pipe.feature_extractor.sampling_rate

    with open(inputs, "rb") as audio_file:
        raw_bytes = audio_file.read()
    # Decode the raw bytes at the sampling rate the feature extractor uses.
    waveform = ffmpeg_read(raw_bytes, sampling_rate)

    result = asr_pipe(
        {"array": waveform, "sampling_rate": sampling_rate},
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    return result["text"]
87
+
88
@spaces.GPU
def transcribe_youtube(yt_url, task):
    """Download a video's audio, transcribe it, and save the transcript.

    Args:
        yt_url: URL of the video to transcribe.
        task: Value forwarded to the ASR pipeline as the generation
            ``task`` (the UI offers "transcribe" and "translate").

    Returns:
        A ``(html, text, txt_path)`` tuple: the embed HTML for the video
        (empty string when the metadata has no id), the recognized text,
        and the path of a UTF-8 ``transcript.txt`` file containing it.

    Raises:
        gr.Error: If *yt_url* is empty, or if downloading fails (raised by
            ``_download_audio``).
    """
    if not yt_url:
        raise gr.Error("Paste a valid YouTube URL.")

    sampling_rate = asr_pipe.feature_extractor.sampling_rate

    # The downloaded media is only needed until it has been decoded into
    # memory, so the scratch directory can be removed right afterwards.
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_path, info = _download_audio(yt_url, tmpdir)
        with open(audio_path, "rb") as f:
            audio = ffmpeg_read(f.read(), sampling_rate)

    out = asr_pipe(
        {"array": audio, "sampling_rate": sampling_rate},
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    text = out["text"]

    # The transcript must outlive this function so the UI can serve it.
    # Writing it inside the TemporaryDirectory above would delete it before
    # the caller could read it, so it gets its own persistent directory.
    transcript_dir = tempfile.mkdtemp(prefix="transcript_")
    txt_path = os.path.join(transcript_dir, "transcript.txt")
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(text)

    vid = info.get("id", "")
    html = _embed(vid) if vid else ""
    return html, text, txt_path
107
+
108
def summarize_text(text):
    """Summarize a transcript by summarizing chunks and joining the results.

    The text is split into chunks of at most 2000 characters on whitespace
    boundaries, so words are not cut in half; a single run of more than
    2000 non-space characters is still split hard. Each chunk is summarized
    independently and the partial summaries are joined with single spaces.

    Args:
        text: The transcript to summarize; may be ``None`` or empty.

    Returns:
        The concatenated chunk summaries.

    Raises:
        gr.Error: If *text* is ``None``, empty, or only whitespace.
    """
    # Function-scope import keeps this block independent of the file header.
    import textwrap

    if not text or not text.strip():
        raise gr.Error("No transcript provided.")

    chunks = textwrap.wrap(
        text,
        width=2000,
        break_long_words=True,
        break_on_hyphens=False,
    )
    # truncation=True keeps any chunk that still exceeds the model's input
    # limit from being passed through at full length.
    summaries = [
        sum_pipe(chunk, truncation=True)[0]["summary_text"] for chunk in chunks
    ]
    return " ".join(summaries)
114
+
115
+ # --------------------------------------------------
116
+ # UI
117
+ # --------------------------------------------------
118
# Gradio UI: three input tabs (microphone, uploaded file, YouTube URL) plus
# a summary section that operates on the YouTube transcript box.
# NOTE(review): the summary section is placed at Blocks level (below the
# tabs) — confirm against the intended layout.
with gr.Blocks(title="YouTube → Transcript → Summary") as demo:
    gr.Markdown("## 🎬 Whisper V3 + Flan-T5 – YouTube Transcriber & Summarizer")

    with gr.Tab("🎙️ Microphone"):
        mic_audio = gr.Audio(sources="microphone", type="filepath")
        mic_task = gr.Radio(["transcribe", "translate"], value="transcribe", label="Task")
        mic_out = gr.Textbox(label="Transcript")
        gr.Button("Run").click(
            fn=transcribe_local,
            inputs=[mic_audio, mic_task],
            outputs=mic_out,
        )

    with gr.Tab("📁 Audio file"):
        file_audio = gr.Audio(sources="upload", type="filepath")
        file_task = gr.Radio(["transcribe", "translate"], value="transcribe", label="Task")
        file_out = gr.Textbox(label="Transcript")
        gr.Button("Run").click(
            fn=transcribe_local,
            inputs=[file_audio, file_task],
            outputs=file_out,
        )

    with gr.Tab("🎬 YouTube"):
        yt_url = gr.Textbox(lines=1, placeholder="Paste YouTube URL here", label="YouTube URL")
        yt_task = gr.Radio(["transcribe", "translate"], value="transcribe", label="Task")
        yt_video = gr.HTML(label="Video Preview")
        yt_text = gr.Textbox(label="Transcript", lines=10)
        yt_file = gr.File(label="Download Transcript (.txt)")
        gr.Button("Transcribe").click(
            fn=transcribe_youtube,
            inputs=[yt_url, yt_task],
            outputs=[yt_video, yt_text, yt_file],
        )

    gr.Markdown("---")
    gr.Markdown("### 🧠 Summarize Transcript")
    sum_out = gr.Textbox(label="Summary", lines=6)
    # Summarization reads whatever is currently in the YouTube transcript box.
    gr.Button("Summarize Text").click(fn=summarize_text, inputs=yt_text, outputs=sum_out)

# Enable request queuing, then start the app.
demo.queue().launch()