moFouad1 committed on
Commit
2ae23d8
·
verified ·
1 Parent(s): 555bbca

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -54
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import re
3
  import torch
@@ -10,55 +11,80 @@ import shutil
10
  from torch.utils.data import Dataset, DataLoader
11
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable
12
  from youtube_transcript_api.formatters import TextFormatter
13
- from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- # === Functions from Code 1 ===
16
  def get_video_id(url):
17
  match = re.search(r'(?:v=|\/)([0-9A-Za-z_-]{11})', url)
18
  return match.group(1) if match else None
19
 
20
- def download_transcript(video_id, lang="en"):
21
  try:
22
- transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[lang])
23
- return TextFormatter().format_transcript(transcript)
24
- except (TranscriptsDisabled, NoTranscriptFound, VideoUnavailable, CouldNotRetrieveTranscript):
 
25
  return None
26
  except Exception as e:
27
  print(f"Transcript error: {e}")
28
  return None
29
 
30
- def download_audio(url, sabr_only=False):
31
- try:
32
- ydl_opts = {
33
- 'format': 'bestaudio[asr>0]/bestaudio/best' if sabr_only else 'bestaudio/best',
34
- 'outtmpl': 'temp_audio.%(ext)s',
35
- 'postprocessors': [{
36
- 'key': 'FFmpegExtractAudio',
37
- 'preferredcodec': 'wav',
38
- }],
39
- }
40
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
41
- ydl.download([url])
42
- return 'temp_audio.wav'
43
- except Exception as e:
44
- print(f"Audio download error: {e}")
45
- return None
46
 
47
- def download_video(url, sabr_only=False):
48
  try:
49
- ydl_opts = {
50
- 'format': 'bestvideo+bestaudio/best' if sabr_only else 'best',
51
- 'outtmpl': 'temp_video.%(ext)s',
52
- 'merge_output_format': 'mp4',
53
- }
54
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
55
  ydl.download([url])
56
- return 'temp_video.mp4'
57
  except Exception as e:
58
- print(f"Video download error: {e}")
59
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
- # === Utilities ===
62
  def extract_audio_from_video(video_path, audio_path="audio.wav"):
63
  ffmpeg.input(video_path).output(audio_path, ac=1, ar=16000).run(overwrite_output=True)
64
  return audio_path
@@ -147,34 +173,38 @@ def generate_questions_with_pipeline(text, num_questions=5):
147
 
148
  return questions[:num_questions]
149
 
150
- # === Main Processing ===
151
- def process_input_gradio(url_input, file_input, transcript_text):
 
152
  try:
153
- if transcript_text:
154
- transcript = transcript_text
 
 
155
 
156
- elif file_input is not None:
157
  audio_path = extract_audio_from_video(file_input.name)
158
  chunks, sr = split_audio(audio_path, chunk_length_sec=15)
159
  transcript = transcribe_chunks_dataset(chunks, sr)
160
 
161
  elif url_input:
162
- video_id = get_video_id(url_input)
163
- transcript = download_transcript(video_id)
164
-
165
- if not transcript:
166
- audio_path = download_audio(url_input)
167
- if not audio_path:
168
- video_path = download_video(url_input)
169
- if not video_path:
170
- return "❌ Failed to download video/audio/transcript.", ""
171
- audio_path = extract_audio_from_video(video_path)
172
-
 
 
173
  chunks, sr = split_audio(audio_path, chunk_length_sec=15)
174
  transcript = transcribe_chunks_dataset(chunks, sr)
175
-
176
  else:
177
- return "Please provide a URL, upload a file, or paste transcript text.", ""
178
 
179
  summary = summarize_with_bart(transcript)
180
  questions = generate_questions_with_pipeline(summary)
@@ -182,20 +212,21 @@ def process_input_gradio(url_input, file_input, transcript_text):
182
  except Exception as e:
183
  return f"Error: {str(e)}", ""
184
 
185
- # === Gradio UI ===
 
186
  iface = gr.Interface(
187
  fn=process_input_gradio,
188
  inputs=[
189
  gr.Textbox(label="YouTube or Direct Video URL", placeholder="https://..."),
190
  gr.File(label="Or Upload a Video File", file_types=[".mp4", ".mkv", ".webm"]),
191
- gr.Textbox(label="Or Paste Transcript Text Directly", lines=10, placeholder="Paste full transcript here..."),
192
  ],
193
  outputs=[
194
  gr.Textbox(label="Summary", lines=10),
195
  gr.Textbox(label="Generated Questions", lines=10),
196
  ],
197
  title="Lecture Summary & Question Generator",
198
- description="Provide a YouTube/Direct video URL, upload a video file, or paste a transcript."
199
  )
200
 
201
- iface.launch()
 
1
+ 2:
2
  import os
3
  import re
4
  import torch
 
11
  from torch.utils.data import Dataset, DataLoader
12
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable
13
  from youtube_transcript_api.formatters import TextFormatter
14
+ from transformers import (
15
+ pipeline,
16
+ WhisperProcessor,
17
+ WhisperForConditionalGeneration,
18
+ )
19
+
20
+ # === UTILS ===
21
+
22
def is_youtube_url(url):
    """Return True when *url* points at a YouTube host.

    The decision is made on the URL's hostname instead of a raw substring
    test, so look-alike domains (``notyoutube.com``) and unrelated URLs that
    merely mention ``youtube.com`` in their path or query are rejected.
    Host matching is case-insensitive and scheme-less input such as
    ``youtu.be/<id>`` is accepted.

    Args:
        url: URL text to classify. Non-string or blank values yield False
            instead of raising.

    Returns:
        bool: True for ``youtube.com``, ``youtu.be`` and their subdomains.
    """
    from urllib.parse import urlsplit

    if not isinstance(url, str) or not url.strip():
        return False

    candidate = url.strip()
    try:
        parts = urlsplit(candidate)
        if not parts.netloc:
            # Without a scheme the host lands in the path; a "//" prefix makes
            # urlsplit treat the leading component as the network location.
            parts = urlsplit("//" + candidate)
        host = (parts.hostname or "").lower()
    except ValueError:
        # Raised by urlsplit for malformed network locations.
        return False

    return any(
        host == domain or host.endswith("." + domain)
        for domain in ("youtube.com", "youtu.be")
    )
24
+
25
def is_web_url(url):
    """Return True when *url* starts with the ``http://`` or ``https://`` scheme.

    URL schemes are case-insensitive and pasted input often carries stray
    whitespace, so the text is trimmed and lower-cased before the prefix test.

    Args:
        url: URL text to check. Non-string values (for example ``None``)
            yield False instead of raising.

    Returns:
        bool: True for http/https URLs, False otherwise.
    """
    if not isinstance(url, str):
        return False
    return url.strip().lower().startswith(("http://", "https://"))
27
 
 
28
def get_video_id(url):
    """Extract the 11-character YouTube video id from *url*.

    The id is looked for after ``v=`` or after a ``/``. The candidate must be
    exactly 11 id characters long: a negative lookahead rejects longer tokens,
    so the first 11 characters of e.g. a channel id are not mistaken for a
    video id.

    Args:
        url: URL text to search. Non-string values yield ``None``.

    Returns:
        The video id string, or ``None`` when no id is found.
    """
    if not isinstance(url, str):
        return None
    match = re.search(
        r'(?:v=|\/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])',
        url,
    )
    return match.group(1) if match else None
31
 
32
def try_download_transcript(video_id):
    """Fetch the English transcript of a YouTube video as plain text.

    Args:
        video_id: YouTube video id. A falsy id (``None`` or ``""``, e.g. when
            no id could be extracted from the URL) returns ``None``
            immediately without calling the transcript API.

    Returns:
        The transcript formatted by ``TextFormatter``, or ``None`` when no id
        was given or the transcript could not be retrieved.
    """
    if not video_id:
        return None

    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
        return TextFormatter().format_transcript(transcript)
    except (TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable):
        # Expected "no transcript available" outcomes: return quietly.
        return None
    except Exception as e:
        # Anything else is unexpected; report it but still signal "no transcript".
        print(f"Transcript error: {e}")
        return None
42
 
43
def download_audio_youtube(url, output_path="audio.wav", cookies_path=None):
    """Download a YouTube video with yt-dlp and extract its audio track.

    The video is saved to a temporary ``fallback_video.mp4`` and then passed to
    ``extract_audio_from_video``. The temporary file is removed before the
    download and again afterwards: yt-dlp does not re-download onto an
    existing video file by default, so a leftover file from an earlier request
    would otherwise be transcribed instead of the requested video.

    Args:
        url: Video URL handed to yt-dlp.
        output_path: Destination path for the extracted audio.
        cookies_path: Optional cookies file forwarded to yt-dlp.

    Returns:
        The value returned by ``extract_audio_from_video`` for ``output_path``.

    Raises:
        RuntimeError: If the yt-dlp download fails. The message contains the
            original error and the output of ``yt-dlp -F`` for the URL; the
            original exception is chained as ``__cause__``.
    """
    import os
    import subprocess

    fallback_video_path = "fallback_video.mp4"

    def _discard_fallback():
        # Best-effort cleanup; a failure to delete must not mask the real result.
        try:
            if os.path.exists(fallback_video_path):
                os.remove(fallback_video_path)
        except OSError:
            pass

    # NOTE(review): confirm that the Python API honours "user_agent" and this
    # "compat_opts" entry; they are kept unchanged here.
    ydl_opts = {
        "format": "best",
        "outtmpl": fallback_video_path,
        "user_agent": "com.google.android.youtube/17.31.35 (Linux; U; Android 11)",
        "compat_opts": ["allow_unplayable_formats"],
    }
    if cookies_path:
        ydl_opts["cookiefile"] = cookies_path

    _discard_fallback()
    try:
        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
        except Exception as e:
            # On failure, run yt-dlp in a subprocess to list the formats so the
            # error shown to the user explains what was available.
            try:
                list_cmd = ["yt-dlp", "-F"]
                if cookies_path:
                    list_cmd += ["--cookies", cookies_path]
                # "--" ends option parsing so a URL starting with "-" cannot
                # be interpreted as a yt-dlp option.
                list_cmd += ["--", url]
                result = subprocess.run(list_cmd, capture_output=True, text=True, timeout=15)
                formats = result.stdout or "No formats found."
            except Exception as format_err:
                formats = f"⚠️ Could not list formats due to: {format_err}"

            raise RuntimeError(
                f"yt-dlp failed: {e}\n\n"
                f"Available formats for this video:\n\n{formats}"
            ) from e

        return extract_audio_from_video(fallback_video_path, audio_path=output_path)
    finally:
        _discard_fallback()
78
+
79
def download_video_direct(url, output_path="video.mp4"):
    """Download the video at *url* to *output_path* using yt-dlp.

    Any file already present at *output_path* is removed first: yt-dlp does
    not re-download onto an existing video file by default, so a leftover
    file from an earlier request would otherwise be returned as if it were
    the requested video.

    Args:
        url: Direct video URL handed to yt-dlp.
        output_path: Path the downloaded file is written to.

    Returns:
        ``output_path``.
    """
    import os

    if os.path.exists(output_path):
        os.remove(output_path)

    ydl_opts = {
        "format": "best",
        "outtmpl": output_path,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return output_path
87
 
 
88
def extract_audio_from_video(video_path, audio_path="audio.wav"):
    """Convert the media file at *video_path* into an audio file at *audio_path*.

    The output is requested with one audio channel (``ac=1``) and a 16000 Hz
    sample rate (``ar=16000``); an existing file at *audio_path* is
    overwritten.

    Returns:
        ``audio_path``.
    """
    source = ffmpeg.input(video_path)
    target = source.output(audio_path, ac=1, ar=16000)
    target.run(overwrite_output=True)
    return audio_path
 
173
 
174
  return questions[:num_questions]
175
 
176
+ # === MAIN FUNCTION ===
177
+
178
+ def process_input_gradio(url_input, file_input, cookies_file):
179
  try:
180
+ cookies_path = None
181
+ if cookies_file is not None:
182
+ cookies_path = "cookies.txt"
183
+ shutil.copyfile(cookies_file.name, cookies_path)
184
 
185
+ if file_input is not None:
186
  audio_path = extract_audio_from_video(file_input.name)
187
  chunks, sr = split_audio(audio_path, chunk_length_sec=15)
188
  transcript = transcribe_chunks_dataset(chunks, sr)
189
 
190
  elif url_input:
191
+ if is_youtube_url(url_input):
192
+ video_id = get_video_id(url_input)
193
+ transcript = try_download_transcript(video_id)
194
+ if not transcript:
195
+ try:
196
+ audio_path = download_audio_youtube(url_input, cookies_path=cookies_path)
197
+ chunks, sr = split_audio(audio_path, chunk_length_sec=15)
198
+ transcript = transcribe_chunks_dataset(chunks, sr)
199
+ except Exception as e:
200
+ return f"⚠️ Could not download this YouTube video due to restrictions. Please upload the video manually.\nDetails: {e}", ""
201
+ else:
202
+ video_file = download_video_direct(url_input)
203
+ audio_path = extract_audio_from_video(video_file)
204
  chunks, sr = split_audio(audio_path, chunk_length_sec=15)
205
  transcript = transcribe_chunks_dataset(chunks, sr)
 
206
  else:
207
+ return "Please provide a URL or upload a video file.", ""
208
 
209
  summary = summarize_with_bart(transcript)
210
  questions = generate_questions_with_pipeline(summary)
 
212
  except Exception as e:
213
  return f"Error: {str(e)}", ""
214
 
215
+ # === GRADIO UI ===
216
+
217
# Input widgets, in the positional order of process_input_gradio's parameters
# (url_input, file_input, cookies_file).
_ui_inputs = [
    gr.Textbox(label="YouTube or Direct Video URL", placeholder="https://..."),
    gr.File(label="Or Upload a Video File", file_types=[".mp4", ".mkv", ".webm"]),
    gr.File(label="Optional cookies.txt for YouTube", file_types=[".txt"]),
]

# Output widgets for the two values the handler returns.
_ui_outputs = [
    gr.Textbox(label="Summary", lines=10),
    gr.Textbox(label="Generated Questions", lines=10),
]

iface = gr.Interface(
    title="Lecture Summary & Question Generator",
    description="Provide a YouTube/Direct video URL or upload a video file. If the video is restricted, upload cookies.txt or the video file directly.",
    fn=process_input_gradio,
    inputs=_ui_inputs,
    outputs=_ui_outputs,
)

iface.launch()