ayloll committed on
Commit
9246621
·
verified ·
1 Parent(s): ff027b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -48
app.py CHANGED
@@ -3,8 +3,8 @@ from transformers import pipeline
3
  import yt_dlp
4
  import whisper
5
  import os
6
- import uuid
7
  import re
 
8
 
9
  # Delete temporary files
10
  def clean_temp_files():
@@ -13,39 +13,56 @@ def clean_temp_files():
13
  if os.path.exists(file):
14
  os.remove(file)
15
 
16
- # Download YouTube video
17
  def download_video(video_url):
18
  try:
19
  ydl_opts = {
20
- 'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]',
21
- 'outtmpl': 'temp_video.mp4',
22
  'quiet': True,
23
  'no_warnings': True,
 
 
 
 
24
  }
25
 
26
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
27
- ydl.download([video_url])
28
- return "temp_video.mp4"
 
 
29
  except Exception as e:
30
- print(f"Download error: {e}")
31
  return None
32
 
33
- # Extract audio (temporary)
34
  def extract_audio(video_path):
35
- os.system(f"ffmpeg -i \"{video_path}\" -vn -acodec libmp3lame -q:a 3 \"temp_audio.mp3\" -y")
36
- return "temp_audio.mp3" if os.path.exists("temp_audio.mp3") else None
 
 
 
 
 
 
 
 
37
 
38
- # Transcribe audio
39
  def transcribe_audio(audio_path):
40
  try:
 
 
 
41
  model = whisper.load_model("base")
42
- result = model.transcribe(audio_path)
43
  return result['text']
44
  except Exception as e:
45
- print(f"Transcription error: {e}")
46
  return None
47
 
48
- # Classify content
49
  def classify_content(text):
50
  try:
51
  if not text or len(text.strip()) == 0:
@@ -63,48 +80,66 @@ def classify_content(text):
63
 
64
  return result['labels'][0], result['scores'][0]
65
  except Exception as e:
66
- print(f"Classification error: {e}")
67
  return None, None
68
 
69
- # Main processing function
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  def process_video(video_url):
71
  clean_temp_files()
72
 
73
  if not video_url or len(video_url.strip()) == 0:
74
  return "Please enter a valid YouTube URL", ""
75
 
76
- if "youtube.com" not in video_url and "youtu.be" not in video_url:
77
- return "Please enter a valid YouTube URL", ""
78
-
79
- # Download video
80
- video_path = download_video(video_url)
81
- if not video_path:
82
- return "Failed to download video", ""
83
-
84
- # Extract audio
85
- audio_path = extract_audio(video_path)
86
- if not audio_path:
87
- clean_temp_files()
88
- return "Failed to extract audio", ""
89
 
90
- # Transcribe
91
- transcription = transcribe_audio(audio_path)
92
- if not transcription:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  clean_temp_files()
94
- return "Failed to transcribe audio", ""
95
-
96
- # Classify
97
- category, confidence = classify_content(transcription)
98
- if not category:
 
99
  clean_temp_files()
100
- return transcription, "Failed to classify content"
101
-
102
- # Clean up
103
- clean_temp_files()
104
-
105
- # Format classification result
106
- classification_result = f"{category} (confidence: {confidence:.2f})"
107
- return transcription, classification_result
108
 
109
  # Gradio interface
110
  with gr.Blocks(title="YouTube Content Analyzer") as demo:
@@ -116,7 +151,8 @@ with gr.Blocks(title="YouTube Content Analyzer") as demo:
116
  with gr.Row():
117
  url_input = gr.Textbox(
118
  label="YouTube URL",
119
- placeholder="Enter YouTube video URL here..."
 
120
  )
121
 
122
  with gr.Row():
@@ -138,8 +174,8 @@ with gr.Blocks(title="YouTube Content Analyzer") as demo:
138
  # Examples
139
  gr.Examples(
140
  examples=[
141
- ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"],
142
- ["https://youtu.be/dQw4w9WgXcQ"]
143
  ],
144
  inputs=url_input
145
  )
 
3
  import yt_dlp
4
  import whisper
5
  import os
 
6
  import re
7
+ from urllib.parse import urlparse
8
 
9
  # Delete temporary files
10
  def clean_temp_files():
 
13
  if os.path.exists(file):
14
  os.remove(file)
15
 
16
# Download a YouTube video to a local temporary file using yt-dlp
def download_video(video_url):
    """Download ``video_url`` with yt-dlp and return the local file path.

    The file is written using the ``temp_video.%(ext)s`` output template.
    The path computed by ``prepare_filename`` is returned when that file
    exists on disk; otherwise ``None`` is returned. Any exception raised
    during the download is printed and also yields ``None``.
    """
    options = dict(
        format='bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
        outtmpl='temp_video.%(ext)s',
        quiet=True,
        no_warnings=True,
        merge_output_format='mp4',
        retries=3,
        socket_timeout=30,
        extract_flat=False,
    )

    try:
        with yt_dlp.YoutubeDL(options) as downloader:
            metadata = downloader.extract_info(video_url, download=True)
            target = downloader.prepare_filename(metadata)
            if os.path.exists(target):
                return target
            return None
    except Exception as err:
        print(f"Download error: {str(err)}")
        return None
38
 
39
# Extract audio to MP3 with ffmpeg, without going through a shell
def extract_audio(video_path):
    """Extract the audio track of ``video_path`` into ``temp_audio.mp3``.

    ffmpeg is started from an argument list rather than a shell command
    string, so characters in ``video_path`` are never interpreted by a
    shell, and its exit status is checked instead of being ignored.

    Args:
        video_path: Path of the video file to read.

    Returns:
        ``"temp_audio.mp3"`` when ffmpeg exits successfully and the file
        exists; ``None`` when the input path is empty or missing, ffmpeg
        fails or cannot be started, or the output file was not produced.
    """
    import subprocess  # local import so the block does not rely on new file-level imports

    audio_path = "temp_audio.mp3"
    try:
        if not video_path or not os.path.exists(video_path):
            return None

        command = [
            "ffmpeg",
            "-y",                    # overwrite an existing output file
            "-loglevel", "error",    # only report errors
            "-i", video_path,
            "-vn",                   # drop the video stream
            "-acodec", "libmp3lame",
            "-q:a", "2",
            audio_path,
        ]
        completed = subprocess.run(command, check=False)
        if completed.returncode != 0:
            # ffmpeg reported a failure; do not trust any partial output.
            return None

        return audio_path if os.path.exists(audio_path) else None
    except Exception as e:
        # Covers a missing ffmpeg executable (OSError) among other failures.
        print(f"Audio extraction error: {str(e)}")
        return None
51
 
52
# Transcribe audio with Whisper, loading the model once and reusing it
def transcribe_audio(audio_path):
    """Transcribe the audio file at ``audio_path`` with the Whisper "base" model.

    The loaded model is stored on the function object after the first
    successful load, so later calls reuse it instead of calling
    ``whisper.load_model`` again.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The ``'text'`` entry of the transcription result, or ``None`` when
        the path is empty or missing or when loading/transcribing raises.
    """
    try:
        if not audio_path or not os.path.exists(audio_path):
            return None

        model = getattr(transcribe_audio, "_cached_model", None)
        if model is None:
            model = whisper.load_model("base")
            transcribe_audio._cached_model = model

        # fp16=False for better compatibility
        result = model.transcribe(audio_path, fp16=False)
        return result['text']
    except Exception as e:
        print(f"Transcription error: {str(e)}")
        return None
64
 
65
+ # Classify content with fallback
66
  def classify_content(text):
67
  try:
68
  if not text or len(text.strip()) == 0:
 
80
 
81
  return result['labels'][0], result['scores'][0]
82
  except Exception as e:
83
+ print(f"Classification error: {str(e)}")
84
  return None, None
85
 
86
# Validate YouTube URL
def is_valid_youtube_url(url):
    """Return True when ``url`` is an http(s) URL hosted on YouTube.

    The host is taken from the parsed URL and must be exactly
    ``youtube.com`` or ``youtu.be`` or a subdomain of either (for example
    ``www.youtube.com``). Matching the host by suffix, rather than
    searching for the domain anywhere in the network location, rejects
    look-alikes such as ``notyoutube.com``, ``youtube.com.evil.example``
    and ``youtube.com@evil.example``.

    Args:
        url: The candidate URL.

    Returns:
        True if the scheme is http or https and the host is a YouTube
        host; False otherwise, including for non-string or unparsable
        input.
    """
    if not isinstance(url, str):
        return False

    try:
        parsed = urlparse(url)
        host = parsed.hostname  # lower-cased, without userinfo or port
    except ValueError:
        # urlparse rejects some malformed network locations.
        return False

    if parsed.scheme not in ('http', 'https'):
        return False
    if not host:
        return False

    youtube_domains = ('youtube.com', 'youtu.be')
    return any(
        host == domain or host.endswith('.' + domain)
        for domain in youtube_domains
    )
98
+
99
# Main processing function: download, extract audio, transcribe, classify
def process_video(video_url):
    """Run the full pipeline for one YouTube URL.

    Temporary files are cleaned before the run and, through a single
    ``finally`` clause, after every outcome of the pipeline, including a
    failed download.

    Args:
        video_url: The URL entered by the user.

    Returns:
        A ``(first, second)`` tuple of strings. On success ``first`` is the
        transcription and ``second`` is ``"<category> (confidence: <x.xx>)"``.
        If classification fails, ``first`` is the transcription and
        ``second`` is a failure message. For any earlier failure ``first``
        is an error message and ``second`` is ``""``.
    """
    clean_temp_files()

    if not video_url or not video_url.strip():
        return "Please enter a valid YouTube URL", ""

    if not is_valid_youtube_url(video_url):
        return "Please enter a valid YouTube URL (should start with https://youtube.com or https://youtu.be)", ""

    try:
        # Download video
        video_path = download_video(video_url)
        if not video_path:
            return "Failed to download video (may be private, age-restricted, or unavailable)", ""

        # Extract audio
        audio_path = extract_audio(video_path)
        if not audio_path:
            return "Failed to extract audio from video", ""

        # Transcribe
        transcription = transcribe_audio(audio_path)
        if not transcription:
            return "Failed to transcribe audio (may be no speech detected)", ""

        # Classify
        category, confidence = classify_content(transcription)
        if not category:
            return transcription, "Failed to classify content"

        # Format classification result
        classification_result = f"{category} (confidence: {confidence:.2f})"
        return transcription, classification_result

    except Exception as e:
        return f"An error occurred: {str(e)}", ""
    finally:
        # Runs on every exit path above, so no branch can skip cleanup.
        clean_temp_files()
 
 
 
 
 
 
 
143
 
144
  # Gradio interface
145
  with gr.Blocks(title="YouTube Content Analyzer") as demo:
 
151
  with gr.Row():
152
  url_input = gr.Textbox(
153
  label="YouTube URL",
154
+ placeholder="Enter YouTube video URL here...",
155
+ max_lines=1
156
  )
157
 
158
  with gr.Row():
 
174
  # Examples
175
  gr.Examples(
176
  examples=[
177
+ ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"], # Rick Astley - Never Gonna Give You Up
178
+ ["https://youtu.be/J---aiyznGQ"] # Keyboard Cat
179
  ],
180
  inputs=url_input
181
  )