Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -14,16 +14,12 @@ from bs4 import SoupStrainer
|
|
| 14 |
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
|
| 15 |
import yt_dlp
|
| 16 |
import re
|
| 17 |
-
from googleapiclient.discovery import build
|
| 18 |
-
from googleapiclient.errors import HttpError
|
| 19 |
|
| 20 |
# Load environment variables (optional)
|
| 21 |
load_dotenv()
|
| 22 |
|
| 23 |
# Hardcoded Groq API key
|
| 24 |
GROQ_API_KEY = "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"
|
| 25 |
-
# YouTube API key (to be set in Hugging Face Spaces secrets)
|
| 26 |
-
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
|
| 27 |
|
| 28 |
# Custom CSS
|
| 29 |
st.markdown("""
|
|
@@ -121,7 +117,7 @@ if "vectorstore" not in st.session_state:
|
|
| 121 |
if "summary" not in st.session_state:
|
| 122 |
st.session_state.summary = None
|
| 123 |
if "qa_chain" not in st.session_state:
|
| 124 |
-
st.session_state.qa_chain = None
|
| 125 |
|
| 126 |
# Initialize embeddings once at the start
|
| 127 |
if "embeddings" not in st.session_state:
|
|
@@ -193,65 +189,32 @@ def fetch_youtube_transcript(video_id):
|
|
| 193 |
return " ".join([item['text'] for item in translated_transcript])
|
| 194 |
return None
|
| 195 |
|
| 196 |
-
# Function to fetch captions using YouTube Data API
|
| 197 |
-
def fetch_youtube_captions_api(video_id, api_key):
|
| 198 |
-
if not api_key:
|
| 199 |
-
return None
|
| 200 |
-
try:
|
| 201 |
-
youtube = build('youtube', 'v3', developerKey=api_key)
|
| 202 |
-
captions = youtube.captions().list(
|
| 203 |
-
part='snippet',
|
| 204 |
-
videoId=video_id
|
| 205 |
-
).execute()
|
| 206 |
-
|
| 207 |
-
caption_id = None
|
| 208 |
-
for item in captions.get('items', []):
|
| 209 |
-
if item['snippet']['language'] == 'en':
|
| 210 |
-
caption_id = item['id']
|
| 211 |
-
break
|
| 212 |
-
elif item['snippet']['language'] in ['en-US', 'en-GB']:
|
| 213 |
-
caption_id = item['id']
|
| 214 |
-
break
|
| 215 |
-
|
| 216 |
-
if not caption_id:
|
| 217 |
-
return None
|
| 218 |
-
|
| 219 |
-
caption_content = youtube.captions().download(
|
| 220 |
-
id=caption_id,
|
| 221 |
-
tfmt='srt'
|
| 222 |
-
).execute()
|
| 223 |
-
|
| 224 |
-
# Parse SRT content
|
| 225 |
-
caption_text = caption_content.decode('utf-8')
|
| 226 |
-
lines = caption_text.split('\n')
|
| 227 |
-
text_lines = []
|
| 228 |
-
for line in lines:
|
| 229 |
-
if not line.strip().isdigit() and not re.match(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}', line) and line.strip():
|
| 230 |
-
text_lines.append(line.strip())
|
| 231 |
-
|
| 232 |
-
return " ".join(text_lines)
|
| 233 |
-
except HttpError as e:
|
| 234 |
-
st.error(f"Error fetching captions with YouTube Data API: {str(e)}")
|
| 235 |
-
return None
|
| 236 |
-
|
| 237 |
# Function to extract subtitles using yt-dlp with cookies
|
| 238 |
def extract_subtitles_with_ytdlp(video_url):
|
| 239 |
ydl_opts = {
|
| 240 |
'writesubtitles': True,
|
| 241 |
'writeautomaticsub': True,
|
| 242 |
-
'subtitleslangs': ['all'],
|
| 243 |
'skip_download': True,
|
| 244 |
'subtitlesformat': 'vtt',
|
| 245 |
'outtmpl': 'subtitle.%(ext)s',
|
| 246 |
'http_headers': {
|
| 247 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
| 248 |
-
'Accept
|
|
|
|
|
|
|
| 249 |
},
|
| 250 |
'cookiefile': 'cookies.txt', # Path to cookies.txt
|
| 251 |
-
'retries':
|
| 252 |
-
'retry_sleep':
|
|
|
|
| 253 |
}
|
| 254 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
| 256 |
info = ydl.extract_info(video_url, download=False)
|
| 257 |
available_subs = info.get('subtitles', {})
|
|
@@ -261,17 +224,22 @@ def extract_subtitles_with_ytdlp(video_url):
|
|
| 261 |
st.text(f"Available subtitles: {list(available_subs.keys())}")
|
| 262 |
st.text(f"Available auto-captions: {list(auto_subs.keys())}")
|
| 263 |
|
| 264 |
-
#
|
| 265 |
subtitle_langs = list(available_subs.keys()) or list(auto_subs.keys())
|
| 266 |
if not subtitle_langs:
|
| 267 |
return None
|
| 268 |
|
| 269 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
ydl.download([video_url])
|
| 271 |
|
| 272 |
# Look for the subtitle file
|
| 273 |
subtitle_file = None
|
| 274 |
-
for lang in
|
| 275 |
possible_file = f"subtitle.{lang}.vtt"
|
| 276 |
if os.path.exists(possible_file):
|
| 277 |
subtitle_file = possible_file
|
|
@@ -341,8 +309,7 @@ if process_url_clicked:
|
|
| 341 |
loader = WebBaseLoader(
|
| 342 |
web_paths=[url.strip()],
|
| 343 |
bs_kwargs={"parse_only": parse_only},
|
| 344 |
-
requests_kwargs={"headers": {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}}
|
| 345 |
-
)
|
| 346 |
data = loader.load()
|
| 347 |
|
| 348 |
if not data or all(len(doc.page_content.strip()) == 0 for doc in data):
|
|
@@ -382,13 +349,12 @@ if process_youtube_clicked:
|
|
| 382 |
st.text("Fetching Closed Captions...Started...✅✅✅")
|
| 383 |
transcript_text = extract_subtitles_with_ytdlp(youtube_url)
|
| 384 |
|
| 385 |
-
# Fallback to YouTube Data API if yt-dlp fails
|
| 386 |
-
if not transcript_text and YOUTUBE_API_KEY:
|
| 387 |
-
st.text("Fetching Captions via YouTube Data API...Started...✅✅✅")
|
| 388 |
-
transcript_text = fetch_youtube_captions_api(video_id, YOUTUBE_API_KEY)
|
| 389 |
-
|
| 390 |
if not transcript_text:
|
| 391 |
-
st.error(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 392 |
st.stop()
|
| 393 |
|
| 394 |
if not transcript_text.strip():
|
|
|
|
| 14 |
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
|
| 15 |
import yt_dlp
|
| 16 |
import re
|
|
|
|
|
|
|
| 17 |
|
| 18 |
# Load environment variables (optional)
|
| 19 |
load_dotenv()
|
| 20 |
|
| 21 |
# Hardcoded Groq API key
|
| 22 |
GROQ_API_KEY = "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"
|
|
|
|
|
|
|
| 23 |
|
| 24 |
# Custom CSS
|
| 25 |
st.markdown("""
|
|
|
|
| 117 |
if "summary" not in st.session_state:
|
| 118 |
st.session_state.summary = None
|
| 119 |
if "qa_chain" not in st.session_state:
|
| 120 |
+
st.session_state.qa_chain = None
|
| 121 |
|
| 122 |
# Initialize embeddings once at the start
|
| 123 |
if "embeddings" not in st.session_state:
|
|
|
|
| 189 |
return " ".join([item['text'] for item in translated_transcript])
|
| 190 |
return None
|
| 191 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
# Function to extract subtitles using yt-dlp with cookies
|
| 193 |
def extract_subtitles_with_ytdlp(video_url):
|
| 194 |
ydl_opts = {
|
| 195 |
'writesubtitles': True,
|
| 196 |
'writeautomaticsub': True,
|
| 197 |
+
'subtitleslangs': ['all', '-live_chat'],
|
| 198 |
'skip_download': True,
|
| 199 |
'subtitlesformat': 'vtt',
|
| 200 |
'outtmpl': 'subtitle.%(ext)s',
|
| 201 |
'http_headers': {
|
| 202 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
| 203 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 204 |
+
'Accept-Language': 'en-US,en;q=0.5',
|
| 205 |
+
'Referer': 'https://www.youtube.com/',
|
| 206 |
},
|
| 207 |
'cookiefile': 'cookies.txt', # Path to cookies.txt
|
| 208 |
+
'retries': 5,
|
| 209 |
+
'retry_sleep': 3,
|
| 210 |
+
'force_generic_extractor': True,
|
| 211 |
}
|
| 212 |
try:
|
| 213 |
+
# Check if cookies.txt exists
|
| 214 |
+
if not os.path.exists('cookies.txt'):
|
| 215 |
+
st.error("cookies.txt file not found. Please upload a valid cookies.txt file to the root directory of your Space. See instructions on how to generate it using a browser extension like 'Export Cookies' for Chrome.")
|
| 216 |
+
return None
|
| 217 |
+
|
| 218 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
| 219 |
info = ydl.extract_info(video_url, download=False)
|
| 220 |
available_subs = info.get('subtitles', {})
|
|
|
|
| 224 |
st.text(f"Available subtitles: {list(available_subs.keys())}")
|
| 225 |
st.text(f"Available auto-captions: {list(auto_subs.keys())}")
|
| 226 |
|
| 227 |
+
# Prioritize English subtitles or auto-captions
|
| 228 |
subtitle_langs = list(available_subs.keys()) or list(auto_subs.keys())
|
| 229 |
if not subtitle_langs:
|
| 230 |
return None
|
| 231 |
|
| 232 |
+
# Filter for English or related languages
|
| 233 |
+
target_langs = [lang for lang in subtitle_langs if lang.startswith('en')]
|
| 234 |
+
if not target_langs:
|
| 235 |
+
target_langs = subtitle_langs # Fallback to any language
|
| 236 |
+
|
| 237 |
+
ydl.params['subtitleslangs'] = target_langs
|
| 238 |
ydl.download([video_url])
|
| 239 |
|
| 240 |
# Look for the subtitle file
|
| 241 |
subtitle_file = None
|
| 242 |
+
for lang in target_langs:
|
| 243 |
possible_file = f"subtitle.{lang}.vtt"
|
| 244 |
if os.path.exists(possible_file):
|
| 245 |
subtitle_file = possible_file
|
|
|
|
| 309 |
loader = WebBaseLoader(
|
| 310 |
web_paths=[url.strip()],
|
| 311 |
bs_kwargs={"parse_only": parse_only},
|
| 312 |
+
requests_kwargs={"headers": {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}})
|
|
|
|
| 313 |
data = loader.load()
|
| 314 |
|
| 315 |
if not data or all(len(doc.page_content.strip()) == 0 for doc in data):
|
|
|
|
| 349 |
st.text("Fetching Closed Captions...Started...✅✅✅")
|
| 350 |
transcript_text = extract_subtitles_with_ytdlp(youtube_url)
|
| 351 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
if not transcript_text:
|
| 353 |
+
st.error(
|
| 354 |
+
"No transcripts or closed captions available in any language. "
|
| 355 |
+
"Please ensure captions are enabled for this video, or try a different video (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ). "
|
| 356 |
+
"If the issue persists, ensure your cookies.txt file is up-to-date."
|
| 357 |
+
)
|
| 358 |
st.stop()
|
| 359 |
|
| 360 |
if not transcript_text.strip():
|