Spaces:

MahatirTusher
/

WebChatter

Sleeping

App Files Files Community

MahatirTusher committed on Apr 23, 2025

Commit

74d84fb

verified ·

1 Parent(s): ea40c8d

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -61

app.py CHANGED Viewed

@@ -14,16 +14,12 @@ from bs4 import SoupStrainer
 from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
 import yt_dlp
 import re
-from googleapiclient.discovery import build
-from googleapiclient.errors import HttpError
 # Load environment variables (optional)
 load_dotenv()
 # Hardcoded Groq API key
 GROQ_API_KEY = "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"
-# YouTube API key (to be set in Hugging Face Spaces secrets)
-YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
 # Custom CSS
 st.markdown("""
@@ -121,7 +117,7 @@ if "vectorstore" not in st.session_state:
 if "summary" not in st.session_state:
     st.session_state.summary = None
 if "qa_chain" not in st.session_state:
-    st.session_state.qa_chain = None  # Clear any cached QA chain
 # Initialize embeddings once at the start
 if "embeddings" not in st.session_state:
@@ -193,65 +189,32 @@ def fetch_youtube_transcript(video_id):
                 return " ".join([item['text'] for item in translated_transcript])
         return None
-# Function to fetch captions using YouTube Data API
-def fetch_youtube_captions_api(video_id, api_key):
-    if not api_key:
-        return None
-    try:
-        youtube = build('youtube', 'v3', developerKey=api_key)
-        captions = youtube.captions().list(
-            part='snippet',
-            videoId=video_id
-        ).execute()
-        caption_id = None
-        for item in captions.get('items', []):
-            if item['snippet']['language'] == 'en':
-                caption_id = item['id']
-                break
-            elif item['snippet']['language'] in ['en-US', 'en-GB']:
-                caption_id = item['id']
-                break
-        if not caption_id:
-            return None
-        caption_content = youtube.captions().download(
-            id=caption_id,
-            tfmt='srt'
-        ).execute()
-        # Parse SRT content
-        caption_text = caption_content.decode('utf-8')
-        lines = caption_text.split('\n')
-        text_lines = []
-        for line in lines:
-            if not line.strip().isdigit() and not re.match(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}', line) and line.strip():
-                text_lines.append(line.strip())
-        return " ".join(text_lines)
-    except HttpError as e:
-        st.error(f"Error fetching captions with YouTube Data API: {str(e)}")
-        return None
 # Function to extract subtitles using yt-dlp with cookies
 def extract_subtitles_with_ytdlp(video_url):
     ydl_opts = {
         'writesubtitles': True,
         'writeautomaticsub': True,
-        'subtitleslangs': ['all'],
         'skip_download': True,
         'subtitlesformat': 'vtt',
         'outtmpl': 'subtitle.%(ext)s',
         'http_headers': {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-            'Accept-Language': 'en-US,en;q=0.9',
         },
         'cookiefile': 'cookies.txt',  # Path to cookies.txt
-        'retries': 3,
-        'retry_sleep': 5,
     }
     try:
         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
             info = ydl.extract_info(video_url, download=False)
             available_subs = info.get('subtitles', {})
@@ -261,17 +224,22 @@ def extract_subtitles_with_ytdlp(video_url):
             st.text(f"Available subtitles: {list(available_subs.keys())}")
             st.text(f"Available auto-captions: {list(auto_subs.keys())}")
-            # Download the first available subtitle or auto-caption
             subtitle_langs = list(available_subs.keys()) or list(auto_subs.keys())
             if not subtitle_langs:
                 return None
-            ydl.params['subtitleslangs'] = subtitle_langs
             ydl.download([video_url])
         # Look for the subtitle file
         subtitle_file = None
-        for lang in subtitle_langs:
             possible_file = f"subtitle.{lang}.vtt"
             if os.path.exists(possible_file):
                 subtitle_file = possible_file
@@ -341,8 +309,7 @@ if process_url_clicked:
                     loader = WebBaseLoader(
                         web_paths=[url.strip()],
                         bs_kwargs={"parse_only": parse_only},
-                        requests_kwargs={"headers": {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}}
-                    )
                     data = loader.load()
                     if not data or all(len(doc.page_content.strip()) == 0 for doc in data):
@@ -382,13 +349,12 @@ if process_youtube_clicked:
                         st.text("Fetching Closed Captions...Started...✅✅✅")
                         transcript_text = extract_subtitles_with_ytdlp(youtube_url)
-                        # Fallback to YouTube Data API if yt-dlp fails
-                        if not transcript_text and YOUTUBE_API_KEY:
-                            st.text("Fetching Captions via YouTube Data API...Started...✅✅✅")
-                            transcript_text = fetch_youtube_captions_api(video_id, YOUTUBE_API_KEY)
                         if not transcript_text:
-                            st.error("No transcripts or closed captions available in any language. Please try a different video, or ensure captions are enabled for this video.")
                             st.stop()
                     if not transcript_text.strip():

 from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
 import yt_dlp
 import re
 # Load environment variables (optional)
 load_dotenv()
 # Hardcoded Groq API key
 GROQ_API_KEY = "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"
 # Custom CSS
 st.markdown("""
 if "summary" not in st.session_state:
     st.session_state.summary = None
 if "qa_chain" not in st.session_state:
+    st.session_state.qa_chain = None
 # Initialize embeddings once at the start
 if "embeddings" not in st.session_state:
                 return " ".join([item['text'] for item in translated_transcript])
         return None
 # Function to extract subtitles using yt-dlp with cookies
 def extract_subtitles_with_ytdlp(video_url):
     ydl_opts = {
         'writesubtitles': True,
         'writeautomaticsub': True,
+        'subtitleslangs': ['all', '-live_chat'],
         'skip_download': True,
         'subtitlesformat': 'vtt',
         'outtmpl': 'subtitle.%(ext)s',
         'http_headers': {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Referer': 'https://www.youtube.com/',
         },
         'cookiefile': 'cookies.txt',  # Path to cookies.txt
+        'retries': 5,
+        'retry_sleep': 3,
+        'force_generic_extractor': True,
     }
     try:
+        # Check if cookies.txt exists
+        if not os.path.exists('cookies.txt'):
+            st.error("cookies.txt file not found. Please upload a valid cookies.txt file to the root directory of your Space. See instructions on how to generate it using a browser extension like 'Export Cookies' for Chrome.")
+            return None
         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
             info = ydl.extract_info(video_url, download=False)
             available_subs = info.get('subtitles', {})
             st.text(f"Available subtitles: {list(available_subs.keys())}")
             st.text(f"Available auto-captions: {list(auto_subs.keys())}")
+            # Prioritize English subtitles or auto-captions
             subtitle_langs = list(available_subs.keys()) or list(auto_subs.keys())
             if not subtitle_langs:
                 return None
+            # Filter for English or related languages
+            target_langs = [lang for lang in subtitle_langs if lang.startswith('en')]
+            if not target_langs:
+                target_langs = subtitle_langs  # Fallback to any language
+            ydl.params['subtitleslangs'] = target_langs
             ydl.download([video_url])
         # Look for the subtitle file
         subtitle_file = None
+        for lang in target_langs:
             possible_file = f"subtitle.{lang}.vtt"
             if os.path.exists(possible_file):
                 subtitle_file = possible_file
                     loader = WebBaseLoader(
                         web_paths=[url.strip()],
                         bs_kwargs={"parse_only": parse_only},
+                        requests_kwargs={"headers": {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}})
                     data = loader.load()
                     if not data or all(len(doc.page_content.strip()) == 0 for doc in data):
                         st.text("Fetching Closed Captions...Started...✅✅✅")
                         transcript_text = extract_subtitles_with_ytdlp(youtube_url)
                         if not transcript_text:
+                            st.error(
+                                "No transcripts or closed captions available in any language. "
+                                "Please ensure captions are enabled for this video, or try a different video (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ). "
+                                "If the issue persists, ensure your cookies.txt file is up-to-date."
+                            )
                             st.stop()
                     if not transcript_text.strip():