Update app.py
app.py CHANGED
@@ -14,12 +14,16 @@ from bs4 import SoupStrainer
 from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
 import yt_dlp
 import re
+from googleapiclient.discovery import build
+from googleapiclient.errors import HttpError
 
 # Load environment variables (optional)
 load_dotenv()
 
 # Hardcoded Groq API key
 GROQ_API_KEY = "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"
+# YouTube API key (to be set in Hugging Face Spaces secrets)
+YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
 
 # Custom CSS
 st.markdown("""
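
Aside on the key handling in this hunk: the new YOUTUBE_API_KEY is read with os.getenv, while GROQ_API_KEY remains hardcoded. A minimal sketch of the same environment-based pattern applied to both keys, assuming the Groq key is added to the Space's secrets under the hypothetical name GROQ_API_KEY:

    import os
    from dotenv import load_dotenv

    load_dotenv()  # picks up a local .env during development

    # In a Hugging Face Space, both values come from the Secrets settings.
    GROQ_API_KEY = os.getenv("GROQ_API_KEY")
    YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")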
@@ -189,12 +193,49 @@ def fetch_youtube_transcript(video_id):
         return " ".join([item['text'] for item in translated_transcript])
     return None
 
+# Function to fetch captions using YouTube Data API (limited to listing with API key)
+def fetch_youtube_captions_api(video_id, api_key):
+    if not api_key:
+        st.warning("YOUTUBE_API_KEY not set. Skipping YouTube Data API fallback.")
+        return None
+    try:
+        youtube = build('youtube', 'v3', developerKey=api_key)
+        captions = youtube.captions().list(
+            part='snippet',
+            videoId=video_id
+        ).execute()
+
+        caption_id = None
+        for item in captions.get('items', []):
+            if item['snippet']['language'] == 'en':
+                caption_id = item['id']
+                break
+            elif item['snippet']['language'] in ['en-US', 'en-GB']:
+                caption_id = item['id']
+                break
+
+        if not caption_id:
+            st.warning("No English captions found via YouTube Data API.")
+            return None
+
+        # Note: Downloading captions requires OAuth 2.0 authentication
+        st.warning(
+            "Downloading captions requires OAuth 2.0 authentication, which is not supported in this environment. "
+            "English captions are available but cannot be fetched with an API key alone. "
+            "Consider setting up OAuth 2.0 for full functionality (see documentation)."
+        )
+        return None
+
+    except HttpError as e:
+        st.error(f"Error fetching captions with YouTube Data API: {str(e)}")
+        return None
+
 # Function to extract subtitles using yt-dlp with cookies
 def extract_subtitles_with_ytdlp(video_url):
     ydl_opts = {
         'writesubtitles': True,
         'writeautomaticsub': True,
-        'subtitleslangs': ['all', '-live_chat'],
+        'subtitleslangs': ['all', '-live_chat'],
         'skip_download': True,
         'subtitlesformat': 'vtt',
         'outtmpl': 'subtitle.%(ext)s',
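
As the warnings inside fetch_youtube_captions_api say, captions.list works with a plain API key but captions.download requires OAuth 2.0. For orientation only, a sketch of what the authorized download step could look like, assuming OAuth credentials have already been obtained (e.g., via google-auth-oauthlib) for the channel that owns the video:

    from googleapiclient.discovery import build

    def download_caption(credentials, caption_id):
        # captions().download rejects API-key-only access; it needs
        # OAuth 2.0 credentials authorized for the video's channel.
        youtube = build('youtube', 'v3', credentials=credentials)
        request = youtube.captions().download(id=caption_id, tfmt='srt')
        return request.execute()  # raw subtitle bytes in SRT format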
@@ -205,17 +246,16 @@ def extract_subtitles_with_ytdlp(video_url):
             'Referer': 'https://www.youtube.com/',
             'Origin': 'https://www.youtube.com',
         },
-        'cookiefile': 'cookies.txt',
+        'cookiefile': 'cookies.txt',
         'retries': 10,
         'retry_sleep': 5,
-        'no_check_certificate': True,
-        'geo_bypass': True,
+        'no_check_certificate': True,
+        'geo_bypass': True,
         'force_generic_extractor': True,
-        'quiet': False,
-        'verbose': True,
+        'quiet': False,
+        'verbose': True,
     }
     try:
-        # Check if cookies.txt exists
         if not os.path.exists('cookies.txt'):
             st.error(
                 "cookies.txt file not found. Please upload a valid cookies.txt file to the root directory of your Space. "
@@ -228,17 +268,14 @@ def extract_subtitles_with_ytdlp(video_url):
             return None
 
         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-            # Redirect yt-dlp output to Streamlit for debugging
             ydl.params['logger'] = st
             info = ydl.extract_info(video_url, download=False)
             available_subs = info.get('subtitles', {})
             auto_subs = info.get('automatic_captions', {})
 
-            # Log available subtitles for debugging
             st.text(f"Available subtitles: {list(available_subs.keys())}")
             st.text(f"Available auto-captions: {list(auto_subs.keys())}")
 
-            # Use any available language if English isn't available
             subtitle_langs = list(available_subs.keys()) or list(auto_subs.keys())
             if not subtitle_langs:
                 return None
@@ -246,7 +283,6 @@ def extract_subtitles_with_ytdlp(video_url):
             ydl.params['subtitleslangs'] = subtitle_langs
             ydl.download([video_url])
 
-            # Look for the subtitle file
             subtitle_file = None
             for lang in subtitle_langs:
                 possible_file = f"subtitle.{lang}.vtt"
@@ -257,14 +293,11 @@ def extract_subtitles_with_ytdlp(video_url):
             if not subtitle_file:
                 return None
 
-            # Read and parse the subtitle file
             with open(subtitle_file, 'r', encoding='utf-8') as f:
                 subtitle_text = f.read()
 
-            # Clean up the subtitle file
             os.remove(subtitle_file)
 
-            # Extract text from VTT format, removing timestamps and metadata
             lines = subtitle_text.split('\n')
             text_lines = []
             for line in lines:
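
The loop that begins at the end of this hunk filters the raw VTT text down to caption lines; its body falls outside the diff context. A self-contained sketch of that kind of filter, with the skip conditions being assumptions about what the loop does:

    import re

    def vtt_to_plain_text(subtitle_text):
        # Keep caption text only: drop the WEBVTT header, cue timing lines
        # ("00:00:01.000 --> 00:00:04.000"), cue numbers, and inline tags.
        text_lines = []
        for line in subtitle_text.split('\n'):
            line = line.strip()
            if not line or line.startswith('WEBVTT') or '-->' in line or line.isdigit():
                continue
            text_lines.append(re.sub(r'<[^>]+>', '', line))
        return ' '.join(text_lines)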
@@ -283,12 +316,11 @@ def process_content(text, embeddings, source):
         chunk_overlap=200,
         separators=["\n\n", "\n", ".", " ", ""]
     )
-    # Create documents with source metadata
     docs = text_splitter.create_documents([text], metadatas=[{"source": source}])
-    # Debug: Check metadata of the first document
     if docs:
         st.text(f"Document metadata: {docs[0].metadata}")
     vectorstore = FAISS.from_documents(docs, embeddings)
+    st.text(f"Vector store created with {len(docs)} documents.")
     return vectorstore
 
 # Function to create QA chain
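
For context, process_content boils down to: split the text into overlapping chunks, attach the source to each chunk's metadata, and index the chunks. A standalone sketch of that pattern (import paths vary by LangChain version; chunk_size and the embedding model are assumptions, since the diff only shows chunk_overlap and the separators):

    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_community.vectorstores import FAISS
    from langchain_community.embeddings import HuggingFaceEmbeddings

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", ".", " ", ""],
    )
    docs = splitter.create_documents([text], metadatas=[{"source": source}])
    vectorstore = FAISS.from_documents(docs, HuggingFaceEmbeddings())
    # Every retrieved chunk now carries its originating URL:
    hits = vectorstore.similarity_search("some question", k=4)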
@@ -300,9 +332,10 @@ def create_qa_chain(vectorstore, llm):
         chain_type="stuff",
         chain_type_kwargs={
             "prompt": qa_prompt,
-            "document_variable_name": "context"
+            "document_variable_name": "context"
         }
     )
+    st.text("QA chain created successfully.")
     return qa_chain
 
 # Process Web URL
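
The create_qa_chain hunk wires a stuff-type RetrievalQA chain (LangChain's legacy RetrievalQA API) to the prompt. A sketch of how the pieces fit together end to end; the prompt template and retriever settings are assumptions, since the diff shows only the chain_type_kwargs:

    from langchain.chains import RetrievalQA
    from langchain.prompts import PromptTemplate

    qa_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="Answer using only this context:\n{context}\n\nQuestion: {question}",
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # stuff all retrieved chunks into a single prompt
        retriever=vectorstore.as_retriever(),
        chain_type_kwargs={"prompt": qa_prompt, "document_variable_name": "context"},
    )
    answer = qa_chain.run("What is this page about?")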
@@ -325,13 +358,11 @@ if process_url_clicked:
             st.error("No content loaded from URL. Try a different URL (e.g., https://www.bbc.com/news/science-environment-67299122).")
             st.stop()
 
-        # Store content for summarization
         st.session_state.url_content = "\n".join([doc.page_content for doc in data])
         embeddings = st.session_state.embeddings
-        # Pass the URL as the source metadata
         st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
         st.session_state.index_created = True
-        st.session_state.qa_chain = None
+        st.session_state.qa_chain = None
         st.text("Content processed successfully! ✅✅✅")
     except Exception as e:
         st.error(f"Error processing URL: {str(e)}")
@@ -358,6 +389,10 @@ if process_youtube_clicked:
             st.text("Fetching Closed Captions...Started...✅✅✅")
             transcript_text = extract_subtitles_with_ytdlp(youtube_url)
 
+            if not transcript_text and YOUTUBE_API_KEY:
+                st.text("Fetching Captions via YouTube Data API...Started...✅✅✅")
+                transcript_text = fetch_youtube_captions_api(video_id, YOUTUBE_API_KEY)
+
             if not transcript_text:
                 st.error(
                     "No transcripts or closed captions available. "
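
The fallback added here passes video_id, which the surrounding code must already have extracted from youtube_url. A sketch of a typical extraction; the regex is an assumption, since the extraction code sits outside this diff:

    import re

    def get_video_id(youtube_url):
        # Handles watch?v=..., youtu.be/..., and embed/... URL shapes.
        match = re.search(r'(?:v=|youtu\.be/|embed/)([A-Za-z0-9_-]{11})', youtube_url)
        return match.group(1) if match else None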
@@ -376,13 +411,11 @@ if process_youtube_clicked:
             st.error("Transcript or captions are empty. Try a different video.")
             st.stop()
 
-        # Process the transcript
         st.session_state.url_content = transcript_text
         embeddings = st.session_state.embeddings
-        # Pass the YouTube URL as the source metadata
         st.session_state.vectorstore = process_content(transcript_text, embeddings, source=youtube_url.strip())
         st.session_state.index_created = True
-        st.session_state.qa_chain = None
+        st.session_state.qa_chain = None
         st.text("YouTube video processed successfully! ✅✅✅")
     except Exception as e:
         st.error(f"Error processing YouTube video: {str(e)}")