Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,6 +6,7 @@ from langchain_community.vectorstores.faiss import FAISS
|
|
| 6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
from langchain_core.documents import Document
|
| 8 |
import os
|
|
|
|
| 9 |
from langchain_groq import ChatGroq
|
| 10 |
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
|
| 11 |
from langchain.prompts import PromptTemplate
|
|
@@ -15,15 +16,22 @@ import yt_dlp
|
|
| 15 |
import re
|
| 16 |
from googleapiclient.discovery import build
|
| 17 |
from googleapiclient.errors import HttpError
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
# Load environment variables (optional)
|
| 20 |
load_dotenv()
|
| 21 |
|
| 22 |
# Hardcoded Groq API key
|
| 23 |
GROQ_API_KEY = "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"
|
| 24 |
-
# YouTube API key (to be set in Hugging Face Spaces secrets)
|
| 25 |
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
# Custom CSS
|
| 28 |
st.markdown("""
|
| 29 |
<style>
|
|
@@ -207,10 +215,89 @@ def fetch_youtube_transcript(video_id):
|
|
| 207 |
st.error(f"Error fetching transcript with youtube-transcript-api: {str(e)}")
|
| 208 |
return None
|
| 209 |
|
| 210 |
-
# Function to
|
| 211 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
if not api_key:
|
| 213 |
-
st.warning("YOUTUBE_API_KEY not set. Skipping YouTube Data API fallback.")
|
| 214 |
return None
|
| 215 |
try:
|
| 216 |
youtube = build('youtube', 'v3', developerKey=api_key)
|
|
@@ -237,13 +324,13 @@ def fetch_youtube_captions_api(video_id, api_key):
|
|
| 237 |
"English captions are available for this video but cannot be fetched with an API key alone. "
|
| 238 |
"Downloading captions requires OAuth 2.0 authentication, which is not supported in Hugging Face Spaces without user interaction. "
|
| 239 |
"To fetch captions:\n"
|
| 240 |
-
"-
|
| 241 |
"- Or try a video with transcripts available (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ)."
|
| 242 |
)
|
| 243 |
return None
|
| 244 |
|
| 245 |
except HttpError as e:
|
| 246 |
-
st.error(f"Error fetching captions with YouTube Data API: {str(e)}")
|
| 247 |
return None
|
| 248 |
|
| 249 |
# Function to extract subtitles using yt-dlp with cookies
|
|
@@ -400,10 +487,10 @@ if process_youtube_clicked:
|
|
| 400 |
|
| 401 |
if not transcript_text:
|
| 402 |
st.warning("Transcripts are disabled or unavailable. Attempting to fetch closed captions...")
|
| 403 |
-
st.text("Fetching Closed Captions...Started...β
β
β
")
|
| 404 |
transcript_text = extract_subtitles_with_ytdlp(youtube_url)
|
| 405 |
|
| 406 |
-
if not transcript_text
|
| 407 |
st.text("Fetching Captions via YouTube Data API...Started...β
β
β
")
|
| 408 |
transcript_text = fetch_youtube_captions_api(video_id, YOUTUBE_API_KEY)
|
| 409 |
|
|
@@ -416,7 +503,7 @@ if process_youtube_clicked:
|
|
| 416 |
"Solutions:\n"
|
| 417 |
"- Ensure captions are enabled for the video by checking the video settings on YouTube (gear icon > Subtitles/CC > Enable if available).\n"
|
| 418 |
"- Regenerate and upload a fresh cookies.txt file (see instructions above).\n"
|
| 419 |
-
"-
|
| 420 |
"- Try a different video (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ, which has transcripts available).\n"
|
| 421 |
"- Test locally to rule out Hugging Face Spaces IP restrictions by running: pip install -r requirements.txt && streamlit run app.py"
|
| 422 |
)
|
|
|
|
| 6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
from langchain_core.documents import Document
|
| 8 |
import os
|
| 9 |
+
import json
|
| 10 |
from langchain_groq import ChatGroq
|
| 11 |
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
|
| 12 |
from langchain.prompts import PromptTemplate
|
|
|
|
| 16 |
import re
|
| 17 |
from googleapiclient.discovery import build
|
| 18 |
from googleapiclient.errors import HttpError
|
| 19 |
+
from google_auth_oauthlib.flow import InstalledAppFlow
|
| 20 |
+
from google.auth.transport.requests import Request
|
| 21 |
+
from google.oauth2.credentials import Credentials
|
| 22 |
|
| 23 |
# Load environment variables (optional)
|
| 24 |
load_dotenv()
|
| 25 |
|
| 26 |
# Hardcoded Groq API key
|
| 27 |
# Groq API key: prefer the environment / Spaces secret, falling back to the
# legacy hardcoded value so existing deployments keep working.
# SECURITY(review): this key has been committed to a public repo — rotate it
# at https://console.groq.com/ and set GROQ_API_KEY as a Spaces secret.
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "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")

# YouTube API key (to be set in Hugging Face Spaces secrets, optional if using OAuth)
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")

# Paths used to store / load OAuth 2.0 credentials for the YouTube Data API
CREDENTIALS_FILE = "youtube_credentials.json"
CLIENT_SECRETS_FILE = "client_secrets.json"
|
| 34 |
+
|
| 35 |
# Custom CSS
|
| 36 |
st.markdown("""
|
| 37 |
<style>
|
|
|
|
| 215 |
st.error(f"Error fetching transcript with youtube-transcript-api: {str(e)}")
|
| 216 |
return None
|
| 217 |
|
| 218 |
+
# Function to get YouTube API credentials
def get_youtube_credentials():
    """Load, refresh, or interactively create OAuth 2.0 credentials for the
    YouTube Data API.

    Lookup order:
      1. Stored credentials in ``CREDENTIALS_FILE`` (refreshed and re-saved
         if expired).
      2. A fresh local-server OAuth flow driven by ``CLIENT_SECRETS_FILE``
         (only works locally — it cannot open a browser in Hugging Face
         Spaces).

    Returns:
        Credentials | None: valid credentials, or None when neither stored
        credentials nor client secrets are usable (a Streamlit warning with
        setup instructions is shown in that case).
    """
    scopes = ['https://www.googleapis.com/auth/youtube.force-ssl']
    creds = None
    if os.path.exists(CREDENTIALS_FILE):
        creds = Credentials.from_authorized_user_file(CREDENTIALS_FILE, scopes=scopes)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            try:
                creds.refresh(Request())
                # Persist the refreshed token so the next run does not need
                # to refresh (the original code dropped the new token).
                with open(CREDENTIALS_FILE, 'w') as token_file:
                    token_file.write(creds.to_json())
            except Exception as refresh_error:
                # A revoked/invalid refresh token raises here; fall through to
                # a full re-authentication instead of crashing the app.
                st.warning(f"Could not refresh stored YouTube credentials: {refresh_error}")
                creds = None
        if not creds or not creds.valid:
            if os.path.exists(CLIENT_SECRETS_FILE):
                st.warning("Attempting to authenticate with YouTube Data API. This may not work in Hugging Face Spaces due to redirect URI limitations.")
                flow = InstalledAppFlow.from_client_secrets_file(
                    CLIENT_SECRETS_FILE,
                    scopes=scopes
                )
                # This will fail in Hugging Face Spaces because it can't open a browser
                creds = flow.run_local_server(port=0)
                with open(CREDENTIALS_FILE, 'w') as token_file:
                    token_file.write(creds.to_json())
            else:
                st.warning(
                    f"{CLIENT_SECRETS_FILE} not found. To use OAuth 2.0 for YouTube Data API:\n"
                    "1. Go to https://console.developers.google.com/.\n"
                    "2. Create a project, enable YouTube Data API v3, and create OAuth 2.0 credentials.\n"
                    "3. Download the credentials as 'client_secrets.json'.\n"
                    "4. Run the app locally: pip install -r requirements.txt && streamlit run app.py\n"
                    "5. Authenticate via the browser prompt to generate youtube_credentials.json.\n"
                    "6. Upload youtube_credentials.json to your Hugging Face Space via the Files tab."
                )
                return None

    return creds
|
| 251 |
+
|
| 252 |
+
# Function to fetch captions using YouTube Data API (with OAuth 2.0 or API key fallback)
|
| 253 |
+
def fetch_youtube_captions_api(video_id, api_key=None):
|
| 254 |
+
# First, try OAuth 2.0 if credentials are available
|
| 255 |
+
creds = get_youtube_credentials()
|
| 256 |
+
if creds:
|
| 257 |
+
try:
|
| 258 |
+
youtube = build('youtube', 'v3', credentials=creds)
|
| 259 |
+
captions = youtube.captions().list(
|
| 260 |
+
part='snippet',
|
| 261 |
+
videoId=video_id
|
| 262 |
+
).execute()
|
| 263 |
+
|
| 264 |
+
caption_id = None
|
| 265 |
+
for item in captions.get('items', []):
|
| 266 |
+
if item['snippet']['language'] == 'en':
|
| 267 |
+
caption_id = item['id']
|
| 268 |
+
break
|
| 269 |
+
elif item['snippet']['language'] in ['en-US', 'en-GB']:
|
| 270 |
+
caption_id = item['id']
|
| 271 |
+
break
|
| 272 |
+
|
| 273 |
+
if not caption_id:
|
| 274 |
+
st.warning("No English captions found via YouTube Data API.")
|
| 275 |
+
return None
|
| 276 |
+
|
| 277 |
+
# Download captions using OAuth 2.0 credentials
|
| 278 |
+
caption_content = youtube.captions().download(
|
| 279 |
+
id=caption_id,
|
| 280 |
+
tfmt='srt'
|
| 281 |
+
).execute()
|
| 282 |
+
|
| 283 |
+
# The response is a binary string, decode it
|
| 284 |
+
caption_text = caption_content.decode('utf-8')
|
| 285 |
+
# Parse SRT format to extract text
|
| 286 |
+
lines = caption_text.split('\n')
|
| 287 |
+
text_lines = []
|
| 288 |
+
for line in lines:
|
| 289 |
+
if line.strip() and not line.isdigit() and not re.match(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}', line):
|
| 290 |
+
text_lines.append(line.strip())
|
| 291 |
+
|
| 292 |
+
return " ".join(text_lines)
|
| 293 |
+
|
| 294 |
+
except HttpError as e:
|
| 295 |
+
st.error(f"Error fetching captions with YouTube Data API (OAuth 2.0): {str(e)}")
|
| 296 |
+
return None
|
| 297 |
+
|
| 298 |
+
# Fallback to API key if OAuth fails or credentials are not available
|
| 299 |
if not api_key:
|
| 300 |
+
st.warning("YOUTUBE_API_KEY not set and OAuth 2.0 credentials not available. Skipping YouTube Data API fallback.")
|
| 301 |
return None
|
| 302 |
try:
|
| 303 |
youtube = build('youtube', 'v3', developerKey=api_key)
|
|
|
|
| 324 |
"English captions are available for this video but cannot be fetched with an API key alone. "
|
| 325 |
"Downloading captions requires OAuth 2.0 authentication, which is not supported in Hugging Face Spaces without user interaction. "
|
| 326 |
"To fetch captions:\n"
|
| 327 |
+
"- Follow the instructions above to generate youtube_credentials.json locally and upload it.\n"
|
| 328 |
"- Or try a video with transcripts available (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ)."
|
| 329 |
)
|
| 330 |
return None
|
| 331 |
|
| 332 |
except HttpError as e:
|
| 333 |
+
st.error(f"Error fetching captions with YouTube Data API (API Key): {str(e)}")
|
| 334 |
return None
|
| 335 |
|
| 336 |
# Function to extract subtitles using yt-dlp with cookies
|
|
|
|
| 487 |
|
| 488 |
if not transcript_text:
|
| 489 |
st.warning("Transcripts are disabled or unavailable. Attempting to fetch closed captions...")
|
| 490 |
+
st.text("Fetching Closed Captions with yt-dlp...Started...β
β
β
")
|
| 491 |
transcript_text = extract_subtitles_with_ytdlp(youtube_url)
|
| 492 |
|
| 493 |
+
if not transcript_text:
|
| 494 |
st.text("Fetching Captions via YouTube Data API...Started...β
β
β
")
|
| 495 |
transcript_text = fetch_youtube_captions_api(video_id, YOUTUBE_API_KEY)
|
| 496 |
|
|
|
|
| 503 |
"Solutions:\n"
|
| 504 |
"- Ensure captions are enabled for the video by checking the video settings on YouTube (gear icon > Subtitles/CC > Enable if available).\n"
|
| 505 |
"- Regenerate and upload a fresh cookies.txt file (see instructions above).\n"
|
| 506 |
+
"- Set up OAuth 2.0 credentials by following the instructions above to download captions directly.\n"
|
| 507 |
"- Try a different video (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ, which has transcripts available).\n"
|
| 508 |
"- Test locally to rule out Hugging Face Spaces IP restrictions by running: pip install -r requirements.txt && streamlit run app.py"
|
| 509 |
)
|