Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,6 +6,7 @@ from langchain_community.vectorstores.faiss import FAISS
|
|
| 6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
from langchain_core.documents import Document
|
| 8 |
import os
|
|
|
|
| 9 |
from langchain_groq import ChatGroq
|
| 10 |
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
|
| 11 |
from langchain.prompts import PromptTemplate
|
|
@@ -15,15 +16,22 @@ import yt_dlp
|
|
| 15 |
import re
|
| 16 |
from googleapiclient.discovery import build
|
| 17 |
from googleapiclient.errors import HttpError
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
# Load environment variables (optional)
|
| 20 |
load_dotenv()
|
| 21 |
|
| 22 |
# Hardcoded Groq API key
|
| 23 |
GROQ_API_KEY = "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"
|
| 24 |
-
# YouTube API key (to be set in Hugging Face Spaces secrets)
|
| 25 |
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
# Custom CSS
|
| 28 |
st.markdown("""
|
| 29 |
<style>
|
|
@@ -207,10 +215,89 @@ def fetch_youtube_transcript(video_id):
|
|
| 207 |
st.error(f"Error fetching transcript with youtube-transcript-api: {str(e)}")
|
| 208 |
return None
|
| 209 |
|
| 210 |
-
# Function to
|
| 211 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
if not api_key:
|
| 213 |
-
st.warning("YOUTUBE_API_KEY not set. Skipping YouTube Data API fallback.")
|
| 214 |
return None
|
| 215 |
try:
|
| 216 |
youtube = build('youtube', 'v3', developerKey=api_key)
|
|
@@ -237,13 +324,13 @@ def fetch_youtube_captions_api(video_id, api_key):
|
|
| 237 |
"English captions are available for this video but cannot be fetched with an API key alone. "
|
| 238 |
"Downloading captions requires OAuth 2.0 authentication, which is not supported in Hugging Face Spaces without user interaction. "
|
| 239 |
"To fetch captions:\n"
|
| 240 |
-
"-
|
| 241 |
"- Or try a video with transcripts available (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ)."
|
| 242 |
)
|
| 243 |
return None
|
| 244 |
|
| 245 |
except HttpError as e:
|
| 246 |
-
st.error(f"Error fetching captions with YouTube Data API: {str(e)}")
|
| 247 |
return None
|
| 248 |
|
| 249 |
# Function to extract subtitles using yt-dlp with cookies
|
|
@@ -400,10 +487,10 @@ if process_youtube_clicked:
|
|
| 400 |
|
| 401 |
if not transcript_text:
|
| 402 |
st.warning("Transcripts are disabled or unavailable. Attempting to fetch closed captions...")
|
| 403 |
-
st.text("Fetching Closed Captions...Started...β
β
β
")
|
| 404 |
transcript_text = extract_subtitles_with_ytdlp(youtube_url)
|
| 405 |
|
| 406 |
-
if not transcript_text
|
| 407 |
st.text("Fetching Captions via YouTube Data API...Started...β
β
β
")
|
| 408 |
transcript_text = fetch_youtube_captions_api(video_id, YOUTUBE_API_KEY)
|
| 409 |
|
|
@@ -416,7 +503,7 @@ if process_youtube_clicked:
|
|
| 416 |
"Solutions:\n"
|
| 417 |
"- Ensure captions are enabled for the video by checking the video settings on YouTube (gear icon > Subtitles/CC > Enable if available).\n"
|
| 418 |
"- Regenerate and upload a fresh cookies.txt file (see instructions above).\n"
|
| 419 |
-
"-
|
| 420 |
"- Try a different video (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ, which has transcripts available).\n"
|
| 421 |
"- Test locally to rule out Hugging Face Spaces IP restrictions by running: pip install -r requirements.txt && streamlit run app.py"
|
| 422 |
)
|
|
|
|
| 6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
from langchain_core.documents import Document
|
| 8 |
import os
|
| 9 |
+
import json
|
| 10 |
from langchain_groq import ChatGroq
|
| 11 |
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
|
| 12 |
from langchain.prompts import PromptTemplate
|
|
|
|
| 16 |
import re
|
| 17 |
from googleapiclient.discovery import build
|
| 18 |
from googleapiclient.errors import HttpError
|
| 19 |
+
from google_auth_oauthlib.flow import InstalledAppFlow
|
| 20 |
+
from google.auth.transport.requests import Request
|
| 21 |
+
from google.oauth2.credentials import Credentials
|
| 22 |
|
| 23 |
# Load environment variables (optional)
|
| 24 |
load_dotenv()
|
| 25 |
|
| 26 |
# Hardcoded Groq API key
|
| 27 |
# Groq API key: prefer the environment / Spaces secret, falling back to the
# legacy hardcoded value so existing deployments keep working.
# SECURITY(review): this key has been committed to a public repo — rotate it
# at https://console.groq.com/ and set GROQ_API_KEY as a Spaces secret.
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "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")

# YouTube API key (to be set in Hugging Face Spaces secrets, optional if using OAuth)
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")

# Paths used to store / load OAuth 2.0 credentials for the YouTube Data API
CREDENTIALS_FILE = "youtube_credentials.json"
CLIENT_SECRETS_FILE = "client_secrets.json"
|
| 34 |
+
|
| 35 |
# Custom CSS
|
| 36 |
st.markdown("""
|
| 37 |
<style>
|
|
|
|
| 215 |
st.error(f"Error fetching transcript with youtube-transcript-api: {str(e)}")
|
| 216 |
return None
|
| 217 |
|
| 218 |
+
# Function to get YouTube API credentials
def get_youtube_credentials():
    """Load, refresh, or interactively create OAuth 2.0 credentials for the
    YouTube Data API.

    Lookup order:
      1. Stored credentials in ``CREDENTIALS_FILE`` (refreshed and re-saved
         if expired).
      2. A fresh local-server OAuth flow driven by ``CLIENT_SECRETS_FILE``
         (only works locally — it cannot open a browser in Hugging Face
         Spaces).

    Returns:
        Credentials | None: valid credentials, or None when neither stored
        credentials nor client secrets are usable (a Streamlit warning with
        setup instructions is shown in that case).
    """
    scopes = ['https://www.googleapis.com/auth/youtube.force-ssl']
    creds = None
    if os.path.exists(CREDENTIALS_FILE):
        creds = Credentials.from_authorized_user_file(CREDENTIALS_FILE, scopes=scopes)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            try:
                creds.refresh(Request())
                # Persist the refreshed token so the next run does not need
                # to refresh (the original code dropped the new token).
                with open(CREDENTIALS_FILE, 'w') as token_file:
                    token_file.write(creds.to_json())
            except Exception as refresh_error:
                # A revoked/invalid refresh token raises here; fall through to
                # a full re-authentication instead of crashing the app.
                st.warning(f"Could not refresh stored YouTube credentials: {refresh_error}")
                creds = None
        if not creds or not creds.valid:
            if os.path.exists(CLIENT_SECRETS_FILE):
                st.warning("Attempting to authenticate with YouTube Data API. This may not work in Hugging Face Spaces due to redirect URI limitations.")
                flow = InstalledAppFlow.from_client_secrets_file(
                    CLIENT_SECRETS_FILE,
                    scopes=scopes
                )
                # This will fail in Hugging Face Spaces because it can't open a browser
                creds = flow.run_local_server(port=0)
                with open(CREDENTIALS_FILE, 'w') as token_file:
                    token_file.write(creds.to_json())
            else:
                st.warning(
                    f"{CLIENT_SECRETS_FILE} not found. To use OAuth 2.0 for YouTube Data API:\n"
                    "1. Go to https://console.developers.google.com/.\n"
                    "2. Create a project, enable YouTube Data API v3, and create OAuth 2.0 credentials.\n"
                    "3. Download the credentials as 'client_secrets.json'.\n"
                    "4. Run the app locally: pip install -r requirements.txt && streamlit run app.py\n"
                    "5. Authenticate via the browser prompt to generate youtube_credentials.json.\n"
                    "6. Upload youtube_credentials.json to your Hugging Face Space via the Files tab."
                )
                return None

    return creds
|
| 251 |
+
|
| 252 |
+
# Function to fetch captions using YouTube Data API (with OAuth 2.0 or API key fallback)
|
| 253 |
+
def fetch_youtube_captions_api(video_id, api_key=None):
|
| 254 |
+
# First, try OAuth 2.0 if credentials are available
|
| 255 |
+
creds = get_youtube_credentials()
|
| 256 |
+
if creds:
|
| 257 |
+
try:
|
| 258 |
+
youtube = build('youtube', 'v3', credentials=creds)
|
| 259 |
+
captions = youtube.captions().list(
|
| 260 |
+
part='snippet',
|
| 261 |
+
videoId=video_id
|
| 262 |
+
).execute()
|
| 263 |
+
|
| 264 |
+
caption_id = None
|
| 265 |
+
for item in captions.get('items', []):
|
| 266 |
+
if item['snippet']['language'] == 'en':
|
| 267 |
+
caption_id = item['id']
|
| 268 |
+
break
|
| 269 |
+
elif item['snippet']['language'] in ['en-US', 'en-GB']:
|
| 270 |
+
caption_id = item['id']
|
| 271 |
+
break
|
| 272 |
+
|
| 273 |
+
if not caption_id:
|
| 274 |
+
st.warning("No English captions found via YouTube Data API.")
|
| 275 |
+
return None
|
| 276 |
+
|
| 277 |
+
# Download captions using OAuth 2.0 credentials
|
| 278 |
+
caption_content = youtube.captions().download(
|
| 279 |
+
id=caption_id,
|
| 280 |
+
tfmt='srt'
|
| 281 |
+
).execute()
|
| 282 |
+
|
| 283 |
+
# The response is a binary string, decode it
|
| 284 |
+
caption_text = caption_content.decode('utf-8')
|
| 285 |
+
# Parse SRT format to extract text
|
| 286 |
+
lines = caption_text.split('\n')
|
| 287 |
+
text_lines = []
|
| 288 |
+
for line in lines:
|
| 289 |
+
if line.strip() and not line.isdigit() and not re.match(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}', line):
|
| 290 |
+
text_lines.append(line.strip())
|
| 291 |
+
|
| 292 |
+
return " ".join(text_lines)
|
| 293 |
+
|
| 294 |
+
except HttpError as e:
|
| 295 |
+
st.error(f"Error fetching captions with YouTube Data API (OAuth 2.0): {str(e)}")
|
| 296 |
+
return None
|
| 297 |
+
|
| 298 |
+
# Fallback to API key if OAuth fails or credentials are not available
|
| 299 |
if not api_key:
|
| 300 |
+
st.warning("YOUTUBE_API_KEY not set and OAuth 2.0 credentials not available. Skipping YouTube Data API fallback.")
|
| 301 |
return None
|
| 302 |
try:
|
| 303 |
youtube = build('youtube', 'v3', developerKey=api_key)
|
|
|
|
| 324 |
"English captions are available for this video but cannot be fetched with an API key alone. "
|
| 325 |
"Downloading captions requires OAuth 2.0 authentication, which is not supported in Hugging Face Spaces without user interaction. "
|
| 326 |
"To fetch captions:\n"
|
| 327 |
+
"- Follow the instructions above to generate youtube_credentials.json locally and upload it.\n"
|
| 328 |
"- Or try a video with transcripts available (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ)."
|
| 329 |
)
|
| 330 |
return None
|
| 331 |
|
| 332 |
except HttpError as e:
|
| 333 |
+
st.error(f"Error fetching captions with YouTube Data API (API Key): {str(e)}")
|
| 334 |
return None
|
| 335 |
|
| 336 |
# Function to extract subtitles using yt-dlp with cookies
|
|
|
|
| 487 |
|
| 488 |
if not transcript_text:
|
| 489 |
st.warning("Transcripts are disabled or unavailable. Attempting to fetch closed captions...")
|
| 490 |
+
st.text("Fetching Closed Captions with yt-dlp...Started...β
β
β
")
|
| 491 |
transcript_text = extract_subtitles_with_ytdlp(youtube_url)
|
| 492 |
|
| 493 |
+
if not transcript_text:
|
| 494 |
st.text("Fetching Captions via YouTube Data API...Started...β
β
β
")
|
| 495 |
transcript_text = fetch_youtube_captions_api(video_id, YOUTUBE_API_KEY)
|
| 496 |
|
|
|
|
| 503 |
"Solutions:\n"
|
| 504 |
"- Ensure captions are enabled for the video by checking the video settings on YouTube (gear icon > Subtitles/CC > Enable if available).\n"
|
| 505 |
"- Regenerate and upload a fresh cookies.txt file (see instructions above).\n"
|
| 506 |
+
"- Set up OAuth 2.0 credentials by following the instructions above to download captions directly.\n"
|
| 507 |
"- Try a different video (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ, which has transcripts available).\n"
|
| 508 |
"- Test locally to rule out Hugging Face Spaces IP restrictions by running: pip install -r requirements.txt && streamlit run app.py"
|
| 509 |
)
|