MahatirTusher committed on
Commit
dd230ea
·
verified ·
1 Parent(s): ca1c429

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -300
app.py CHANGED
@@ -6,31 +6,17 @@ from langchain_community.vectorstores.faiss import FAISS
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_core.documents import Document
8
  import os
9
- import json
10
  from langchain_groq import ChatGroq
11
  from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
12
  from langchain.prompts import PromptTemplate
13
  from bs4 import SoupStrainer
14
- from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
15
- import yt_dlp
16
- import re
17
- from googleapiclient.discovery import build
18
- from googleapiclient.errors import HttpError
19
- from google_auth_oauthlib.flow import InstalledAppFlow
20
- from google.auth.transport.requests import Request
21
- from google.oauth2.credentials import Credentials
22
 
23
  # Load environment variables (optional)
24
  load_dotenv()
25
 
26
  # Hardcoded Groq API key
27
  GROQ_API_KEY = "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"
28
- # YouTube API key (to be set in Hugging Face Spaces secrets, optional if using OAuth)
29
- YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
30
-
31
- # Path to store OAuth credentials
32
- CREDENTIALS_FILE = "youtube_credentials.json"
33
- CLIENT_SECRETS_FILE = "client_secrets.json"
34
 
35
  # Custom CSS
36
  st.markdown("""
@@ -138,20 +124,20 @@ if "llm" not in st.session_state:
138
  max_tokens=512 # Keep reduced to minimize resource usage
139
  )
140
 
141
- # Sidebar for URL and YouTube input
142
  with st.sidebar:
143
  st.header("Enter Web URL")
144
  url = st.text_input("URL", placeholder="e.g., https://mahatirtusher.com/astronomy-mythology/")
145
  process_url_clicked = st.button("Process URL")
146
 
147
- st.header("Enter YouTube URL")
148
- youtube_url = st.text_input("YouTube URL", placeholder="e.g., https://www.youtube.com/watch?v=DJO_9auJhJQ")
149
- process_youtube_clicked = st.button("Process YouTube Video")
150
 
151
  # Main content container
152
  main_container = st.container()
153
 
154
- # Custom prompt for detailed answers (for web URLs only)
155
  qa_prompt = PromptTemplate(
156
  template="""You are an expert assistant tasked with providing detailed, extensive, and comprehensive answers. Use the provided context to answer the question thoroughly, including explanations, examples, and additional relevant information. If the context is limited, expand on the topic with your knowledge to ensure a complete response. In case of explaining anything, break the topic and explain step by step. Sometimes use your own reasoning and knowledge to explain anything to the users. If the users ask any question in Bengali, you too will answer it in fine and detailed Bengali.
157
 
@@ -163,17 +149,9 @@ Answer with sources: """
163
  )
164
 
165
  # Function to summarize content
166
- def summarize_content(content, llm, is_youtube=False):
167
- if is_youtube:
168
- # Extensive summary for YouTube videos (15-20 sentences)
169
- summary_prompt = f"""You are an expert summarizer tasked with providing a very detailed and extensive summary of the following YouTube video transcript. Capture all key points, main ideas, and significant details in 15-20 sentences. Include specific examples, quotes, or moments from the transcript to make the summary comprehensive and vivid. Ensure the summary is well-organized, flowing naturally from one point to the next, and provides a thorough overview of the video's content.
170
-
171
- Transcript: {content}
172
-
173
- Extensive Summary: """
174
- else:
175
- # Shorter summary for web URLs (5-10 sentences)
176
- summary_prompt = f"""Summarize the following content in 5-10 sentences, capturing the main points and key details in easy expression:
177
 
178
  {content}
179
 
@@ -181,228 +159,21 @@ Summary: """
181
  summary = llm.invoke(summary_prompt).content
182
  return summary
183
 
184
- # Function to extract YouTube video ID from URL
185
- def get_video_id(url):
186
- if "youtube.com/watch?v=" in url:
187
- return url.split("v=")[1].split("&")[0]
188
- elif "youtu.be/" in url:
189
- return url.split("youtu.be/")[1].split("?")[0]
190
- return None
191
-
192
- # Function to fetch YouTube transcript
193
- def fetch_youtube_transcript(video_id):
194
  try:
195
- transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
196
- # Try English variants first
197
- for lang in ['en', 'en-US', 'en-GB']:
198
- try:
199
- transcript = transcript_list.find_transcript([lang]).fetch()
200
- full_text = " ".join([item['text'] for item in transcript])
201
- return full_text
202
- except NoTranscriptFound:
203
- continue
204
-
205
- # If no English transcript, try any available transcript and translate to English
206
- for transcript in transcript_list:
207
- if transcript.is_translatable:
208
- translated_transcript = transcript.translate('en').fetch()
209
- return " ".join([item['text'] for item in translated_transcript])
210
-
211
- return None
212
- except TranscriptsDisabled:
213
- return None
214
  except Exception as e:
215
- st.error(f"Error fetching transcript with youtube-transcript-api: {str(e)}")
216
- return None
217
-
218
- # Function to get YouTube API credentials
219
- def get_youtube_credentials():
220
- creds = None
221
- if os.path.exists(CREDENTIALS_FILE):
222
- creds = Credentials.from_authorized_user_file(CREDENTIALS_FILE, scopes=['https://www.googleapis.com/auth/youtube.force-ssl'])
223
-
224
- if not creds or not creds.valid:
225
- if creds and creds.expired and creds.refresh_token:
226
- creds.refresh(Request())
227
- else:
228
- if os.path.exists(CLIENT_SECRETS_FILE):
229
- st.warning("Attempting to authenticate with YouTube Data API. This may not work in Hugging Face Spaces due to redirect URI limitations.")
230
- flow = InstalledAppFlow.from_client_secrets_file(
231
- CLIENT_SECRETS_FILE,
232
- scopes=['https://www.googleapis.com/auth/youtube.force-ssl']
233
- )
234
- # This will fail in Hugging Face Spaces because it can't open a browser
235
- creds = flow.run_local_server(port=0)
236
- with open(CREDENTIALS_FILE, 'w') as token_file:
237
- token_file.write(creds.to_json())
238
- else:
239
- st.warning(
240
- f"{CLIENT_SECRETS_FILE} not found. To use OAuth 2.0 for YouTube Data API:\n"
241
- "1. Go to https://console.developers.google.com/.\n"
242
- "2. Create a project, enable YouTube Data API v3, and create OAuth 2.0 credentials.\n"
243
- "3. Download the credentials as 'client_secrets.json'.\n"
244
- "4. Run the app locally: pip install -r requirements.txt && streamlit run app.py\n"
245
- "5. Authenticate via the browser prompt to generate youtube_credentials.json.\n"
246
- "6. Upload youtube_credentials.json to your Hugging Face Space via the Files tab."
247
- )
248
- return None
249
-
250
- return creds
251
-
252
- # Function to fetch captions using YouTube Data API (with OAuth 2.0 or API key fallback)
253
- def fetch_youtube_captions_api(video_id, api_key=None):
254
- # First, try OAuth 2.0 if credentials are available
255
- creds = get_youtube_credentials()
256
- if creds:
257
- try:
258
- youtube = build('youtube', 'v3', credentials=creds)
259
- captions = youtube.captions().list(
260
- part='snippet',
261
- videoId=video_id
262
- ).execute()
263
-
264
- caption_id = None
265
- for item in captions.get('items', []):
266
- if item['snippet']['language'] == 'en':
267
- caption_id = item['id']
268
- break
269
- elif item['snippet']['language'] in ['en-US', 'en-GB']:
270
- caption_id = item['id']
271
- break
272
-
273
- if not caption_id:
274
- st.warning("No English captions found via YouTube Data API.")
275
- return None
276
-
277
- # Download captions using OAuth 2.0 credentials
278
- caption_content = youtube.captions().download(
279
- id=caption_id,
280
- tfmt='srt'
281
- ).execute()
282
-
283
- # The response is a binary string, decode it
284
- caption_text = caption_content.decode('utf-8')
285
- # Parse SRT format to extract text
286
- lines = caption_text.split('\n')
287
- text_lines = []
288
- for line in lines:
289
- if line.strip() and not line.isdigit() and not re.match(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}', line):
290
- text_lines.append(line.strip())
291
-
292
- return " ".join(text_lines)
293
-
294
- except HttpError as e:
295
- st.error(f"Error fetching captions with YouTube Data API (OAuth 2.0): {str(e)}")
296
- return None
297
-
298
- # Fallback to API key if OAuth fails or credentials are not available
299
- if not api_key:
300
- st.warning("YOUTUBE_API_KEY not set and OAuth 2.0 credentials not available. Skipping YouTube Data API fallback.")
301
- return None
302
- try:
303
- youtube = build('youtube', 'v3', developerKey=api_key)
304
- captions = youtube.captions().list(
305
- part='snippet',
306
- videoId=video_id
307
- ).execute()
308
-
309
- caption_id = None
310
- for item in captions.get('items', []):
311
- if item['snippet']['language'] == 'en':
312
- caption_id = item['id']
313
- break
314
- elif item['snippet']['language'] in ['en-US', 'en-GB']:
315
- caption_id = item['id']
316
- break
317
-
318
- if not caption_id:
319
- st.warning("No English captions found via YouTube Data API.")
320
- return None
321
-
322
- # Note: Downloading captions requires OAuth 2.0 authentication
323
- st.warning(
324
- "English captions are available for this video but cannot be fetched with an API key alone. "
325
- "Downloading captions requires OAuth 2.0 authentication, which is not supported in Hugging Face Spaces without user interaction. "
326
- "To fetch captions:\n"
327
- "- Follow the instructions above to generate youtube_credentials.json locally and upload it.\n"
328
- "- Or try a video with transcripts available (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ)."
329
- )
330
  return None
331
 
332
- except HttpError as e:
333
- st.error(f"Error fetching captions with YouTube Data API (API Key): {str(e)}")
334
- return None
335
-
336
- # Function to extract subtitles using yt-dlp with cookies
337
- def extract_subtitles_with_ytdlp(video_url):
338
- ydl_opts = {
339
- 'writesubtitles': True,
340
- 'writeautomaticsub': True,
341
- 'subtitleslangs': ['all', '-live_chat'],
342
- 'skip_download': True,
343
- 'subtitlesformat': 'vtt',
344
- 'outtmpl': 'subtitle.%(ext)s',
345
- 'http_headers': {
346
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
347
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
348
- 'Accept-Language': 'en-US,en;q=0.5',
349
- },
350
- 'cookiefile': 'cookies.txt',
351
- }
352
- try:
353
- if not os.path.exists('cookies.txt'):
354
- st.error(
355
- "cookies.txt file not found. Please upload a valid cookies.txt file to the root directory of your Space. "
356
- "To generate it:\n"
357
- "1. Open Chrome and log in to YouTube.\n"
358
- "2. Install the 'Export Cookies' extension (or use a tool like 'cookies.txt' for Firefox).\n"
359
- "3. Export cookies for 'youtube.com' and save as 'cookies.txt'.\n"
360
- "4. Upload the file to your Space via the Files tab.\n"
361
- "Alternative: If this fails, test locally to rule out Spaces IP restrictions."
362
- )
363
- return None
364
-
365
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
366
- info = ydl.extract_info(video_url, download=False)
367
- available_subs = info.get('subtitles', {})
368
- auto_subs = info.get('automatic_captions', {})
369
-
370
- subtitle_langs = list(available_subs.keys()) or list(auto_subs.keys())
371
- if not subtitle_langs:
372
- st.warning("No subtitles or auto-captions available in any language.")
373
- return None
374
-
375
- ydl.params['subtitleslangs'] = subtitle_langs
376
- ydl.download([video_url])
377
-
378
- subtitle_file = None
379
- for lang in subtitle_langs:
380
- possible_file = f"subtitle.{lang}.vtt"
381
- if os.path.exists(possible_file):
382
- subtitle_file = possible_file
383
- break
384
-
385
- if not subtitle_file:
386
- st.warning("No subtitle files were downloaded.")
387
- return None
388
-
389
- with open(subtitle_file, 'r', encoding='utf-8') as f:
390
- subtitle_text = f.read()
391
-
392
- os.remove(subtitle_file)
393
-
394
- lines = subtitle_text.split('\n')
395
- text_lines = []
396
- for line in lines:
397
- if line.strip() and not line.startswith('WEBVTT') and not line.startswith('Kind:') and not line.startswith('Language:') and not re.match(r'\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}', line):
398
- text_lines.append(line.strip())
399
-
400
- return " ".join(text_lines)
401
- except Exception as e:
402
- st.error(f"Error fetching captions with yt-dlp: {str(e)}")
403
- return None
404
-
405
- # Function to process and chunk text (for web URLs only)
406
  def process_content(text, embeddings, source):
407
  text_splitter = RecursiveCharacterTextSplitter(
408
  chunk_size=1000,
@@ -416,7 +187,7 @@ def process_content(text, embeddings, source):
416
  vectorstore = FAISS.from_documents(docs, embeddings)
417
  return vectorstore
418
 
419
- # Function to create QA chain (for web URLs only)
420
  def create_qa_chain(vectorstore, llm):
421
  if vectorstore is None:
422
  st.error("Vector store is not initialized. Cannot create QA chain.")
@@ -468,68 +239,41 @@ if process_url_clicked:
468
  st.error(f"Error processing URL: {str(e)}")
469
  st.stop()
470
 
471
- # Process YouTube Video
472
- if process_youtube_clicked:
473
  with main_container:
474
- if not youtube_url.strip():
475
- st.error("Please provide a valid YouTube URL.")
476
  else:
477
- with st.spinner("Processing YouTube Video..."):
478
  try:
479
- video_id = get_video_id(youtube_url)
480
- if not video_id:
481
- st.error("Invalid YouTube URL. Please provide a URL like https://www.youtube.com/watch?v=VIDEO_ID.")
482
- st.stop()
483
-
484
- transcript_text = None
485
- st.text("Fetching Transcript...Started...✅✅✅")
486
- transcript_text = fetch_youtube_transcript(video_id)
487
-
488
- if not transcript_text:
489
- st.warning("Transcripts are disabled or unavailable. Attempting to fetch closed captions...")
490
- st.text("Fetching Closed Captions with yt-dlp...Started...✅✅✅")
491
- transcript_text = extract_subtitles_with_ytdlp(youtube_url)
492
-
493
- if not transcript_text:
494
- st.text("Fetching Captions via YouTube Data API...Started...✅✅✅")
495
- transcript_text = fetch_youtube_captions_api(video_id, YOUTUBE_API_KEY)
496
-
497
- if not transcript_text:
498
- st.error(
499
- "No transcripts or closed captions available. "
500
- "Possible reasons:\n"
501
- "1. Captions are not enabled for this video.\n"
502
- "2. YouTube detected this request as a bot (even with cookies.txt).\n"
503
- "Solutions:\n"
504
- "- Ensure captions are enabled for the video by checking the video settings on YouTube (gear icon > Subtitles/CC > Enable if available).\n"
505
- "- Regenerate and upload a fresh cookies.txt file (see instructions above).\n"
506
- "- Set up OAuth 2.0 credentials by following the instructions above to download captions directly.\n"
507
- "- Try a different video (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ, which has transcripts available).\n"
508
- "- Test locally to rule out Hugging Face Spaces IP restrictions by running: pip install -r requirements.txt && streamlit run app.py"
509
- )
510
- st.stop()
511
 
512
- if not transcript_text.strip():
513
- st.error("Transcript or captions are empty. Try a different video.")
514
  st.stop()
515
 
516
- st.session_state.url_content = transcript_text
517
- # No vector store for YouTube videos since we're not doing QA
518
- st.session_state.vectorstore = None
519
- st.session_state.index_created = False
520
- st.session_state.content_type = "youtube"
 
 
 
 
521
  st.session_state.summary = None
522
- st.text("YouTube video processed successfully! ✅✅✅")
523
  except Exception as e:
524
- st.error(f"Error processing YouTube video: {str(e)}")
525
  st.stop()
526
 
527
  # Summary button
528
  with main_container:
529
  if st.session_state.url_content and st.button("Generate Summary"):
530
  with st.spinner("Generating summary..."):
531
- is_youtube = st.session_state.content_type == "youtube"
532
- st.session_state.summary = summarize_content(st.session_state.url_content, st.session_state.llm, is_youtube=is_youtube)
533
 
534
  # Display summary if generated
535
  if st.session_state.summary:
@@ -537,8 +281,8 @@ if st.session_state.summary:
537
  st.header("Summary of the Content")
538
  st.write(st.session_state.summary)
539
 
540
- # Query input with Ask button (only for web URLs)
541
- if st.session_state.url_content and st.session_state.content_type == "web":
542
  with main_container:
543
  st.header("Ask a Question")
544
  query = st.text_input("Question", placeholder="e.g., What is the article about?")
 
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_core.documents import Document
8
  import os
 
9
  from langchain_groq import ChatGroq
10
  from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
11
  from langchain.prompts import PromptTemplate
12
  from bs4 import SoupStrainer
13
+ import PyPDF2
 
 
 
 
 
 
 
14
 
15
  # Load environment variables (optional)
16
  load_dotenv()
17
 
18
  # Hardcoded Groq API key
19
  GROQ_API_KEY = "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"
 
 
 
 
 
 
20
 
21
  # Custom CSS
22
  st.markdown("""
 
124
  max_tokens=512 # Keep reduced to minimize resource usage
125
  )
126
 
127
+ # Sidebar for URL and PDF input
128
  with st.sidebar:
129
  st.header("Enter Web URL")
130
  url = st.text_input("URL", placeholder="e.g., https://mahatirtusher.com/astronomy-mythology/")
131
  process_url_clicked = st.button("Process URL")
132
 
133
+ st.header("Upload PDF File")
134
+ pdf_file = st.file_uploader("Upload a PDF", type=["pdf"])
135
+ process_pdf_clicked = st.button("Process PDF")
136
 
137
  # Main content container
138
  main_container = st.container()
139
 
140
+ # Custom prompt for detailed answers
141
  qa_prompt = PromptTemplate(
142
  template="""You are an expert assistant tasked with providing detailed, extensive, and comprehensive answers. Use the provided context to answer the question thoroughly, including explanations, examples, and additional relevant information. If the context is limited, expand on the topic with your knowledge to ensure a complete response. In case of explaining anything, break the topic and explain step by step. Sometimes use your own reasoning and knowledge to explain anything to the users. If the users ask any question in Bengali, you too will answer it in fine and detailed Bengali.
143
 
 
149
  )
150
 
151
  # Function to summarize content
152
+ def summarize_content(content, llm):
153
+ # Shorter summary for web URLs and PDFs (5-10 sentences)
154
+ summary_prompt = f"""Summarize the following content in 5-10 sentences, capturing the main points and key details in easy expression:
 
 
 
 
 
 
 
 
155
 
156
  {content}
157
 
 
159
  summary = llm.invoke(summary_prompt).content
160
  return summary
161
 
162
+ # Function to extract text from PDF
163
+ def extract_text_from_pdf(pdf_file):
 
 
 
 
 
 
 
 
164
  try:
165
+ pdf_reader = PyPDF2.PdfReader(pdf_file)
166
+ text = ""
167
+ for page in pdf_reader.pages:
168
+ page_text = page.extract_text()
169
+ if page_text:
170
+ text += page_text + "\n"
171
+ return text
 
 
 
 
 
 
 
 
 
 
 
 
172
  except Exception as e:
173
+ st.error(f"Error extracting text from PDF: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  return None
175
 
176
+ # Function to process and chunk text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  def process_content(text, embeddings, source):
178
  text_splitter = RecursiveCharacterTextSplitter(
179
  chunk_size=1000,
 
187
  vectorstore = FAISS.from_documents(docs, embeddings)
188
  return vectorstore
189
 
190
+ # Function to create QA chain
191
  def create_qa_chain(vectorstore, llm):
192
  if vectorstore is None:
193
  st.error("Vector store is not initialized. Cannot create QA chain.")
 
239
  st.error(f"Error processing URL: {str(e)}")
240
  st.stop()
241
 
242
+ # Process PDF File
243
+ if process_pdf_clicked:
244
  with main_container:
245
+ if not pdf_file:
246
+ st.error("Please upload a PDF file.")
247
  else:
248
+ with st.spinner("Processing PDF..."):
249
  try:
250
+ st.text("Extracting Text from PDF...Started...✅✅✅")
251
+ pdf_text = extract_text_from_pdf(pdf_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
+ if not pdf_text:
254
+ st.error("No text could be extracted from the PDF. Try a different file.")
255
  st.stop()
256
 
257
+ # Initialize embeddings only when needed
258
+ if "embeddings" not in st.session_state:
259
+ st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
260
+
261
+ st.session_state.url_content = pdf_text
262
+ embeddings = st.session_state.embeddings
263
+ st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=pdf_file.name)
264
+ st.session_state.index_created = True
265
+ st.session_state.content_type = "pdf"
266
  st.session_state.summary = None
267
+ st.text("PDF processed successfully! ✅✅✅")
268
  except Exception as e:
269
+ st.error(f"Error processing PDF: {str(e)}")
270
  st.stop()
271
 
272
  # Summary button
273
  with main_container:
274
  if st.session_state.url_content and st.button("Generate Summary"):
275
  with st.spinner("Generating summary..."):
276
+ st.session_state.summary = summarize_content(st.session_state.url_content, st.session_state.llm)
 
277
 
278
  # Display summary if generated
279
  if st.session_state.summary:
 
281
  st.header("Summary of the Content")
282
  st.write(st.session_state.summary)
283
 
284
+ # Query input with Ask button
285
+ if st.session_state.url_content and st.session_state.index_created:
286
  with main_container:
287
  st.header("Ask a Question")
288
  query = st.text_input("Question", placeholder="e.g., What is the article about?")