MahatirTusher committed on
Commit
3f006bc
·
verified ·
1 Parent(s): c3e31e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -101
app.py CHANGED
@@ -6,7 +6,6 @@ from langchain_community.vectorstores.faiss import FAISS
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_core.documents import Document
8
  import os
9
- import time
10
  from langchain_groq import ChatGroq
11
  from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
12
  from langchain.prompts import PromptTemplate
@@ -14,16 +13,12 @@ from bs4 import SoupStrainer
14
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
15
  import yt_dlp
16
  import re
17
- from googleapiclient.discovery import build
18
- from googleapiclient.errors import HttpError
19
 
20
  # Load environment variables (optional)
21
  load_dotenv()
22
 
23
  # Hardcoded Groq API key
24
  GROQ_API_KEY = "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"
25
- # YouTube API key (to be set in Hugging Face Spaces secrets)
26
- YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
27
 
28
  # Custom CSS
29
  st.markdown("""
@@ -112,18 +107,16 @@ st.image("https://i.postimg.cc/2j0QWF3Z/Removal-575.png", width=390)
112
  st.title("WebChatter 💬")
113
 
114
  # Initialize session state
115
- if "index_created" not in st.session_state:
116
- st.session_state.index_created = False
117
  if "url_content" not in st.session_state:
118
  st.session_state.url_content = None
119
- if "vectorstore" not in st.session_state:
120
- st.session_state.vectorstore = None
121
  if "summary" not in st.session_state:
122
  st.session_state.summary = None
123
- if "qa_chain" not in st.session_state:
124
- st.session_state.qa_chain = None
 
 
125
 
126
- # Initialize embeddings once at the start
127
  if "embeddings" not in st.session_state:
128
  st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
129
 
@@ -132,23 +125,9 @@ if "llm" not in st.session_state:
132
  st.session_state.llm = ChatGroq(
133
  api_key=GROQ_API_KEY,
134
  model="llama3-70b-8192",
135
- max_tokens=2048
136
  )
137
 
138
- # Custom logger for yt-dlp to redirect logs to Streamlit
139
- class StreamlitLogger:
140
- def debug(self, msg):
141
- st.text(f"[yt-dlp DEBUG] {msg}")
142
-
143
- def info(self, msg):
144
- st.info(f"[yt-dlp INFO] {msg}")
145
-
146
- def warning(self, msg):
147
- st.warning(f"[yt-dlp WARNING] {msg}")
148
-
149
- def error(self, msg):
150
- st.error(f"[yt-dlp ERROR] {msg}")
151
-
152
  # Sidebar for URL and YouTube input
153
  with st.sidebar:
154
  st.header("Enter Web URL")
@@ -162,7 +141,7 @@ with st.sidebar:
162
  # Main content container
163
  main_container = st.container()
164
 
165
- # Custom prompt for detailed answers
166
  qa_prompt = PromptTemplate(
167
  template="""You are an expert assistant tasked with providing detailed, extensive, and comprehensive answers. Use the provided context to answer the question thoroughly, including explanations, examples, and additional relevant information. If the context is limited, expand on the topic with your knowledge to ensure a complete response. In case of explaining anything, break the topic and explain step by step. Sometimes use your own reasoning and knowledge to explain anything to the users. If the users ask any question in Bengali, you too will answer it in fine and detailed Bengali.
168
 
@@ -174,8 +153,17 @@ Answer with sources: """
174
  )
175
 
176
  # Function to summarize content
177
- def summarize_content(content, llm):
178
- summary_prompt = f"""Summarize the following content in 5-10 sentences, capturing the main points and key details in easy expression:
 
 
 
 
 
 
 
 
 
179
 
180
  {content}
181
 
@@ -207,51 +195,12 @@ def fetch_youtube_transcript(video_id):
207
  return " ".join([item['text'] for item in translated_transcript])
208
  return None
209
 
210
- # Function to fetch captions using YouTube Data API (limited to listing with API key)
211
- def fetch_youtube_captions_api(video_id, api_key):
212
- if not api_key:
213
- st.warning("YOUTUBE_API_KEY not set. Skipping YouTube Data API fallback.")
214
- return None
215
- try:
216
- youtube = build('youtube', 'v3', developerKey=api_key)
217
- captions = youtube.captions().list(
218
- part='snippet',
219
- videoId=video_id
220
- ).execute()
221
-
222
- caption_id = None
223
- for item in captions.get('items', []):
224
- if item['snippet']['language'] == 'en':
225
- caption_id = item['id']
226
- break
227
- elif item['snippet']['language'] in ['en-US', 'en-GB']:
228
- caption_id = item['id']
229
- break
230
-
231
- if not caption_id:
232
- st.warning("No English captions found via YouTube Data API.")
233
- return None
234
-
235
- # Note: Downloading captions requires OAuth 2.0 authentication
236
- st.warning(
237
- "English captions are available for this video but cannot be fetched with an API key alone. "
238
- "Downloading captions requires OAuth 2.0 authentication, which is not supported in Hugging Face Spaces without user interaction. "
239
- "To fetch captions:\n"
240
- "- Test locally with OAuth 2.0 setup (see instructions in the documentation).\n"
241
- "- Or try a video with transcripts available (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ)."
242
- )
243
- return None
244
-
245
- except HttpError as e:
246
- st.error(f"Error fetching captions with YouTube Data API: {str(e)}")
247
- return None
248
-
249
  # Function to extract subtitles using yt-dlp with cookies
250
  def extract_subtitles_with_ytdlp(video_url):
251
  ydl_opts = {
252
  'writesubtitles': True,
253
  'writeautomaticsub': True,
254
- 'subtitleslangs': ['all', '-live_chat'], # Match previous version, exclude live chat
255
  'skip_download': True,
256
  'subtitlesformat': 'vtt',
257
  'outtmpl': 'subtitle.%(ext)s',
@@ -260,8 +209,6 @@ def extract_subtitles_with_ytdlp(video_url):
260
  'Accept-Language': 'en-US,en;q=0.9',
261
  },
262
  'cookiefile': 'cookies.txt',
263
- 'retries': 3,
264
- 'retry_sleep': 5,
265
  }
266
  try:
267
  if not os.path.exists('cookies.txt'):
@@ -276,7 +223,6 @@ def extract_subtitles_with_ytdlp(video_url):
276
  return None
277
 
278
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
279
- ydl.params['logger'] = StreamlitLogger()
280
  info = ydl.extract_info(video_url, download=False)
281
  available_subs = info.get('subtitles', {})
282
  auto_subs = info.get('automatic_captions', {})
@@ -317,7 +263,7 @@ def extract_subtitles_with_ytdlp(video_url):
317
  st.error(f"Error fetching captions with yt-dlp: {str(e)}")
318
  return None
319
 
320
- # Function to process and chunk text (web or YouTube) with source metadata
321
  def process_content(text, embeddings, source):
322
  text_splitter = RecursiveCharacterTextSplitter(
323
  chunk_size=1000,
@@ -325,16 +271,13 @@ def process_content(text, embeddings, source):
325
  separators=["\n\n", "\n", ".", " ", ""]
326
  )
327
  docs = text_splitter.create_documents([text], metadatas=[{"source": source}])
328
- if docs:
329
- st.text(f"Document metadata: {docs[0].metadata}")
330
- else:
331
  st.error("No documents created from the content.")
332
  return None
333
  vectorstore = FAISS.from_documents(docs, embeddings)
334
- st.text(f"Vector store created with {len(docs)} documents.")
335
  return vectorstore
336
 
337
- # Function to create QA chain
338
  def create_qa_chain(vectorstore, llm):
339
  if vectorstore is None:
340
  st.error("Vector store is not initialized. Cannot create QA chain.")
@@ -349,7 +292,6 @@ def create_qa_chain(vectorstore, llm):
349
  "document_variable_name": "context"
350
  }
351
  )
352
- st.text("QA chain created successfully.")
353
  return qa_chain
354
 
355
  # Process Web URL
@@ -376,10 +318,11 @@ if process_url_clicked:
376
  embeddings = st.session_state.embeddings
377
  st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
378
  st.session_state.index_created = True
379
- st.session_state.qa_chain = None
380
  st.text("Content processed successfully! ✅✅✅")
381
  except Exception as e:
382
  st.error(f"Error processing URL: {str(e)}")
 
383
 
384
  # Process YouTube Video
385
  if process_youtube_clicked:
@@ -403,10 +346,6 @@ if process_youtube_clicked:
403
  st.text("Fetching Closed Captions...Started...✅✅✅")
404
  transcript_text = extract_subtitles_with_ytdlp(youtube_url)
405
 
406
- if not transcript_text and YOUTUBE_API_KEY:
407
- st.text("Fetching Captions via YouTube Data API...Started...✅✅✅")
408
- transcript_text = fetch_youtube_captions_api(video_id, YOUTUBE_API_KEY)
409
-
410
  if not transcript_text:
411
  st.error(
412
  "No transcripts or closed captions available. "
@@ -426,19 +365,23 @@ if process_youtube_clicked:
426
  st.stop()
427
 
428
  st.session_state.url_content = transcript_text
429
- embeddings = st.session_state.embeddings
430
- st.session_state.vectorstore = process_content(transcript_text, embeddings, source=youtube_url.strip())
431
- st.session_state.index_created = True
432
- st.session_state.qa_chain = None
433
  st.text("YouTube video processed successfully! ✅✅✅")
434
  except Exception as e:
435
  st.error(f"Error processing YouTube video: {str(e)}")
 
436
 
437
  # Summary button
438
  with main_container:
439
  if st.session_state.url_content and st.button("Generate Summary"):
440
  with st.spinner("Generating summary..."):
441
- st.session_state.summary = summarize_content(st.session_state.url_content, st.session_state.llm)
 
 
 
442
 
443
  # Display summary if generated
444
  if st.session_state.summary:
@@ -446,17 +389,14 @@ if st.session_state.summary:
446
  st.header("Summary of the Content")
447
  st.write(st.session_state.summary)
448
 
449
- # Query input with Ask button
450
- with main_container:
451
- st.header("Ask a Question")
452
- query = st.text_input("Question", placeholder="e.g., What is the video or article about?")
453
- ask_clicked = st.button("Ask")
454
-
455
- if ask_clicked and query:
456
  with main_container:
457
- if not st.session_state.index_created or st.session_state.vectorstore is None:
458
- st.error("No content processed. Please process a URL or YouTube video first.")
459
- else:
 
 
460
  with st.spinner("Processing your question..."):
461
  try:
462
  if "qa_chain" not in st.session_state or st.session_state.qa_chain is None:
@@ -465,7 +405,6 @@ if ask_clicked and query:
465
  st.error("Failed to create QA chain.")
466
  st.stop()
467
 
468
- st.text(f"Querying with question: {query}")
469
  result = st.session_state.qa_chain({"question": query}, return_only_outputs=True)
470
 
471
  if not result.get("answer"):
@@ -485,6 +424,7 @@ if ask_clicked and query:
485
  st.write("No sources found.")
486
  except Exception as e:
487
  st.error(f"Error answering query: {str(e)}")
 
488
 
489
  # Footer with tiny logo and text
490
  st.markdown(
 
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_core.documents import Document
8
  import os
 
9
  from langchain_groq import ChatGroq
10
  from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
11
  from langchain.prompts import PromptTemplate
 
13
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
14
  import yt_dlp
15
  import re
 
 
16
 
17
  # Load environment variables (optional)
18
  load_dotenv()
19
 
20
  # Hardcoded Groq API key
21
  GROQ_API_KEY = "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"
 
 
22
 
23
  # Custom CSS
24
  st.markdown("""
 
107
  st.title("WebChatter 💬")
108
 
109
  # Initialize session state
 
 
110
  if "url_content" not in st.session_state:
111
  st.session_state.url_content = None
 
 
112
  if "summary" not in st.session_state:
113
  st.session_state.summary = None
114
+ if "vectorstore" not in st.session_state:
115
+ st.session_state.vectorstore = None
116
+ if "index_created" not in st.session_state:
117
+ st.session_state.index_created = False
118
 
119
+ # Initialize embeddings once at the start (only for web URLs)
120
  if "embeddings" not in st.session_state:
121
  st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
122
 
 
125
  st.session_state.llm = ChatGroq(
126
  api_key=GROQ_API_KEY,
127
  model="llama3-70b-8192",
128
+ max_tokens=1024 # Reduced to lower resource usage
129
  )
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  # Sidebar for URL and YouTube input
132
  with st.sidebar:
133
  st.header("Enter Web URL")
 
141
  # Main content container
142
  main_container = st.container()
143
 
144
+ # Custom prompt for detailed answers (for web URLs only)
145
  qa_prompt = PromptTemplate(
146
  template="""You are an expert assistant tasked with providing detailed, extensive, and comprehensive answers. Use the provided context to answer the question thoroughly, including explanations, examples, and additional relevant information. If the context is limited, expand on the topic with your knowledge to ensure a complete response. In case of explaining anything, break the topic and explain step by step. Sometimes use your own reasoning and knowledge to explain anything to the users. If the users ask any question in Bengali, you too will answer it in fine and detailed Bengali.
147
 
 
153
  )
154
 
155
  # Function to summarize content
156
+ def summarize_content(content, llm, is_youtube=False):
157
+ if is_youtube:
158
+ # Extensive summary for YouTube videos (15-20 sentences)
159
+ summary_prompt = f"""You are an expert summarizer tasked with providing a very detailed and extensive summary of the following YouTube video transcript. Capture all key points, main ideas, and significant details in 15-20 sentences. Include specific examples, quotes, or moments from the transcript to make the summary comprehensive and vivid. Ensure the summary is well-organized, flowing naturally from one point to the next, and provides a thorough overview of the video's content.
160
+
161
+ Transcript: {content}
162
+
163
+ Extensive Summary: """
164
+ else:
165
+ # Shorter summary for web URLs (5-10 sentences)
166
+ summary_prompt = f"""Summarize the following content in 5-10 sentences, capturing the main points and key details in easy expression:
167
 
168
  {content}
169
 
 
195
  return " ".join([item['text'] for item in translated_transcript])
196
  return None
197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  # Function to extract subtitles using yt-dlp with cookies
199
  def extract_subtitles_with_ytdlp(video_url):
200
  ydl_opts = {
201
  'writesubtitles': True,
202
  'writeautomaticsub': True,
203
+ 'subtitleslangs': ['all', '-live_chat'],
204
  'skip_download': True,
205
  'subtitlesformat': 'vtt',
206
  'outtmpl': 'subtitle.%(ext)s',
 
209
  'Accept-Language': 'en-US,en;q=0.9',
210
  },
211
  'cookiefile': 'cookies.txt',
 
 
212
  }
213
  try:
214
  if not os.path.exists('cookies.txt'):
 
223
  return None
224
 
225
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
 
226
  info = ydl.extract_info(video_url, download=False)
227
  available_subs = info.get('subtitles', {})
228
  auto_subs = info.get('automatic_captions', {})
 
263
  st.error(f"Error fetching captions with yt-dlp: {str(e)}")
264
  return None
265
 
266
+ # Function to process and chunk text (for web URLs only)
267
  def process_content(text, embeddings, source):
268
  text_splitter = RecursiveCharacterTextSplitter(
269
  chunk_size=1000,
 
271
  separators=["\n\n", "\n", ".", " ", ""]
272
  )
273
  docs = text_splitter.create_documents([text], metadatas=[{"source": source}])
274
+ if not docs:
 
 
275
  st.error("No documents created from the content.")
276
  return None
277
  vectorstore = FAISS.from_documents(docs, embeddings)
 
278
  return vectorstore
279
 
280
+ # Function to create QA chain (for web URLs only)
281
  def create_qa_chain(vectorstore, llm):
282
  if vectorstore is None:
283
  st.error("Vector store is not initialized. Cannot create QA chain.")
 
292
  "document_variable_name": "context"
293
  }
294
  )
 
295
  return qa_chain
296
 
297
  # Process Web URL
 
318
  embeddings = st.session_state.embeddings
319
  st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
320
  st.session_state.index_created = True
321
+ st.session_state.summary = None
322
  st.text("Content processed successfully! ✅✅✅")
323
  except Exception as e:
324
  st.error(f"Error processing URL: {str(e)}")
325
+ st.stop()
326
 
327
  # Process YouTube Video
328
  if process_youtube_clicked:
 
346
  st.text("Fetching Closed Captions...Started...✅✅✅")
347
  transcript_text = extract_subtitles_with_ytdlp(youtube_url)
348
 
 
 
 
 
349
  if not transcript_text:
350
  st.error(
351
  "No transcripts or closed captions available. "
 
365
  st.stop()
366
 
367
  st.session_state.url_content = transcript_text
368
+ # No vector store for YouTube videos since we're not doing QA
369
+ st.session_state.vectorstore = None
370
+ st.session_state.index_created = False
371
+ st.session_state.summary = None
372
  st.text("YouTube video processed successfully! ✅✅✅")
373
  except Exception as e:
374
  st.error(f"Error processing YouTube video: {str(e)}")
375
+ st.stop()
376
 
377
  # Summary button
378
  with main_container:
379
  if st.session_state.url_content and st.button("Generate Summary"):
380
  with st.spinner("Generating summary..."):
381
+ # Check if the content is from a YouTube video (based on last processed input)
382
+ is_youtube = youtube_url.strip() and youtube_url == (st.session_state.get('last_processed_url', ''))
383
+ st.session_state.summary = summarize_content(st.session_state.url_content, st.session_state.llm, is_youtube=is_youtube)
384
+ st.session_state.last_processed_url = youtube_url if is_youtube else url
385
 
386
  # Display summary if generated
387
  if st.session_state.summary:
 
389
  st.header("Summary of the Content")
390
  st.write(st.session_state.summary)
391
 
392
+ # Query input with Ask button (only for web URLs)
393
+ if st.session_state.url_content and not youtube_url.strip():
 
 
 
 
 
394
  with main_container:
395
+ st.header("Ask a Question")
396
+ query = st.text_input("Question", placeholder="e.g., What is the article about?")
397
+ ask_clicked = st.button("Ask")
398
+
399
+ if ask_clicked and query:
400
  with st.spinner("Processing your question..."):
401
  try:
402
  if "qa_chain" not in st.session_state or st.session_state.qa_chain is None:
 
405
  st.error("Failed to create QA chain.")
406
  st.stop()
407
 
 
408
  result = st.session_state.qa_chain({"question": query}, return_only_outputs=True)
409
 
410
  if not result.get("answer"):
 
424
  st.write("No sources found.")
425
  except Exception as e:
426
  st.error(f"Error answering query: {str(e)}")
427
+ st.stop()
428
 
429
  # Footer with tiny logo and text
430
  st.markdown(