MahatirTusher committed on
Commit
fa9a363
Β·
verified Β·
1 Parent(s): 9352b95

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -94
app.py CHANGED
@@ -14,6 +14,7 @@ from bs4 import SoupStrainer
14
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
15
  import yt_dlp
16
  import re
 
17
 
18
  # Load environment variables (optional)
19
  load_dotenv()
@@ -104,6 +105,8 @@ st.markdown("""
104
  # Display large logo at the top of the main page
105
  st.image("https://i.postimg.cc/2j0QWF3Z/Removal-575.png", width=390)
106
 
 
 
107
 
108
  # Initialize session state
109
  if "index_created" not in st.session_state:
@@ -119,6 +122,14 @@ if "summary" not in st.session_state:
119
  if "embeddings" not in st.session_state:
120
  st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
121
 
 
 
 
 
 
 
 
 
122
  # Sidebar for URL and YouTube input
123
  with st.sidebar:
124
  st.header("Enter Web URL")
@@ -126,19 +137,12 @@ with st.sidebar:
126
  process_url_clicked = st.button("Process URL")
127
 
128
  st.header("Enter YouTube URL")
129
- youtube_url = st.text_input("YouTube URL", placeholder="e.g., https://www.youtube.com/watch?v=pxiP-HJLCx0")
130
  process_youtube_clicked = st.button("Process YouTube Video")
131
 
132
  # Main content container
133
  main_container = st.container()
134
 
135
- # Initialize the Groq LLM
136
- llm = ChatGroq(
137
- api_key=GROQ_API_KEY,
138
- model="llama3-70b-8192",
139
- max_tokens=2048 # Increased for detailed answers
140
- )
141
-
142
  # Custom prompt for detailed answers
143
  qa_prompt = PromptTemplate(
144
  template="""You are an expert assistant tasked with providing detailed, extensive, and comprehensive answers. Use the provided context to answer the question thoroughly, including explanations, examples, and additional relevant information. If the context is limited, expand on the topic with your knowledge to ensure a complete response. In case of explaining anything, break the topic and explain step by step. Sometimes use your own reasoning and knowledge to explain anything to the users. If the users ask any question in Bengali, you too will answer it in fine and detailed Bengali.
@@ -160,52 +164,73 @@ Summary: """
160
  summary = llm.invoke(summary_prompt).content
161
  return summary
162
 
163
- def save_faiss_index(vectorstore, path):
164
- vectorstore.save_local(path)
165
-
166
- def load_faiss_index(path, embeddings):
167
- return FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
168
-
169
  # Function to extract YouTube video ID from URL
170
- def get_youtube_video_id(url):
171
  if "youtube.com/watch?v=" in url:
172
  return url.split("v=")[1].split("&")[0]
173
  elif "youtu.be/" in url:
174
  return url.split("youtu.be/")[1].split("?")[0]
175
  return None
176
 
177
- # Function to extract subtitles using yt-dlp
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  def extract_subtitles_with_ytdlp(video_url):
179
  ydl_opts = {
180
  'writesubtitles': True,
181
  'writeautomaticsub': True,
182
- 'subtitleslangs': ['all'], # Fetch subtitles in all available languages
183
  'skip_download': True,
184
  'subtitlesformat': 'vtt',
185
  'outtmpl': 'subtitle.%(ext)s',
 
 
 
 
 
 
186
  }
187
  try:
188
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
189
  info = ydl.extract_info(video_url, download=False)
190
  available_subs = info.get('subtitles', {})
191
  auto_subs = info.get('automatic_captions', {})
192
-
193
  # Log available subtitles for debugging
194
  st.text(f"Available subtitles: {list(available_subs.keys())}")
195
  st.text(f"Available auto-captions: {list(auto_subs.keys())}")
196
 
197
  # Download the first available subtitle or auto-caption
198
- ydl.params['subtitleslangs'] = list(available_subs.keys()) or list(auto_subs.keys()) or ['en']
 
 
 
 
199
  ydl.download([video_url])
200
 
201
  # Look for the subtitle file
202
  subtitle_file = None
203
- for lang in available_subs.keys() or auto_subs.keys():
204
  possible_file = f"subtitle.{lang}.vtt"
205
  if os.path.exists(possible_file):
206
  subtitle_file = possible_file
207
  break
208
-
209
  if not subtitle_file:
210
  return None
211
 
@@ -220,7 +245,6 @@ def extract_subtitles_with_ytdlp(video_url):
220
  lines = subtitle_text.split('\n')
221
  text_lines = []
222
  for line in lines:
223
- # Skip WEBVTT header, timestamps, and empty lines
224
  if line.strip() and not line.startswith('WEBVTT') and not line.startswith('Kind:') and not line.startswith('Language:') and not re.match(r'\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}', line):
225
  text_lines.append(line.strip())
226
 
@@ -229,29 +253,27 @@ def extract_subtitles_with_ytdlp(video_url):
229
  st.error(f"Error fetching captions with yt-dlp: {str(e)}")
230
  return None
231
 
232
- # Function to process content (web or YouTube)
233
- def process_content(docs, embeddings):
234
- st.text("Text Splitter...Started...βœ…βœ…βœ…")
235
  text_splitter = RecursiveCharacterTextSplitter(
236
- separators=['\n\n', '\n', '.', ','],
237
- chunk_size=1000
 
238
  )
239
- docs = text_splitter.split_documents(docs)
240
-
241
- if not docs:
242
- st.error("No document chunks created. Try a different URL or video.")
243
- st.stop()
244
- st.text(f"Split into {len(docs)} document chunks.")
245
-
246
- st.text("Embedding Vector Started Building...βœ…βœ…βœ…")
247
  vectorstore = FAISS.from_documents(docs, embeddings)
248
-
249
- faiss_index_path = "faiss_index"
250
- save_faiss_index(vectorstore, faiss_index_path)
251
- st.session_state.vectorstore = vectorstore # Cache the vectorstore
252
- st.session_state.index_created = True
253
- st.text("FAISS index saved successfully! βœ…βœ…βœ…")
254
- time.sleep(2)
 
 
 
 
 
255
 
256
  # Process Web URL
257
  if process_url_clicked:
@@ -277,7 +299,9 @@ if process_url_clicked:
277
  # Store content for summarization
278
  st.session_state.url_content = "\n".join([doc.page_content for doc in data])
279
  embeddings = st.session_state.embeddings
280
- process_content(data, embeddings)
 
 
281
  except Exception as e:
282
  st.error(f"Error processing URL: {str(e)}")
283
 
@@ -289,52 +313,17 @@ if process_youtube_clicked:
289
  else:
290
  with st.spinner("Processing YouTube Video..."):
291
  try:
292
- video_id = get_youtube_video_id(youtube_url)
293
  if not video_id:
294
  st.error("Invalid YouTube URL. Please provide a URL like https://www.youtube.com/watch?v=VIDEO_ID.")
295
  st.stop()
296
 
297
  transcript_text = None
298
  st.text("Fetching Transcript...Started...βœ…βœ…βœ…")
299
- try:
300
- # Get the list of available transcripts
301
- transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
302
-
303
- # Log available transcripts for debugging
304
- available_languages = [t.language for t in transcript_list]
305
- st.text(f"Available transcript languages: {available_languages}")
306
-
307
- transcript = None
308
- # Try to find a manually created transcript in any language
309
- for lang in available_languages:
310
- try:
311
- transcript = transcript_list.find_manually_created_transcript([lang])
312
- break
313
- except NoTranscriptFound:
314
- continue
315
-
316
- # If no manual transcript, try an auto-generated one
317
- if not transcript:
318
- for lang in available_languages:
319
- try:
320
- transcript = transcript_list.find_generated_transcript([lang])
321
- break
322
- except NoTranscriptFound:
323
- continue
324
-
325
- # If a transcript is found and it's not in English, translate to English
326
- if transcript:
327
- if transcript.language_code != 'en' and transcript.is_translatable:
328
- transcript = transcript.translate('en')
329
- transcript_data = transcript.fetch()
330
- transcript_text = " ".join([entry['text'] for entry in transcript_data])
331
- except TranscriptsDisabled:
332
- st.warning("Transcripts are disabled for this video. Attempting to fetch closed captions...")
333
- except NoTranscriptFound:
334
- st.warning("No transcript found in any language. Attempting to fetch closed captions...")
335
-
336
- # Fallback to yt-dlp for closed captions
337
  if not transcript_text:
 
338
  st.text("Fetching Closed Captions...Started...βœ…βœ…βœ…")
339
  transcript_text = extract_subtitles_with_ytdlp(youtube_url)
340
  if not transcript_text:
@@ -345,11 +334,12 @@ if process_youtube_clicked:
345
  st.error("Transcript or captions are empty. Try a different video.")
346
  st.stop()
347
 
348
- # Create a Document object from the transcript
349
- doc = Document(page_content=transcript_text, metadata={"source": youtube_url})
350
- st.session_state.url_content = transcript_text # Store for summarization
351
  embeddings = st.session_state.embeddings
352
- process_content([doc], embeddings)
 
 
353
  except Exception as e:
354
  st.error(f"Error processing YouTube video: {str(e)}")
355
 
@@ -357,7 +347,7 @@ if process_youtube_clicked:
357
  with main_container:
358
  if st.session_state.url_content and st.button("Generate Summary"):
359
  with st.spinner("Generating summary..."):
360
- st.session_state.summary = summarize_content(st.session_state.url_content, llm)
361
 
362
  # Display summary if generated
363
  if st.session_state.summary:
@@ -378,12 +368,10 @@ if ask_clicked and query:
378
  else:
379
  with st.spinner("Processing your question..."):
380
  try:
381
- chain = RetrievalQAWithSourcesChain.from_llm(
382
- llm=llm,
383
- retriever=st.session_state.vectorstore.as_retriever(search_kwargs={"k": 2}),
384
- question_prompt=qa_prompt
385
- )
386
- result = chain({"question": query}, return_only_outputs=True)
387
 
388
  if not result.get("answer"):
389
  st.warning("No answer generated. Try a different question or content.")
@@ -407,7 +395,7 @@ if ask_clicked and query:
407
  st.markdown(
408
  """
409
  <div class="footer">
410
- <img src="https://i.postimg.cc/2j0QWF3Z/Removal-575.png" width="130">
411
  WebChatter Β© 2025 | Developed by Mahatir Ahmed Tusher
412
  </div>
413
  """,
 
14
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
15
  import yt_dlp
16
  import re
17
+ import requests
18
 
19
  # Load environment variables (optional)
20
  load_dotenv()
 
105
  # Display large logo at the top of the main page
106
  st.image("https://i.postimg.cc/2j0QWF3Z/Removal-575.png", width=390)
107
 
108
+ # Set Streamlit app title
109
+ st.title("WebChatter πŸ’¬")
110
 
111
  # Initialize session state
112
  if "index_created" not in st.session_state:
 
122
  if "embeddings" not in st.session_state:
123
  st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
124
 
125
+ # Initialize LLM once at the start
126
+ if "llm" not in st.session_state:
127
+ st.session_state.llm = ChatGroq(
128
+ api_key=GROQ_API_KEY,
129
+ model="llama3-70b-8192",
130
+ max_tokens=2048
131
+ )
132
+
133
  # Sidebar for URL and YouTube input
134
  with st.sidebar:
135
  st.header("Enter Web URL")
 
137
  process_url_clicked = st.button("Process URL")
138
 
139
  st.header("Enter YouTube URL")
140
+ youtube_url = st.text_input("YouTube URL", placeholder="e.g., https://www.youtube.com/watch?v=DJO_9auJhJQ")
141
  process_youtube_clicked = st.button("Process YouTube Video")
142
 
143
  # Main content container
144
  main_container = st.container()
145
 
 
 
 
 
 
 
 
146
  # Custom prompt for detailed answers
147
  qa_prompt = PromptTemplate(
148
  template="""You are an expert assistant tasked with providing detailed, extensive, and comprehensive answers. Use the provided context to answer the question thoroughly, including explanations, examples, and additional relevant information. If the context is limited, expand on the topic with your knowledge to ensure a complete response. In case of explaining anything, break the topic and explain step by step. Sometimes use your own reasoning and knowledge to explain anything to the users. If the users ask any question in Bengali, you too will answer it in fine and detailed Bengali.
 
164
  summary = llm.invoke(summary_prompt).content
165
  return summary
166
 
 
 
 
 
 
 
167
  # Function to extract YouTube video ID from URL
168
def get_video_id(url):
    """Extract the 11-character video ID from a YouTube URL.

    Supports standard ``watch?v=`` URLs and short ``youtu.be`` links
    (original behavior, preserved byte-for-byte), and additionally
    ``/shorts/`` and ``/embed/`` paths.

    Args:
        url: The YouTube URL entered by the user.

    Returns:
        The video ID string, or None if no ID could be extracted.
    """
    if "youtube.com/watch?v=" in url:
        return url.split("v=")[1].split("&")[0]
    elif "youtu.be/" in url:
        return url.split("youtu.be/")[1].split("?")[0]
    # Generalization: also accept Shorts and embed URLs. Canonical video
    # IDs are exactly 11 characters from [A-Za-z0-9_-].
    match = re.search(r"youtube\.com/(?:shorts|embed)/([A-Za-z0-9_-]{11})", url)
    if match:
        return match.group(1)
    return None
174
 
175
+ # Function to fetch YouTube transcript (ChatGPT-inspired)
176
def fetch_youtube_transcript(video_id):
    """Return the full transcript text for a YouTube video, or None.

    Prefers an English transcript; when none exists, translates the
    first translatable transcript to English.

    Args:
        video_id: The 11-character YouTube video ID.

    Returns:
        The joined transcript text, or None when transcripts are
        disabled or no usable transcript could be found.
    """
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    except TranscriptsDisabled:
        return None
    try:
        transcript = transcript_list.find_transcript(['en']).fetch()
        return " ".join(item['text'] for item in transcript)
    except NoTranscriptFound:
        # No English track: fall back to translating another language.
        for candidate in transcript_list:
            if candidate.is_translatable:
                # Bug fix: translation/fetch can fail per-track (e.g. the
                # track is listed as translatable but the request errors);
                # previously one failure crashed the whole fallback.
                try:
                    translated = candidate.translate('en').fetch()
                except Exception:
                    continue  # try the next available track
                return " ".join(item['text'] for item in translated)
        return None
191
+
192
+ # Function to extract subtitles using yt-dlp with bot detection bypass
193
  def extract_subtitles_with_ytdlp(video_url):
194
  ydl_opts = {
195
  'writesubtitles': True,
196
  'writeautomaticsub': True,
197
+ 'subtitleslangs': ['all'],
198
  'skip_download': True,
199
  'subtitlesformat': 'vtt',
200
  'outtmpl': 'subtitle.%(ext)s',
201
+ 'http_headers': {
202
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
203
+ 'Accept-Language': 'en-US,en;q=0.9',
204
+ },
205
+ 'retries': 3,
206
+ 'retry_sleep': 5,
207
  }
208
  try:
209
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
210
  info = ydl.extract_info(video_url, download=False)
211
  available_subs = info.get('subtitles', {})
212
  auto_subs = info.get('automatic_captions', {})
213
+
214
  # Log available subtitles for debugging
215
  st.text(f"Available subtitles: {list(available_subs.keys())}")
216
  st.text(f"Available auto-captions: {list(auto_subs.keys())}")
217
 
218
  # Download the first available subtitle or auto-caption
219
+ subtitle_langs = list(available_subs.keys()) or list(auto_subs.keys())
220
+ if not subtitle_langs:
221
+ return None
222
+
223
+ ydl.params['subtitleslangs'] = subtitle_langs
224
  ydl.download([video_url])
225
 
226
  # Look for the subtitle file
227
  subtitle_file = None
228
+ for lang in subtitle_langs:
229
  possible_file = f"subtitle.{lang}.vtt"
230
  if os.path.exists(possible_file):
231
  subtitle_file = possible_file
232
  break
233
+
234
  if not subtitle_file:
235
  return None
236
 
 
245
  lines = subtitle_text.split('\n')
246
  text_lines = []
247
  for line in lines:
 
248
  if line.strip() and not line.startswith('WEBVTT') and not line.startswith('Kind:') and not line.startswith('Language:') and not re.match(r'\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}', line):
249
  text_lines.append(line.strip())
250
 
 
253
  st.error(f"Error fetching captions with yt-dlp: {str(e)}")
254
  return None
255
 
256
+ # Function to process and chunk text (web or YouTube)
257
def process_content(text, embeddings):
    """Split raw text into chunks and build a FAISS vector index.

    Args:
        text: The full document text (web page content or transcript).
        embeddings: A LangChain embeddings object used to vectorize chunks.

    Returns:
        A FAISS vectorstore built over the chunked text.

    Raises:
        ValueError: If no chunks could be created from the input text
            (e.g. empty or whitespace-only input).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", ".", " ", ""]
    )
    docs = text_splitter.create_documents([text])
    if not docs:
        # Guard: FAISS.from_documents fails opaquely on an empty list;
        # surface a clear error instead (callers wrap this in try/except
        # and show it via st.error).
        raise ValueError("No document chunks created from the provided text.")
    vectorstore = FAISS.from_documents(docs, embeddings)
    return vectorstore
266
+
267
+ # Function to create QA chain
268
def create_qa_chain(vectorstore, llm):
    """Build a retrieval QA chain (with source attribution) over the index.

    Uses the module-level ``qa_prompt`` with a "stuff" documents chain and
    a top-2 similarity retriever.

    Args:
        vectorstore: The FAISS vectorstore to retrieve context from.
        llm: The chat model that answers questions.

    Returns:
        A configured RetrievalQAWithSourcesChain.
    """
    return RetrievalQAWithSourcesChain.from_chain_type(
        llm=llm,
        retriever=vectorstore.as_retriever(search_kwargs={"k": 2}),
        chain_type="stuff",
        chain_type_kwargs={"prompt": qa_prompt},
    )
277
 
278
  # Process Web URL
279
  if process_url_clicked:
 
299
  # Store content for summarization
300
  st.session_state.url_content = "\n".join([doc.page_content for doc in data])
301
  embeddings = st.session_state.embeddings
302
+ st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings)
303
+ st.session_state.index_created = True
304
+ st.text("Content processed successfully! βœ…βœ…βœ…")
305
  except Exception as e:
306
  st.error(f"Error processing URL: {str(e)}")
307
 
 
313
  else:
314
  with st.spinner("Processing YouTube Video..."):
315
  try:
316
+ video_id = get_video_id(youtube_url)
317
  if not video_id:
318
  st.error("Invalid YouTube URL. Please provide a URL like https://www.youtube.com/watch?v=VIDEO_ID.")
319
  st.stop()
320
 
321
  transcript_text = None
322
  st.text("Fetching Transcript...Started...βœ…βœ…βœ…")
323
+ transcript_text = fetch_youtube_transcript(video_id)
324
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
  if not transcript_text:
326
+ st.warning("Transcripts are disabled or unavailable. Attempting to fetch closed captions...")
327
  st.text("Fetching Closed Captions...Started...βœ…βœ…βœ…")
328
  transcript_text = extract_subtitles_with_ytdlp(youtube_url)
329
  if not transcript_text:
 
334
  st.error("Transcript or captions are empty. Try a different video.")
335
  st.stop()
336
 
337
+ # Process the transcript
338
+ st.session_state.url_content = transcript_text
 
339
  embeddings = st.session_state.embeddings
340
+ st.session_state.vectorstore = process_content(transcript_text, embeddings)
341
+ st.session_state.index_created = True
342
+ st.text("YouTube video processed successfully! βœ…βœ…βœ…")
343
  except Exception as e:
344
  st.error(f"Error processing YouTube video: {str(e)}")
345
 
 
347
  with main_container:
348
  if st.session_state.url_content and st.button("Generate Summary"):
349
  with st.spinner("Generating summary..."):
350
+ st.session_state.summary = summarize_content(st.session_state.url_content, st.session_state.llm)
351
 
352
  # Display summary if generated
353
  if st.session_state.summary:
 
368
  else:
369
  with st.spinner("Processing your question..."):
370
  try:
371
+ if "qa_chain" not in st.session_state:
372
+ st.session_state.qa_chain = create_qa_chain(st.session_state.vectorstore, st.session_state.llm)
373
+
374
+ result = st.session_state.qa_chain({"question": query}, return_only_outputs=True)
 
 
375
 
376
  if not result.get("answer"):
377
  st.warning("No answer generated. Try a different question or content.")
 
395
  st.markdown(
396
  """
397
  <div class="footer">
398
+ <img src="https://i.postimg.cc/2j0QWF3Z/Removal-575.png" width="80">
399
  WebChatter Β© 2025 | Developed by Mahatir Ahmed Tusher
400
  </div>
401
  """,