MahatirTusher committed on
Commit
142e8e8
·
verified ·
1 Parent(s): e82afb7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -18
app.py CHANGED
@@ -115,17 +115,15 @@ if "vectorstore" not in st.session_state:
115
  st.session_state.vectorstore = None
116
  if "index_created" not in st.session_state:
117
  st.session_state.index_created = False
118
-
119
- # Initialize embeddings once at the start (only for web URLs)
120
- if "embeddings" not in st.session_state:
121
- st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
122
 
123
  # Initialize LLM once at the start
124
  if "llm" not in st.session_state:
125
  st.session_state.llm = ChatGroq(
126
  api_key=GROQ_API_KEY,
127
  model="llama3-70b-8192",
128
- max_tokens=1024 # Reduced to lower resource usage
129
  )
130
 
131
  # Sidebar for URL and YouTube input
@@ -218,14 +216,8 @@ def extract_subtitles_with_ytdlp(video_url):
218
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
219
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
220
  'Accept-Language': 'en-US,en;q=0.5',
221
- 'Referer': 'https://www.youtube.com/',
222
- 'Origin': 'https://www.youtube.com',
223
  },
224
  'cookiefile': 'cookies.txt',
225
- 'retries': 5, # Increased retries
226
- 'retry_sleep': 3,
227
- 'geo_bypass': True, # Attempt to bypass geo-restrictions
228
- 'no_check_certificate': True, # Bypass certificate issues
229
  }
230
  try:
231
  if not os.path.exists('cookies.txt'):
@@ -331,10 +323,15 @@ if process_url_clicked:
331
  st.error("No content loaded from URL. Try a different URL (e.g., https://www.bbc.com/news/science-environment-67299122).")
332
  st.stop()
333
 
 
 
 
 
334
  st.session_state.url_content = "\n".join([doc.page_content for doc in data])
335
  embeddings = st.session_state.embeddings
336
  st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
337
  st.session_state.index_created = True
 
338
  st.session_state.summary = None
339
  st.text("Content processed successfully! ✅✅✅")
340
  except Exception as e:
@@ -370,10 +367,10 @@ if process_youtube_clicked:
370
  "1. Captions are not enabled for this video.\n"
371
  "2. YouTube detected this request as a bot (even with cookies.txt).\n"
372
  "Solutions:\n"
373
- "- Ensure captions are enabled for the video.\n"
374
  "- Regenerate and upload a fresh cookies.txt file (see instructions above).\n"
375
- "- Try a different video (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ).\n"
376
- "- Test locally to rule out Hugging Face Spaces IP restrictions."
377
  )
378
  st.stop()
379
 
@@ -385,6 +382,7 @@ if process_youtube_clicked:
385
  # No vector store for YouTube videos since we're not doing QA
386
  st.session_state.vectorstore = None
387
  st.session_state.index_created = False
 
388
  st.session_state.summary = None
389
  st.text("YouTube video processed successfully! ✅✅✅")
390
  except Exception as e:
@@ -395,10 +393,8 @@ if process_youtube_clicked:
395
  with main_container:
396
  if st.session_state.url_content and st.button("Generate Summary"):
397
  with st.spinner("Generating summary..."):
398
- # Check if the content is from a YouTube video (based on last processed input)
399
- is_youtube = youtube_url.strip() and youtube_url == (st.session_state.get('last_processed_url', ''))
400
  st.session_state.summary = summarize_content(st.session_state.url_content, st.session_state.llm, is_youtube=is_youtube)
401
- st.session_state.last_processed_url = youtube_url if is_youtube else url
402
 
403
  # Display summary if generated
404
  if st.session_state.summary:
@@ -407,7 +403,7 @@ if st.session_state.summary:
407
  st.write(st.session_state.summary)
408
 
409
  # Query input with Ask button (only for web URLs)
410
- if st.session_state.url_content and not youtube_url.strip():
411
  with main_container:
412
  st.header("Ask a Question")
413
  query = st.text_input("Question", placeholder="e.g., What is the article about?")
 
115
  st.session_state.vectorstore = None
116
  if "index_created" not in st.session_state:
117
  st.session_state.index_created = False
118
+ if "content_type" not in st.session_state:
119
+ st.session_state.content_type = None
 
 
120
 
121
  # Initialize LLM once at the start
122
  if "llm" not in st.session_state:
123
  st.session_state.llm = ChatGroq(
124
  api_key=GROQ_API_KEY,
125
  model="llama3-70b-8192",
126
+ max_tokens=512 # Further reduced to minimize resource usage
127
  )
128
 
129
  # Sidebar for URL and YouTube input
 
216
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
217
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
218
  'Accept-Language': 'en-US,en;q=0.5',
 
 
219
  },
220
  'cookiefile': 'cookies.txt',
 
 
 
 
221
  }
222
  try:
223
  if not os.path.exists('cookies.txt'):
 
323
  st.error("No content loaded from URL. Try a different URL (e.g., https://www.bbc.com/news/science-environment-67299122).")
324
  st.stop()
325
 
326
+ # Initialize embeddings only when needed
327
+ if "embeddings" not in st.session_state:
328
+ st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
329
+
330
  st.session_state.url_content = "\n".join([doc.page_content for doc in data])
331
  embeddings = st.session_state.embeddings
332
  st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
333
  st.session_state.index_created = True
334
+ st.session_state.content_type = "web"
335
  st.session_state.summary = None
336
  st.text("Content processed successfully! ✅✅✅")
337
  except Exception as e:
 
367
  "1. Captions are not enabled for this video.\n"
368
  "2. YouTube detected this request as a bot (even with cookies.txt).\n"
369
  "Solutions:\n"
370
+ "- Ensure captions are enabled for the video by checking the video settings on YouTube (gear icon > Subtitles/CC > Enable if available).\n"
371
  "- Regenerate and upload a fresh cookies.txt file (see instructions above).\n"
372
+ "- Try a different video (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ, which has transcripts available).\n"
373
+ "- Test locally to rule out Hugging Face Spaces IP restrictions by running: pip install -r requirements.txt && streamlit run app.py"
374
  )
375
  st.stop()
376
 
 
382
  # No vector store for YouTube videos since we're not doing QA
383
  st.session_state.vectorstore = None
384
  st.session_state.index_created = False
385
+ st.session_state.content_type = "youtube"
386
  st.session_state.summary = None
387
  st.text("YouTube video processed successfully! ✅✅✅")
388
  except Exception as e:
 
393
  with main_container:
394
  if st.session_state.url_content and st.button("Generate Summary"):
395
  with st.spinner("Generating summary..."):
396
+ is_youtube = st.session_state.content_type == "youtube"
 
397
  st.session_state.summary = summarize_content(st.session_state.url_content, st.session_state.llm, is_youtube=is_youtube)
 
398
 
399
  # Display summary if generated
400
  if st.session_state.summary:
 
403
  st.write(st.session_state.summary)
404
 
405
  # Query input with Ask button (only for web URLs)
406
+ if st.session_state.url_content and st.session_state.content_type == "web":
407
  with main_container:
408
  st.header("Ask a Question")
409
  query = st.text_input("Question", placeholder="e.g., What is the article about?")