Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -115,17 +115,15 @@ if "vectorstore" not in st.session_state:
|
|
| 115 |
st.session_state.vectorstore = None
|
| 116 |
if "index_created" not in st.session_state:
|
| 117 |
st.session_state.index_created = False
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
if "embeddings" not in st.session_state:
|
| 121 |
-
st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 122 |
|
| 123 |
# Initialize LLM once at the start
|
| 124 |
if "llm" not in st.session_state:
|
| 125 |
st.session_state.llm = ChatGroq(
|
| 126 |
api_key=GROQ_API_KEY,
|
| 127 |
model="llama3-70b-8192",
|
| 128 |
-
max_tokens=
|
| 129 |
)
|
| 130 |
|
| 131 |
# Sidebar for URL and YouTube input
|
|
@@ -218,14 +216,8 @@ def extract_subtitles_with_ytdlp(video_url):
|
|
| 218 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
|
| 219 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 220 |
'Accept-Language': 'en-US,en;q=0.5',
|
| 221 |
-
'Referer': 'https://www.youtube.com/',
|
| 222 |
-
'Origin': 'https://www.youtube.com',
|
| 223 |
},
|
| 224 |
'cookiefile': 'cookies.txt',
|
| 225 |
-
'retries': 5, # Increased retries
|
| 226 |
-
'retry_sleep': 3,
|
| 227 |
-
'geo_bypass': True, # Attempt to bypass geo-restrictions
|
| 228 |
-
'no_check_certificate': True, # Bypass certificate issues
|
| 229 |
}
|
| 230 |
try:
|
| 231 |
if not os.path.exists('cookies.txt'):
|
|
@@ -331,10 +323,15 @@ if process_url_clicked:
|
|
| 331 |
st.error("No content loaded from URL. Try a different URL (e.g., https://www.bbc.com/news/science-environment-67299122).")
|
| 332 |
st.stop()
|
| 333 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
st.session_state.url_content = "\n".join([doc.page_content for doc in data])
|
| 335 |
embeddings = st.session_state.embeddings
|
| 336 |
st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
|
| 337 |
st.session_state.index_created = True
|
|
|
|
| 338 |
st.session_state.summary = None
|
| 339 |
st.text("Content processed successfully! ✅✅✅")
|
| 340 |
except Exception as e:
|
|
@@ -370,10 +367,10 @@ if process_youtube_clicked:
|
|
| 370 |
"1. Captions are not enabled for this video.\n"
|
| 371 |
"2. YouTube detected this request as a bot (even with cookies.txt).\n"
|
| 372 |
"Solutions:\n"
|
| 373 |
-
"- Ensure captions are enabled for the video.\n"
|
| 374 |
"- Regenerate and upload a fresh cookies.txt file (see instructions above).\n"
|
| 375 |
-
"- Try a different video (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ).\n"
|
| 376 |
-
"- Test locally to rule out Hugging Face Spaces IP restrictions."
|
| 377 |
)
|
| 378 |
st.stop()
|
| 379 |
|
|
@@ -385,6 +382,7 @@ if process_youtube_clicked:
|
|
| 385 |
# No vector store for YouTube videos since we're not doing QA
|
| 386 |
st.session_state.vectorstore = None
|
| 387 |
st.session_state.index_created = False
|
|
|
|
| 388 |
st.session_state.summary = None
|
| 389 |
st.text("YouTube video processed successfully! ✅✅✅")
|
| 390 |
except Exception as e:
|
|
@@ -395,10 +393,8 @@ if process_youtube_clicked:
|
|
| 395 |
with main_container:
|
| 396 |
if st.session_state.url_content and st.button("Generate Summary"):
|
| 397 |
with st.spinner("Generating summary..."):
|
| 398 |
-
|
| 399 |
-
is_youtube = youtube_url.strip() and youtube_url == (st.session_state.get('last_processed_url', ''))
|
| 400 |
st.session_state.summary = summarize_content(st.session_state.url_content, st.session_state.llm, is_youtube=is_youtube)
|
| 401 |
-
st.session_state.last_processed_url = youtube_url if is_youtube else url
|
| 402 |
|
| 403 |
# Display summary if generated
|
| 404 |
if st.session_state.summary:
|
|
@@ -407,7 +403,7 @@ if st.session_state.summary:
|
|
| 407 |
st.write(st.session_state.summary)
|
| 408 |
|
| 409 |
# Query input with Ask button (only for web URLs)
|
| 410 |
-
if st.session_state.url_content and
|
| 411 |
with main_container:
|
| 412 |
st.header("Ask a Question")
|
| 413 |
query = st.text_input("Question", placeholder="e.g., What is the article about?")
|
|
|
|
| 115 |
st.session_state.vectorstore = None
|
| 116 |
if "index_created" not in st.session_state:
|
| 117 |
st.session_state.index_created = False
|
| 118 |
+
if "content_type" not in st.session_state:
|
| 119 |
+
st.session_state.content_type = None
|
|
|
|
|
|
|
| 120 |
|
| 121 |
# Initialize LLM once at the start
|
| 122 |
if "llm" not in st.session_state:
|
| 123 |
st.session_state.llm = ChatGroq(
|
| 124 |
api_key=GROQ_API_KEY,
|
| 125 |
model="llama3-70b-8192",
|
| 126 |
+
max_tokens=512 # Further reduced to minimize resource usage
|
| 127 |
)
|
| 128 |
|
| 129 |
# Sidebar for URL and YouTube input
|
|
|
|
| 216 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
|
| 217 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 218 |
'Accept-Language': 'en-US,en;q=0.5',
|
|
|
|
|
|
|
| 219 |
},
|
| 220 |
'cookiefile': 'cookies.txt',
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
}
|
| 222 |
try:
|
| 223 |
if not os.path.exists('cookies.txt'):
|
|
|
|
| 323 |
st.error("No content loaded from URL. Try a different URL (e.g., https://www.bbc.com/news/science-environment-67299122).")
|
| 324 |
st.stop()
|
| 325 |
|
| 326 |
+
# Initialize embeddings only when needed
|
| 327 |
+
if "embeddings" not in st.session_state:
|
| 328 |
+
st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 329 |
+
|
| 330 |
st.session_state.url_content = "\n".join([doc.page_content for doc in data])
|
| 331 |
embeddings = st.session_state.embeddings
|
| 332 |
st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
|
| 333 |
st.session_state.index_created = True
|
| 334 |
+
st.session_state.content_type = "web"
|
| 335 |
st.session_state.summary = None
|
| 336 |
st.text("Content processed successfully! ✅✅✅")
|
| 337 |
except Exception as e:
|
|
|
|
| 367 |
"1. Captions are not enabled for this video.\n"
|
| 368 |
"2. YouTube detected this request as a bot (even with cookies.txt).\n"
|
| 369 |
"Solutions:\n"
|
| 370 |
+
"- Ensure captions are enabled for the video by checking the video settings on YouTube (gear icon > Subtitles/CC > Enable if available).\n"
|
| 371 |
"- Regenerate and upload a fresh cookies.txt file (see instructions above).\n"
|
| 372 |
+
"- Try a different video (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ, which has transcripts available).\n"
|
| 373 |
+
"- Test locally to rule out Hugging Face Spaces IP restrictions by running: pip install -r requirements.txt && streamlit run app.py"
|
| 374 |
)
|
| 375 |
st.stop()
|
| 376 |
|
|
|
|
| 382 |
# No vector store for YouTube videos since we're not doing QA
|
| 383 |
st.session_state.vectorstore = None
|
| 384 |
st.session_state.index_created = False
|
| 385 |
+
st.session_state.content_type = "youtube"
|
| 386 |
st.session_state.summary = None
|
| 387 |
st.text("YouTube video processed successfully! ✅✅✅")
|
| 388 |
except Exception as e:
|
|
|
|
| 393 |
with main_container:
|
| 394 |
if st.session_state.url_content and st.button("Generate Summary"):
|
| 395 |
with st.spinner("Generating summary..."):
|
| 396 |
+
is_youtube = st.session_state.content_type == "youtube"
|
|
|
|
| 397 |
st.session_state.summary = summarize_content(st.session_state.url_content, st.session_state.llm, is_youtube=is_youtube)
|
|
|
|
| 398 |
|
| 399 |
# Display summary if generated
|
| 400 |
if st.session_state.summary:
|
|
|
|
| 403 |
st.write(st.session_state.summary)
|
| 404 |
|
| 405 |
# Query input with Ask button (only for web URLs)
|
| 406 |
+
if st.session_state.url_content and st.session_state.content_type == "web":
|
| 407 |
with main_container:
|
| 408 |
st.header("Ask a Question")
|
| 409 |
query = st.text_input("Question", placeholder="e.g., What is the article about?")
|