Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -120,6 +120,8 @@ if "vectorstore" not in st.session_state:
|
|
| 120 |
st.session_state.vectorstore = None
|
| 121 |
if "summary" not in st.session_state:
|
| 122 |
st.session_state.summary = None
|
|
|
|
|
|
|
| 123 |
|
| 124 |
# Initialize embeddings once at the start
|
| 125 |
if "embeddings" not in st.session_state:
|
|
@@ -297,14 +299,18 @@ def extract_subtitles_with_ytdlp(video_url):
|
|
| 297 |
st.error(f"Error fetching captions with yt-dlp: {str(e)}")
|
| 298 |
return None
|
| 299 |
|
| 300 |
-
# Function to process and chunk text (web or YouTube)
|
| 301 |
-
def process_content(text, embeddings):
|
| 302 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 303 |
chunk_size=1000,
|
| 304 |
chunk_overlap=200,
|
| 305 |
separators=["\n\n", "\n", ".", " ", ""]
|
| 306 |
)
|
| 307 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
vectorstore = FAISS.from_documents(docs, embeddings)
|
| 309 |
return vectorstore
|
| 310 |
|
|
@@ -346,8 +352,10 @@ if process_url_clicked:
|
|
| 346 |
# Store content for summarization
|
| 347 |
st.session_state.url_content = "\n".join([doc.page_content for doc in data])
|
| 348 |
embeddings = st.session_state.embeddings
|
| 349 |
-
|
|
|
|
| 350 |
st.session_state.index_created = True
|
|
|
|
| 351 |
st.text("Content processed successfully! ✅✅✅")
|
| 352 |
except Exception as e:
|
| 353 |
st.error(f"Error processing URL: {str(e)}")
|
|
@@ -390,8 +398,10 @@ if process_youtube_clicked:
|
|
| 390 |
# Process the transcript
|
| 391 |
st.session_state.url_content = transcript_text
|
| 392 |
embeddings = st.session_state.embeddings
|
| 393 |
-
|
|
|
|
| 394 |
st.session_state.index_created = True
|
|
|
|
| 395 |
st.text("YouTube video processed successfully! ✅✅✅")
|
| 396 |
except Exception as e:
|
| 397 |
st.error(f"Error processing YouTube video: {str(e)}")
|
|
@@ -421,7 +431,7 @@ if ask_clicked and query:
|
|
| 421 |
else:
|
| 422 |
with st.spinner("Processing your question..."):
|
| 423 |
try:
|
| 424 |
-
if "qa_chain" not in st.session_state:
|
| 425 |
st.session_state.qa_chain = create_qa_chain(st.session_state.vectorstore, st.session_state.llm)
|
| 426 |
|
| 427 |
result = st.session_state.qa_chain({"question": query}, return_only_outputs=True)
|
|
|
|
| 120 |
st.session_state.vectorstore = None
|
| 121 |
if "summary" not in st.session_state:
|
| 122 |
st.session_state.summary = None
|
| 123 |
+
if "qa_chain" not in st.session_state:
|
| 124 |
+
st.session_state.qa_chain = None # Clear any cached QA chain
|
| 125 |
|
| 126 |
# Initialize embeddings once at the start
|
| 127 |
if "embeddings" not in st.session_state:
|
|
|
|
| 299 |
st.error(f"Error fetching captions with yt-dlp: {str(e)}")
|
| 300 |
return None
|
| 301 |
|
| 302 |
+
# Function to process and chunk text (web or YouTube) with source metadata
|
| 303 |
+
def process_content(text, embeddings, source):
    """Split text into overlapping chunks and index them in a FAISS store.

    Every chunk is created with ``{"source": source}`` as its metadata, so
    the origin of the content (the callers pass the web page URL or the
    YouTube URL) travels with each indexed document.

    Args:
        text: Raw text to index (page content or a video transcript).
        embeddings: Embedding model handed to ``FAISS.from_documents``.
        source: Value stored under the ``"source"`` metadata key of each chunk.

    Returns:
        The FAISS vector store built from the chunked documents.

    Raises:
        ValueError: If splitting ``text`` yields no documents (for example
            when ``text`` is empty), because there is nothing to index.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", ".", " ", ""],
    )

    # Create documents with source metadata attached to every chunk.
    docs = text_splitter.create_documents([text], metadatas=[{"source": source}])

    # Fail early with a readable message instead of passing an empty list to
    # FAISS, which is expected to fail with an opaque library error. The
    # callers wrap this function in `except Exception` and surface the
    # message via st.error.
    if not docs:
        raise ValueError("No text content was found to index.")

    # Debug: show the metadata of the first document.
    st.text(f"Document metadata: {docs[0].metadata}")

    vectorstore = FAISS.from_documents(docs, embeddings)
    return vectorstore
|
| 316 |
|
|
|
|
| 352 |
# Store content for summarization
|
| 353 |
st.session_state.url_content = "\n".join([doc.page_content for doc in data])
|
| 354 |
embeddings = st.session_state.embeddings
|
| 355 |
+
# Pass the URL as the source metadata
|
| 356 |
+
st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
|
| 357 |
st.session_state.index_created = True
|
| 358 |
+
st.session_state.qa_chain = None # Clear cached QA chain
|
| 359 |
st.text("Content processed successfully! ✅✅✅")
|
| 360 |
except Exception as e:
|
| 361 |
st.error(f"Error processing URL: {str(e)}")
|
|
|
|
| 398 |
# Process the transcript
|
| 399 |
st.session_state.url_content = transcript_text
|
| 400 |
embeddings = st.session_state.embeddings
|
| 401 |
+
# Pass the YouTube URL as the source metadata
|
| 402 |
+
st.session_state.vectorstore = process_content(transcript_text, embeddings, source=youtube_url.strip())
|
| 403 |
st.session_state.index_created = True
|
| 404 |
+
st.session_state.qa_chain = None # Clear cached QA chain
|
| 405 |
st.text("YouTube video processed successfully! ✅✅✅")
|
| 406 |
except Exception as e:
|
| 407 |
st.error(f"Error processing YouTube video: {str(e)}")
|
|
|
|
| 431 |
else:
|
| 432 |
with st.spinner("Processing your question..."):
|
| 433 |
try:
|
| 434 |
+
if "qa_chain" not in st.session_state or st.session_state.qa_chain is None:
|
| 435 |
st.session_state.qa_chain = create_qa_chain(st.session_state.vectorstore, st.session_state.llm)
|
| 436 |
|
| 437 |
result = st.session_state.qa_chain({"question": query}, return_only_outputs=True)
|