Spaces:

MahatirTusher
/

WebChatter

Sleeping

App Files Community

MahatirTusher committed on Apr 22, 2025

Commit

2965a81

verified ·

1 Parent(s): e4a5b6a

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -6

app.py CHANGED Viewed

@@ -120,6 +120,8 @@ if "vectorstore" not in st.session_state:
     st.session_state.vectorstore = None
 if "summary" not in st.session_state:
     st.session_state.summary = None
 # Initialize embeddings once at the start
 if "embeddings" not in st.session_state:
@@ -297,14 +299,18 @@ def extract_subtitles_with_ytdlp(video_url):
         st.error(f"Error fetching captions with yt-dlp: {str(e)}")
         return None
-# Function to process and chunk text (web or YouTube)
-def process_content(text, embeddings):
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1000,
         chunk_overlap=200,
         separators=["\n\n", "\n", ".", " ", ""]
     )
-    docs = text_splitter.create_documents([text])
     vectorstore = FAISS.from_documents(docs, embeddings)
     return vectorstore
@@ -346,8 +352,10 @@ if process_url_clicked:
                     # Store content for summarization
                     st.session_state.url_content = "\n".join([doc.page_content for doc in data])
                     embeddings = st.session_state.embeddings
-                    st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings)
                     st.session_state.index_created = True
                     st.text("Content processed successfully! ✅✅✅")
                 except Exception as e:
                     st.error(f"Error processing URL: {str(e)}")
@@ -390,8 +398,10 @@ if process_youtube_clicked:
                     # Process the transcript
                     st.session_state.url_content = transcript_text
                     embeddings = st.session_state.embeddings
-                    st.session_state.vectorstore = process_content(transcript_text, embeddings)
                     st.session_state.index_created = True
                     st.text("YouTube video processed successfully! ✅✅✅")
                 except Exception as e:
                     st.error(f"Error processing YouTube video: {str(e)}")
@@ -421,7 +431,7 @@ if ask_clicked and query:
         else:
             with st.spinner("Processing your question..."):
                 try:
-                    if "qa_chain" not in st.session_state:
                         st.session_state.qa_chain = create_qa_chain(st.session_state.vectorstore, st.session_state.llm)
                     result = st.session_state.qa_chain({"question": query}, return_only_outputs=True)

     st.session_state.vectorstore = None
 if "summary" not in st.session_state:
     st.session_state.summary = None
+if "qa_chain" not in st.session_state:
+    st.session_state.qa_chain = None  # Clear any cached QA chain
 # Initialize embeddings once at the start
 if "embeddings" not in st.session_state:
         st.error(f"Error fetching captions with yt-dlp: {str(e)}")
         return None
+# Function to process and chunk text (web or YouTube) with source metadata
+def process_content(text, embeddings, source):
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1000,
         chunk_overlap=200,
         separators=["\n\n", "\n", ".", " ", ""]
     )
+    # Create documents with source metadata
+    docs = text_splitter.create_documents([text], metadatas=[{"source": source}])
+    # Debug: Check metadata of the first document
+    if docs:
+        st.text(f"Document metadata: {docs[0].metadata}")
     vectorstore = FAISS.from_documents(docs, embeddings)
     return vectorstore
                     # Store content for summarization
                     st.session_state.url_content = "\n".join([doc.page_content for doc in data])
                     embeddings = st.session_state.embeddings
+                    # Pass the URL as the source metadata
+                    st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
                     st.session_state.index_created = True
+                    st.session_state.qa_chain = None  # Clear cached QA chain
                     st.text("Content processed successfully! ✅✅✅")
                 except Exception as e:
                     st.error(f"Error processing URL: {str(e)}")
                     # Process the transcript
                     st.session_state.url_content = transcript_text
                     embeddings = st.session_state.embeddings
+                    # Pass the YouTube URL as the source metadata
+                    st.session_state.vectorstore = process_content(transcript_text, embeddings, source=youtube_url.strip())
                     st.session_state.index_created = True
+                    st.session_state.qa_chain = None  # Clear cached QA chain
                     st.text("YouTube video processed successfully! ✅✅✅")
                 except Exception as e:
                     st.error(f"Error processing YouTube video: {str(e)}")
         else:
             with st.spinner("Processing your question..."):
                 try:
+                    if "qa_chain" not in st.session_state or st.session_state.qa_chain is None:
                         st.session_state.qa_chain = create_qa_chain(st.session_state.vectorstore, st.session_state.llm)
                     result = st.session_state.qa_chain({"question": query}, return_only_outputs=True)