MahatirTusher committed on
Commit
2965a81
·
verified ·
1 Parent(s): e4a5b6a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -6
app.py CHANGED
@@ -120,6 +120,8 @@ if "vectorstore" not in st.session_state:
120
  st.session_state.vectorstore = None
121
  if "summary" not in st.session_state:
122
  st.session_state.summary = None
 
 
123
 
124
  # Initialize embeddings once at the start
125
  if "embeddings" not in st.session_state:
@@ -297,14 +299,18 @@ def extract_subtitles_with_ytdlp(video_url):
297
  st.error(f"Error fetching captions with yt-dlp: {str(e)}")
298
  return None
299
 
300
# Chunk raw text (web page or YouTube transcript) and index it for retrieval
def process_content(text, embeddings):
    """Split ``text`` into overlapping chunks and build a FAISS store from them.

    Args:
        text: Raw text to split (web page content or a video transcript).
        embeddings: Embedding object handed to ``FAISS.from_documents``.

    Returns:
        The vector store returned by ``FAISS.from_documents``.
    """
    chunker = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", " ", ""],
        chunk_overlap=200,
        chunk_size=1000,
    )
    chunks = chunker.create_documents([text])
    return FAISS.from_documents(chunks, embeddings)
310
 
@@ -346,8 +352,10 @@ if process_url_clicked:
346
  # Store content for summarization
347
  st.session_state.url_content = "\n".join([doc.page_content for doc in data])
348
  embeddings = st.session_state.embeddings
349
- st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings)
 
350
  st.session_state.index_created = True
 
351
  st.text("Content processed successfully! βœ…βœ…βœ…")
352
  except Exception as e:
353
  st.error(f"Error processing URL: {str(e)}")
@@ -390,8 +398,10 @@ if process_youtube_clicked:
390
  # Process the transcript
391
  st.session_state.url_content = transcript_text
392
  embeddings = st.session_state.embeddings
393
- st.session_state.vectorstore = process_content(transcript_text, embeddings)
 
394
  st.session_state.index_created = True
 
395
  st.text("YouTube video processed successfully! βœ…βœ…βœ…")
396
  except Exception as e:
397
  st.error(f"Error processing YouTube video: {str(e)}")
@@ -421,7 +431,7 @@ if ask_clicked and query:
421
  else:
422
  with st.spinner("Processing your question..."):
423
  try:
424
- if "qa_chain" not in st.session_state:
425
  st.session_state.qa_chain = create_qa_chain(st.session_state.vectorstore, st.session_state.llm)
426
 
427
  result = st.session_state.qa_chain({"question": query}, return_only_outputs=True)
 
120
  st.session_state.vectorstore = None
121
  if "summary" not in st.session_state:
122
  st.session_state.summary = None
123
+ if "qa_chain" not in st.session_state:
124
+ st.session_state.qa_chain = None # Clear any cached QA chain
125
 
126
  # Initialize embeddings once at the start
127
  if "embeddings" not in st.session_state:
 
299
  st.error(f"Error fetching captions with yt-dlp: {str(e)}")
300
  return None
301
 
302
# Function to process and chunk text (web or YouTube) with source metadata
def process_content(text, embeddings, source):
    """Split ``text`` into overlapping chunks and index them in a FAISS store.

    Args:
        text: Raw text to index (web page content or a video transcript).
        embeddings: Embedding object handed to ``FAISS.from_documents``.
        source: Value stored under the ``"source"`` metadata key that is
            passed to the splitter (callers pass the stripped URL).

    Returns:
        The vector store returned by ``FAISS.from_documents``.

    Raises:
        ValueError: If ``text`` is empty/whitespace-only or the splitter
            produces no documents. Callers wrap this function in
            ``except Exception`` and show the message to the user, so a
            descriptive error is preferable to a low-level failure later on.
    """
    # Fail early with a readable message instead of letting an empty input
    # reach the splitter / index construction.
    if not text or not text.strip():
        raise ValueError("No text content was found to process.")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", ".", " ", ""],
    )
    # Create documents with source metadata
    docs = text_splitter.create_documents([text], metadatas=[{"source": source}])
    if not docs:
        # NOTE(review): building an index from an empty list is presumed to
        # fail inside FAISS; raise a clear error here instead.
        raise ValueError("The content could not be split into any chunks.")

    # Debug: Check metadata of the first document
    st.text(f"Document metadata: {docs[0].metadata}")
    vectorstore = FAISS.from_documents(docs, embeddings)
    return vectorstore
316
 
 
352
  # Store content for summarization
353
  st.session_state.url_content = "\n".join([doc.page_content for doc in data])
354
  embeddings = st.session_state.embeddings
355
+ # Pass the URL as the source metadata
356
+ st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
357
  st.session_state.index_created = True
358
+ st.session_state.qa_chain = None # Clear cached QA chain
359
  st.text("Content processed successfully! βœ…βœ…βœ…")
360
  except Exception as e:
361
  st.error(f"Error processing URL: {str(e)}")
 
398
  # Process the transcript
399
  st.session_state.url_content = transcript_text
400
  embeddings = st.session_state.embeddings
401
+ # Pass the YouTube URL as the source metadata
402
+ st.session_state.vectorstore = process_content(transcript_text, embeddings, source=youtube_url.strip())
403
  st.session_state.index_created = True
404
+ st.session_state.qa_chain = None # Clear cached QA chain
405
  st.text("YouTube video processed successfully! βœ…βœ…βœ…")
406
  except Exception as e:
407
  st.error(f"Error processing YouTube video: {str(e)}")
 
431
  else:
432
  with st.spinner("Processing your question..."):
433
  try:
434
+ if "qa_chain" not in st.session_state or st.session_state.qa_chain is None:
435
  st.session_state.qa_chain = create_qa_chain(st.session_state.vectorstore, st.session_state.llm)
436
 
437
  result = st.session_state.qa_chain({"question": query}, return_only_outputs=True)