"""Load PDF uploads and web pages into chunked documents tagged with source metadata."""

import os
import tempfile
from datetime import datetime
from typing import List

import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader

# Chunking parameters shared by every loader in this module.
_CHUNK_SIZE = 1000
_CHUNK_OVERLAP = 200


def _split_documents(documents: List) -> List:
    """Split loaded documents into overlapping chunks using the module-wide settings."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=_CHUNK_SIZE,
        chunk_overlap=_CHUNK_OVERLAP,
    )
    return text_splitter.split_documents(documents)


def process_pdf(file) -> List:
    """Process PDF file and add source metadata.

    The upload is spooled to a temporary ``.pdf`` file so that ``PyPDFLoader``
    can read it by path; that temporary file is removed before returning.

    Args:
        file: Object exposing ``getvalue()`` (the raw PDF bytes) and ``name``
            (presumably a Streamlit ``UploadedFile`` -- confirm against caller).

    Returns:
        The chunked documents, each carrying ``source_type``, ``file_name`` and
        ``timestamp`` metadata. On any failure the error is shown through
        ``st.error`` and an empty list is returned.
    """
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            # Record the path first so cleanup still happens if the write fails.
            tmp_path = tmp_file.name
            tmp_file.write(file.getvalue())

        # Load only after the handle is closed: closing flushes buffered bytes,
        # so the loader sees the complete file.
        documents = PyPDFLoader(tmp_path).load()

        # Add source metadata
        for doc in documents:
            doc.metadata.update({
                "source_type": "pdf",
                "file_name": file.name,
                "timestamp": datetime.now().isoformat(),
            })

        return _split_documents(documents)
    except Exception as e:
        # UI boundary: report the problem to the user instead of crashing the app.
        st.error(f"📄 PDF processing error: {e}")
        return []
    finally:
        # delete=False leaves the file on disk, so remove it explicitly.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not mask the result.
                pass


def process_web(url: str) -> List:
    """Process web URL and add source metadata.

    Args:
        url: Address of the page handed to ``WebBaseLoader``.

    Returns:
        The chunked documents, each carrying ``source_type``, ``url`` and
        ``timestamp`` metadata. On any failure the error is shown through
        ``st.error`` and an empty list is returned.
    """
    try:
        documents = WebBaseLoader(web_path=url).load()

        # Add source metadata
        for doc in documents:
            doc.metadata.update({
                "source_type": "url",
                "url": url,
                "timestamp": datetime.now().isoformat(),
            })

        return _split_documents(documents)
    except Exception as e:
        # UI boundary: report the problem to the user instead of crashing the app.
        st.error(f"🌐 Web processing error: {e}")
        return []