|
|
import os
import tempfile
from datetime import datetime
from typing import List

import streamlit as st

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
|
|
|
|
|
def process_pdf(file) -> List:
    """Convert an uploaded PDF into metadata-tagged text chunks.

    Args:
        file: Uploaded file object (e.g. a Streamlit ``UploadedFile``)
            exposing ``getvalue()`` and ``name``.

    Returns:
        List of Document chunks (chunk_size=1000, chunk_overlap=200), each
        with ``source_type``/``file_name``/``timestamp`` metadata added.
        Returns an empty list if processing fails (error shown in the UI).
    """
    tmp_path = None
    try:
        # PyPDFLoader needs a filesystem path, so spill the upload to a
        # named temp file. delete=False keeps the file past the `with` so
        # the loader can open it by path (required on Windows, where an
        # open handle can't be re-opened); we remove it in `finally`.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            tmp_file.write(file.getvalue())
            tmp_path = tmp_file.name

        # Load AFTER the `with` block: the handle is now closed, so all
        # buffered bytes are flushed to disk before the loader reads them.
        documents = PyPDFLoader(tmp_path).load()

        # Tag each page with its origin so retrieved chunks are traceable.
        for doc in documents:
            doc.metadata.update({
                "source_type": "pdf",
                "file_name": file.name,
                "timestamp": datetime.now().isoformat()
            })

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        return text_splitter.split_documents(documents)

    except Exception as e:
        # UI boundary: surface the failure to the user, degrade to empty.
        st.error(f"π PDF processing error: {str(e)}")
        return []
    finally:
        # delete=False means no auto-cleanup — remove the temp file ourselves.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass  # best-effort cleanup; nothing useful to do on failure
|
|
|
|
|
|
|
|
def process_web(url: str) -> List:
    """Fetch a web page and split it into metadata-tagged text chunks.

    Args:
        url: Address of the page to load.

    Returns:
        List of Document chunks (chunk_size=1000, chunk_overlap=200), each
        carrying ``source_type``/``url``/``timestamp`` metadata. Returns an
        empty list if loading or splitting fails (error shown in the UI).
    """
    try:
        pages = WebBaseLoader(web_path=url).load()

        # Record provenance on every page so retrieval hits can be traced
        # back to the originating URL.
        for page in pages:
            page.metadata.update({
                "source_type": "url",
                "url": url,
                "timestamp": datetime.now().isoformat()
            })

        splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
                                                  chunk_overlap=200)
        return splitter.split_documents(pages)

    except Exception as e:
        # UI boundary: report the failure and degrade to an empty result.
        st.error(f"π Web processing error: {str(e)}")
        return []
|
|
|