hugging2021's picture
Upload folder using huggingface_hub
ca637d1 verified
import os
import tempfile
from datetime import datetime
from typing import List

import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
def process_pdf(file) -> List:
    """Split an uploaded PDF into text chunks tagged with source metadata.

    The upload's bytes are written to a temporary ``.pdf`` file because
    ``PyPDFLoader`` is constructed from a file path. The temporary file is
    closed before the loader reads it and is removed once processing ends,
    whether or not it succeeded.

    Args:
        file: Uploaded file object exposing ``getvalue()`` (the raw bytes)
            and ``name``. Presumably a Streamlit ``UploadedFile`` -- confirm
            against the caller.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``
        (chunk size 1000, overlap 200) applied to the loaded documents, each
        of which has ``source_type``, ``file_name`` and ``timestamp`` added
        to its metadata first. Returns an empty list if any step fails; the
        failure is reported to the UI through ``st.error``.
    """
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            # Record the path first so cleanup still happens if the write fails.
            tmp_path = tmp_file.name
            tmp_file.write(file.getvalue())
        # Leaving the ``with`` block closes (and therefore flushes) the file,
        # so the loader sees the complete PDF rather than a partial buffer.
        documents = PyPDFLoader(tmp_path).load()

        # One timestamp per upload, so every page shares the same
        # ingestion time.
        source_metadata = {
            "source_type": "pdf",
            "file_name": file.name,
            "timestamp": datetime.now().isoformat(),
        }
        for doc in documents:
            doc.metadata.update(source_metadata)

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
        )
        return text_splitter.split_documents(documents)
    except Exception as e:
        # UI boundary: report the problem to the user instead of crashing.
        st.error(f"📄 PDF processing error: {e}")
        return []
    finally:
        # ``delete=False`` means nothing else will remove the file for us.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not mask
                # the real result or error.
                pass
def process_web(url: str) -> List:
    """Load a web page and split it into text chunks tagged with source metadata.

    Args:
        url: Address of the page, passed to ``WebBaseLoader`` as ``web_path``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``
        (chunk size 1000, overlap 200) applied to the loaded documents, each
        of which has ``source_type``, ``url`` and ``timestamp`` added to its
        metadata first. Returns an empty list if any step fails; the failure
        is reported to the UI through ``st.error``.
    """
    try:
        documents = WebBaseLoader(web_path=url).load()

        # One timestamp per fetch, so every document from this URL shares
        # the same ingestion time.
        source_metadata = {
            "source_type": "url",
            "url": url,
            "timestamp": datetime.now().isoformat(),
        }
        for doc in documents:
            doc.metadata.update(source_metadata)

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
        )
        return text_splitter.split_documents(documents)
    except Exception as e:
        # UI boundary: report the problem to the user instead of crashing.
        st.error(f"🌐 Web processing error: {e}")
        return []