# upload-pdf / app.py
# GovindRaj's picture
# Update app.py
# 3dde6f1 verified
import streamlit as st
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import tempfile
from huggingface_hub import HfApi, HfFolder
# Local directory the FAISS index is saved to, and reloaded from when it already exists.
DB_FAISS_PATH = 'vectorstore/db_faiss'
# Repo ID of the Hugging Face Space that receives the uploaded index folder.
SPACE_REPO = "GovindRaj/ebiz-chatbot" # Your Hugging Face Space ID
# Function to create or update FAISS vector DB and upload to Hugging Face Space
def create_vector_db(uploaded_files):
    """Index uploaded PDFs into the local FAISS store and push it to the Space.

    The PDFs are written to a temporary directory, parsed with ``PyPDFLoader``,
    split into 500-character chunks (50 overlap), embedded with
    ``sentence-transformers/all-MiniLM-L6-v2`` on CPU, and added to the FAISS
    index at ``DB_FAISS_PATH`` (created if it does not exist yet). The index
    folder is then uploaded to ``SPACE_REPO`` under ``faiss_data``.

    Args:
        uploaded_files: Iterable of uploaded-file objects exposing ``.name``
            and ``.getvalue()`` (as returned by ``st.file_uploader``).

    Returns:
        True once the index has been saved locally and uploaded.

    Raises:
        ValueError: If the ``HF_TOKEN`` environment variable is not set, or if
            no text could be extracted from the uploaded files.
    """
    # Validate the token before doing any work: otherwise a missing token
    # leaves the local index modified but not uploaded, and a retry would
    # index the same PDFs a second time.
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError("Hugging Face token not found. Please set the token in Hugging Face secrets.")

    documents = []
    with tempfile.TemporaryDirectory() as temp_dir:
        for position, file in enumerate(uploaded_files):
            # Keep only the final path component so a crafted upload name
            # cannot escape the temporary directory.
            pdf_name = os.path.basename(file.name)
            # Case-insensitive so "REPORT.PDF" is not silently skipped.
            if not pdf_name.lower().endswith('.pdf'):
                continue
            # One subdirectory per upload: two files sharing a name no longer
            # overwrite each other, and the stored file name stays unchanged.
            slot_dir = os.path.join(temp_dir, str(position))
            os.makedirs(slot_dir)
            pdf_path = os.path.join(slot_dir, pdf_name)
            with open(pdf_path, "wb") as f:
                f.write(file.getvalue())
            # Load while the temporary file still exists.
            documents.extend(PyPDFLoader(pdf_path).load())

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50
    )
    texts = text_splitter.split_documents(documents)
    if not texts:
        # Covers both "no PDF uploaded" and PDFs without extractable text;
        # building or extending an index from nothing fails obscurely.
        raise ValueError("No text could be extracted from the uploaded PDF files.")

    # Create embeddings
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'}
    )

    if os.path.exists(DB_FAISS_PATH):
        # NOTE(security): load_local deserializes pickled data. This is only
        # acceptable while DB_FAISS_PATH is written exclusively by this app;
        # never point it at an index obtained from an untrusted source.
        db = FAISS.load_local(DB_FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
        db.add_documents(texts)
    else:
        # Create a new FAISS database if none exists
        db = FAISS.from_documents(texts, embeddings)

    # Save the updated FAISS database locally
    db.save_local(DB_FAISS_PATH)

    # Push the updated vector database to the Hugging Face Space. The token is
    # passed explicitly, so it is not persisted to disk.
    api = HfApi()
    api.upload_folder(
        folder_path=DB_FAISS_PATH,   # Local path to the FAISS folder
        path_in_repo="faiss_data",   # Path in the Space repo
        repo_id=SPACE_REPO,          # Hugging Face Space ID
        repo_type="space",           # Specify that this is a Space
        token=hf_token               # Use the token from secrets
    )
    return True
# Streamlit app
def main():
    """Render the upload page and build the vector database on request.

    Shows a multi-file PDF uploader and a button. When the button is pressed
    and at least one file is present, ``create_vector_db`` is run under a
    spinner; a success message is shown if it returns a truthy value, and any
    exception is reported on the page instead of propagating.
    """
    st.title("PDF to Vector Database Converter")

    pdf_uploads = st.file_uploader(
        "Upload PDF files",
        type=['pdf'],
        accept_multiple_files=True
    )

    # The button is always rendered; work starts only when it was clicked
    # and there is something to process.
    clicked = st.button("Create Vector Database")
    if not (clicked and pdf_uploads):
        return

    with st.spinner("Creating vector database..."):
        try:
            finished = create_vector_db(pdf_uploads)
            if finished:
                st.success("Vector database created and uploaded to your Hugging Face Space successfully!")
        except Exception as err:
            st.error(f"An error occurred: {err!s}")
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()