"""Streamlit app that turns uploaded PDFs into a FAISS vector store.

The vector store is kept locally under ``DB_FAISS_PATH`` and pushed to the
Hugging Face Space named by ``SPACE_REPO`` after every update.
"""

import os
import tempfile

import streamlit as st
from huggingface_hub import HfApi, HfFolder
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

DB_FAISS_PATH = 'vectorstore/db_faiss'
SPACE_REPO = "GovindRaj/ebiz-chatbot"  # Your Hugging Face Space ID


def _load_uploaded_pdfs(uploaded_files):
    """Write the uploaded PDFs to a temporary directory and load their pages.

    Args:
        uploaded_files: Iterable of uploaded file objects exposing ``name``
            and ``getvalue()`` (as returned by ``st.file_uploader``).

    Returns:
        A list with the documents produced by ``PyPDFLoader`` for every
        uploaded file whose name ends in ``.pdf`` (case-insensitive).
    """
    documents = []
    with tempfile.TemporaryDirectory() as temp_dir:
        for index, file in enumerate(uploaded_files):
            # Keep only the final path component so an upload name can never
            # point outside the temporary directory.
            safe_name = os.path.basename(file.name)
            # Case-insensitive check so "REPORT.PDF" is not silently skipped.
            if not safe_name.lower().endswith('.pdf'):
                continue

            temp_path = os.path.join(temp_dir, safe_name)
            if os.path.exists(temp_path):
                # Two uploads share a name: disambiguate instead of
                # overwriting the first one.
                temp_path = os.path.join(temp_dir, f"{index}_{safe_name}")

            with open(temp_path, "wb") as f:
                f.write(file.getvalue())

            # Load while the temporary file still exists.
            documents.extend(PyPDFLoader(temp_path).load())
    return documents


# Function to create or update FAISS vector DB and upload to Hugging Face Space
def create_vector_db(uploaded_files) -> bool:
    """Create or update the FAISS vector DB and upload it to the Space.

    Args:
        uploaded_files: Uploaded file objects exposing ``name`` and
            ``getvalue()``; files without a ``.pdf`` suffix are ignored.

    Returns:
        ``True`` once the database has been saved locally and uploaded.

    Raises:
        ValueError: If the ``HF_TOKEN`` environment variable is not set, or
            if no text chunks could be produced from the uploaded files.
    """
    # Retrieve the token from environment variables (Hugging Face Secrets).
    # Checked first so a missing token does not leave the local index updated
    # but never uploaded (a retry would then index the same chunks twice).
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError("Hugging Face token not found. Please set the token in Hugging Face secrets.")

    documents = _load_uploaded_pdfs(uploaded_files)

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50
    )
    texts = text_splitter.split_documents(documents)
    if not texts:
        # Fail with a clear message rather than handing FAISS an empty list.
        raise ValueError("No extractable text was found in the uploaded PDF files.")

    # Create embeddings
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'}
    )

    # Check if FAISS vectorstore already exists
    if os.path.exists(DB_FAISS_PATH):
        # SECURITY NOTE: allow_dangerous_deserialization=True unpickles the
        # stored index metadata. Only load a DB_FAISS_PATH that this
        # application wrote itself, never one from an untrusted source.
        db = FAISS.load_local(DB_FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
        # Add new documents to the existing database
        db.add_documents(texts)
    else:
        # Create a new FAISS database if none exists
        db = FAISS.from_documents(texts, embeddings)

    # Save the updated FAISS database locally
    db.save_local(DB_FAISS_PATH)

    # Push the updated vector database to Hugging Face Space.
    # NOTE(review): save_token persists the token to the local Hugging Face
    # config; it looks redundant because the token is also passed explicitly
    # below - confirm before removing.
    HfFolder.save_token(hf_token)
    api = HfApi()
    api.upload_folder(
        folder_path=DB_FAISS_PATH,    # Local path to the FAISS folder
        path_in_repo="faiss_data",    # Path in the Space repo
        repo_id=SPACE_REPO,           # Hugging Face Space ID
        repo_type="space",            # Specify that this is a Space
        token=hf_token                # Use the token from secrets
    )

    return True


# Streamlit app
def main() -> None:
    """Render the upload form and build the vector database on request."""
    st.title("PDF to Vector Database Converter")

    uploaded_files = st.file_uploader(
        "Upload PDF files",
        type=['pdf'],
        accept_multiple_files=True
    )

    if not st.button("Create Vector Database"):
        return

    if not uploaded_files:
        # Give feedback instead of silently ignoring the click.
        st.warning("Please upload at least one PDF file first.")
        return

    with st.spinner("Creating vector database..."):
        # Top-level UI boundary: surface any failure to the user instead of
        # crashing the app.
        try:
            success = create_vector_db(uploaded_files)
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")
        else:
            if success:
                st.success("Vector database created and uploaded to your Hugging Face Space successfully!")


if __name__ == "__main__":
    main()