# Streamlit app for a Hugging Face Space ("Spaces" status header removed):
# converts uploaded PDFs into a FAISS vector database and pushes it to the Space.
import os
import tempfile

import streamlit as st
from huggingface_hub import HfApi, HfFolder
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Local directory where the FAISS index is persisted between runs.
DB_FAISS_PATH = 'vectorstore/db_faiss'
# Hugging Face Space that receives the uploaded index.
SPACE_REPO = "GovindRaj/ebiz-chatbot"  # Your Hugging Face Space ID
# Function to create or update the FAISS vector DB and upload it to a Hugging Face Space
def create_vector_db(uploaded_files):
    """Build or extend the local FAISS vector store from uploaded PDFs,
    then push it to the configured Hugging Face Space.

    Args:
        uploaded_files: Streamlit ``UploadedFile`` objects; entries whose
            name does not end in ``.pdf`` are ignored.

    Returns:
        bool: True on success.

    Raises:
        ValueError: If no PDF content was found in the upload, or the
            ``HF_TOKEN`` secret is not set in the environment.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        # Persist each uploaded PDF to disk (PyPDFLoader needs a file path)
        # and load it immediately — avoids the original's second pass over
        # the directory with os.listdir.
        documents = []
        for file in uploaded_files:
            if not file.name.endswith('.pdf'):
                continue
            temp_path = os.path.join(temp_dir, file.name)
            with open(temp_path, "wb") as f:
                f.write(file.getvalue())
            documents.extend(PyPDFLoader(temp_path).load())

        # Guard: FAISS.from_documents on an empty list fails with an opaque
        # error deep inside the library, so fail early with a clear message.
        if not documents:
            raise ValueError("No PDF documents were found in the upload.")

        # Split documents into overlapping chunks for retrieval.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50
        )
        texts = text_splitter.split_documents(documents)

        # Create embeddings (CPU device — no GPU assumed on the Space).
        embeddings = HuggingFaceEmbeddings(
            model_name='sentence-transformers/all-MiniLM-L6-v2',
            model_kwargs={'device': 'cpu'}
        )

        # Extend the existing index if one is on disk, otherwise build anew.
        if os.path.exists(DB_FAISS_PATH):
            # allow_dangerous_deserialization: the index was written by this
            # same app, so unpickling it is trusted input.
            db = FAISS.load_local(
                DB_FAISS_PATH, embeddings,
                allow_dangerous_deserialization=True
            )
            db.add_documents(texts)
        else:
            db = FAISS.from_documents(texts, embeddings)

        # Save the updated FAISS database locally.
        db.save_local(DB_FAISS_PATH)

    # Retrieve the token from environment variables (Hugging Face Secrets).
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError("Hugging Face token not found. Please set the token in Hugging Face secrets.")

    # Push the updated vector database to the Space. The token is passed
    # directly to upload_folder; the deprecated HfFolder.save_token()
    # side effect the original performed is redundant and was dropped.
    api = HfApi()
    api.upload_folder(
        folder_path=DB_FAISS_PATH,   # local path to the FAISS folder
        path_in_repo="faiss_data",   # path inside the Space repo
        repo_id=SPACE_REPO,          # Hugging Face Space ID
        repo_type="space",           # this repo is a Space, not a model/dataset
        token=hf_token               # token from secrets
    )
    return True
# Streamlit app
def main():
    """Render the uploader UI and trigger vector-DB creation on demand."""
    st.title("PDF to Vector Database Converter")

    uploaded_files = st.file_uploader(
        "Upload PDF files",
        type=['pdf'],
        accept_multiple_files=True
    )

    if st.button("Create Vector Database"):
        if not uploaded_files:
            # Fix: the original `button and uploaded_files` condition made a
            # click with no files do nothing silently; tell the user instead.
            st.warning("Please upload at least one PDF file first.")
            return
        with st.spinner("Creating vector database..."):
            try:
                success = create_vector_db(uploaded_files)
                if success:
                    st.success("Vector database created and uploaded to your Hugging Face Space successfully!")
            except Exception as e:
                # Surface any failure in the UI rather than crashing the app.
                st.error(f"An error occurred: {str(e)}")
# Script entry point: run the Streamlit app when executed directly.
if __name__ == "__main__":
    main()