# Streamlit app for a Hugging Face Space ("Spaces" status header removed):
# converts uploaded PDFs into a FAISS vector database and pushes it to the Space.
import os
import tempfile

import streamlit as st
from huggingface_hub import HfApi, HfFolder
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Local directory where the FAISS index is persisted between runs.
DB_FAISS_PATH = 'vectorstore/db_faiss'
# Hugging Face Space that receives the uploaded index.
SPACE_REPO = "GovindRaj/ebiz-chatbot"  # Your Hugging Face Space ID
# Function to create or update the FAISS vector DB and upload it to a Hugging Face Space
def create_vector_db(uploaded_files):
    """Build or extend the local FAISS vector store from uploaded PDFs,
    then push it to the configured Hugging Face Space.

    Args:
        uploaded_files: Streamlit ``UploadedFile`` objects; entries whose
            name does not end in ``.pdf`` are ignored.

    Returns:
        bool: True on success.

    Raises:
        ValueError: If no PDF content was found in the upload, or the
            ``HF_TOKEN`` secret is not set in the environment.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        # Persist each uploaded PDF to disk (PyPDFLoader needs a file path)
        # and load it immediately — avoids the original's second pass over
        # the directory with os.listdir.
        documents = []
        for file in uploaded_files:
            if not file.name.endswith('.pdf'):
                continue
            temp_path = os.path.join(temp_dir, file.name)
            with open(temp_path, "wb") as f:
                f.write(file.getvalue())
            documents.extend(PyPDFLoader(temp_path).load())

        # Guard: FAISS.from_documents on an empty list fails with an opaque
        # error deep inside the library, so fail early with a clear message.
        if not documents:
            raise ValueError("No PDF documents were found in the upload.")

        # Split documents into overlapping chunks for retrieval.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50
        )
        texts = text_splitter.split_documents(documents)

        # Create embeddings (CPU device — no GPU assumed on the Space).
        embeddings = HuggingFaceEmbeddings(
            model_name='sentence-transformers/all-MiniLM-L6-v2',
            model_kwargs={'device': 'cpu'}
        )

        # Extend the existing index if one is on disk, otherwise build anew.
        if os.path.exists(DB_FAISS_PATH):
            # allow_dangerous_deserialization: the index was written by this
            # same app, so unpickling it is trusted input.
            db = FAISS.load_local(
                DB_FAISS_PATH, embeddings,
                allow_dangerous_deserialization=True
            )
            db.add_documents(texts)
        else:
            db = FAISS.from_documents(texts, embeddings)

        # Save the updated FAISS database locally.
        db.save_local(DB_FAISS_PATH)

    # Retrieve the token from environment variables (Hugging Face Secrets).
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError("Hugging Face token not found. Please set the token in Hugging Face secrets.")

    # Push the updated vector database to the Space. The token is passed
    # directly to upload_folder; the deprecated HfFolder.save_token()
    # side effect the original performed is redundant and was dropped.
    api = HfApi()
    api.upload_folder(
        folder_path=DB_FAISS_PATH,   # local path to the FAISS folder
        path_in_repo="faiss_data",   # path inside the Space repo
        repo_id=SPACE_REPO,          # Hugging Face Space ID
        repo_type="space",           # this repo is a Space, not a model/dataset
        token=hf_token               # token from secrets
    )
    return True
# Streamlit app
def main():
    """Render the uploader UI and trigger vector-DB creation on demand."""
    st.title("PDF to Vector Database Converter")

    uploaded_files = st.file_uploader(
        "Upload PDF files",
        type=['pdf'],
        accept_multiple_files=True
    )

    if st.button("Create Vector Database"):
        if not uploaded_files:
            # Fix: the original `button and uploaded_files` condition made a
            # click with no files do nothing silently; tell the user instead.
            st.warning("Please upload at least one PDF file first.")
            return
        with st.spinner("Creating vector database..."):
            try:
                success = create_vector_db(uploaded_files)
                if success:
                    st.success("Vector database created and uploaded to your Hugging Face Space successfully!")
            except Exception as e:
                # Surface any failure in the UI rather than crashing the app.
                st.error(f"An error occurred: {str(e)}")
# Script entry point: run the Streamlit app when executed directly.
if __name__ == "__main__":
    main()