"""Build, persist and load the vector stores used for document retrieval.

A FAISS index is stored locally on disk and reused across runs; a Pinecone
index can alternatively be populated from the same set of documents.
"""
import os

from dotenv import load_dotenv
# langchain libraries
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS, Pinecone
import pinecone
import openai

# Populate os.environ from a local .env file before any key is read.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")


def _split_documents(documents):
    """Split loaded documents into chunks of at most 1000 characters."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        separators=["\n", "\r\n", "\r", " "])
    return text_splitter.split_documents(documents)


def generate_pincone_vector_store(index_name='btc-chat-bot'):
    """Create (if needed) a Pinecone index and fill it with the local documents.

    Args:
        index_name: Name of the Pinecone index to create and populate.

    Returns:
        The LangChain ``Pinecone`` vector store backed by ``index_name``.

    Raises:
        AssertionError: If no local documents are found.
    """
    # Pass the key loaded at import time explicitly.
    pinecone.init(api_key=PINECONE_API_KEY)
    # Create the index actually being populated, and only when it is missing;
    # creating an index that already exists is an error.
    if index_name not in pinecone.list_indexes():
        # NOTE(review): 1536 presumably matches the default OpenAI embedding
        # size - confirm if the embedding model is ever changed.
        pinecone.create_index(index_name, dimension=1536, metric='cosine')
    documents = _split_documents(load_local_documents())
    embeddings = OpenAIEmbeddings()
    # index_name must be passed by keyword to from_documents.
    return Pinecone.from_documents(documents, embeddings, index_name=index_name)


def load_local_vector_store(index_name='hr_faiss_index'):
    """Load a previously saved FAISS index from disk.

    Args:
        index_name: Folder the FAISS index was saved to.

    Returns:
        The loaded FAISS vector store, or ``None`` if loading failed.
    """
    embeddings = OpenAIEmbeddings()
    try:
        vector_store = FAISS.load_local(index_name, embeddings)
    except Exception as e:
        # Deliberately best-effort: any load failure is reported and turned
        # into None so the caller can rebuild the index instead.
        print(e)
        return None
    print("Local VectorDB Found.")
    return vector_store


def load_local_documents():
    """Load every document under ``<cwd>/docs/processed``.

    Returns:
        The non-empty list of loaded documents.

    Raises:
        AssertionError: If the directory yields no documents.
    """
    doc_dir = os.path.join(os.getcwd(), 'docs', 'processed')
    loader = DirectoryLoader(doc_dir)
    documents = loader.load()
    # Raised explicitly (rather than via ``assert``) so the check still runs
    # when Python is started with -O.
    if not documents:
        raise AssertionError(f"No documents found in {doc_dir}")
    return documents


def generate_new_vector_store(index_name='hr_faiss_index'):
    """Build a FAISS index from the local documents and save it to disk.

    Args:
        index_name: Folder to save the new FAISS index to.

    Returns:
        The newly built FAISS vector store.

    Raises:
        AssertionError: If no local documents are found.
    """
    print("No Local VectorDB Found. Generating new one...")
    documents = _split_documents(load_local_documents())
    embeddings = OpenAIEmbeddings()
    vector_store = FAISS.from_documents(documents, embeddings)
    vector_store.save_local(index_name)
    return vector_store


def get_or_create_vector_store(index_name='hr_faiss_index'):
    """Return the saved FAISS index, building and saving it if it cannot be loaded.

    Args:
        index_name: Folder the FAISS index is loaded from / saved to.

    Returns:
        A FAISS vector store.
    """
    vector_store = load_local_vector_store(index_name)
    if vector_store is None:
        vector_store = generate_new_vector_store(index_name)
    return vector_store


if __name__ == "__main__":
    vector = get_or_create_vector_store()
    print(vector)