"""Data-ingestion pipeline: load documents (directory or Notion), enrich their
metadata, split them into chunks, and persist them to a vector store
(FAISS pickle or Chroma), selected by the DB_TYPE constant."""

import os
import pickle

import openai  # imported for its side effects / downstream use; not called directly here
from langchain.document_loaders import (
    CSVLoader,
    DirectoryLoader,
    NotionDBLoader,
    PyPDFLoader,
    TextLoader,
    UnstructuredFileLoader,
    UnstructuredWordDocumentLoader,
)
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import (
    CharacterTextSplitter,
    Language,
    RecursiveCharacterTextSplitter,
)
from langchain.vectorstores import Chroma
from langchain.vectorstores.faiss import FAISS
from langchain.vectorstores.utils import filter_complex_metadata

from Constants import *
from apiKey import *
from db_types import *
from utilities import transform_complex_metadata


def createChromaFromNotiondb(documents, embeddings):
    """Persist Notion-sourced documents into a Chroma collection.

    If the collection already contains vectors, it is left untouched and only
    its count is reported; otherwise the documents are flattened (Chroma cannot
    store list/dict metadata values) and indexed.

    Args:
        documents: Documents loaded from Notion (with possibly complex metadata).
        embeddings: Embedding function used to vectorize the documents.
    """
    vectordb = Chroma(
        persist_directory=NOTION_PERSIST_DIRECTORY,
        embedding_function=embeddings,
        collection_name=NOTION_COLLECTION_NAME,
    )
    print("Checking for existing collection count " + str(vectordb._collection.count()))
    if vectordb._collection.count() == 0:
        print("Transforming notion collection " + NOTION_COLLECTION_NAME)
        # Notion metadata may contain nested structures Chroma can't persist.
        documents = transform_complex_metadata(documents)
        print("Creating notion database")
        vectordb = Chroma.from_documents(
            documents=documents,
            embedding=embeddings,
            persist_directory=NOTION_PERSIST_DIRECTORY,
            collection_name=NOTION_COLLECTION_NAME,
        )
        vectordb.persist()
    print("Count of Notion collections: " + str(vectordb._collection.count()))


def createChromadb(documents, embeddings):
    """Persist documents into the main Chroma collection.

    Skips indexing (and only reports the count) when the persisted collection
    already contains vectors.

    Args:
        documents: Pre-split documents to index.
        embeddings: Embedding function used to vectorize the documents.
    """
    vectordb = Chroma(
        persist_directory=CHROMA_PERSIST_DIRECTORY,
        embedding_function=embeddings,
        collection_name=CHROMA_COLLECTION_NAME,
    )
    if vectordb._collection.count() == 0:
        print("Creating chromadb")
        vectordb = Chroma.from_documents(
            documents=documents,
            embedding=embeddings,
            persist_directory=CHROMA_PERSIST_DIRECTORY,
            collection_name=CHROMA_COLLECTION_NAME,
        )
        vectordb.persist()
    print("Count of collections: " + str(vectordb._collection.count()))


def createFaissVectorstore(documents, embeddings):
    """Build a FAISS vector store from documents and pickle it to disk.

    NOTE(review): FAISS also offers save_local()/load_local(), which is the
    supported persistence path; the pickle format kept here preserves the
    existing on-disk contract ("myvectorstore.pkl").

    Args:
        documents: Pre-split documents to index.
        embeddings: Embedding function used to vectorize the documents.
    """
    print("Creating vectorstore...")
    vectorstore = FAISS.from_documents(documents, embeddings)
    with open("myvectorstore.pkl", "wb") as f:
        pickle.dump(vectorstore, f)


def enrichMetada(docs):
    """Attach author metadata to documents whose source path matches an entry.

    For every document with a non-empty "source", if an entry's "name" appears
    as a substring of the source path, copy that entry's name/profile/
    creationYear/topics into the document's metadata (in place).

    Args:
        docs: Documents whose metadata dicts are mutated in place.
    """
    for doc in docs:
        source = doc.metadata["source"]
        if source == "":
            continue
        for entry in custom_meta_data:
            # Substring match: the curated "name" is expected inside the file path.
            if entry.get("name") in source:
                doc.metadata["name"] = entry.get("name")
                doc.metadata["profile"] = entry.get("profile")
                doc.metadata["creationYear"] = entry.get("creationYear")
                doc.metadata["topics"] = entry.get("topics")


class MyLoader:
    """Extension-dispatching document loader.

    Picks the appropriate langchain loader from the file extension
    (.docx, .pdf, .csv), defaulting to TextLoader for everything else.
    Suitable as the loader_cls of a DirectoryLoader.
    """

    # Maps file suffix -> loader class; TextLoader is the fallback.
    _LOADERS = {
        ".docx": UnstructuredWordDocumentLoader,
        ".pdf": PyPDFLoader,
        ".csv": CSVLoader,
    }

    def __init__(self, file_path, **kwargs):
        for suffix, loader_cls in self._LOADERS.items():
            if file_path.endswith(suffix):
                self.loader = loader_cls(file_path, **kwargs)
                break
        else:
            self.loader = TextLoader(file_path, **kwargs)

    def load(self):
        """Delegate to the underlying loader."""
        return self.loader.load()


# Curated per-author metadata used by enrichMetada().
# (Was previously defined three identical times; the duplicates were dead code.)
custom_meta_data = [
    {
        "name": "Tanmay Chopra",
        "profile": "https://www.linkedin.com/in/tanmayc98/",
        "creationYear": "2023",
        "topics": "Pinecone",
    },
    {
        "name": "Neal Patel",
        "profile": "https://www.linkedin.com/in/nealpatel112/",
        "creationYear": "2023",
        "topics": "Core - Model",
    },
    {
        "name": "Navid",
        "profile": "https://www.linkedin.com/in/Navid",
        "creationYear": "2022",
        "topics": "LLM",
    },
    {
        "name": "Josua Krause",
        "profile": "https://www.linkedin.com/in/Josua",
        "creationYear": "2022",
        "topics": "vector databases",
    },
    {
        "name": "Jay Zhong",
        "profile": "https://www.linkedin.com/in/Jay",
        "creationYear": "2021",
        "topics": "LLM",
    },
    {
        "name": "Evan",
        "profile": "https://www.linkedin.com/in/Evan",
        "creationYear": "2021",
        "topics": "OpenAI",
    },
    {
        "name": "Siva_values",
        "profile": "https://www.linkedin.com/Siva",
        "creationYear": "2023",
        "topics": "Personal goals",
    },
]


def ingestData():
    """Run the full ingestion pipeline selected by DB_TYPE.

    FAISS / CHROMA: load every file under DATA_DIRECTORY via MyLoader, enrich
    metadata, split into 1000-char chunks, and persist to the chosen store.
    NOTION: pull pages from the configured Notion database and persist them
    to a dedicated Chroma collection.
    """
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
    print("Loading data...")
    embeddings = OpenAIEmbeddings()
    if DB_TYPE == DBTypes['FAISS'].value or DB_TYPE == DBTypes['CHROMA'].value:
        loader = DirectoryLoader(DATA_DIRECTORY, glob="**/*.*", loader_cls=MyLoader)
        print("Loading directory")
        docs = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        enrichMetada(docs)
        print("splitting documents")
        documents = text_splitter.split_documents(docs)
        # BUG FIX: previously compared DB_TYPE to the enum member itself
        # (DBTypes['FAISS']) instead of its .value, so the FAISS branch
        # could never match; every other comparison in this file uses .value.
        if DB_TYPE == DBTypes['FAISS'].value:
            createFaissVectorstore(documents, embeddings)
        elif DB_TYPE == DBTypes['CHROMA'].value:
            createChromadb(documents, embeddings)
    elif DB_TYPE == DBTypes['NOTION'].value:
        loader = NotionDBLoader(
            integration_token=NOTION_API_KEY,
            database_id=NOTION_DB,
            request_timeout_sec=30,  # optional, defaults to 10
        )
        documents = loader.load()
        createChromaFromNotiondb(documents, embeddings)


if __name__ == "__main__":
    ingestData()