# NOTE(review): the original file began with non-Python text ("Spaces:" /
# "Sleeping" x2) — presumably a hosting-status banner captured by copy/paste;
# converted to a comment so the module parses. Confirm nothing else was lost.
import os
import pickle

import openai
from langchain.document_loaders import (
    CSVLoader,
    DirectoryLoader,
    NotionDBLoader,
    PyPDFLoader,
    TextLoader,
    UnstructuredFileLoader,
    UnstructuredWordDocumentLoader,
)
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import (
    CharacterTextSplitter,
    Language,
    RecursiveCharacterTextSplitter,
)
from langchain.vectorstores import Chroma
from langchain.vectorstores.faiss import FAISS
from langchain.vectorstores.utils import filter_complex_metadata

from Constants import *
from apiKey import *
from db_types import *
from utilities import transform_complex_metadata
def createChromaFromNotiondb(documents, embeddings):
    """Create (or reuse) the persisted Chroma collection built from Notion docs.

    If the collection at NOTION_PERSIST_DIRECTORY is empty, the documents are
    flattened (Chroma only accepts simple metadata values), embedded, and
    persisted; otherwise the existing collection is reused untouched.

    Args:
        documents: langchain Document objects loaded from Notion.
        embeddings: embedding function used to vectorize the documents.

    Returns:
        The Chroma vector store (new in this version; previously returned None).
    """
    vectordb = Chroma(
        persist_directory=NOTION_PERSIST_DIRECTORY,
        embedding_function=embeddings,
        collection_name=NOTION_COLLECTION_NAME,
    )
    print("Checking for existing collection count " + str(vectordb._collection.count()))
    if vectordb._collection.count() == 0:
        print("Transforming notion collection " + NOTION_COLLECTION_NAME)
        # Chroma rejects complex metadata values; flatten them first.
        documents = transform_complex_metadata(documents)
        print("Creating notion database")
        vectordb = Chroma.from_documents(
            documents=documents,
            embedding=embeddings,
            persist_directory=NOTION_PERSIST_DIRECTORY,
            collection_name=NOTION_COLLECTION_NAME,
        )
        vectordb.persist()
    # The original printed this line separately in both branches; deduplicated.
    print("Count of Notion collections: " + str(vectordb._collection.count()))
    return vectordb
def createChromadb(documents, embeddings):
    """Create (or reuse) the persisted Chroma collection for directory documents.

    If the collection at CHROMA_PERSIST_DIRECTORY is empty, the documents are
    embedded and persisted; otherwise the existing collection is reused.

    Args:
        documents: langchain Document chunks to index.
        embeddings: embedding function used to vectorize the documents.

    Returns:
        The Chroma vector store (new in this version; previously returned None).
    """
    vectordb = Chroma(
        persist_directory=CHROMA_PERSIST_DIRECTORY,
        embedding_function=embeddings,
        collection_name=CHROMA_COLLECTION_NAME,
    )
    if vectordb._collection.count() == 0:
        print("Creating chromadb")
        vectordb = Chroma.from_documents(
            documents=documents,
            embedding=embeddings,
            persist_directory=CHROMA_PERSIST_DIRECTORY,
            collection_name=CHROMA_COLLECTION_NAME,
        )
        vectordb.persist()
    # The original printed this line separately in both branches; deduplicated.
    print("Count of collections: " + str(vectordb._collection.count()))
    return vectordb
def createFaissVectorstore(documents, embeddings, pickle_path="myvectorstore.pkl"):
    """Build a FAISS vector store from documents and pickle it to disk.

    Args:
        documents: langchain Document chunks to index.
        embeddings: embedding function used to vectorize the documents.
        pickle_path: output file for the pickled store. Defaults to the
            previously hard-coded "myvectorstore.pkl", so existing callers
            are unaffected.

    Returns:
        The FAISS vector store (new in this version; previously returned None).
    """
    print("Creating vectorstore...")
    vectorstore = FAISS.from_documents(documents, embeddings)
    with open(pickle_path, "wb") as f:
        pickle.dump(vectorstore, f)
    return vectorstore
def enrichMetada(docs):
    """Tag each document with author metadata matched against its source path.

    For every document whose metadata["source"] contains an author's "name"
    from the module-level custom_meta_data list, copy that author's name,
    profile URL, creation year, and topics into the document's metadata.
    Later matches overwrite earlier ones, as in the original implementation.

    Args:
        docs: iterable of langchain Document objects; mutated in place.
    """
    for doc in docs:
        # Original indexed metadata["source"] directly, which raised KeyError
        # for documents whose loader set no "source"; .get makes that a no-op.
        source = doc.metadata.get("source", "")
        if not source:
            continue
        for meta in custom_meta_data:
            if meta.get("name") in source:
                doc.metadata["name"] = meta.get("name")
                doc.metadata["profile"] = meta.get("profile")
                doc.metadata["creationYear"] = meta.get("creationYear")
                doc.metadata["topics"] = meta.get("topics")
class MyLoader:
    """Suffix-dispatching wrapper that picks a concrete langchain loader.

    .docx, .pdf, and .csv get their dedicated loaders; every other file
    falls back to TextLoader. Extra keyword arguments are forwarded to the
    chosen loader's constructor.
    """

    def __init__(self, file_path, **kwargs):
        # Ordered (suffix, loader class) dispatch table; first match wins.
        dispatch = (
            (".docx", UnstructuredWordDocumentLoader),
            (".pdf", PyPDFLoader),
            (".csv", CSVLoader),
        )
        chosen = TextLoader
        for suffix, loader_cls in dispatch:
            if file_path.endswith(suffix):
                chosen = loader_cls
                break
        self.loader = chosen(file_path, **kwargs)

    def load(self):
        """Delegate loading to the wrapped concrete loader."""
        return self.loader.load()
# Hand-curated author records used by enrichMetada(): a document whose
# metadata["source"] path contains a "name" below inherits that record's
# profile URL, creation year, and topics.
# NOTE(review): this list was assigned three times verbatim in the original
# file; this is the canonical copy.
custom_meta_data = [
    {
        "name": "Tanmay Chopra",
        "profile": "https://www.linkedin.com/in/tanmayc98/",
        "creationYear": "2023",
        "topics": "Pinecone",
    },
    {
        "name": "Neal Patel",
        "profile": "https://www.linkedin.com/in/nealpatel112/",
        "creationYear": "2023",
        "topics": "Core - Model",
    },
    {
        "name": "Navid",
        "profile": "https://www.linkedin.com/in/Navid",
        "creationYear": "2022",
        "topics": "LLM",
    },
    {
        "name": "Josua Krause",
        "profile": "https://www.linkedin.com/in/Josua",
        "creationYear": "2022",
        "topics": "vector databases",
    },
    {
        "name": "Jay Zhong",
        "profile": "https://www.linkedin.com/in/Jay",
        "creationYear": "2021",
        "topics": "LLM",
    },
    {
        "name": "Evan",
        "profile": "https://www.linkedin.com/in/Evan",
        "creationYear": "2021",
        "topics": "OpenAI",
    },
    {
        "name": "Siva_values",
        "profile": "https://www.linkedin.com/Siva",
        "creationYear": "2023",
        "topics": "Personal goals",
    },
]
# NOTE(review): removed an exact duplicate reassignment of custom_meta_data
# (identical contents); the definition earlier in this file remains in effect.
# NOTE(review): removed a second exact duplicate reassignment of
# custom_meta_data (identical contents); the earlier definition remains in effect.
def ingestData():
    """Load documents from the configured source and build the configured store.

    Dispatches on the module-level DB_TYPE: FAISS/CHROMA load files from
    DATA_DIRECTORY, enrich their metadata, chunk them, and index them;
    NOTION pulls pages from a Notion database into a Chroma collection.

    Side effects: sets OPENAI_API_KEY in the process environment and writes
    the vector store to disk (pickle for FAISS, persist directory for Chroma).
    """
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
    print("Loading data...")
    embeddings = OpenAIEmbeddings()
    if DB_TYPE in (DBTypes['FAISS'].value, DBTypes['CHROMA'].value):
        loader = DirectoryLoader(DATA_DIRECTORY, glob="**/*.*", loader_cls=MyLoader)
        print("Loading directory")
        docs = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        enrichMetada(docs)
        print("splitting documents")
        documents = text_splitter.split_documents(docs)
        # BUG FIX: the original compared DB_TYPE to the enum member
        # DBTypes['FAISS'] (no .value), unlike every other comparison in this
        # function, so the FAISS branch was unreachable and nothing was built.
        if DB_TYPE == DBTypes['FAISS'].value:
            createFaissVectorstore(documents, embeddings)
        elif DB_TYPE == DBTypes['CHROMA'].value:
            createChromadb(documents, embeddings)
    elif DB_TYPE == DBTypes['NOTION'].value:
        loader = NotionDBLoader(
            integration_token=NOTION_API_KEY,
            database_id=NOTION_DB,
            request_timeout_sec=30,  # optional, defaults to 10
        )
        documents = loader.load()
        createChromaFromNotiondb(documents, embeddings)
| #ingestData() | |