# spur-chatbot / ingest_data.py
# (uploaded via huggingface_hub, revision b0d4092 — snehasquasher)
import os
import openai
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders import CSVLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
from langchain.vectorstores import Chroma
from langchain.document_loaders import NotionDBLoader
from langchain.vectorstores.utils import filter_complex_metadata
import pickle
from Constants import *
from apiKey import *
from db_types import *
from utilities import transform_complex_metadata
def createChromaFromNotiondb(documents, embeddings):
    """Create (or reuse) the persisted Chroma collection for Notion documents.

    If the collection at NOTION_PERSIST_DIRECTORY is empty, the given
    documents are flattened via transform_complex_metadata, embedded, and
    persisted; otherwise the existing collection is left untouched.

    Args:
        documents: langchain Document objects loaded from Notion.
        embeddings: embedding function handed to Chroma.
    """
    vectordb = Chroma(
        persist_directory=NOTION_PERSIST_DIRECTORY,
        embedding_function=embeddings,
        collection_name=NOTION_COLLECTION_NAME,
    )
    print("Checking for existing collection count " + str(vectordb._collection.count()))
    if vectordb._collection.count() == 0:
        print("Transforming notion collection " + NOTION_COLLECTION_NAME)
        # Chroma rejects nested metadata values; flatten them first.
        documents = transform_complex_metadata(documents)
        print("Creating notion database")
        vectordb = Chroma.from_documents(
            documents=documents,
            embedding=embeddings,
            persist_directory=NOTION_PERSIST_DIRECTORY,
            collection_name=NOTION_COLLECTION_NAME,
        )
        vectordb.persist()
    # Single count report replaces the identical print duplicated in both
    # branches of the original.
    print("Count of Notion collections: " + str(vectordb._collection.count()))
def createChromadb(documents, embeddings):
    """Create (or reuse) the persisted Chroma collection for local documents.

    Builds the collection at CHROMA_PERSIST_DIRECTORY only when it is empty;
    an already-populated collection is reused as-is.

    Args:
        documents: pre-split langchain Document objects.
        embeddings: embedding function handed to Chroma.
    """
    vectordb = Chroma(
        persist_directory=CHROMA_PERSIST_DIRECTORY,
        embedding_function=embeddings,
        collection_name=CHROMA_COLLECTION_NAME,
    )
    if vectordb._collection.count() == 0:
        print("Creating chromadb")
        vectordb = Chroma.from_documents(
            documents=documents,
            embedding=embeddings,
            persist_directory=CHROMA_PERSIST_DIRECTORY,
            collection_name=CHROMA_COLLECTION_NAME,
        )
        vectordb.persist()
    # Single count report replaces the identical print duplicated in both
    # branches of the original.
    print("Count of collections: " + str(vectordb._collection.count()))
def createFaissVectorstore(documents, embeddings):
    """Embed the documents into a FAISS index and pickle it to disk.

    The resulting vector store is written to "myvectorstore.pkl" in the
    current working directory.
    """
    print("Creating vectorstore...")
    store = FAISS.from_documents(documents, embeddings)
    with open("myvectorstore.pkl", "wb") as out_file:
        pickle.dump(store, out_file)
def enrichMetada(docs, meta_entries=None):
    """Attach curated author metadata to documents in place.

    For each document whose metadata "source" path contains a known person's
    name, copy that person's name/profile/creationYear/topics into the
    document's metadata.

    Args:
        docs: iterable of objects with a ``metadata`` dict (loaders set a
            "source" key with the originating file path).
        meta_entries: optional list of metadata dicts to match against;
            defaults to the module-level ``custom_meta_data`` table
            (backward-compatible with the original zero-extra-arg call).
    """
    if meta_entries is None:
        meta_entries = custom_meta_data
    for doc in docs:
        # .get avoids a KeyError when a loader produced no "source" entry
        # (the original indexed doc.metadata["source"] unconditionally).
        source = doc.metadata.get("source", "")
        if not source:
            continue
        for entry in meta_entries:
            name = entry.get("name")
            # Guard against a missing name: `None in source` would raise.
            if name and name in source:
                doc.metadata["name"] = name
                doc.metadata["profile"] = entry.get("profile")
                doc.metadata["creationYear"] = entry.get("creationYear")
                doc.metadata["topics"] = entry.get("topics")
class MyLoader:
    """Pick a langchain document loader based on the file's extension.

    .docx, .pdf and .csv get their dedicated loaders; everything else
    falls back to TextLoader.
    """

    def __init__(self, file_path, **kwargs):
        # Try each known suffix in turn; keep endswith so matching stays
        # case-sensitive, exactly like the original if/elif chain.
        for suffix, loader_cls in (
            (".docx", UnstructuredWordDocumentLoader),
            (".pdf", PyPDFLoader),
            (".csv", CSVLoader),
        ):
            if file_path.endswith(suffix):
                self.loader = loader_cls(file_path, **kwargs)
                break
        else:
            self.loader = TextLoader(file_path, **kwargs)

    def load(self):
        """Delegate to the chosen loader and return its documents."""
        return self.loader.load()
# Curated author metadata matched against document source paths by
# enrichMetada. NOTE: the original file defined this identical list three
# times in a row; only one definition is kept.
custom_meta_data = [
    {
        "name": "Tanmay Chopra",
        "profile": "https://www.linkedin.com/in/tanmayc98/",
        "creationYear": "2023",
        "topics": "Pinecone",
    },
    {
        "name": "Neal Patel",
        "profile": "https://www.linkedin.com/in/nealpatel112/",
        "creationYear": "2023",
        "topics": "Core - Model",
    },
    {
        "name": "Navid",
        "profile": "https://www.linkedin.com/in/Navid",
        "creationYear": "2022",
        "topics": "LLM",
    },
    {
        "name": "Josua Krause",
        "profile": "https://www.linkedin.com/in/Josua",
        "creationYear": "2022",
        "topics": "vector databases",
    },
    {
        "name": "Jay Zhong",
        "profile": "https://www.linkedin.com/in/Jay",
        "creationYear": "2021",
        "topics": "LLM",
    },
    {
        "name": "Evan",
        "profile": "https://www.linkedin.com/in/Evan",
        "creationYear": "2021",
        "topics": "OpenAI",
    },
    {
        "name": "Siva_values",
        "profile": "https://www.linkedin.com/Siva",
        "creationYear": "2023",
        "topics": "Personal goals",
    },
]
def ingestData():
    """Load source documents, split them, and build the configured vector store.

    Reads DB_TYPE (from Constants) to decide between:
      - FAISS / CHROMA: load files from DATA_DIRECTORY via MyLoader, enrich
        metadata, split into 1000-char chunks, then build the chosen store.
      - NOTION: pull pages from a Notion database and build a Chroma store.
    Side effect: sets OPENAI_API_KEY in the process environment.
    """
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
    print("Loading data...")
    embeddings = OpenAIEmbeddings()
    if DB_TYPE == DBTypes['FAISS'].value or DB_TYPE == DBTypes['CHROMA'].value:
        loader = DirectoryLoader(DATA_DIRECTORY, glob="**/*.*", loader_cls=MyLoader)
        print("Loading directory")
        docs = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        enrichMetada(docs)
        print("splitting documents")
        documents = text_splitter.split_documents(docs)
        # BUG FIX: the original compared DB_TYPE against DBTypes['FAISS']
        # (the member itself) here, while every other comparison uses
        # .value — so the FAISS branch could never match the value that
        # satisfied the outer check above.
        if DB_TYPE == DBTypes['FAISS'].value:
            createFaissVectorstore(documents, embeddings)
        elif DB_TYPE == DBTypes['CHROMA'].value:
            createChromadb(documents, embeddings)
    elif DB_TYPE == DBTypes['NOTION'].value:
        loader = NotionDBLoader(
            integration_token=NOTION_API_KEY,
            database_id=NOTION_DB,
            request_timeout_sec=30,  # optional, defaults to 10
        )
        documents = loader.load()
        createChromaFromNotiondb(documents, embeddings)


#ingestData()