# spur-chatbot / ingest_data.py
# (uploaded via huggingface_hub, revision b0d4092 — snehasquasher)
import os
import openai
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders import CSVLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
from langchain.vectorstores import Chroma
from langchain.document_loaders import NotionDBLoader
from langchain.vectorstores.utils import filter_complex_metadata
import pickle
from Constants import *
from apiKey import *
from db_types import *
from utilities import transform_complex_metadata
def createChromaFromNotiondb(documents, embeddings):
    """Create (or reuse) the persisted Chroma collection for Notion documents.

    If the collection at NOTION_PERSIST_DIRECTORY is empty, the given
    documents are flattened via transform_complex_metadata, embedded, and
    persisted; otherwise the existing collection is left untouched.

    Args:
        documents: langchain Document objects loaded from Notion.
        embeddings: embedding function handed to Chroma.
    """
    vectordb = Chroma(
        persist_directory=NOTION_PERSIST_DIRECTORY,
        embedding_function=embeddings,
        collection_name=NOTION_COLLECTION_NAME,
    )
    print("Checking for existing collection count " + str(vectordb._collection.count()))
    if vectordb._collection.count() == 0:
        print("Transforming notion collection " + NOTION_COLLECTION_NAME)
        # Chroma rejects nested metadata values; flatten them first.
        documents = transform_complex_metadata(documents)
        print("Creating notion database")
        vectordb = Chroma.from_documents(
            documents=documents,
            embedding=embeddings,
            persist_directory=NOTION_PERSIST_DIRECTORY,
            collection_name=NOTION_COLLECTION_NAME,
        )
        vectordb.persist()
    # Single count report replaces the identical print duplicated in both
    # branches of the original.
    print("Count of Notion collections: " + str(vectordb._collection.count()))
def createChromadb(documents, embeddings):
    """Create (or reuse) the persisted Chroma collection for local documents.

    Builds the collection at CHROMA_PERSIST_DIRECTORY only when it is empty;
    an already-populated collection is reused as-is.

    Args:
        documents: pre-split langchain Document objects.
        embeddings: embedding function handed to Chroma.
    """
    vectordb = Chroma(
        persist_directory=CHROMA_PERSIST_DIRECTORY,
        embedding_function=embeddings,
        collection_name=CHROMA_COLLECTION_NAME,
    )
    if vectordb._collection.count() == 0:
        print("Creating chromadb")
        vectordb = Chroma.from_documents(
            documents=documents,
            embedding=embeddings,
            persist_directory=CHROMA_PERSIST_DIRECTORY,
            collection_name=CHROMA_COLLECTION_NAME,
        )
        vectordb.persist()
    # Single count report replaces the identical print duplicated in both
    # branches of the original.
    print("Count of collections: " + str(vectordb._collection.count()))
def createFaissVectorstore(documents, embeddings):
    """Embed the documents into a FAISS index and pickle it to disk.

    The resulting vector store is written to "myvectorstore.pkl" in the
    current working directory.
    """
    print("Creating vectorstore...")
    store = FAISS.from_documents(documents, embeddings)
    with open("myvectorstore.pkl", "wb") as out_file:
        pickle.dump(store, out_file)
def enrichMetada(docs, meta_entries=None):
    """Attach curated author metadata to documents in place.

    For each document whose metadata "source" path contains a known person's
    name, copy that person's name/profile/creationYear/topics into the
    document's metadata.

    Args:
        docs: iterable of objects with a ``metadata`` dict (loaders set a
            "source" key with the originating file path).
        meta_entries: optional list of metadata dicts to match against;
            defaults to the module-level ``custom_meta_data`` table
            (backward-compatible with the original zero-extra-arg call).
    """
    if meta_entries is None:
        meta_entries = custom_meta_data
    for doc in docs:
        # .get avoids a KeyError when a loader produced no "source" entry
        # (the original indexed doc.metadata["source"] unconditionally).
        source = doc.metadata.get("source", "")
        if not source:
            continue
        for entry in meta_entries:
            name = entry.get("name")
            # Guard against a missing name: `None in source` would raise.
            if name and name in source:
                doc.metadata["name"] = name
                doc.metadata["profile"] = entry.get("profile")
                doc.metadata["creationYear"] = entry.get("creationYear")
                doc.metadata["topics"] = entry.get("topics")
class MyLoader:
    """Pick a langchain document loader based on the file's extension.

    .docx, .pdf and .csv get their dedicated loaders; everything else
    falls back to TextLoader.
    """

    def __init__(self, file_path, **kwargs):
        # Try each known suffix in turn; keep endswith so matching stays
        # case-sensitive, exactly like the original if/elif chain.
        for suffix, loader_cls in (
            (".docx", UnstructuredWordDocumentLoader),
            (".pdf", PyPDFLoader),
            (".csv", CSVLoader),
        ):
            if file_path.endswith(suffix):
                self.loader = loader_cls(file_path, **kwargs)
                break
        else:
            self.loader = TextLoader(file_path, **kwargs)

    def load(self):
        """Delegate to the chosen loader and return its documents."""
        return self.loader.load()
# Curated author metadata matched against document source paths by
# enrichMetada. NOTE: the original file defined this identical list three
# times in a row; only one definition is kept.
custom_meta_data = [
    {
        "name": "Tanmay Chopra",
        "profile": "https://www.linkedin.com/in/tanmayc98/",
        "creationYear": "2023",
        "topics": "Pinecone",
    },
    {
        "name": "Neal Patel",
        "profile": "https://www.linkedin.com/in/nealpatel112/",
        "creationYear": "2023",
        "topics": "Core - Model",
    },
    {
        "name": "Navid",
        "profile": "https://www.linkedin.com/in/Navid",
        "creationYear": "2022",
        "topics": "LLM",
    },
    {
        "name": "Josua Krause",
        "profile": "https://www.linkedin.com/in/Josua",
        "creationYear": "2022",
        "topics": "vector databases",
    },
    {
        "name": "Jay Zhong",
        "profile": "https://www.linkedin.com/in/Jay",
        "creationYear": "2021",
        "topics": "LLM",
    },
    {
        "name": "Evan",
        "profile": "https://www.linkedin.com/in/Evan",
        "creationYear": "2021",
        "topics": "OpenAI",
    },
    {
        "name": "Siva_values",
        "profile": "https://www.linkedin.com/Siva",
        "creationYear": "2023",
        "topics": "Personal goals",
    },
]
def ingestData():
    """Load source documents, split them, and build the configured vector store.

    Reads DB_TYPE (from Constants) to decide between:
      - FAISS / CHROMA: load files from DATA_DIRECTORY via MyLoader, enrich
        metadata, split into 1000-char chunks, then build the chosen store.
      - NOTION: pull pages from a Notion database and build a Chroma store.
    Side effect: sets OPENAI_API_KEY in the process environment.
    """
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
    print("Loading data...")
    embeddings = OpenAIEmbeddings()
    if DB_TYPE == DBTypes['FAISS'].value or DB_TYPE == DBTypes['CHROMA'].value:
        loader = DirectoryLoader(DATA_DIRECTORY, glob="**/*.*", loader_cls=MyLoader)
        print("Loading directory")
        docs = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        enrichMetada(docs)
        print("splitting documents")
        documents = text_splitter.split_documents(docs)
        # BUG FIX: the original compared DB_TYPE against DBTypes['FAISS']
        # (the member itself) here, while every other comparison uses
        # .value — so the FAISS branch could never match the value that
        # satisfied the outer check above.
        if DB_TYPE == DBTypes['FAISS'].value:
            createFaissVectorstore(documents, embeddings)
        elif DB_TYPE == DBTypes['CHROMA'].value:
            createChromadb(documents, embeddings)
    elif DB_TYPE == DBTypes['NOTION'].value:
        loader = NotionDBLoader(
            integration_token=NOTION_API_KEY,
            database_id=NOTION_DB,
            request_timeout_sec=30,  # optional, defaults to 10
        )
        documents = loader.load()
        createChromaFromNotiondb(documents, embeddings)


#ingestData()