Spaces:

nirmanpatel
/

semantic-book-recommender

Sleeping

App Files Files Community

semantic-book-recommender / vector_search.py

nirmanpatel

Upload 4 files

226e11e verified 7 months ago

raw

history blame contribute delete

3.47 kB

	from langchain_chroma import Chroma
	from langchain_openai import OpenAIEmbeddings
	from langchain_huggingface import HuggingFaceEmbeddings
	from langchain_community.document_loaders import TextLoader
	from langchain_text_splitters import CharacterTextSplitter
	from dotenv import load_dotenv
	from tabulate import tabulate
	import pandas as pd

	# Pull settings from the local .env file.
	load_dotenv()

	# Cleaned book table; it is consulted again further down in this file to map
	# the ISBNs found in search hits back to full book rows.
	books = pd.read_csv("books_cleaned.csv")

	# Dump the "tagged_description" column to a plain-text file, without an index
	# column or a header row, so that TextLoader can read it back later.
	books["tagged_description"].to_csv(
	    path_or_buf="tagged_description.txt",
	    header=False,
	    index=False,
	    sep="\n",
	)

	"""Our existing Chroma DB (chroma_db_books) was created with OpenAIEmbeddings, which produce 1536-dimensional vectors.
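	On the other hand, HuggingFaceEmbeddings (with the model configured below) produces 384-dimensional vectors,
	so the two are not interchangeable and the HuggingFace vectors are kept in a separate database directory.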
	"""

	# OpenAI approach (uses the OpenAI embeddings API).
	# NOTE: the snippet below is wrapped in a string literal, so none of it is executed.
	# It loads the documents and instantiates the text splitter.
	# The chunk size is set to zero to prioritize splitting at the separator rather than
	# by chunk size, hence we might see warnings.
	"""raw_documents = TextLoader("tagged_description.txt").load()
	text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n")
	documents = text_splitter.split_documents(raw_documents)

	# checking if it prints the first description correctly
	print(documents[0])

	#create the document embeddings and store them in the vector database
	db_books = Chroma.from_documents(
	documents,
	embedding=OpenAIEmbeddings(),
	persist_directory="chroma_db_books"
	)
	print("Vector database stored to local disk:)")
	"""

	# HuggingFace approach: embeddings come from a sentence-transformers model
	# instead of the OpenAI API >> to save money.
	query = "A book to teach children about nature"

	# Flag to avoid re-creating the vector database on every run: set it to True
	# to (re)build the database, leave it False to reuse what is already on disk.
	REBUILD_VECTOR_DB = False
	PERSIST_DIR = "chroma_db_books_hf"
	MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # 384-dim (keep consistent!)

	# 1. Define the embedding model (the same one is used for build & query).
	embedding = HuggingFaceEmbeddings(model_name=MODEL)

	if REBUILD_VECTOR_DB:
	    # 2. Load the text file and split it at every newline.
	    raw_documents = TextLoader("tagged_description.txt").load()
	    text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n")
	    documents = text_splitter.split_documents(raw_documents)

	    # 3. Create and persist the vector DB.
	    db_books = Chroma.from_documents(
	        documents,
	        embedding=embedding,
	        persist_directory=PERSIST_DIR,
	    )

	    # `documents` only exists in this branch, so the sanity print lives here.
	    print("First split chunk:")
	    print(documents[0].page_content)
	else:
	    # 4. Load the existing DB (no re-embedding).
	    db_books = Chroma(
	        persist_directory=PERSIST_DIR,
	        embedding_function=embedding,
	    )

	# 5. Run a query.
	results = db_books.similarity_search(query, k=1)
	print("Top semantic match:\n" + results[0].page_content + "\n")

	docs = db_books.similarity_search(query, k=10)
	print("First 10 results: \n", docs, "\n")

	# Look up the book row for the first hit. The first whitespace-separated token
	# of a hit is parsed as the ISBN; a leading double quote is stripped first,
	# exactly as retrieve_semantic_recommendations does, because int() would
	# otherwise fail on a token such as '"9780002005883'.
	print("First result of all:\n")
	print(
	    books[
	        books["isbn13"] == int(docs[0].page_content.strip('"').split()[0])
	    ]
	)
	print("\n")

	def retrieve_semantic_recommendations(
	    query: str,
	    top_k: int = 10,
	    initial_top_k: int = 50,
	) -> pd.DataFrame:
	    """Return the books that best match *query*, best match first.

	    Args:
	        query: Free-text description of the kind of book wanted.
	        top_k: Maximum number of rows to return.
	        initial_top_k: Number of hits requested from the vector store before
	            they are mapped back to rows of the module-level ``books`` frame.

	    Returns:
	        Up to ``top_k`` rows of ``books``, ordered by the position of their
	        ISBN in the list of hits returned by the vector store.

	    Raises:
	        ValueError: If the first token of a hit is not an integer.
	        IndexError: If a hit has no text at all.
	    """
	    recs = db_books.similarity_search(query, k=initial_top_k)

	    # The first whitespace-separated token of every hit is parsed as the
	    # ISBN-13; a leading double quote, if present, is stripped first.
	    ranked_isbns = [int(rec.page_content.strip('"').split()[0]) for rec in recs]

	    # Remember the best (first) position at which each ISBN was returned.
	    rank = {}
	    for position, isbn in enumerate(ranked_isbns):
	        rank.setdefault(isbn, position)

	    matches = books[books["isbn13"].isin(ranked_isbns)]

	    # An isin() filter yields rows in the frame's own order, so taking head()
	    # directly would pick candidates by file position. Re-order by hit rank
	    # first; mergesort is stable, so rows sharing an ISBN keep their order.
	    ranked = matches.sort_values(
	        "isbn13",
	        key=lambda column: column.map(rank),
	        kind="mergesort",
	    )
	    return ranked.head(top_k)

	# Demo run: fetch recommendations for the sample query and print them as a
	# grid table, using the column names as headers and hiding the row index.
	results = retrieve_semantic_recommendations(query)
	print("Recommendations:\n")
	print(
	    tabulate(
	        results,
	        headers="keys",
	        tablefmt="grid",
	        showindex=False,
	    )
	)