semantic-book-recommender / vector_search.py
nirmanpatel's picture
Upload 4 files
226e11e verified
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from dotenv import load_dotenv
from tabulate import tabulate
import pandas as pd
#loading the .env file
load_dotenv()
books = pd.read_csv("books_cleaned.csv")
books["tagged_description"].to_csv("tagged_description.txt",
sep = "\n",
index = False,
header = False)
"""Our existing Chroma DB (chroma_db_books) was created with OpenAIEmbeddings, which produce 1536-dimensional vectors.
On the other hand, HuggingFaceEmbeddings produces 384-dimensional vectors.
"""
# OpenAI approach using its API
# load the documents and instantiate the text-splitter
# the chunk size it set to zero to prioritize splitting at the separator rather than the chunk-size, hence we might warnings
"""raw_documents = TextLoader("tagged_description.txt").load()
text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)
# checking if it prints the first description correctly
print(documents[0])
#create the document embeddings and store them in the vector database
db_books = Chroma.from_documents(
documents,
embedding=OpenAIEmbeddings(),
persist_directory="chroma_db_books"
)
print("Vector database stored to local disk:)")
"""
# HuggingFace approach >> to save money
# conditional flag to avoid creating vector database everytime
query = "A book to teach children about nature"
REBUILD_VECTOR_DB = False
PERSIST_DIR = "chroma_db_books_hf"
MODEL = "sentence-transformers/all-MiniLM-L6-v2" #384-dim (keep consistent!)
# 1. Define the embedding model (same for build & query)
embedding = HuggingFaceEmbeddings(model_name=MODEL)
if REBUILD_VECTOR_DB:
# 2. Load and split text
raw_documents = TextLoader("tagged_description.txt").load()
text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)
# 3. Create and persist vector DB
db_books = Chroma.from_documents(
documents,
embedding=embedding,
persist_directory=PERSIST_DIR
)
print("First split chunk:")
print(documents[0].page_content)
else:
# 4. Load existing DB (no re-embedding)
db_books = Chroma(
persist_directory=PERSIST_DIR,
embedding_function=embedding
)
# 5. Run a query
results = db_books.similarity_search(query, k=1)
print("Top semantic match:\n" + results[0].page_content + "\n")
docs = db_books.similarity_search(query, k = 10)
print("First 10 results: \n", docs, "\n")
# filters and gives the isbn for the first result from the query results
print("First result of all:\n")
print(books[books["isbn13"] == int(docs[0].page_content.split()[0].strip())])
print("\n")
def retrieve_semantic_recommendations(
query: str,
top_k: int = 10,
) -> pd. DataFrame:
recs = db_books.similarity_search (query, k = 50)
books_list = []
for i in range(0, len(recs)):
books_list += [int(recs[i].page_content.strip('"').split()[0])]
return books[books["isbn13"].isin(books_list)].head(top_k)
results = retrieve_semantic_recommendations(query)
print("Recommendations:\n")
print(tabulate(results, headers='keys', tablefmt='grid', showindex=False))