|
|
from langchain_chroma import Chroma |
|
|
from langchain_openai import OpenAIEmbeddings |
|
|
from langchain_huggingface import HuggingFaceEmbeddings |
|
|
from langchain_community.document_loaders import TextLoader |
|
|
from langchain_text_splitters import CharacterTextSplitter |
|
|
from dotenv import load_dotenv |
|
|
from tabulate import tabulate |
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
# Pull API keys and other settings from the local .env file.
load_dotenv()


# Cleaned catalogue; each row's tagged_description starts with its isbn13.
books = pd.read_csv("books_cleaned.csv")


# Write one tagged description per line so the loader/splitter below can
# recover per-book chunks by splitting on "\n".
books["tagged_description"].to_csv(
    "tagged_description.txt",
    sep="\n",
    index=False,
    header=False,
)
|
|
|
|
|
"""Our existing Chroma DB (chroma_db_books) was created with OpenAIEmbeddings, which produce 1536-dimensional vectors. |
|
|
On the other hand, HuggingFaceEmbeddings produces 384-dimensional vectors. |
|
|
""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""raw_documents = TextLoader("tagged_description.txt").load() |
|
|
text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n") |
|
|
documents = text_splitter.split_documents(raw_documents) |
|
|
|
|
|
# checking if it prints the first description correctly |
|
|
print(documents[0]) |
|
|
|
|
|
#create the document embeddings and store them in the vector database |
|
|
db_books = Chroma.from_documents( |
|
|
documents, |
|
|
embedding=OpenAIEmbeddings(), |
|
|
persist_directory="chroma_db_books" |
|
|
) |
|
|
print("Vector database stored to local disk:)") |
|
|
""" |
|
|
|
|
|
|
|
|
|
|
|
# Natural-language query used by all of the searches below.
query: str = "A book to teach children about nature"


# True: (re)build the HuggingFace-embedded Chroma store from
# tagged_description.txt. False: load the already-persisted index from disk.
REBUILD_VECTOR_DB: bool = False


# On-disk location of the HuggingFace-embedded index (kept separate from the
# incompatible 1536-dim OpenAI store in chroma_db_books).
PERSIST_DIR: str = "chroma_db_books_hf"


# Sentence-transformers model producing 384-dimensional embeddings.
MODEL: str = "sentence-transformers/all-MiniLM-L6-v2"


# Embedding function shared by both the build and the load paths below.
embedding = HuggingFaceEmbeddings(model_name=MODEL)
|
|
|
|
|
if not REBUILD_VECTOR_DB:
    # Common path: reuse the HuggingFace index already persisted on disk.
    db_books = Chroma(
        persist_directory=PERSIST_DIR,
        embedding_function=embedding,
    )
else:
    # Rebuild path: one chunk per line of tagged_description.txt
    # (chunk_size=0 with a "\n" separator splits purely on newlines).
    loaded_docs = TextLoader("tagged_description.txt").load()
    splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n")
    split_docs = splitter.split_documents(loaded_docs)

    # Embed every chunk and persist the resulting store to PERSIST_DIR.
    db_books = Chroma.from_documents(
        split_docs,
        embedding=embedding,
        persist_directory=PERSIST_DIR,
    )

    print("First split chunk:")
    print(split_docs[0].page_content)
|
|
|
|
|
|
|
|
# Sanity checks against the vector store.

# Single best match for the query.
results = db_books.similarity_search(query, k=1)
print("Top semantic match:\n" + results[0].page_content + "\n")

# Top-10 raw Document objects (each page_content begins with an isbn13 tag).
docs = db_books.similarity_search(query, k=10)
print("First 10 results: \n", docs, "\n")

# Look up the catalogue row for the best match. Strip any surrounding CSV
# quote BEFORE splitting off the isbn13 token, mirroring
# retrieve_semantic_recommendations below — the old `.split()[0].strip()`
# only trimmed whitespace, so int() raised ValueError on a quoted chunk
# like '"9780... description"'.
print("First result of all:\n")
print(books[books["isbn13"] == int(docs[0].page_content.strip('"').split()[0])])
print("\n")
|
|
|
|
|
def retrieve_semantic_recommendations(
    query: str,
    top_k: int = 10,
) -> pd.DataFrame:
    """Return the catalogue rows for the books most similar to *query*.

    Runs a similarity search over the vector store (50 candidates), parses
    the isbn13 tag that prefixes each chunk, and returns up to *top_k*
    matching rows from ``books``, best match first.

    Args:
        query: Natural-language description of the desired book.
        top_k: Maximum number of recommendations to return.

    Returns:
        A DataFrame slice of ``books`` with at most *top_k* rows, ordered
        by similarity rank.
    """
    recs = db_books.similarity_search(query, k=50)

    # Parse the leading isbn13 token from each chunk. Chunks may be wrapped
    # in CSV quotes, hence the strip('"'). Deduplicate while preserving
    # similarity order.
    ranked_isbns: list[int] = []
    for rec in recs:
        isbn = int(rec.page_content.strip('"').split()[0])
        if isbn not in ranked_isbns:
            ranked_isbns.append(isbn)

    # Re-order the matching rows by similarity rank. The previous
    # `isin(...).head(top_k)` returned whichever matches appeared first in
    # the CSV, silently dropping better-ranked books.
    rank = {isbn: i for i, isbn in enumerate(ranked_isbns)}
    matches = books[books["isbn13"].isin(ranked_isbns)].copy()
    matches = matches.sort_values(by="isbn13", key=lambda s: s.map(rank))
    return matches.head(top_k)
|
|
|
|
|
# Render the final recommendation table for the query.
recommendations = retrieve_semantic_recommendations(query)
print("Recommendations:\n")
print(tabulate(recommendations, headers='keys', tablefmt='grid', showindex=False))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|