Upload 4 files
Browse files- data_exploration.py +125 -0
- sentiment_analysis.py +61 -0
- text_classification.py +157 -0
- vector_search.py +115 -0
data_exploration.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# ------------------------ data cleaning --------------------------- #

books = pd.read_csv('books.csv')
print("Initial look at the dataset:\n", books.head())

# Heatmap of missing values: one row per column, light cells mark NaNs.
ax = plt.axes()
sns.heatmap(books.isna().transpose(), cbar=False, ax=ax)
plt.xlabel("Columns")
plt.ylabel("Missing values")
plt.show()

# Flag missing descriptions (1 = missing, 0 = present) and derive book age.
books["missing_description"] = np.where(books["description"].isna(), 1, 0)
books["age_of_book"] = 2025 - books["published_year"]

columns_of_interest = ["num_pages", "age_of_book", "missing_description", "average_rating"]

# Spearman: rank-based, so robust to outliers and non-normal distributions.
correlation_matrix = books[columns_of_interest].corr(method='spearman')

sns.set_theme(style="white")
plt.figure(figsize=(8, 6))
heatmap = sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm",
                      cbar_kws={"label": "Spearman correlation"})
heatmap.set_title("Correlation heatmap")
plt.show()

# Columns a book must have to be usable downstream.
required_columns = ["description", "num_pages", "average_rating", "published_year"]

# Show every book where at least one required column is missing ...
print(books[books[required_columns].isna().any(axis=1)])

# ... then drop them. dropna(subset=...) replaces the long chain of
# ~isna() masks; .copy() yields an independent frame so later column
# assignments cannot raise SettingWithCopyWarning or silently write
# through to `books`.
books_updated = books.dropna(subset=required_columns).copy()
print("Removing the above observations..\nUpdated dataset:\n", books_updated)

# just for visualization - first 100 entries
books_updated = books_updated
visualize_top = books_updated[:100]

# Distribution of categories; value_counts() already sorts in descending
# order, so the extra sort_values() call was redundant and is dropped.
print("Categorical Distribution:\n", books_updated["categories"].value_counts().reset_index())
sns.histplot(data=visualize_top, x='categories', kde=True)
plt.xticks(rotation=90)
plt.show()

# Take a closer look at the first few descriptions.
# BUG FIX: previously this inspected the uncleaned `books` frame even
# though the surrounding analysis works on the cleaned data.
print(books_updated["description"].head())

# Books with one-word descriptions won't support a smooth recommendation
# process, so measure description length first.
books_updated["words_in_description"] = books_updated["description"].str.split().str.len()
print(books_updated["words_in_description"].head())
sns.histplot(data=books_updated, x='words_in_description', kde=True)
plt.xticks(rotation=45)
plt.show()

# Manual inspection of descriptions in the 1-4 / 5-14 / 15-24 / 25-34 word
# buckets (via words_in_description.between(...)) showed descriptions only
# become informative around 25 words, so use >= 25 words as the cutoff.
books_updated_25_words = books_updated[books_updated["words_in_description"] >= 25].copy()

# title_and_subtitle: just the title when the subtitle is missing,
# otherwise "title: subtitle".
books_updated_25_words["title_and_subtitle"] = (
    np.where(books_updated_25_words["subtitle"].isna(), books_updated_25_words["title"],
             books_updated_25_words[["title", "subtitle"]].astype(str).agg(": ".join, axis=1))
)

# tagged_description = "<isbn13> <description>". Prefixing the isbn13 gives
# each text chunk a recoverable identifier during vector search (cleaner and
# faster than fuzzy string matching later); it is split back off afterwards.
books_updated_25_words["tagged_description"] = books_updated_25_words[["isbn13", "description"]].astype(str).agg(" ".join, axis=1)

# Drop the helper columns and persist the cleaned dataset.
(
    books_updated_25_words
    .drop(["subtitle", "missing_description", "age_of_book", "words_in_description"], axis=1)
    .to_csv("books_cleaned.csv", index=False)
)
|
| 125 |
+
|
sentiment_analysis.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
import numpy as np
from transformers import pipeline
from tqdm import tqdm

books = pd.read_csv("books_with_categories.csv")

# Emotion classifier; top_k=None makes it return a score for *every* label
# instead of only the single best one, so each sentence yields all seven
# emotion probabilities.
classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
    device='mps',
)

# Sanity checks kept for reference: classifier("I love this!") returns a
# list with one inner list of {"label": ..., "score": ...} dicts per input;
# splitting a description on "." and classifying the pieces scores each
# sentence independently.

# Accumulators: one isbn13 per book plus, per emotion, the maximum score
# seen across the sentences of that book's description.
emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = []
emotion_scores = {label: [] for label in emotion_labels}
|
| 36 |
+
|
| 37 |
+
def calculate_max_emotion_scores(predictions, labels=None):
    """Return, for each emotion, the maximum score over all sentences.

    Parameters
    ----------
    predictions : list[list[dict]]
        One entry per sentence; each entry is a list of
        {"label": str, "score": float} dicts from the classifier.
    labels : list[str], optional
        Emotion labels to extract. Defaults to the module-level
        `emotion_labels`.

    Returns
    -------
    dict[str, float]
        Maximum score per label across all sentences.
    """
    if labels is None:
        labels = emotion_labels
    per_emotion_scores = {label: [] for label in labels}
    for prediction in predictions:
        # BUG FIX: the original sorted with key=lambda x: ["label"] (a
        # constant list), leaving the order unchanged, then indexed by
        # position against emotion_labels - which is not alphabetical -
        # so scores were attributed to the wrong emotions. Keying by the
        # label name is order-independent and unambiguous.
        scores_by_label = {p["label"]: p["score"] for p in prediction}
        for label in labels:
            per_emotion_scores[label].append(scores_by_label[label])
    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}
|
| 44 |
+
|
| 45 |
+
# Score every book: split its description into sentences, classify each
# sentence, and keep the strongest score observed for every emotion.
for row_idx in tqdm(range(len(books))):
    isbn.append(books["isbn13"][row_idx])
    sentence_list = books["description"][row_idx].split(".")
    sentence_predictions = classifier(sentence_list)
    book_max = calculate_max_emotion_scores(sentence_predictions)
    for label in emotion_labels:
        emotion_scores[label].append(book_max[label])

# Collect the per-book scores into a frame keyed by isbn13 ...
emotions_df = pd.DataFrame(emotion_scores)
emotions_df["isbn13"] = isbn
print(emotions_df)

# ... then merge the emotions back onto the book records and persist.
books = pd.merge(books, emotions_df, on = "isbn13")
books.to_csv("books_with_emotions.csv", index = False)
|
| 61 |
+
|
text_classification.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
import numpy as np
from transformers import pipeline
from tqdm import tqdm
import os

books = pd.read_csv('books_cleaned.csv')

# print categories with more than 50 books
print(books["categories"].value_counts().reset_index().query("count > 50"))

# look at a specific category
print(books[books["categories"] == "Juvenile Fiction"])

# Map raw publisher categories onto a coarse scheme; anything not listed
# becomes NaN by default and is filled in later by the zero-shot classifier.
category_mapping = {'Fiction': "Fiction",
                    'Juvenile Fiction': "Children's Fiction",
                    'Biography & Autobiography': "Nonfiction",
                    'History': "Nonfiction",
                    'Literary Criticism': "Nonfiction",
                    'Philosophy': "Nonfiction",
                    'Religion': "Nonfiction",
                    'Comics & Graphic Novels': "Fiction",
                    'Drama': "Fiction",
                    'Juvenile Nonfiction': "Children's Nonfiction",
                    'Science': "Nonfiction",
                    'Poetry': "Fiction"}

# add the new column to the dataset
books["simple_categories"] = books["categories"].map(category_mapping)
print(books.head())

# ZERO-SHOT CLASSIFICATION: label descriptions as Fiction vs Nonfiction
# with an off-the-shelf NLI model.
fiction_categories = ["Fiction", "Nonfiction"]
pipe = pipeline("zero-shot-classification",
                model="facebook/bart-large-mnli",
                device="mps"
                )

# Smoke test on the first known-Fiction description.
sequence = books.loc[books["simple_categories"] == "Fiction", "description"].reset_index(drop=True)[0]
print(sequence)

# FIX: run the pipeline once and reuse the result. The original called
# pipe(sequence, fiction_categories) three separate times (one full model
# forward pass each) to print the scores, to get the argmax index, and to
# get the label.
prediction = pipe(sequence, fiction_categories)
print(prediction)
print("\n")

# Post-process: the label with the highest score is the predicted category.
max_index = np.argmax(prediction["scores"])
max_label = prediction["labels"][max_index]
print("Predicted Label using ZERO-SHOT: " + max_label + "\n")
|
| 53 |
+
|
| 54 |
+
# define a function for generating predictions
def generate_predictions(sequence, categories, classifier=None):
    """Return the highest-scoring category for `sequence`.

    Parameters
    ----------
    sequence : str
        Text to classify.
    categories : list[str]
        Candidate labels for zero-shot classification.
    classifier : callable, optional
        Zero-shot pipeline returning {"labels": [...], "scores": [...]}.
        Defaults to the module-level `pipe`; injectable for testing.

    Returns
    -------
    str
        The label with the highest score.
    """
    if classifier is None:
        classifier = pipe
    predictions = classifier(sequence, categories)
    max_index = np.argmax(predictions["scores"])
    return predictions["labels"][max_index]
|
| 60 |
+
|
| 61 |
+
# Take a sizeable labelled sample of Fiction and Nonfiction, predict each
# label with the zero-shot classifier, and compare against the known label.
# The if/else avoids re-running the (slow) model on every execution.
if os.path.exists("predictions_results.csv"):
    predictions_df = pd.read_csv("predictions_results.csv")
else:
    # Run the loops, then save
    actual_cats = []
    predicted_cats = []

    for i in tqdm(range(0, 300)):
        sequence = books.loc[books["simple_categories"] == "Fiction", "description"].reset_index(drop=True)[i]
        predicted_cats += [generate_predictions(sequence, fiction_categories)]
        actual_cats += ["Fiction"]

    for i in tqdm(range(0, 300)):
        sequence = books.loc[books["simple_categories"] == "Nonfiction", "description"].reset_index(drop=True)[i]
        predicted_cats += [generate_predictions(sequence, fiction_categories)]
        actual_cats += ["Nonfiction"]

    predictions_df = pd.DataFrame({
        "actual_categories": actual_cats,
        "predicted_categories": predicted_cats
    })

    predictions_df.to_csv("predictions_results.csv", index=False)

# sets 1 for correct prediction and 0 for incorrect
predictions_df["correct_prediction"] = (
    np.where(predictions_df["actual_categories"] == predictions_df["predicted_categories"], 1, 0)
)
print(predictions_df)

# calculating the accuracy of our labelling
accuracy = predictions_df["correct_prediction"].sum() / len(predictions_df)
print("Labelling accuracy: ", accuracy * 100, "%\n")

# Use the model to fill in the books whose simple category is still missing.
isbns = []
predicted_cats = []

# Keep isbn13 so the predictions can be merged back onto the main dataframe.
missing_cats = books.loc[books["simple_categories"].isna(), ["isbn13", "description"]].reset_index(drop=True)

for i in tqdm(range(0, len(missing_cats))):
    sequence = missing_cats["description"][i]
    predicted_cats += [generate_predictions(sequence, fiction_categories)]
    isbns += [missing_cats["isbn13"][i]]

missing_predicted_df = pd.DataFrame({"isbn13": isbns, "predicted_categories": predicted_cats})
print("The missing predicted categories: \n")
print(missing_predicted_df)

# Merge the predictions back and fill in only the missing simple_categories.
books = pd.merge(books, missing_predicted_df, on="isbn13", how="left")
books["simple_categories"] = np.where(books["simple_categories"].isna(), books["predicted_categories"], books["simple_categories"])
books = books.drop(columns=["predicted_categories"])

# Books whose raw category is exactly one of these genre keywords.
# BUG FIX: the original list was missing a comma between "crime" and
# "historical", so Python's implicit string concatenation produced the
# single keyword "crimehistorical" and neither genre ever matched.
print(books[books["categories"].str.lower().isin([
    "romance",
    "science fiction",
    "scifi",
    "fantasy",
    "horror",
    "mystery",
    "thriller",
    "comedy",
    "crime",
    "historical"
])])

# save the changes in a new file
books.to_csv("books_with_categories.csv", index=False)
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
|
vector_search.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from dotenv import load_dotenv
from tabulate import tabulate
import pandas as pd

# loading the .env file (API keys etc.)
load_dotenv()

books = pd.read_csv("books_cleaned.csv")

# One tagged description per line; the leading isbn13 token lets each
# retrieved chunk be mapped back to its book row.
books["tagged_description"].to_csv("tagged_description.txt",
                                   sep="\n",
                                   index=False,
                                   header=False)

# NOTE: a Chroma DB must be queried with the same embedding family it was
# built with - OpenAIEmbeddings produce 1536-dim vectors while the HF MiniLM
# model below produces 384-dim vectors. The paid OpenAI build path is kept
# commented out for reference:
#
# raw_documents = TextLoader("tagged_description.txt").load()
# text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n")
# documents = text_splitter.split_documents(raw_documents)
# print(documents[0])
# db_books = Chroma.from_documents(documents, embedding=OpenAIEmbeddings(),
#                                  persist_directory="chroma_db_books")
# print("Vector database stored to local disk:)")

# HuggingFace approach >> to save money
query = "A book to teach children about nature"

# conditional flag to avoid creating the vector database on every run
REBUILD_VECTOR_DB = False
PERSIST_DIR = "chroma_db_books_hf"
MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # 384-dim (keep consistent!)

# 1. Define the embedding model (same for build & query)
embedding = HuggingFaceEmbeddings(model_name=MODEL)

if REBUILD_VECTOR_DB:
    # 2. Load and split text. chunk_size=0 prioritises splitting at the
    # separator over the chunk size, so warnings here are expected.
    raw_documents = TextLoader("tagged_description.txt").load()
    text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n")
    documents = text_splitter.split_documents(raw_documents)

    # 3. Create and persist vector DB
    db_books = Chroma.from_documents(
        documents,
        embedding=embedding,
        persist_directory=PERSIST_DIR
    )

    print("First split chunk:")
    print(documents[0].page_content)
else:
    # 4. Load existing DB (no re-embedding)
    db_books = Chroma(
        persist_directory=PERSIST_DIR,
        embedding_function=embedding
    )

# 5. Run a query
results = db_books.similarity_search(query, k=1)
print("Top semantic match:\n" + results[0].page_content + "\n")

docs = db_books.similarity_search(query, k=10)
print("First 10 results: \n", docs, "\n")

# Look up the book row for the best match via its isbn13 prefix.
# BUG FIX: strip any CSV quoting *before* splitting. The original applied a
# bare .strip() after split (a no-op on a quoted chunk), so int() would
# crash whenever to_csv had quoted the line. This now matches the parsing
# used in retrieve_semantic_recommendations.
print("First result of all:\n")
print(books[books["isbn13"] == int(docs[0].page_content.strip('"').split()[0])])
print("\n")
|
| 88 |
+
|
| 89 |
+
def retrieve_semantic_recommendations(
        query: str,
        top_k: int = 10,
        store=None,
        catalog=None,
) -> pd.DataFrame:
    """Return the `top_k` books most semantically similar to `query`.

    Parameters
    ----------
    query : str
        Free-text description of what the reader is looking for.
    top_k : int, optional
        Number of recommendations to return (default 10).
    store : optional
        Vector store exposing `similarity_search(query, k=...)`.
        Defaults to the module-level `db_books`; injectable for testing.
    catalog : pandas.DataFrame, optional
        Book catalog with an "isbn13" column. Defaults to the
        module-level `books`; injectable for testing.
    """
    if store is None:
        store = db_books
    if catalog is None:
        catalog = books

    recs = store.similarity_search(query, k=50)

    # Each retrieved chunk starts with its isbn13 tag; strip CSV quoting
    # before splitting off the first token.
    ranked_isbns = [int(rec.page_content.strip('"').split()[0]) for rec in recs]

    # BUG FIX: the original returned matches in catalog (CSV row) order,
    # discarding the similarity ranking entirely. Order the result by the
    # retrieval rank instead, so the best match comes first.
    rank = {code: position for position, code in enumerate(ranked_isbns)}
    matches = catalog[catalog["isbn13"].isin(ranked_isbns)]
    return matches.sort_values(by="isbn13", key=lambda col: col.map(rank)).head(top_k)
|
| 101 |
+
|
| 102 |
+
# Render the recommendations for the sample query as a grid table.
results = retrieve_semantic_recommendations(query)
table = tabulate(results, headers='keys', tablefmt='grid', showindex=False)
print("Recommendations:\n")
print(table)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
|