nirmanpatel committed on
Commit
226e11e
·
verified ·
1 Parent(s): f632dba

Upload 4 files

Browse files
data_exploration.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# ------------------------ data cleaning --------------------------- #
# Exploratory cleaning of the raw books dataset: visualize missingness,
# drop incomplete rows and ultra-short descriptions, then save a cleaned CSV.

books = pd.read_csv('books.csv')
print("Initial look at the dataset:\n", books.head())

# Plot showing the missing values from the dataset.
ax = plt.axes()
sns.heatmap(books.isna().transpose(), cbar=False, ax=ax)
plt.xlabel("Columns")
plt.ylabel("Missing values")
plt.show()

# Flag whether the description is present: 1 where it's missing, else 0.
books["missing_description"] = np.where(books["description"].isna(), 1, 0)
# NOTE(review): reference year is hard-coded; update it (or derive it from
# the current date) when the dataset is refreshed.
books["age_of_book"] = 2025 - books["published_year"]

columns_of_interest = ["num_pages", "age_of_book", "missing_description", "average_rating"]

# Spearman is rank-based, so it tolerates skewed, non-normal distributions.
correlation_matrix = books[columns_of_interest].corr(method='spearman')

sns.set_theme(style="white")
plt.figure(figsize=(8, 6))
heatmap = sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm",
                      cbar_kws={"label": "Spearman correlation"})
heatmap.set_title("Correlation heatmap")
plt.show()

# Show all books where any of the key columns is missing.
required_columns = ["description", "num_pages", "average_rating", "published_year"]
print(books[books[required_columns].isna().any(axis=1)])

# Remove all such books.  dropna(subset=...) replaces the previous four-way
# negated-isna mask, and the immediate .copy() yields an independent
# DataFrame so later column assignments cannot trigger
# SettingWithCopyWarning (no deferred .copy() workaround needed).
books_updated = books.dropna(subset=required_columns).copy()
print("Removing the above observations..\nUpdated dataset:\n", books_updated)

# Just for visualization - first 100 entries.
visualize_top = books_updated[:100]

# Distribution of the categories.  value_counts() already sorts by count in
# descending order, so the previous explicit sort_values was redundant.
print("Categorical Distribution:\n", books_updated["categories"].value_counts().reset_index())
sns.histplot(data=visualize_top, x='categories', kde=True)
plt.xticks(rotation=90)
plt.show()

# Taking a closer look at the first few descriptions.
print(books["description"].head())

# IMP Note: books with one-word descriptions won't enable a smooth
# recommendation process, so very short descriptions are filtered out below.

# New variable: number of words in each description.
books_updated["words_in_description"] = books_updated["description"].str.split().str.len()
print(books_updated["words_in_description"].head())
sns.histplot(data=books_updated, x='words_in_description', kde=True)
plt.xticks(rotation=45)
plt.show()

# (Exploratory prints of 1-4 / 5-14 / 15-24 / 25-34 word descriptions were
# removed; they motivated the 25-word cutoff used below.)

# Use 25 words in the description as a cutoff; .copy() for the same reason
# as above.
books_updated_25_words = books_updated[books_updated["words_in_description"] >= 25].copy()

# Merge title and subtitle as "title: subtitle"; fall back to the title
# alone when the subtitle is missing.
books_updated_25_words["title_and_subtitle"] = (
    np.where(books_updated_25_words["subtitle"].isna(), books_updated_25_words["title"],
             books_updated_25_words[["title", "subtitle"]].astype(str).agg(": ".join, axis=1))
)

# Tag each description with its isbn13.  Using the identifier as a prefix is
# cleaner and faster than direct string matching when mapping vector-search
# results back to books; the isbn can be stripped off later.
books_updated_25_words["tagged_description"] = (
    books_updated_25_words[["isbn13", "description"]].astype(str).agg(" ".join, axis=1)
)

# Drop the helper columns and save the cleaned dataframe as a new CSV.
(
    books_updated_25_words
    .drop(["subtitle", "missing_description", "age_of_book", "words_in_description"], axis=1)
    .to_csv("books_cleaned.csv", index=False)
)
sentiment_analysis.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from transformers import pipeline
4
+ from tqdm import tqdm
5
+
6
# Fine-grained emotion scoring of book descriptions using a pretrained
# DistilRoBERTa emotion classifier.
books = pd.read_csv("books_with_categories.csv")

# Emotion classifier; top_k=None returns a score for every emotion label
# rather than only the best one.
# NOTE(review): device='mps' is Apple-Silicon specific — confirm before
# running on other hardware.
classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
    device='mps',
)

# Earlier experiments, kept for reference:
#   result = classifier("I love this!")        # list with one list of label dicts
#   for item in result[0]:
#       print(item)                            # or f"{item['label']:<10}: {item['score']}"
#
#   # Split a description into sentences and score each one individually:
#   sentences = books["description"][0].split(".")
#   predictions = classifier(sentences)
#   i = len(predictions)
#   while i > 0:
#       for sentence in predictions[i-1]:
#           print(sentence)
#       print("--------------------")
#       i -= 1
32
# Emotion labels emitted by the classifier; accumulators for one row per book.
emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

def calculate_max_emotion_scores(predictions):
    """Return {label: max score across sentences} for one book.

    ``predictions`` is the classifier output for one description: a list
    (one entry per sentence) of lists of ``{"label": ..., "score": ...}``
    dicts covering every label in ``emotion_labels``.

    Bug fix: the original sorted each sentence's dicts with
    ``key=lambda x: ["label"]`` — a constant list, so no sort happened —
    and then aligned scores positionally against ``emotion_labels``,
    mis-assigning scores to labels.  Mapping each predicted label to its
    score directly avoids both problems.
    """
    per_emotion_scores = {label: [] for label in emotion_labels}
    for prediction in predictions:
        score_by_label = {item["label"]: item["score"] for item in prediction}
        for label in emotion_labels:
            per_emotion_scores[label].append(score_by_label[label])
    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}
45
# Score every book in the dataset: split the description into sentences,
# classify each sentence, keep the per-emotion maximum across sentences.
for book_isbn, description in tqdm(zip(books["isbn13"], books["description"]),
                                   total=len(books)):
    isbn.append(book_isbn)
    sentence_predictions = classifier(description.split("."))
    per_book_max = calculate_max_emotion_scores(sentence_predictions)
    for label in emotion_labels:
        emotion_scores[label].append(per_book_max[label])

# Assemble the results and attach the identifier column.
emotions_df = pd.DataFrame(emotion_scores)
emotions_df["isbn13"] = isbn
print(emotions_df)

# Merge the emotion scores back onto the books and persist.
books = pd.merge(books, emotions_df, on="isbn13")
books.to_csv("books_with_emotions.csv", index=False)
text_classification.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from transformers import pipeline
4
+ from tqdm import tqdm
5
+ import os
6
+
7
# Collapse the raw Google Books categories into a small set of simple ones.
books = pd.read_csv('books_cleaned.csv')

# Categories represented by more than 50 books.
print(books["categories"].value_counts().reset_index().query("count > 50"))

# A look at one specific category.
print(books[books["categories"] == "Juvenile Fiction"])

# Mapping for the frequent categories; everything unmapped stays NaN.
category_mapping = {
    'Fiction': "Fiction",
    'Juvenile Fiction': "Children's Fiction",
    'Biography & Autobiography': "Nonfiction",
    'History': "Nonfiction",
    'Literary Criticism': "Nonfiction",
    'Philosophy': "Nonfiction",
    'Religion': "Nonfiction",
    'Comics & Graphic Novels': "Fiction",
    'Drama': "Fiction",
    'Juvenile Nonfiction': "Children's Nonfiction",
    'Science': "Nonfiction",
    'Poetry': "Fiction",
}

# New column holding the simplified category.
books["simple_categories"] = books["categories"].map(category_mapping)
print(books.head())
33
# Applying ZERO-SHOT CLASSIFICATION with a pretrained NLI model from HF.
fiction_categories = ["Fiction", "Nonfiction"]
# NOTE(review): device="mps" is Apple-Silicon specific — confirm before
# running on other hardware.
pipe = pipeline("zero-shot-classification",
                model="facebook/bart-large-mnli",
                device="mps"
                )

# First known fiction entry in the dataframe, used as a sample sequence.
sequence = books.loc[books["simple_categories"] == "Fiction", "description"].reset_index(drop=True)[0]
print(sequence)

# Run the classifier ONCE and reuse the result.  The original code called
# pipe(sequence, fiction_categories) three separate times (print, argmax,
# label lookup), tripling an expensive model invocation for the same input.
# The returned "scores" are the per-label probabilities.
prediction = pipe(sequence, fiction_categories)
print(prediction)
print("\n")

# Post-processing: pick the label with the highest score.
max_index = np.argmax(prediction["scores"])
max_label = prediction["labels"][max_index]
print("Predicted Label using ZERO-SHOT: " + max_label + "\n")
54
# Helper used for bulk labelling below.
def generate_predictions(sequence, categories):
    """Zero-shot classify *sequence* against *categories*; return the
    highest-scoring label."""
    prediction = pipe(sequence, categories)
    best = np.argmax(prediction["scores"])
    return prediction["labels"][best]
61
# Take a sizeable sample of fiction and nonfiction, predict the label with
# the zero-shot classifier, and compare against the already-known label.
# The if/else avoids re-generating predictions on every run.
if os.path.exists("predictions_results.csv"):
    predictions_df = pd.read_csv("predictions_results.csv")
else:
    # Run the loops, then save.
    actual_cats = []
    predicted_cats = []

    # Hoist the filtered description series out of the loops: the original
    # re-ran the full .loc filter + reset_index on EVERY iteration (O(n)
    # work per step for an unchanged result).
    fiction_descriptions = books.loc[books["simple_categories"] == "Fiction", "description"].reset_index(drop=True)
    nonfiction_descriptions = books.loc[books["simple_categories"] == "Nonfiction", "description"].reset_index(drop=True)

    # NOTE(review): assumes at least 300 books of each class — confirm.
    for i in tqdm(range(0, 300)):
        predicted_cats.append(generate_predictions(fiction_descriptions[i], fiction_categories))
        actual_cats.append("Fiction")

    for i in tqdm(range(0, 300)):
        predicted_cats.append(generate_predictions(nonfiction_descriptions[i], fiction_categories))
        actual_cats.append("Nonfiction")

    predictions_df = pd.DataFrame({
        "actual_categories": actual_cats,
        "predicted_categories": predicted_cats
    })

    predictions_df.to_csv("predictions_results.csv", index=False)

# 1 for a correct prediction, 0 for an incorrect one.
predictions_df["correct_prediction"] = (
    np.where(predictions_df["actual_categories"] == predictions_df["predicted_categories"], 1, 0)
)
print(predictions_df)

# Overall accuracy of the zero-shot labelling.
accuracy = predictions_df["correct_prediction"].sum() / len(predictions_df)
print("Labelling accuracy: ", accuracy * 100, "%\n")
98
# Use the classifier to fill in the books whose category is still missing
# (a subset of the dataset restricted to rows without a simple category).
isbns = []
predicted_cats = []

# Keep isbn13 alongside the description so results can be merged back into
# the original dataframe.
missing_cats = books.loc[books["simple_categories"].isna(), ["isbn13", "description"]].reset_index(drop=True)

for book_isbn, description in tqdm(zip(missing_cats["isbn13"], missing_cats["description"]),
                                   total=len(missing_cats)):
    predicted_cats.append(generate_predictions(description, fiction_categories))
    isbns.append(book_isbn)

missing_predicted_df = pd.DataFrame({"isbn13": isbns, "predicted_categories": predicted_cats})
print("The missing predicted categories: \n")
print(missing_predicted_df)

# Merge back into the main dataframe, fill the gaps, drop the helper column.
books = pd.merge(books, missing_predicted_df, on="isbn13", how="left")
books["simple_categories"] = np.where(books["simple_categories"].isna(), books["predicted_categories"], books["simple_categories"])
books = books.drop(columns=["predicted_categories"])
119
# Specific fiction categories matched exclusively by these keywords.
# Bug fix: the original list was missing a comma between "crime" and
# "historical", so Python silently concatenated the adjacent string
# literals into the single keyword "crimehistorical" — neither genre could
# ever match.
print(books[books["categories"].str.lower().isin([
    "romance",
    "science fiction",
    "scifi",
    "fantasy",
    "horror",
    "mystery",
    "thriller",
    "comedy",
    "crime",
    "historical"
])])

# Save the changes in a new file.
books.to_csv("books_with_categories.csv", index=False)
vector_search.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_chroma import Chroma
2
+ from langchain_openai import OpenAIEmbeddings
3
+ from langchain_huggingface import HuggingFaceEmbeddings
4
+ from langchain_community.document_loaders import TextLoader
5
+ from langchain_text_splitters import CharacterTextSplitter
6
+ from dotenv import load_dotenv
7
+ from tabulate import tabulate
8
+ import pandas as pd
9
+
10
# Build input for the vector store: one tagged description per line.

# Load environment variables (e.g. the OpenAI API key) from .env.
load_dotenv()

books = pd.read_csv("books_cleaned.csv")

# No index/header, newline separator: each line starts with the isbn13 tag.
books["tagged_description"].to_csv("tagged_description.txt",
                                   sep="\n",
                                   index=False,
                                   header=False)

# The existing chroma_db_books DB was created with OpenAIEmbeddings
# (1536-dimensional vectors); HuggingFaceEmbeddings produces
# 384-dimensional vectors, so the two stores are NOT interchangeable.

# OpenAI approach via its API, kept for reference.  chunk_size=0 makes the
# splitter prioritize the separator over the chunk size, so warnings are
# expected:
#   raw_documents = TextLoader("tagged_description.txt").load()
#   text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n")
#   documents = text_splitter.split_documents(raw_documents)
#   print(documents[0])          # sanity-check the first description
#   db_books = Chroma.from_documents(
#       documents,
#       embedding=OpenAIEmbeddings(),
#       persist_directory="chroma_db_books"
#   )
#   print("Vector database stored to local disk:)")
43
# HuggingFace approach — avoids OpenAI API costs.  A flag controls whether
# the vector database is rebuilt or reloaded from disk.
query = "A book to teach children about nature"

REBUILD_VECTOR_DB = False  # flip to True to re-embed everything from scratch
PERSIST_DIR = "chroma_db_books_hf"
MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # 384-dim (keep consistent!)

# 1. Embedding model — must be identical for build and query.
embedding = HuggingFaceEmbeddings(model_name=MODEL)

if REBUILD_VECTOR_DB:
    # 2. Load and split the tagged descriptions (one per line).
    raw_documents = TextLoader("tagged_description.txt").load()
    text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n")
    documents = text_splitter.split_documents(raw_documents)

    # 3. Create and persist the vector DB.
    db_books = Chroma.from_documents(
        documents,
        embedding=embedding,
        persist_directory=PERSIST_DIR
    )

    print("First split chunk:")
    print(documents[0].page_content)
else:
    # 4. Reuse the existing DB — no re-embedding needed.
    db_books = Chroma(
        persist_directory=PERSIST_DIR,
        embedding_function=embedding
    )

# 5. Run a sample query.
results = db_books.similarity_search(query, k=1)
print("Top semantic match:\n" + results[0].page_content + "\n")

docs = db_books.similarity_search(query, k=10)
print("First 10 results: \n", docs, "\n")

# The isbn13 is the first whitespace-separated token of the matched chunk;
# use it to look the book up in the dataframe.
print("First result of all:\n")
print(books[books["isbn13"] == int(docs[0].page_content.split()[0].strip())])
print("\n")
89
def retrieve_semantic_recommendations(
        query: str,
        top_k: int = 10,
) -> pd.DataFrame:
    """Return up to *top_k* books semantically similar to *query*.

    Searches the Chroma store for the 50 nearest tagged descriptions, pulls
    the isbn13 prefix out of each chunk, and selects those books from the
    dataframe.
    """
    recs = db_books.similarity_search(query, k=50)
    # Each chunk starts with the isbn13; strip a possible CSV-quoting '"'.
    matched_isbns = [int(rec.page_content.strip('"').split()[0]) for rec in recs]
    # NOTE(review): isin + head keeps dataframe order, not similarity order
    # — confirm whether ranked output is wanted here.
    return books[books["isbn13"].isin(matched_isbns)].head(top_k)
101
+
102
# Fetch and pretty-print the recommendations for the sample query.
recommendations = retrieve_semantic_recommendations(query)
print("Recommendations:\n")
print(tabulate(recommendations, headers='keys', tablefmt='grid', showindex=False))