|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import seaborn as sns |
|
|
import matplotlib.pyplot as plt |
|
|
|
|
|
|
|
|
|
|
|
# Load the raw book catalogue (expected next to the script) and show the
# first few records as a quick sanity check.
books = pd.read_csv("books.csv")

first_rows = books.head()
print("Initial look at the dataset:\n", first_rows)
|
|
|
|
|
|
|
|
# Visualise where values are missing. The boolean NA mask is transposed
# before plotting, so every dataset column becomes one heatmap row (y-axis)
# and the individual records run along the x-axis.
ax = plt.axes()
sns.heatmap(books.isna().transpose(), cbar=False, ax=ax)

# Label the axes according to the transposed layout: the previous labels
# ("Columns" on x, "Missing values" on y) did not match what each axis shows.
ax.set_title("Missing values")
plt.xlabel("Rows")
plt.ylabel("Columns")
plt.show()
|
|
|
|
|
|
|
|
# Derived features used to check whether a missing description is related
# to other numeric properties of a book.

# 1 when the description is absent, 0 otherwise.
books["missing_description"] = books["description"].isna().astype(int)

# Age of the book relative to the fixed reference year 2025
# (computed as 2025 - published_year).
books["age_of_book"] = books["published_year"].rsub(2025)

columns_of_interest = [
    "num_pages",
    "age_of_book",
    "missing_description",
    "average_rating",
]

# Spearman (rank-based) correlation between the selected columns.
correlation_matrix = books.loc[:, columns_of_interest].corr(method="spearman")
|
|
|
|
|
# Draw the correlation matrix as an annotated heatmap.
sns.set_theme(style="white")
plt.figure(figsize=(8, 6))

heatmap_options = {
    "annot": True,        # print the coefficient inside every cell
    "fmt": ".2f",         # two decimals per annotation
    "cmap": "coolwarm",
    "cbar_kws": {"label": "Spearman correlation"},
}
heatmap = sns.heatmap(correlation_matrix, **heatmap_options)
heatmap.set_title("Correlation heatmap")

plt.show()
|
|
|
|
|
|
|
|
# Show every record that lacks at least one of the fields inspected above.
fields_to_check = ["description", "num_pages", "average_rating", "published_year"]
has_missing_field = books[fields_to_check].isna().any(axis=1)
print(books[has_missing_field])
|
|
|
|
|
|
|
|
# Keep only the records where description, page count, rating and
# publication year are all present.
required_fields = ["description", "num_pages", "average_rating", "published_year"]
has_all_fields = books[required_fields].notna().all(axis=1)

# Work on an explicit copy of the filtered frame. Rationale from the
# original author's note on modifying a filtered frame without copying:
#   - unpredictable behaviour: code might work sometimes and fail other times
#   - silent data corruption: changes might affect the original DataFrame
#   - hard-to-debug issues: behaviour depends on pandas' internal optimisations
#   - future pandas versions might turn the warning into an error
books_updated = books[has_all_fields].copy()

print("Removing the above observations..\nUpdated dataset:\n", books_updated)
|
|
|
|
|
|
|
|
# First 100 rows of the cleaned frame; used to keep the category plot readable.
visualize_top = books_updated[:100]

# Frequency table of categories. The index and value columns are named
# explicitly so that the "count" column exists on every pandas version
# (before pandas 2.0 a bare reset_index() produced "index"/"categories"
# columns and the sort on "count" raised KeyError).
category_counts = (
    books_updated["categories"]
    .value_counts()
    .rename_axis("categories")
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)
print("Categorical Distribution:\n", category_counts)

# Distribution of categories within the first 100 rows; labels are rotated
# because category names are long.
sns.histplot(data=visualize_top, x="categories", kde=True)
plt.xticks(rotation=90)
plt.show()
|
|
|
|
|
|
|
|
# Peek at a few raw descriptions.
print(books["description"].head())

# IMPORTANT (original author's note): books with one-word descriptions
# won't enable a smooth recommendation process and wouldn't be useful in it,
# hence it makes sense to remove such observations from the dataset.

# Number of whitespace-separated words in every description.
description_tokens = books_updated["description"].str.split()
books_updated["words_in_description"] = description_tokens.str.len()
print(books_updated["words_in_description"].head())

# Distribution of description lengths.
sns.histplot(data=books_updated, x="words_in_description", kde=True)
plt.xticks(rotation=45)
plt.show()
|
|
|
|
|
'''
#books with description words ranging from 1 to 4
print("Books having description of upto 4 words:\n",
      books_updated.loc[books_updated["words_in_description"].between(1, 4), "description"], "\n")

#books with description words ranging from 5 to 14
print("Books having description words ranging from 5 to 14:\n",
      books_updated.loc[books_updated["words_in_description"].between(5, 14), "description"], "\n")

#books with description words ranging from 15 to 24
print("Books having description words ranging from 15 to 24:\n",
      books_updated.loc[books_updated["words_in_description"].between(15, 24), "description"], "\n")

#books with description words ranging from 25 to 34
print("Books having description words ranging from 25 to 34:\n",
      books_updated.loc[books_updated["words_in_description"].between(25, 34), "description"],)
'''
|
|
|
|
|
|
|
|
# Keep only books whose description has at least 25 words, as an
# independent copy so new columns can be added to it safely.
long_enough = books_updated["words_in_description"].ge(25)
books_updated_25_words = books_updated.loc[long_enough].copy()
|
|
|
|
|
|
|
|
|
|
|
# Combined display title: the plain title when there is no subtitle,
# otherwise "title: subtitle".
subtitle_missing = books_updated_25_words["subtitle"].isna()
joined_title = (
    books_updated_25_words["title"].astype(str)
    + ": "
    + books_updated_25_words["subtitle"].astype(str)
)
books_updated_25_words["title_and_subtitle"] = np.where(
    subtitle_missing,
    books_updated_25_words["title"],
    joined_title,
)
|
|
|
|
|
|
|
|
|
|
|
# Build a "tagged" description: the ISBN-13 followed by a space and the
# description text.
# Original author's rationale: tagging is better practice than direct string
# matching and filtering while recommending, because that can get messy and
# slow; the ISBN acts as an identifier and can be removed again later.
isbn_text = books_updated_25_words["isbn13"].astype(str)
description_text = books_updated_25_words["description"].astype(str)
books_updated_25_words["tagged_description"] = isbn_text + " " + description_text
|
|
|
|
|
|
|
|
|
|
|
# Drop the helper columns created during cleaning (plus the raw subtitle,
# now folded into title_and_subtitle) and write the result without the index.
helper_columns = ["subtitle", "missing_description", "age_of_book", "words_in_description"]
books_updated_25_words.drop(columns=helper_columns).to_csv("books_cleaned.csv", index=False)
|
|
|
|
|
|