In [None]:
from statistics import correlation

import kagglehub

# Fetch the latest version of the Kaggle dataset and keep the location
# that kagglehub reports for the downloaded files.
path = kagglehub.dataset_download(
    "dylanjcastillo/7k-books-with-metadata"
)

print("Path to dataset files:", path)

In [None]:
import pandas as pd

In [None]:
from pathlib import Path

# Wrap the download location in a Path object so the CSV file can be
# addressed with path operations instead of string concatenation.
path = Path(
    kagglehub.dataset_download("dylanjcastillo/7k-books-with-metadata")
)

# Load the book metadata table from the dataset directory.
books = pd.read_csv(path.joinpath("books.csv"))

In [None]:
# Show the freshly loaded frame as the cell output for a first look.
books

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Visualise where values are missing. The boolean isna() frame is transposed,
# so every dataset column becomes a heatmap row (y-axis) and every book
# record becomes a heatmap column (x-axis).
fig, ax = plt.subplots()
sns.heatmap(books.isna().transpose(), cbar=False, ax=ax)

# Axis labels follow the transposed orientation described above.
ax.set_title("Missing values")
ax.set_xlabel("Rows")
ax.set_ylabel("Columns")

plt.show()

In [None]:
import numpy as np
# Indicator column: 1 when the description is missing, 0 otherwise.
books["missing_description"] = books["description"].isna().astype(int)
# Age of each book measured against the fixed reference year 2024.
books["age_of_book"] = 2024 - books["published_year"]

In [None]:
# Spearman rank correlations between the numeric features and the
# missing-description indicator, drawn as an annotated heatmap.
columns_of_interest = [
    "num_pages",
    "age_of_book",
    "missing_description",
    "average_rating",
]
correlation_matrix = books.loc[:, columns_of_interest].corr(method="spearman")

sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(8, 6))
heatmap = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar_kws={"label": "Spearman Correlation"},
    ax=ax,
)

ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Rows in which at least one of the key fields is missing.
# isna().any(axis=1) is used instead of chaining `|` and `&` by hand:
# `&` binds tighter than `|`, so mixing them without parentheses silently
# selects a different set of rows than intended.
# NOTE: the following cell rebinds books_missing to the complete rows.
required_columns = ["description", "num_pages", "average_rating", "published_year"]
books_missing = books[books[required_columns].isna().any(axis=1)]

In [None]:
# Keep only the rows where description, page count, rating and publication
# year are all present.
# .copy() makes the result an independent frame, so the columns added to it
# in later cells do not trigger pandas' SettingWithCopyWarning.
books_missing = books[
    books["description"].notna()
    & books["num_pages"].notna()
    & books["average_rating"].notna()
    & books["published_year"].notna()
].copy()

In [None]:
# Display the filtered frame to inspect the rows that remain.
books_missing

In [None]:
# Number of books per category, most frequent first.
# rename_axis / reset_index(name="count") name the two output columns
# explicitly, so the "count" column exists regardless of how the installed
# pandas version names the result of value_counts().
(
    books_missing["categories"]
    .value_counts()
    .rename_axis("categories")
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)

In [None]:
# Show the filtered frame again before derived columns are added to it.
books_missing

In [None]:
# Count the whitespace-separated words in every description.
# assign() returns a new frame rather than writing into a filtered slice,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
books_missing = books_missing.assign(
    words_in_description=books_missing["description"].str.split().str.len()
)


In [None]:
# Display the frame to check the words_in_description column.
books_missing

In [None]:
# Print the descriptions whose word count lies between 25 and 34 (bounds
# included) together with that count.
borderline_rows = books_missing["words_in_description"].between(25, 34)
print(books_missing.loc[borderline_rows, ["description", "words_in_description"]])


In [None]:
# Keep only books whose description has at least 25 words.
# .copy() detaches the result from books_missing so that columns can be
# added to it later without SettingWithCopyWarning.
books_missing_25_words = books_missing[
    books_missing["words_in_description"] >= 25
].copy()

In [None]:
# Display the books that passed the 25-word description threshold.
books_missing_25_words

In [None]:
# Build a combined "title: subtitle" label; books without a subtitle keep
# the plain title.
subtitle_absent = books_missing_25_words["subtitle"].isna()
# Vectorised string concatenation replaces the row-by-row agg(join, axis=1).
joined_title = (
    books_missing_25_words["title"].astype(str)
    + ": "
    + books_missing_25_words["subtitle"].astype(str)
)
# assign() returns a new frame instead of writing into a filtered slice,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
books_missing_25_words = books_missing_25_words.assign(
    title_and_subtitle=np.where(
        subtitle_absent,
        books_missing_25_words["title"],
        joined_title,
    )
)


In [None]:
# Display the frame to check the title_and_subtitle column.
books_missing_25_words

In [None]:
# How often each combined title occurs, most frequent first.
# rename_axis / reset_index(name="count") name the two output columns
# explicitly, so the "count" column exists regardless of how the installed
# pandas version names the result of value_counts().
(
    books_missing_25_words["title_and_subtitle"]
    .value_counts()
    .rename_axis("title_and_subtitle")
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)

In [None]:
# Prefix every description with its ISBN-13, separated by a single space.
# assign() produces a fresh frame, which also keeps pandas' slice warning
# from appearing.
isbn_text = books_missing_25_words["isbn13"].astype(str)
description_text = books_missing_25_words["description"].astype(str)

books_missing_25_words = books_missing_25_words.assign(
    tagged_description=isbn_text + " " + description_text
)


In [None]:
# Display the frame to check the tagged_description column.
books_missing_25_words

In [None]:
# Remove the columns that are not wanted in the export, then write the
# remaining data to books_cleaned.csv without the index column.
books_cleaned = books_missing_25_words.drop(
    columns=["subtitle", "missing_description", "age_of_book", "words_in_description"]
)
books_cleaned.to_csv("books_cleaned.csv", index=False)