{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "md-overview",
   "metadata": {},
   "source": [
    "# 7k Books: exploration and cleaning\n",
    "\n",
    "Downloads the `dylanjcastillo/7k-books-with-metadata` Kaggle dataset, inspects missing values, keeps books that have a description, page count, average rating and publication year, drops descriptions shorter than 25 words, adds two derived text columns, and writes the result to `books_cleaned.csv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "initial_id",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import kagglehub\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "config-constants",
   "metadata": {},
   "outputs": [],
   "source": [
    "DATASET_HANDLE = \"dylanjcastillo/7k-books-with-metadata\"\n",
    "REFERENCE_YEAR = 2024  # reference year used to derive age_of_book\n",
    "REQUIRED_COLUMNS = [\"description\", \"num_pages\", \"average_rating\", \"published_year\"]\n",
    "MIN_DESCRIPTION_WORDS = 25  # descriptions with fewer words are dropped\n",
    "OUTPUT_CSV = \"books_cleaned.csv\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-load",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6df67758ebb1137c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download once and wrap the returned location in Path so it can be joined with `/`.\n",
    "dataset_dir = Path(kagglehub.dataset_download(DATASET_HANDLE))\n",
    "print(\"Path to dataset files:\", dataset_dir)\n",
    "\n",
    "books = pd.read_csv(dataset_dir / \"books.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "94828bc9ccbfafa1",
   "metadata": {},
   "outputs": [],
   "source": [
    "books.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-missing",
   "metadata": {},
   "source": [
    "## Missing values\n",
    "\n",
    "The frame is transposed before plotting, so each dataset column is one heatmap row and each book is one heatmap column."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aaba3c5cc9492dbc",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots()\n",
    "sns.heatmap(books.isna().transpose(), cbar=False, ax=ax)\n",
    "\n",
    "# After the transpose the x-axis runs over books and the y-axis over dataset columns.\n",
    "ax.set_xlabel(\"Books (row index)\")\n",
    "ax.set_ylabel(\"Columns\")\n",
    "ax.set_title(\"Missing values\")\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-correlation",
   "metadata": {},
   "source": [
    "## Is a missing description related to other fields?\n",
    "\n",
    "Spearman correlation between a missing-description flag, the age of the book, the page count and the average rating."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5020d8ec7f517390",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Derived columns live on a separate frame so `books` stays exactly as loaded.\n",
    "books_features = books.assign(\n",
    "    missing_description=books[\"description\"].isna().astype(int),\n",
    "    age_of_book=REFERENCE_YEAR - books[\"published_year\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8693f57773a2f2ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "columns_of_interest = [\"num_pages\", \"age_of_book\", \"missing_description\", \"average_rating\"]\n",
    "correlation_matrix = books_features[columns_of_interest].corr(method=\"spearman\")\n",
    "\n",
    "sns.set_theme(style=\"white\")\n",
    "fig, ax = plt.subplots(figsize=(8, 6))\n",
    "sns.heatmap(\n",
    "    correlation_matrix,\n",
    "    annot=True,\n",
    "    fmt=\".2f\",\n",
    "    cmap=\"coolwarm\",\n",
    "    cbar_kws={\"label\": \"Spearman Correlation\"},\n",
    "    ax=ax,\n",
    ")\n",
    "ax.set_title(\"Correlation Heatmap\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-complete",
   "metadata": {},
   "source": [
    "## Keep books with complete metadata\n",
    "\n",
    "Only rows where `description`, `num_pages`, `average_rating` and `published_year` are all present are kept."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a16b79d748237fa6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# dropna(subset=...) removes a row if any of the required columns is missing.\n",
    "books_complete = books.dropna(subset=REQUIRED_COLUMNS)\n",
    "books_complete.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6aad6ddc169cf39d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# value_counts() already sorts by descending count.\n",
    "books_complete[\"categories\"].value_counts().reset_index()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-description-length",
   "metadata": {},
   "source": [
    "## Description length\n",
    "\n",
    "Count whitespace-separated words per description, look at descriptions just above the cutoff, then drop the ones below it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b971c57a22e2721e",
   "metadata": {},
   "outputs": [],
   "source": [
    "books_word_counts = books_complete.assign(\n",
    "    words_in_description=books_complete[\"description\"].str.split().str.len()\n",
    ")\n",
    "books_word_counts[[\"description\", \"words_in_description\"]].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4a20c7b8a28d843",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Descriptions with 25 to 34 words, i.e. the shortest ones that survive the cutoff below.\n",
    "books_word_counts.loc[\n",
    "    books_word_counts[\"words_in_description\"].between(MIN_DESCRIPTION_WORDS, 34),\n",
    "    [\"description\", \"words_in_description\"],\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "add578fb79f75576",
   "metadata": {},
   "outputs": [],
   "source": [
    "books_described = books_word_counts[\n",
    "    books_word_counts[\"words_in_description\"] >= MIN_DESCRIPTION_WORDS\n",
    "]\n",
    "books_described.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-derived-text",
   "metadata": {},
   "source": [
    "## Derived text columns\n",
    "\n",
    "- `title_and_subtitle`: the title alone when there is no subtitle, otherwise `title: subtitle`.\n",
    "- `tagged_description`: the ISBN-13, a space, then the description."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15505042aaae206b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# assign() returns a new frame, so nothing is written into a slice of `books`.\n",
    "books_export = books_described.assign(\n",
    "    title_and_subtitle=np.where(\n",
    "        books_described[\"subtitle\"].isna(),\n",
    "        books_described[\"title\"],\n",
    "        books_described[\"title\"].astype(str) + \": \" + books_described[\"subtitle\"].astype(str),\n",
    "    ),\n",
    "    tagged_description=(\n",
    "        books_described[\"isbn13\"].astype(str) + \" \" + books_described[\"description\"].astype(str)\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1033bd78abfa34a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Titles that occur more than once show up at the top.\n",
    "books_export[\"title_and_subtitle\"].value_counts().reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20a704320865f12b",
   "metadata": {},
   "outputs": [],
   "source": [
    "books_export[[\"isbn13\", \"title_and_subtitle\", \"tagged_description\"]].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-export",
   "metadata": {},
   "source": [
    "## Export"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "export-checks",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity checks on the frame that is about to be written.\n",
    "assert books_export[REQUIRED_COLUMNS].notna().all().all()\n",
    "assert books_export[\"words_in_description\"].ge(MIN_DESCRIPTION_WORDS).all()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36a89080af8a4f1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `subtitle` is folded into title_and_subtitle and the word count was only needed for filtering.\n",
    "(\n",
    "    books_export\n",
    "    .drop(columns=[\"subtitle\", \"words_in_description\"])\n",
    "    .to_csv(OUTPUT_CSV, index=False)\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}