{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "176502f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 1: Fetch the 7k-books dataset through kagglehub and report where it landed\n",
    "import kagglehub\n",
    "\n",
    "# Kaggle dataset handle in owner/dataset form (latest version is downloaded)\n",
    "DATASET_HANDLE = \"dylanjcastillo/7k-books-with-metadata\"\n",
    "\n",
    "# `path` is the location kagglehub reports for the downloaded dataset files\n",
    "path = kagglehub.dataset_download(DATASET_HANDLE)\n",
    "\n",
    "print(\"Path to dataset files:\", path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2cddaba0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 2: Load books.csv from the directory kagglehub downloaded in Cell 1\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Build the file path from `path` (set in Cell 1) instead of a machine-specific\n",
    "# absolute path, so the notebook runs for any user, OS and dataset version.\n",
    "# `path` itself is left untouched, which keeps this cell safe to re-run.\n",
    "books_csv = Path(path) / \"books.csv\"\n",
    "\n",
    "books = pd.read_csv(books_csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f0a325a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 3: Preview the first five rows of the raw dataset\n",
    "books.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64a81372",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 4: Flag books without a description and derive each book's age in years\n",
    "import numpy as np\n",
    "\n",
    "REFERENCE_YEAR = 2023  # year the book age is measured against\n",
    "\n",
    "# 1 when the description is missing, 0 otherwise\n",
    "books[\"missing_description\"] = books[\"description\"].isna().astype(int)\n",
    "\n",
    "# Age is the difference between the reference year and the publication year\n",
    "books[\"age_of_book\"] = REFERENCE_YEAR - books[\"published_year\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "937762e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 5: Plot the Spearman correlation matrix of the selected columns as a heatmap\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "columns = [\"num_pages\", \"age_of_book\", \"missing_description\", \"average_rating\"]\n",
    "correlation_matrix = books[columns].corr(method=\"spearman\")\n",
    "\n",
    "sns.set_theme(style=\"white\")\n",
    "fig, ax = plt.subplots(figsize=(8, 6))\n",
    "\n",
    "# Annotate every cell with its coefficient formatted to two decimals\n",
    "heatmap = sns.heatmap(\n",
    "    correlation_matrix,\n",
    "    annot=True,\n",
    "    fmt=\".2f\",\n",
    "    cmap=\"coolwarm\",\n",
    "    cbar_kws={\"label\": \"Spearman Correlation\"},\n",
    "    ax=ax,\n",
    ")\n",
    "heatmap.set_title(\"Spearman Correlation Matrix\", fontdict={\"fontsize\": 16}, pad=12)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b071bcdd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 6: Keep only the rows that have every key column populated\n",
    "required_columns = [\"description\", \"num_pages\", \"average_rating\", \"published_year\"]\n",
    "\n",
    "# dropna(subset=...) removes a row when any of the listed columns is missing.\n",
    "# .copy() makes book_missing an independent frame, so adding a column to it in a\n",
    "# later cell does not trigger pandas' SettingWithCopyWarning on a slice of `books`.\n",
    "book_missing = books.dropna(subset=required_columns).copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "059ad7c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 7: Show (rows, columns) of the frame left after dropping incomplete rows\n",
    "len(book_missing.index), len(book_missing.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56e5f02e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 8: Count books per category, most frequent first\n",
    "# value_counts() already sorts in descending order, so no extra sort is needed.\n",
    "# Naming the index and the counts explicitly gives the columns 'categories' and\n",
    "# 'count' on both pandas 1.x and 2.x; with the 1.x default names there is no\n",
    "# 'count' column and sorting by it raises KeyError.\n",
    "(\n",
    "    book_missing[\"categories\"]\n",
    "    .value_counts()\n",
    "    .rename_axis(\"categories\")\n",
    "    .reset_index(name=\"count\")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6637f49c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 9: Add a column counting the whitespace-separated words in each description\n",
    "# assign() returns a new frame, so the column is not written into a filtered\n",
    "# slice of `books` (which raises SettingWithCopyWarning); re-running this cell\n",
    "# simply recomputes the same column.\n",
    "book_missing = book_missing.assign(\n",
    "    words_in_description=book_missing[\"description\"].str.split().str.len()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "406785b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 10: Inspect descriptions that are 1 to 4 words long (bounds inclusive)\n",
    "book_missing[\"description\"][\n",
    "    book_missing[\"words_in_description\"].between(1, 4)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6edd620",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 11: Inspect descriptions that are 5 to 14 words long (bounds inclusive)\n",
    "book_missing[\"description\"][\n",
    "    book_missing[\"words_in_description\"].between(5, 14)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b1d5305",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 12: Inspect descriptions that are 15 to 24 words long (bounds inclusive)\n",
    "book_missing[\"description\"][\n",
    "    book_missing[\"words_in_description\"].between(15, 24)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44fc9f68",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 13: Inspect descriptions that are 25 to 34 words long (bounds inclusive)\n",
    "book_missing[\"description\"][\n",
    "    book_missing[\"words_in_description\"].between(25, 34)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62597c72",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 14: Keep books whose description has at least 25 words and show the shape\n",
    "MIN_DESCRIPTION_WORDS = 25\n",
    "\n",
    "# .copy() detaches the result from book_missing, so the columns added to it in\n",
    "# later cells do not raise SettingWithCopyWarning.\n",
    "book_missing_25_words = book_missing[\n",
    "    book_missing[\"words_in_description\"] >= MIN_DESCRIPTION_WORDS\n",
    "].copy()\n",
    "book_missing_25_words.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be102f7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 15: Create 'title_and_subtitle': 'title: subtitle' when a subtitle exists,\n",
    "# otherwise just the title\n",
    "# Vectorised string concatenation replaces the row-by-row agg(': '.join, axis=1).\n",
    "# assign() returns a new frame instead of writing into a filtered slice, which\n",
    "# avoids SettingWithCopyWarning. `np` is imported in Cell 4.\n",
    "book_missing_25_words = book_missing_25_words.assign(\n",
    "    title_and_subtitle=lambda df: np.where(\n",
    "        df[\"subtitle\"].isna(),\n",
    "        df[\"title\"],\n",
    "        df[\"title\"].astype(str) + \": \" + df[\"subtitle\"].astype(str),\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d7fc57e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 16: Preview the first four rows of the filtered frame\n",
    "book_missing_25_words.iloc[:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1684a367",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 17: Create 'tagged_description': the isbn13, a space, then the description\n",
    "# Vectorised concatenation replaces the row-by-row agg(' '.join, axis=1), and\n",
    "# assign() returns a new frame instead of writing into a filtered slice, which\n",
    "# avoids SettingWithCopyWarning.\n",
    "book_missing_25_words = book_missing_25_words.assign(\n",
    "    tagged_description=lambda df: (\n",
    "        df[\"isbn13\"].astype(str) + \" \" + df[\"description\"].astype(str)\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "faf74e50",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 18: Display the tagged_description column\n",
    "book_missing_25_words[\"tagged_description\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff617bea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 19: Write the cleaned data to books_cleaned.csv, leaving out four columns\n",
    "dropped_columns = [\"subtitle\", \"missing_description\", \"age_of_book\", \"words_in_description\"]\n",
    "\n",
    "books_cleaned = book_missing_25_words.drop(columns=dropped_columns)\n",
    "\n",
    "# index=False keeps the DataFrame index out of the file\n",
    "books_cleaned.to_csv(\"books_cleaned.csv\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "myenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}