{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "176502f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 1: Import every dependency used in this notebook and download the dataset from Kaggle using kagglehub\n",
    "from pathlib import Path\n",
    "\n",
    "import kagglehub\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "\n",
    "# Download latest version; kagglehub returns the local location of the dataset files\n",
    "path = kagglehub.dataset_download(\"dylanjcastillo/7k-books-with-metadata\")\n",
    "\n",
    "print(\"Path to dataset files:\", path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2cddaba0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 2: Load books.csv from the location reported by kagglehub in Cell 1.\n",
    "# Building the path from `path` (instead of a hard-coded absolute path into one\n",
    "# user's cache directory) keeps the notebook runnable on any machine.\n",
    "books_csv = Path(path) / \"books.csv\"\n",
    "\n",
    "books = pd.read_csv(books_csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f0a325a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 3: Preview the first rows of the raw data\n",
    "books.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64a81372",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 4: Add columns for missing descriptions and calculate the age of each book\n",
    "REFERENCE_YEAR = 2023  # year that book ages are measured against\n",
    "\n",
    "# 1 when the description is missing, 0 otherwise\n",
    "books[\"missing_description\"] = np.where(books[\"description\"].isna(), 1, 0)\n",
    "books[\"age_of_book\"] = REFERENCE_YEAR - books[\"published_year\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "937762e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 5: Compute and visualize the Spearman correlation matrix for selected columns\n",
    "columns = ['num_pages', 'age_of_book', 'missing_description', 'average_rating']\n",
    "\n",
    "correlation_matrix = books[columns].corr(method='spearman')\n",
    "\n",
    "sns.set_theme(style=\"white\")\n",
    "fig, ax = plt.subplots(figsize=(8, 6))\n",
    "\n",
    "heatmap = sns.heatmap(\n",
    "    correlation_matrix,\n",
    "    annot=True,\n",
    "    fmt=\".2f\",\n",
    "    cmap=\"coolwarm\",\n",
    "    cbar_kws={'label': 'Spearman Correlation'},\n",
    "    ax=ax,\n",
    ")\n",
    "heatmap.set_title('Spearman Correlation Matrix', fontdict={'fontsize': 16}, pad=12)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b071bcdd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 6: Filter out rows with missing values in key columns\n",
    "# dropna returns a new DataFrame, so the column added in Cell 9 is not written\n",
    "# onto a slice of `books` (which would raise SettingWithCopyWarning).\n",
    "required_columns = ['description', 'num_pages', 'average_rating', 'published_year']\n",
    "book_missing = books.dropna(subset=required_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "059ad7c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 7: Number of rows and columns left after the filter\n",
    "book_missing.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56e5f02e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 8: Count books per category, most frequent first.\n",
    "# The columns are named explicitly so that sorting by 'count' does not depend on\n",
    "# how the installed pandas version labels the result of value_counts().\n",
    "(\n",
    "    book_missing['categories']\n",
    "    .value_counts()\n",
    "    .rename_axis('categories')\n",
    "    .reset_index(name='count')\n",
    "    .sort_values('count', ascending=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6637f49c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 9: Add a column counting the number of words in each book's description\n",
    "book_missing['words_in_description'] = book_missing['description'].str.split().str.len()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "406785b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 10: Inspect descriptions that are 1 to 4 words long\n",
    "book_missing.loc[book_missing['words_in_description'].between(1, 4), 'description']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6edd620",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 11: Inspect descriptions that are 5 to 14 words long\n",
    "book_missing.loc[book_missing['words_in_description'].between(5, 14), 'description']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b1d5305",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 12: Inspect descriptions that are 15 to 24 words long\n",
    "book_missing.loc[book_missing['words_in_description'].between(15, 24), 'description']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44fc9f68",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 13: Inspect descriptions that are 25 to 34 words long\n",
    "book_missing.loc[book_missing['words_in_description'].between(25, 34), 'description']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62597c72",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 14: Filter books with at least 25 words in the description and show the shape\n",
    "MIN_DESCRIPTION_WORDS = 25\n",
    "\n",
    "# .copy() makes an independent frame so the columns added in Cells 15 and 17\n",
    "# are not written onto a slice of `book_missing` (SettingWithCopyWarning).\n",
    "book_missing_25_words = book_missing[\n",
    "    book_missing['words_in_description'] >= MIN_DESCRIPTION_WORDS\n",
    "].copy()\n",
    "book_missing_25_words.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be102f7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 15: Create a new column combining title and subtitle (if available)\n",
    "title_text = book_missing_25_words['title'].astype(str)\n",
    "subtitle_text = book_missing_25_words['subtitle'].astype(str)\n",
    "\n",
    "# Rows without a subtitle keep the bare title; the rest become 'title: subtitle'\n",
    "book_missing_25_words['title_and_subtitle'] = np.where(\n",
    "    book_missing_25_words['subtitle'].isna(),\n",
    "    book_missing_25_words['title'],\n",
    "    title_text + ': ' + subtitle_text,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d7fc57e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 16: Preview the frame with the new title_and_subtitle column\n",
    "book_missing_25_words.head(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1684a367",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 17: Create a new column combining isbn13 and description for tagging\n",
    "# Result is '<isbn13> <description>', with the two values separated by one space\n",
    "book_missing_25_words['tagged_description'] = (\n",
    "    book_missing_25_words['isbn13'].astype(str)\n",
    "    + ' '\n",
    "    + book_missing_25_words['description'].astype(str)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "faf74e50",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 18: Inspect the tagged descriptions\n",
    "book_missing_25_words['tagged_description']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff617bea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 19: Save the cleaned DataFrame to a CSV file, dropping some columns\n",
    "(\n",
    "    book_missing_25_words\n",
    "    .drop(columns=[\"subtitle\", \"missing_description\", \"age_of_book\", \"words_in_description\"])\n",
    "    .to_csv(\"books_cleaned.csv\", index=False)\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "myenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}