{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "063cfaa8",
   "metadata": { "id": "063cfaa8" },
   "outputs": [],
   "source": [
    "# All imports live in this first cell so a fresh kernel can run top to bottom.\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "from tqdm import tqdm\n",
    "from transformers import pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c81ca58",
   "metadata": { "id": "1c81ca58" },
   "outputs": [],
   "source": [
    "books = pd.read_csv('books_cleaned.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8244b265",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "8244b265", "outputId": "0af624a8-1577-4c66-e252-a29479ad8446" },
   "outputs": [],
   "source": [
    "# How many books fall under each raw category label?\n",
    "books['categories'].value_counts().reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aefb7553",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 425 }, "id": "aefb7553", "outputId": "5aad706e-7f5c-4704-dc31-3fbb00a8da9c" },
   "outputs": [],
   "source": [
    "# Keep only the categories that occur more than 50 times.\n",
    "# NOTE: the 'count' column name is what value_counts().reset_index() yields on pandas >= 2.0.\n",
    "books['categories'].value_counts().reset_index().query('count > 50')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8635069",
   "metadata": { "id": "f8635069" },
   "outputs": [],
   "source": [
    "# Collapse the selected raw categories into four coarse buckets.\n",
    "# Any category not listed here maps to NaN and is classified by the model further down.\n",
    "category_mapping = {\n",
    "    'Fiction': \"Fiction\",\n",
    "    'Juvenile Fiction': \"Children's Fiction\",\n",
    "    'Biography & Autobiography': \"Nonfiction\",\n",
    "    'History': \"Nonfiction\",\n",
    "    'Literary Criticism': \"Nonfiction\",\n",
    "    'Philosophy': \"Nonfiction\",\n",
    "    'Religion': \"Nonfiction\",\n",
    "    'Comics & Graphic Novels': \"Fiction\",\n",
    "    'Drama': \"Fiction\",\n",
    "    'Juvenile Nonfiction': \"Children's Nonfiction\",\n",
    "    'Science': \"Nonfiction\",\n",
    "    'Poetry': \"Fiction\"\n",
    "}\n",
    "\n",
    "books['simple_categories'] = books['categories'].map(category_mapping)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f1a4097",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7f1a4097", "outputId": "3a4a95b5-c920-4d85-a62d-8ca302670df1" },
   "outputs": [],
   "source": [
    "# Shape of the subset that received a simple category from the mapping.\n",
    "books[books['simple_categories'].notna()].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09433430",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "09433430", "outputId": "b8840759-5f44-4dc0-81a1-63c70b489653" },
   "outputs": [],
   "source": [
    "fiction_categories = [\"Fiction\", \"Nonfiction\"]\n",
    "\n",
    "# Run on the GPU when torch can see one; otherwise fall back to the CPU\n",
    "# instead of failing on a hard-coded \"cuda\" device.\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "pipe = pipeline(\"zero-shot-classification\",\n",
    "                model=\"facebook/bart-large-mnli\",\n",
    "                device=device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a44f3e64",
   "metadata": { "id": "a44f3e64" },
   "outputs": [],
   "source": [
    "# First description among the books already labelled \"Fiction\", used as a smoke test.\n",
    "sequence = books.loc[books['simple_categories'] == \"Fiction\", 'description'].iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e3ff995",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3e3ff995", "outputId": "00ea0a49-3a9e-4d24-a621-92fb84de7d92" },
   "outputs": [],
   "source": [
    "# Run the model once and keep the result so the next cell can reuse it.\n",
    "results = pipe(sequence, fiction_categories)\n",
    "results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16b259eb",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 36 }, "id": "16b259eb", "outputId": "3d8a6725-f246-49c9-cfce-ebd4c6e79151" },
   "outputs": [],
   "source": [
    "# Pick the label with the highest score from the result computed above.\n",
    "max_index = np.argmax(results['scores'])\n",
    "max_label = results['labels'][max_index]\n",
    "\n",
    "max_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd1a160f",
   "metadata": { "id": "bd1a160f" },
   "outputs": [],
   "source": [
    "def generate_predictions(sequence, categories):\n",
    "    \"\"\"Return the candidate label that the zero-shot pipeline scores highest.\n",
    "\n",
    "    Args:\n",
    "        sequence: Text to classify.\n",
    "        categories: Candidate labels passed to the pipeline.\n",
    "\n",
    "    Returns:\n",
    "        The entry of the result's 'labels' at the position of the largest 'scores' value.\n",
    "    \"\"\"\n",
    "    results = pipe(sequence, categories)\n",
    "    max_index = np.argmax(results['scores'])\n",
    "    return results['labels'][max_index]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4945125a",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4945125a", "outputId": "bbd1a877-0c02-4860-d289-201ac5ff6f41" },
   "outputs": [],
   "source": [
    "def predict_known_category(label, n_samples=300):\n",
    "    \"\"\"Classify the first `n_samples` descriptions whose simple category is `label`.\n",
    "\n",
    "    Returns a list with one predicted label per description. Fewer than\n",
    "    `n_samples` predictions come back when fewer matching rows exist.\n",
    "    \"\"\"\n",
    "    # Filter once, outside the loop, rather than re-filtering `books` for every row.\n",
    "    descriptions = books.loc[books['simple_categories'] == label, 'description'].head(n_samples)\n",
    "    return [generate_predictions(description, fiction_categories)\n",
    "            for description in tqdm(descriptions)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "efd30e84",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "efd30e84", "outputId": "462d4f63-d2cd-4dc8-de04-eb9da6c55b20" },
   "outputs": [],
   "source": [
    "# Both lists are rebuilt from scratch so re-running this cell never double-appends.\n",
    "actual_cats = []\n",
    "predicted_cats = []\n",
    "\n",
    "# Evaluate on books whose label is already known: \"Fiction\" first, then \"Nonfiction\".\n",
    "for label in fiction_categories:\n",
    "    label_predictions = predict_known_category(label)\n",
    "    predicted_cats.extend(label_predictions)\n",
    "    actual_cats.extend([label] * len(label_predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34322614",
   "metadata": { "id": "34322614" },
   "outputs": [],
   "source": [
    "predictions_df = pd.DataFrame({\n",
    "    'actual': actual_cats,\n",
    "    'predicted': predicted_cats\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fc41ebe7",
   "metadata": { "id": "fc41ebe7" },
   "outputs": [],
   "source": [
    "# 1 where the model agrees with the known label, 0 otherwise.\n",
    "predictions_df['correct_prediction'] = (\n",
    "    np.where(predictions_df['actual'] == predictions_df['predicted'], 1, 0)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "325834c0",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "325834c0", "outputId": "479b349e-c6e4-49cf-eda1-107fe595b57c" },
   "outputs": [],
   "source": [
    "# Fraction of evaluated descriptions that were classified correctly.\n",
    "predictions_df['correct_prediction'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f3834ac",
   "metadata": { "id": "4f3834ac" },
   "outputs": [],
   "source": [
    "# Books the mapping left without a simple category; these get a model prediction.\n",
    "missing_cats = books.loc[books['simple_categories'].isna(), ['isbn13', 'description']].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38a9529a",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "38a9529a", "outputId": "190e56ad-fa21-4b98-fcb2-7896197bc349" },
   "outputs": [],
   "source": [
    "# Built in one go (not appended to earlier lists) so the cell is safe to re-run.\n",
    "isbns = missing_cats['isbn13'].tolist()\n",
    "predicted_cats = [generate_predictions(description, fiction_categories)\n",
    "                  for description in tqdm(missing_cats['description'])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d76c7c58",
   "metadata": { "id": "d76c7c58" },
   "outputs": [],
   "source": [
    "missing_predictions_df = pd.DataFrame({\n",
    "    'isbn13': isbns,\n",
    "    'predicted': predicted_cats\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e83ce7e",
   "metadata": { "id": "6e83ce7e" },
   "outputs": [],
   "source": [
    "missing_predictions_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17fbb19c",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "17fbb19c", "outputId": "dfc3c2ef-0e99-4d76-f3eb-aca138c6782f" },
   "outputs": [],
   "source": [
    "books.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5c6ad46",
   "metadata": { "id": "c5c6ad46" },
   "outputs": [],
   "source": [
    "# Attach the predictions by ISBN, fill only the rows that had no simple category,\n",
    "# then drop the helper column again.\n",
    "books = pd.merge(books, missing_predictions_df, on='isbn13', how='left')\n",
    "books['simple_categories'] = books['simple_categories'].fillna(books['predicted'])\n",
    "books = books.drop(columns=['predicted'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "83beb18f",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 271 }, "id": "83beb18f", "outputId": "e7c6e296-748c-4f12-d818-e16b75083bff" },
   "outputs": [],
   "source": [
    "books.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e8e9374",
   "metadata": { "id": "1e8e9374" },
   "outputs": [],
   "source": [
    "# Rows whose raw category is exactly one of these genre names (case-insensitive).\n",
    "books[books[\"categories\"].str.lower().isin([\n",
    "    \"romance\",\n",
    "    \"science fiction\",\n",
    "    \"scifi\",\n",
    "    \"fantasy\",\n",
    "    \"horror\",\n",
    "    \"mystery\",\n",
    "    \"thriller\",\n",
    "    \"comedy\",\n",
    "    \"crime\",\n",
    "    \"historical\"\n",
    "])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "DA0gYVkklR1e",
   "metadata": { "id": "DA0gYVkklR1e" },
   "outputs": [],
   "source": [
    "books.to_csv(\"books_with_categories.csv\", index=False)"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": { "gpuType": "T4", "provenance": [] },
  "kernelspec": { "display_name": "Python 3", "name": "python3" },
  "language_info": {
   "codemirror_mode": { "name": "ipython", "version": 3 },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}