{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "initial_id",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "books = pd.read_csv(\"books_with_categories.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d9a521af5640cd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use %pip (not !pip) so the install targets this notebook's kernel environment.\n",
    "%pip install torch transformers\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a222cc24cb3d9e50",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import transformers\n",
    "print(f\"PyTorch version: {torch.__version__}\")\n",
    "print(f\"Transformers version: {transformers.__version__}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "418145b8ff28c108",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "# top_k=None returns scores for every label; return_all_scores=True is\n",
    "# deprecated and emits a FutureWarning in recent transformers versions.\n",
    "# (Optionally pass device=\"mps\" or device=\"cuda\" to silence the CPU warning.)\n",
    "pipe = pipeline(\n",
    "    \"text-classification\",\n",
    "    model=\"j-hartmann/emotion-english-distilroberta-base\",\n",
    "    top_k=None,\n",
    ")\n",
    "\n",
    "# Smoke test: should show high 'joy' score.\n",
    "pipe(\"I am so happy today!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90acf250d3189ec1",
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe(books[\"description\"][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9781bcf4224efd4",
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe(books[\"description\"][0].split(\".\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57fc949d567e3f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentences = books[\"description\"][0].split(\".\")\n",
    "predictions = pipe(sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41b5470987223a69",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentences[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81bb270a79fdd290",
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d85ba7066b85eb7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentences[4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8dea7d5c2077d566",
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions[4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a540e26e090b9050",
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted(predictions[0], key=lambda x: x[\"label\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a496645a7d858dcf",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "emotion_labels = [\"anger\", \"disgust\", \"fear\", \"joy\", \"sadness\", \"surprise\", \"neutral\"]\n",
    "isbn = []\n",
    "emotion_scores = {label: [] for label in emotion_labels}\n",
    "\n",
    "def calculate_max_emotion_scores(predictions):\n",
    "    \"\"\"Return the maximum score per emotion across all sentence predictions.\n",
    "\n",
    "    BUGFIX: the earlier version reverse-sorted each prediction by label and\n",
    "    index-aligned it with emotion_labels, which pairs every score with the\n",
    "    wrong emotion (reverse-alphabetical order != emotion_labels order).\n",
    "    Look scores up by label name instead.\n",
    "    \"\"\"\n",
    "    per_emotion_scores = {label: [] for label in emotion_labels}\n",
    "    for prediction in predictions:\n",
    "        scores_by_label = {p[\"label\"]: p[\"score\"] for p in prediction}\n",
    "        for label in emotion_labels:\n",
    "            per_emotion_scores[label].append(scores_by_label.get(label, 0.0))\n",
    "    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b911145893e482f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick trial on the first 10 books before the full run below.\n",
    "for i, row in books.head(10).iterrows():\n",
    "    isbn.append(str(row[\"isbn13\"]))\n",
    "\n",
    "    sentences = str(row[\"description\"]).split(\".\")\n",
    "    predictions = pipe(sentences)\n",
    "    max_scores = calculate_max_emotion_scores(predictions)\n",
    "\n",
    "    for label in emotion_labels:\n",
    "        # force conversion to Python float (np.max returns numpy scalars)\n",
    "        emotion_scores[label].append(float(max_scores[label]))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1e1e2960a0314b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "emotion_scores = {\n",
    "    label: [float(x) for x in scores]\n",
    "    for label, scores in emotion_scores.items()\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4d6abd593a32daa",
   "metadata": {},
   "outputs": [],
   "source": [
    "emotion_scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "632f787a4b7d3eaf",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from tqdm.auto import tqdm  # auto picks the widget-based bar in notebooks\n",
    "from transformers import pipeline\n",
    "\n",
    "# Initialize the emotion analysis pipeline\n",
    "pipe = pipeline(\"text-classification\", model=\"j-hartmann/emotion-english-distilroberta-base\", top_k=None)\n",
    "\n",
    "# Load your books data\n",
    "books = pd.read_csv(\"books_with_categories.csv\")  # Replace with your actual file name\n",
    "\n",
    "emotion_labels = [\"anger\", \"disgust\", \"fear\", \"joy\", \"sadness\", \"surprise\", \"neutral\"]\n",
    "isbn = []\n",
    "emotion_scores = {label: [] for label in emotion_labels}\n",
    "\n",
    "def calculate_max_emotion_scores(predictions):\n",
    "    \"\"\"Calculate maximum emotion scores across all sentences.\"\"\"\n",
    "    per_emotion_scores = {label: [] for label in emotion_labels}\n",
    "\n",
    "    for prediction in predictions:\n",
    "        # Create a dictionary for easy lookup by label\n",
    "        prediction_dict = {pred[\"label\"]: pred[\"score\"] for pred in prediction}\n",
    "\n",
    "        # Add scores for each emotion label\n",
    "        for label in emotion_labels:\n",
    "            score = prediction_dict.get(label, 0.0)  # Default to 0 if label not found\n",
    "            per_emotion_scores[label].append(score)\n",
    "\n",
    "    # Return maximum score for each emotion across all sentences\n",
    "    return {label: np.max(scores) if scores else 0.0 for label, scores in per_emotion_scores.items()}\n",
    "\n",
    "print(\"Processing emotions for books...\")\n",
    "for i, row in tqdm(books.iterrows(), total=len(books)):\n",
    "    isbn.append(str(row[\"isbn13\"]))\n",
    "\n",
    "    # Handle missing descriptions\n",
    "    description = str(row[\"description\"]) if pd.notna(row[\"description\"]) else \"\"\n",
    "\n",
    "    if description and description != \"nan\":\n",
    "        # Split into sentences and filter out empty ones\n",
    "        sentences = [s.strip() for s in description.split(\".\") if s.strip()]\n",
    "\n",
    "        if sentences:\n",
    "            try:\n",
    "                predictions = pipe(sentences)\n",
    "                max_scores = calculate_max_emotion_scores(predictions)\n",
    "            except Exception as e:\n",
    "                print(f\"Error processing book {row['isbn13']}: {e}\")\n",
    "                # Use default scores if processing fails\n",
    "                max_scores = {label: 0.0 for label in emotion_labels}\n",
    "        else:\n",
    "            # Empty description\n",
    "            max_scores = {label: 0.0 for label in emotion_labels}\n",
    "    else:\n",
    "        # No description available\n",
    "        max_scores = {label: 0.0 for label in emotion_labels}\n",
    "\n",
    "    # Add scores to our lists\n",
    "    for label in emotion_labels:\n",
    "        emotion_scores[label].append(float(max_scores[label]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "31dfb34d4f4aee9a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create emotion DataFrame\n",
    "emotion_df = pd.DataFrame(emotion_scores)\n",
    "emotion_df[\"isbn13\"] = isbn\n",
    "\n",
    "print(\"Emotion processing completed!\")\n",
    "print(\"Sample emotion scores:\")\n",
    "emotion_df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8db5e8f5cee59321",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Strip only a TRAILING '.0' (float-conversion residue). The previous\n",
    "# `.str.replace('.0', '', regex=False)` removed every interior '.0'\n",
    "# occurrence, which can silently corrupt ISBN strings.\n",
    "books[\"isbn13\"] = books[\"isbn13\"].astype(str).str.replace(r\"\\.0$\", \"\", regex=True)\n",
    "emotion_df[\"isbn13\"] = emotion_df[\"isbn13\"].astype(str).str.replace(r\"\\.0$\", \"\", regex=True)\n",
    "\n",
    "print(\"Data types before merge:\")\n",
    "print(f\"Books isbn13 dtype: {books['isbn13'].dtype}\")\n",
    "print(f\"Emotion isbn13 dtype: {emotion_df['isbn13'].dtype}\")\n",
    "\n",
    "# Merge emotion scores back to the original books DataFrame\n",
    "books_with_emotions = books.merge(emotion_df, on=\"isbn13\", how=\"left\")\n",
    "\n",
    "# Save the combined DataFrame\n",
    "books_with_emotions.to_csv(\"books_with_emotions.csv\", index=False)\n",
    "\n",
    "print(\"Saved books with emotions to 'books_with_emotions.csv'\")\n",
    "print(f\"Total books processed: {len(books_with_emotions)}\")\n",
    "print(\"Available columns:\", books_with_emotions.columns.tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1cc83da7893e926",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}