{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "initial_id",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "books = pd.read_csv(\"books_with_categories.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d9a521af5640cd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use %pip (not !pip) so the install targets this notebook's kernel environment.\n",
    "%pip install torch transformers\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a222cc24cb3d9e50",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import transformers\n",
    "print(f\"PyTorch version: {torch.__version__}\")\n",
    "print(f\"Transformers version: {transformers.__version__}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "418145b8ff28c108",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "# top_k=None returns scores for every label; return_all_scores=True is\n",
    "# deprecated and emits a FutureWarning in recent transformers versions.\n",
    "# (Optionally pass device=\"mps\" or device=\"cuda\" to silence the CPU warning.)\n",
    "pipe = pipeline(\n",
    "    \"text-classification\",\n",
    "    model=\"j-hartmann/emotion-english-distilroberta-base\",\n",
    "    top_k=None,\n",
    ")\n",
    "\n",
    "# Smoke test: should show high 'joy' score.\n",
    "pipe(\"I am so happy today!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90acf250d3189ec1",
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe(books[\"description\"][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9781bcf4224efd4",
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe(books[\"description\"][0].split(\".\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57fc949d567e3f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentences = books[\"description\"][0].split(\".\")\n",
    "predictions = pipe(sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41b5470987223a69",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentences[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81bb270a79fdd290",
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d85ba7066b85eb7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentences[4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8dea7d5c2077d566",
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions[4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a540e26e090b9050",
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted(predictions[0], key=lambda x: x[\"label\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a496645a7d858dcf",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "emotion_labels = [\"anger\", \"disgust\", \"fear\", \"joy\", \"sadness\", \"surprise\", \"neutral\"]\n",
    "isbn = []\n",
    "emotion_scores = {label: [] for label in emotion_labels}\n",
    "\n",
    "def calculate_max_emotion_scores(predictions):\n",
    "    \"\"\"Return the maximum score per emotion across all sentence predictions.\n",
    "\n",
    "    BUGFIX: the earlier version reverse-sorted each prediction by label and\n",
    "    index-aligned it with emotion_labels, which pairs every score with the\n",
    "    wrong emotion (reverse-alphabetical order != emotion_labels order).\n",
    "    Look scores up by label name instead.\n",
    "    \"\"\"\n",
    "    per_emotion_scores = {label: [] for label in emotion_labels}\n",
    "    for prediction in predictions:\n",
    "        scores_by_label = {p[\"label\"]: p[\"score\"] for p in prediction}\n",
    "        for label in emotion_labels:\n",
    "            per_emotion_scores[label].append(scores_by_label.get(label, 0.0))\n",
    "    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b911145893e482f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick trial on the first 10 books before the full run below.\n",
    "for i, row in books.head(10).iterrows():\n",
    "    isbn.append(str(row[\"isbn13\"]))\n",
    "\n",
    "    sentences = str(row[\"description\"]).split(\".\")\n",
    "    predictions = pipe(sentences)\n",
    "    max_scores = calculate_max_emotion_scores(predictions)\n",
    "\n",
    "    for label in emotion_labels:\n",
    "        # force conversion to Python float (np.max returns numpy scalars)\n",
    "        emotion_scores[label].append(float(max_scores[label]))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1e1e2960a0314b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "emotion_scores = {\n",
    "    label: [float(x) for x in scores]\n",
    "    for label, scores in emotion_scores.items()\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4d6abd593a32daa",
   "metadata": {},
   "outputs": [],
   "source": [
    "emotion_scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "632f787a4b7d3eaf",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from tqdm.auto import tqdm  # auto picks the widget-based bar in notebooks\n",
    "from transformers import pipeline\n",
    "\n",
    "# Initialize the emotion analysis pipeline\n",
    "pipe = pipeline(\"text-classification\", model=\"j-hartmann/emotion-english-distilroberta-base\", top_k=None)\n",
    "\n",
    "# Load your books data\n",
    "books = pd.read_csv(\"books_with_categories.csv\")  # Replace with your actual file name\n",
    "\n",
    "emotion_labels = [\"anger\", \"disgust\", \"fear\", \"joy\", \"sadness\", \"surprise\", \"neutral\"]\n",
    "isbn = []\n",
    "emotion_scores = {label: [] for label in emotion_labels}\n",
    "\n",
    "def calculate_max_emotion_scores(predictions):\n",
    "    \"\"\"Calculate maximum emotion scores across all sentences.\"\"\"\n",
    "    per_emotion_scores = {label: [] for label in emotion_labels}\n",
    "\n",
    "    for prediction in predictions:\n",
    "        # Create a dictionary for easy lookup by label\n",
    "        prediction_dict = {pred[\"label\"]: pred[\"score\"] for pred in prediction}\n",
    "\n",
    "        # Add scores for each emotion label\n",
    "        for label in emotion_labels:\n",
    "            score = prediction_dict.get(label, 0.0)  # Default to 0 if label not found\n",
    "            per_emotion_scores[label].append(score)\n",
    "\n",
    "    # Return maximum score for each emotion across all sentences\n",
    "    return {label: np.max(scores) if scores else 0.0 for label, scores in per_emotion_scores.items()}\n",
    "\n",
    "print(\"Processing emotions for books...\")\n",
    "for i, row in tqdm(books.iterrows(), total=len(books)):\n",
    "    isbn.append(str(row[\"isbn13\"]))\n",
    "\n",
    "    # Handle missing descriptions\n",
    "    description = str(row[\"description\"]) if pd.notna(row[\"description\"]) else \"\"\n",
    "\n",
    "    if description and description != \"nan\":\n",
    "        # Split into sentences and filter out empty ones\n",
    "        sentences = [s.strip() for s in description.split(\".\") if s.strip()]\n",
    "\n",
    "        if sentences:\n",
    "            try:\n",
    "                predictions = pipe(sentences)\n",
    "                max_scores = calculate_max_emotion_scores(predictions)\n",
    "            except Exception as e:\n",
    "                print(f\"Error processing book {row['isbn13']}: {e}\")\n",
    "                # Use default scores if processing fails\n",
    "                max_scores = {label: 0.0 for label in emotion_labels}\n",
    "        else:\n",
    "            # Empty description\n",
    "            max_scores = {label: 0.0 for label in emotion_labels}\n",
    "    else:\n",
    "        # No description available\n",
    "        max_scores = {label: 0.0 for label in emotion_labels}\n",
    "\n",
    "    # Add scores to our lists\n",
    "    for label in emotion_labels:\n",
    "        emotion_scores[label].append(float(max_scores[label]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "31dfb34d4f4aee9a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create emotion DataFrame\n",
    "emotion_df = pd.DataFrame(emotion_scores)\n",
    "emotion_df[\"isbn13\"] = isbn\n",
    "\n",
    "print(\"Emotion processing completed!\")\n",
    "print(\"Sample emotion scores:\")\n",
    "emotion_df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8db5e8f5cee59321",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Strip only a TRAILING '.0' (float-conversion residue). The previous\n",
    "# `.str.replace('.0', '', regex=False)` removed every interior '.0'\n",
    "# occurrence, which can silently corrupt ISBN strings.\n",
    "books[\"isbn13\"] = books[\"isbn13\"].astype(str).str.replace(r\"\\.0$\", \"\", regex=True)\n",
    "emotion_df[\"isbn13\"] = emotion_df[\"isbn13\"].astype(str).str.replace(r\"\\.0$\", \"\", regex=True)\n",
    "\n",
    "print(\"Data types before merge:\")\n",
    "print(f\"Books isbn13 dtype: {books['isbn13'].dtype}\")\n",
    "print(f\"Emotion isbn13 dtype: {emotion_df['isbn13'].dtype}\")\n",
    "\n",
    "# Merge emotion scores back to the original books DataFrame\n",
    "books_with_emotions = books.merge(emotion_df, on=\"isbn13\", how=\"left\")\n",
    "\n",
    "# Save the combined DataFrame\n",
    "books_with_emotions.to_csv(\"books_with_emotions.csv\", index=False)\n",
    "\n",
    "print(\"Saved books with emotions to 'books_with_emotions.csv'\")\n",
    "print(f\"Total books processed: {len(books_with_emotions)}\")\n",
    "print(\"Available columns:\", books_with_emotions.columns.tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1cc83da7893e926",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}