Spaces:
Sleeping
Sleeping
Merge branch 'main' of https://github.com/Deathshot78/ReviewSense
Browse files- .gitattributes +4 -0
- assets/confusion_bay.png +3 -0
- assets/confusion_bert.png +3 -0
- assets/gradio.png +3 -0
- assets/wordcloud.png +3 -0
- notebooks/reviewsense.ipynb +1001 -0
- requirements.txt +15 -1
- scripts/app.py +164 -1
- scripts/data_prepare.py +263 -0
- scripts/main.py +113 -1
- scripts/models.py +256 -0
- scripts/train_distilbet.py +101 -0
- scripts/train_naive_bayes.py +118 -0
.gitattributes
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
@@ -35,3 +36,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
*.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
*.jpg filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<<<<<<< HEAD
|
| 2 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 3 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 4 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 36 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 37 |
*.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
=======
|
| 40 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
>>>>>>> e6de3c4338f79386345fa6e4bba5b0666ad808da
|
assets/confusion_bay.png
ADDED
|
Git LFS Details
|
assets/confusion_bert.png
ADDED
|
Git LFS Details
|
assets/gradio.png
ADDED
|
Git LFS Details
|
assets/wordcloud.png
ADDED
|
Git LFS Details
|
notebooks/reviewsense.ipynb
ADDED
|
@@ -0,0 +1,1001 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "1754f3bb",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# 🛍️ ReviewSense: Product Review Analysis Engine\n",
|
| 9 |
+
"\n",
|
| 10 |
+
"> *ReviewSense is a comprehensive, end-to-end Natural Language Processing application built to extract deep, actionable insights from unstructured product reviews.* \n",
|
| 11 |
+
"Where a simple star rating only tells part of the story, ReviewSense dives into the text to uncover what customers are saying, why they're saying it, and how they feel about specific product features. "
|
| 12 |
+
]
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"cell_type": "markdown",
|
| 16 |
+
"id": "00d383d6",
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"source": [
|
| 19 |
+
"## Imports"
|
| 20 |
+
]
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"cell_type": "code",
|
| 24 |
+
"execution_count": null,
|
| 25 |
+
"id": "4d48ba17",
|
| 26 |
+
"metadata": {},
|
| 27 |
+
"outputs": [],
|
| 28 |
+
"source": [
|
| 29 |
+
"import pytorch_lightning as pl\n",
|
| 30 |
+
"from torch.utils.data import DataLoader, Dataset\n",
|
| 31 |
+
"from transformers import AutoTokenizer\n",
|
| 32 |
+
"import pandas as pd\n",
|
| 33 |
+
"from sklearn.model_selection import train_test_split\n",
|
| 34 |
+
"import torch\n",
|
| 35 |
+
"import os\n",
|
| 36 |
+
"import numpy as np\n",
|
| 37 |
+
"from sklearn.model_selection import train_test_split, ParameterGrid, StratifiedKFold\n",
|
| 38 |
+
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
| 39 |
+
"from sklearn.naive_bayes import MultinomialNB\n",
|
| 40 |
+
"from sklearn.pipeline import Pipeline\n",
|
| 41 |
+
"from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
|
| 42 |
+
"import seaborn as sns\n",
|
| 43 |
+
"import matplotlib.pyplot as plt\n",
|
| 44 |
+
"from tqdm.notebook import tqdm\n",
|
| 45 |
+
"from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping\n",
|
| 46 |
+
"from pytorch_lightning.loggers import TensorBoardLogger\n",
|
| 47 |
+
"from transformers import T5ForConditionalGeneration, T5Tokenizer\n",
|
| 48 |
+
"from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup, AutoConfig\n",
|
| 49 |
+
"from torch.optim import AdamW\n",
|
| 50 |
+
"import torch\n",
|
| 51 |
+
"from torchmetrics.functional import accuracy\n",
|
| 52 |
+
"from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer, pipeline\n",
|
| 53 |
+
"\n"
|
| 54 |
+
]
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"cell_type": "markdown",
|
| 58 |
+
"id": "8263bc02",
|
| 59 |
+
"metadata": {},
|
| 60 |
+
"source": [
|
| 61 |
+
"## Prepare the data"
|
| 62 |
+
]
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"cell_type": "code",
|
| 66 |
+
"execution_count": null,
|
| 67 |
+
"id": "a5f8dcda",
|
| 68 |
+
"metadata": {},
|
| 69 |
+
"outputs": [],
|
| 70 |
+
"source": [
|
| 71 |
+
"def explore_and_preprocess_reviews(\n",
|
| 72 |
+
" train_path='data/train.csv', \n",
|
| 73 |
+
" test_path='data/test.csv',\n",
|
| 74 |
+
" output_dir='data'\n",
|
| 75 |
+
"):\n",
|
| 76 |
+
" \"\"\"\n",
|
| 77 |
+
" Loads the Amazon Sentiment Analysis dataset (https://www.kaggle.com/datasets/kritanjalijain/amazon-reviews)\n",
|
| 78 |
+
" (you need to extract the train/test splits from the zip file in the data folder),\n",
|
| 79 |
+
" performs basic EDA, and preprocesses it for model training.\n",
|
| 80 |
+
"\n",
|
| 81 |
+
" Args:\n",
|
| 82 |
+
" train_path (str): Path to the training CSV file.\n",
|
| 83 |
+
" test_path (str): Path to the testing CSV file.\n",
|
| 84 |
+
" output_dir (str): Directory to save the processed file.\n",
|
| 85 |
+
" \"\"\"\n",
|
| 86 |
+
" # --- 1. Load Data ---\n",
|
| 87 |
+
" # This dataset typically comes without headers. We'll assign them.\n",
|
| 88 |
+
" # Column 1: Sentiment (1 = Negative, 2 = Positive)\n",
|
| 89 |
+
" # Column 2: Title\n",
|
| 90 |
+
" # Column 3: Review Text\n",
|
| 91 |
+
" print(f\"Loading data from '{train_path}' and '{test_path}'...\")\n",
|
| 92 |
+
" try:\n",
|
| 93 |
+
" col_names = ['sentiment_orig', 'title', 'review']\n",
|
| 94 |
+
" train_df = pd.read_csv(train_path, header=None, names=col_names)\n",
|
| 95 |
+
" test_df = pd.read_csv(test_path, header=None, names=col_names)\n",
|
| 96 |
+
" \n",
|
| 97 |
+
" # Combine for unified EDA and preprocessing\n",
|
| 98 |
+
" df = pd.concat([train_df, test_df], ignore_index=True)\n",
|
| 99 |
+
"\n",
|
| 100 |
+
" except FileNotFoundError:\n",
|
| 101 |
+
" print(f\"\\nERROR: Make sure '{train_path}' and '{test_path}' are in the specified directory.\")\n",
|
| 102 |
+
" print(\"This script is designed for the 'Amazon Reviews for Sentiment Analysis' dataset from Kaggle.\")\n",
|
| 103 |
+
" return\n",
|
| 104 |
+
"\n",
|
| 105 |
+
" df.dropna(inplace=True)\n",
|
| 106 |
+
"\n",
|
| 107 |
+
" # --- 2. Preprocessing ---\n",
|
| 108 |
+
" print(\"\\n--- Preprocessing Data for Sentiment Analysis ---\")\n",
|
| 109 |
+
"\n",
|
| 110 |
+
" # a) Create new sentiment labels (0 = Negative, 1 = Positive)\n",
|
| 111 |
+
" # This dataset is binary (negative/positive); it has no neutral class.\n",
|
| 112 |
+
" df['sentiment'] = df['sentiment_orig'].apply(lambda x: 0 if x == 1 else 1)\n",
|
| 113 |
+
"\n",
|
| 114 |
+
" # b) Combine title and review body\n",
|
| 115 |
+
" df['full_text'] = df['title'].astype(str) + \". \" + df['review'].astype(str)\n",
|
| 116 |
+
"\n",
|
| 117 |
+
" # c) Select and rename columns\n",
|
| 118 |
+
" processed_df = df[['full_text', 'sentiment']].copy()\n",
|
| 119 |
+
"\n",
|
| 120 |
+
" # --- 3. Save Processed Data ---\n",
|
| 121 |
+
" os.makedirs(output_dir, exist_ok=True)\n",
|
| 122 |
+
" output_path = os.path.join(output_dir, 'reviews_processed.csv')\n",
|
| 123 |
+
" processed_df.to_csv(output_path, index=False)\n",
|
| 124 |
+
" print(f\"\\nSaved {len(processed_df)} processed reviews to '{output_path}'\")\n"
|
| 125 |
+
]
|
| 126 |
+
},
|
| 127 |
+
{
|
| 128 |
+
"cell_type": "code",
|
| 129 |
+
"execution_count": null,
|
| 130 |
+
"id": "60ab838c",
|
| 131 |
+
"metadata": {},
|
| 132 |
+
"outputs": [],
|
| 133 |
+
"source": [
|
| 134 |
+
"#--- Preprocess the Reviews Dataset ---\n",
|
| 135 |
+
"print(\"\\n--- Preprocessing started ---\")\n",
|
| 136 |
+
"explore_and_preprocess_reviews()\n",
|
| 137 |
+
"print(\"\\n--- Preprocessing finished ---\")"
|
| 138 |
+
]
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"cell_type": "markdown",
|
| 142 |
+
"id": "4c381d73",
|
| 143 |
+
"metadata": {},
|
| 144 |
+
"source": [
|
| 145 |
+
"## Define a base model (Multinomial Naive Bayes)"
|
| 146 |
+
]
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"cell_type": "code",
|
| 150 |
+
"execution_count": null,
|
| 151 |
+
"id": "b3cd2b5b",
|
| 152 |
+
"metadata": {},
|
| 153 |
+
"outputs": [],
|
| 154 |
+
"source": [
|
| 155 |
+
"def train_baseline_sentiment_model(data_path='data/reviews_processed.csv', grid_search=True, nb__alpha=0.1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2), sample_size: int = 50000):\n",
|
| 156 |
+
" \"\"\"\n",
|
| 157 |
+
" Trains and evaluates a Multinomial Naive Bayes model for sentiment analysis.\n",
|
| 158 |
+
" Can optionally perform a grid search.\n",
|
| 159 |
+
"\n",
|
| 160 |
+
" Args:\n",
|
| 161 |
+
" data_path (str): Path to the processed reviews CSV file.\n",
|
| 162 |
+
" grid_search (bool): If True, performs a grid search.\n",
|
| 163 |
+
" nb__alpha (float): Alpha for MultinomialNB.\n",
|
| 164 |
+
" tfidf__max_df (float): max_df for TfidfVectorizer.\n",
|
| 165 |
+
" tfidf__ngram_range (tuple): ngram_range for TfidfVectorizer.\n",
|
| 166 |
+
" sample_size (int, optional): Number of reviews to use. If None, uses all.\n",
|
| 167 |
+
" \"\"\"\n",
|
| 168 |
+
" # --- 1. Load Data ---\n",
|
| 169 |
+
" print(f\"Loading data from '{data_path}'...\")\n",
|
| 170 |
+
" if not os.path.exists(data_path):\n",
|
| 171 |
+
" print(f\"\\nERROR: '{data_path}' not found. Please run the EDA script first!\")\n",
|
| 172 |
+
" return\n",
|
| 173 |
+
" \n",
|
| 174 |
+
" df = pd.read_csv(data_path)\n",
|
| 175 |
+
" df.dropna(inplace=True)\n",
|
| 176 |
+
"\n",
|
| 177 |
+
" # --- 2. Sample Data ---\n",
|
| 178 |
+
" if sample_size:\n",
|
| 179 |
+
" print(f\"Using a sample of {sample_size} reviews for training the baseline model.\")\n",
|
| 180 |
+
" df = df.sample(n=sample_size, random_state=42)\n",
|
| 181 |
+
"\n",
|
| 182 |
+
" # --- 3. Train-Test Split ---\n",
|
| 183 |
+
" print(\"Splitting data into training and testing sets...\")\n",
|
| 184 |
+
" X_train, X_test, y_train, y_test = train_test_split(\n",
|
| 185 |
+
" df['full_text'],\n",
|
| 186 |
+
" df['sentiment'],\n",
|
| 187 |
+
" test_size=0.2,\n",
|
| 188 |
+
" random_state=42,\n",
|
| 189 |
+
" stratify=df['sentiment']\n",
|
| 190 |
+
" )\n",
|
| 191 |
+
"\n",
|
| 192 |
+
" # --- 4. Create a Pipeline ---\n",
|
| 193 |
+
" pipeline = Pipeline([\n",
|
| 194 |
+
" ('tfidf', TfidfVectorizer(stop_words='english')),\n",
|
| 195 |
+
" ('nb', MultinomialNB()),\n",
|
| 196 |
+
" ])\n",
|
| 197 |
+
"\n",
|
| 198 |
+
" best_params = None\n",
|
| 199 |
+
"\n",
|
| 200 |
+
" if grid_search:\n",
|
| 201 |
+
" # --- 5a. Perform Grid Search ---\n",
|
| 202 |
+
" print(\"Performing Grid Search to find the best hyperparameters...\")\n",
|
| 203 |
+
" parameters = {\n",
|
| 204 |
+
" 'tfidf__ngram_range': [(1, 1), (1, 2)],\n",
|
| 205 |
+
" 'tfidf__max_df': [0.5, 0.75, 1.0],\n",
|
| 206 |
+
" 'nb__alpha': [0.1, 0.5, 1.0],\n",
|
| 207 |
+
" }\n",
|
| 208 |
+
" param_grid = list(ParameterGrid(parameters))\n",
|
| 209 |
+
" best_score = -1\n",
|
| 210 |
+
"\n",
|
| 211 |
+
" for params in tqdm(param_grid, desc=\"Grid Search Progress\"):\n",
|
| 212 |
+
" pipeline.set_params(**params)\n",
|
| 213 |
+
" pipeline.fit(X_train, y_train)\n",
|
| 214 |
+
" score = pipeline.score(X_test, y_test)\n",
|
| 215 |
+
" if score > best_score:\n",
|
| 216 |
+
" best_score = score\n",
|
| 217 |
+
" best_params = params\n",
|
| 218 |
+
" \n",
|
| 219 |
+
" print(f\"\\nBest score on test set: {best_score:.4f}\")\n",
|
| 220 |
+
" print(\"Best parameters found:\")\n",
|
| 221 |
+
" print(best_params)\n",
|
| 222 |
+
"\n",
|
| 223 |
+
" else:\n",
|
| 224 |
+
" # --- 5b. Use provided hyperparameters ---\n",
|
| 225 |
+
" print(\"Skipping grid search and using provided hyperparameters...\")\n",
|
| 226 |
+
" best_params = {\n",
|
| 227 |
+
" 'nb__alpha': nb__alpha,\n",
|
| 228 |
+
" 'tfidf__max_df': tfidf__max_df,\n",
|
| 229 |
+
" 'tfidf__ngram_range': tfidf__ngram_range\n",
|
| 230 |
+
" }\n",
|
| 231 |
+
"\n",
|
| 232 |
+
" # --- 6. Train the Final Model ---\n",
|
| 233 |
+
" print(\"\\nTraining final model...\")\n",
|
| 234 |
+
" best_model = pipeline.set_params(**best_params)\n",
|
| 235 |
+
" best_model.fit(X_train, y_train)\n",
|
| 236 |
+
" print(\"Model training complete.\")\n",
|
| 237 |
+
"\n",
|
| 238 |
+
" # --- 7. Evaluate the Best Model ---\n",
|
| 239 |
+
" print(\"\\n--- Model Evaluation ---\")\n",
|
| 240 |
+
" y_pred = best_model.predict(X_test)\n",
|
| 241 |
+
" \n",
|
| 242 |
+
" accuracy = accuracy_score(y_test, y_pred)\n",
|
| 243 |
+
" target_names = ['Negative', 'Positive']\n",
|
| 244 |
+
" \n",
|
| 245 |
+
" print(f\"Accuracy: {accuracy:.4f}\")\n",
|
| 246 |
+
" print(\"\\nClassification Report:\")\n",
|
| 247 |
+
" print(classification_report(y_test, y_pred, target_names=target_names))\n",
|
| 248 |
+
" \n",
|
| 249 |
+
" print(\"Confusion Matrix:\")\n",
|
| 250 |
+
" cm = confusion_matrix(y_test, y_pred)\n",
|
| 251 |
+
" plt.figure(figsize=(8, 6))\n",
|
| 252 |
+
" sns.heatmap(cm, annot=True, fmt='d', cmap='Greens', \n",
|
| 253 |
+
" xticklabels=target_names, yticklabels=target_names)\n",
|
| 254 |
+
" plt.title('Confusion Matrix for Naive Bayes on Amazon Reviews')\n",
|
| 255 |
+
" plt.xlabel('Predicted Label')\n",
|
| 256 |
+
" plt.ylabel('True Label')\n",
|
| 257 |
+
" plt.show()"
|
| 258 |
+
]
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"cell_type": "code",
|
| 262 |
+
"execution_count": null,
|
| 263 |
+
"id": "093e6ae9",
|
| 264 |
+
"metadata": {},
|
| 265 |
+
"outputs": [],
|
| 266 |
+
"source": [
|
| 267 |
+
"#--- Train the base model ---\n",
|
| 268 |
+
"train_baseline_sentiment_model(sample_size=150000, grid_search=False)"
|
| 269 |
+
]
|
| 270 |
+
},
|
| 271 |
+
{
|
| 272 |
+
"cell_type": "markdown",
|
| 273 |
+
"id": "71f5e4ba",
|
| 274 |
+
"metadata": {},
|
| 275 |
+
"source": [
|
| 276 |
+
"## Define the dataset and lightning DataModule"
|
| 277 |
+
]
|
| 278 |
+
},
|
| 279 |
+
{
|
| 280 |
+
"cell_type": "code",
|
| 281 |
+
"execution_count": null,
|
| 282 |
+
"id": "c977e0f4",
|
| 283 |
+
"metadata": {},
|
| 284 |
+
"outputs": [],
|
| 285 |
+
"source": [
|
| 286 |
+
"class ReviewDataset(Dataset):\n",
|
| 287 |
+
" \"\"\"\n",
|
| 288 |
+
" Custom PyTorch Dataset for Amazon Reviews.\n",
|
| 289 |
+
"\n",
|
| 290 |
+
" This class takes a pandas DataFrame of review data, a tokenizer, and a max\n",
|
| 291 |
+
" token length, and prepares it for use in a PyTorch model. It handles the\n",
|
| 292 |
+
" tokenization of the text and the formatting of the labels for each item.\n",
|
| 293 |
+
"\n",
|
| 294 |
+
" Attributes:\n",
|
| 295 |
+
" tokenizer: The Hugging Face tokenizer to use for processing text.\n",
|
| 296 |
+
" data (pd.DataFrame): The DataFrame containing the review data.\n",
|
| 297 |
+
" max_token_len (int): The maximum sequence length for the tokenizer.\n",
|
| 298 |
+
" \"\"\"\n",
|
| 299 |
+
" def __init__(self, data: pd.DataFrame, tokenizer, max_token_len: int):\n",
|
| 300 |
+
" \"\"\"\n",
|
| 301 |
+
" Initializes the ReviewDataset.\n",
|
| 302 |
+
"\n",
|
| 303 |
+
" Args:\n",
|
| 304 |
+
" data (pd.DataFrame): The input DataFrame containing 'full_text' and\n",
|
| 305 |
+
" 'sentiment' columns.\n",
|
| 306 |
+
" tokenizer: The pre-trained tokenizer instance.\n",
|
| 307 |
+
" max_token_len (int): The maximum length for tokenized sequences.\n",
|
| 308 |
+
" \"\"\"\n",
|
| 309 |
+
" self.tokenizer = tokenizer\n",
|
| 310 |
+
" self.data = data\n",
|
| 311 |
+
" self.max_token_len = max_token_len\n",
|
| 312 |
+
"\n",
|
| 313 |
+
" def __len__(self):\n",
|
| 314 |
+
" \"\"\"\n",
|
| 315 |
+
" Returns the total number of samples in the dataset.\n",
|
| 316 |
+
" \"\"\"\n",
|
| 317 |
+
" return len(self.data)\n",
|
| 318 |
+
"\n",
|
| 319 |
+
" def __getitem__(self, index: int):\n",
|
| 320 |
+
" \"\"\"\n",
|
| 321 |
+
" Retrieves one sample from the dataset at the specified index.\n",
|
| 322 |
+
"\n",
|
| 323 |
+
" This method handles the tokenization of a single review text, including\n",
|
| 324 |
+
" padding and truncation, and formats the output into a dictionary of\n",
|
| 325 |
+
" tensors ready for the model.\n",
|
| 326 |
+
"\n",
|
| 327 |
+
" Args:\n",
|
| 328 |
+
" index (int): The index of the data sample to retrieve.\n",
|
| 329 |
+
"\n",
|
| 330 |
+
" Returns:\n",
|
| 331 |
+
" dict: A dictionary containing the tokenized inputs and the label,\n",
|
| 332 |
+
" with the following keys:\n",
|
| 333 |
+
" - 'input_ids': The token IDs of the review text.\n",
|
| 334 |
+
" - 'attention_mask': The attention mask for the review text.\n",
|
| 335 |
+
" - 'labels': The sentiment label as a tensor.\n",
|
| 336 |
+
" \"\"\"\n",
|
| 337 |
+
" data_row = self.data.iloc[index]\n",
|
| 338 |
+
" text = str(data_row.full_text)\n",
|
| 339 |
+
" labels = data_row.sentiment\n",
|
| 340 |
+
"\n",
|
| 341 |
+
" encoding = self.tokenizer.encode_plus(\n",
|
| 342 |
+
" text,\n",
|
| 343 |
+
" add_special_tokens=True,\n",
|
| 344 |
+
" max_length=self.max_token_len,\n",
|
| 345 |
+
" return_token_type_ids=False,\n",
|
| 346 |
+
" padding=\"max_length\",\n",
|
| 347 |
+
" truncation=True,\n",
|
| 348 |
+
" return_attention_mask=True,\n",
|
| 349 |
+
" return_tensors='pt',\n",
|
| 350 |
+
" )\n",
|
| 351 |
+
"\n",
|
| 352 |
+
" return dict(\n",
|
| 353 |
+
" input_ids=encoding[\"input_ids\"].flatten(),\n",
|
| 354 |
+
" attention_mask=encoding[\"attention_mask\"].flatten(),\n",
|
| 355 |
+
" labels=torch.tensor(labels, dtype=torch.long)\n",
|
| 356 |
+
" )\n",
|
| 357 |
+
"\n",
|
| 358 |
+
"class ReviewDataModule(pl.LightningDataModule):\n",
|
| 359 |
+
" \"\"\"\n",
|
| 360 |
+
" PyTorch Lightning DataModule to handle the Amazon Reviews dataset.\n",
|
| 361 |
+
"\n",
|
| 362 |
+
" This class encapsulates all the steps needed to process the data:\n",
|
| 363 |
+
" loading, splitting, and creating PyTorch DataLoaders for training,\n",
|
| 364 |
+
" validation, and testing. It allows for using a smaller random sample of the\n",
|
| 365 |
+
" full dataset for faster experimentation.\n",
|
| 366 |
+
"\n",
|
| 367 |
+
" Attributes:\n",
|
| 368 |
+
" data_path (str): Path to the processed CSV file.\n",
|
| 369 |
+
" batch_size (int): The size of each data batch.\n",
|
| 370 |
+
" max_token_len (int): The maximum sequence length for the tokenizer.\n",
|
| 371 |
+
" tokenizer: The Hugging Face tokenizer instance.\n",
|
| 372 |
+
" num_workers (int): The number of CPU cores to use for data loading.\n",
|
| 373 |
+
" sample_size (int, optional): The number of samples to use. If None,\n",
|
| 374 |
+
" the full dataset is used.\n",
|
| 375 |
+
" \"\"\"\n",
|
| 376 |
+
" def __init__(self, data_path: str, batch_size: int = 16, max_token_len: int = 256, model_name='distilbert-base-uncased', num_workers: int = 0, sample_size: int = None):\n",
|
| 377 |
+
" \"\"\"\n",
|
| 378 |
+
" Initializes the ReviewDataModule.\n",
|
| 379 |
+
"\n",
|
| 380 |
+
" Args:\n",
|
| 381 |
+
" data_path (str): The path to the processed CSV data file.\n",
|
| 382 |
+
" batch_size (int): The number of samples per batch.\n",
|
| 383 |
+
" max_token_len (int): Maximum length of tokenized sequences.\n",
|
| 384 |
+
" model_name (str): The name of the pre-trained model to use for the tokenizer.\n",
|
| 385 |
+
" num_workers (int): Number of subprocesses to use for data loading.\n",
|
| 386 |
+
" sample_size (int, optional): If specified, a random sample of this\n",
|
| 387 |
+
" size will be used from the dataset.\n",
|
| 388 |
+
" Defaults to None, which uses the full dataset.\n",
|
| 389 |
+
" \"\"\"\n",
|
| 390 |
+
" super().__init__()\n",
|
| 391 |
+
" self.data_path = data_path\n",
|
| 392 |
+
" self.batch_size = batch_size\n",
|
| 393 |
+
" self.max_token_len = max_token_len\n",
|
| 394 |
+
" self.tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
|
| 395 |
+
" self.num_workers = num_workers\n",
|
| 396 |
+
" self.sample_size = sample_size\n",
|
| 397 |
+
" self.train_df = None\n",
|
| 398 |
+
" self.val_df = None\n",
|
| 399 |
+
" self.test_df = None\n",
|
| 400 |
+
"\n",
|
| 401 |
+
" def setup(self, stage=None):\n",
|
| 402 |
+
" \"\"\"\n",
|
| 403 |
+
" Loads and splits the data for training, validation, and testing.\n",
|
| 404 |
+
"\n",
|
| 405 |
+
" This method is called by PyTorch Lightning. It reads the CSV, handles\n",
|
| 406 |
+
" missing values, optionally takes a random sample, and performs a\n",
|
| 407 |
+
" stratified train-validation-test split. The indices of the resulting\n",
|
| 408 |
+
" DataFrames are reset to prevent potential KeyErrors during data loading.\n",
|
| 409 |
+
" \"\"\"\n",
|
| 410 |
+
" df = pd.read_csv(self.data_path)\n",
|
| 411 |
+
" df.dropna(inplace=True)\n",
|
| 412 |
+
"\n",
|
| 413 |
+
" # If a sample size is provided, sample the dataframe\n",
|
| 414 |
+
" if self.sample_size:\n",
|
| 415 |
+
" print(f\"Using a sample of {self.sample_size} reviews.\")\n",
|
| 416 |
+
" df = df.sample(n=self.sample_size, random_state=42)\n",
|
| 417 |
+
"\n",
|
| 418 |
+
" # Stratified split to maintain label distribution\n",
|
| 419 |
+
" train_val_df, self.test_df = train_test_split(df, test_size=0.1, random_state=42, stratify=df.sentiment)\n",
|
| 420 |
+
" self.train_df, self.val_df = train_test_split(train_val_df, test_size=0.1, random_state=42, stratify=train_val_df.sentiment)\n",
|
| 421 |
+
"\n",
|
| 422 |
+
" # Reset indices to prevent KeyErrors\n",
|
| 423 |
+
" self.train_df = self.train_df.reset_index(drop=True)\n",
|
| 424 |
+
" self.val_df = self.val_df.reset_index(drop=True)\n",
|
| 425 |
+
" self.test_df = self.test_df.reset_index(drop=True)\n",
|
| 426 |
+
"\n",
|
| 427 |
+
" print(f\"Size of training set: {len(self.train_df)}\")\n",
|
| 428 |
+
" print(f\"Size of validation set: {len(self.val_df)}\")\n",
|
| 429 |
+
" print(f\"Size of test set: {len(self.test_df)}\")\n",
|
| 430 |
+
"\n",
|
| 431 |
+
" def train_dataloader(self):\n",
|
| 432 |
+
" \"\"\"Returns the DataLoader for the training set.\"\"\"\n",
|
| 433 |
+
" return DataLoader(\n",
|
| 434 |
+
" ReviewDataset(self.train_df, self.tokenizer, self.max_token_len),\n",
|
| 435 |
+
" batch_size=self.batch_size,\n",
|
| 436 |
+
" shuffle=True,\n",
|
| 437 |
+
" num_workers=self.num_workers\n",
|
| 438 |
+
" )\n",
|
| 439 |
+
"\n",
|
| 440 |
+
" def val_dataloader(self):\n",
|
| 441 |
+
" \"\"\"Returns the DataLoader for the validation set.\"\"\"\n",
|
| 442 |
+
" return DataLoader(\n",
|
| 443 |
+
" ReviewDataset(self.val_df, self.tokenizer, self.max_token_len),\n",
|
| 444 |
+
" batch_size=self.batch_size,\n",
|
| 445 |
+
" num_workers=self.num_workers\n",
|
| 446 |
+
" )\n",
|
| 447 |
+
"\n",
|
| 448 |
+
" def test_dataloader(self):\n",
|
| 449 |
+
" \"\"\"Returns the DataLoader for the test set.\"\"\"\n",
|
| 450 |
+
" return DataLoader(\n",
|
| 451 |
+
" ReviewDataset(self.test_df, self.tokenizer, self.max_token_len),\n",
|
| 452 |
+
" batch_size=self.batch_size,\n",
|
| 453 |
+
" num_workers=self.num_workers\n",
|
| 454 |
+
" )\n",
|
| 455 |
+
" "
|
| 456 |
+
]
|
| 457 |
+
},
|
| 458 |
+
{
|
| 459 |
+
"cell_type": "code",
|
| 460 |
+
"execution_count": null,
|
| 461 |
+
"id": "985ac47b",
|
| 462 |
+
"metadata": {},
|
| 463 |
+
"outputs": [],
|
| 464 |
+
"source": [
|
| 465 |
+
"# --- Configuration ---\n",
|
| 466 |
+
"data_path = \"data/reviews_processed.csv\"\n",
|
| 467 |
+
"BATCH_SIZE = 64\n",
|
| 468 |
+
"MAX_TOKEN_LEN = 256\n",
|
| 469 |
+
"\n",
|
| 470 |
+
"print(\"Initializing ReviewDataModule...\")\n",
|
| 471 |
+
"review_datamodule = ReviewDataModule(\n",
|
| 472 |
+
" data_path=data_path,\n",
|
| 473 |
+
" batch_size=BATCH_SIZE,\n",
|
| 474 |
+
" max_token_len=MAX_TOKEN_LEN,\n",
|
| 475 |
+
" model_name='distilbert-base-uncased',\n",
|
| 476 |
+
" sample_size=100000 # Pass the sample size to the datamodule\n",
|
| 477 |
+
")\n",
|
| 478 |
+
"review_datamodule.setup()\n",
|
| 479 |
+
"\n",
|
| 480 |
+
"# Fetch one batch from the training dataloader to inspect its contents\n",
|
| 481 |
+
"print(\"\\n--- Fetching one batch from the training dataloader ---\")\n",
|
| 482 |
+
"train_batch = next(iter(review_datamodule.train_dataloader()))\n",
|
| 483 |
+
"\n",
|
| 484 |
+
"print(\"\\n--- Example Batch ---\")\n",
|
| 485 |
+
"print(f\"Input IDs shape: {train_batch['input_ids'].shape}\")\n",
|
| 486 |
+
"print(f\"Attention Mask shape: {train_batch['attention_mask'].shape}\")\n",
|
| 487 |
+
"print(f\"Labels: {train_batch['labels']}\")\n",
|
| 488 |
+
"print(f\"Labels shape: {train_batch['labels'].shape}\")"
|
| 489 |
+
]
|
| 490 |
+
},
|
| 491 |
+
{
|
| 492 |
+
"cell_type": "markdown",
|
| 493 |
+
"id": "2c7781f4",
|
| 494 |
+
"metadata": {},
|
| 495 |
+
"source": [
|
| 496 |
+
"## Fine-tune DistilBERT"
|
| 497 |
+
]
|
| 498 |
+
},
|
| 499 |
+
{
|
| 500 |
+
"cell_type": "code",
|
| 501 |
+
"execution_count": null,
|
| 502 |
+
"id": "d046b940",
|
| 503 |
+
"metadata": {},
|
| 504 |
+
"outputs": [],
|
| 505 |
+
"source": [
|
| 506 |
+
"class SentimentClassifier(pl.LightningModule):\n",
|
| 507 |
+
" \"\"\"\n",
|
| 508 |
+
" PyTorch Lightning module for the sentiment classification model.\n",
|
| 509 |
+
" \"\"\"\n",
|
| 510 |
+
" def __init__(self, model_name='distilbert-base-uncased', n_classes=2, learning_rate=2e-5, n_warmup_steps=0, n_training_steps=0, dropout_prob=0.2): # Added dropout\n",
|
| 511 |
+
" super().__init__()\n",
|
| 512 |
+
" self.save_hyperparameters()\n",
|
| 513 |
+
"\n",
|
| 514 |
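+
" # Configure dropout. NOTE(review): DistilBertConfig names these fields `dropout` and `attention_dropout`, so the BERT-style attributes set below are likely ignored by the default distilbert-base-uncased model - confirm.\n",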
|
| 515 |
+
" config = AutoConfig.from_pretrained(model_name)\n",
|
| 516 |
+
" config.hidden_dropout_prob = dropout_prob\n",
|
| 517 |
+
" config.attention_probs_dropout_prob = dropout_prob\n",
|
| 518 |
+
" config.num_labels = n_classes\n",
|
| 519 |
+
"\n",
|
| 520 |
+
" self.model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)\n",
|
| 521 |
+
"\n",
|
| 522 |
+
" def forward(self, input_ids, attention_mask, labels=None):\n",
|
| 523 |
+
" return self.model(\n",
|
| 524 |
+
" input_ids=input_ids,\n",
|
| 525 |
+
" attention_mask=attention_mask,\n",
|
| 526 |
+
" labels=labels\n",
|
| 527 |
+
" )\n",
|
| 528 |
+
"\n",
|
| 529 |
+
" def training_step(self, batch, batch_idx):\n",
|
| 530 |
+
" output = self.forward(**batch)\n",
|
| 531 |
+
" self.log(\"train_loss\", output.loss, prog_bar=True, logger=True)\n",
|
| 532 |
+
" return output.loss\n",
|
| 533 |
+
"\n",
|
| 534 |
+
" def validation_step(self, batch, batch_idx):\n",
|
| 535 |
+
" output = self.forward(**batch)\n",
|
| 536 |
+
" preds = torch.argmax(output.logits, dim=1)\n",
|
| 537 |
+
" val_acc = accuracy(preds, batch['labels'], task='binary')\n",
|
| 538 |
+
" self.log(\"val_loss\", output.loss, prog_bar=True, logger=True)\n",
|
| 539 |
+
" self.log(\"val_accuracy\", val_acc, prog_bar=True, logger=True)\n",
|
| 540 |
+
" return output.loss\n",
|
| 541 |
+
"\n",
|
| 542 |
+
" def test_step(self, batch, batch_idx):\n",
|
| 543 |
+
" output = self.forward(**batch)\n",
|
| 544 |
+
" preds = torch.argmax(output.logits, dim=1)\n",
|
| 545 |
+
" test_acc = accuracy(preds, batch['labels'], task='binary')\n",
|
| 546 |
+
" self.log(\"test_accuracy\", test_acc)\n",
|
| 547 |
+
" return test_acc\n",
|
| 548 |
+
"\n",
|
| 549 |
+
" def predict_step(self, batch, batch_idx, dataloader_idx=0):\n",
|
| 550 |
+
" output = self.forward(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])\n",
|
| 551 |
+
" return torch.argmax(output.logits, dim=1)\n",
|
| 552 |
+
"\n",
|
| 553 |
+
" def configure_optimizers(self):\n",
|
| 554 |
+
" optimizer = AdamW(self.parameters(), lr=self.hparams.learning_rate, weight_decay=0.01)\n",
|
| 555 |
+
" scheduler = get_linear_schedule_with_warmup(\n",
|
| 556 |
+
" optimizer,\n",
|
| 557 |
+
" num_warmup_steps=self.hparams.n_warmup_steps,\n",
|
| 558 |
+
" num_training_steps=self.hparams.n_training_steps\n",
|
| 559 |
+
" )\n",
|
| 560 |
+
" return dict(optimizer=optimizer, lr_scheduler=dict(scheduler=scheduler, interval='step'))\n"
|
| 561 |
+
]
|
| 562 |
+
},
|
| 563 |
+
{
|
| 564 |
+
"cell_type": "code",
|
| 565 |
+
"execution_count": null,
|
| 566 |
+
"id": "b3a3708d",
|
| 567 |
+
"metadata": {},
|
| 568 |
+
"outputs": [],
|
| 569 |
+
"source": [
|
| 570 |
+
"def train_sentiment_model(data_path='data/reviews_processed.csv', model_name='distilbert-base-uncased', n_epochs=5, sample_size: int = None):\n",
|
| 571 |
+
" \"\"\"\n",
|
| 572 |
+
" Main function to train the sentiment analysis model on the Amazon Reviews dataset.\n",
|
| 573 |
+
"\n",
|
| 574 |
+
" Args:\n",
|
| 575 |
+
" data_path (str): Path to the processed data file.\n",
|
| 576 |
+
" model_name (str): Name of the transformer model to use.\n",
|
| 577 |
+
" n_epochs (int): Maximum number of epochs for training.\n",
|
| 578 |
+
" sample_size (int, optional): The number of reviews to use for training.\n",
|
| 579 |
+
" If None, the full dataset is used.\n",
|
| 580 |
+
" \"\"\"\n",
|
| 581 |
+
" # --- 1. Hyperparameters ---\n",
|
| 582 |
+
" BATCH_SIZE = 64\n",
|
| 583 |
+
" MAX_TOKEN_LEN = 256\n",
|
| 584 |
+
" LEARNING_RATE = 2e-5\n",
|
| 585 |
+
" N_CLASSES = 2 # Negative, Positive\n",
|
| 586 |
+
"\n",
|
| 587 |
+
" # --- 2. Initialize DataModule ---\n",
|
| 588 |
+
" print(\"Initializing ReviewDataModule...\")\n",
|
| 589 |
+
" review_datamodule = ReviewDataModule(\n",
|
| 590 |
+
" data_path=data_path,\n",
|
| 591 |
+
" batch_size=BATCH_SIZE,\n",
|
| 592 |
+
" max_token_len=MAX_TOKEN_LEN,\n",
|
| 593 |
+
" model_name=model_name,\n",
|
| 594 |
+
" sample_size=sample_size # Pass the sample size to the datamodule\n",
|
| 595 |
+
" )\n",
|
| 596 |
+
" review_datamodule.setup()\n",
|
| 597 |
+
"\n",
|
| 598 |
+
" n_training_steps = len(review_datamodule.train_dataloader()) * n_epochs\n",
|
| 599 |
+
" n_warmup_steps = int(n_training_steps * 0.1)\n",
|
| 600 |
+
"\n",
|
| 601 |
+
" # --- 3. Initialize Model ---\n",
|
| 602 |
+
" print(\"Initializing SentimentClassifier model...\")\n",
|
| 603 |
+
" model = SentimentClassifier(\n",
|
| 604 |
+
" model_name=model_name,\n",
|
| 605 |
+
" n_classes=N_CLASSES,\n",
|
| 606 |
+
" learning_rate=LEARNING_RATE,\n",
|
| 607 |
+
" n_warmup_steps=n_warmup_steps,\n",
|
| 608 |
+
" n_training_steps=n_training_steps\n",
|
| 609 |
+
" )\n",
|
| 610 |
+
"\n",
|
| 611 |
+
" # --- 4. Configure Training Callbacks ---\n",
|
| 612 |
+
" checkpoint_callback = ModelCheckpoint(\n",
|
| 613 |
+
" dirpath=\"checkpoints\",\n",
|
| 614 |
+
" filename=\"sentiment-binary-best-checkpoint\",\n",
|
| 615 |
+
" save_top_k=1,\n",
|
| 616 |
+
" verbose=True,\n",
|
| 617 |
+
" monitor=\"val_loss\",\n",
|
| 618 |
+
" mode=\"min\"\n",
|
| 619 |
+
" )\n",
|
| 620 |
+
" logger = TensorBoardLogger(\"lightning_logs\", name=\"sentiment-classifier-binary\")\n",
|
| 621 |
+
" early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)\n",
|
| 622 |
+
"\n",
|
| 623 |
+
" # --- 5. Initialize Trainer ---\n",
|
| 624 |
+
" print(\"Initializing PyTorch Lightning Trainer...\")\n",
|
| 625 |
+
" trainer = pl.Trainer(\n",
|
| 626 |
+
" logger=logger,\n",
|
| 627 |
+
" callbacks=[checkpoint_callback, early_stopping_callback],\n",
|
| 628 |
+
" max_epochs=n_epochs,\n",
|
| 629 |
+
" accelerator='gpu' if torch.cuda.is_available() else 'cpu',\n",
|
| 630 |
+
" devices=1,\n",
|
| 631 |
+
" )\n",
|
| 632 |
+
"\n",
|
| 633 |
+
" # --- 6. Start Training ---\n",
|
| 634 |
+
" print(f\"Starting training with {model_name} for up to {n_epochs} epochs...\")\n",
|
| 635 |
+
" trainer.fit(model, review_datamodule)\n",
|
| 636 |
+
"\n",
|
| 637 |
+
" # --- 7. Evaluate on Test Set and Generate Confusion Matrix ---\n",
|
| 638 |
+
" print(\"\\nTraining complete. Evaluating on the test set...\")\n",
|
| 639 |
+
" trainer.test(model, datamodule=review_datamodule)\n",
|
| 640 |
+
"\n",
|
| 641 |
+
" predictions = trainer.predict(model, datamodule=review_datamodule)\n",
|
| 642 |
+
" if predictions:\n",
|
| 643 |
+
" all_preds = torch.cat(predictions).cpu().numpy()\n",
|
| 644 |
+
" true_labels = review_datamodule.test_df.sentiment.to_numpy()\n",
|
| 645 |
+
" target_names = ['Negative', 'Positive'] # Updated labels\n",
|
| 646 |
+
"\n",
|
| 647 |
+
" cm = confusion_matrix(true_labels, all_preds)\n",
|
| 648 |
+
" plt.figure(figsize=(8, 6))\n",
|
| 649 |
+
" sns.heatmap(cm, annot=True, fmt='d', cmap='YlGnBu',\n",
|
| 650 |
+
" xticklabels=target_names, yticklabels=target_names)\n",
|
| 651 |
+
" plt.title('Confusion Matrix for Sentiment Analysis')\n",
|
| 652 |
+
" plt.xlabel('Predicted Label')\n",
|
| 653 |
+
" plt.ylabel('True Label')\n",
|
| 654 |
+
" plt.show()\n",
|
| 655 |
+
"\n"
|
| 656 |
+
]
|
| 657 |
+
},
|
| 658 |
+
{
|
| 659 |
+
"cell_type": "code",
|
| 660 |
+
"execution_count": null,
|
| 661 |
+
"id": "3dae58e3",
|
| 662 |
+
"metadata": {},
|
| 663 |
+
"outputs": [],
|
| 664 |
+
"source": [
|
| 665 |
+
"#--- Train DistilBert ---\n",
|
| 666 |
+
"train_sentiment_model(data_path=data_path, sample_size=100000)"
|
| 667 |
+
]
|
| 668 |
+
},
|
| 669 |
+
{
|
| 670 |
+
"cell_type": "markdown",
|
| 671 |
+
"id": "ddbc7315",
|
| 672 |
+
"metadata": {},
|
| 673 |
+
"source": [
|
| 674 |
+
"## Define the models"
|
| 675 |
+
]
|
| 676 |
+
},
|
| 677 |
+
{
|
| 678 |
+
"cell_type": "code",
|
| 679 |
+
"execution_count": null,
|
| 680 |
+
"id": "85bd352b",
|
| 681 |
+
"metadata": {},
|
| 682 |
+
"outputs": [],
|
| 683 |
+
"source": [
|
| 684 |
+
"class ReviewSummarizer:\n",
|
| 685 |
+
" \"\"\"\n",
|
| 686 |
+
" A class to handle the summarization of product reviews using a pre-trained T5 model.\n",
|
| 687 |
+
" \"\"\"\n",
|
| 688 |
+
" def __init__(self, model_name='t5-small'):\n",
|
| 689 |
+
" \"\"\"\n",
|
| 690 |
+
" Initializes the summarizer with a pre-trained T5 model and tokenizer.\n",
|
| 691 |
+
"\n",
|
| 692 |
+
" Args:\n",
|
| 693 |
+
" model_name (str): The name of the pre-trained T5 model to use.\n",
|
| 694 |
+
" \"\"\"\n",
|
| 695 |
+
" print(f\"Loading summarization model: {model_name}...\")\n",
|
| 696 |
+
" self.model_name = model_name\n",
|
| 697 |
+
" self.device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
|
| 698 |
+
"\n",
|
| 699 |
+
" # Load the tokenizer and model from Hugging Face\n",
|
| 700 |
+
" self.tokenizer = T5Tokenizer.from_pretrained(self.model_name)\n",
|
| 701 |
+
" self.model = T5ForConditionalGeneration.from_pretrained(self.model_name).to(self.device)\n",
|
| 702 |
+
" print(\"Summarization model loaded successfully.\")\n",
|
| 703 |
+
"\n",
|
| 704 |
+
" def summarize(self, text: str, max_length: int = 50, min_length: int = 10) -> str:\n",
|
| 705 |
+
" \"\"\"\n",
|
| 706 |
+
" Generates a summary for a given text.\n",
|
| 707 |
+
"\n",
|
| 708 |
+
" Args:\n",
|
| 709 |
+
" text (str): The review text to summarize.\n",
|
| 710 |
+
" max_length (int): The maximum length of the generated summary.\n",
|
| 711 |
+
" min_length (int): The minimum length of the generated summary.\n",
|
| 712 |
+
"\n",
|
| 713 |
+
" Returns:\n",
|
| 714 |
+
" str: The generated summary.\n",
|
| 715 |
+
" \"\"\"\n",
|
| 716 |
+
" if not text or not isinstance(text, str):\n",
|
| 717 |
+
" return \"\"\n",
|
| 718 |
+
"\n",
|
| 719 |
+
" # T5 models require a prefix for the task. For summarization, it's \"summarize: \"\n",
|
| 720 |
+
" preprocess_text = f\"summarize: {text.strip()}\"\n",
|
| 721 |
+
"\n",
|
| 722 |
+
" # Tokenize the input text\n",
|
| 723 |
+
" tokenized_text = self.tokenizer.encode(preprocess_text, return_tensors=\"pt\").to(self.device)\n",
|
| 724 |
+
"\n",
|
| 725 |
+
" # Generate the summary\n",
|
| 726 |
+
" summary_ids = self.model.generate(\n",
|
| 727 |
+
" tokenized_text,\n",
|
| 728 |
+
" max_length=max_length,\n",
|
| 729 |
+
" min_length=min_length,\n",
|
| 730 |
+
" length_penalty=2.0,\n",
|
| 731 |
+
" num_beams=4,\n",
|
| 732 |
+
" early_stopping=True\n",
|
| 733 |
+
" )\n",
|
| 734 |
+
"\n",
|
| 735 |
+
" # Decode the summary and return it\n",
|
| 736 |
+
" summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)\n",
|
| 737 |
+
" return summary\n",
|
| 738 |
+
"\n",
|
| 739 |
+
"class AspectAnalyzer:\n",
|
| 740 |
+
" \"\"\"\n",
|
| 741 |
+
" A class to handle Aspect-Based Sentiment Analysis (ABSA) using a pre-trained model.\n",
|
| 742 |
+
" \"\"\"\n",
|
| 743 |
+
" # Changed to a different, currently valid lightweight model for ABSA.\n",
|
| 744 |
+
" def __init__(self, model_name='yangheng/deberta-v3-base-absa-v1.1', force_cpu=False):\n",
|
| 745 |
+
" \"\"\"\n",
|
| 746 |
+
" Initializes the ABSA pipeline with a pre-trained model.\n",
|
| 747 |
+
"\n",
|
| 748 |
+
" Args:\n",
|
| 749 |
+
" model_name (str): The name of the pre-trained ABSA model.\n",
|
| 750 |
+
" force_cpu (bool): If True, forces the model to run on the CPU.\n",
|
| 751 |
+
" \"\"\"\n",
|
| 752 |
+
" print(f\"Loading Aspect-Based Sentiment Analysis model: {model_name}...\")\n",
|
| 753 |
+
" self.model_name = model_name\n",
|
| 754 |
+
"\n",
|
| 755 |
+
" if force_cpu:\n",
|
| 756 |
+
" self.device = -1 # Use -1 for CPU in pipeline\n",
|
| 757 |
+
" print(\"Forcing ABSA model to run on CPU.\")\n",
|
| 758 |
+
" else:\n",
|
| 759 |
+
" self.device = 0 if torch.cuda.is_available() else -1\n",
|
| 760 |
+
"\n",
|
| 761 |
+
" print(f\"Using device: {self.device} (0 for GPU, -1 for CPU)\")\n",
|
| 762 |
+
"\n",
|
| 763 |
+
" self.absa_pipeline = pipeline(\n",
|
| 764 |
+
" \"text-classification\",\n",
|
| 765 |
+
" model=self.model_name,\n",
|
| 766 |
+
" tokenizer=self.model_name,\n",
|
| 767 |
+
" device=self.device\n",
|
| 768 |
+
" )\n",
|
| 769 |
+
" print(\"ABSA model loaded successfully.\")\n",
|
| 770 |
+
"\n",
|
| 771 |
+
" def analyze(self, text: str, aspects: list) -> dict:\n",
|
| 772 |
+
" \"\"\"\n",
|
| 773 |
+
" Analyzes the sentiment towards a list of aspects within a given text.\n",
|
| 774 |
+
" \"\"\"\n",
|
| 775 |
+
" if not text or not isinstance(text, str) or not aspects:\n",
|
| 776 |
+
" return {}\n",
|
| 777 |
+
"\n",
|
| 778 |
+
" # The model expects the review and aspect separated by a special token.\n",
|
| 779 |
+
" # Note: Different ABSA models might expect different input formats.\n",
|
| 780 |
+
" # This format is common but may need adjustment for other models.\n",
|
| 781 |
+
" inputs = [f\"{text} [SEP] {aspect}\" for aspect in aspects]\n",
|
| 782 |
+
" results = self.absa_pipeline(inputs)\n",
|
| 783 |
+
"\n",
|
| 784 |
+
" # Process results into a user-friendly dictionary\n",
|
| 785 |
+
" aspect_sentiments = {}\n",
|
| 786 |
+
" for aspect, result in zip(aspects, results):\n",
|
| 787 |
+
" aspect_sentiments[aspect] = {'sentiment': result['label'], 'score': result['score']}\n",
|
| 788 |
+
"\n",
|
| 789 |
+
" return aspect_sentiments\n",
|
| 790 |
+
"\n",
|
| 791 |
+
"class FineTunedSentimentClassifier:\n",
|
| 792 |
+
" \"\"\"\n",
|
| 793 |
+
" This class handles loading the fine-tuned checkpoint and making predictions.\n",
|
| 794 |
+
" \"\"\"\n",
|
| 795 |
+
" def __init__(self, checkpoint_path, model_name='distilbert-base-uncased', force_cpu=False):\n",
|
| 796 |
+
" self.device = 'cpu' if force_cpu else ('cuda' if torch.cuda.is_available() else 'cpu')\n",
|
| 797 |
+
" print(f\"Loading fine-tuned sentiment model from checkpoint: {checkpoint_path}...\")\n",
|
| 798 |
+
" print(f\"Using device: {self.device}\")\n",
|
| 799 |
+
"\n",
|
| 800 |
+
" self.model = SentimentClassifier.load_from_checkpoint(checkpoint_path, map_location=self.device)\n",
|
| 801 |
+
" self.model.to(self.device)\n",
|
| 802 |
+
" self.model.eval() # Set model to evaluation mode\n",
|
| 803 |
+
"\n",
|
| 804 |
+
" self.tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
|
| 805 |
+
" self.labels = ['NEGATIVE', 'POSITIVE']\n",
|
| 806 |
+
" print(\"Fine-tuned sentiment model loaded successfully.\")\n",
|
| 807 |
+
"\n",
|
| 808 |
+
" def classify(self, text: str) -> dict:\n",
|
| 809 |
+
" encoding = self.tokenizer.encode_plus(\n",
|
| 810 |
+
" text, add_special_tokens=True, max_length=128,\n",
|
| 811 |
+
" return_token_type_ids=False, padding=\"max_length\",\n",
|
| 812 |
+
" truncation=True, return_attention_mask=True, return_tensors='pt',\n",
|
| 813 |
+
" )\n",
|
| 814 |
+
" input_ids = encoding[\"input_ids\"].to(self.device)\n",
|
| 815 |
+
" attention_mask = encoding[\"attention_mask\"].to(self.device)\n",
|
| 816 |
+
" with torch.no_grad():\n",
|
| 817 |
+
" outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)\n",
|
| 818 |
+
" logits = outputs.logits\n",
|
| 819 |
+
" probabilities = torch.softmax(logits, dim=1)\n",
|
| 820 |
+
" prediction_idx = torch.argmax(probabilities, dim=1).item()\n",
|
| 821 |
+
" return {'label': self.labels[prediction_idx], 'score': probabilities[0][prediction_idx].item()}\n",
|
| 822 |
+
"\n",
|
| 823 |
+
"class AspectExtractor:\n",
|
| 824 |
+
" \"\"\"\n",
|
| 825 |
+
" This class uses a Part-of-Speech (POS) tagging model to first extract all\n",
|
| 826 |
+
" potential aspect terms (nouns) from a review text. It then filters these\n",
|
| 827 |
+
" nouns against a pre-defined dictionary of valid aspects for a given\n",
|
| 828 |
+
" product category to return only the relevant features.\n",
|
| 829 |
+
" \"\"\"\n",
|
| 830 |
+
" def __init__(self, model_name=\"vblagoje/bert-english-uncased-finetuned-pos\", force_cpu=False):\n",
|
| 831 |
+
" self.model_name = model_name\n",
|
| 832 |
+
" self.device = 'cpu' if force_cpu else ('cuda' if torch.cuda.is_available() else 'cpu')\n",
|
| 833 |
+
" print(f\"Loading Part-of-Speech (POS) tagging model: {self.model_name}...\")\n",
|
| 834 |
+
" print(f\"Using device: {self.device}\")\n",
|
| 835 |
+
"\n",
|
| 836 |
+
" self.pipeline = pipeline(\n",
|
| 837 |
+
" \"token-classification\",\n",
|
| 838 |
+
" model=self.model_name,\n",
|
| 839 |
+
" device=-1 if self.device == 'cpu' else 0,\n",
|
| 840 |
+
" aggregation_strategy=\"simple\"\n",
|
| 841 |
+
" )\n",
|
| 842 |
+
" print(\"POS tagging model loaded successfully.\")\n",
|
| 843 |
+
"\n",
|
| 844 |
+
" def extract(self, text: str, aspect_dictionary: list) -> list:\n",
|
| 845 |
+
" \"\"\"\n",
|
| 846 |
+
" Extracts aspects from the given text that are present in the provided\n",
|
| 847 |
+
" aspect dictionary.\n",
|
| 848 |
+
"\n",
|
| 849 |
+
" Args:\n",
|
| 850 |
+
" text (str): The review text to analyze.\n",
|
| 851 |
+
" aspect_dictionary (list): A list of valid, known aspects for the\n",
|
| 852 |
+
" product category.\n",
|
| 853 |
+
"\n",
|
| 854 |
+
" Returns:\n",
|
| 855 |
+
" list: A list of aspects that were both found in the text and are\n",
|
| 856 |
+
" present in the aspect dictionary.\n",
|
| 857 |
+
" \"\"\"\n",
|
| 858 |
+
" if not text or not aspect_dictionary:\n",
|
| 859 |
+
" return []\n",
|
| 860 |
+
"\n",
|
| 861 |
+
" # 1. Extract all nouns from the text using the POS model\n",
|
| 862 |
+
" model_outputs = self.pipeline(text)\n",
|
| 863 |
+
" noun_tags = {'NOUN', 'PROPN'}\n",
|
| 864 |
+
" extracted_nouns = {\n",
|
| 865 |
+
" output['word'].lower() for output in model_outputs\n",
|
| 866 |
+
" if output['entity_group'] in noun_tags\n",
|
| 867 |
+
" }\n",
|
| 868 |
+
"\n",
|
| 869 |
+
" # 2. Filter the extracted nouns against the provided dictionary\n",
|
| 870 |
+
" # We find the intersection between the two sets.\n",
|
| 871 |
+
" valid_aspects = {aspect.lower() for aspect in aspect_dictionary}\n",
|
| 872 |
+
"\n",
|
| 873 |
+
" final_aspects = list(extracted_nouns.intersection(valid_aspects))\n",
|
| 874 |
+
"\n",
|
| 875 |
+
" return final_aspects\n",
|
| 876 |
+
" "
|
| 877 |
+
]
|
| 878 |
+
},
|
| 879 |
+
{
|
| 880 |
+
"cell_type": "code",
|
| 881 |
+
"execution_count": null,
|
| 882 |
+
"id": "6fc21c8b",
|
| 883 |
+
"metadata": {},
|
| 884 |
+
"outputs": [],
|
| 885 |
+
"source": [
|
| 886 |
+
"# --- Configuration ---\n",
|
| 887 |
+
"# --- IMPORTANT: UPDATE THIS PATH ---\n",
|
| 888 |
+
"# You need to provide the path to the best checkpoint file that was saved\n",
|
| 889 |
+
"# during the training of your sentiment model.\n",
|
| 890 |
+
"SENTIMENT_CHECKPOINT_PATH = \"checkpoints/sentiment-binary-best-checkpoint.ckpt\"\n",
|
| 891 |
+
"\n",
|
| 892 |
+
"# --- Pre-defined Aspect Dictionaries for Different Product Categories ---\n",
|
| 893 |
+
"ASPECT_DICTIONARIES = {\n",
|
| 894 |
+
" \"Phone\": ['camera', 'battery', 'battery life', 'screen', 'performance', 'price', 'design'],\n",
|
| 895 |
+
" \"Coffee Maker\": ['ease of use', 'design', 'noise level', 'coffee quality', 'brew time', 'cleaning'],\n",
|
| 896 |
+
" \"Book\": ['plot', 'characters', 'writing style', 'pacing', 'ending'],\n",
|
| 897 |
+
" \"Default\": ['quality', 'price', 'service', 'design', 'features'] # A fallback list\n",
|
| 898 |
+
"}\n",
|
| 899 |
+
"\n",
|
| 900 |
+
"def main():\n",
|
| 901 |
+
" \"\"\"\n",
|
| 902 |
+
" Main function to run the command-line review analysis tool.\n",
|
| 903 |
+
" \"\"\"\n",
|
| 904 |
+
" # --- 1. Load All Models ---\n",
|
| 905 |
+
" print(\"--- Initializing all models ---\")\n",
|
| 906 |
+
" sentiment_classifier, summarizer, aspect_analyzer, aspect_extractor = None, None, None, None\n",
|
| 907 |
+
" try:\n",
|
| 908 |
+
" summarizer = ReviewSummarizer()\n",
|
| 909 |
+
" aspect_analyzer = AspectAnalyzer(force_cpu=True)\n",
|
| 910 |
+
" aspect_extractor = AspectExtractor(force_cpu=True)\n",
|
| 911 |
+
"\n",
|
| 912 |
+
" if not os.path.exists(SENTIMENT_CHECKPOINT_PATH):\n",
|
| 913 |
+
" print(\"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\")\n",
|
| 914 |
+
" print(\"!!! WARNING: Sentiment checkpoint path not found or not set. !!!\")\n",
|
| 915 |
+
" print(f\"!!! Please update the 'SENTIMENT_CHECKPOINT_PATH' variable in main.py\")\n",
|
| 916 |
+
" print(\"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\")\n",
|
| 917 |
+
" else:\n",
|
| 918 |
+
" sentiment_classifier = FineTunedSentimentClassifier(\n",
|
| 919 |
+
" checkpoint_path=SENTIMENT_CHECKPOINT_PATH, force_cpu=True\n",
|
| 920 |
+
" )\n",
|
| 921 |
+
" print(\"\\n--- All models loaded successfully ---\\n\")\n",
|
| 922 |
+
" except Exception as e:\n",
|
| 923 |
+
" print(f\"An error occurred during model initialization: {e}\")\n",
|
| 924 |
+
" return\n",
|
| 925 |
+
"\n",
|
| 926 |
+
" # --- 2. Interactive Loop ---\n",
|
| 927 |
+
" while True:\n",
|
| 928 |
+
" print(\"\\n==================================================\")\n",
|
| 929 |
+
" print(\" Product Review Analysis Tool \")\n",
|
| 930 |
+
" print(\"==================================================\")\n",
|
| 931 |
+
"\n",
|
| 932 |
+
" # Get user input\n",
|
| 933 |
+
" review_text = input(\"Enter the product review text (or type 'quit' to exit):\\n> \")\n",
|
| 934 |
+
" if review_text.lower() == 'quit':\n",
|
| 935 |
+
" break\n",
|
| 936 |
+
"\n",
|
| 937 |
+
" print(\"\\nAvailable Product Categories:\")\n",
|
| 938 |
+
" for i, category in enumerate(ASPECT_DICTIONARIES.keys(), 1):\n",
|
| 939 |
+
" print(f\"{i}. {category}\")\n",
|
| 940 |
+
"\n",
|
| 941 |
+
" category_choice = input(f\"Select a product category (1-{len(ASPECT_DICTIONARIES)}):\\n> \")\n",
|
| 942 |
+
" try:\n",
|
| 943 |
+
" category_idx = int(category_choice) - 1\n",
|
| 944 |
+
" product_category = list(ASPECT_DICTIONARIES.keys())[category_idx]\n",
|
| 945 |
+
" except (ValueError, IndexError):\n",
|
| 946 |
+
" print(\"Invalid choice. Using 'Default' category.\")\n",
|
| 947 |
+
" product_category = \"Default\"\n",
|
| 948 |
+
"\n",
|
| 949 |
+
" # --- 3. Run Analysis ---\n",
|
| 950 |
+
" print(\"\\n--- Analyzing Review... ---\")\n",
|
| 951 |
+
"\n",
|
| 952 |
+
" # a. Overall Sentiment\n",
|
| 953 |
+
" sentiment_result = sentiment_classifier.classify(review_text)\n",
|
| 954 |
+
"\n",
|
| 955 |
+
" # b. Summary\n",
|
| 956 |
+
" summary_result = summarizer.summarize(review_text)\n",
|
| 957 |
+
"\n",
|
| 958 |
+
" # c. Aspect Extraction and Analysis\n",
|
| 959 |
+
" aspect_dictionary = ASPECT_DICTIONARIES.get(product_category)\n",
|
| 960 |
+
" extracted_aspects = aspect_extractor.extract(review_text, aspect_dictionary)\n",
|
| 961 |
+
" aspect_results = None\n",
|
| 962 |
+
" if extracted_aspects:\n",
|
| 963 |
+
" aspect_results = aspect_analyzer.analyze(review_text, extracted_aspects)\n",
|
| 964 |
+
"\n",
|
| 965 |
+
" # --- 4. Display Results ---\n",
|
| 966 |
+
" print(\"\\n-------------------- ANALYSIS RESULTS --------------------\")\n",
|
| 967 |
+
" print(f\"\\n[ Overall Sentiment ]\")\n",
|
| 968 |
+
" print(f\" - Sentiment: {sentiment_result['label']} (Score: {sentiment_result['score']:.2f})\")\n",
|
| 969 |
+
"\n",
|
| 970 |
+
" print(f\"\\n[ Generated Summary ]\")\n",
|
| 971 |
+
" print(f\" - {summary_result}\")\n",
|
| 972 |
+
"\n",
|
| 973 |
+
" print(f\"\\n[ Detected Aspect Sentiments ]\")\n",
|
| 974 |
+
" if aspect_results:\n",
|
| 975 |
+
" for aspect, result in aspect_results.items():\n",
|
| 976 |
+
" print(f\" - {aspect.title()}: {result['sentiment']} (Score: {result['score']:.2f})\")\n",
|
| 977 |
+
" else:\n",
|
| 978 |
+
" print(\" - No relevant aspects from the dictionary were detected in the review.\")\n",
|
| 979 |
+
" print(\"----------------------------------------------------------\")\n"
|
| 980 |
+
]
|
| 981 |
+
},
|
| 982 |
+
{
|
| 983 |
+
"cell_type": "code",
|
| 984 |
+
"execution_count": null,
|
| 985 |
+
"id": "71257428",
|
| 986 |
+
"metadata": {},
|
| 987 |
+
"outputs": [],
|
| 988 |
+
"source": [
|
| 989 |
+
"# --- Run the workflow ---\n",
|
| 990 |
+
"main()"
|
| 991 |
+
]
|
| 992 |
+
}
|
| 993 |
+
],
|
| 994 |
+
"metadata": {
|
| 995 |
+
"language_info": {
|
| 996 |
+
"name": "python"
|
| 997 |
+
}
|
| 998 |
+
},
|
| 999 |
+
"nbformat": 4,
|
| 1000 |
+
"nbformat_minor": 5
|
| 1001 |
+
}
|
requirements.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
langchain==0.3.27
|
| 2 |
langchain-community==0.3.31
|
| 3 |
gradio==5.49.1
|
|
@@ -13,4 +14,17 @@ datasets==4.0.0
|
|
| 13 |
numpy==2.0.2
|
| 14 |
accelerate==1.11.0
|
| 15 |
aiohttp==3.13.1
|
| 16 |
-
huggingface-hub==0.35.3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<<<<<<< HEAD
|
| 2 |
langchain==0.3.27
|
| 3 |
langchain-community==0.3.31
|
| 4 |
gradio==5.49.1
|
|
|
|
| 14 |
numpy==2.0.2
|
| 15 |
accelerate==1.11.0
|
| 16 |
aiohttp==3.13.1
|
| 17 |
+
huggingface-hub==0.35.3
|
| 18 |
+
=======
|
| 19 |
+
torch==2.8.0
|
| 20 |
+
transformers==4.56.1
|
| 21 |
+
pytorch-lightning==2.5.5
|
| 22 |
+
torchmetrics==1.8.2
|
| 23 |
+
sentencepiece==0.2.1
|
| 24 |
+
pandas==2.2.2
|
| 25 |
+
scikit-learn==1.6.1
|
| 26 |
+
gradio==5.44.1
|
| 27 |
+
matplotlib==3.10.0
|
| 28 |
+
seaborn==0.13.2
|
| 29 |
+
wordcloud==1.9.4
|
| 30 |
+
>>>>>>> e6de3c4338f79386345fa6e4bba5b0666ad808da
|
scripts/app.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
# app.py
|
| 2 |
|
| 3 |
import gradio as gr
|
|
@@ -277,4 +278,166 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 277 |
# --- Launch Command ---
|
| 278 |
if __name__ == "__main__":
|
| 279 |
chat_memory.clear() # Clear memory each time app starts
|
| 280 |
-
demo.launch(debug=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<<<<<<< HEAD
|
| 2 |
# app.py
|
| 3 |
|
| 4 |
import gradio as gr
|
|
|
|
| 278 |
# --- Launch Command ---
|
| 279 |
if __name__ == "__main__":
|
| 280 |
chat_memory.clear() # Clear memory each time app starts
|
| 281 |
+
demo.launch(debug=True)
|
| 282 |
+
=======
|
| 283 |
+
import gradio as gr
|
| 284 |
+
import os
|
| 285 |
+
import torch
|
| 286 |
+
import pandas as pd
|
| 287 |
+
import re
|
| 288 |
+
|
| 289 |
+
# --- IMPORTANT ---
|
| 290 |
+
# This script assumes you have a 'models.py' file in the same directory
|
| 291 |
+
# containing the definitions for all model and inference classes.
|
| 292 |
+
try:
    from models import (
        ReviewSummarizer,
        AspectAnalyzer,
        AspectExtractor,
        FineTunedSentimentClassifier,
    )
except ImportError as import_error:
    # Keep the underlying reason: an ImportError here can also come from a
    # missing dependency imported by models.py, not only from a missing file.
    # The exception variable is unbound after this block, so store the text.
    _MODELS_IMPORT_FAILURE = (
        f"'models.py' could not be imported, so this model is unavailable: {import_error}"
    )
    print("CRITICAL ERROR: Make sure 'models.py' exists and contains the required classes.")
    print(f"Underlying import error: {import_error}")

    # Define placeholder classes if imports fail, so Gradio can at least launch
    # with an error message.
    class _UnavailableModel:
        """Placeholder whose construction fails with the original import problem.

        Accepts any arguments so the failure reported to the caller is the
        import problem rather than an unrelated "takes no arguments" TypeError.
        """

        def __init__(self, *args, **kwargs):
            raise ImportError(_MODELS_IMPORT_FAILURE)

    class ReviewSummarizer(_UnavailableModel):
        """Placeholder for models.ReviewSummarizer."""

    class AspectAnalyzer(_UnavailableModel):
        """Placeholder for models.AspectAnalyzer."""

    class AspectExtractor(_UnavailableModel):
        """Placeholder for models.AspectExtractor."""

    class FineTunedSentimentClassifier(_UnavailableModel):
        """Placeholder for models.FineTunedSentimentClassifier."""
|
| 306 |
+
|
| 307 |
+
# --- Configuration ---
|
| 308 |
+
# --- IMPORTANT: UPDATE THIS PATH ---
|
| 309 |
+
# You need to provide the path to the best checkpoint file that was saved
|
| 310 |
+
# during the training of your sentiment model.
|
| 311 |
+
SENTIMENT_CHECKPOINT_PATH = "checkpoints/sentiment-binary-best-checkpoint.ckpt"  # <-- CHANGE THIS

# --- Pre-defined Aspect Dictionaries for Different Product Categories ---
# Maps a product category name to the aspect terms looked for in a review.
ASPECT_DICTIONARIES = {
    "Phone": ['camera', 'battery', 'battery life', 'screen', 'performance', 'price', 'design'],
    "Coffee Maker": ['ease of use', 'design', 'noise level', 'coffee quality', 'brew time', 'cleaning'],
    "Book": ['plot', 'characters', 'writing style', 'pacing', 'ending'],
    "Default": ['quality', 'price', 'service', 'design', 'features'],  # A fallback list
}


# --- 1. Load All Models (Global Objects) ---
def _load_model(description, factory):
    """Build one model, returning None after logging if construction fails.

    Args:
        description: Human-readable model name used in the log message.
        factory: Zero-argument callable that constructs the model.

    Returns:
        The constructed model, or None when the factory raised.
    """
    try:
        return factory()
    except Exception as e:
        # Deliberate best-effort: the app should still launch with the
        # features whose models did load.
        print(f"An error occurred while initializing the {description}: {e}")
        return None


print("--- Initializing all models for the Gradio App ---")

# Each model is loaded independently so that one failure does not leave the
# remaining models unset.
summarizer = _load_model("summarizer", lambda: ReviewSummarizer(force_cpu=True))
aspect_analyzer = _load_model("aspect analyzer", lambda: AspectAnalyzer(force_cpu=True))
aspect_extractor = _load_model("aspect extractor", lambda: AspectExtractor(force_cpu=True))

sentiment_classifier = None
if not os.path.exists(SENTIMENT_CHECKPOINT_PATH):
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print("!!! WARNING: Sentiment checkpoint path not found or not set.          !!!")
    print("!!! Please update the 'SENTIMENT_CHECKPOINT_PATH' variable in app.py")
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
else:
    sentiment_classifier = _load_model(
        "sentiment classifier",
        lambda: FineTunedSentimentClassifier(
            checkpoint_path=SENTIMENT_CHECKPOINT_PATH, force_cpu=True
        ),
    )

# Only claim success when every model is actually available.
if all(
    model is not None
    for model in (sentiment_classifier, summarizer, aspect_analyzer, aspect_extractor)
):
    print("\n--- All models loaded successfully ---\n")
else:
    print("\n--- Some models could not be loaded; the affected features are disabled ---\n")
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
# --- 2. Define the Core Analysis Function ---
|
| 345 |
+
def analyze_review(review_text, product_category):
    """Run sentiment, summary, and aspect analysis for a single review.

    Args:
        review_text: Review text entered by the user.
        product_category: Key into ``ASPECT_DICTIONARIES``; an unknown key
            falls back to the "Default" aspect list.

    Returns:
        A 3-tuple in the order of the outputs wired to the analyze button:
        the sentiment value (a ``{label: confidence}`` dict, or a plain
        error string), the summary text, and a pandas DataFrame of
        per-aspect sentiments (``None`` when no aspects were analysed).
    """
    # Treat whitespace-only input like empty input: there is nothing to analyse.
    if not review_text or not review_text.strip():
        # Error values are returned as plain strings: in a dict the value is
        # treated as a confidence, which is expected to be numeric.
        return "ERROR: Please enter a review.", "", None

    # --- a. Overall Sentiment Analysis ---
    if sentiment_classifier:
        sentiment_result = sentiment_classifier.classify(review_text)
        # Pass the confidence as a number rather than a formatted string.
        sentiment_output = {
            sentiment_result['label']: float(sentiment_result['score'])
        }
    else:
        sentiment_output = "ERROR: Fine-tuned model not loaded. Check path."

    # --- b. Review Summarization ---
    if summarizer:
        summary_output = summarizer.summarize(review_text)
    else:
        summary_output = "ERROR: Summarizer model not loaded."

    # --- c. Dynamic Aspect Extraction & Analysis ---
    aspect_df = None
    if aspect_extractor and aspect_analyzer:
        aspect_dictionary = ASPECT_DICTIONARIES.get(product_category, ASPECT_DICTIONARIES["Default"])
        extracted_aspects = aspect_extractor.extract(review_text, aspect_dictionary=aspect_dictionary)

        # Only run the aspect sentiment model when at least one aspect was found.
        if extracted_aspects:
            aspect_results = aspect_analyzer.analyze(review_text, extracted_aspects)
            aspect_df = pd.DataFrame([
                {'Aspect': aspect, 'Sentiment': result['sentiment'], 'Score': f"{result['score']:.2f}"}
                for aspect, result in aspect_results.items()
            ])

    return sentiment_output, summary_output, aspect_df
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
# --- 3. Build the Gradio Interface ---
|
| 381 |
+
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header and short usage description.
    intro_text = (
        "Enter a product review and select the product category. The tool will automatically "
        "detect relevant features and provide an overall sentiment score, a summary, and a "
        "breakdown of sentiment towards each feature."
    )
    gr.Markdown("# 🛍️ ReviewSense: Product Review Analysis Engine")
    gr.Markdown(intro_text)

    with gr.Row():
        # Left column: user inputs.
        with gr.Column(scale=2):
            review_input = gr.Textbox(
                lines=10,
                label="Enter Product Review Here",
                placeholder="e.g., The camera is amazing, but the battery life is terrible...",
            )
            category_input = gr.Dropdown(
                choices=list(ASPECT_DICTIONARIES),
                label="Select Product Category",
                value="Phone",
            )
            analyze_button = gr.Button("Analyze Review", variant="primary")

        # Right column: analysis results.
        with gr.Column(scale=1):
            gr.Markdown("### Overall Sentiment")
            sentiment_output = gr.Label()

            gr.Markdown("### Generated Summary")
            summary_output = gr.Textbox(lines=5, label="Summary", interactive=False)

            gr.Markdown("### Detected Aspect Sentiments")
            aspect_output = gr.DataFrame(
                headers=["Aspect", "Sentiment", "Score"],
                label="Aspects",
                interactive=False,
            )

    # Run the analysis when the button is pressed; output order matches the
    # tuple returned by analyze_review.
    analyze_button.click(
        fn=analyze_review,
        inputs=[review_input, category_input],
        outputs=[sentiment_output, summary_output, aspect_output],
    )

    # One sample (review, category) pair per specific product category.
    example_reviews = [
        [
            "The camera on this phone is incredible, the pictures are professional quality. However, the battery life is a total disaster, it barely lasts half a day with light use. The screen is bright and responsive, which I love.",
            "Phone",
        ],
        [
            "I am absolutely in love with this coffee maker! It's incredibly easy to use, brews a perfect cup every single time, and the design looks fantastic on my countertop. It's also surprisingly quiet.",
            "Coffee Maker",
        ],
        [
            "An amazing story with characters that felt so real. The plot had me hooked from the first page, though I felt the ending was a bit rushed.",
            "Book",
        ],
    ]
    gr.Examples(examples=example_reviews, inputs=[review_input, category_input])


# --- 4. Launch the App ---
if __name__ == "__main__":
    print("Launching Gradio App...")
    demo.launch()
|
| 443 |
+
>>>>>>> e6de3c4338f79386345fa6e4bba5b0666ad808da
|
scripts/data_prepare.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytorch_lightning as pl
|
| 2 |
+
from torch.utils.data import DataLoader, Dataset
|
| 3 |
+
from transformers import AutoTokenizer
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from sklearn.model_selection import train_test_split
|
| 6 |
+
import torch
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
def explore_and_preprocess_reviews(
    train_path='data/train.csv',
    test_path='data/test.csv',
    output_dir='data'
):
    """
    Build the processed review CSV from the raw Amazon review train/test splits.

    The raw files come from the Kaggle "Amazon Reviews" dataset
    (https://www.kaggle.com/datasets/kritanjalijain/amazon-reviews); the
    train/test CSVs have to be extracted from the zip into the data folder
    beforehand. Both files are read without a header row, concatenated,
    stripped of rows with missing values, relabelled and written to
    ``<output_dir>/reviews_processed.csv``.

    Args:
        train_path (str): Path to the training CSV file.
        test_path (str): Path to the testing CSV file.
        output_dir (str): Directory to save the processed file.

    Returns:
        None. If either input file is missing, an error is printed and
        nothing is written.
    """
    # The raw CSVs ship without a header row, so column names are assigned here:
    #   sentiment_orig -> 1 = Negative, 2 = Positive
    #   title          -> review title
    #   review         -> review body
    raw_columns = ['sentiment_orig', 'title', 'review']

    print(f"Loading data from '{train_path}' and '{test_path}'...")
    try:
        # Train and test are merged so both get identical preprocessing.
        frames = [
            pd.read_csv(csv_path, header=None, names=raw_columns)
            for csv_path in (train_path, test_path)
        ]
        reviews = pd.concat(frames, ignore_index=True)
    except FileNotFoundError:
        print(f"\nERROR: Make sure '{train_path}' and '{test_path}' are in the specified directory.")
        print("This script is designed for the 'Amazon Reviews for Sentiment Analysis' dataset from Kaggle.")
        return

    # Rows with any missing field (label, title or body) are discarded.
    reviews = reviews.dropna()

    print("\n--- Preprocessing Data for Sentiment Analysis ---")

    processed_df = pd.DataFrame({
        # Title and body are joined into a single text field.
        'full_text': reviews['title'].astype(str) + ". " + reviews['review'].astype(str),
        # Binary labels: original value 1 becomes 0 (negative), anything else becomes 1.
        'sentiment': (reviews['sentiment_orig'] != 1).astype(int),
    })

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'reviews_processed.csv')
    processed_df.to_csv(output_path, index=False)
    print(f"\nSaved {len(processed_df)} processed reviews to '{output_path}'")
|
| 63 |
+
|
| 64 |
+
class ReviewDataset(Dataset):
    """
    Map-style PyTorch dataset over a DataFrame of reviews.

    Each item is produced lazily: the row's ``full_text`` is tokenized on
    access and returned together with its ``sentiment`` label.

    Attributes:
        tokenizer: The Hugging Face tokenizer used to encode the text.
        data (pd.DataFrame): The DataFrame holding the review rows.
        max_token_len (int): Length every encoded sequence is padded/truncated to.
    """
    def __init__(self, data: pd.DataFrame, tokenizer, max_token_len: int):
        """
        Store the data source and tokenization settings.

        Args:
            data (pd.DataFrame): Frame with 'full_text' and 'sentiment' columns.
            tokenizer: The pre-trained tokenizer instance.
            max_token_len (int): The maximum length for tokenized sequences.
        """
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len

    def __len__(self):
        """Return the number of rows (samples) in the dataset."""
        return len(self.data)

    def __getitem__(self, index: int):
        """
        Tokenize and return the sample at position ``index``.

        Args:
            index (int): Positional index of the row to fetch.

        Returns:
            dict: Tensors ready for the model:
                - 'input_ids': flattened token IDs of the review text.
                - 'attention_mask': flattened attention mask.
                - 'labels': the sentiment label as a ``torch.long`` tensor.
        """
        row = self.data.iloc[index]
        review_text = str(row.full_text)
        sentiment = row.sentiment

        tokenizer_options = {
            "add_special_tokens": True,
            "max_length": self.max_token_len,
            "return_token_type_ids": False,
            "padding": "max_length",
            "truncation": True,
            "return_attention_mask": True,
            "return_tensors": 'pt',
        }
        encoded = self.tokenizer.encode_plus(review_text, **tokenizer_options)

        # The tokenizer returns a leading batch axis; flatten it away so the
        # DataLoader can stack samples itself.
        return {
            "input_ids": encoded["input_ids"].flatten(),
            "attention_mask": encoded["attention_mask"].flatten(),
            "labels": torch.tensor(sentiment, dtype=torch.long),
        }
|
| 135 |
+
|
| 136 |
+
class ReviewDataModule(pl.LightningDataModule):
    """
    PyTorch Lightning DataModule to handle the Amazon Reviews dataset.

    This class encapsulates all the steps needed to process the data:
    loading, splitting, and creating PyTorch DataLoaders for training,
    validation, and testing. It allows for using a smaller random sample of the
    full dataset for faster experimentation.

    Attributes:
        data_path (str): Path to the processed CSV file.
        batch_size (int): The size of each data batch.
        max_token_len (int): The maximum sequence length for the tokenizer.
        tokenizer: The Hugging Face tokenizer instance.
        num_workers (int): Number of subprocesses used for data loading.
        sample_size (int, optional): The number of samples to use. If None,
            the full dataset is used.
        train_df, val_df, test_df (pd.DataFrame): The splits; None until
            ``setup`` has run.
    """
    def __init__(self, data_path: str, batch_size: int = 16, max_token_len: int = 256, model_name='distilbert-base-uncased', num_workers: int = 0, sample_size: int = None):
        """
        Initializes the ReviewDataModule.

        Args:
            data_path (str): The path to the processed CSV data file.
            batch_size (int): The number of samples per batch.
            max_token_len (int): Maximum length of tokenized sequences.
            model_name (str): The name of the pre-trained model to use for the tokenizer.
            num_workers (int): Number of subprocesses to use for data loading.
            sample_size (int, optional): If specified, a random sample of this
                size will be used from the dataset. Defaults to None, which
                uses the full dataset.
        """
        super().__init__()
        self.data_path = data_path
        self.batch_size = batch_size
        self.max_token_len = max_token_len
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.num_workers = num_workers
        self.sample_size = sample_size
        self.train_df = None
        self.val_df = None
        self.test_df = None

    def setup(self, stage=None):
        """
        Loads and splits the data for training, validation, and testing.

        Reads the CSV, drops rows with missing values, optionally takes a
        random sample, and performs a stratified 90/10 split into
        train+validation and test, followed by a stratified 90/10 split of
        the former into train and validation. The indices of the resulting
        DataFrames are reset.

        Args:
            stage: Accepted for the Lightning hook signature; not used here.
        """
        df = pd.read_csv(self.data_path)
        df.dropna(inplace=True)

        # If a sample size is provided, sample the dataframe. The request is
        # capped at the number of available rows because DataFrame.sample
        # raises when asked for more rows than exist (without replacement).
        if self.sample_size:
            n_samples = min(self.sample_size, len(df))
            print(f"Using a sample of {n_samples} reviews.")
            df = df.sample(n=n_samples, random_state=42)

        # Stratified split to maintain label distribution
        train_val_df, self.test_df = train_test_split(df, test_size=0.1, random_state=42, stratify=df.sentiment)
        self.train_df, self.val_df = train_test_split(train_val_df, test_size=0.1, random_state=42, stratify=train_val_df.sentiment)

        # Reset indices so each split is numbered 0..len-1
        self.train_df = self.train_df.reset_index(drop=True)
        self.val_df = self.val_df.reset_index(drop=True)
        self.test_df = self.test_df.reset_index(drop=True)

        print(f"Size of training set: {len(self.train_df)}")
        print(f"Size of validation set: {len(self.val_df)}")
        print(f"Size of test set: {len(self.test_df)}")

    def _build_dataloader(self, frame, shuffle=False):
        """Wrap one split in a ReviewDataset and DataLoader with the shared settings."""
        return DataLoader(
            ReviewDataset(frame, self.tokenizer, self.max_token_len),
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers
        )

    def train_dataloader(self):
        """Returns the DataLoader for the training set (shuffled)."""
        return self._build_dataloader(self.train_df, shuffle=True)

    def val_dataloader(self):
        """Returns the DataLoader for the validation set."""
        # Previously read the misspelled attribute `max_token__len`, which
        # raised AttributeError as soon as validation started.
        return self._build_dataloader(self.val_df)

    def test_dataloader(self):
        """Returns the DataLoader for the test set."""
        return self._build_dataloader(self.test_df)
|
| 233 |
+
|
| 234 |
+
# Smoke-test entry point: preprocess the raw CSVs, build the DataModule on a
# sample of the processed file, and print the shapes of one training batch.
if __name__ == "__main__":

    #--- Step 1: Preprocess the Reviews Dataset ---
    # Uses the function's default paths (data/train.csv, data/test.csv -> data/).
    print("\n--- Preprocessing started ---")
    explore_and_preprocess_reviews()
    print("\n--- Preprocessing finished ---")
    # --- Configuration ---
    # Must point at the file written by explore_and_preprocess_reviews().
    data_path = "data/reviews_processed.csv"
    BATCH_SIZE = 64
    MAX_TOKEN_LEN = 256

    print("Initializing ReviewDataModule...")
    review_datamodule = ReviewDataModule(
        data_path=data_path,
        batch_size=BATCH_SIZE,
        max_token_len=MAX_TOKEN_LEN,
        model_name='distilbert-base-uncased',
        sample_size=100000 # Pass the sample size to the datamodule
    )
    # setup() is invoked manually because no Lightning Trainer is driving this script.
    review_datamodule.setup()

    # Fetch one batch from the training dataloader to inspect its contents
    print("\n--- Fetching one batch from the training dataloader ---")
    train_batch = next(iter(review_datamodule.train_dataloader()))

    print("\n--- Example Batch ---")
    print(f"Input IDs shape: {train_batch['input_ids'].shape}")
    print(f"Attention Mask shape: {train_batch['attention_mask'].shape}")
    print(f"Labels: {train_batch['labels']}")
    print(f"Labels shape: {train_batch['labels'].shape}")
|
scripts/main.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
# main.py
|
| 2 |
|
| 3 |
import torch
|
|
@@ -209,4 +210,115 @@ if __name__ == "__main__":
|
|
| 209 |
break
|
| 210 |
print("\n--- Chat session ended. ---")
|
| 211 |
|
| 212 |
-
print("\n--- Local Execution Finished ---")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<<<<<<< HEAD
|
| 2 |
# main.py
|
| 3 |
|
| 4 |
import torch
|
|
|
|
| 210 |
break
|
| 211 |
print("\n--- Chat session ended. ---")
|
| 212 |
|
| 213 |
+
print("\n--- Local Execution Finished ---")
|
| 214 |
+
=======
|
| 215 |
+
import os
|
| 216 |
+
import torch
|
| 217 |
+
import pandas as pd
|
| 218 |
+
|
| 219 |
+
# Import the project-local modules; abort with a readable message if they (or
# anything they import in turn) cannot be loaded.
try:
    from data_prepare import ReviewDataset, ReviewDataModule
    from models import SentimentClassifier, ReviewSummarizer, AspectAnalyzer, FineTunedSentimentClassifier, AspectExtractor
except ImportError as import_error:
    # The message names the modules that are actually imported above and
    # includes the underlying error, since an ImportError here can also come
    # from a missing dependency inside those modules.
    print(
        "CRITICAL ERROR: Could not import the project modules "
        f"({import_error}). Make sure 'data_prepare.py' and 'models.py' are in "
        "the same directory and that their dependencies are installed."
    )
    # Exit with a non-zero status so callers can detect the failure.
    raise SystemExit(1)
|
| 225 |
+
|
| 226 |
+
# --- Configuration ---
|
| 227 |
+
# --- IMPORTANT: UPDATE THIS PATH ---
|
| 228 |
+
# You need to provide the path to the best checkpoint file that was saved
|
| 229 |
+
# during the training of your sentiment model.
|
| 230 |
+
SENTIMENT_CHECKPOINT_PATH = "checkpoints/sentiment-binary-best-checkpoint.ckpt"
|
| 231 |
+
|
| 232 |
+
# --- Pre-defined Aspect Dictionaries for Different Product Categories ---
|
| 233 |
+
ASPECT_DICTIONARIES = {
|
| 234 |
+
"Phone": ['camera', 'battery', 'battery life', 'screen', 'performance', 'price', 'design'],
|
| 235 |
+
"Coffee Maker": ['ease of use', 'design', 'noise level', 'coffee quality', 'brew time', 'cleaning'],
|
| 236 |
+
"Book": ['plot', 'characters', 'writing style', 'pacing', 'ending'],
|
| 237 |
+
"Default": ['quality', 'price', 'service', 'design', 'features'] # A fallback list
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
def main():
    """
    Main function to run the command-line review analysis tool.

    Loads the summarizer, aspect analyzer and aspect extractor, plus the
    fine-tuned sentiment classifier when the checkpoint file at
    SENTIMENT_CHECKPOINT_PATH exists. It then repeatedly reads a review and a
    product category from standard input and prints the overall sentiment,
    a generated summary and per-aspect sentiments, until the user types
    'quit'.

    If the sentiment checkpoint is missing, the tool still runs; the overall
    sentiment section is reported as unavailable instead of crashing.
    """
    # --- 1. Load All Models ---
    print("--- Initializing all models ---")
    sentiment_classifier = None
    try:
        summarizer = ReviewSummarizer(force_cpu=True)
        aspect_analyzer = AspectAnalyzer(force_cpu=True)
        aspect_extractor = AspectExtractor(force_cpu=True)

        if not os.path.exists(SENTIMENT_CHECKPOINT_PATH):
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print("!!! WARNING: Sentiment checkpoint path not found or not set.         !!!")
            print("!!! Please update the 'SENTIMENT_CHECKPOINT_PATH' variable in main.py")
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print("\n--- Models loaded; overall sentiment is disabled ---\n")
        else:
            sentiment_classifier = FineTunedSentimentClassifier(
                checkpoint_path=SENTIMENT_CHECKPOINT_PATH, force_cpu=True
            )
            print("\n--- All models loaded successfully ---\n")
    except Exception as e:
        # Top-level boundary: any loading failure ends the tool with a message.
        print(f"An error occurred during model initialization: {e}")
        return

    categories = list(ASPECT_DICTIONARIES)

    # --- 2. Interactive Loop ---
    while True:
        print("\n==================================================")
        print("          Product Review Analysis Tool          ")
        print("==================================================")

        # Get user input
        review_text = input("Enter the product review text (or type 'quit' to exit):\n> ")
        if review_text.strip().lower() == 'quit':
            break
        if not review_text.strip():
            print("No review text entered. Please try again.")
            continue

        print("\nAvailable Product Categories:")
        for i, category in enumerate(categories, 1):
            print(f"{i}. {category}")

        category_choice = input(f"Select a product category (1-{len(categories)}):\n> ")
        try:
            category_idx = int(category_choice) - 1
        except ValueError:
            category_idx = -1
        # Explicit range check: plain list indexing would accept 0 or negative
        # input through Python's negative indexing and pick a wrong category.
        if 0 <= category_idx < len(categories):
            product_category = categories[category_idx]
        else:
            print("Invalid choice. Using 'Default' category.")
            product_category = "Default"

        # --- 3. Run Analysis ---
        print("\n--- Analyzing Review... ---")

        # a. Overall Sentiment (skipped when the checkpoint was not loaded)
        sentiment_result = None
        if sentiment_classifier is not None:
            sentiment_result = sentiment_classifier.classify(review_text)

        # b. Summary
        summary_result = summarizer.summarize(review_text)

        # c. Aspect Extraction and Analysis
        aspect_dictionary = ASPECT_DICTIONARIES.get(product_category)
        extracted_aspects = aspect_extractor.extract(review_text, aspect_dictionary)
        aspect_results = None
        if extracted_aspects:
            aspect_results = aspect_analyzer.analyze(review_text, extracted_aspects)

        # --- 4. Display Results ---
        print("\n-------------------- ANALYSIS RESULTS --------------------")
        print("\n[ Overall Sentiment ]")
        if sentiment_result is not None:
            print(f"  - Sentiment: {sentiment_result['label']} (Score: {sentiment_result['score']:.2f})")
        else:
            print("  - Unavailable: the sentiment checkpoint was not loaded.")

        print("\n[ Generated Summary ]")
        print(f"  - {summary_result}")

        print("\n[ Detected Aspect Sentiments ]")
        if aspect_results:
            for aspect, result in aspect_results.items():
                print(f"  - {aspect.title()}: {result['sentiment']} (Score: {result['score']:.2f})")
        else:
            print("  - No relevant aspects from the dictionary were detected in the review.")
        print("----------------------------------------------------------")
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
if __name__ == "__main__":
|
| 323 |
+
main()
|
| 324 |
+
>>>>>>> e6de3c4338f79386345fa6e4bba5b0666ad808da
|
scripts/models.py
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytorch_lightning as pl
|
| 2 |
+
from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup, AutoConfig
|
| 3 |
+
from torch.optim import AdamW
|
| 4 |
+
import torch
|
| 5 |
+
from torchmetrics.functional import accuracy
|
| 6 |
+
from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer, pipeline
|
| 7 |
+
|
| 8 |
+
class SentimentClassifier(pl.LightningModule):
    """
    PyTorch Lightning module for the sentiment classification model.

    Wraps a Hugging Face ``AutoModelForSequenceClassification`` and defines
    the training/validation/test/predict steps plus the optimizer setup.

    Args:
        model_name (str): Name of the pre-trained model to fine-tune.
        n_classes (int): Number of output labels.
        learning_rate (float): Learning rate passed to AdamW.
        n_warmup_steps (int): Warm-up steps for the linear schedule.
        n_training_steps (int): Total training steps for the linear schedule.
            When this is 0 or less, no scheduler is attached.
        dropout_prob (float): Dropout probability written into the model config.
    """
    def __init__(self, model_name='distilbert-base-uncased', n_classes=2, learning_rate=2e-5, n_warmup_steps=0, n_training_steps=0, dropout_prob=0.2):
        super().__init__()
        self.save_hyperparameters()

        # Configure dropout
        config = AutoConfig.from_pretrained(model_name)
        # BERT-style configs read these two fields.
        config.hidden_dropout_prob = dropout_prob
        config.attention_probs_dropout_prob = dropout_prob
        # DistilBERT-style configs (including the default model) name the same
        # settings `dropout` / `attention_dropout`; without this the
        # dropout_prob argument had no effect on them.
        for dropout_attr in ("dropout", "attention_dropout"):
            if hasattr(config, dropout_attr):
                setattr(config, dropout_attr, dropout_prob)
        config.num_labels = n_classes

        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model; passing ``labels`` makes the output carry a loss."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

    def _batch_accuracy(self, logits, labels):
        """Accuracy of the argmax predictions, using the task matching n_classes."""
        preds = torch.argmax(logits, dim=1)
        n_classes = self.hparams.n_classes
        if n_classes == 2:
            return accuracy(preds, labels, task='binary')
        return accuracy(preds, labels, task='multiclass', num_classes=n_classes)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        output = self.forward(**batch)
        self.log("train_loss", output.loss, prog_bar=True, logger=True)
        return output.loss

    def validation_step(self, batch, batch_idx):
        """Log validation loss and accuracy for one batch; returns the loss."""
        output = self.forward(**batch)
        val_acc = self._batch_accuracy(output.logits, batch['labels'])
        self.log("val_loss", output.loss, prog_bar=True, logger=True)
        self.log("val_accuracy", val_acc, prog_bar=True, logger=True)
        return output.loss

    def test_step(self, batch, batch_idx):
        """Log and return the test accuracy for one batch."""
        output = self.forward(**batch)
        test_acc = self._batch_accuracy(output.logits, batch['labels'])
        self.log("test_accuracy", test_acc)
        return test_acc

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Return the predicted class index for every sample in the batch."""
        output = self.forward(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        return torch.argmax(output.logits, dim=1)

    def configure_optimizers(self):
        """
        Build AdamW and, when a training-step budget is given, a linear
        warm-up/decay schedule stepped once per optimizer step.
        """
        optimizer = AdamW(self.parameters(), lr=self.hparams.learning_rate, weight_decay=0.01)
        # With a non-positive step budget (the constructor default) the linear
        # schedule would scale the learning rate to zero from the first step,
        # so the optimizer is returned on its own in that case.
        if self.hparams.n_training_steps <= 0:
            return optimizer
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.n_warmup_steps,
            num_training_steps=self.hparams.n_training_steps
        )
        return dict(optimizer=optimizer, lr_scheduler=dict(scheduler=scheduler, interval='step'))
|
| 63 |
+
|
| 64 |
+
class ReviewSummarizer:
    """
    A class to handle the summarization of product reviews using a pre-trained T5 model.
    """
    def __init__(self, model_name='t5-small', force_cpu=False):
        """
        Initializes the summarizer with a pre-trained T5 model and tokenizer.

        Args:
            model_name (str): The name of the pre-trained T5 model to use.
            force_cpu (bool): If True, the model is placed on the CPU even
                when CUDA is available. Mirrors the option offered by the
                other model wrappers in this module.
        """
        print(f"Loading summarization model: {model_name}...")
        self.model_name = model_name
        self.device = 'cpu' if force_cpu else ('cuda' if torch.cuda.is_available() else 'cpu')

        # Load the tokenizer and model from Hugging Face
        self.tokenizer = T5Tokenizer.from_pretrained(self.model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(self.model_name).to(self.device)
        print("Summarization model loaded successfully.")

    def summarize(self, text: str, max_length: int = 50, min_length: int = 10) -> str:
        """
        Generates a summary for a given text.

        Args:
            text (str): The review text to summarize.
            max_length (int): The maximum length of the generated summary.
            min_length (int): The minimum length of the generated summary.

        Returns:
            str: The generated summary, or an empty string when ``text`` is
            not a string or contains nothing but whitespace.
        """
        if not isinstance(text, str):
            return ""
        cleaned_text = text.strip()
        if not cleaned_text:
            # Nothing to summarize; avoid feeding a bare task prefix to the model.
            return ""

        # T5 models require a prefix for the task. For summarization, it's "summarize: "
        preprocess_text = f"summarize: {cleaned_text}"

        # Tokenize the input text, truncating to the tokenizer's configured
        # maximum input length so very long reviews stay bounded.
        tokenized_text = self.tokenizer.encode(
            preprocess_text,
            return_tensors="pt",
            truncation=True
        ).to(self.device)

        # Generate the summary with beam search
        summary_ids = self.model.generate(
            tokenized_text,
            max_length=max_length,
            min_length=min_length,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True
        )

        # Decode the best beam and return it
        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
| 118 |
+
|
| 119 |
+
class AspectAnalyzer:
    """
    Aspect-Based Sentiment Analysis (ABSA) wrapper around a pre-trained
    text-classification pipeline.
    """
    # The default model was switched to a different, currently valid ABSA model.
    def __init__(self, model_name='yangheng/deberta-v3-base-absa-v1.1', force_cpu=False):
        """
        Build the ABSA pipeline.

        Args:
            model_name (str): The name of the pre-trained ABSA model.
            force_cpu (bool): If True, forces the model to run on the CPU.
        """
        print(f"Loading Aspect-Based Sentiment Analysis model: {model_name}...")
        self.model_name = model_name

        # Pipeline device convention: -1 selects the CPU, 0 the first GPU.
        self.device = -1
        if force_cpu:
            print("Forcing ABSA model to run on CPU.")
        elif torch.cuda.is_available():
            self.device = 0

        print(f"Using device: {self.device} (0 for GPU, -1 for CPU)")

        pipeline_options = {
            "model": self.model_name,
            "tokenizer": self.model_name,
            "device": self.device,
        }
        self.absa_pipeline = pipeline("text-classification", **pipeline_options)
        print("ABSA model loaded successfully.")

    def analyze(self, text: str, aspects: list) -> dict:
        """
        Classify the sentiment expressed towards each aspect in ``text``.

        Args:
            text (str): The review text.
            aspects (list): Aspect terms to evaluate.

        Returns:
            dict: Maps each aspect to ``{'sentiment': label, 'score': score}``.
            Empty when ``text`` is empty or not a string, or when no aspects
            are given.
        """
        if not (text and isinstance(text, str) and aspects):
            return {}

        # One query per aspect: review and aspect joined by a separator token.
        # Other ABSA models may expect a different input format.
        queries = []
        for aspect in aspects:
            queries.append(f"{text} [SEP] {aspect}")
        predictions = self.absa_pipeline(queries)

        # Pair every aspect with its prediction in a user-friendly dictionary.
        return {
            aspect: {'sentiment': prediction['label'], 'score': prediction['score']}
            for aspect, prediction in zip(aspects, predictions)
        }
|
| 170 |
+
|
| 171 |
+
class FineTunedSentimentClassifier:
    """
    Inference wrapper: loads the fine-tuned sentiment checkpoint and
    classifies single texts.
    """
    def __init__(self, checkpoint_path, model_name='distilbert-base-uncased', force_cpu=False):
        """
        Load the checkpoint and the matching tokenizer.

        Args:
            checkpoint_path: Path to the saved Lightning checkpoint.
            model_name (str): Name of the model whose tokenizer is loaded.
            force_cpu (bool): If True, run on the CPU even when CUDA is available.
        """
        if force_cpu:
            self.device = 'cpu'
        else:
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print(f"Loading fine-tuned sentiment model from checkpoint: {checkpoint_path}...")
        print(f"Using device: {self.device}")

        self.model = SentimentClassifier.load_from_checkpoint(checkpoint_path, map_location=self.device)
        self.model.to(self.device)
        # Evaluation mode for inference.
        self.model.eval()

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Index i of this list is the label reported for predicted class i.
        self.labels = ['NEGATIVE', 'POSITIVE']
        print("Fine-tuned sentiment model loaded successfully.")

    def classify(self, text: str) -> dict:
        """
        Predict the sentiment of ``text``.

        Args:
            text (str): The text to classify.

        Returns:
            dict: ``{'label': <label string>, 'score': <softmax probability
            of the predicted class>}``.
        """
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=128,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **tokenizer_options)
        model_inputs = {
            "input_ids": encoded["input_ids"].to(self.device),
            "attention_mask": encoded["attention_mask"].to(self.device),
        }

        # No gradients are needed for inference.
        with torch.no_grad():
            outputs = self.model(**model_inputs)

        class_probs = torch.softmax(outputs.logits, dim=1)
        best_class = torch.argmax(class_probs, dim=1).item()
        best_score = class_probs[0][best_class].item()
        return {'label': self.labels[best_class], 'score': best_score}
|
| 202 |
+
|
| 203 |
+
class AspectExtractor:
    """
    Extract product aspects from review text.

    A Part-of-Speech (POS) tagging pipeline is used to find the nouns in a
    review. Those nouns are then filtered against a caller-supplied
    dictionary of valid aspects for the product category, so only relevant
    features are returned.
    """

    def __init__(self, model_name="vblagoje/bert-english-uncased-finetuned-pos", force_cpu=False):
        """
        Build the token-classification pipeline used for POS tagging.

        Args:
            model_name (str): Model identifier passed to the pipeline.
            force_cpu (bool): When truthy, run on the CPU even if CUDA is
                available.
        """
        self.model_name = model_name
        self.device = 'cpu' if force_cpu else ('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Loading Part-of-Speech (POS) tagging model: {self.model_name}...")
        print(f"Using device: {self.device}")

        self.pipeline = pipeline(
            "token-classification",
            model=self.model_name,
            # The pipeline takes a device index: -1 for CPU, 0 for the first GPU.
            device=-1 if self.device == 'cpu' else 0,
            aggregation_strategy="simple"
        )
        print("POS tagging model loaded successfully.")

    def extract(self, text: str, aspect_dictionary: list) -> list:
        """
        Return the dictionary aspects that are mentioned as nouns in the text.

        Args:
            text (str): The review text to analyze.
            aspect_dictionary (list): Valid, known aspects (strings) for the
                product category. Matching is case-insensitive.

        Returns:
            list: Lower-cased aspects that were both found in the text and are
            present in the aspect dictionary, sorted alphabetically so the
            result is deterministic. Empty when either input is empty.
        """
        if not text or not aspect_dictionary:
            return []

        valid_aspects = {aspect.lower() for aspect in aspect_dictionary}
        noun_tags = {'NOUN', 'PROPN'}

        # 1. Collect noun candidates from the POS model output.
        #    With aggregation_strategy="simple" adjacent tokens that share a
        #    tag can be merged into one multi-word group (e.g. "battery
        #    life"). Keep the whole phrase AND its individual words so that a
        #    single-word dictionary entry such as "battery" still matches.
        candidates = set()
        for output in self.pipeline(text):
            if output['entity_group'] not in noun_tags:
                continue
            phrase = output['word'].lower().strip()
            candidates.add(phrase)
            candidates.update(phrase.split())

        # 2. Keep only the candidates present in the dictionary.
        return sorted(candidates & valid_aspects)
|
| 256 |
+
|
scripts/train_distilbet.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytorch_lightning as pl
|
| 2 |
+
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
|
| 3 |
+
from pytorch_lightning.loggers import TensorBoardLogger
|
| 4 |
+
import torch
|
| 5 |
+
import seaborn as sns
|
| 6 |
+
import matplotlib.pyplot as plt
|
| 7 |
+
from sklearn.metrics import confusion_matrix
|
| 8 |
+
from data_prepare import ReviewDataModule, ReviewDataset
|
| 9 |
+
from models import SentimentClassifier
|
| 10 |
+
|
| 11 |
+
def train_sentiment_model(data_path='data/reviews_processed.csv', model_name='distilbert-base-uncased', n_epochs=5, sample_size: int = None):
    """
    Train the binary sentiment model and evaluate it on the test set.

    The function builds the data module and classifier, trains with
    checkpointing and early stopping on ``val_loss``, then evaluates the
    best checkpoint on the test set and plots a confusion matrix.

    Args:
        data_path (str): Path to the processed data file.
        model_name (str): Name of the transformer model to use.
        n_epochs (int): Maximum number of epochs for training.
        sample_size (int, optional): The number of reviews to use for training.
            If None, the full dataset is used.
    """
    # --- 1. Hyperparameters ---
    BATCH_SIZE = 64
    MAX_TOKEN_LEN = 256
    LEARNING_RATE = 2e-5
    N_CLASSES = 2  # Negative, Positive

    # --- 2. Initialize DataModule ---
    print("Initializing ReviewDataModule...")
    review_datamodule = ReviewDataModule(
        data_path=data_path,
        batch_size=BATCH_SIZE,
        max_token_len=MAX_TOKEN_LEN,
        model_name=model_name,
        sample_size=sample_size,  # Pass the sample size to the datamodule
    )
    # setup() is called eagerly so the train dataloader length is available
    # for sizing the learning-rate schedule below.
    review_datamodule.setup()

    n_training_steps = len(review_datamodule.train_dataloader()) * n_epochs
    n_warmup_steps = int(n_training_steps * 0.1)  # warm up for 10% of all steps

    # --- 3. Initialize Model ---
    print("Initializing SentimentClassifier model...")
    model = SentimentClassifier(
        model_name=model_name,
        n_classes=N_CLASSES,
        learning_rate=LEARNING_RATE,
        n_warmup_steps=n_warmup_steps,
        n_training_steps=n_training_steps,
    )

    # --- 4. Configure Training Callbacks ---
    checkpoint_callback = ModelCheckpoint(
        dirpath="checkpoints",
        filename="sentiment-binary-best-checkpoint",
        save_top_k=1,
        verbose=True,
        monitor="val_loss",
        mode="min",
    )
    logger = TensorBoardLogger("lightning_logs", name="sentiment-classifier-binary")
    early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)

    # --- 5. Initialize Trainer ---
    print("Initializing PyTorch Lightning Trainer...")
    trainer = pl.Trainer(
        logger=logger,
        callbacks=[checkpoint_callback, early_stopping_callback],
        max_epochs=n_epochs,
        accelerator='gpu' if torch.cuda.is_available() else 'cpu',
        devices=1,
    )

    # --- 6. Start Training ---
    print(f"Starting training with {model_name} for up to {n_epochs} epochs...")
    trainer.fit(model, review_datamodule)

    # --- 7. Evaluate on Test Set and Generate Confusion Matrix ---
    print("\nTraining complete. Evaluating on the test set...")
    # After fit() the in-memory model holds the LAST epoch's weights, which
    # with early stopping (patience=2) may be epochs past the best val_loss.
    # Evaluate the checkpoint that ModelCheckpoint saved instead; fall back
    # to the current weights if no checkpoint was written.
    eval_ckpt = "best" if checkpoint_callback.best_model_path else None
    trainer.test(model, datamodule=review_datamodule, ckpt_path=eval_ckpt)

    predictions = trainer.predict(model, datamodule=review_datamodule, ckpt_path=eval_ckpt)
    if predictions:
        all_preds = torch.cat(predictions).cpu().numpy()
        # NOTE(review): assumes the datamodule's predict dataloader iterates
        # test_df in order, so predictions line up with these labels - confirm.
        true_labels = review_datamodule.test_df.sentiment.to_numpy()
        target_names = ['Negative', 'Positive']  # Updated labels

        cm = confusion_matrix(true_labels, all_preds)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='YlGnBu',
                    xticklabels=target_names, yticklabels=target_names)
        plt.title('Confusion Matrix for Sentiment Analysis')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.show()
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
if __name__ == "__main__":
    # Script entry point: fine-tune on a 100,000-review sample of the
    # processed dataset.
    data_path = "data/reviews_processed.csv"
    train_sentiment_model(sample_size=100000, data_path=data_path)
|
scripts/train_naive_bayes.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from sklearn.model_selection import train_test_split, ParameterGrid, StratifiedKFold
|
| 4 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 5 |
+
from sklearn.naive_bayes import MultinomialNB
|
| 6 |
+
from sklearn.pipeline import Pipeline
|
| 7 |
+
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
|
| 8 |
+
import seaborn as sns
|
| 9 |
+
import matplotlib.pyplot as plt
|
| 10 |
+
from tqdm.notebook import tqdm
|
| 11 |
+
import os
|
| 12 |
+
|
| 13 |
+
def train_baseline_sentiment_model(data_path='data/reviews_processed.csv', grid_search=True, nb__alpha=0.1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2), sample_size: int = 50000):
    """
    Train and evaluate a TF-IDF + Multinomial Naive Bayes sentiment baseline.

    Hyperparameters are either taken from the arguments or selected by a
    grid search. The grid search scores candidates with stratified
    cross-validation on the training split only, so the held-out test split
    is used solely for the final evaluation report.

    Args:
        data_path (str): Path to the processed reviews CSV file. It must
            contain the ``full_text`` and ``sentiment`` columns.
        grid_search (bool): If True, performs a grid search.
        nb__alpha (float): Alpha for MultinomialNB (used when grid_search is False).
        tfidf__max_df (float): max_df for TfidfVectorizer (used when grid_search is False).
        tfidf__ngram_range (tuple): ngram_range for TfidfVectorizer (used when grid_search is False).
        sample_size (int, optional): Number of reviews to use. If None, uses
            all. Values larger than the dataset are clamped to its size.

    Returns:
        None. Results are printed and the confusion matrix is plotted. If
        ``data_path`` does not exist an error is printed and the function
        returns early.
    """
    # --- 1. Load Data ---
    print(f"Loading data from '{data_path}'...")
    if not os.path.exists(data_path):
        print(f"\nERROR: '{data_path}' not found. Please run the EDA script first!")
        return

    df = pd.read_csv(data_path)
    # Only the two columns used below need to be complete; dropping on every
    # column would discard usable reviews that merely miss unrelated fields.
    df = df.dropna(subset=['full_text', 'sentiment'])

    # --- 2. Sample Data ---
    if sample_size:
        # df.sample raises when asked for more rows than exist, so clamp.
        n_rows = min(sample_size, len(df))
        if n_rows < sample_size:
            print(f"Requested {sample_size} reviews but only {n_rows} are available; using all of them.")
        print(f"Using a sample of {n_rows} reviews for training the baseline model.")
        df = df.sample(n=n_rows, random_state=42)

    # --- 3. Train-Test Split ---
    print("Splitting data into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(
        df['full_text'],
        df['sentiment'],
        test_size=0.2,
        random_state=42,
        stratify=df['sentiment'],
    )

    # --- 4. Create a Pipeline ---
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('nb', MultinomialNB()),
    ])

    best_params = None

    if grid_search:
        # --- 5a. Perform Grid Search (cross-validated on the training split) ---
        # tqdm.auto picks a console or notebook progress bar as appropriate;
        # the notebook-only variant does not work in a plain script.
        from tqdm.auto import tqdm

        print("Performing Grid Search to find the best hyperparameters...")
        parameters = {
            'tfidf__ngram_range': [(1, 1), (1, 2)],
            'tfidf__max_df': [0.5, 0.75, 1.0],
            'nb__alpha': [0.1, 0.5, 1.0],
        }
        param_grid = list(ParameterGrid(parameters))
        best_score = -1

        # Build the folds once so every candidate is scored on identical splits.
        splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        folds = list(splitter.split(X_train, y_train))

        for params in tqdm(param_grid, desc="Grid Search Progress"):
            pipeline.set_params(**params)
            fold_scores = []
            for fit_idx, val_idx in folds:
                pipeline.fit(X_train.iloc[fit_idx], y_train.iloc[fit_idx])
                fold_scores.append(
                    pipeline.score(X_train.iloc[val_idx], y_train.iloc[val_idx])
                )
            score = float(np.mean(fold_scores))
            if score > best_score:
                best_score = score
                best_params = params

        print(f"\nBest cross-validation score: {best_score:.4f}")
        print("Best parameters found:")
        print(best_params)

    else:
        # --- 5b. Use provided hyperparameters ---
        print("Skipping grid search and using provided hyperparameters...")
        best_params = {
            'nb__alpha': nb__alpha,
            'tfidf__max_df': tfidf__max_df,
            'tfidf__ngram_range': tfidf__ngram_range,
        }

    # --- 6. Train the Final Model ---
    print("\nTraining final model...")
    # set_params returns the pipeline itself; refit it on the full training split.
    best_model = pipeline.set_params(**best_params)
    best_model.fit(X_train, y_train)
    print("Model training complete.")

    # --- 7. Evaluate the Best Model ---
    print("\n--- Model Evaluation ---")
    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    target_names = ['Negative', 'Positive']

    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=target_names))

    print("Confusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Greens',
                xticklabels=target_names, yticklabels=target_names)
    plt.title('Confusion Matrix for Naive Bayes on Amazon Reviews')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()
|
| 116 |
+
|
| 117 |
+
if __name__ == "__main__":
    # Script entry point: train the baseline with the default
    # hyperparameters (no grid search) on up to 150,000 reviews.
    train_baseline_sentiment_model(grid_search=False, sample_size=150000)
|