DanielKiani committed on
Commit e580bee · 2 Parent(s): 1e8e3db 4dbd3a6

Merge branch 'main' of https://github.com/Deathshot78/ReviewSense

.gitattributes CHANGED
@@ -1,3 +1,4 @@
+ <<<<<<< HEAD
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
@@ -35,3 +36,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
+ =======
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ >>>>>>> e6de3c4338f79386345fa6e4bba5b0666ad808da
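
Note: this merge leaves unresolved Git conflict markers (<<<<<<<, =======, >>>>>>>) inside .gitattributes, requirements.txt, and scripts/app.py, which will break any tool that parses those files. A minimal, illustrative Python sketch for locating such markers in a working tree (the helper name is an assumption, not part of this commit):

    import pathlib

    def find_conflict_markers(root="."):
        # Scan every tracked-looking file for leftover conflict markers.
        markers = ("<<<<<<<", "=======", ">>>>>>>")
        for path in pathlib.Path(root).rglob("*"):
            if ".git" in path.parts or not path.is_file():
                continue
            try:
                text = path.read_text(errors="ignore")
            except OSError:
                continue
            for lineno, line in enumerate(text.splitlines(), 1):
                if line.startswith(markers):
                    print(f"{path}:{lineno}: {line.strip()}")

    find_conflict_markers()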
assets/confusion_bay.png ADDED

Git LFS Details

  • SHA256: 2b668b34ae693b3cd9a6a5209254b92dd16d1955af100ad682355a15dde5809c
  • Pointer size: 130 Bytes
  • Size of remote file: 30.3 kB
assets/confusion_bert.png ADDED

Git LFS Details

  • SHA256: a33f8326059d934a40b241702a375813ac09dbf02c4c11fc3d2c2d30d6a28f8b
  • Pointer size: 130 Bytes
  • Size of remote file: 27.3 kB
assets/gradio.png ADDED

Git LFS Details

  • SHA256: 6a3653cb91d2f1f328096c572863b103a03f0b60e9c9cdd922871522b338c6bd
  • Pointer size: 130 Bytes
  • Size of remote file: 65.8 kB
assets/wordcloud.png ADDED

Git LFS Details

  • SHA256: 126af3f40232b435991a19a85329d4ed8bb25f239b393748c82bdd69c423082c
  • Pointer size: 130 Bytes
  • Size of remote file: 63.5 kB
notebooks/reviewsense.ipynb ADDED
@@ -0,0 +1,1001 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "1754f3bb",
+ "metadata": {},
+ "source": [
+ "# 🛍️ ReviewSense: Product Review Analysis Engine\n",
+ "\n",
+ "> *ReviewSense is a comprehensive, end-to-end Natural Language Processing application built to extract deep, actionable insights from unstructured product reviews.* \n",
+ "Where a simple star rating only tells part of the story, ReviewSense dives into the text to uncover what customers are saying, why they're saying it, and how they feel about specific product features."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "00d383d6",
+ "metadata": {},
+ "source": [
+ "## Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4d48ba17",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "from tqdm.notebook import tqdm\n",
+ "\n",
+ "import torch\n",
+ "from torch.optim import AdamW\n",
+ "from torch.utils.data import DataLoader, Dataset\n",
+ "\n",
+ "import pytorch_lightning as pl\n",
+ "from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping\n",
+ "from pytorch_lightning.loggers import TensorBoardLogger\n",
+ "from torchmetrics.functional import accuracy\n",
+ "\n",
+ "from sklearn.model_selection import train_test_split, ParameterGrid, StratifiedKFold\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn.naive_bayes import MultinomialNB\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
+ "\n",
+ "from transformers import (\n",
+ "    AutoConfig,\n",
+ "    AutoModelForSequenceClassification,\n",
+ "    AutoTokenizer,\n",
+ "    T5ForConditionalGeneration,\n",
+ "    T5Tokenizer,\n",
+ "    get_linear_schedule_with_warmup,\n",
+ "    pipeline,\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8263bc02",
+ "metadata": {},
+ "source": [
+ "## Prepare the data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a5f8dcda",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def explore_and_preprocess_reviews(\n",
+ "    train_path='data/train.csv',\n",
+ "    test_path='data/test.csv',\n",
+ "    output_dir='data'\n",
+ "):\n",
+ "    \"\"\"\n",
+ "    Loads the Amazon Sentiment Analysis dataset (https://www.kaggle.com/datasets/kritanjalijain/amazon-reviews)\n",
+ "    (you need to extract the train/test splits from the zip file in the data folder),\n",
+ "    performs basic EDA, and preprocesses it for model training.\n",
+ "\n",
+ "    Args:\n",
+ "        train_path (str): Path to the training CSV file.\n",
+ "        test_path (str): Path to the testing CSV file.\n",
+ "        output_dir (str): Directory to save the processed file.\n",
+ "    \"\"\"\n",
+ "    # --- 1. Load Data ---\n",
+ "    # This dataset typically comes without headers. We'll assign them.\n",
+ "    # Column 1: Sentiment (1 = Negative, 2 = Positive)\n",
+ "    # Column 2: Title\n",
+ "    # Column 3: Review Text\n",
+ "    print(f\"Loading data from '{train_path}' and '{test_path}'...\")\n",
+ "    try:\n",
+ "        col_names = ['sentiment_orig', 'title', 'review']\n",
+ "        train_df = pd.read_csv(train_path, header=None, names=col_names)\n",
+ "        test_df = pd.read_csv(test_path, header=None, names=col_names)\n",
+ "\n",
+ "        # Combine for unified EDA and preprocessing\n",
+ "        df = pd.concat([train_df, test_df], ignore_index=True)\n",
+ "\n",
+ "    except FileNotFoundError:\n",
+ "        print(f\"\\nERROR: Make sure '{train_path}' and '{test_path}' are in the specified directory.\")\n",
+ "        print(\"This script is designed for the 'Amazon Reviews for Sentiment Analysis' dataset from Kaggle.\")\n",
+ "        return\n",
+ "\n",
+ "    df.dropna(inplace=True)\n",
+ "\n",
+ "    # --- 2. Preprocessing ---\n",
+ "    print(\"\\n--- Preprocessing Data for Sentiment Analysis ---\")\n",
+ "\n",
+ "    # a) Create new sentiment labels (0 = Negative, 1 = Positive)\n",
+ "    # This dataset is binary, not three-class like the previous one.\n",
+ "    df['sentiment'] = df['sentiment_orig'].apply(lambda x: 0 if x == 1 else 1)\n",
+ "\n",
+ "    # b) Combine title and review body\n",
+ "    df['full_text'] = df['title'].astype(str) + \". \" + df['review'].astype(str)\n",
+ "\n",
+ "    # c) Select the columns used for modeling\n",
+ "    processed_df = df[['full_text', 'sentiment']].copy()\n",
+ "\n",
+ "    # --- 3. Save Processed Data ---\n",
+ "    os.makedirs(output_dir, exist_ok=True)\n",
+ "    output_path = os.path.join(output_dir, 'reviews_processed.csv')\n",
+ "    processed_df.to_csv(output_path, index=False)\n",
+ "    print(f\"\\nSaved {len(processed_df)} processed reviews to '{output_path}'\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "60ab838c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#--- Preprocess the Reviews Dataset ---\n",
+ "print(\"\\n--- Preprocessing started ---\")\n",
+ "explore_and_preprocess_reviews()\n",
+ "print(\"\\n--- Preprocessing finished ---\")"
+ ]
+ },
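+ {
+ "cell_type": "markdown",
+ "id": "added-md-01",
+ "metadata": {},
+ "source": [
+ "*Added illustrative check (not part of the original workflow):* assuming the cell above ran and wrote `data/reviews_processed.csv`, a quick peek at a few rows confirms the binary label mapping before any model sees the data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "added-code-01",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sanity check (illustrative): labels should be exactly {0, 1}.\n",
+ "check_df = pd.read_csv('data/reviews_processed.csv', nrows=1000)\n",
+ "print(check_df.head(3))\n",
+ "assert set(check_df['sentiment'].unique()) <= {0, 1}\n",
+ "print(check_df['sentiment'].value_counts())"
+ ]
+ },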
+ {
+ "cell_type": "markdown",
+ "id": "4c381d73",
+ "metadata": {},
+ "source": [
+ "## Define a base model (Multinomial Naive Bayes)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b3cd2b5b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def train_baseline_sentiment_model(data_path='data/reviews_processed.csv', grid_search=True, nb__alpha=0.1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2), sample_size: int = 50000):\n",
+ "    \"\"\"\n",
+ "    Trains and evaluates a Multinomial Naive Bayes model for sentiment analysis.\n",
+ "    Can optionally perform a grid search.\n",
+ "\n",
+ "    Args:\n",
+ "        data_path (str): Path to the processed reviews CSV file.\n",
+ "        grid_search (bool): If True, performs a grid search.\n",
+ "        nb__alpha (float): Alpha for MultinomialNB.\n",
+ "        tfidf__max_df (float): max_df for TfidfVectorizer.\n",
+ "        tfidf__ngram_range (tuple): ngram_range for TfidfVectorizer.\n",
+ "        sample_size (int, optional): Number of reviews to use. If None, uses all.\n",
+ "    \"\"\"\n",
+ "    # --- 1. Load Data ---\n",
+ "    print(f\"Loading data from '{data_path}'...\")\n",
+ "    if not os.path.exists(data_path):\n",
+ "        print(f\"\\nERROR: '{data_path}' not found. Please run the EDA script first!\")\n",
+ "        return\n",
+ "\n",
+ "    df = pd.read_csv(data_path)\n",
+ "    df.dropna(inplace=True)\n",
+ "\n",
+ "    # --- 2. Sample Data ---\n",
+ "    if sample_size:\n",
+ "        print(f\"Using a sample of {sample_size} reviews for training the baseline model.\")\n",
+ "        df = df.sample(n=sample_size, random_state=42)\n",
+ "\n",
+ "    # --- 3. Train-Test Split ---\n",
+ "    print(\"Splitting data into training and testing sets...\")\n",
+ "    X_train, X_test, y_train, y_test = train_test_split(\n",
+ "        df['full_text'],\n",
+ "        df['sentiment'],\n",
+ "        test_size=0.2,\n",
+ "        random_state=42,\n",
+ "        stratify=df['sentiment']\n",
+ "    )\n",
+ "\n",
+ "    # --- 4. Create a Pipeline ---\n",
+ "    pipeline = Pipeline([\n",
+ "        ('tfidf', TfidfVectorizer(stop_words='english')),\n",
+ "        ('nb', MultinomialNB()),\n",
+ "    ])\n",
+ "\n",
+ "    best_params = None\n",
+ "\n",
+ "    if grid_search:\n",
+ "        # --- 5a. Perform Grid Search ---\n",
+ "        # Note: this simple search scores candidates on the held-out test split,\n",
+ "        # so the reported best score is optimistically biased; a separate\n",
+ "        # validation split (or cross-validation) would avoid tuning on test data.\n",
+ "        print(\"Performing Grid Search to find the best hyperparameters...\")\n",
+ "        parameters = {\n",
+ "            'tfidf__ngram_range': [(1, 1), (1, 2)],\n",
+ "            'tfidf__max_df': [0.5, 0.75, 1.0],\n",
+ "            'nb__alpha': [0.1, 0.5, 1.0],\n",
+ "        }\n",
+ "        param_grid = list(ParameterGrid(parameters))\n",
+ "        best_score = -1\n",
+ "\n",
+ "        for params in tqdm(param_grid, desc=\"Grid Search Progress\"):\n",
+ "            pipeline.set_params(**params)\n",
+ "            pipeline.fit(X_train, y_train)\n",
+ "            score = pipeline.score(X_test, y_test)\n",
+ "            if score > best_score:\n",
+ "                best_score = score\n",
+ "                best_params = params\n",
+ "\n",
+ "        print(f\"\\nBest score on test set: {best_score:.4f}\")\n",
+ "        print(\"Best parameters found:\")\n",
+ "        print(best_params)\n",
+ "\n",
+ "    else:\n",
+ "        # --- 5b. Use provided hyperparameters ---\n",
+ "        print(\"Skipping grid search and using provided hyperparameters...\")\n",
+ "        best_params = {\n",
+ "            'nb__alpha': nb__alpha,\n",
+ "            'tfidf__max_df': tfidf__max_df,\n",
+ "            'tfidf__ngram_range': tfidf__ngram_range\n",
+ "        }\n",
+ "\n",
+ "    # --- 6. Train the Final Model ---\n",
+ "    print(\"\\nTraining final model...\")\n",
+ "    best_model = pipeline.set_params(**best_params)\n",
+ "    best_model.fit(X_train, y_train)\n",
+ "    print(\"Model training complete.\")\n",
+ "\n",
+ "    # --- 7. Evaluate the Best Model ---\n",
+ "    print(\"\\n--- Model Evaluation ---\")\n",
+ "    y_pred = best_model.predict(X_test)\n",
+ "\n",
+ "    accuracy = accuracy_score(y_test, y_pred)\n",
+ "    target_names = ['Negative', 'Positive']\n",
+ "\n",
+ "    print(f\"Accuracy: {accuracy:.4f}\")\n",
+ "    print(\"\\nClassification Report:\")\n",
+ "    print(classification_report(y_test, y_pred, target_names=target_names))\n",
+ "\n",
+ "    print(\"Confusion Matrix:\")\n",
+ "    cm = confusion_matrix(y_test, y_pred)\n",
+ "    plt.figure(figsize=(8, 6))\n",
+ "    sns.heatmap(cm, annot=True, fmt='d', cmap='Greens',\n",
+ "                xticklabels=target_names, yticklabels=target_names)\n",
+ "    plt.title('Confusion Matrix for Naive Bayes on Amazon Reviews')\n",
+ "    plt.xlabel('Predicted Label')\n",
+ "    plt.ylabel('True Label')\n",
+ "    plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "093e6ae9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#--- Train the base model ---\n",
+ "train_baseline_sentiment_model(sample_size=150000, grid_search=False)"
+ ]
+ },
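+ {
+ "cell_type": "markdown",
+ "id": "added-md-02",
+ "metadata": {},
+ "source": [
+ "*Added illustrative sketch (not from the original code):* `train_baseline_sentiment_model` evaluates the pipeline but does not return it. The snippet below fits the same TF-IDF + MultinomialNB setup on a small sample, just to show how individual reviews would be scored; all names here are hypothetical."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "added-code-02",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Minimal sketch: same pipeline shape as the baseline, fit on a toy sample.\n",
+ "toy_df = pd.read_csv('data/reviews_processed.csv', nrows=5000).dropna()\n",
+ "toy_pipeline = Pipeline([\n",
+ "    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.75)),\n",
+ "    ('nb', MultinomialNB(alpha=0.1)),\n",
+ "])\n",
+ "toy_pipeline.fit(toy_df['full_text'], toy_df['sentiment'])\n",
+ "print(toy_pipeline.predict([\n",
+ "    'Terrible quality, it broke after two days.',\n",
+ "    'Absolutely love it, works perfectly!',\n",
+ "]))  # -> array of 0 (negative) / 1 (positive) labels"
+ ]
+ },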
+ {
+ "cell_type": "markdown",
+ "id": "71f5e4ba",
+ "metadata": {},
+ "source": [
+ "## Define the dataset and Lightning DataModule"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c977e0f4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class ReviewDataset(Dataset):\n",
+ "    \"\"\"\n",
+ "    Custom PyTorch Dataset for Amazon Reviews.\n",
+ "\n",
+ "    This class takes a pandas DataFrame of review data, a tokenizer, and a max\n",
+ "    token length, and prepares it for use in a PyTorch model. It handles the\n",
+ "    tokenization of the text and the formatting of the labels for each item.\n",
+ "\n",
+ "    Attributes:\n",
+ "        tokenizer: The Hugging Face tokenizer to use for processing text.\n",
+ "        data (pd.DataFrame): The DataFrame containing the review data.\n",
+ "        max_token_len (int): The maximum sequence length for the tokenizer.\n",
+ "    \"\"\"\n",
+ "    def __init__(self, data: pd.DataFrame, tokenizer, max_token_len: int):\n",
+ "        \"\"\"\n",
+ "        Initializes the ReviewDataset.\n",
+ "\n",
+ "        Args:\n",
+ "            data (pd.DataFrame): The input DataFrame containing 'full_text' and\n",
+ "                'sentiment' columns.\n",
+ "            tokenizer: The pre-trained tokenizer instance.\n",
+ "            max_token_len (int): The maximum length for tokenized sequences.\n",
+ "        \"\"\"\n",
+ "        self.tokenizer = tokenizer\n",
+ "        self.data = data\n",
+ "        self.max_token_len = max_token_len\n",
+ "\n",
+ "    def __len__(self):\n",
+ "        \"\"\"\n",
+ "        Returns the total number of samples in the dataset.\n",
+ "        \"\"\"\n",
+ "        return len(self.data)\n",
+ "\n",
+ "    def __getitem__(self, index: int):\n",
+ "        \"\"\"\n",
+ "        Retrieves one sample from the dataset at the specified index.\n",
+ "\n",
+ "        This method handles the tokenization of a single review text, including\n",
+ "        padding and truncation, and formats the output into a dictionary of\n",
+ "        tensors ready for the model.\n",
+ "\n",
+ "        Args:\n",
+ "            index (int): The index of the data sample to retrieve.\n",
+ "\n",
+ "        Returns:\n",
+ "            dict: A dictionary containing the tokenized inputs and the label,\n",
+ "            with the following keys:\n",
+ "                - 'input_ids': The token IDs of the review text.\n",
+ "                - 'attention_mask': The attention mask for the review text.\n",
+ "                - 'labels': The sentiment label as a tensor.\n",
+ "        \"\"\"\n",
+ "        data_row = self.data.iloc[index]\n",
+ "        text = str(data_row.full_text)\n",
+ "        labels = data_row.sentiment\n",
+ "\n",
+ "        encoding = self.tokenizer.encode_plus(\n",
+ "            text,\n",
+ "            add_special_tokens=True,\n",
+ "            max_length=self.max_token_len,\n",
+ "            return_token_type_ids=False,\n",
+ "            padding=\"max_length\",\n",
+ "            truncation=True,\n",
+ "            return_attention_mask=True,\n",
+ "            return_tensors='pt',\n",
+ "        )\n",
+ "\n",
+ "        return dict(\n",
+ "            input_ids=encoding[\"input_ids\"].flatten(),\n",
+ "            attention_mask=encoding[\"attention_mask\"].flatten(),\n",
+ "            labels=torch.tensor(labels, dtype=torch.long)\n",
+ "        )\n",
+ "\n",
+ "class ReviewDataModule(pl.LightningDataModule):\n",
+ "    \"\"\"\n",
+ "    PyTorch Lightning DataModule to handle the Amazon Reviews dataset.\n",
+ "\n",
+ "    This class encapsulates all the steps needed to process the data:\n",
+ "    loading, splitting, and creating PyTorch DataLoaders for training,\n",
+ "    validation, and testing. It allows for using a smaller random sample of the\n",
+ "    full dataset for faster experimentation.\n",
+ "\n",
+ "    Attributes:\n",
+ "        data_path (str): Path to the processed CSV file.\n",
+ "        batch_size (int): The size of each data batch.\n",
+ "        max_token_len (int): The maximum sequence length for the tokenizer.\n",
+ "        tokenizer: The Hugging Face tokenizer instance.\n",
+ "        num_workers (int): The number of CPU cores to use for data loading.\n",
+ "        sample_size (int, optional): The number of samples to use. If None,\n",
+ "            the full dataset is used.\n",
+ "    \"\"\"\n",
+ "    def __init__(self, data_path: str, batch_size: int = 16, max_token_len: int = 256, model_name='distilbert-base-uncased', num_workers: int = 0, sample_size: int = None):\n",
+ "        \"\"\"\n",
+ "        Initializes the ReviewDataModule.\n",
+ "\n",
+ "        Args:\n",
+ "            data_path (str): The path to the processed CSV data file.\n",
+ "            batch_size (int): The number of samples per batch.\n",
+ "            max_token_len (int): Maximum length of tokenized sequences.\n",
+ "            model_name (str): The name of the pre-trained model to use for the tokenizer.\n",
+ "            num_workers (int): Number of subprocesses to use for data loading.\n",
+ "            sample_size (int, optional): If specified, a random sample of this\n",
+ "                size will be used from the dataset.\n",
+ "                Defaults to None, which uses the full dataset.\n",
+ "        \"\"\"\n",
+ "        super().__init__()\n",
+ "        self.data_path = data_path\n",
+ "        self.batch_size = batch_size\n",
+ "        self.max_token_len = max_token_len\n",
+ "        self.tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+ "        self.num_workers = num_workers\n",
+ "        self.sample_size = sample_size\n",
+ "        self.train_df = None\n",
+ "        self.val_df = None\n",
+ "        self.test_df = None\n",
+ "\n",
+ "    def setup(self, stage=None):\n",
+ "        \"\"\"\n",
+ "        Loads and splits the data for training, validation, and testing.\n",
+ "\n",
+ "        This method is called by PyTorch Lightning. It reads the CSV, handles\n",
+ "        missing values, optionally takes a random sample, and performs a\n",
+ "        stratified train-validation-test split. The indices of the resulting\n",
+ "        DataFrames are reset to prevent potential KeyErrors during data loading.\n",
+ "        \"\"\"\n",
+ "        df = pd.read_csv(self.data_path)\n",
+ "        df.dropna(inplace=True)\n",
+ "\n",
+ "        # If a sample size is provided, sample the dataframe\n",
+ "        if self.sample_size:\n",
+ "            print(f\"Using a sample of {self.sample_size} reviews.\")\n",
+ "            df = df.sample(n=self.sample_size, random_state=42)\n",
+ "\n",
+ "        # Stratified split to maintain label distribution\n",
+ "        train_val_df, self.test_df = train_test_split(df, test_size=0.1, random_state=42, stratify=df.sentiment)\n",
+ "        self.train_df, self.val_df = train_test_split(train_val_df, test_size=0.1, random_state=42, stratify=train_val_df.sentiment)\n",
+ "\n",
+ "        # Reset indices to prevent KeyErrors\n",
+ "        self.train_df = self.train_df.reset_index(drop=True)\n",
+ "        self.val_df = self.val_df.reset_index(drop=True)\n",
+ "        self.test_df = self.test_df.reset_index(drop=True)\n",
+ "\n",
+ "        print(f\"Size of training set: {len(self.train_df)}\")\n",
+ "        print(f\"Size of validation set: {len(self.val_df)}\")\n",
+ "        print(f\"Size of test set: {len(self.test_df)}\")\n",
+ "\n",
+ "    def train_dataloader(self):\n",
+ "        \"\"\"Returns the DataLoader for the training set.\"\"\"\n",
+ "        return DataLoader(\n",
+ "            ReviewDataset(self.train_df, self.tokenizer, self.max_token_len),\n",
+ "            batch_size=self.batch_size,\n",
+ "            shuffle=True,\n",
+ "            num_workers=self.num_workers\n",
+ "        )\n",
+ "\n",
+ "    def val_dataloader(self):\n",
+ "        \"\"\"Returns the DataLoader for the validation set.\"\"\"\n",
+ "        return DataLoader(\n",
+ "            ReviewDataset(self.val_df, self.tokenizer, self.max_token_len),\n",
+ "            batch_size=self.batch_size,\n",
+ "            num_workers=self.num_workers\n",
+ "        )\n",
+ "\n",
+ "    def test_dataloader(self):\n",
+ "        \"\"\"Returns the DataLoader for the test set.\"\"\"\n",
+ "        return DataLoader(\n",
+ "            ReviewDataset(self.test_df, self.tokenizer, self.max_token_len),\n",
+ "            batch_size=self.batch_size,\n",
+ "            num_workers=self.num_workers\n",
+ "        )\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "985ac47b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# --- Configuration ---\n",
+ "data_path = \"data/reviews_processed.csv\"\n",
+ "BATCH_SIZE = 64\n",
+ "MAX_TOKEN_LEN = 256\n",
+ "\n",
+ "print(\"Initializing ReviewDataModule...\")\n",
+ "review_datamodule = ReviewDataModule(\n",
+ "    data_path=data_path,\n",
+ "    batch_size=BATCH_SIZE,\n",
+ "    max_token_len=MAX_TOKEN_LEN,\n",
+ "    model_name='distilbert-base-uncased',\n",
+ "    sample_size=100000 # Pass the sample size to the datamodule\n",
+ ")\n",
+ "review_datamodule.setup()\n",
+ "\n",
+ "# Fetch one batch from the training dataloader to inspect its contents\n",
+ "print(\"\\n--- Fetching one batch from the training dataloader ---\")\n",
+ "train_batch = next(iter(review_datamodule.train_dataloader()))\n",
+ "\n",
+ "print(\"\\n--- Example Batch ---\")\n",
+ "print(f\"Input IDs shape: {train_batch['input_ids'].shape}\")\n",
+ "print(f\"Attention Mask shape: {train_batch['attention_mask'].shape}\")\n",
+ "print(f\"Labels: {train_batch['labels']}\")\n",
+ "print(f\"Labels shape: {train_batch['labels'].shape}\")"
+ ]
+ },
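+ {
+ "cell_type": "markdown",
+ "id": "added-md-03",
+ "metadata": {},
+ "source": [
+ "*Added illustrative check (not part of the original notebook):* decoding the first example's token IDs shows what the model actually receives after truncation and padding. It assumes the batch fetched in the previous cell."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "added-code-03",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Decode the first example in the batch back to text (special tokens removed).\n",
+ "decoded = review_datamodule.tokenizer.decode(\n",
+ "    train_batch['input_ids'][0], skip_special_tokens=True\n",
+ ")\n",
+ "print(decoded[:300])"
+ ]
+ },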
+ {
+ "cell_type": "markdown",
+ "id": "2c7781f4",
+ "metadata": {},
+ "source": [
+ "## Fine-tune DistilBERT"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d046b940",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class SentimentClassifier(pl.LightningModule):\n",
+ "    \"\"\"\n",
+ "    PyTorch Lightning module for the sentiment classification model.\n",
+ "    \"\"\"\n",
+ "    def __init__(self, model_name='distilbert-base-uncased', n_classes=2, learning_rate=2e-5, n_warmup_steps=0, n_training_steps=0, dropout_prob=0.2): # Added dropout\n",
+ "        super().__init__()\n",
+ "        self.save_hyperparameters()\n",
+ "\n",
+ "        # Configure dropout. DistilBERT configs name these `dropout` and\n",
+ "        # `attention_dropout`; the `*_prob` attributes apply to BERT-style configs.\n",
+ "        config = AutoConfig.from_pretrained(model_name)\n",
+ "        if hasattr(config, 'hidden_dropout_prob'):\n",
+ "            config.hidden_dropout_prob = dropout_prob\n",
+ "            config.attention_probs_dropout_prob = dropout_prob\n",
+ "        else:\n",
+ "            config.dropout = dropout_prob\n",
+ "            config.attention_dropout = dropout_prob\n",
+ "        config.num_labels = n_classes\n",
+ "\n",
+ "        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)\n",
+ "\n",
+ "    def forward(self, input_ids, attention_mask, labels=None):\n",
+ "        return self.model(\n",
+ "            input_ids=input_ids,\n",
+ "            attention_mask=attention_mask,\n",
+ "            labels=labels\n",
+ "        )\n",
+ "\n",
+ "    def training_step(self, batch, batch_idx):\n",
+ "        output = self.forward(**batch)\n",
+ "        self.log(\"train_loss\", output.loss, prog_bar=True, logger=True)\n",
+ "        return output.loss\n",
+ "\n",
+ "    def validation_step(self, batch, batch_idx):\n",
+ "        output = self.forward(**batch)\n",
+ "        preds = torch.argmax(output.logits, dim=1)\n",
+ "        val_acc = accuracy(preds, batch['labels'], task='binary')\n",
+ "        self.log(\"val_loss\", output.loss, prog_bar=True, logger=True)\n",
+ "        self.log(\"val_accuracy\", val_acc, prog_bar=True, logger=True)\n",
+ "        return output.loss\n",
+ "\n",
+ "    def test_step(self, batch, batch_idx):\n",
+ "        output = self.forward(**batch)\n",
+ "        preds = torch.argmax(output.logits, dim=1)\n",
+ "        test_acc = accuracy(preds, batch['labels'], task='binary')\n",
+ "        self.log(\"test_accuracy\", test_acc)\n",
+ "        return test_acc\n",
+ "\n",
+ "    def predict_step(self, batch, batch_idx, dataloader_idx=0):\n",
+ "        output = self.forward(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])\n",
+ "        return torch.argmax(output.logits, dim=1)\n",
+ "\n",
+ "    def configure_optimizers(self):\n",
+ "        optimizer = AdamW(self.parameters(), lr=self.hparams.learning_rate, weight_decay=0.01)\n",
+ "        scheduler = get_linear_schedule_with_warmup(\n",
+ "            optimizer,\n",
+ "            num_warmup_steps=self.hparams.n_warmup_steps,\n",
+ "            num_training_steps=self.hparams.n_training_steps\n",
+ "        )\n",
+ "        return dict(optimizer=optimizer, lr_scheduler=dict(scheduler=scheduler, interval='step'))\n"
+ ]
+ },
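+ {
+ "cell_type": "markdown",
+ "id": "added-md-04",
+ "metadata": {},
+ "source": [
+ "*Added illustrative check (not part of the original notebook):* one forward pass on the batch fetched earlier verifies output shapes before committing to a full training run (downloads the pre-trained weights on first use)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "added-code-04",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sanity check: a single forward pass through a freshly initialized classifier.\n",
+ "sanity_model = SentimentClassifier()\n",
+ "with torch.no_grad():\n",
+ "    out = sanity_model(\n",
+ "        input_ids=train_batch['input_ids'],\n",
+ "        attention_mask=train_batch['attention_mask'],\n",
+ "        labels=train_batch['labels'],\n",
+ "    )\n",
+ "print(out.logits.shape)  # (BATCH_SIZE, 2)\n",
+ "print(out.loss)          # untrained cross-entropy, roughly ln(2) ~ 0.69"
+ ]
+ },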
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b3a3708d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def train_sentiment_model(data_path='data/reviews_processed.csv', model_name='distilbert-base-uncased', n_epochs=5, sample_size: int = None):\n",
+ "    \"\"\"\n",
+ "    Main function to train the sentiment analysis model on the Amazon Reviews dataset.\n",
+ "\n",
+ "    Args:\n",
+ "        data_path (str): Path to the processed data file.\n",
+ "        model_name (str): Name of the transformer model to use.\n",
+ "        n_epochs (int): Maximum number of epochs for training.\n",
+ "        sample_size (int, optional): The number of reviews to use for training.\n",
+ "            If None, the full dataset is used.\n",
+ "    \"\"\"\n",
+ "    # --- 1. Hyperparameters ---\n",
+ "    BATCH_SIZE = 64\n",
+ "    MAX_TOKEN_LEN = 256\n",
+ "    LEARNING_RATE = 2e-5\n",
+ "    N_CLASSES = 2 # Negative, Positive\n",
+ "\n",
+ "    # --- 2. Initialize DataModule ---\n",
+ "    print(\"Initializing ReviewDataModule...\")\n",
+ "    review_datamodule = ReviewDataModule(\n",
+ "        data_path=data_path,\n",
+ "        batch_size=BATCH_SIZE,\n",
+ "        max_token_len=MAX_TOKEN_LEN,\n",
+ "        model_name=model_name,\n",
+ "        sample_size=sample_size # Pass the sample size to the datamodule\n",
+ "    )\n",
+ "    review_datamodule.setup()\n",
+ "\n",
+ "    n_training_steps = len(review_datamodule.train_dataloader()) * n_epochs\n",
+ "    n_warmup_steps = int(n_training_steps * 0.1)\n",
+ "\n",
+ "    # --- 3. Initialize Model ---\n",
+ "    print(\"Initializing SentimentClassifier model...\")\n",
+ "    model = SentimentClassifier(\n",
+ "        model_name=model_name,\n",
+ "        n_classes=N_CLASSES,\n",
+ "        learning_rate=LEARNING_RATE,\n",
+ "        n_warmup_steps=n_warmup_steps,\n",
+ "        n_training_steps=n_training_steps\n",
+ "    )\n",
+ "\n",
+ "    # --- 4. Configure Training Callbacks ---\n",
+ "    checkpoint_callback = ModelCheckpoint(\n",
+ "        dirpath=\"checkpoints\",\n",
+ "        filename=\"sentiment-binary-best-checkpoint\",\n",
+ "        save_top_k=1,\n",
+ "        verbose=True,\n",
+ "        monitor=\"val_loss\",\n",
+ "        mode=\"min\"\n",
+ "    )\n",
+ "    logger = TensorBoardLogger(\"lightning_logs\", name=\"sentiment-classifier-binary\")\n",
+ "    early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)\n",
+ "\n",
+ "    # --- 5. Initialize Trainer ---\n",
+ "    print(\"Initializing PyTorch Lightning Trainer...\")\n",
+ "    trainer = pl.Trainer(\n",
+ "        logger=logger,\n",
+ "        callbacks=[checkpoint_callback, early_stopping_callback],\n",
+ "        max_epochs=n_epochs,\n",
+ "        accelerator='gpu' if torch.cuda.is_available() else 'cpu',\n",
+ "        devices=1,\n",
+ "    )\n",
+ "\n",
+ "    # --- 6. Start Training ---\n",
+ "    print(f\"Starting training with {model_name} for up to {n_epochs} epochs...\")\n",
+ "    trainer.fit(model, review_datamodule)\n",
+ "\n",
+ "    # --- 7. Evaluate on Test Set and Generate Confusion Matrix ---\n",
+ "    print(\"\\nTraining complete. Evaluating on the test set...\")\n",
+ "    trainer.test(model, datamodule=review_datamodule)\n",
+ "\n",
+ "    # Predict over the test dataloader explicitly (the DataModule defines no predict_dataloader)\n",
+ "    predictions = trainer.predict(model, dataloaders=review_datamodule.test_dataloader())\n",
+ "    if predictions:\n",
+ "        all_preds = torch.cat(predictions).cpu().numpy()\n",
+ "        true_labels = review_datamodule.test_df.sentiment.to_numpy()\n",
+ "        target_names = ['Negative', 'Positive']\n",
+ "\n",
+ "        cm = confusion_matrix(true_labels, all_preds)\n",
+ "        plt.figure(figsize=(8, 6))\n",
+ "        sns.heatmap(cm, annot=True, fmt='d', cmap='YlGnBu',\n",
+ "                    xticklabels=target_names, yticklabels=target_names)\n",
+ "        plt.title('Confusion Matrix for Sentiment Analysis')\n",
+ "        plt.xlabel('Predicted Label')\n",
+ "        plt.ylabel('True Label')\n",
+ "        plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3dae58e3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#--- Train DistilBERT ---\n",
+ "train_sentiment_model(data_path=data_path, sample_size=100000)"
+ ]
+ },
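+ {
+ "cell_type": "markdown",
+ "id": "added-md-05",
+ "metadata": {},
+ "source": [
+ "*Added illustrative snippet (not part of the original notebook):* after training, the best weights can be restored from the file that `ModelCheckpoint` saved, using the `dirpath`/`filename` configured above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "added-code-05",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Reload the best checkpoint written by ModelCheckpoint during training.\n",
+ "best_ckpt = 'checkpoints/sentiment-binary-best-checkpoint.ckpt'\n",
+ "restored = SentimentClassifier.load_from_checkpoint(best_ckpt, map_location='cpu')\n",
+ "restored.eval()\n",
+ "print(restored.hparams)"
+ ]
+ },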
+ {
+ "cell_type": "markdown",
+ "id": "ddbc7315",
+ "metadata": {},
+ "source": [
+ "## Define the models"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "85bd352b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class ReviewSummarizer:\n",
+ "    \"\"\"\n",
+ "    A class to handle the summarization of product reviews using a pre-trained T5 model.\n",
+ "    \"\"\"\n",
+ "    def __init__(self, model_name='t5-small', force_cpu=False):\n",
+ "        \"\"\"\n",
+ "        Initializes the summarizer with a pre-trained T5 model and tokenizer.\n",
+ "\n",
+ "        Args:\n",
+ "            model_name (str): The name of the pre-trained T5 model to use.\n",
+ "            force_cpu (bool): If True, forces the model to run on the CPU.\n",
+ "        \"\"\"\n",
+ "        print(f\"Loading summarization model: {model_name}...\")\n",
+ "        self.model_name = model_name\n",
+ "        self.device = 'cpu' if force_cpu else ('cuda' if torch.cuda.is_available() else 'cpu')\n",
+ "\n",
+ "        # Load the tokenizer and model from Hugging Face\n",
+ "        self.tokenizer = T5Tokenizer.from_pretrained(self.model_name)\n",
+ "        self.model = T5ForConditionalGeneration.from_pretrained(self.model_name).to(self.device)\n",
+ "        print(\"Summarization model loaded successfully.\")\n",
+ "\n",
+ "    def summarize(self, text: str, max_length: int = 50, min_length: int = 10) -> str:\n",
+ "        \"\"\"\n",
+ "        Generates a summary for a given text.\n",
+ "\n",
+ "        Args:\n",
+ "            text (str): The review text to summarize.\n",
+ "            max_length (int): The maximum length of the generated summary.\n",
+ "            min_length (int): The minimum length of the generated summary.\n",
+ "\n",
+ "        Returns:\n",
+ "            str: The generated summary.\n",
+ "        \"\"\"\n",
+ "        if not text or not isinstance(text, str):\n",
+ "            return \"\"\n",
+ "\n",
+ "        # T5 models require a prefix for the task. For summarization, it's \"summarize: \"\n",
+ "        preprocess_text = f\"summarize: {text.strip()}\"\n",
+ "\n",
+ "        # Tokenize the input text\n",
+ "        tokenized_text = self.tokenizer.encode(preprocess_text, return_tensors=\"pt\").to(self.device)\n",
+ "\n",
+ "        # Generate the summary\n",
+ "        summary_ids = self.model.generate(\n",
+ "            tokenized_text,\n",
+ "            max_length=max_length,\n",
+ "            min_length=min_length,\n",
+ "            length_penalty=2.0,\n",
+ "            num_beams=4,\n",
+ "            early_stopping=True\n",
+ "        )\n",
+ "\n",
+ "        # Decode the summary and return it\n",
+ "        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)\n",
+ "        return summary\n",
+ "\n",
+ "class AspectAnalyzer:\n",
+ "    \"\"\"\n",
+ "    A class to handle Aspect-Based Sentiment Analysis (ABSA) using a pre-trained model.\n",
+ "    \"\"\"\n",
+ "    # Changed to a different, currently valid lightweight model for ABSA.\n",
+ "    def __init__(self, model_name='yangheng/deberta-v3-base-absa-v1.1', force_cpu=False):\n",
+ "        \"\"\"\n",
+ "        Initializes the ABSA pipeline with a pre-trained model.\n",
+ "\n",
+ "        Args:\n",
+ "            model_name (str): The name of the pre-trained ABSA model.\n",
+ "            force_cpu (bool): If True, forces the model to run on the CPU.\n",
+ "        \"\"\"\n",
+ "        print(f\"Loading Aspect-Based Sentiment Analysis model: {model_name}...\")\n",
+ "        self.model_name = model_name\n",
+ "\n",
+ "        if force_cpu:\n",
+ "            self.device = -1 # Use -1 for CPU in pipeline\n",
+ "            print(\"Forcing ABSA model to run on CPU.\")\n",
+ "        else:\n",
+ "            self.device = 0 if torch.cuda.is_available() else -1\n",
+ "\n",
+ "        print(f\"Using device: {self.device} (0 for GPU, -1 for CPU)\")\n",
+ "\n",
+ "        self.absa_pipeline = pipeline(\n",
+ "            \"text-classification\",\n",
+ "            model=self.model_name,\n",
+ "            tokenizer=self.model_name,\n",
+ "            device=self.device\n",
+ "        )\n",
+ "        print(\"ABSA model loaded successfully.\")\n",
+ "\n",
+ "    def analyze(self, text: str, aspects: list) -> dict:\n",
+ "        \"\"\"\n",
+ "        Analyzes the sentiment towards a list of aspects within a given text.\n",
+ "        \"\"\"\n",
+ "        if not text or not isinstance(text, str) or not aspects:\n",
+ "            return {}\n",
+ "\n",
+ "        # The model expects the review and aspect separated by a special token.\n",
+ "        # Note: Different ABSA models might expect different input formats.\n",
+ "        # This format is common but may need adjustment for other models.\n",
+ "        inputs = [f\"{text} [SEP] {aspect}\" for aspect in aspects]\n",
+ "        results = self.absa_pipeline(inputs)\n",
+ "\n",
+ "        # Process results into a user-friendly dictionary\n",
+ "        aspect_sentiments = {}\n",
+ "        for aspect, result in zip(aspects, results):\n",
+ "            aspect_sentiments[aspect] = {'sentiment': result['label'], 'score': result['score']}\n",
+ "\n",
+ "        return aspect_sentiments\n",
+ "\n",
+ "class FineTunedSentimentClassifier:\n",
+ "    \"\"\"\n",
+ "    This class handles loading the fine-tuned checkpoint and making predictions.\n",
+ "    \"\"\"\n",
+ "    def __init__(self, checkpoint_path, model_name='distilbert-base-uncased', force_cpu=False):\n",
+ "        self.device = 'cpu' if force_cpu else ('cuda' if torch.cuda.is_available() else 'cpu')\n",
+ "        print(f\"Loading fine-tuned sentiment model from checkpoint: {checkpoint_path}...\")\n",
+ "        print(f\"Using device: {self.device}\")\n",
+ "\n",
+ "        self.model = SentimentClassifier.load_from_checkpoint(checkpoint_path, map_location=self.device)\n",
+ "        self.model.to(self.device)\n",
+ "        self.model.eval() # Set model to evaluation mode\n",
+ "\n",
+ "        self.tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+ "        self.labels = ['NEGATIVE', 'POSITIVE']\n",
+ "        print(\"Fine-tuned sentiment model loaded successfully.\")\n",
+ "\n",
+ "    def classify(self, text: str) -> dict:\n",
+ "        encoding = self.tokenizer.encode_plus(\n",
+ "            text, add_special_tokens=True, max_length=128,\n",
+ "            return_token_type_ids=False, padding=\"max_length\",\n",
+ "            truncation=True, return_attention_mask=True, return_tensors='pt',\n",
+ "        )\n",
+ "        input_ids = encoding[\"input_ids\"].to(self.device)\n",
+ "        attention_mask = encoding[\"attention_mask\"].to(self.device)\n",
+ "        with torch.no_grad():\n",
+ "            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)\n",
+ "        logits = outputs.logits\n",
+ "        probabilities = torch.softmax(logits, dim=1)\n",
+ "        prediction_idx = torch.argmax(probabilities, dim=1).item()\n",
+ "        return {'label': self.labels[prediction_idx], 'score': probabilities[0][prediction_idx].item()}\n",
+ "\n",
+ "class AspectExtractor:\n",
+ "    \"\"\"\n",
+ "    This class uses a Part-of-Speech (POS) tagging model to first extract all\n",
+ "    potential aspect terms (nouns) from a review text. It then filters these\n",
+ "    nouns against a pre-defined dictionary of valid aspects for a given\n",
+ "    product category to return only the relevant features.\n",
+ "    \"\"\"\n",
+ "    def __init__(self, model_name=\"vblagoje/bert-english-uncased-finetuned-pos\", force_cpu=False):\n",
+ "        self.model_name = model_name\n",
+ "        self.device = 'cpu' if force_cpu else ('cuda' if torch.cuda.is_available() else 'cpu')\n",
+ "        print(f\"Loading Part-of-Speech (POS) tagging model: {self.model_name}...\")\n",
+ "        print(f\"Using device: {self.device}\")\n",
+ "\n",
+ "        self.pipeline = pipeline(\n",
+ "            \"token-classification\",\n",
+ "            model=self.model_name,\n",
+ "            device=-1 if self.device == 'cpu' else 0,\n",
+ "            aggregation_strategy=\"simple\"\n",
+ "        )\n",
+ "        print(\"POS tagging model loaded successfully.\")\n",
+ "\n",
+ "    def extract(self, text: str, aspect_dictionary: list) -> list:\n",
+ "        \"\"\"\n",
+ "        Extracts aspects from the given text that are present in the provided\n",
+ "        aspect dictionary.\n",
+ "\n",
+ "        Args:\n",
+ "            text (str): The review text to analyze.\n",
+ "            aspect_dictionary (list): A list of valid, known aspects for the\n",
+ "                product category.\n",
+ "\n",
+ "        Returns:\n",
+ "            list: A list of aspects that were both found in the text and are\n",
+ "            present in the aspect dictionary.\n",
+ "        \"\"\"\n",
+ "        if not text or not aspect_dictionary:\n",
+ "            return []\n",
+ "\n",
+ "        # 1. Extract all nouns from the text using the POS model\n",
+ "        model_outputs = self.pipeline(text)\n",
+ "        noun_tags = {'NOUN', 'PROPN'}\n",
+ "        extracted_nouns = {\n",
+ "            output['word'].lower() for output in model_outputs\n",
+ "            if output['entity_group'] in noun_tags\n",
+ "        }\n",
+ "\n",
+ "        # 2. Filter the extracted nouns against the provided dictionary\n",
+ "        # We find the intersection between the two sets.\n",
+ "        valid_aspects = {aspect.lower() for aspect in aspect_dictionary}\n",
+ "\n",
+ "        final_aspects = list(extracted_nouns.intersection(valid_aspects))\n",
+ "\n",
+ "        return final_aspects\n"
+ ]
+ },
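+ {
+ "cell_type": "markdown",
+ "id": "added-md-06",
+ "metadata": {},
+ "source": [
+ "*Added illustrative usage (not part of the original notebook):* a quick pass over one review with the three pre-trained components. The checkpoint-based classifier is skipped here because it requires a trained checkpoint; the demo variable names are assumptions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "added-code-06",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Illustrative usage of the summarizer, aspect extractor, and aspect analyzer.\n",
+ "demo_review = (\"The camera is incredible, but the battery life is a disaster. \"\n",
+ "               \"The screen is bright and responsive.\")\n",
+ "\n",
+ "demo_summarizer = ReviewSummarizer(force_cpu=True)\n",
+ "print(demo_summarizer.summarize(demo_review))\n",
+ "\n",
+ "demo_extractor = AspectExtractor(force_cpu=True)\n",
+ "aspects = demo_extractor.extract(demo_review, ['camera', 'battery life', 'screen', 'price'])\n",
+ "print(aspects)  # e.g. ['camera', 'screen']; multi-word aspects may not match single nouns\n",
+ "\n",
+ "demo_analyzer = AspectAnalyzer(force_cpu=True)\n",
+ "print(demo_analyzer.analyze(demo_review, aspects))"
+ ]
+ },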
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6fc21c8b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# --- Configuration ---\n",
+ "# --- IMPORTANT: UPDATE THIS PATH ---\n",
+ "# You need to provide the path to the best checkpoint file that was saved\n",
+ "# during the training of your sentiment model.\n",
+ "SENTIMENT_CHECKPOINT_PATH = \"checkpoints/sentiment-binary-best-checkpoint.ckpt\"\n",
+ "\n",
+ "# --- Pre-defined Aspect Dictionaries for Different Product Categories ---\n",
+ "ASPECT_DICTIONARIES = {\n",
+ "    \"Phone\": ['camera', 'battery', 'battery life', 'screen', 'performance', 'price', 'design'],\n",
+ "    \"Coffee Maker\": ['ease of use', 'design', 'noise level', 'coffee quality', 'brew time', 'cleaning'],\n",
+ "    \"Book\": ['plot', 'characters', 'writing style', 'pacing', 'ending'],\n",
+ "    \"Default\": ['quality', 'price', 'service', 'design', 'features'] # A fallback list\n",
+ "}\n",
+ "\n",
+ "def main():\n",
+ "    \"\"\"\n",
+ "    Main function to run the command-line review analysis tool.\n",
+ "    \"\"\"\n",
+ "    # --- 1. Load All Models ---\n",
+ "    print(\"--- Initializing all models ---\")\n",
+ "    sentiment_classifier, summarizer, aspect_analyzer, aspect_extractor = None, None, None, None\n",
+ "    try:\n",
+ "        summarizer = ReviewSummarizer(force_cpu=True)\n",
+ "        aspect_analyzer = AspectAnalyzer(force_cpu=True)\n",
+ "        aspect_extractor = AspectExtractor(force_cpu=True)\n",
+ "\n",
+ "        if not os.path.exists(SENTIMENT_CHECKPOINT_PATH):\n",
+ "            print(\"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\")\n",
+ "            print(\"!!! WARNING: Sentiment checkpoint path not found or not set.         !!!\")\n",
+ "            print(\"!!! Please update the 'SENTIMENT_CHECKPOINT_PATH' variable above.    !!!\")\n",
+ "            print(\"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\")\n",
+ "        else:\n",
+ "            sentiment_classifier = FineTunedSentimentClassifier(\n",
+ "                checkpoint_path=SENTIMENT_CHECKPOINT_PATH, force_cpu=True\n",
+ "            )\n",
+ "        print(\"\\n--- All models loaded successfully ---\\n\")\n",
+ "    except Exception as e:\n",
+ "        print(f\"An error occurred during model initialization: {e}\")\n",
+ "        return\n",
+ "\n",
+ "    # --- 2. Interactive Loop ---\n",
+ "    while True:\n",
+ "        print(\"\\n==================================================\")\n",
+ "        print(\"          Product Review Analysis Tool            \")\n",
+ "        print(\"==================================================\")\n",
+ "\n",
+ "        # Get user input\n",
+ "        review_text = input(\"Enter the product review text (or type 'quit' to exit):\\n> \")\n",
+ "        if review_text.lower() == 'quit':\n",
+ "            break\n",
+ "\n",
+ "        print(\"\\nAvailable Product Categories:\")\n",
+ "        for i, category in enumerate(ASPECT_DICTIONARIES.keys(), 1):\n",
+ "            print(f\"{i}. {category}\")\n",
+ "\n",
+ "        category_choice = input(f\"Select a product category (1-{len(ASPECT_DICTIONARIES)}):\\n> \")\n",
+ "        try:\n",
+ "            category_idx = int(category_choice) - 1\n",
+ "            product_category = list(ASPECT_DICTIONARIES.keys())[category_idx]\n",
+ "        except (ValueError, IndexError):\n",
+ "            print(\"Invalid choice. Using 'Default' category.\")\n",
+ "            product_category = \"Default\"\n",
+ "\n",
+ "        # --- 3. Run Analysis ---\n",
+ "        print(\"\\n--- Analyzing Review... ---\")\n",
+ "\n",
+ "        # a. Overall Sentiment (guarded: the classifier is None when no checkpoint was found)\n",
+ "        if sentiment_classifier:\n",
+ "            sentiment_result = sentiment_classifier.classify(review_text)\n",
+ "        else:\n",
+ "            sentiment_result = {'label': 'UNAVAILABLE', 'score': 0.0}\n",
+ "\n",
+ "        # b. Summary\n",
+ "        summary_result = summarizer.summarize(review_text)\n",
+ "\n",
+ "        # c. Aspect Extraction and Analysis\n",
+ "        aspect_dictionary = ASPECT_DICTIONARIES.get(product_category)\n",
+ "        extracted_aspects = aspect_extractor.extract(review_text, aspect_dictionary)\n",
+ "        aspect_results = None\n",
+ "        if extracted_aspects:\n",
+ "            aspect_results = aspect_analyzer.analyze(review_text, extracted_aspects)\n",
+ "\n",
+ "        # --- 4. Display Results ---\n",
+ "        print(\"\\n-------------------- ANALYSIS RESULTS --------------------\")\n",
+ "        print(\"\\n[ Overall Sentiment ]\")\n",
+ "        print(f\"  - Sentiment: {sentiment_result['label']} (Score: {sentiment_result['score']:.2f})\")\n",
+ "\n",
+ "        print(\"\\n[ Generated Summary ]\")\n",
+ "        print(f\"  - {summary_result}\")\n",
+ "\n",
+ "        print(\"\\n[ Detected Aspect Sentiments ]\")\n",
+ "        if aspect_results:\n",
+ "            for aspect, result in aspect_results.items():\n",
+ "                print(f\"  - {aspect.title()}: {result['sentiment']} (Score: {result['score']:.2f})\")\n",
+ "        else:\n",
+ "            print(\"  - No relevant aspects from the dictionary were detected in the review.\")\n",
+ "        print(\"----------------------------------------------------------\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "71257428",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# --- Run the workflow ---\n",
+ "main()"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
requirements.txt CHANGED
@@ -1,3 +1,4 @@
+ <<<<<<< HEAD
langchain==0.3.27
langchain-community==0.3.31
gradio==5.49.1
@@ -13,4 +14,17 @@ datasets==4.0.0
numpy==2.0.2
accelerate==1.11.0
aiohttp==3.13.1
- huggingface-hub==0.35.3
+ huggingface-hub==0.35.3
+ =======
+ torch==2.8.0
+ transformers==4.56.1
+ pytorch-lightning==2.5.5
+ torchmetrics==1.8.2
+ sentencepiece==0.2.1
+ pandas==2.2.2
+ scikit-learn==1.6.1
+ gradio==5.44.1
+ matplotlib==3.10.0
+ seaborn==0.13.2
+ wordcloud==1.9.4
+ >>>>>>> e6de3c4338f79386345fa6e4bba5b0666ad808da
scripts/app.py CHANGED
@@ -1,3 +1,4 @@
+ <<<<<<< HEAD
# app.py

import gradio as gr
@@ -277,4 +278,166 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
# --- Launch Command ---
if __name__ == "__main__":
    chat_memory.clear() # Clear memory each time app starts
-     demo.launch(debug=True)
+     demo.launch(debug=True)
+ =======
+ import gradio as gr
+ import os
+ import torch
+ import pandas as pd
+ import re
+
+ # --- IMPORTANT ---
+ # This script assumes you have a 'models.py' file in the same directory
+ # containing the definitions for all model and inference classes.
+ try:
+     from models import (
+         ReviewSummarizer,
+         AspectAnalyzer,
+         AspectExtractor,
+         FineTunedSentimentClassifier
+     )
+ except ImportError:
+     print("CRITICAL ERROR: Make sure 'models.py' exists and contains the required classes.")
+     # Define dummy classes if imports fail, so Gradio can at least launch with an error message.
+     class ReviewSummarizer: pass
+     class AspectAnalyzer: pass
+     class AspectExtractor: pass
+     class FineTunedSentimentClassifier: pass
+
+ # --- Configuration ---
+ # --- IMPORTANT: UPDATE THIS PATH ---
+ # You need to provide the path to the best checkpoint file that was saved
+ # during the training of your sentiment model.
+ SENTIMENT_CHECKPOINT_PATH = "checkpoints/sentiment-binary-best-checkpoint.ckpt" # <-- CHANGE THIS
+
+ # --- Pre-defined Aspect Dictionaries for Different Product Categories ---
+ ASPECT_DICTIONARIES = {
+     "Phone": ['camera', 'battery', 'battery life', 'screen', 'performance', 'price', 'design'],
+     "Coffee Maker": ['ease of use', 'design', 'noise level', 'coffee quality', 'brew time', 'cleaning'],
+     "Book": ['plot', 'characters', 'writing style', 'pacing', 'ending'],
+     "Default": ['quality', 'price', 'service', 'design', 'features'] # A fallback list
+ }
+
+
+ # --- 1. Load All Models (Global Objects) ---
+ print("--- Initializing all models for the Gradio App ---")
+ sentiment_classifier, summarizer, aspect_analyzer, aspect_extractor = None, None, None, None
+ try:
+     summarizer = ReviewSummarizer(force_cpu=True)
+     aspect_analyzer = AspectAnalyzer(force_cpu=True)
+     aspect_extractor = AspectExtractor(force_cpu=True)
+
+     if not os.path.exists(SENTIMENT_CHECKPOINT_PATH):
+         print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+         print("!!! WARNING: Sentiment checkpoint path not found or not set.         !!!")
+         print("!!! Please update the 'SENTIMENT_CHECKPOINT_PATH' variable in app.py !!!")
+         print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+     else:
+         sentiment_classifier = FineTunedSentimentClassifier(
+             checkpoint_path=SENTIMENT_CHECKPOINT_PATH, force_cpu=True
+         )
+     print("\n--- All models loaded successfully ---\n")
+ except Exception as e:
+     print(f"An error occurred during model initialization: {e}")
+
+
+ # --- 2. Define the Core Analysis Function ---
+ def analyze_review(review_text, product_category):
+     if not review_text:
+         return {"ERROR": "Please enter a review."}, "", None
+
+     # --- a. Overall Sentiment Analysis ---
+     if sentiment_classifier:
+         sentiment_result = sentiment_classifier.classify(review_text)
+         sentiment_output = {
+             sentiment_result['label']: f"{sentiment_result['score']:.2f}"
+         }
+     else:
+         sentiment_output = {"ERROR": "Fine-tuned model not loaded. Check path."}
+
+     # --- b. Review Summarization ---
+     if summarizer:
+         summary_output = summarizer.summarize(review_text)
+     else:
+         summary_output = "ERROR: Summarizer model not loaded."
+
+     # --- c. Dynamic Aspect Extraction & Analysis ---
+     aspect_df = None
+     if aspect_extractor and aspect_analyzer:
+         aspect_dictionary = ASPECT_DICTIONARIES.get(product_category, ASPECT_DICTIONARIES["Default"])
+         extracted_aspects = aspect_extractor.extract(review_text, aspect_dictionary=aspect_dictionary)
+
+         if extracted_aspects:
+             aspect_results = aspect_analyzer.analyze(review_text, extracted_aspects)
+             aspect_df = pd.DataFrame([
+                 {'Aspect': aspect, 'Sentiment': result['sentiment'], 'Score': f"{result['score']:.2f}"}
+                 for aspect, result in aspect_results.items()
+             ])
+
+     return sentiment_output, summary_output, aspect_df
+
+
+ # --- 3. Build the Gradio Interface ---
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 🛍️ ReviewSense: Product Review Analysis Engine")
+     gr.Markdown(
+         "Enter a product review and select the product category. The tool will automatically "
+         "detect relevant features and provide an overall sentiment score, a summary, and a "
+         "breakdown of sentiment towards each feature."
+     )
+
+     with gr.Row():
+         with gr.Column(scale=2):
+             review_input = gr.Textbox(
+                 lines=10,
+                 label="Enter Product Review Here",
+                 placeholder="e.g., The camera is amazing, but the battery life is terrible..."
+             )
+             category_input = gr.Dropdown(
+                 choices=list(ASPECT_DICTIONARIES.keys()),
+                 label="Select Product Category",
+                 value="Phone"
+             )
+             analyze_button = gr.Button("Analyze Review", variant="primary")
+
+         with gr.Column(scale=1):
+             gr.Markdown("### Overall Sentiment")
+             sentiment_output = gr.Label()
+
+             gr.Markdown("### Generated Summary")
+             summary_output = gr.Textbox(lines=5, label="Summary", interactive=False)
+
+             gr.Markdown("### Detected Aspect Sentiments")
+             aspect_output = gr.DataFrame(headers=["Aspect", "Sentiment", "Score"], label="Aspects", interactive=False)
+
+     # Connect the button to the function
+     analyze_button.click(
+         fn=analyze_review,
+         inputs=[review_input, category_input],
+         outputs=[sentiment_output, summary_output, aspect_output]
+     )
+
+     gr.Examples(
+         examples=[
+             [
+                 "The camera on this phone is incredible, the pictures are professional quality. However, the battery life is a total disaster, it barely lasts half a day with light use. The screen is bright and responsive, which I love.",
+                 "Phone"
+             ],
+             [
+                 "I am absolutely in love with this coffee maker! It's incredibly easy to use, brews a perfect cup every single time, and the design looks fantastic on my countertop. It's also surprisingly quiet.",
+                 "Coffee Maker"
+             ],
+             [
+                 "An amazing story with characters that felt so real. The plot had me hooked from the first page, though I felt the ending was a bit rushed.",
+                 "Book"
+             ]
+         ],
+         inputs=[review_input, category_input]
+     )
+
+
+ # --- 4. Launch the App ---
+ if __name__ == "__main__":
+     print("Launching Gradio App...")
+     demo.launch()
+ >>>>>>> e6de3c4338f79386345fa6e4bba5b0666ad808da
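
Since analyze_review is a plain function, it can also be exercised without the UI once the global models have loaded (an illustrative call; it assumes the checkpoint path above is valid):

    sentiment, summary, aspects = analyze_review(
        "The camera is amazing, but the battery life is terrible.", "Phone"
    )
    print(sentiment)  # e.g. {'NEGATIVE': '0.97'}, per the dict built in analyze_review
    print(summary)    # short T5-generated summary
    print(aspects)    # DataFrame of per-aspect sentiment, or None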
scripts/data_prepare.py ADDED
@@ -0,0 +1,263 @@
1
+ import pytorch_lightning as pl
2
+ from torch.utils.data import DataLoader, Dataset
3
+ from transformers import AutoTokenizer
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+ import torch
7
+ import os
8
+
9
+ def explore_and_preprocess_reviews(
10
+ train_path='data/train.csv',
11
+ test_path='data/test.csv',
12
+ output_dir='data'
13
+ ):
14
+ """
15
+ Loads the Amazon Sentiment Analysis dataset (https://www.kaggle.com/datasets/kritanjalijain/amazon-reviews)
16
+ (you need to extract the train/test splits from the zip file in the data folder),
17
+ performs basic EDA, and preprocesses it for model training.
18
+
19
+ Args:
20
+ train_path (str): Path to the training CSV file.
21
+ test_path (str): Path to the testing CSV file.
22
+ output_dir (str): Directory to save the processed file.
23
+ """
24
+ # --- 1. Load Data ---
25
+ # This dataset typically comes without headers. We'll assign them.
26
+ # Column 1: Sentiment (1 = Negative, 2 = Positive)
27
+ # Column 2: Title
28
+ # Column 3: Review Text
29
+ print(f"Loading data from '{train_path}' and '{test_path}'...")
30
+ try:
31
+ col_names = ['sentiment_orig', 'title', 'review']
32
+ train_df = pd.read_csv(train_path, header=None, names=col_names)
33
+ test_df = pd.read_csv(test_path, header=None, names=col_names)
34
+
35
+ # Combine for unified EDA and preprocessing
36
+ df = pd.concat([train_df, test_df], ignore_index=True)
37
+
38
+ except FileNotFoundError:
39
+ print(f"\nERROR: Make sure '{train_path}' and '{test_path}' are in the specified directory.")
40
+ print("This script is designed for the 'Amazon Reviews for Sentiment Analysis' dataset from Kaggle.")
41
+ return
42
+
43
+ df.dropna(inplace=True)
44
+
45
+ # --- 2. Preprocessing ---
46
+ print("\n--- Preprocessing Data for Sentiment Analysis ---")
47
+
48
+ # a) Create new sentiment labels (0 = Negative, 1 = Positive)
49
+ # This dataset is binary, not three-class like the previous one.
50
+ df['sentiment'] = df['sentiment_orig'].apply(lambda x: 0 if x == 1 else 1)
51
+
52
+ # b) Combine title and review body
53
+ df['full_text'] = df['title'].astype(str) + ". " + df['review'].astype(str)
54
+
55
+ # c) Select and rename columns
56
+ processed_df = df[['full_text', 'sentiment']].copy()
57
+
58
+ # --- 3. Save Processed Data ---
59
+ os.makedirs(output_dir, exist_ok=True)
60
+ output_path = os.path.join(output_dir, 'reviews_processed.csv')
61
+ processed_df.to_csv(output_path, index=False)
62
+ print(f"\nSaved {len(processed_df)} processed reviews to '{output_path}'")
63
+
64
+ class ReviewDataset(Dataset):
65
+ """
66
+ Custom PyTorch Dataset for Amazon Reviews.
67
+
68
+ This class takes a pandas DataFrame of review data, a tokenizer, and a max
69
+ token length, and prepares it for use in a PyTorch model. It handles the
70
+ tokenization of the text and the formatting of the labels for each item.
71
+
72
+ Attributes:
73
+ tokenizer: The Hugging Face tokenizer to use for processing text.
74
+ data (pd.DataFrame): The DataFrame containing the review data.
75
+ max_token_len (int): The maximum sequence length for the tokenizer.
76
+ """
77
+ def __init__(self, data: pd.DataFrame, tokenizer, max_token_len: int):
78
+ """
79
+ Initializes the ReviewDataset.
80
+
81
+ Args:
82
+ data (pd.DataFrame): The input DataFrame containing 'full_text' and
83
+ 'sentiment' columns.
84
+ tokenizer: The pre-trained tokenizer instance.
85
+ max_token_len (int): The maximum length for tokenized sequences.
86
+ """
87
+ self.tokenizer = tokenizer
88
+ self.data = data
89
+ self.max_token_len = max_token_len
90
+
91
+ def __len__(self):
92
+ """
93
+ Returns the total number of samples in the dataset.
94
+ """
95
+ return len(self.data)
96
+
97
+ def __getitem__(self, index: int):
98
+ """
99
+ Retrieves one sample from the dataset at the specified index.
100
+
101
+ This method handles the tokenization of a single review text, including
102
+ padding and truncation, and formats the output into a dictionary of
103
+ tensors ready for the model.
104
+
105
+ Args:
106
+ index (int): The index of the data sample to retrieve.
107
+
108
+ Returns:
109
+ dict: A dictionary containing the tokenized inputs and the label,
110
+ with the following keys:
111
+ - 'input_ids': The token IDs of the review text.
112
+ - 'attention_mask': The attention mask for the review text.
113
+ - 'labels': The sentiment label as a tensor.
114
+ """
115
+ data_row = self.data.iloc[index]
116
+ text = str(data_row.full_text)
117
+ labels = data_row.sentiment
118
+
119
+ encoding = self.tokenizer.encode_plus(
120
+ text,
121
+ add_special_tokens=True,
122
+ max_length=self.max_token_len,
123
+ return_token_type_ids=False,
124
+ padding="max_length",
125
+ truncation=True,
126
+ return_attention_mask=True,
127
+ return_tensors='pt',
128
+ )
129
+
130
+ return dict(
131
+ input_ids=encoding["input_ids"].flatten(),
132
+ attention_mask=encoding["attention_mask"].flatten(),
133
+ labels=torch.tensor(labels, dtype=torch.long)
134
+ )
135
+
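+ # Minimal usage sketch for ReviewDataset (illustrative only; the example row
+ # below is hypothetical, not from the dataset):
+ #   tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
+ #   sample_df = pd.DataFrame({'full_text': ['Great blender. Crushes ice easily.'], 'sentiment': [1]})
+ #   item = ReviewDataset(sample_df, tokenizer, max_token_len=32)[0]
+ #   # item['input_ids'] and item['attention_mask'] have shape (32,);
+ #   # item['labels'] is a 0-d long tensor.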
136
+ class ReviewDataModule(pl.LightningDataModule):
137
+ """
138
+ PyTorch Lightning DataModule to handle the Amazon Reviews dataset.
139
+
140
+ This class encapsulates all the steps needed to process the data:
141
+ loading, splitting, and creating PyTorch DataLoaders for training,
142
+ validation, and testing. It allows for using a smaller random sample of the
143
+ full dataset for faster experimentation.
144
+
145
+ Attributes:
146
+ data_path (str): Path to the processed CSV file.
147
+ batch_size (int): The size of each data batch.
148
+ max_token_len (int): The maximum sequence length for the tokenizer.
149
+ tokenizer: The Hugging Face tokenizer instance.
150
+ num_workers (int): The number of CPU cores to use for data loading.
151
+ sample_size (int, optional): The number of samples to use. If None,
152
+ the full dataset is used.
153
+ """
154
+ def __init__(self, data_path: str, batch_size: int = 16, max_token_len: int = 256, model_name='distilbert-base-uncased', num_workers: int = 0, sample_size: int = None):
155
+ """
156
+ Initializes the ReviewDataModule.
157
+
158
+ Args:
159
+ data_path (str): The path to the processed CSV data file.
160
+ batch_size (int): The number of samples per batch.
161
+ max_token_len (int): Maximum length of tokenized sequences.
162
+ model_name (str): The name of the pre-trained model to use for the tokenizer.
163
+ num_workers (int): Number of subprocesses to use for data loading.
164
+ sample_size (int, optional): If specified, a random sample of this
165
+ size will be used from the dataset.
166
+ Defaults to None, which uses the full dataset.
167
+ """
168
+ super().__init__()
169
+ self.data_path = data_path
170
+ self.batch_size = batch_size
171
+ self.max_token_len = max_token_len
172
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
173
+ self.num_workers = num_workers
174
+ self.sample_size = sample_size
175
+ self.train_df = None
176
+ self.val_df = None
177
+ self.test_df = None
178
+
179
+ def setup(self, stage=None):
180
+ """
181
+ Loads and splits the data for training, validation, and testing.
182
+
183
+ This method is called by PyTorch Lightning. It reads the CSV, handles
184
+ missing values, optionally takes a random sample, and performs a
185
+ stratified train-validation-test split. The indices of the resulting
186
+ DataFrames are reset to prevent potential KeyErrors during data loading.
187
+ """
188
+ df = pd.read_csv(self.data_path)
189
+ df.dropna(inplace=True)
190
+
191
+ # If a sample size is provided, sample the dataframe
192
+ if self.sample_size:
193
+ print(f"Using a sample of {self.sample_size} reviews.")
194
+ df = df.sample(n=self.sample_size, random_state=42)
195
+
196
+ # Stratified split to maintain label distribution
197
+ train_val_df, self.test_df = train_test_split(df, test_size=0.1, random_state=42, stratify=df.sentiment)
198
+ self.train_df, self.val_df = train_test_split(train_val_df, test_size=0.1, random_state=42, stratify=train_val_df.sentiment)
199
+
200
+ # Reset indices to prevent KeyErrors
201
+ self.train_df = self.train_df.reset_index(drop=True)
202
+ self.val_df = self.val_df.reset_index(drop=True)
203
+ self.test_df = self.test_df.reset_index(drop=True)
204
+
205
+ print(f"Size of training set: {len(self.train_df)}")
206
+ print(f"Size of validation set: {len(self.val_df)}")
207
+ print(f"Size of test set: {len(self.test_df)}")
208
+
209
+ def train_dataloader(self):
210
+ """Returns the DataLoader for the training set."""
211
+ return DataLoader(
212
+ ReviewDataset(self.train_df, self.tokenizer, self.max_token_len),
213
+ batch_size=self.batch_size,
214
+ shuffle=True,
215
+ num_workers=self.num_workers
216
+ )
217
+
218
+ def val_dataloader(self):
219
+ """Returns the DataLoader for the validation set."""
220
+ return DataLoader(
221
+ ReviewDataset(self.val_df, self.tokenizer, self.max_token_len),
222
+ batch_size=self.batch_size,
223
+ num_workers=self.num_workers
224
+ )
225
+
226
+ def test_dataloader(self):
227
+ """Returns the DataLoader for the test set."""
228
+ return DataLoader(
229
+ ReviewDataset(self.test_df, self.tokenizer, self.max_token_len),
230
+ batch_size=self.batch_size,
231
+ num_workers=self.num_workers
232
+ )
233
+
234
+ if __name__ == "__main__":
235
+
236
+ #--- Step 1: Preprocess the Reviews Dataset ---
237
+ print("\n--- Preprocessing started ---")
238
+ explore_and_preprocess_reviews()
239
+ print("\n--- Preprocessing finished ---")
240
+ # --- Configuration ---
241
+ data_path = "data/reviews_processed.csv"
242
+ BATCH_SIZE = 64
243
+ MAX_TOKEN_LEN = 256
244
+
245
+ print("Initializing ReviewDataModule...")
246
+ review_datamodule = ReviewDataModule(
247
+ data_path=data_path,
248
+ batch_size=BATCH_SIZE,
249
+ max_token_len=MAX_TOKEN_LEN,
250
+ model_name='distilbert-base-uncased',
251
+ sample_size=100000 # Pass the sample size to the datamodule
252
+ )
253
+ review_datamodule.setup()
254
+
255
+ # Fetch one batch from the training dataloader to inspect its contents
256
+ print("\n--- Fetching one batch from the training dataloader ---")
257
+ train_batch = next(iter(review_datamodule.train_dataloader()))
258
+
259
+ print("\n--- Example Batch ---")
260
+ print(f"Input IDs shape: {train_batch['input_ids'].shape}")
261
+ print(f"Attention Mask shape: {train_batch['attention_mask'].shape}")
262
+ print(f"Labels: {train_batch['labels']}")
263
+ print(f"Labels shape: {train_batch['labels'].shape}")
scripts/main.py CHANGED
@@ -1,3 +1,4 @@
 
1
  # main.py
2
 
3
  import torch
@@ -209,4 +210,115 @@ if __name__ == "__main__":
209
  break
210
  print("\n--- Chat session ended. ---")
211
 
212
- print("\n--- Local Execution Finished ---")
1
+ <<<<<<< HEAD
2
  # main.py
3
 
4
  import torch
 
210
  break
211
  print("\n--- Chat session ended. ---")
212
 
213
+ print("\n--- Local Execution Finished ---")
214
+ =======
215
+ import os
216
+ import torch
217
+ import pandas as pd
218
+
219
+ try:
220
+ from data_prepare import ReviewDataset, ReviewDataModule
221
+ from models import SentimentClassifier, ReviewSummarizer, AspectAnalyzer, FineTunedSentimentClassifier, AspectExtractor
222
+ except ImportError:
223
+ print("CRITICAL ERROR: Make sure 'data_prepare.py' and 'models.py' are in the same directory.")
224
+ exit()
225
+
226
+ # --- Configuration ---
227
+ # --- IMPORTANT: UPDATE THIS PATH ---
228
+ # You need to provide the path to the best checkpoint file that was saved
229
+ # during the training of your sentiment model.
230
+ SENTIMENT_CHECKPOINT_PATH = "checkpoints/sentiment-binary-best-checkpoint.ckpt"
231
+
232
+ # --- Pre-defined Aspect Dictionaries for Different Product Categories ---
233
+ ASPECT_DICTIONARIES = {
234
+ "Phone": ['camera', 'battery', 'battery life', 'screen', 'performance', 'price', 'design'],
235
+ "Coffee Maker": ['ease of use', 'design', 'noise level', 'coffee quality', 'brew time', 'cleaning'],
236
+ "Book": ['plot', 'characters', 'writing style', 'pacing', 'ending'],
237
+ "Default": ['quality', 'price', 'service', 'design', 'features'] # A fallback list
238
+ }
239
+
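+ # A defensive lookup sketch (assumption: a caller might pass an unknown category):
+ #   aspects = ASPECT_DICTIONARIES.get(category, ASPECT_DICTIONARIES["Default"])
+ # main() below guarantees a valid key, so a plain .get(product_category) suffices there.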
240
+ def main():
241
+ """
242
+ Main function to run the command-line review analysis tool.
243
+ """
244
+ # --- 1. Load All Models ---
245
+ print("--- Initializing all models ---")
246
+ sentiment_classifier, summarizer, aspect_analyzer, aspect_extractor = None, None, None, None
247
+ try:
248
+ summarizer = ReviewSummarizer(force_cpu=True)
249
+ aspect_analyzer = AspectAnalyzer(force_cpu=True)
250
+ aspect_extractor = AspectExtractor(force_cpu=True)
251
+
252
+ if not os.path.exists(SENTIMENT_CHECKPOINT_PATH):
253
+ print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
254
+ print("!!! WARNING: Sentiment checkpoint path not found or not set. !!!")
255
+ print("!!! Please update the 'SENTIMENT_CHECKPOINT_PATH' variable in main.py !!!")
256
+ print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
257
+ else:
258
+ sentiment_classifier = FineTunedSentimentClassifier(
259
+ checkpoint_path=SENTIMENT_CHECKPOINT_PATH, force_cpu=True
260
+ )
261
+ print("\n--- All models loaded successfully ---\n")
262
+ except Exception as e:
263
+ print(f"An error occurred during model initialization: {e}")
264
+ return
265
+
266
+ # --- 2. Interactive Loop ---
267
+ while True:
268
+ print("\n==================================================")
269
+ print(" Product Review Analysis Tool ")
270
+ print("==================================================")
271
+
272
+ # Get user input
273
+ review_text = input("Enter the product review text (or type 'quit' to exit):\n> ")
274
+ if review_text.lower() == 'quit':
275
+ break
276
+
277
+ print("\nAvailable Product Categories:")
278
+ for i, category in enumerate(ASPECT_DICTIONARIES.keys(), 1):
279
+ print(f"{i}. {category}")
280
+
281
+ category_choice = input(f"Select a product category (1-{len(ASPECT_DICTIONARIES)}):\n> ")
282
+ try:
283
+ category_idx = int(category_choice) - 1
284
+ product_category = list(ASPECT_DICTIONARIES.keys())[category_idx]
285
+ except (ValueError, IndexError):
286
+ print("Invalid choice. Using 'Default' category.")
287
+ product_category = "Default"
288
+
289
+ # --- 3. Run Analysis ---
290
+ print("\n--- Analyzing Review... ---")
291
+
292
+ # a. Overall Sentiment
293
+ sentiment_result = sentiment_classifier.classify(review_text) if sentiment_classifier else {'label': 'UNAVAILABLE', 'score': 0.0}  # guard: classifier is None when the checkpoint is missing
294
+
295
+ # b. Summary
296
+ summary_result = summarizer.summarize(review_text)
297
+
298
+ # c. Aspect Extraction and Analysis
299
+ aspect_dictionary = ASPECT_DICTIONARIES.get(product_category)
300
+ extracted_aspects = aspect_extractor.extract(review_text, aspect_dictionary)
301
+ aspect_results = None
302
+ if extracted_aspects:
303
+ aspect_results = aspect_analyzer.analyze(review_text, extracted_aspects)
304
+
305
+ # --- 4. Display Results ---
306
+ print("\n-------------------- ANALYSIS RESULTS --------------------")
307
+ print(f"\n[ Overall Sentiment ]")
308
+ print(f" - Sentiment: {sentiment_result['label']} (Score: {sentiment_result['score']:.2f})")
309
+
310
+ print(f"\n[ Generated Summary ]")
311
+ print(f" - {summary_result}")
312
+
313
+ print(f"\n[ Detected Aspect Sentiments ]")
314
+ if aspect_results:
315
+ for aspect, result in aspect_results.items():
316
+ print(f" - {aspect.title()}: {result['sentiment']} (Score: {result['score']:.2f})")
317
+ else:
318
+ print(" - No relevant aspects from the dictionary were detected in the review.")
319
+ print("----------------------------------------------------------")
320
+
321
+
322
+ if __name__ == "__main__":
323
+ main()
324
+ >>>>>>> e6de3c4338f79386345fa6e4bba5b0666ad808da
scripts/models.py ADDED
@@ -0,0 +1,256 @@
1
+ import pytorch_lightning as pl
2
+ from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup, AutoConfig
3
+ from torch.optim import AdamW
4
+ import torch
5
+ from torchmetrics.functional import accuracy
6
+ from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer, pipeline
7
+
8
+ class SentimentClassifier(pl.LightningModule):
9
+ """
10
+ PyTorch Lightning module for the sentiment classification model.
11
+ """
12
+ def __init__(self, model_name='distilbert-base-uncased', n_classes=2, learning_rate=2e-5, n_warmup_steps=0, n_training_steps=0, dropout_prob=0.2): # Added dropout
13
+ super().__init__()
14
+ self.save_hyperparameters()
15
+
16
+ # Configure dropout. DistilBERT configs name these fields 'dropout' and
+ # 'attention_dropout', while BERT-style configs use 'hidden_dropout_prob' and
+ # 'attention_probs_dropout_prob'; set whichever pair the config defines.
17
+ config = AutoConfig.from_pretrained(model_name)
18
+ if hasattr(config, 'hidden_dropout_prob'):
+ config.hidden_dropout_prob = dropout_prob
+ config.attention_probs_dropout_prob = dropout_prob
19
+ else:
+ config.dropout = dropout_prob
+ config.attention_dropout = dropout_prob
20
+ config.num_labels = n_classes
21
+
22
+ self.model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
23
+
24
+ def forward(self, input_ids, attention_mask, labels=None):
25
+ return self.model(
26
+ input_ids=input_ids,
27
+ attention_mask=attention_mask,
28
+ labels=labels
29
+ )
30
+
31
+ def training_step(self, batch, batch_idx):
32
+ output = self.forward(**batch)
33
+ self.log("train_loss", output.loss, prog_bar=True, logger=True)
34
+ return output.loss
35
+
36
+ def validation_step(self, batch, batch_idx):
37
+ output = self.forward(**batch)
38
+ preds = torch.argmax(output.logits, dim=1)
39
+ val_acc = accuracy(preds, batch['labels'], task='binary')
40
+ self.log("val_loss", output.loss, prog_bar=True, logger=True)
41
+ self.log("val_accuracy", val_acc, prog_bar=True, logger=True)
42
+ return output.loss
43
+
44
+ def test_step(self, batch, batch_idx):
45
+ output = self.forward(**batch)
46
+ preds = torch.argmax(output.logits, dim=1)
47
+ test_acc = accuracy(preds, batch['labels'], task='binary')
48
+ self.log("test_accuracy", test_acc)
49
+ return test_acc
50
+
51
+ def predict_step(self, batch, batch_idx, dataloader_idx=0):
52
+ output = self.forward(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
53
+ return torch.argmax(output.logits, dim=1)
54
+
55
+ def configure_optimizers(self):
56
+ optimizer = AdamW(self.parameters(), lr=self.hparams.learning_rate, weight_decay=0.01)
57
+ scheduler = get_linear_schedule_with_warmup(
58
+ optimizer,
59
+ num_warmup_steps=self.hparams.n_warmup_steps,
60
+ num_training_steps=self.hparams.n_training_steps
61
+ )
62
+ return dict(optimizer=optimizer, lr_scheduler=dict(scheduler=scheduler, interval='step'))
63
+
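+ # Sketch of the schedule configure_optimizers() builds (illustrative numbers):
+ # with n_warmup_steps=100 and n_training_steps=1000, the learning rate ramps
+ # linearly from 0 to learning_rate over steps 0-100, then decays linearly back
+ # to 0 by step 1000; interval='step' applies it after every optimizer step.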
64
+ class ReviewSummarizer:
65
+ """
66
+ A class to handle the summarization of product reviews using a pre-trained T5 model.
67
+ """
68
+ def __init__(self, model_name='t5-small', force_cpu=False):
69
+ """
70
+ Initializes the summarizer with a pre-trained T5 model and tokenizer.
71
+
72
+ Args:
73
+ model_name (str): The name of the pre-trained T5 model to use.
+ force_cpu (bool): If True, forces the model to run on the CPU.
74
+ """
75
+ print(f"Loading summarization model: {model_name}...")
76
+ self.model_name = model_name
77
+ self.device = 'cpu' if force_cpu else ('cuda' if torch.cuda.is_available() else 'cpu')
78
+
79
+ # Load the tokenizer and model from Hugging Face
80
+ self.tokenizer = T5Tokenizer.from_pretrained(self.model_name)
81
+ self.model = T5ForConditionalGeneration.from_pretrained(self.model_name).to(self.device)
82
+ print("Summarization model loaded successfully.")
83
+
84
+ def summarize(self, text: str, max_length: int = 50, min_length: int = 10) -> str:
85
+ """
86
+ Generates a summary for a given text.
87
+
88
+ Args:
89
+ text (str): The review text to summarize.
90
+ max_length (int): The maximum length of the generated summary.
91
+ min_length (int): The minimum length of the generated summary.
92
+
93
+ Returns:
94
+ str: The generated summary.
95
+ """
96
+ if not text or not isinstance(text, str):
97
+ return ""
98
+
99
+ # T5 models require a prefix for the task. For summarization, it's "summarize: "
100
+ preprocess_text = f"summarize: {text.strip()}"
101
+
102
+ # Tokenize the input text
103
+ tokenized_text = self.tokenizer.encode(preprocess_text, return_tensors="pt").to(self.device)
104
+
105
+ # Generate the summary
106
+ summary_ids = self.model.generate(
107
+ tokenized_text,
108
+ max_length=max_length,
109
+ min_length=min_length,
110
+ length_penalty=2.0,
111
+ num_beams=4,
112
+ early_stopping=True
113
+ )
114
+
115
+ # Decode the summary and return it
116
+ summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
117
+ return summary
118
+
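+ # Usage sketch (hypothetical review text; output wording will vary by model):
+ #   summarizer = ReviewSummarizer()
+ #   summarizer.summarize("The blender is powerful and quiet, but the lid leaks "
+ #                        "whenever I blend anything hot. Cleanup is easy though.")
+ #   # -> a short abstractive summary of roughly 10-50 tokens, per min/max_length.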
119
+ class AspectAnalyzer:
120
+ """
121
+ A class to handle Aspect-Based Sentiment Analysis (ABSA) using a pre-trained model.
122
+ """
123
+ # Changed to a different, currently valid lightweight model for ABSA.
124
+ def __init__(self, model_name='yangheng/deberta-v3-base-absa-v1.1', force_cpu=False):
125
+ """
126
+ Initializes the ABSA pipeline with a pre-trained model.
127
+
128
+ Args:
129
+ model_name (str): The name of the pre-trained ABSA model.
130
+ force_cpu (bool): If True, forces the model to run on the CPU.
131
+ """
132
+ print(f"Loading Aspect-Based Sentiment Analysis model: {model_name}...")
133
+ self.model_name = model_name
134
+
135
+ if force_cpu:
136
+ self.device = -1 # Use -1 for CPU in pipeline
137
+ print("Forcing ABSA model to run on CPU.")
138
+ else:
139
+ self.device = 0 if torch.cuda.is_available() else -1
140
+
141
+ print(f"Using device: {self.device} (0 for GPU, -1 for CPU)")
142
+
143
+ self.absa_pipeline = pipeline(
144
+ "text-classification",
145
+ model=self.model_name,
146
+ tokenizer=self.model_name,
147
+ device=self.device
148
+ )
149
+ print("ABSA model loaded successfully.")
150
+
151
+ def analyze(self, text: str, aspects: list) -> dict:
152
+ """
153
+ Analyzes the sentiment towards a list of aspects within a given text.
154
+ """
155
+ if not text or not isinstance(text, str) or not aspects:
156
+ return {}
157
+
158
+ # The model expects the review and aspect separated by a special token.
159
+ # Note: Different ABSA models might expect different input formats.
160
+ # This format is common but may need adjustment for other models.
161
+ inputs = [f"{text} [SEP] {aspect}" for aspect in aspects]
162
+ results = self.absa_pipeline(inputs)
163
+
164
+ # Process results into a user-friendly dictionary
165
+ aspect_sentiments = {}
166
+ for aspect, result in zip(aspects, results):
167
+ aspect_sentiments[aspect] = {'sentiment': result['label'], 'score': result['score']}
168
+
169
+ return aspect_sentiments
170
+
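+ # Shape of a call, using the "[SEP]" input format above (hypothetical scores):
+ #   analyzer = AspectAnalyzer(force_cpu=True)
+ #   analyzer.analyze("Battery is weak but the screen is gorgeous.", ["battery", "screen"])
+ #   # -> {'battery': {'sentiment': 'Negative', 'score': 0.9...},
+ #   #     'screen':  {'sentiment': 'Positive', 'score': 0.9...}}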
171
+ class FineTunedSentimentClassifier:
172
+ """
173
+ This class handles loading the fine-tuned checkpoint and making predictions.
174
+ """
175
+ def __init__(self, checkpoint_path, model_name='distilbert-base-uncased', force_cpu=False):
176
+ self.device = 'cpu' if force_cpu else ('cuda' if torch.cuda.is_available() else 'cpu')
177
+ print(f"Loading fine-tuned sentiment model from checkpoint: {checkpoint_path}...")
178
+ print(f"Using device: {self.device}")
179
+
180
+ self.model = SentimentClassifier.load_from_checkpoint(checkpoint_path, map_location=self.device)
181
+ self.model.to(self.device)
182
+ self.model.eval() # Set model to evaluation mode
183
+
184
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
185
+ self.labels = ['NEGATIVE', 'POSITIVE']
186
+ print("Fine-tuned sentiment model loaded successfully.")
187
+
188
+ def classify(self, text: str) -> dict:
189
+ encoding = self.tokenizer.encode_plus(
190
+ text, add_special_tokens=True, max_length=128,
191
+ return_token_type_ids=False, padding="max_length",
192
+ truncation=True, return_attention_mask=True, return_tensors='pt',
193
+ )
194
+ input_ids = encoding["input_ids"].to(self.device)
195
+ attention_mask = encoding["attention_mask"].to(self.device)
196
+ with torch.no_grad():
197
+ outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
198
+ logits = outputs.logits
199
+ probabilities = torch.softmax(logits, dim=1)
200
+ prediction_idx = torch.argmax(probabilities, dim=1).item()
201
+ return {'label': self.labels[prediction_idx], 'score': probabilities[0][prediction_idx].item()}
202
+
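+ # classify() returns the argmax class and its softmax probability, e.g.
+ # (hypothetical): {'label': 'POSITIVE', 'score': 0.97} for a clearly positive review.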
203
+ class AspectExtractor:
204
+ """
205
+ This class uses a Part-of-Speech (POS) tagging model to first extract all
206
+ potential aspect terms (nouns) from a review text. It then filters these
207
+ nouns against a pre-defined dictionary of valid aspects for a given
208
+ product category to return only the relevant features.
209
+ """
210
+ def __init__(self, model_name="vblagoje/bert-english-uncased-finetuned-pos", force_cpu=False):
211
+ self.model_name = model_name
212
+ self.device = 'cpu' if force_cpu else ('cuda' if torch.cuda.is_available() else 'cpu')
213
+ print(f"Loading Part-of-Speech (POS) tagging model: {self.model_name}...")
214
+ print(f"Using device: {self.device}")
215
+
216
+ self.pipeline = pipeline(
217
+ "token-classification",
218
+ model=self.model_name,
219
+ device=-1 if self.device == 'cpu' else 0,
220
+ aggregation_strategy="simple"
221
+ )
222
+ print("POS tagging model loaded successfully.")
223
+
224
+ def extract(self, text: str, aspect_dictionary: list) -> list:
225
+ """
226
+ Extracts aspects from the given text that are present in the provided
227
+ aspect dictionary.
228
+
229
+ Args:
230
+ text (str): The review text to analyze.
231
+ aspect_dictionary (list): A list of valid, known aspects for the
232
+ product category.
233
+
234
+ Returns:
235
+ list: A list of aspects that were both found in the text and are
236
+ present in the aspect dictionary.
237
+ """
238
+ if not text or not aspect_dictionary:
239
+ return []
240
+
241
+ # 1. Extract all nouns from the text using the POS model
242
+ model_outputs = self.pipeline(text)
243
+ noun_tags = {'NOUN', 'PROPN'}
244
+ extracted_nouns = {
245
+ output['word'].lower() for output in model_outputs
246
+ if output['entity_group'] in noun_tags
247
+ }
248
+
249
+ # 2. Filter the extracted nouns against the provided dictionary
250
+ # We find the intersection between the two sets.
251
+ valid_aspects = {aspect.lower() for aspect in aspect_dictionary}
252
+
253
+ final_aspects = list(extracted_nouns.intersection(valid_aspects))
254
+
255
+ return final_aspects
256
+
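+ # Usage sketch for AspectExtractor (hypothetical review text):
+ #   extractor = AspectExtractor(force_cpu=True)
+ #   extractor.extract("The camera is great but the battery dies fast.",
+ #                     ['camera', 'battery', 'screen'])
+ #   # -> ['camera', 'battery'] (a set intersection, so order is not guaranteed)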
scripts/train_distilbet.py ADDED
@@ -0,0 +1,101 @@
1
+ import pytorch_lightning as pl
2
+ from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
3
+ from pytorch_lightning.loggers import TensorBoardLogger
4
+ import torch
5
+ import seaborn as sns
6
+ import matplotlib.pyplot as plt
7
+ from sklearn.metrics import confusion_matrix
8
+ from data_prepare import ReviewDataModule, ReviewDataset
9
+ from models import SentimentClassifier
10
+
11
+ def train_sentiment_model(data_path='data/reviews_processed.csv', model_name='distilbert-base-uncased', n_epochs=5, sample_size: int = None):
12
+ """
13
+ Main function to train the sentiment analysis model on the Amazon Reviews dataset.
14
+
15
+ Args:
16
+ data_path (str): Path to the processed data file.
17
+ model_name (str): Name of the transformer model to use.
18
+ n_epochs (int): Maximum number of epochs for training.
19
+ sample_size (int, optional): The number of reviews to use for training.
20
+ If None, the full dataset is used.
21
+ """
22
+ # --- 1. Hyperparameters ---
23
+ BATCH_SIZE = 64
24
+ MAX_TOKEN_LEN = 256
25
+ LEARNING_RATE = 2e-5
26
+ N_CLASSES = 2 # Negative, Positive
27
+
28
+ # --- 2. Initialize DataModule ---
29
+ print("Initializing ReviewDataModule...")
30
+ review_datamodule = ReviewDataModule(
31
+ data_path=data_path,
32
+ batch_size=BATCH_SIZE,
33
+ max_token_len=MAX_TOKEN_LEN,
34
+ model_name=model_name,
35
+ sample_size=sample_size # Pass the sample size to the datamodule
36
+ )
37
+ review_datamodule.setup()
38
+
39
+ n_training_steps = len(review_datamodule.train_dataloader()) * n_epochs
40
+ n_warmup_steps = int(n_training_steps * 0.1)
41
+
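+ # Worked example of the step arithmetic above (assuming sample_size=100000):
+ # the DataModule's 90/10 and 90/10 splits leave 81,000 training rows; at
+ # BATCH_SIZE=64 that is ceil(81000 / 64) = 1266 steps per epoch, so
+ # n_training_steps = 1266 * 5 = 6330 and n_warmup_steps = int(6330 * 0.1) = 633.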
42
+ # --- 3. Initialize Model ---
43
+ print("Initializing SentimentClassifier model...")
44
+ model = SentimentClassifier(
45
+ model_name=model_name,
46
+ n_classes=N_CLASSES,
47
+ learning_rate=LEARNING_RATE,
48
+ n_warmup_steps=n_warmup_steps,
49
+ n_training_steps=n_training_steps
50
+ )
51
+
52
+ # --- 4. Configure Training Callbacks ---
53
+ checkpoint_callback = ModelCheckpoint(
54
+ dirpath="checkpoints",
55
+ filename="sentiment-binary-best-checkpoint",
56
+ save_top_k=1,
57
+ verbose=True,
58
+ monitor="val_loss",
59
+ mode="min"
60
+ )
61
+ logger = TensorBoardLogger("lightning_logs", name="sentiment-classifier-binary")
62
+ early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)
63
+
64
+ # --- 5. Initialize Trainer ---
65
+ print("Initializing PyTorch Lightning Trainer...")
66
+ trainer = pl.Trainer(
67
+ logger=logger,
68
+ callbacks=[checkpoint_callback, early_stopping_callback],
69
+ max_epochs=n_epochs,
70
+ accelerator='gpu' if torch.cuda.is_available() else 'cpu',
71
+ devices=1,
72
+ )
73
+
74
+ # --- 6. Start Training ---
75
+ print(f"Starting training with {model_name} for up to {n_epochs} epochs...")
76
+ trainer.fit(model, review_datamodule)
77
+
78
+ # --- 7. Evaluate on Test Set and Generate Confusion Matrix ---
79
+ print("\nTraining complete. Evaluating on the test set...")
80
+ trainer.test(model, datamodule=review_datamodule)
81
+
82
+ predictions = trainer.predict(model, dataloaders=review_datamodule.test_dataloader())  # the DataModule defines no predict_dataloader
83
+ if predictions:
84
+ all_preds = torch.cat(predictions).cpu().numpy()
85
+ true_labels = review_datamodule.test_df.sentiment.to_numpy()
86
+ target_names = ['Negative', 'Positive'] # Updated labels
87
+
88
+ cm = confusion_matrix(true_labels, all_preds)
89
+ plt.figure(figsize=(8, 6))
90
+ sns.heatmap(cm, annot=True, fmt='d', cmap='YlGnBu',
91
+ xticklabels=target_names, yticklabels=target_names)
92
+ plt.title('Confusion Matrix for Sentiment Analysis')
93
+ plt.xlabel('Predicted Label')
94
+ plt.ylabel('True Label')
95
+ plt.show()
96
+
97
+
98
+
99
+ if __name__ == "__main__":
100
+ data_path = "data/reviews_processed.csv"
101
+ train_sentiment_model(data_path=data_path, sample_size=100000)
scripts/train_naive_bayes.py ADDED
@@ -0,0 +1,118 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.model_selection import train_test_split, ParameterGrid
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.naive_bayes import MultinomialNB
6
+ from sklearn.pipeline import Pipeline
7
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
8
+ import seaborn as sns
9
+ import matplotlib.pyplot as plt
10
+ from tqdm import tqdm  # plain tqdm: this runs as a script, not in a notebook
11
+ import os
12
+
13
+ def train_baseline_sentiment_model(data_path='data/reviews_processed.csv', grid_search=True, nb__alpha=0.1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2), sample_size: int = 50000):
14
+ """
15
+ Trains and evaluates a Multinomial Naive Bayes model for sentiment analysis.
16
+ Can optionally perform a grid search.
17
+
18
+ Args:
19
+ data_path (str): Path to the processed reviews CSV file.
20
+ grid_search (bool): If True, performs a grid search.
21
+ nb__alpha (float): Alpha for MultinomialNB.
22
+ tfidf__max_df (float): max_df for TfidfVectorizer.
23
+ tfidf__ngram_range (tuple): ngram_range for TfidfVectorizer.
24
+ sample_size (int, optional): Number of reviews to use. If None, uses all.
25
+ """
26
+ # --- 1. Load Data ---
27
+ print(f"Loading data from '{data_path}'...")
28
+ if not os.path.exists(data_path):
29
+ print(f"\nERROR: '{data_path}' not found. Please run the EDA script first!")
30
+ return
31
+
32
+ df = pd.read_csv(data_path)
33
+ df.dropna(inplace=True)
34
+
35
+ # --- 2. Sample Data ---
36
+ if sample_size:
37
+ print(f"Using a sample of {sample_size} reviews for training the baseline model.")
38
+ df = df.sample(n=sample_size, random_state=42)
39
+
40
+ # --- 3. Train-Test Split ---
41
+ print("Splitting data into training and testing sets...")
42
+ X_train, X_test, y_train, y_test = train_test_split(
43
+ df['full_text'],
44
+ df['sentiment'],
45
+ test_size=0.2,
46
+ random_state=42,
47
+ stratify=df['sentiment']
48
+ )
49
+
50
+ # --- 4. Create a Pipeline ---
51
+ pipeline = Pipeline([
52
+ ('tfidf', TfidfVectorizer(stop_words='english')),
53
+ ('nb', MultinomialNB()),
54
+ ])
55
+
56
+ best_params = None
57
+
58
+ if grid_search:
59
+ # --- 5a. Perform Grid Search ---
60
+ print("Performing Grid Search to find the best hyperparameters...")
61
+ parameters = {
62
+ 'tfidf__ngram_range': [(1, 1), (1, 2)],
63
+ 'tfidf__max_df': [0.5, 0.75, 1.0],
64
+ 'nb__alpha': [0.1, 0.5, 1.0],
65
+ }
66
+ param_grid = list(ParameterGrid(parameters))
67
+ best_score = -1
68
+
69
+ for params in tqdm(param_grid, desc="Grid Search Progress"):
70
+ pipeline.set_params(**params)
71
+ pipeline.fit(X_train, y_train)
72
+ score = pipeline.score(X_test, y_test)
73
+ if score > best_score:
74
+ best_score = score
75
+ best_params = params
76
+
77
+ print(f"\nBest score on test set: {best_score:.4f}")
78
+ print("Best parameters found:")
79
+ print(best_params)
80
+
81
+ else:
82
+ # --- 5b. Use provided hyperparameters ---
83
+ print("Skipping grid search and using provided hyperparameters...")
84
+ best_params = {
85
+ 'nb__alpha': nb__alpha,
86
+ 'tfidf__max_df': tfidf__max_df,
87
+ 'tfidf__ngram_range': tfidf__ngram_range
88
+ }
89
+
90
+ # --- 6. Train the Final Model ---
91
+ print("\nTraining final model...")
92
+ best_model = pipeline.set_params(**best_params)
93
+ best_model.fit(X_train, y_train)
94
+ print("Model training complete.")
95
+
96
+ # --- 7. Evaluate the Best Model ---
97
+ print("\n--- Model Evaluation ---")
98
+ y_pred = best_model.predict(X_test)
99
+
100
+ accuracy = accuracy_score(y_test, y_pred)
101
+ target_names = ['Negative', 'Positive']
102
+
103
+ print(f"Accuracy: {accuracy:.4f}")
104
+ print("\nClassification Report:")
105
+ print(classification_report(y_test, y_pred, target_names=target_names))
106
+
107
+ print("Confusion Matrix:")
108
+ cm = confusion_matrix(y_test, y_pred)
109
+ plt.figure(figsize=(8, 6))
110
+ sns.heatmap(cm, annot=True, fmt='d', cmap='Greens',
111
+ xticklabels=target_names, yticklabels=target_names)
112
+ plt.title('Confusion Matrix for Naive Bayes on Amazon Reviews')
113
+ plt.xlabel('Predicted Label')
114
+ plt.ylabel('True Label')
115
+ plt.show()
116
+
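+ # Caveat: the manual loop above selects hyperparameters by scoring on the test
+ # set, which leaks test data into model selection. A cross-validated sketch of
+ # the same search (assumes the pipeline and parameter grid defined above):
+ #   from sklearn.model_selection import GridSearchCV
+ #   search = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1)
+ #   search.fit(X_train, y_train)
+ #   best_params = search.best_params_   # then evaluate once on X_test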
117
+ if __name__ == "__main__":
118
+ train_baseline_sentiment_model(sample_size=150000, grid_search=False)