elsayedelmandoh commited on
Commit
77ab35f
·
1 Parent(s): 9ac3023
notebooks/02_eda.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
notebooks/03_data_preprocessing.ipynb CHANGED
@@ -89,8 +89,8 @@
89
  " <th>review_target</th>\n",
90
  " <th>review_title</th>\n",
91
  " <th>review_content</th>\n",
92
- " <th>char_count</th>\n",
93
- " <th>word_count</th>\n",
94
  " </tr>\n",
95
  " </thead>\n",
96
  " <tbody>\n",
@@ -146,12 +146,19 @@
146
  "3 1 Equus 3340 \n",
147
  "4 2 awesome sheets! \n",
148
  "\n",
149
- " review_content char_count word_count \n",
150
- "0 I HAVE HAD THE DX6340 FOR ABOUT A YEAR.I LOVE ... 586 108 \n",
151
- "1 I'm using this book in an introductory organic... 570 88 \n",
152
- "2 I only read the first few chapters and was bom... 214 40 \n",
153
- "3 Feels cheaply made, the battery contacts were ... 193 34 \n",
154
- "4 I love these sheets! They are sleek & smooth w... 198 38 "
 
 
 
 
 
 
 
155
  ]
156
  },
157
  "execution_count": 3,
@@ -176,13 +183,13 @@
176
  "<class 'pandas.DataFrame'>\n",
177
  "RangeIndex: 79972 entries, 0 to 79971\n",
178
  "Data columns (total 5 columns):\n",
179
- " # Column Non-Null Count Dtype\n",
180
- "--- ------ -------------- -----\n",
181
- " 0 review_target 79972 non-null str \n",
182
- " 1 review_title 79972 non-null str \n",
183
- " 2 review_content 79972 non-null str \n",
184
- " 3 char_count 79972 non-null str \n",
185
- " 4 word_count 79972 non-null str \n",
186
  "dtypes: str(5)\n",
187
  "memory usage: 3.1 MB\n"
188
  ]
@@ -230,9 +237,9 @@
230
  " <th>review_target</th>\n",
231
  " <th>review_title</th>\n",
232
  " <th>review_content</th>\n",
233
- " <th>char_count</th>\n",
234
- " <th>word_count</th>\n",
235
- " <th>review_cleaned</th>\n",
236
  " </tr>\n",
237
  " </thead>\n",
238
  " <tbody>\n",
@@ -293,14 +300,21 @@
293
  "3 1 Equus 3340 \n",
294
  "4 2 awesome sheets! \n",
295
  "\n",
296
- " review_content char_count word_count \\\n",
297
- "0 I HAVE HAD THE DX6340 FOR ABOUT A YEAR.I LOVE ... 586 108 \n",
298
- "1 I'm using this book in an introductory organic... 570 88 \n",
299
- "2 I only read the first few chapters and was bom... 214 40 \n",
300
- "3 Feels cheaply made, the battery contacts were ... 193 34 \n",
301
- "4 I love these sheets! They are sleek & smooth w... 198 38 \n",
 
 
 
 
 
 
 
302
  "\n",
303
- " review_cleaned \n",
304
  "0 dx6340 year love picture good 35m easy use unl... \n",
305
  "1 using book introductory organic spectroscopy c... \n",
306
  "2 read first chapter bombarded reference 199 end... \n",
@@ -315,7 +329,138 @@
315
  ],
316
  "source": [
317
  "processed_train = balanced_sample_train.copy()\n",
318
- "processed_train['review_cleaned'] = clean_text(processed_train['review_content'])\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
  "processed_train.head()"
320
  ]
321
  },
@@ -329,7 +474,7 @@
329
  },
330
  {
331
  "cell_type": "code",
332
- "execution_count": 6,
333
  "id": "2c4e029b",
334
  "metadata": {},
335
  "outputs": [
@@ -346,7 +491,7 @@
346
  "{'csv': PosixPath('data/processed/processed_train.csv')}"
347
  ]
348
  },
349
- "execution_count": 6,
350
  "metadata": {},
351
  "output_type": "execute_result"
352
  }
 
89
  " <th>review_target</th>\n",
90
  " <th>review_title</th>\n",
91
  " <th>review_content</th>\n",
92
+ " <th>review_content_char_count</th>\n",
93
+ " <th>review_content_word_count</th>\n",
94
  " </tr>\n",
95
  " </thead>\n",
96
  " <tbody>\n",
 
146
  "3 1 Equus 3340 \n",
147
  "4 2 awesome sheets! \n",
148
  "\n",
149
+ " review_content \\\n",
150
+ "0 I HAVE HAD THE DX6340 FOR ABOUT A YEAR.I LOVE ... \n",
151
+ "1 I'm using this book in an introductory organic... \n",
152
+ "2 I only read the first few chapters and was bom... \n",
153
+ "3 Feels cheaply made, the battery contacts were ... \n",
154
+ "4 I love these sheets! They are sleek & smooth w... \n",
155
+ "\n",
156
+ " review_content_char_count review_content_word_count \n",
157
+ "0 586 108 \n",
158
+ "1 570 88 \n",
159
+ "2 214 40 \n",
160
+ "3 193 34 \n",
161
+ "4 198 38 "
162
  ]
163
  },
164
  "execution_count": 3,
 
183
  "<class 'pandas.DataFrame'>\n",
184
  "RangeIndex: 79972 entries, 0 to 79971\n",
185
  "Data columns (total 5 columns):\n",
186
+ " # Column Non-Null Count Dtype\n",
187
+ "--- ------ -------------- -----\n",
188
+ " 0 review_target 79972 non-null str \n",
189
+ " 1 review_title 79972 non-null str \n",
190
+ " 2 review_content 79972 non-null str \n",
191
+ " 3 review_content_char_count 79972 non-null str \n",
192
+ " 4 review_content_word_count 79972 non-null str \n",
193
  "dtypes: str(5)\n",
194
  "memory usage: 3.1 MB\n"
195
  ]
 
237
  " <th>review_target</th>\n",
238
  " <th>review_title</th>\n",
239
  " <th>review_content</th>\n",
240
+ " <th>review_content_char_count</th>\n",
241
+ " <th>review_content_word_count</th>\n",
242
+ " <th>review_content_cleaned</th>\n",
243
  " </tr>\n",
244
  " </thead>\n",
245
  " <tbody>\n",
 
300
  "3 1 Equus 3340 \n",
301
  "4 2 awesome sheets! \n",
302
  "\n",
303
+ " review_content \\\n",
304
+ "0 I HAVE HAD THE DX6340 FOR ABOUT A YEAR.I LOVE ... \n",
305
+ "1 I'm using this book in an introductory organic... \n",
306
+ "2 I only read the first few chapters and was bom... \n",
307
+ "3 Feels cheaply made, the battery contacts were ... \n",
308
+ "4 I love these sheets! They are sleek & smooth w... \n",
309
+ "\n",
310
+ " review_content_char_count review_content_word_count \\\n",
311
+ "0 586 108 \n",
312
+ "1 570 88 \n",
313
+ "2 214 40 \n",
314
+ "3 193 34 \n",
315
+ "4 198 38 \n",
316
  "\n",
317
+ " review_content_cleaned \n",
318
  "0 dx6340 year love picture good 35m easy use unl... \n",
319
  "1 using book introductory organic spectroscopy c... \n",
320
  "2 read first chapter bombarded reference 199 end... \n",
 
329
  ],
330
  "source": [
331
  "processed_train = balanced_sample_train.copy()\n",
332
+ "processed_train['review_content_cleaned'] = clean_text(processed_train['review_content'])\n",
333
+ "processed_train.head()"
334
+ ]
335
+ },
336
+ {
337
+ "cell_type": "code",
338
+ "execution_count": 6,
339
+ "id": "b251cec2",
340
+ "metadata": {},
341
+ "outputs": [
342
+ {
343
+ "data": {
344
+ "text/html": [
345
+ "<div>\n",
346
+ "<style scoped>\n",
347
+ " .dataframe tbody tr th:only-of-type {\n",
348
+ " vertical-align: middle;\n",
349
+ " }\n",
350
+ "\n",
351
+ " .dataframe tbody tr th {\n",
352
+ " vertical-align: top;\n",
353
+ " }\n",
354
+ "\n",
355
+ " .dataframe thead th {\n",
356
+ " text-align: right;\n",
357
+ " }\n",
358
+ "</style>\n",
359
+ "<table border=\"1\" class=\"dataframe\">\n",
360
+ " <thead>\n",
361
+ " <tr style=\"text-align: right;\">\n",
362
+ " <th></th>\n",
363
+ " <th>review_target</th>\n",
364
+ " <th>review_title</th>\n",
365
+ " <th>review_content</th>\n",
366
+ " <th>review_content_char_count</th>\n",
367
+ " <th>review_content_word_count</th>\n",
368
+ " <th>review_content_cleaned</th>\n",
369
+ " <th>review_title_cleaned</th>\n",
370
+ " </tr>\n",
371
+ " </thead>\n",
372
+ " <tbody>\n",
373
+ " <tr>\n",
374
+ " <th>0</th>\n",
375
+ " <td>2</td>\n",
376
+ " <td>GREAT CAMRA</td>\n",
377
+ " <td>I HAVE HAD THE DX6340 FOR ABOUT A YEAR.I LOVE ...</td>\n",
378
+ " <td>586</td>\n",
379
+ " <td>108</td>\n",
380
+ " <td>dx6340 year love picture good 35m easy use unl...</td>\n",
381
+ " <td>great camra</td>\n",
382
+ " </tr>\n",
383
+ " <tr>\n",
384
+ " <th>1</th>\n",
385
+ " <td>1</td>\n",
386
+ " <td>not so great</td>\n",
387
+ " <td>I'm using this book in an introductory organic...</td>\n",
388
+ " <td>570</td>\n",
389
+ " <td>88</td>\n",
390
+ " <td>using book introductory organic spectroscopy c...</td>\n",
391
+ " <td>not great</td>\n",
392
+ " </tr>\n",
393
+ " <tr>\n",
394
+ " <th>2</th>\n",
395
+ " <td>1</td>\n",
396
+ " <td>Inaccurate and disappointing</td>\n",
397
+ " <td>I only read the first few chapters and was bom...</td>\n",
398
+ " <td>214</td>\n",
399
+ " <td>40</td>\n",
400
+ " <td>read first chapter bombarded reference 199 end...</td>\n",
401
+ " <td>inaccurate disappointing</td>\n",
402
+ " </tr>\n",
403
+ " <tr>\n",
404
+ " <th>3</th>\n",
405
+ " <td>1</td>\n",
406
+ " <td>Equus 3340</td>\n",
407
+ " <td>Feels cheaply made, the battery contacts were ...</td>\n",
408
+ " <td>193</td>\n",
409
+ " <td>34</td>\n",
410
+ " <td>feel cheaply made battery contact rusted soon ...</td>\n",
411
+ " <td>equus 3340</td>\n",
412
+ " </tr>\n",
413
+ " <tr>\n",
414
+ " <th>4</th>\n",
415
+ " <td>2</td>\n",
416
+ " <td>awesome sheets!</td>\n",
417
+ " <td>I love these sheets! They are sleek &amp; smooth w...</td>\n",
418
+ " <td>198</td>\n",
419
+ " <td>38</td>\n",
420
+ " <td>love sheet sleek smooth really cool feel perfe...</td>\n",
421
+ " <td>awesome sheet</td>\n",
422
+ " </tr>\n",
423
+ " </tbody>\n",
424
+ "</table>\n",
425
+ "</div>"
426
+ ],
427
+ "text/plain": [
428
+ " review_target review_title \\\n",
429
+ "0 2 GREAT CAMRA \n",
430
+ "1 1 not so great \n",
431
+ "2 1 Inaccurate and disappointing \n",
432
+ "3 1 Equus 3340 \n",
433
+ "4 2 awesome sheets! \n",
434
+ "\n",
435
+ " review_content \\\n",
436
+ "0 I HAVE HAD THE DX6340 FOR ABOUT A YEAR.I LOVE ... \n",
437
+ "1 I'm using this book in an introductory organic... \n",
438
+ "2 I only read the first few chapters and was bom... \n",
439
+ "3 Feels cheaply made, the battery contacts were ... \n",
440
+ "4 I love these sheets! They are sleek & smooth w... \n",
441
+ "\n",
442
+ " review_content_char_count review_content_word_count \\\n",
443
+ "0 586 108 \n",
444
+ "1 570 88 \n",
445
+ "2 214 40 \n",
446
+ "3 193 34 \n",
447
+ "4 198 38 \n",
448
+ "\n",
449
+ " review_content_cleaned review_title_cleaned \n",
450
+ "0 dx6340 year love picture good 35m easy use unl... great camra \n",
451
+ "1 using book introductory organic spectroscopy c... not great \n",
452
+ "2 read first chapter bombarded reference 199 end... inaccurate disappointing \n",
453
+ "3 feel cheaply made battery contact rusted soon ... equus 3340 \n",
454
+ "4 love sheet sleek smooth really cool feel perfe... awesome sheet "
455
+ ]
456
+ },
457
+ "execution_count": 6,
458
+ "metadata": {},
459
+ "output_type": "execute_result"
460
+ }
461
+ ],
462
+ "source": [
463
+ "processed_train['review_title_cleaned'] = clean_text(processed_train['review_title'])\n",
464
  "processed_train.head()"
465
  ]
466
  },
 
474
  },
475
  {
476
  "cell_type": "code",
477
+ "execution_count": 7,
478
  "id": "2c4e029b",
479
  "metadata": {},
480
  "outputs": [
 
491
  "{'csv': PosixPath('data/processed/processed_train.csv')}"
492
  ]
493
  },
494
+ "execution_count": 7,
495
  "metadata": {},
496
  "output_type": "execute_result"
497
  }
notebooks/04_feature_engineering.ipynb CHANGED
@@ -92,9 +92,10 @@
92
  " <th>review_target</th>\n",
93
  " <th>review_title</th>\n",
94
  " <th>review_content</th>\n",
95
- " <th>char_count</th>\n",
96
- " <th>word_count</th>\n",
97
- " <th>review_cleaned</th>\n",
 
98
  " </tr>\n",
99
  " </thead>\n",
100
  " <tbody>\n",
@@ -106,6 +107,7 @@
106
  " <td>586</td>\n",
107
  " <td>108</td>\n",
108
  " <td>dx6340 year love picture good 35m easy use unl...</td>\n",
 
109
  " </tr>\n",
110
  " <tr>\n",
111
  " <th>1</th>\n",
@@ -115,6 +117,7 @@
115
  " <td>570</td>\n",
116
  " <td>88</td>\n",
117
  " <td>using book introductory organic spectroscopy c...</td>\n",
 
118
  " </tr>\n",
119
  " <tr>\n",
120
  " <th>2</th>\n",
@@ -124,6 +127,7 @@
124
  " <td>214</td>\n",
125
  " <td>40</td>\n",
126
  " <td>read first chapter bombarded reference 199 end...</td>\n",
 
127
  " </tr>\n",
128
  " <tr>\n",
129
  " <th>3</th>\n",
@@ -133,6 +137,7 @@
133
  " <td>193</td>\n",
134
  " <td>34</td>\n",
135
  " <td>feel cheaply made battery contact rusted soon ...</td>\n",
 
136
  " </tr>\n",
137
  " <tr>\n",
138
  " <th>4</th>\n",
@@ -142,6 +147,7 @@
142
  " <td>198</td>\n",
143
  " <td>38</td>\n",
144
  " <td>love sheet sleek smooth really cool feel perfe...</td>\n",
 
145
  " </tr>\n",
146
  " </tbody>\n",
147
  "</table>\n",
@@ -155,19 +161,26 @@
155
  "3 1 Equus 3340 \n",
156
  "4 2 awesome sheets! \n",
157
  "\n",
158
- " review_content char_count word_count \\\n",
159
- "0 I HAVE HAD THE DX6340 FOR ABOUT A YEAR.I LOVE ... 586 108 \n",
160
- "1 I'm using this book in an introductory organic... 570 88 \n",
161
- "2 I only read the first few chapters and was bom... 214 40 \n",
162
- "3 Feels cheaply made, the battery contacts were ... 193 34 \n",
163
- "4 I love these sheets! They are sleek & smooth w... 198 38 \n",
164
  "\n",
165
- " review_cleaned \n",
166
- "0 dx6340 year love picture good 35m easy use unl... \n",
167
- "1 using book introductory organic spectroscopy c... \n",
168
- "2 read first chapter bombarded reference 199 end... \n",
169
- "3 feel cheaply made battery contact rusted soon ... \n",
170
- "4 love sheet sleek smooth really cool feel perfe... "
 
 
 
 
 
 
 
171
  ]
172
  },
173
  "execution_count": 3,
@@ -192,17 +205,18 @@
192
  "text": [
193
  "<class 'pandas.DataFrame'>\n",
194
  "RangeIndex: 79972 entries, 0 to 79971\n",
195
- "Data columns (total 6 columns):\n",
196
- " # Column Non-Null Count Dtype\n",
197
- "--- ------ -------------- -----\n",
198
- " 0 review_target 79972 non-null str \n",
199
- " 1 review_title 79972 non-null str \n",
200
- " 2 review_content 79972 non-null str \n",
201
- " 3 char_count 79972 non-null str \n",
202
- " 4 word_count 79972 non-null str \n",
203
- " 5 review_cleaned 79972 non-null str \n",
204
- "dtypes: str(6)\n",
205
- "memory usage: 3.7 MB\n"
 
206
  ]
207
  }
208
  ],
@@ -321,7 +335,7 @@
321
  }
322
  ],
323
  "source": [
324
- "show_top_ngrams_by_class(feat_eng_train, 'review_target', 'review_cleaned', ngram_ranges=(1, 1), top_k=15)\n"
325
  ]
326
  },
327
  {
@@ -411,7 +425,7 @@
411
  }
412
  ],
413
  "source": [
414
- "show_top_ngrams_by_class(feat_eng_train, 'review_target', 'review_cleaned', ngram_ranges=(2, 2), top_k=12)\n"
415
  ]
416
  },
417
  {
@@ -501,7 +515,7 @@
501
  }
502
  ],
503
  "source": [
504
- "show_top_ngrams_by_class(feat_eng_train, 'review_target', 'review_cleaned', ngram_ranges=(3, 3), top_k=12)\n"
505
  ]
506
  },
507
  {
@@ -542,15 +556,16 @@
542
  " <th>review_target</th>\n",
543
  " <th>review_title</th>\n",
544
  " <th>review_content</th>\n",
545
- " <th>char_count</th>\n",
546
- " <th>word_count</th>\n",
547
- " <th>review_cleaned</th>\n",
548
- " <th>exclamation_count</th>\n",
549
- " <th>question_count</th>\n",
550
- " <th>punctuation_count</th>\n",
551
- " <th>avg_word_length</th>\n",
552
- " <th>uppercase_count</th>\n",
553
- " <th>uppercase_ratio</th>\n",
 
554
  " </tr>\n",
555
  " </thead>\n",
556
  " <tbody>\n",
@@ -562,6 +577,7 @@
562
  " <td>586</td>\n",
563
  " <td>108</td>\n",
564
  " <td>dx6340 year love picture good 35m easy use unl...</td>\n",
 
565
  " <td>0</td>\n",
566
  " <td>0</td>\n",
567
  " <td>14</td>\n",
@@ -577,6 +593,7 @@
577
  " <td>570</td>\n",
578
  " <td>88</td>\n",
579
  " <td>using book introductory organic spectroscopy c...</td>\n",
 
580
  " <td>0</td>\n",
581
  " <td>0</td>\n",
582
  " <td>12</td>\n",
@@ -592,6 +609,7 @@
592
  " <td>214</td>\n",
593
  " <td>40</td>\n",
594
  " <td>read first chapter bombarded reference 199 end...</td>\n",
 
595
  " <td>0</td>\n",
596
  " <td>0</td>\n",
597
  " <td>5</td>\n",
@@ -607,6 +625,7 @@
607
  " <td>193</td>\n",
608
  " <td>34</td>\n",
609
  " <td>feel cheaply made battery contact rusted soon ...</td>\n",
 
610
  " <td>0</td>\n",
611
  " <td>0</td>\n",
612
  " <td>2</td>\n",
@@ -622,6 +641,7 @@
622
  " <td>198</td>\n",
623
  " <td>38</td>\n",
624
  " <td>love sheet sleek smooth really cool feel perfe...</td>\n",
 
625
  " <td>1</td>\n",
626
  " <td>0</td>\n",
627
  " <td>9</td>\n",
@@ -643,6 +663,7 @@
643
  " <td>...</td>\n",
644
  " <td>...</td>\n",
645
  " <td>...</td>\n",
 
646
  " </tr>\n",
647
  " <tr>\n",
648
  " <th>79967</th>\n",
@@ -652,6 +673,7 @@
652
  " <td>694</td>\n",
653
  " <td>120</td>\n",
654
  " <td>not mistake book quality adult fiction spoonfu...</td>\n",
 
655
  " <td>0</td>\n",
656
  " <td>0</td>\n",
657
  " <td>21</td>\n",
@@ -667,6 +689,7 @@
667
  " <td>518</td>\n",
668
  " <td>98</td>\n",
669
  " <td>grand theft auto very fun like others played b...</td>\n",
 
670
  " <td>0</td>\n",
671
  " <td>0</td>\n",
672
  " <td>11</td>\n",
@@ -682,6 +705,7 @@
682
  " <td>160</td>\n",
683
  " <td>26</td>\n",
684
  " <td>easy fun addictive soundtrack like nothing hea...</td>\n",
 
685
  " <td>1</td>\n",
686
  " <td>0</td>\n",
687
  " <td>9</td>\n",
@@ -697,6 +721,7 @@
697
  " <td>142</td>\n",
698
  " <td>28</td>\n",
699
  " <td>not receive supco part prong different wire no...</td>\n",
 
700
  " <td>0</td>\n",
701
  " <td>0</td>\n",
702
  " <td>3</td>\n",
@@ -712,6 +737,7 @@
712
  " <td>915</td>\n",
713
  " <td>165</td>\n",
714
  " <td>wish book would followed book like stardust br...</td>\n",
 
715
  " <td>0</td>\n",
716
  " <td>10</td>\n",
717
  " <td>17</td>\n",
@@ -721,7 +747,7 @@
721
  " </tr>\n",
722
  " </tbody>\n",
723
  "</table>\n",
724
- "<p>79972 rows × 12 columns</p>\n",
725
  "</div>"
726
  ],
727
  "text/plain": [
@@ -738,59 +764,98 @@
738
  "79970 1 Part was not what ordered \n",
739
  "79971 1 AAAHHH! \n",
740
  "\n",
741
- " review_content char_count \\\n",
742
- "0 I HAVE HAD THE DX6340 FOR ABOUT A YEAR.I LOVE ... 586 \n",
743
- "1 I'm using this book in an introductory organic... 570 \n",
744
- "2 I only read the first few chapters and was bom... 214 \n",
745
- "3 Feels cheaply made, the battery contacts were ... 193 \n",
746
- "4 I love these sheets! They are sleek & smooth w... 198 \n",
747
- "... ... ... \n",
748
- "79967 Do not mistake this book for quality adult fic... 694 \n",
749
- "79968 this grand theft auto is very fun like all the... 518 \n",
750
- "79969 Easy, fun, and addictive.And the soundtrack is... 160 \n",
751
- "79970 You will not receive Supco part. The prongs ar... 142 \n",
752
- "79971 I wish this book would have followed through w... 915 \n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
753
  "\n",
754
- " word_count review_cleaned \\\n",
755
- "0 108 dx6340 year love picture good 35m easy use unl... \n",
756
- "1 88 using book introductory organic spectroscopy c... \n",
757
- "2 40 read first chapter bombarded reference 199 end... \n",
758
- "3 34 feel cheaply made battery contact rusted soon ... \n",
759
- "4 38 love sheet sleek smooth really cool feel perfe... \n",
760
- "... ... ... \n",
761
- "79967 120 not mistake book quality adult fiction spoonfu... \n",
762
- "79968 98 grand theft auto very fun like others played b... \n",
763
- "79969 26 easy fun addictive soundtrack like nothing hea... \n",
764
- "79970 28 not receive supco part prong different wire no... \n",
765
- "79971 165 wish book would followed book like stardust br... \n",
766
  "\n",
767
- " exclamation_count question_count punctuation_count avg_word_length \\\n",
768
- "0 0 0 14 4.435185 \n",
769
- "1 0 0 12 5.488636 \n",
770
- "2 0 0 5 4.375000 \n",
771
- "3 0 0 2 4.705882 \n",
772
- "4 1 0 9 4.236842 \n",
773
- "... ... ... ... ... \n",
774
- "79967 0 0 21 4.791667 \n",
775
- "79968 0 0 11 4.295918 \n",
776
- "79969 1 0 9 5.192308 \n",
777
- "79970 0 0 3 4.107143 \n",
778
- "79971 0 10 17 4.551515 \n",
779
  "\n",
780
- " uppercase_count uppercase_ratio \n",
781
- "0 455 0.776451 \n",
782
- "1 4 0.007018 \n",
783
- "2 3 0.014019 \n",
784
- "3 2 0.010363 \n",
785
- "4 4 0.020202 \n",
786
- "... ... ... \n",
787
- "79967 16 0.023055 \n",
788
- "79968 0 0.000000 \n",
789
- "79969 4 0.025000 \n",
790
- "79970 4 0.028169 \n",
791
- "79971 29 0.031694 \n",
792
  "\n",
793
- "[79972 rows x 12 columns]"
 
 
 
 
 
 
 
 
 
 
 
 
 
794
  ]
795
  },
796
  "execution_count": 9,
@@ -803,6 +868,471 @@
803
  "feat_eng_train"
804
  ]
805
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
806
  {
807
  "cell_type": "markdown",
808
  "id": "a0afd7c0",
@@ -815,7 +1345,7 @@
815
  },
816
  {
817
  "cell_type": "code",
818
- "execution_count": null,
819
  "id": "8757f67a",
820
  "metadata": {},
821
  "outputs": [
@@ -824,20 +1354,27 @@
824
  "output_type": "stream",
825
  "text": [
826
  "Correlations between meta-features and target:\n",
827
- "exclamation_count 0.027933\n",
828
- "avg_word_length 0.013098\n",
829
- "uppercase_ratio -0.000912\n",
830
- "uppercase_count -0.019469\n",
831
- "char_count -0.067318\n",
832
- "punctuation_count -0.069328\n",
833
- "word_count -0.071107\n",
834
- "question_count -0.090043\n",
 
 
 
 
 
 
 
835
  "dtype: float64\n"
836
  ]
837
  }
838
  ],
839
  "source": [
840
- "meta_cols= ['char_count', 'word_count', 'exclamation_count', 'question_count', 'punctuation_count', 'avg_word_length', 'uppercase_count', 'uppercase_ratio']\n",
841
  "try:\n",
842
  " target_numeric = pd.to_numeric(feat_eng_train['review_target'], errors='coerce')\n",
843
  " if target_numeric.isna().sum() > len(target_numeric) * 0.5:\n",
@@ -881,7 +1418,7 @@
881
  },
882
  {
883
  "cell_type": "code",
884
- "execution_count": 11,
885
  "id": "a67f9b1c",
886
  "metadata": {},
887
  "outputs": [
@@ -895,13 +1432,13 @@
895
  ],
896
  "source": [
897
  "tfidf_train = TfidfVectorizer(ngram_range=(1,2), max_features=20000, stop_words='english')\n",
898
- "tfidf_train = tfidf_train.fit_transform(feat_eng_train['review_cleaned'].astype(str))\n",
899
  "print('TF-IDF shape:', tfidf_train.shape)"
900
  ]
901
  },
902
  {
903
  "cell_type": "code",
904
- "execution_count": 12,
905
  "id": "78552554",
906
  "metadata": {},
907
  "outputs": [
@@ -927,7 +1464,7 @@
927
  " [ 0.06429021, -0.04825038]])"
928
  ]
929
  },
930
- "execution_count": 12,
931
  "metadata": {},
932
  "output_type": "execute_result"
933
  }
@@ -938,7 +1475,7 @@
938
  },
939
  {
940
  "cell_type": "code",
941
- "execution_count": 13,
942
  "id": "a9ab3966",
943
  "metadata": {},
944
  "outputs": [
@@ -964,7 +1501,7 @@
964
  " [-13.600362 , -14.125854 ]], dtype=float32)"
965
  ]
966
  },
967
- "execution_count": 13,
968
  "metadata": {},
969
  "output_type": "execute_result"
970
  }
@@ -983,7 +1520,7 @@
983
  },
984
  {
985
  "cell_type": "code",
986
- "execution_count": 16,
987
  "id": "90061617",
988
  "metadata": {},
989
  "outputs": [
@@ -1000,7 +1537,7 @@
1000
  "{'csv': PosixPath('data/processed/feat_eng_train.csv')}"
1001
  ]
1002
  },
1003
- "execution_count": 16,
1004
  "metadata": {},
1005
  "output_type": "execute_result"
1006
  }
@@ -1019,7 +1556,7 @@
1019
  },
1020
  {
1021
  "cell_type": "code",
1022
- "execution_count": 15,
1023
  "id": "2481b653",
1024
  "metadata": {},
1025
  "outputs": [
@@ -1036,7 +1573,7 @@
1036
  "{'vectorizer': PosixPath('data/vectorizers/tfidf_train.joblib')}"
1037
  ]
1038
  },
1039
- "execution_count": 15,
1040
  "metadata": {},
1041
  "output_type": "execute_result"
1042
  }
 
92
  " <th>review_target</th>\n",
93
  " <th>review_title</th>\n",
94
  " <th>review_content</th>\n",
95
+ " <th>review_content_char_count</th>\n",
96
+ " <th>review_content_word_count</th>\n",
97
+ " <th>review_content_cleaned</th>\n",
98
+ " <th>review_title_cleaned</th>\n",
99
  " </tr>\n",
100
  " </thead>\n",
101
  " <tbody>\n",
 
107
  " <td>586</td>\n",
108
  " <td>108</td>\n",
109
  " <td>dx6340 year love picture good 35m easy use unl...</td>\n",
110
+ " <td>great camra</td>\n",
111
  " </tr>\n",
112
  " <tr>\n",
113
  " <th>1</th>\n",
 
117
  " <td>570</td>\n",
118
  " <td>88</td>\n",
119
  " <td>using book introductory organic spectroscopy c...</td>\n",
120
+ " <td>not great</td>\n",
121
  " </tr>\n",
122
  " <tr>\n",
123
  " <th>2</th>\n",
 
127
  " <td>214</td>\n",
128
  " <td>40</td>\n",
129
  " <td>read first chapter bombarded reference 199 end...</td>\n",
130
+ " <td>inaccurate disappointing</td>\n",
131
  " </tr>\n",
132
  " <tr>\n",
133
  " <th>3</th>\n",
 
137
  " <td>193</td>\n",
138
  " <td>34</td>\n",
139
  " <td>feel cheaply made battery contact rusted soon ...</td>\n",
140
+ " <td>equus 3340</td>\n",
141
  " </tr>\n",
142
  " <tr>\n",
143
  " <th>4</th>\n",
 
147
  " <td>198</td>\n",
148
  " <td>38</td>\n",
149
  " <td>love sheet sleek smooth really cool feel perfe...</td>\n",
150
+ " <td>awesome sheet</td>\n",
151
  " </tr>\n",
152
  " </tbody>\n",
153
  "</table>\n",
 
161
  "3 1 Equus 3340 \n",
162
  "4 2 awesome sheets! \n",
163
  "\n",
164
+ " review_content \\\n",
165
+ "0 I HAVE HAD THE DX6340 FOR ABOUT A YEAR.I LOVE ... \n",
166
+ "1 I'm using this book in an introductory organic... \n",
167
+ "2 I only read the first few chapters and was bom... \n",
168
+ "3 Feels cheaply made, the battery contacts were ... \n",
169
+ "4 I love these sheets! They are sleek & smooth w... \n",
170
  "\n",
171
+ " review_content_char_count review_content_word_count \\\n",
172
+ "0 586 108 \n",
173
+ "1 570 88 \n",
174
+ "2 214 40 \n",
175
+ "3 193 34 \n",
176
+ "4 198 38 \n",
177
+ "\n",
178
+ " review_content_cleaned review_title_cleaned \n",
179
+ "0 dx6340 year love picture good 35m easy use unl... great camra \n",
180
+ "1 using book introductory organic spectroscopy c... not great \n",
181
+ "2 read first chapter bombarded reference 199 end... inaccurate disappointing \n",
182
+ "3 feel cheaply made battery contact rusted soon ... equus 3340 \n",
183
+ "4 love sheet sleek smooth really cool feel perfe... awesome sheet "
184
  ]
185
  },
186
  "execution_count": 3,
 
205
  "text": [
206
  "<class 'pandas.DataFrame'>\n",
207
  "RangeIndex: 79972 entries, 0 to 79971\n",
208
+ "Data columns (total 7 columns):\n",
209
+ " # Column Non-Null Count Dtype\n",
210
+ "--- ------ -------------- -----\n",
211
+ " 0 review_target 79972 non-null str \n",
212
+ " 1 review_title 79972 non-null str \n",
213
+ " 2 review_content 79972 non-null str \n",
214
+ " 3 review_content_char_count 79972 non-null str \n",
215
+ " 4 review_content_word_count 79972 non-null str \n",
216
+ " 5 review_content_cleaned 79972 non-null str \n",
217
+ " 6 review_title_cleaned 79607 non-null str \n",
218
+ "dtypes: str(7)\n",
219
+ "memory usage: 4.3 MB\n"
220
  ]
221
  }
222
  ],
 
335
  }
336
  ],
337
  "source": [
338
+ "show_top_ngrams_by_class(feat_eng_train, 'review_target', 'review_content_cleaned', ngram_ranges=(1, 1), top_k=15)\n"
339
  ]
340
  },
341
  {
 
425
  }
426
  ],
427
  "source": [
428
+ "show_top_ngrams_by_class(feat_eng_train, 'review_target', 'review_content_cleaned', ngram_ranges=(2, 2), top_k=12)\n"
429
  ]
430
  },
431
  {
 
515
  }
516
  ],
517
  "source": [
518
+ "show_top_ngrams_by_class(feat_eng_train, 'review_target', 'review_content_cleaned', ngram_ranges=(3, 3), top_k=12)\n"
519
  ]
520
  },
521
  {
 
556
  " <th>review_target</th>\n",
557
  " <th>review_title</th>\n",
558
  " <th>review_content</th>\n",
559
+ " <th>review_content_char_count</th>\n",
560
+ " <th>review_content_word_count</th>\n",
561
+ " <th>review_content_cleaned</th>\n",
562
+ " <th>review_title_cleaned</th>\n",
563
+ " <th>review_content_exclamation_count</th>\n",
564
+ " <th>review_content_question_count</th>\n",
565
+ " <th>review_content_punctuation_count</th>\n",
566
+ " <th>review_content_avg_word_length</th>\n",
567
+ " <th>review_content_uppercase_count</th>\n",
568
+ " <th>review_content_uppercase_ratio</th>\n",
569
  " </tr>\n",
570
  " </thead>\n",
571
  " <tbody>\n",
 
577
  " <td>586</td>\n",
578
  " <td>108</td>\n",
579
  " <td>dx6340 year love picture good 35m easy use unl...</td>\n",
580
+ " <td>great camra</td>\n",
581
  " <td>0</td>\n",
582
  " <td>0</td>\n",
583
  " <td>14</td>\n",
 
593
  " <td>570</td>\n",
594
  " <td>88</td>\n",
595
  " <td>using book introductory organic spectroscopy c...</td>\n",
596
+ " <td>not great</td>\n",
597
  " <td>0</td>\n",
598
  " <td>0</td>\n",
599
  " <td>12</td>\n",
 
609
  " <td>214</td>\n",
610
  " <td>40</td>\n",
611
  " <td>read first chapter bombarded reference 199 end...</td>\n",
612
+ " <td>inaccurate disappointing</td>\n",
613
  " <td>0</td>\n",
614
  " <td>0</td>\n",
615
  " <td>5</td>\n",
 
625
  " <td>193</td>\n",
626
  " <td>34</td>\n",
627
  " <td>feel cheaply made battery contact rusted soon ...</td>\n",
628
+ " <td>equus 3340</td>\n",
629
  " <td>0</td>\n",
630
  " <td>0</td>\n",
631
  " <td>2</td>\n",
 
641
  " <td>198</td>\n",
642
  " <td>38</td>\n",
643
  " <td>love sheet sleek smooth really cool feel perfe...</td>\n",
644
+ " <td>awesome sheet</td>\n",
645
  " <td>1</td>\n",
646
  " <td>0</td>\n",
647
  " <td>9</td>\n",
 
663
  " <td>...</td>\n",
664
  " <td>...</td>\n",
665
  " <td>...</td>\n",
666
+ " <td>...</td>\n",
667
  " </tr>\n",
668
  " <tr>\n",
669
  " <th>79967</th>\n",
 
673
  " <td>694</td>\n",
674
  " <td>120</td>\n",
675
  " <td>not mistake book quality adult fiction spoonfu...</td>\n",
676
+ " <td>half step johnny tremain</td>\n",
677
  " <td>0</td>\n",
678
  " <td>0</td>\n",
679
  " <td>21</td>\n",
 
689
  " <td>518</td>\n",
690
  " <td>98</td>\n",
691
  " <td>grand theft auto very fun like others played b...</td>\n",
692
+ " <td>pretty good game</td>\n",
693
  " <td>0</td>\n",
694
  " <td>0</td>\n",
695
  " <td>11</td>\n",
 
705
  " <td>160</td>\n",
706
  " <td>26</td>\n",
707
  " <td>easy fun addictive soundtrack like nothing hea...</td>\n",
708
+ " <td>worth looking</td>\n",
709
  " <td>1</td>\n",
710
  " <td>0</td>\n",
711
  " <td>9</td>\n",
 
721
  " <td>142</td>\n",
722
  " <td>28</td>\n",
723
  " <td>not receive supco part prong different wire no...</td>\n",
724
+ " <td>part not ordered</td>\n",
725
  " <td>0</td>\n",
726
  " <td>0</td>\n",
727
  " <td>3</td>\n",
 
737
  " <td>915</td>\n",
738
  " <td>165</td>\n",
739
  " <td>wish book would followed book like stardust br...</td>\n",
740
+ " <td>aahh</td>\n",
741
  " <td>0</td>\n",
742
  " <td>10</td>\n",
743
  " <td>17</td>\n",
 
747
  " </tr>\n",
748
  " </tbody>\n",
749
  "</table>\n",
750
+ "<p>79972 rows × 13 columns</p>\n",
751
  "</div>"
752
  ],
753
  "text/plain": [
 
764
  "79970 1 Part was not what ordered \n",
765
  "79971 1 AAAHHH! \n",
766
  "\n",
767
+ " review_content \\\n",
768
+ "0 I HAVE HAD THE DX6340 FOR ABOUT A YEAR.I LOVE ... \n",
769
+ "1 I'm using this book in an introductory organic... \n",
770
+ "2 I only read the first few chapters and was bom... \n",
771
+ "3 Feels cheaply made, the battery contacts were ... \n",
772
+ "4 I love these sheets! They are sleek & smooth w... \n",
773
+ "... ... \n",
774
+ "79967 Do not mistake this book for quality adult fic... \n",
775
+ "79968 this grand theft auto is very fun like all the... \n",
776
+ "79969 Easy, fun, and addictive.And the soundtrack is... \n",
777
+ "79970 You will not receive Supco part. The prongs ar... \n",
778
+ "79971 I wish this book would have followed through w... \n",
779
+ "\n",
780
+ " review_content_char_count review_content_word_count \\\n",
781
+ "0 586 108 \n",
782
+ "1 570 88 \n",
783
+ "2 214 40 \n",
784
+ "3 193 34 \n",
785
+ "4 198 38 \n",
786
+ "... ... ... \n",
787
+ "79967 694 120 \n",
788
+ "79968 518 98 \n",
789
+ "79969 160 26 \n",
790
+ "79970 142 28 \n",
791
+ "79971 915 165 \n",
792
+ "\n",
793
+ " review_content_cleaned \\\n",
794
+ "0 dx6340 year love picture good 35m easy use unl... \n",
795
+ "1 using book introductory organic spectroscopy c... \n",
796
+ "2 read first chapter bombarded reference 199 end... \n",
797
+ "3 feel cheaply made battery contact rusted soon ... \n",
798
+ "4 love sheet sleek smooth really cool feel perfe... \n",
799
+ "... ... \n",
800
+ "79967 not mistake book quality adult fiction spoonfu... \n",
801
+ "79968 grand theft auto very fun like others played b... \n",
802
+ "79969 easy fun addictive soundtrack like nothing hea... \n",
803
+ "79970 not receive supco part prong different wire no... \n",
804
+ "79971 wish book would followed book like stardust br... \n",
805
  "\n",
806
+ " review_title_cleaned review_content_exclamation_count \\\n",
807
+ "0 great camra 0 \n",
808
+ "1 not great 0 \n",
809
+ "2 inaccurate disappointing 0 \n",
810
+ "3 equus 3340 0 \n",
811
+ "4 awesome sheet 1 \n",
812
+ "... ... ... \n",
813
+ "79967 half step johnny tremain 0 \n",
814
+ "79968 pretty good game 0 \n",
815
+ "79969 worth looking 1 \n",
816
+ "79970 part not ordered 0 \n",
817
+ "79971 aahh 0 \n",
818
  "\n",
819
+ " review_content_question_count review_content_punctuation_count \\\n",
820
+ "0 0 14 \n",
821
+ "1 0 12 \n",
822
+ "2 0 5 \n",
823
+ "3 0 2 \n",
824
+ "4 0 9 \n",
825
+ "... ... ... \n",
826
+ "79967 0 21 \n",
827
+ "79968 0 11 \n",
828
+ "79969 0 9 \n",
829
+ "79970 0 3 \n",
830
+ "79971 10 17 \n",
831
  "\n",
832
+ " review_content_avg_word_length review_content_uppercase_count \\\n",
833
+ "0 4.435185 455 \n",
834
+ "1 5.488636 4 \n",
835
+ "2 4.375000 3 \n",
836
+ "3 4.705882 2 \n",
837
+ "4 4.236842 4 \n",
838
+ "... ... ... \n",
839
+ "79967 4.791667 16 \n",
840
+ "79968 4.295918 0 \n",
841
+ "79969 5.192308 4 \n",
842
+ "79970 4.107143 4 \n",
843
+ "79971 4.551515 29 \n",
844
  "\n",
845
+ " review_content_uppercase_ratio \n",
846
+ "0 0.776451 \n",
847
+ "1 0.007018 \n",
848
+ "2 0.014019 \n",
849
+ "3 0.010363 \n",
850
+ "4 0.020202 \n",
851
+ "... ... \n",
852
+ "79967 0.023055 \n",
853
+ "79968 0.000000 \n",
854
+ "79969 0.025000 \n",
855
+ "79970 0.028169 \n",
856
+ "79971 0.031694 \n",
857
+ "\n",
858
+ "[79972 rows x 13 columns]"
859
  ]
860
  },
861
  "execution_count": 9,
 
868
  "feat_eng_train"
869
  ]
870
  },
871
+ {
872
+ "cell_type": "code",
873
+ "execution_count": 10,
874
+ "id": "117a7ee2",
875
+ "metadata": {},
876
+ "outputs": [
877
+ {
878
+ "data": {
879
+ "text/html": [
880
+ "<div>\n",
881
+ "<style scoped>\n",
882
+ " .dataframe tbody tr th:only-of-type {\n",
883
+ " vertical-align: middle;\n",
884
+ " }\n",
885
+ "\n",
886
+ " .dataframe tbody tr th {\n",
887
+ " vertical-align: top;\n",
888
+ " }\n",
889
+ "\n",
890
+ " .dataframe thead th {\n",
891
+ " text-align: right;\n",
892
+ " }\n",
893
+ "</style>\n",
894
+ "<table border=\"1\" class=\"dataframe\">\n",
895
+ " <thead>\n",
896
+ " <tr style=\"text-align: right;\">\n",
897
+ " <th></th>\n",
898
+ " <th>review_target</th>\n",
899
+ " <th>review_title</th>\n",
900
+ " <th>review_content</th>\n",
901
+ " <th>review_content_char_count</th>\n",
902
+ " <th>review_content_word_count</th>\n",
903
+ " <th>review_content_cleaned</th>\n",
904
+ " <th>review_title_cleaned</th>\n",
905
+ " <th>review_content_exclamation_count</th>\n",
906
+ " <th>review_content_question_count</th>\n",
907
+ " <th>review_content_punctuation_count</th>\n",
908
+ " <th>review_content_avg_word_length</th>\n",
909
+ " <th>review_content_uppercase_count</th>\n",
910
+ " <th>review_content_uppercase_ratio</th>\n",
911
+ " <th>review_title_exclamation_count</th>\n",
912
+ " <th>review_title_question_count</th>\n",
913
+ " <th>review_title_punctuation_count</th>\n",
914
+ " <th>review_title_word_count</th>\n",
915
+ " <th>review_title_avg_word_length</th>\n",
916
+ " <th>review_title_uppercase_count</th>\n",
917
+ " <th>review_title_uppercase_ratio</th>\n",
918
+ " </tr>\n",
919
+ " </thead>\n",
920
+ " <tbody>\n",
921
+ " <tr>\n",
922
+ " <th>0</th>\n",
923
+ " <td>2</td>\n",
924
+ " <td>GREAT CAMRA</td>\n",
925
+ " <td>I HAVE HAD THE DX6340 FOR ABOUT A YEAR.I LOVE ...</td>\n",
926
+ " <td>586</td>\n",
927
+ " <td>108</td>\n",
928
+ " <td>dx6340 year love picture good 35m easy use unl...</td>\n",
929
+ " <td>great camra</td>\n",
930
+ " <td>0</td>\n",
931
+ " <td>0</td>\n",
932
+ " <td>14</td>\n",
933
+ " <td>4.435185</td>\n",
934
+ " <td>455</td>\n",
935
+ " <td>0.776451</td>\n",
936
+ " <td>0</td>\n",
937
+ " <td>0</td>\n",
938
+ " <td>0</td>\n",
939
+ " <td>2</td>\n",
940
+ " <td>5.000000</td>\n",
941
+ " <td>10</td>\n",
942
+ " <td>0.909091</td>\n",
943
+ " </tr>\n",
944
+ " <tr>\n",
945
+ " <th>1</th>\n",
946
+ " <td>1</td>\n",
947
+ " <td>not so great</td>\n",
948
+ " <td>I'm using this book in an introductory organic...</td>\n",
949
+ " <td>570</td>\n",
950
+ " <td>88</td>\n",
951
+ " <td>using book introductory organic spectroscopy c...</td>\n",
952
+ " <td>not great</td>\n",
953
+ " <td>0</td>\n",
954
+ " <td>0</td>\n",
955
+ " <td>12</td>\n",
956
+ " <td>5.488636</td>\n",
957
+ " <td>4</td>\n",
958
+ " <td>0.007018</td>\n",
959
+ " <td>0</td>\n",
960
+ " <td>0</td>\n",
961
+ " <td>0</td>\n",
962
+ " <td>3</td>\n",
963
+ " <td>3.333333</td>\n",
964
+ " <td>0</td>\n",
965
+ " <td>0.000000</td>\n",
966
+ " </tr>\n",
967
+ " <tr>\n",
968
+ " <th>2</th>\n",
969
+ " <td>1</td>\n",
970
+ " <td>Inaccurate and disappointing</td>\n",
971
+ " <td>I only read the first few chapters and was bom...</td>\n",
972
+ " <td>214</td>\n",
973
+ " <td>40</td>\n",
974
+ " <td>read first chapter bombarded reference 199 end...</td>\n",
975
+ " <td>inaccurate disappointing</td>\n",
976
+ " <td>0</td>\n",
977
+ " <td>0</td>\n",
978
+ " <td>5</td>\n",
979
+ " <td>4.375000</td>\n",
980
+ " <td>3</td>\n",
981
+ " <td>0.014019</td>\n",
982
+ " <td>0</td>\n",
983
+ " <td>0</td>\n",
984
+ " <td>0</td>\n",
985
+ " <td>3</td>\n",
986
+ " <td>8.666667</td>\n",
987
+ " <td>1</td>\n",
988
+ " <td>0.035714</td>\n",
989
+ " </tr>\n",
990
+ " <tr>\n",
991
+ " <th>3</th>\n",
992
+ " <td>1</td>\n",
993
+ " <td>Equus 3340</td>\n",
994
+ " <td>Feels cheaply made, the battery contacts were ...</td>\n",
995
+ " <td>193</td>\n",
996
+ " <td>34</td>\n",
997
+ " <td>feel cheaply made battery contact rusted soon ...</td>\n",
998
+ " <td>equus 3340</td>\n",
999
+ " <td>0</td>\n",
1000
+ " <td>0</td>\n",
1001
+ " <td>2</td>\n",
1002
+ " <td>4.705882</td>\n",
1003
+ " <td>2</td>\n",
1004
+ " <td>0.010363</td>\n",
1005
+ " <td>0</td>\n",
1006
+ " <td>0</td>\n",
1007
+ " <td>0</td>\n",
1008
+ " <td>2</td>\n",
1009
+ " <td>4.500000</td>\n",
1010
+ " <td>1</td>\n",
1011
+ " <td>0.100000</td>\n",
1012
+ " </tr>\n",
1013
+ " <tr>\n",
1014
+ " <th>4</th>\n",
1015
+ " <td>2</td>\n",
1016
+ " <td>awesome sheets!</td>\n",
1017
+ " <td>I love these sheets! They are sleek &amp; smooth w...</td>\n",
1018
+ " <td>198</td>\n",
1019
+ " <td>38</td>\n",
1020
+ " <td>love sheet sleek smooth really cool feel perfe...</td>\n",
1021
+ " <td>awesome sheet</td>\n",
1022
+ " <td>1</td>\n",
1023
+ " <td>0</td>\n",
1024
+ " <td>9</td>\n",
1025
+ " <td>4.236842</td>\n",
1026
+ " <td>4</td>\n",
1027
+ " <td>0.020202</td>\n",
1028
+ " <td>1</td>\n",
1029
+ " <td>0</td>\n",
1030
+ " <td>1</td>\n",
1031
+ " <td>2</td>\n",
1032
+ " <td>7.000000</td>\n",
1033
+ " <td>0</td>\n",
1034
+ " <td>0.000000</td>\n",
1035
+ " </tr>\n",
1036
+ " <tr>\n",
1037
+ " <th>...</th>\n",
1038
+ " <td>...</td>\n",
1039
+ " <td>...</td>\n",
1040
+ " <td>...</td>\n",
1041
+ " <td>...</td>\n",
1042
+ " <td>...</td>\n",
1043
+ " <td>...</td>\n",
1044
+ " <td>...</td>\n",
1045
+ " <td>...</td>\n",
1046
+ " <td>...</td>\n",
1047
+ " <td>...</td>\n",
1048
+ " <td>...</td>\n",
1049
+ " <td>...</td>\n",
1050
+ " <td>...</td>\n",
1051
+ " <td>...</td>\n",
1052
+ " <td>...</td>\n",
1053
+ " <td>...</td>\n",
1054
+ " <td>...</td>\n",
1055
+ " <td>...</td>\n",
1056
+ " <td>...</td>\n",
1057
+ " <td>...</td>\n",
1058
+ " </tr>\n",
1059
+ " <tr>\n",
1060
+ " <th>79967</th>\n",
1061
+ " <td>1</td>\n",
1062
+ " <td>A Half-Step Above Johnny Tremain</td>\n",
1063
+ " <td>Do not mistake this book for quality adult fic...</td>\n",
1064
+ " <td>694</td>\n",
1065
+ " <td>120</td>\n",
1066
+ " <td>not mistake book quality adult fiction spoonfu...</td>\n",
1067
+ " <td>half step johnny tremain</td>\n",
1068
+ " <td>0</td>\n",
1069
+ " <td>0</td>\n",
1070
+ " <td>21</td>\n",
1071
+ " <td>4.791667</td>\n",
1072
+ " <td>16</td>\n",
1073
+ " <td>0.023055</td>\n",
1074
+ " <td>0</td>\n",
1075
+ " <td>0</td>\n",
1076
+ " <td>1</td>\n",
1077
+ " <td>5</td>\n",
1078
+ " <td>5.600000</td>\n",
1079
+ " <td>6</td>\n",
1080
+ " <td>0.187500</td>\n",
1081
+ " </tr>\n",
1082
+ " <tr>\n",
1083
+ " <th>79968</th>\n",
1084
+ " <td>2</td>\n",
1085
+ " <td>pretty good game</td>\n",
1086
+ " <td>this grand theft auto is very fun like all the...</td>\n",
1087
+ " <td>518</td>\n",
1088
+ " <td>98</td>\n",
1089
+ " <td>grand theft auto very fun like others played b...</td>\n",
1090
+ " <td>pretty good game</td>\n",
1091
+ " <td>0</td>\n",
1092
+ " <td>0</td>\n",
1093
+ " <td>11</td>\n",
1094
+ " <td>4.295918</td>\n",
1095
+ " <td>0</td>\n",
1096
+ " <td>0.000000</td>\n",
1097
+ " <td>0</td>\n",
1098
+ " <td>0</td>\n",
1099
+ " <td>0</td>\n",
1100
+ " <td>3</td>\n",
1101
+ " <td>4.666667</td>\n",
1102
+ " <td>0</td>\n",
1103
+ " <td>0.000000</td>\n",
1104
+ " </tr>\n",
1105
+ " <tr>\n",
1106
+ " <th>79969</th>\n",
1107
+ " <td>2</td>\n",
1108
+ " <td>WORTH LOOKING FOR</td>\n",
1109
+ " <td>Easy, fun, and addictive.And the soundtrack is...</td>\n",
1110
+ " <td>160</td>\n",
1111
+ " <td>26</td>\n",
1112
+ " <td>easy fun addictive soundtrack like nothing hea...</td>\n",
1113
+ " <td>worth looking</td>\n",
1114
+ " <td>1</td>\n",
1115
+ " <td>0</td>\n",
1116
+ " <td>9</td>\n",
1117
+ " <td>5.192308</td>\n",
1118
+ " <td>4</td>\n",
1119
+ " <td>0.025000</td>\n",
1120
+ " <td>0</td>\n",
1121
+ " <td>0</td>\n",
1122
+ " <td>0</td>\n",
1123
+ " <td>3</td>\n",
1124
+ " <td>5.000000</td>\n",
1125
+ " <td>15</td>\n",
1126
+ " <td>0.882353</td>\n",
1127
+ " </tr>\n",
1128
+ " <tr>\n",
1129
+ " <th>79970</th>\n",
1130
+ " <td>1</td>\n",
1131
+ " <td>Part was not what ordered</td>\n",
1132
+ " <td>You will not receive Supco part. The prongs ar...</td>\n",
1133
+ " <td>142</td>\n",
1134
+ " <td>28</td>\n",
1135
+ " <td>not receive supco part prong different wire no...</td>\n",
1136
+ " <td>part not ordered</td>\n",
1137
+ " <td>0</td>\n",
1138
+ " <td>0</td>\n",
1139
+ " <td>3</td>\n",
1140
+ " <td>4.107143</td>\n",
1141
+ " <td>4</td>\n",
1142
+ " <td>0.028169</td>\n",
1143
+ " <td>0</td>\n",
1144
+ " <td>0</td>\n",
1145
+ " <td>0</td>\n",
1146
+ " <td>5</td>\n",
1147
+ " <td>4.200000</td>\n",
1148
+ " <td>1</td>\n",
1149
+ " <td>0.040000</td>\n",
1150
+ " </tr>\n",
1151
+ " <tr>\n",
1152
+ " <th>79971</th>\n",
1153
+ " <td>1</td>\n",
1154
+ " <td>AAAHHH!</td>\n",
1155
+ " <td>I wish this book would have followed through w...</td>\n",
1156
+ " <td>915</td>\n",
1157
+ " <td>165</td>\n",
1158
+ " <td>wish book would followed book like stardust br...</td>\n",
1159
+ " <td>aahh</td>\n",
1160
+ " <td>0</td>\n",
1161
+ " <td>10</td>\n",
1162
+ " <td>17</td>\n",
1163
+ " <td>4.551515</td>\n",
1164
+ " <td>29</td>\n",
1165
+ " <td>0.031694</td>\n",
1166
+ " <td>1</td>\n",
1167
+ " <td>0</td>\n",
1168
+ " <td>1</td>\n",
1169
+ " <td>1</td>\n",
1170
+ " <td>7.000000</td>\n",
1171
+ " <td>6</td>\n",
1172
+ " <td>0.857143</td>\n",
1173
+ " </tr>\n",
1174
+ " </tbody>\n",
1175
+ "</table>\n",
1176
+ "<p>79972 rows × 20 columns</p>\n",
1177
+ "</div>"
1178
+ ],
1179
+ "text/plain": [
1180
+ " review_target review_title \\\n",
1181
+ "0 2 GREAT CAMRA \n",
1182
+ "1 1 not so great \n",
1183
+ "2 1 Inaccurate and disappointing \n",
1184
+ "3 1 Equus 3340 \n",
1185
+ "4 2 awesome sheets! \n",
1186
+ "... ... ... \n",
1187
+ "79967 1 A Half-Step Above Johnny Tremain \n",
1188
+ "79968 2 pretty good game \n",
1189
+ "79969 2 WORTH LOOKING FOR \n",
1190
+ "79970 1 Part was not what ordered \n",
1191
+ "79971 1 AAAHHH! \n",
1192
+ "\n",
1193
+ " review_content \\\n",
1194
+ "0 I HAVE HAD THE DX6340 FOR ABOUT A YEAR.I LOVE ... \n",
1195
+ "1 I'm using this book in an introductory organic... \n",
1196
+ "2 I only read the first few chapters and was bom... \n",
1197
+ "3 Feels cheaply made, the battery contacts were ... \n",
1198
+ "4 I love these sheets! They are sleek & smooth w... \n",
1199
+ "... ... \n",
1200
+ "79967 Do not mistake this book for quality adult fic... \n",
1201
+ "79968 this grand theft auto is very fun like all the... \n",
1202
+ "79969 Easy, fun, and addictive.And the soundtrack is... \n",
1203
+ "79970 You will not receive Supco part. The prongs ar... \n",
1204
+ "79971 I wish this book would have followed through w... \n",
1205
+ "\n",
1206
+ " review_content_char_count review_content_word_count \\\n",
1207
+ "0 586 108 \n",
1208
+ "1 570 88 \n",
1209
+ "2 214 40 \n",
1210
+ "3 193 34 \n",
1211
+ "4 198 38 \n",
1212
+ "... ... ... \n",
1213
+ "79967 694 120 \n",
1214
+ "79968 518 98 \n",
1215
+ "79969 160 26 \n",
1216
+ "79970 142 28 \n",
1217
+ "79971 915 165 \n",
1218
+ "\n",
1219
+ " review_content_cleaned \\\n",
1220
+ "0 dx6340 year love picture good 35m easy use unl... \n",
1221
+ "1 using book introductory organic spectroscopy c... \n",
1222
+ "2 read first chapter bombarded reference 199 end... \n",
1223
+ "3 feel cheaply made battery contact rusted soon ... \n",
1224
+ "4 love sheet sleek smooth really cool feel perfe... \n",
1225
+ "... ... \n",
1226
+ "79967 not mistake book quality adult fiction spoonfu... \n",
1227
+ "79968 grand theft auto very fun like others played b... \n",
1228
+ "79969 easy fun addictive soundtrack like nothing hea... \n",
1229
+ "79970 not receive supco part prong different wire no... \n",
1230
+ "79971 wish book would followed book like stardust br... \n",
1231
+ "\n",
1232
+ " review_title_cleaned review_content_exclamation_count \\\n",
1233
+ "0 great camra 0 \n",
1234
+ "1 not great 0 \n",
1235
+ "2 inaccurate disappointing 0 \n",
1236
+ "3 equus 3340 0 \n",
1237
+ "4 awesome sheet 1 \n",
1238
+ "... ... ... \n",
1239
+ "79967 half step johnny tremain 0 \n",
1240
+ "79968 pretty good game 0 \n",
1241
+ "79969 worth looking 1 \n",
1242
+ "79970 part not ordered 0 \n",
1243
+ "79971 aahh 0 \n",
1244
+ "\n",
1245
+ " review_content_question_count review_content_punctuation_count \\\n",
1246
+ "0 0 14 \n",
1247
+ "1 0 12 \n",
1248
+ "2 0 5 \n",
1249
+ "3 0 2 \n",
1250
+ "4 0 9 \n",
1251
+ "... ... ... \n",
1252
+ "79967 0 21 \n",
1253
+ "79968 0 11 \n",
1254
+ "79969 0 9 \n",
1255
+ "79970 0 3 \n",
1256
+ "79971 10 17 \n",
1257
+ "\n",
1258
+ " review_content_avg_word_length review_content_uppercase_count \\\n",
1259
+ "0 4.435185 455 \n",
1260
+ "1 5.488636 4 \n",
1261
+ "2 4.375000 3 \n",
1262
+ "3 4.705882 2 \n",
1263
+ "4 4.236842 4 \n",
1264
+ "... ... ... \n",
1265
+ "79967 4.791667 16 \n",
1266
+ "79968 4.295918 0 \n",
1267
+ "79969 5.192308 4 \n",
1268
+ "79970 4.107143 4 \n",
1269
+ "79971 4.551515 29 \n",
1270
+ "\n",
1271
+ " review_content_uppercase_ratio review_title_exclamation_count \\\n",
1272
+ "0 0.776451 0 \n",
1273
+ "1 0.007018 0 \n",
1274
+ "2 0.014019 0 \n",
1275
+ "3 0.010363 0 \n",
1276
+ "4 0.020202 1 \n",
1277
+ "... ... ... \n",
1278
+ "79967 0.023055 0 \n",
1279
+ "79968 0.000000 0 \n",
1280
+ "79969 0.025000 0 \n",
1281
+ "79970 0.028169 0 \n",
1282
+ "79971 0.031694 1 \n",
1283
+ "\n",
1284
+ " review_title_question_count review_title_punctuation_count \\\n",
1285
+ "0 0 0 \n",
1286
+ "1 0 0 \n",
1287
+ "2 0 0 \n",
1288
+ "3 0 0 \n",
1289
+ "4 0 1 \n",
1290
+ "... ... ... \n",
1291
+ "79967 0 1 \n",
1292
+ "79968 0 0 \n",
1293
+ "79969 0 0 \n",
1294
+ "79970 0 0 \n",
1295
+ "79971 0 1 \n",
1296
+ "\n",
1297
+ " review_title_word_count review_title_avg_word_length \\\n",
1298
+ "0 2 5.000000 \n",
1299
+ "1 3 3.333333 \n",
1300
+ "2 3 8.666667 \n",
1301
+ "3 2 4.500000 \n",
1302
+ "4 2 7.000000 \n",
1303
+ "... ... ... \n",
1304
+ "79967 5 5.600000 \n",
1305
+ "79968 3 4.666667 \n",
1306
+ "79969 3 5.000000 \n",
1307
+ "79970 5 4.200000 \n",
1308
+ "79971 1 7.000000 \n",
1309
+ "\n",
1310
+ " review_title_uppercase_count review_title_uppercase_ratio \n",
1311
+ "0 10 0.909091 \n",
1312
+ "1 0 0.000000 \n",
1313
+ "2 1 0.035714 \n",
1314
+ "3 1 0.100000 \n",
1315
+ "4 0 0.000000 \n",
1316
+ "... ... ... \n",
1317
+ "79967 6 0.187500 \n",
1318
+ "79968 0 0.000000 \n",
1319
+ "79969 15 0.882353 \n",
1320
+ "79970 1 0.040000 \n",
1321
+ "79971 6 0.857143 \n",
1322
+ "\n",
1323
+ "[79972 rows x 20 columns]"
1324
+ ]
1325
+ },
1326
+ "execution_count": 10,
1327
+ "metadata": {},
1328
+ "output_type": "execute_result"
1329
+ }
1330
+ ],
1331
+ "source": [
1332
+ "feat_eng_train = add_basic_meta_features(feat_eng_train, text_col='review_title')\n",
1333
+ "feat_eng_train"
1334
+ ]
1335
+ },
1336
  {
1337
  "cell_type": "markdown",
1338
  "id": "a0afd7c0",
 
1345
  },
1346
  {
1347
  "cell_type": "code",
1348
+ "execution_count": 11,
1349
  "id": "8757f67a",
1350
  "metadata": {},
1351
  "outputs": [
 
1354
  "output_type": "stream",
1355
  "text": [
1356
  "Correlations between meta-features and target:\n",
1357
+ "review_title_exclamation_count 0.031584\n",
1358
+ "review_content_exclamation_count 0.027933\n",
1359
+ "review_content_avg_word_length 0.013098\n",
1360
+ "review_title_uppercase_ratio 0.004225\n",
1361
+ "review_content_uppercase_ratio -0.000912\n",
1362
+ "review_title_uppercase_count -0.001417\n",
1363
+ "review_title_avg_word_length -0.006423\n",
1364
+ "review_content_uppercase_count -0.019469\n",
1365
+ "review_title_word_count -0.021987\n",
1366
+ "review_title_punctuation_count -0.037998\n",
1367
+ "review_content_char_count -0.067318\n",
1368
+ "review_content_punctuation_count -0.069328\n",
1369
+ "review_title_question_count -0.070542\n",
1370
+ "review_content_word_count -0.071107\n",
1371
+ "review_content_question_count -0.090043\n",
1372
  "dtype: float64\n"
1373
  ]
1374
  }
1375
  ],
1376
  "source": [
1377
+ "meta_cols= ['review_content_char_count', 'review_content_word_count', 'review_content_exclamation_count', 'review_content_question_count', 'review_content_punctuation_count', 'review_content_avg_word_length', 'review_content_uppercase_count', 'review_content_uppercase_ratio', 'review_title_exclamation_count', 'review_title_question_count', 'review_title_punctuation_count', 'review_title_word_count', 'review_title_avg_word_length', 'review_title_uppercase_count', 'review_title_uppercase_ratio']\n",
1378
  "try:\n",
1379
  " target_numeric = pd.to_numeric(feat_eng_train['review_target'], errors='coerce')\n",
1380
  " if target_numeric.isna().sum() > len(target_numeric) * 0.5:\n",
 
1418
  },
1419
  {
1420
  "cell_type": "code",
1421
+ "execution_count": 12,
1422
  "id": "a67f9b1c",
1423
  "metadata": {},
1424
  "outputs": [
 
1432
  ],
1433
  "source": [
1434
  "tfidf_train = TfidfVectorizer(ngram_range=(1,2), max_features=20000, stop_words='english')\n",
1435
+ "tfidf_train = tfidf_train.fit_transform(feat_eng_train['review_content_cleaned'].astype(str))\n",
1436
  "print('TF-IDF shape:', tfidf_train.shape)"
1437
  ]
1438
  },
1439
  {
1440
  "cell_type": "code",
1441
+ "execution_count": 13,
1442
  "id": "78552554",
1443
  "metadata": {},
1444
  "outputs": [
 
1464
  " [ 0.06429021, -0.04825038]])"
1465
  ]
1466
  },
1467
+ "execution_count": 13,
1468
  "metadata": {},
1469
  "output_type": "execute_result"
1470
  }
 
1475
  },
1476
  {
1477
  "cell_type": "code",
1478
+ "execution_count": 14,
1479
  "id": "a9ab3966",
1480
  "metadata": {},
1481
  "outputs": [
 
1501
  " [-13.600362 , -14.125854 ]], dtype=float32)"
1502
  ]
1503
  },
1504
+ "execution_count": 14,
1505
  "metadata": {},
1506
  "output_type": "execute_result"
1507
  }
 
1520
  },
1521
  {
1522
  "cell_type": "code",
1523
+ "execution_count": 15,
1524
  "id": "90061617",
1525
  "metadata": {},
1526
  "outputs": [
 
1537
  "{'csv': PosixPath('data/processed/feat_eng_train.csv')}"
1538
  ]
1539
  },
1540
+ "execution_count": 15,
1541
  "metadata": {},
1542
  "output_type": "execute_result"
1543
  }
 
1556
  },
1557
  {
1558
  "cell_type": "code",
1559
+ "execution_count": 16,
1560
  "id": "2481b653",
1561
  "metadata": {},
1562
  "outputs": [
 
1573
  "{'vectorizer': PosixPath('data/vectorizers/tfidf_train.joblib')}"
1574
  ]
1575
  },
1576
+ "execution_count": 16,
1577
  "metadata": {},
1578
  "output_type": "execute_result"
1579
  }
src/config/settings.py CHANGED
@@ -1,4 +1,4 @@
1
  from dotenv import loadenv
2
  import os
3
 
4
- PROJECT_NAME = os.getenv("", "")
 
1
import os

from dotenv import load_dotenv

# Read key/value pairs from a local .env file (if one exists) into the process
# environment before any setting is looked up. By default load_dotenv() does
# not override variables that are already set in the environment.
load_dotenv()

# Human-readable project name; overridable through the PROJECT_NAME variable.
PROJECT_NAME = os.getenv("PROJECT_NAME", "SentimentSleuth")
src/utils/helpers.py CHANGED
@@ -275,25 +275,31 @@ def show_top_ngrams_by_class(df, target_col='review_target', text_col='review_cl
275
  def add_basic_meta_features(df: pd.DataFrame, text_col: str = 'review_content') -> pd.DataFrame:
276
  """
277
  Add basic meta-features to `df` based on the text column `text_col`.
278
- Features added: exclamation_count, question_count, uppercase_count, uppercase_ratio,
279
- word_count, avg_word_length, punctuation_count.
280
-
281
- The function is tolerant if the column is missing (raises KeyError).
282
  """
283
  if text_col not in df.columns:
284
  raise KeyError(f"Text column '{text_col}' not found in dataframe")
285
 
 
 
 
 
 
286
  s = df[text_col].fillna("").astype(str)
287
  df = df.copy()
288
- df['exclamation_count'] = s.str.count('!')
289
- df['question_count'] = s.str.count('\?')
290
- df['punctuation_count'] = s.str.count(r"[^\w\s]")
291
- df['word_count'] = s.str.split().apply(lambda ws: len(ws) if isinstance(ws, list) else 0)
292
- df['avg_word_length'] = s.str.split().apply(lambda ws: np.mean([len(w) for w in ws]) if isinstance(ws, list) and len(ws) else 0)
293
- # Uppercase counts and ratio (use string length to avoid division by zero)
294
- df['uppercase_count'] = s.apply(lambda x: sum(1 for c in x if c.isupper()))
 
 
295
  lengths = s.str.len().replace(0, 1)
296
- df['uppercase_ratio'] = df['uppercase_count'] / lengths
 
297
  return df
298
 
299
 
 
275
def add_basic_meta_features(df: pd.DataFrame, text_col: str = 'review_content') -> pd.DataFrame:
    """
    Add basic meta-features to `df` based on the text column `text_col`.

    Feature column names are prefixed with a sanitized version of `text_col`
    (e.g. "review_title" -> "review_title_exclamation_count") to avoid collisions
    when the function is applied to several text columns of the same frame.

    Columns added (each prefixed with "<prefix>_"):
        exclamation_count, question_count, punctuation_count, word_count,
        avg_word_length, uppercase_count, uppercase_ratio.

    Args:
        df: Input dataframe; it is not modified.
        text_col: Label of the column holding the text. Missing values are
            treated as empty strings.

    Returns:
        A copy of `df` with the feature columns added. An existing column with
        the same name as a feature column is overwritten in the copy.

    Raises:
        KeyError: If `text_col` is not a column of `df`.
    """
    if text_col not in df.columns:
        raise KeyError(f"Text column '{text_col}' not found in dataframe")

    # Sanitize the column label for use as a prefix. str() lets non-string
    # labels (e.g. integer column names) through re.sub instead of raising.
    prefix = re.sub(r'\W+', '_', str(text_col)).strip('_').lower() or 'text'

    text = df[text_col].fillna("").astype(str)
    words = text.str.split()  # whitespace tokenisation, computed once and reused
    out = df.copy()

    out[f'{prefix}_exclamation_count'] = text.str.count(r'!')
    out[f'{prefix}_question_count'] = text.str.count(r'\?')
    # Anything that is neither a word character nor whitespace.
    out[f'{prefix}_punctuation_count'] = text.str.count(r"[^\w\s]")
    out[f'{prefix}_word_count'] = words.map(len)
    out[f'{prefix}_avg_word_length'] = words.map(
        lambda ws: sum(map(len, ws)) / len(ws) if len(ws) else 0.0
    )
    uppercase = text.map(lambda t: sum(map(str.isupper, t)))
    out[f'{prefix}_uppercase_count'] = uppercase
    # Divide by at least 1 so empty strings yield a ratio of 0 rather than NaN.
    out[f'{prefix}_uppercase_ratio'] = uppercase / text.str.len().clip(lower=1)

    return out
304
 
305