Upload Rule-Based Keyword Annotator.ipynb
#15
by
robzjgman
- opened
5 _ Rule-Based/Rule-Based Keyword Annotator.ipynb
ADDED
|
@@ -0,0 +1,736 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"nbformat": 4,
|
| 3 |
+
"nbformat_minor": 0,
|
| 4 |
+
"metadata": {
|
| 5 |
+
"colab": {
|
| 6 |
+
"provenance": []
|
| 7 |
+
},
|
| 8 |
+
"kernelspec": {
|
| 9 |
+
"name": "python3",
|
| 10 |
+
"display_name": "Python 3"
|
| 11 |
+
},
|
| 12 |
+
"language_info": {
|
| 13 |
+
"name": "python"
|
| 14 |
+
}
|
| 15 |
+
},
|
| 16 |
+
"cells": [
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "code",
|
| 19 |
+
"execution_count": null,
|
| 20 |
+
"metadata": {
|
| 21 |
+
"id": "h4-1lYVz9trX",
|
| 22 |
+
"colab": {
|
| 23 |
+
"base_uri": "https://localhost:8080/"
|
| 24 |
+
},
|
| 25 |
+
"outputId": "f3755b19-7a47-4327-e39c-7da6fd5eeab8"
|
| 26 |
+
},
|
| 27 |
+
"outputs": [
|
| 28 |
+
{
|
| 29 |
+
"output_type": "stream",
|
| 30 |
+
"name": "stdout",
|
| 31 |
+
"text": [
|
| 32 |
+
"============================================================\n",
|
| 33 |
+
"RULE-BASED KEYWORD ASPECT ANNOTATOR\n",
|
| 34 |
+
"Automated aspect annotation using keywords and context rules\n",
|
| 35 |
+
"============================================================\n",
|
| 36 |
+
"\n",
|
| 37 |
+
"Reading dataset...\n",
|
| 38 |
+
"Successfully read dataset with 10510 rows\n",
|
| 39 |
+
"Using column 'review' for reviews\n",
|
| 40 |
+
"Dataset shape: (10510, 2)\n",
|
| 41 |
+
"Dropped columns: ['sentiment']\n",
|
| 42 |
+
"Processing all 10510 reviews...\n",
|
| 43 |
+
" Processed 100/10510 reviews...\n",
|
| 44 |
+
" Processed 200/10510 reviews...\n",
|
| 45 |
+
" Processed 300/10510 reviews...\n",
|
| 46 |
+
" Processed 400/10510 reviews...\n",
|
| 47 |
+
" Processed 500/10510 reviews...\n",
|
| 48 |
+
" Processed 600/10510 reviews...\n",
|
| 49 |
+
" Processed 700/10510 reviews...\n",
|
| 50 |
+
" Processed 800/10510 reviews...\n",
|
| 51 |
+
" Processed 900/10510 reviews...\n",
|
| 52 |
+
" Processed 1000/10510 reviews...\n",
|
| 53 |
+
" Processed 1100/10510 reviews...\n",
|
| 54 |
+
" Processed 1200/10510 reviews...\n",
|
| 55 |
+
" Processed 1300/10510 reviews...\n",
|
| 56 |
+
" Processed 1400/10510 reviews...\n",
|
| 57 |
+
" Processed 1500/10510 reviews...\n",
|
| 58 |
+
" Processed 1600/10510 reviews...\n",
|
| 59 |
+
" Processed 1700/10510 reviews...\n",
|
| 60 |
+
" Processed 1800/10510 reviews...\n",
|
| 61 |
+
" Processed 1900/10510 reviews...\n",
|
| 62 |
+
" Processed 2000/10510 reviews...\n",
|
| 63 |
+
" Processed 2100/10510 reviews...\n",
|
| 64 |
+
" Processed 2200/10510 reviews...\n",
|
| 65 |
+
" Processed 2300/10510 reviews...\n",
|
| 66 |
+
" Processed 2400/10510 reviews...\n",
|
| 67 |
+
" Processed 2500/10510 reviews...\n",
|
| 68 |
+
" Processed 2600/10510 reviews...\n",
|
| 69 |
+
" Processed 2700/10510 reviews...\n",
|
| 70 |
+
" Processed 2800/10510 reviews...\n",
|
| 71 |
+
" Processed 2900/10510 reviews...\n",
|
| 72 |
+
" Processed 3000/10510 reviews...\n",
|
| 73 |
+
" Processed 3100/10510 reviews...\n",
|
| 74 |
+
" Processed 3200/10510 reviews...\n",
|
| 75 |
+
" Processed 3300/10510 reviews...\n",
|
| 76 |
+
" Processed 3400/10510 reviews...\n",
|
| 77 |
+
" Processed 3500/10510 reviews...\n",
|
| 78 |
+
" Processed 3600/10510 reviews...\n",
|
| 79 |
+
" Processed 3700/10510 reviews...\n",
|
| 80 |
+
" Processed 3800/10510 reviews...\n",
|
| 81 |
+
" Processed 3900/10510 reviews...\n",
|
| 82 |
+
" Processed 4000/10510 reviews...\n",
|
| 83 |
+
" Processed 4100/10510 reviews...\n",
|
| 84 |
+
" Processed 4200/10510 reviews...\n",
|
| 85 |
+
" Processed 4300/10510 reviews...\n",
|
| 86 |
+
" Processed 4400/10510 reviews...\n",
|
| 87 |
+
" Processed 4500/10510 reviews...\n",
|
| 88 |
+
" Processed 4600/10510 reviews...\n",
|
| 89 |
+
" Processed 4700/10510 reviews...\n",
|
| 90 |
+
" Processed 4800/10510 reviews...\n",
|
| 91 |
+
" Processed 4900/10510 reviews...\n",
|
| 92 |
+
" Processed 5000/10510 reviews...\n",
|
| 93 |
+
" Processed 5100/10510 reviews...\n",
|
| 94 |
+
" Processed 5200/10510 reviews...\n",
|
| 95 |
+
" Processed 5300/10510 reviews...\n",
|
| 96 |
+
" Processed 5400/10510 reviews...\n",
|
| 97 |
+
" Processed 5500/10510 reviews...\n",
|
| 98 |
+
" Processed 5600/10510 reviews...\n",
|
| 99 |
+
" Processed 5700/10510 reviews...\n",
|
| 100 |
+
" Processed 5800/10510 reviews...\n",
|
| 101 |
+
" Processed 5900/10510 reviews...\n",
|
| 102 |
+
" Processed 6000/10510 reviews...\n",
|
| 103 |
+
" Processed 6100/10510 reviews...\n",
|
| 104 |
+
" Processed 6200/10510 reviews...\n",
|
| 105 |
+
" Processed 6300/10510 reviews...\n",
|
| 106 |
+
" Processed 6400/10510 reviews...\n",
|
| 107 |
+
" Processed 6500/10510 reviews...\n",
|
| 108 |
+
" Processed 6600/10510 reviews...\n",
|
| 109 |
+
" Processed 6700/10510 reviews...\n",
|
| 110 |
+
" Processed 6800/10510 reviews...\n",
|
| 111 |
+
" Processed 6900/10510 reviews...\n",
|
| 112 |
+
" Processed 7000/10510 reviews...\n",
|
| 113 |
+
" Processed 7100/10510 reviews...\n",
|
| 114 |
+
" Processed 7200/10510 reviews...\n",
|
| 115 |
+
" Processed 7300/10510 reviews...\n",
|
| 116 |
+
" Processed 7400/10510 reviews...\n",
|
| 117 |
+
" Processed 7500/10510 reviews...\n",
|
| 118 |
+
" Processed 7600/10510 reviews...\n",
|
| 119 |
+
" Processed 7700/10510 reviews...\n",
|
| 120 |
+
" Processed 7800/10510 reviews...\n",
|
| 121 |
+
" Processed 7900/10510 reviews...\n",
|
| 122 |
+
" Processed 8000/10510 reviews...\n",
|
| 123 |
+
" Processed 8100/10510 reviews...\n",
|
| 124 |
+
" Processed 8200/10510 reviews...\n",
|
| 125 |
+
" Processed 8300/10510 reviews...\n",
|
| 126 |
+
" Processed 8400/10510 reviews...\n",
|
| 127 |
+
" Processed 8500/10510 reviews...\n",
|
| 128 |
+
" Processed 8600/10510 reviews...\n",
|
| 129 |
+
" Processed 8700/10510 reviews...\n",
|
| 130 |
+
" Processed 8800/10510 reviews...\n",
|
| 131 |
+
" Processed 8900/10510 reviews...\n",
|
| 132 |
+
" Processed 9000/10510 reviews...\n",
|
| 133 |
+
" Processed 9100/10510 reviews...\n",
|
| 134 |
+
" Processed 9200/10510 reviews...\n",
|
| 135 |
+
" Processed 9300/10510 reviews...\n",
|
| 136 |
+
" Processed 9400/10510 reviews...\n",
|
| 137 |
+
" Processed 9500/10510 reviews...\n",
|
| 138 |
+
" Processed 9600/10510 reviews...\n",
|
| 139 |
+
" Processed 9700/10510 reviews...\n",
|
| 140 |
+
" Processed 9800/10510 reviews...\n",
|
| 141 |
+
" Processed 9900/10510 reviews...\n",
|
| 142 |
+
" Processed 10000/10510 reviews...\n",
|
| 143 |
+
" Processed 10100/10510 reviews...\n",
|
| 144 |
+
" Processed 10200/10510 reviews...\n",
|
| 145 |
+
" Processed 10300/10510 reviews...\n",
|
| 146 |
+
" Processed 10400/10510 reviews...\n",
|
| 147 |
+
" Processed 10500/10510 reviews...\n",
|
| 148 |
+
" Completed processing all 10510 reviews!\n",
|
| 149 |
+
"\n",
|
| 150 |
+
"============================================================\n",
|
| 151 |
+
"SAMPLE ANNOTATION RESULTS (First 10)\n",
|
| 152 |
+
"============================================================\n",
|
| 153 |
+
"\n",
|
| 154 |
+
"[Review 1]\n",
|
| 155 |
+
"Text: at first gumagana cya..pero pagnalowbat cya ndi na ya magamit kahit ilang oras mo cya icharge namamatay agad..poor quality..not for recommended.....\n",
|
| 156 |
+
"General Aspects: Delivery, Price, Product\n",
|
| 157 |
+
"Specific Aspects: DEL#TIME, PRI#VOM, PRO#COND, PRO#FUNC\n",
|
| 158 |
+
"Matched Keywords: {'PRO#COND': ['poor quality'], 'PRO#FUNC': ['gumagana'], 'PRO#GEN': ['pattern_match'], 'DEL#TIME': ['agad'], 'PRI#VOM': ['quality']}\n",
|
| 159 |
+
"----------------------------------------\n",
|
| 160 |
+
"\n",
|
| 161 |
+
"[Review 2]\n",
|
| 162 |
+
"Text: grabi pangalawa ko ng order sa shapee pero puro dismayado ako ang pangit ng tila subrang nipis mabilis ma punit at ang laki ng size niya alam niyo sho...\n",
|
| 163 |
+
"General Aspects: Delivery, Product\n",
|
| 164 |
+
"Specific Aspects: DEL#CORR, PRO#COND, PRO#CORR, PRO#SIZE\n",
|
| 165 |
+
"Matched Keywords: {'PRO#COND': ['item'], 'PRO#CORR': ['item'], 'PRO#SIZE': ['size'], 'PRO#GEN': ['item'], 'DEL#CORR': ['order']}\n",
|
| 166 |
+
"----------------------------------------\n",
|
| 167 |
+
"\n",
|
| 168 |
+
"[Review 3]\n",
|
| 169 |
+
"Text: 2l gray/black order ko. bakit 850ml lang po pinadala nyo. kung di na po available yung product dapat di nyo pinilit yung gusto nyo. kelangan ko po yun...\n",
|
| 170 |
+
"General Aspects: Delivery, Product, Service\n",
|
| 171 |
+
"Specific Aspects: DEL#CORR, PRO#COL, SER#HAND\n",
|
| 172 |
+
"Matched Keywords: {'PRO#COL': ['color'], 'PRO#GEN': ['product'], 'DEL#CORR': ['order'], 'SER#HAND': ['return']}\n",
|
| 173 |
+
"----------------------------------------\n",
|
| 174 |
+
"\n",
|
| 175 |
+
"[Review 4]\n",
|
| 176 |
+
"Text: walang silbing product.. bwesit. di gumagana dalawa pa...\n",
|
| 177 |
+
"General Aspects: Product\n",
|
| 178 |
+
"Specific Aspects: PRO#FUNC\n",
|
| 179 |
+
"Matched Keywords: {'PRO#FUNC': ['gumagana'], 'PRO#GEN': ['product']}\n",
|
| 180 |
+
"----------------------------------------\n",
|
| 181 |
+
"\n",
|
| 182 |
+
"[Review 5]\n",
|
| 183 |
+
"Text: d po maganda naman po yung neck fan, pero po napaka tagal po sya dumating dec,9 po dec, 23 napo sya dumating, chaka po pink po inorder ko bakit po whi...\n",
|
| 184 |
+
"General Aspects: Delivery, Product, Service\n",
|
| 185 |
+
"Specific Aspects: DEL#TIME, PRO#COL, PRO#FUNC, SER#TRU\n",
|
| 186 |
+
"Matched Keywords: {'PRO#COL': ['white'], 'PRO#FUNC': ['fan'], 'PRO#GEN': ['maganda'], 'DEL#TIME': ['pattern_match'], 'SER#TRU': ['seller'], 'SER#GEN': ['seller']}\n",
|
| 187 |
+
"----------------------------------------\n",
|
| 188 |
+
"\n",
|
| 189 |
+
"[Review 6]\n",
|
| 190 |
+
"Text: 0/10 sa effectiveness po nya 0/10 dahil kakalagay ko lang then wala man lang ilang kalahating minuto uminom lang ako wala na agad ung kulay sa lips ko...\n",
|
| 191 |
+
"General Aspects: Delivery, Product, Service\n",
|
| 192 |
+
"Specific Aspects: DEL#TIME, PRO#COL, SER#TRU\n",
|
| 193 |
+
"Matched Keywords: {'PRO#COL': ['kulay'], 'DEL#TIME': ['agad'], 'SER#TRU': ['scam']}\n",
|
| 194 |
+
"----------------------------------------\n",
|
| 195 |
+
"\n",
|
| 196 |
+
"[Review 7]\n",
|
| 197 |
+
"Text: mahina lng ang hangin khit nka maximum na. mabilis ma low batt.. pinapawisan p rin lalo n yung batok...\n",
|
| 198 |
+
"General Aspects: none\n",
|
| 199 |
+
"Specific Aspects: none\n",
|
| 200 |
+
"----------------------------------------\n",
|
| 201 |
+
"\n",
|
| 202 |
+
"[Review 8]\n",
|
| 203 |
+
"Text: maganda dn same sa picture maganda sya ,medyo maliit nga lang sya...\n",
|
| 204 |
+
"General Aspects: Product, Service\n",
|
| 205 |
+
"Specific Aspects: PRO#SIZE, SER#RES\n",
|
| 206 |
+
"Matched Keywords: {'PRO#SIZE': ['maliit'], 'PRO#GEN': ['maganda'], 'SER#RES': ['picture']}\n",
|
| 207 |
+
"----------------------------------------\n",
|
| 208 |
+
"\n",
|
| 209 |
+
"[Review 9]\n",
|
| 210 |
+
"Text: not worth it ang bilis n'ya ma-lowbat and ang tagal n'ya pang i-charge...\n",
|
| 211 |
+
"General Aspects: Price, Product\n",
|
| 212 |
+
"Specific Aspects: PRI#VOM, PRO#FUNC\n",
|
| 213 |
+
"Matched Keywords: {'PRO#FUNC': ['charge'], 'PRI#VOM': ['pattern_match']}\n",
|
| 214 |
+
"----------------------------------------\n",
|
| 215 |
+
"\n",
|
| 216 |
+
"[Review 10]\n",
|
| 217 |
+
"Text: mahirap buksan yung payong. need mo pa itulak. sana na check man lang bago ipadala....\n",
|
| 218 |
+
"General Aspects: Service\n",
|
| 219 |
+
"Specific Aspects: SER#HAND\n",
|
| 220 |
+
"Matched Keywords: {'SER#HAND': ['pattern_match']}\n",
|
| 221 |
+
"----------------------------------------\n",
|
| 222 |
+
"\n",
|
| 223 |
+
"✅ Annotated data saved to: annotated_reviews_rule_based.csv\n",
|
| 224 |
+
"Total reviews processed: 10510\n"
|
| 225 |
+
]
|
| 226 |
+
}
|
| 227 |
+
],
|
| 228 |
+
"source": [
|
| 229 |
+
"import pandas as pd\n",
|
| 230 |
+
"import re\n",
|
| 231 |
+
"import csv\n",
|
| 232 |
+
"from collections import defaultdict, Counter\n",
|
| 233 |
+
"import string\n",
|
| 234 |
+
"import nltk\n",
|
| 235 |
+
"from nltk.tokenize import word_tokenize\n",
|
| 236 |
+
"from nltk.corpus import stopwords\n",
|
| 237 |
+
"import warnings\n",
|
| 238 |
+
"warnings.filterwarnings('ignore')\n",
|
| 239 |
+
"\n",
|
| 240 |
+
"# Download required NLTK data\n",
|
| 241 |
+
"try:\n",
|
| 242 |
+
" nltk.data.find('tokenizers/punkt')\n",
|
| 243 |
+
"except LookupError:\n",
|
| 244 |
+
" nltk.download('punkt')\n",
|
| 245 |
+
" nltk.download('punkt_tab')\n",
|
| 246 |
+
"\n",
|
| 247 |
+
"try:\n",
|
| 248 |
+
" nltk.data.find('corpora/stopwords')\n",
|
| 249 |
+
"except LookupError:\n",
|
| 250 |
+
" nltk.download('stopwords')\n",
|
| 251 |
+
"\n",
|
| 252 |
+
"class RuleBasedKeywordAnnotator:\n",
|
| 253 |
+
" def __init__(self, stopwords_file='stopwords-new.txt'):\n",
|
| 254 |
+
" \"\"\"\n",
|
| 255 |
+
" Initialize the rule-based keyword aspect annotator\n",
|
| 256 |
+
" \"\"\"\n",
|
| 257 |
+
" # Load Filipino stopwords\n",
|
| 258 |
+
" self.filipino_stopwords = self.load_filipino_stopwords(stopwords_file)\n",
|
| 259 |
+
"\n",
|
| 260 |
+
" # English stopwords\n",
|
| 261 |
+
" self.english_stopwords = set(stopwords.words('english'))\n",
|
| 262 |
+
"\n",
|
| 263 |
+
" # Combined stopwords (lowercased)\n",
|
| 264 |
+
" self.all_stopwords = self.filipino_stopwords.union(self.english_stopwords)\n",
|
| 265 |
+
"\n",
|
| 266 |
+
" # Define general aspects based on codebook\n",
|
| 267 |
+
" self.general_aspects = {\n",
|
| 268 |
+
" 'Product': 'PRO',\n",
|
| 269 |
+
" 'Price': 'PRI',\n",
|
| 270 |
+
" 'Delivery': 'DEL',\n",
|
| 271 |
+
" 'Service': 'SER'\n",
|
| 272 |
+
" }\n",
|
| 273 |
+
"\n",
|
| 274 |
+
" # Define specific aspects with improved keywords and patterns\n",
|
| 275 |
+
" self.specific_aspects = {\n",
|
| 276 |
+
" 'PRO#COL': {\n",
|
| 277 |
+
" 'general': 'Product',\n",
|
| 278 |
+
" 'name': 'Color',\n",
|
| 279 |
+
" 'keywords': ['color', 'black', 'white', 'pink', 'blue', 'green', 'wrong color', 'colors', 'kulay', 'faded'],\n",
|
| 280 |
+
" 'patterns': [r'true to color', r'same as photo', r'iba.*kulay', r'mali.*color',\n",
|
| 281 |
+
" r'mas ma(pula|puti|itim)', r'pangit.*shade', r'maganda.*kulay']\n",
|
| 282 |
+
" },\n",
|
| 283 |
+
" 'PRO#COND': {\n",
|
| 284 |
+
" 'general': 'Product',\n",
|
| 285 |
+
" 'name': 'Condition',\n",
|
| 286 |
+
" 'keywords': ['sira', 'damage', 'item', 'damaged', 'items', 'box', 'yupi box', 'sira box', 'basag',\n",
|
| 287 |
+
" 'gasgas', 'leak', 'crack', 'butas butas', 'butas', 'old', 'stock', 'stocks', 'baligtad',\n",
|
| 288 |
+
" 'low quality', 'poor quality'],\n",
|
| 289 |
+
" 'patterns': [r'may gasgas', r'incomplete parts', r'factory defect', r'kulang.*parts',\n",
|
| 290 |
+
" r'may yupi.*box.*pero.*sealed', r'sealed.*pero.*may (gasgas|yupi|dent)']\n",
|
| 291 |
+
" },\n",
|
| 292 |
+
" 'PRO#CORR': {\n",
|
| 293 |
+
" 'general': 'Product',\n",
|
| 294 |
+
" 'name': 'Correctness',\n",
|
| 295 |
+
" 'keywords': ['advertisement', 'false', 'ads', 'fake', 'design', 'tugma', 'meters', 'description', 'item',\n",
|
| 296 |
+
" 'expectation', 'expectation vs reality', 'vs reality', 'reality', 'expectations', 'product description'],\n",
|
| 297 |
+
" 'patterns': [r'sabi.*pero', r'akala ko.*pero', r'advertised as.*pero', r'sa photo.*pero',\n",
|
| 298 |
+
" r'(dual|single)\\s*sim', r'walang.*charger', r'kasama.*charger']\n",
|
| 299 |
+
" },\n",
|
| 300 |
+
" 'PRO#DUR': {\n",
|
| 301 |
+
" 'general': 'Product',\n",
|
| 302 |
+
" 'name': 'Durability',\n",
|
| 303 |
+
" 'keywords': ['week', 'hours', 'minutes', 'one day', 'using', 'llast'],\n",
|
| 304 |
+
" 'patterns': [r'\\d+\\s*(araw|days|linggo|week|buwan|month).*[pa lang|palang].*sira',\n",
|
| 305 |
+
" r'madaling masira', r'hindi tumagal', r'namatay agad', r'sira na agad',\n",
|
| 306 |
+
" r'tatlong araw.*sira', r'after.*week.*sira', r'gumana.*nung una.*pero.*namatay']\n",
|
| 307 |
+
" },\n",
|
| 308 |
+
" 'PRO#EFF': {\n",
|
| 309 |
+
" 'general': 'Product',\n",
|
| 310 |
+
" 'name': 'Effectiveness',\n",
|
| 311 |
+
" 'keywords': ['dikit', 'glue', 'waterproof', 'water proof', 'long lasting', 'magstraight', 'hair removal'],\n",
|
| 312 |
+
" 'patterns': [r'hindi.*effective', r'sobrang effective', r'ang bilis.*effect',\n",
|
| 313 |
+
" r'walang.*effect', r'okay.*pero hindi.*effective']\n",
|
| 314 |
+
" },\n",
|
| 315 |
+
" 'PRO#FUNC': {\n",
|
| 316 |
+
" 'general': 'Product',\n",
|
| 317 |
+
" 'name': 'Functionality',\n",
|
| 318 |
+
" 'keywords': ['gumagana', 'battery', 'rechargeable', 'battery life', 'remote', 'sound', 'sounds', 'mic',\n",
|
| 319 |
+
" 'audio', 'music', 'tunog', 'charger', 'charge', 'charging', 'charging pin', 'working',\n",
|
| 320 |
+
" 'earphones', 'earphone', 'gumagana mic', 'bluetooth', 'disconnect', 'connect',\n",
|
| 321 |
+
" 'defective', 'defect', 'malfunction', 'wire', 'cable', 'cord', 'usb', 'pen', 'stylus',\n",
|
| 322 |
+
" 'low battery', 'fan', 'power button', 'power', 'remote control', 'side fan', 'gumagana left', 'automatic'],\n",
|
| 323 |
+
" 'patterns': [r'hindi gumagana', r'ayaw mag', r'not working', r'defective.*item',\n",
|
| 324 |
+
" r'walang.*tunog', r'hindi.*nag-o.*on', r'power button.*hindi']\n",
|
| 325 |
+
" },\n",
|
| 326 |
+
" 'PRO#MAT': {\n",
|
| 327 |
+
" 'general': 'Product',\n",
|
| 328 |
+
" 'name': 'Material',\n",
|
| 329 |
+
" 'keywords': ['manipis', 'rubber', 'plastic', 'material', 'sandal', 'leather', 'cotton', 'spandex',\n",
|
| 330 |
+
" 'cotton tela', 'fabric', 'texture', 'tint', 'sticky', 'matte', 'tela', 'thick'],\n",
|
| 331 |
+
" 'patterns': [r'good quality.*material', r'cheap.*fabric', r'makapal.*tela',\n",
|
| 332 |
+
" r'manipis.*tela', r'solid.*metal']\n",
|
| 333 |
+
" },\n",
|
| 334 |
+
" 'PRO#SENS': {\n",
|
| 335 |
+
" 'general': 'Product',\n",
|
| 336 |
+
" 'name': 'Sensory',\n",
|
| 337 |
+
" 'keywords': ['food', 'masarap', 'taste', 'foods', 'scent', 'perfume', 'smell', 'amoy', 'fragrance',\n",
|
| 338 |
+
" 'amoy', 'mabango', 'amoy goma', 'amoy cloud', 'alcohol', 'amoy alcohol', 'pabango', 'oil',\n",
|
| 339 |
+
" 'oil based', 'bango'],\n",
|
| 340 |
+
" 'patterns': [r'ang bango', r'may amoy', r'masarap.*lasa', r'satisfying.*sound',\n",
|
| 341 |
+
" r'amoy.*baby', r'mabaho.*amoy']\n",
|
| 342 |
+
" },\n",
|
| 343 |
+
" 'PRO#SIZE': {\n",
|
| 344 |
+
" 'general': 'Product',\n",
|
| 345 |
+
" 'name': 'Size/Measurement',\n",
|
| 346 |
+
" 'keywords': ['size', 'maliit size', 'sizes', 'add size', 'maliit', 'large', 'xl', 'kasya', 'liit',\n",
|
| 347 |
+
" 'strap', 'maiksi', 'inches', 'liters', 'liter'],\n",
|
| 348 |
+
" 'patterns': [r'true to size', r'ang liit.*large', r'parang.*size', r'hindi kasya',\n",
|
| 349 |
+
" r'masyadong.*maliit', r'oversized']\n",
|
| 350 |
+
" },\n",
|
| 351 |
+
" 'PRO#GEN': {\n",
|
| 352 |
+
" 'general': 'Product',\n",
|
| 353 |
+
" 'name': 'General',\n",
|
| 354 |
+
" 'keywords': ['item', 'product', 'maganda', 'good', 'quality', 'ganda', 'ok', 'super ganda', 'love',\n",
|
| 355 |
+
" 'thank', 'thankyou', 'nice', 'goods', 'comfortable'],\n",
|
| 356 |
+
" 'patterns': [r'okay.*quality', r'good product', r'maganda.*product', r'poor quality',\n",
|
| 357 |
+
" r'standard.*product']\n",
|
| 358 |
+
" },\n",
|
| 359 |
+
"\n",
|
| 360 |
+
" # Delivery aspects\n",
|
| 361 |
+
" 'DEL#COND': {\n",
|
| 362 |
+
" 'general': 'Delivery',\n",
|
| 363 |
+
" 'name': 'Condition',\n",
|
| 364 |
+
" 'keywords': ['bubble wrap', 'naka bubble wrap', 'box', 'fragile', 'yupi box', 'plastic', 'packaging', 'sealed', 'parcel'],\n",
|
| 365 |
+
" 'patterns': [r'yupi.*box', r'basang-basa', r'bubble wrap', r'damaged.*shipping',\n",
|
| 366 |
+
" r'dumating.*yupi', r'box.*yupi', r'parcel.*basa']\n",
|
| 367 |
+
" },\n",
|
| 368 |
+
" 'DEL#CORR': {\n",
|
| 369 |
+
" 'general': 'Delivery',\n",
|
| 370 |
+
" 'name': 'Correctness',\n",
|
| 371 |
+
" 'keywords': ['order', 'mali', 'complete orders', 'wrong item', 'pinadala'],\n",
|
| 372 |
+
" 'patterns': [r'ordered.*pero.*received', r'mali.*pinadala', r'iba.*dumating',\n",
|
| 373 |
+
" r'wrong.*item', r'(large|medium|small|XL|XS).*order.*pero.*(large|medium|small|XL|XS).*dumating',\n",
|
| 374 |
+
" r'inorder.*pero.*dumating', r'order ko.*pero.*pinadala']\n",
|
| 375 |
+
" },\n",
|
| 376 |
+
" 'DEL#TIME': {\n",
|
| 377 |
+
" 'general': 'Delivery',\n",
|
| 378 |
+
" 'name': 'Timeliness',\n",
|
| 379 |
+
" 'keywords': ['dumating', 'agad', 'tagal dumating', 'fast delivery', 'fast shipping', 'bilis dumating',\n",
|
| 380 |
+
" 'bilis shipping', 'shipped immediately', 'shipping', 'takes', 'weeks', 'shipped'],\n",
|
| 381 |
+
" 'patterns': [r'late.*delivery', r'tagal.*dumating', r'\\d+\\s*(days|araw|weeks|linggo).*delivery',\n",
|
| 382 |
+
" r'fast delivery', r'tagal.*shipping', r'bilis.*delivery']\n",
|
| 383 |
+
" },\n",
|
| 384 |
+
" 'DEL#GEN': {\n",
|
| 385 |
+
" 'general': 'Delivery',\n",
|
| 386 |
+
" 'name': 'General',\n",
|
| 387 |
+
" 'keywords': ['rider', 'kuya rider', 'shipping', 'parcel', 'ship', 'delivery', 'courier', 'secure'],\n",
|
| 388 |
+
" 'patterns': [r'okay.*delivery', r'smooth.*shipping', r'mait.*nag-deliver',\n",
|
| 389 |
+
" r'delivery.*fine']\n",
|
| 390 |
+
" },\n",
|
| 391 |
+
"\n",
|
| 392 |
+
" # Price aspects\n",
|
| 393 |
+
" 'PRI#AFF': {\n",
|
| 394 |
+
" 'general': 'Price',\n",
|
| 395 |
+
" 'name': 'Affordability',\n",
|
| 396 |
+
" 'keywords': ['affordable', 'mura'],\n",
|
| 397 |
+
" 'patterns': [r'sobrang mura', r'affordable.*price', r'budget.*friendly',\n",
|
| 398 |
+
" r'mura lang', r'nakakuha.*discount']\n",
|
| 399 |
+
" },\n",
|
| 400 |
+
" 'PRI#VOM': {\n",
|
| 401 |
+
" 'general': 'Price',\n",
|
| 402 |
+
" 'name': 'Value for Money',\n",
|
| 403 |
+
" 'keywords': ['price', 'worth', 'worth price', 'good price', 'quality', 'sulit', 'sayang pera', 'waste money'],\n",
|
| 404 |
+
" 'patterns': [r'sulit.*price', r'worth.*money', r'hindi worth', r'sayang.*pera',\n",
|
| 405 |
+
" r'not worth it', r'worth it', r'hindi.*sulit', r'sobrang.*sulit']\n",
|
| 406 |
+
" },\n",
|
| 407 |
+
" 'PRI#GEN': {\n",
|
| 408 |
+
" 'general': 'Price',\n",
|
| 409 |
+
" 'name': 'General',\n",
|
| 410 |
+
" 'keywords': ['price', 'sakto price', 'okay price', 'ok price'],\n",
|
| 411 |
+
" 'patterns': [r'okay.*presyo', r'standard.*price', r'sakto.*presyo']\n",
|
| 412 |
+
" },\n",
|
| 413 |
+
"\n",
|
| 414 |
+
" # Service aspects\n",
|
| 415 |
+
" 'SER#HAND': {\n",
|
| 416 |
+
" 'general': 'Service',\n",
|
| 417 |
+
" 'name': 'Handling',\n",
|
| 418 |
+
" 'keywords': ['refund', 'return', 'return refund', 'check', 'double check'],\n",
|
| 419 |
+
" 'patterns': [r'sana.*check', r'hindi.*check', r'double.*check', r'maingat.*pack',\n",
|
| 420 |
+
" r'need.*check.*bago.*padala']\n",
|
| 421 |
+
" },\n",
|
| 422 |
+
" 'SER#RES': {\n",
|
| 423 |
+
" 'general': 'Service',\n",
|
| 424 |
+
" 'name': 'Responsiveness',\n",
|
| 425 |
+
" 'keywords': ['picture', 'pictures', 'nag send', 'message', 'response', 'reply', 'chat', 'seller responsive'],\n",
|
| 426 |
+
" 'patterns': [r'tagal.*sagot', r'walang.*reply', r'responsive.*seller', r'seen.*zone',\n",
|
| 427 |
+
" r'hindi.*sumasagot', r'mabait.*kausap']\n",
|
| 428 |
+
" },\n",
|
| 429 |
+
" 'SER#TRU': {\n",
|
| 430 |
+
" 'general': 'Service',\n",
|
| 431 |
+
" 'name': 'Trustworthiness',\n",
|
| 432 |
+
" 'keywords': ['seller', 'scammer', 'scam'],\n",
|
| 433 |
+
" 'patterns': [r'scam.*shop', r'fake.*product', r'hindi legit', r'nanloko',\n",
|
| 434 |
+
" r'pinapalitan.*order']\n",
|
| 435 |
+
" },\n",
|
| 436 |
+
" 'SER#GEN': {\n",
|
| 437 |
+
" 'general': 'Service',\n",
|
| 438 |
+
" 'name': 'General',\n",
|
| 439 |
+
" 'keywords': ['seller', 'service', 'staff', 'shop', 'thank seller', 'order received', 'recommend shop',\n",
|
| 440 |
+
" 'poor service', 'rude staff', 'irresponsible'],\n",
|
| 441 |
+
" 'patterns': [r'good.*service', r'bad.*service', r'okay.*service', r'seller.*helpful']\n",
|
| 442 |
+
" }\n",
|
| 443 |
+
" }\n",
|
| 444 |
+
"\n",
|
| 445 |
+
" def load_filipino_stopwords(self, filepath):\n",
|
| 446 |
+
" \"\"\"Load Filipino stopwords from file\"\"\"\n",
|
| 447 |
+
" with open(filepath, 'r', encoding='utf-8') as f:\n",
|
| 448 |
+
" return set([line.strip().lower() for line in f if line.strip()])\n",
|
| 449 |
+
"\n",
|
| 450 |
+
" def preprocess_text(self, text):\n",
|
| 451 |
+
" \"\"\"Preprocess Taglish text while preserving context\"\"\"\n",
|
| 452 |
+
" if pd.isna(text):\n",
|
| 453 |
+
" return \"\"\n",
|
| 454 |
+
"\n",
|
| 455 |
+
" # Convert to string and lowercase\n",
|
| 456 |
+
" text = str(text).lower()\n",
|
| 457 |
+
"\n",
|
| 458 |
+
" # Remove URLs\n",
|
| 459 |
+
" text = re.sub(r'http\\S+|www.\\S+', '', text)\n",
|
| 460 |
+
"\n",
|
| 461 |
+
" # Remove email addresses\n",
|
| 462 |
+
" text = re.sub(r'\\S+@\\S+', '', text)\n",
|
| 463 |
+
"\n",
|
| 464 |
+
" # Keep the text mostly intact for pattern matching\n",
|
| 465 |
+
" # Just normalize multiple spaces\n",
|
| 466 |
+
" text = re.sub(r'\\s+', ' ', text)\n",
|
| 467 |
+
"\n",
|
| 468 |
+
" return text.strip()\n",
|
| 469 |
+
"\n",
|
| 470 |
+
" def check_patterns(self, text, patterns):\n",
|
| 471 |
+
" \"\"\"Check if any pattern matches in the text\"\"\"\n",
|
| 472 |
+
" for pattern in patterns:\n",
|
| 473 |
+
" if re.search(pattern, text, re.IGNORECASE):\n",
|
| 474 |
+
" return True\n",
|
| 475 |
+
" return False\n",
|
| 476 |
+
"\n",
|
| 477 |
+
" def check_keywords(self, text, keywords):\n",
|
| 478 |
+
" \"\"\"Improved keyword checking with better context awareness\"\"\"\n",
|
| 479 |
+
" text_lower = text.lower()\n",
|
| 480 |
+
" for keyword in keywords:\n",
|
| 481 |
+
" # Use word boundaries for more accurate matching\n",
|
| 482 |
+
" # But be flexible for compound words\n",
|
| 483 |
+
" if len(keyword.split()) > 1:\n",
|
| 484 |
+
" # Multi-word keywords\n",
|
| 485 |
+
" if keyword.lower() in text_lower:\n",
|
| 486 |
+
" return True\n",
|
| 487 |
+
" else:\n",
|
| 488 |
+
" # Single word keywords - use word boundaries\n",
|
| 489 |
+
" pattern = r'\\b' + re.escape(keyword.lower()) + r'\\b'\n",
|
| 490 |
+
" if re.search(pattern, text_lower):\n",
|
| 491 |
+
" return True\n",
|
| 492 |
+
" return False\n",
|
| 493 |
+
"\n",
|
| 494 |
+
def apply_context_rules(self, text, initial_aspects):
    """Refine an initial aspect-tag set using the codebook's context rules.

    Applies ten disambiguation rules (e.g. "ordered X but received Y" means
    a wrong delivery rather than a color/size complaint) and finally prunes
    each generic *#GEN tag that is shadowed by a more specific sibling tag.
    Returns a new set; ``initial_aspects`` is never mutated.
    """
    lowered = text.lower()
    aspects = initial_aspects.copy()

    # Rule 1: color words inside an "ordered ... pero ... received" context
    # indicate the wrong item arrived, not a color complaint.
    if re.search(r'(order|inorder).*pero.*(dumating|natanggap|received)', lowered):
        color_seen = any(c in lowered for c in ('gray', 'blue', 'red', 'black', 'white', 'pink'))
        if 'PRO#COL' in aspects and color_seen:
            aspects.discard('PRO#COL')
            aspects.add('DEL#CORR')

    # Rule 2: "worked at first, then died" is durability, not functionality.
    if re.search(r'(gumana|working).*(una|first|initially).*pero.*(namatay|sira|broken|stopped)', lowered):
        if 'PRO#FUNC' in aspects:
            aspects.discard('PRO#FUNC')
            aspects.add('PRO#DUR')

    # Rule 3: ordered one size but received another -> wrong delivery.
    size_words = r'(large|medium|small|xs|xl|xxl)'
    size_mismatch = (r'(ordered?|inorder).*' + size_words +
                     r'.*pero.*(received?|natanggap|dumating).*' + size_words)
    if re.search(size_mismatch, lowered) and 'PRO#SIZE' in aspects:
        aspects.discard('PRO#SIZE')
        aspects.add('DEL#CORR')

    # Rule 4: "not worth it" / "sayang" always implies value-for-money.
    if re.search(r'(not|hindi).*worth', lowered) or 'sayang' in lowered:
        aspects.add('PRI#VOM')
        # Drop a delivery-time tag that has no speed/slowness cue backing it.
        if 'DEL#TIME' in aspects and 'bilis' not in lowered and 'tagal' not in lowered:
            aspects.discard('DEL#TIME')

    # Rule 5: cheap ("mura") but not worth it touches both price aspects.
    if 'mura' in lowered and ('hindi worth' in lowered or 'not worth' in lowered):
        aspects.update(('PRI#AFF', 'PRI#VOM'))

    # Rule 6: broke after only N days/weeks/months -> durability supersedes
    # the generic functionality tag.
    elapsed_break = (r'(\d+|tatlong|dalawang|isang)\s*'
                     r'(araw|days|linggo|week|buwan|month).*pa\s*lang.*sira')
    if re.search(elapsed_break, lowered):
        aspects.add('PRO#DUR')
        aspects.discard('PRO#FUNC')

    # Rule 7: wrong model/variant was shipped.
    if re.search(r'(mali|wrong|iba).*(model|variant|item).*pinadala', lowered):
        aspects.add('DEL#CORR')

    # Rule 8: dented ("yupi") / soaked packaging is a delivery-condition
    # issue, not a product-condition one (unless the item was sealed).
    if 'yupi' in lowered or 'basang-basa' in lowered:
        if any(w in lowered for w in ('box', 'parcel', 'package')):
            aspects.add('DEL#COND')
            if 'PRO#COND' in aspects and 'sealed' not in lowered:
                aspects.discard('PRO#COND')

    # Rule 9: "bilis" (fast) can describe delivery speed or product effect;
    # disambiguate by the surrounding vocabulary.
    if 'bilis' in lowered:
        if any(w in lowered for w in ('delivery', 'dumating', 'shipping', 'padala')):
            aspects.add('DEL#TIME')
            aspects.discard('PRO#EFF')  # remove if incorrectly added
        elif any(w in lowered for w in ('effect', 'epekto', 'resulta', 'gumana')):
            aspects.add('PRO#EFF')
            aspects.discard('DEL#TIME')  # remove if incorrectly added

    # Rule 10: once more than one tag survives, drop each generic tag that
    # is shadowed by a more specific tag of the same category.
    if len(aspects) > 1:
        shadowed_by = {
            'PRO#GEN': {'PRO#COL', 'PRO#COND', 'PRO#CORR', 'PRO#DUR',
                        'PRO#EFF', 'PRO#FUNC', 'PRO#MAT', 'PRO#SENS', 'PRO#SIZE'},
            'DEL#GEN': {'DEL#COND', 'DEL#CORR', 'DEL#TIME'},
            'PRI#GEN': {'PRI#AFF', 'PRI#VOM'},
            'SER#GEN': {'SER#HAND', 'SER#RES', 'SER#TRU'},
        }
        for generic, specifics in shadowed_by.items():
            if generic in aspects and aspects & specifics:
                aspects.discard(generic)

    return aspects
|
| 580 |
+
def annotate_aspects(self, text):
    """
    Annotate aspects in the given text following the codebook rules.

    Checks each specific aspect's regex patterns first (higher priority),
    then falls back to keyword matching, and finally runs the context-based
    refinement rules over the collected tags.

    Returns: (sorted list of aspect tags, dict of match details per tag)
    """
    # NaN / blank reviews carry no aspects at all.
    if pd.isna(text) or text.strip() == "":
        return [], {}

    cleaned = self.preprocess_text(text)
    found_tags = set()
    match_details = defaultdict(list)

    for tag, info in self.specific_aspects.items():
        hit = False

        # Regex patterns take precedence over plain keyword matches.
        if 'patterns' in info and self.check_patterns(cleaned, info['patterns']):
            hit = True
            match_details[tag].append('pattern_match')

        if not hit and self.check_keywords(cleaned, info['keywords']):
            hit = True
            # Record only the first keyword that actually matched: multi-word
            # keywords use substring search, single words use word boundaries.
            for kw in info['keywords']:
                kw_lower = kw.lower()
                if len(kw.split()) > 1:
                    kw_hit = kw_lower in cleaned
                else:
                    kw_hit = re.search(r'\b' + re.escape(kw_lower) + r'\b', cleaned) is not None
                if kw_hit:
                    match_details[tag].append(kw)
                    break

        if hit:
            found_tags.add(tag)

    # Context rules may add, swap, or drop tags based on surrounding words.
    found_tags = self.apply_context_rules(cleaned, found_tags)

    # Sorted list keeps the output deterministic across runs.
    return sorted(found_tags), dict(match_details)
|
| 628 |
+
def get_general_aspects(self, specific_aspects):
    """Map specific aspect tags to their sorted, de-duplicated general categories.

    Tags not present in ``self.specific_aspects`` are silently ignored.
    """
    catalog = self.specific_aspects
    categories = {catalog[tag]['general'] for tag in specific_aspects if tag in catalog}
    return sorted(categories)
|
| 636 |
+
def process_dataset(self, df, text_column='review'):
    """
    Process entire dataset and annotate aspects according to codebook.

    Annotates every row of ``df`` in original order and returns a new
    DataFrame with review_id (1-based), the review text, the detected
    general and specific aspects, the matched keywords, and all remaining
    original columns (sentiment columns are excluded from the output).
    """
    total_rows = len(df)
    print(f"Processing all {total_rows} reviews...")

    # Hoisted out of the loop: the pass-through columns are the same for
    # every row (the original recomputed this filter per review).
    passthrough = [col for col in df.columns
                   if col != text_column and col.lower() not in ['sentiment', 'sentiments']]

    results = []

    # iterrows() with a 1-based position replaces the range(len(df)) /
    # df.iloc[idx] anti-pattern; position (not index label) keeps review_id
    # correct even for a non-default index.
    for pos, (_, row) in enumerate(df.iterrows(), start=1):
        if pos % 100 == 0:
            print(f"  Processed {pos}/{total_rows} reviews...")

        review_text = row[text_column]
        specific_aspects, aspect_details = self.annotate_aspects(review_text)
        general_aspects = self.get_general_aspects(specific_aspects)

        result = {
            'review_id': pos,  # Start from 1
            'review': review_text,
            'general_aspects': ', '.join(general_aspects) if general_aspects else 'none',
            'specific_aspects': ', '.join(specific_aspects) if specific_aspects else 'none',
            'aspect_keywords': str(aspect_details) if aspect_details else '{}'
        }

        # Carry over the original columns (text and sentiment excluded).
        for col in passthrough:
            result[col] = row[col]

        results.append(result)

    print(f"  Completed processing all {total_rows} reviews!")
    return pd.DataFrame(results)
|
| 673 |
+
# Main execution
def main():
    """Entry point: read the review CSV, annotate every review, save results."""
    print("=" * 60)
    print("RULE-BASED KEYWORD ASPECT ANNOTATOR")
    print("Automated aspect annotation using keywords and context rules")
    print("=" * 60)

    # Initialize annotator
    annotator = RuleBasedKeywordAnnotator('stopwords-new.txt')

    # Read the dataset
    print("\nReading dataset...")
    df = pd.read_csv('SentiTaglish_ProductsAndServices.csv', encoding='utf-8')
    print(f"Successfully read dataset with {len(df)} rows")

    # Pick the first column whose name suggests review text; fall back to
    # the very first column when nothing matches.
    review_column = next(
        (col for col in df.columns
         if 'review' in col.lower() or 'text' in col.lower() or 'comment' in col.lower()),
        df.columns[0],
    )

    print(f"Using column '{review_column}' for reviews")
    print(f"Dataset shape: {df.shape}")

    # Drop sentiment/sentiments column if exists
    columns_to_drop = [col for col in df.columns if col.lower() in ['sentiment', 'sentiments']]
    if columns_to_drop:
        df = df.drop(columns=columns_to_drop)
        print(f"Dropped columns: {columns_to_drop}")

    # Annotate every review in the dataset
    annotated_df = annotator.process_dataset(df, text_column=review_column)

    # Show the first few annotations as a sanity check
    print("\n" + "=" * 60)
    print("SAMPLE ANNOTATION RESULTS (First 10)")
    print("=" * 60)

    for _, entry in annotated_df.head(10).iterrows():
        print(f"\n[Review {entry['review_id']}]")
        print(f"Text: {entry['review'][:150]}...")
        print(f"General Aspects: {entry['general_aspects']}")
        print(f"Specific Aspects: {entry['specific_aspects']}")
        if entry['aspect_keywords'] != '{}':
            print(f"Matched Keywords: {entry['aspect_keywords']}")
        print("-" * 40)

    # Save to CSV
    output_filename = 'annotated_reviews_rule_based.csv'
    annotated_df.to_csv(output_filename, index=False, encoding='utf-8')
    print(f"\n✅ Annotated data saved to: {output_filename}")
    print(f"Total reviews processed: {len(annotated_df)}")

if __name__ == "__main__":
    main()
|
| 733 |
+
]
|
| 734 |
+
}
|
| 735 |
+
]
|
| 736 |
+
}
|