Upload Rule-Based Keyword Annotator.ipynb

#15
by robzjgman - opened
5 _ Rule-Based/Rule-Based Keyword Annotator.ipynb ADDED
@@ -0,0 +1,736 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "metadata": {
21
+ "id": "h4-1lYVz9trX",
22
+ "colab": {
23
+ "base_uri": "https://localhost:8080/"
24
+ },
25
+ "outputId": "f3755b19-7a47-4327-e39c-7da6fd5eeab8"
26
+ },
27
+ "outputs": [
28
+ {
29
+ "output_type": "stream",
30
+ "name": "stdout",
31
+ "text": [
32
+ "============================================================\n",
33
+ "RULE-BASED KEYWORD ASPECT ANNOTATOR\n",
34
+ "Automated aspect annotation using keywords and context rules\n",
35
+ "============================================================\n",
36
+ "\n",
37
+ "Reading dataset...\n",
38
+ "Successfully read dataset with 10510 rows\n",
39
+ "Using column 'review' for reviews\n",
40
+ "Dataset shape: (10510, 2)\n",
41
+ "Dropped columns: ['sentiment']\n",
42
+ "Processing all 10510 reviews...\n",
43
+ " Processed 100/10510 reviews...\n",
44
+ " Processed 200/10510 reviews...\n",
45
+ " Processed 300/10510 reviews...\n",
46
+ " Processed 400/10510 reviews...\n",
47
+ " Processed 500/10510 reviews...\n",
48
+ " Processed 600/10510 reviews...\n",
49
+ " Processed 700/10510 reviews...\n",
50
+ " Processed 800/10510 reviews...\n",
51
+ " Processed 900/10510 reviews...\n",
52
+ " Processed 1000/10510 reviews...\n",
53
+ " Processed 1100/10510 reviews...\n",
54
+ " Processed 1200/10510 reviews...\n",
55
+ " Processed 1300/10510 reviews...\n",
56
+ " Processed 1400/10510 reviews...\n",
57
+ " Processed 1500/10510 reviews...\n",
58
+ " Processed 1600/10510 reviews...\n",
59
+ " Processed 1700/10510 reviews...\n",
60
+ " Processed 1800/10510 reviews...\n",
61
+ " Processed 1900/10510 reviews...\n",
62
+ " Processed 2000/10510 reviews...\n",
63
+ " Processed 2100/10510 reviews...\n",
64
+ " Processed 2200/10510 reviews...\n",
65
+ " Processed 2300/10510 reviews...\n",
66
+ " Processed 2400/10510 reviews...\n",
67
+ " Processed 2500/10510 reviews...\n",
68
+ " Processed 2600/10510 reviews...\n",
69
+ " Processed 2700/10510 reviews...\n",
70
+ " Processed 2800/10510 reviews...\n",
71
+ " Processed 2900/10510 reviews...\n",
72
+ " Processed 3000/10510 reviews...\n",
73
+ " Processed 3100/10510 reviews...\n",
74
+ " Processed 3200/10510 reviews...\n",
75
+ " Processed 3300/10510 reviews...\n",
76
+ " Processed 3400/10510 reviews...\n",
77
+ " Processed 3500/10510 reviews...\n",
78
+ " Processed 3600/10510 reviews...\n",
79
+ " Processed 3700/10510 reviews...\n",
80
+ " Processed 3800/10510 reviews...\n",
81
+ " Processed 3900/10510 reviews...\n",
82
+ " Processed 4000/10510 reviews...\n",
83
+ " Processed 4100/10510 reviews...\n",
84
+ " Processed 4200/10510 reviews...\n",
85
+ " Processed 4300/10510 reviews...\n",
86
+ " Processed 4400/10510 reviews...\n",
87
+ " Processed 4500/10510 reviews...\n",
88
+ " Processed 4600/10510 reviews...\n",
89
+ " Processed 4700/10510 reviews...\n",
90
+ " Processed 4800/10510 reviews...\n",
91
+ " Processed 4900/10510 reviews...\n",
92
+ " Processed 5000/10510 reviews...\n",
93
+ " Processed 5100/10510 reviews...\n",
94
+ " Processed 5200/10510 reviews...\n",
95
+ " Processed 5300/10510 reviews...\n",
96
+ " Processed 5400/10510 reviews...\n",
97
+ " Processed 5500/10510 reviews...\n",
98
+ " Processed 5600/10510 reviews...\n",
99
+ " Processed 5700/10510 reviews...\n",
100
+ " Processed 5800/10510 reviews...\n",
101
+ " Processed 5900/10510 reviews...\n",
102
+ " Processed 6000/10510 reviews...\n",
103
+ " Processed 6100/10510 reviews...\n",
104
+ " Processed 6200/10510 reviews...\n",
105
+ " Processed 6300/10510 reviews...\n",
106
+ " Processed 6400/10510 reviews...\n",
107
+ " Processed 6500/10510 reviews...\n",
108
+ " Processed 6600/10510 reviews...\n",
109
+ " Processed 6700/10510 reviews...\n",
110
+ " Processed 6800/10510 reviews...\n",
111
+ " Processed 6900/10510 reviews...\n",
112
+ " Processed 7000/10510 reviews...\n",
113
+ " Processed 7100/10510 reviews...\n",
114
+ " Processed 7200/10510 reviews...\n",
115
+ " Processed 7300/10510 reviews...\n",
116
+ " Processed 7400/10510 reviews...\n",
117
+ " Processed 7500/10510 reviews...\n",
118
+ " Processed 7600/10510 reviews...\n",
119
+ " Processed 7700/10510 reviews...\n",
120
+ " Processed 7800/10510 reviews...\n",
121
+ " Processed 7900/10510 reviews...\n",
122
+ " Processed 8000/10510 reviews...\n",
123
+ " Processed 8100/10510 reviews...\n",
124
+ " Processed 8200/10510 reviews...\n",
125
+ " Processed 8300/10510 reviews...\n",
126
+ " Processed 8400/10510 reviews...\n",
127
+ " Processed 8500/10510 reviews...\n",
128
+ " Processed 8600/10510 reviews...\n",
129
+ " Processed 8700/10510 reviews...\n",
130
+ " Processed 8800/10510 reviews...\n",
131
+ " Processed 8900/10510 reviews...\n",
132
+ " Processed 9000/10510 reviews...\n",
133
+ " Processed 9100/10510 reviews...\n",
134
+ " Processed 9200/10510 reviews...\n",
135
+ " Processed 9300/10510 reviews...\n",
136
+ " Processed 9400/10510 reviews...\n",
137
+ " Processed 9500/10510 reviews...\n",
138
+ " Processed 9600/10510 reviews...\n",
139
+ " Processed 9700/10510 reviews...\n",
140
+ " Processed 9800/10510 reviews...\n",
141
+ " Processed 9900/10510 reviews...\n",
142
+ " Processed 10000/10510 reviews...\n",
143
+ " Processed 10100/10510 reviews...\n",
144
+ " Processed 10200/10510 reviews...\n",
145
+ " Processed 10300/10510 reviews...\n",
146
+ " Processed 10400/10510 reviews...\n",
147
+ " Processed 10500/10510 reviews...\n",
148
+ " Completed processing all 10510 reviews!\n",
149
+ "\n",
150
+ "============================================================\n",
151
+ "SAMPLE ANNOTATION RESULTS (First 10)\n",
152
+ "============================================================\n",
153
+ "\n",
154
+ "[Review 1]\n",
155
+ "Text: at first gumagana cya..pero pagnalowbat cya ndi na ya magamit kahit ilang oras mo cya icharge namamatay agad..poor quality..not for recommended.....\n",
156
+ "General Aspects: Delivery, Price, Product\n",
157
+ "Specific Aspects: DEL#TIME, PRI#VOM, PRO#COND, PRO#FUNC\n",
158
+ "Matched Keywords: {'PRO#COND': ['poor quality'], 'PRO#FUNC': ['gumagana'], 'PRO#GEN': ['pattern_match'], 'DEL#TIME': ['agad'], 'PRI#VOM': ['quality']}\n",
159
+ "----------------------------------------\n",
160
+ "\n",
161
+ "[Review 2]\n",
162
+ "Text: grabi pangalawa ko ng order sa shapee pero puro dismayado ako ang pangit ng tila subrang nipis mabilis ma punit at ang laki ng size niya alam niyo sho...\n",
163
+ "General Aspects: Delivery, Product\n",
164
+ "Specific Aspects: DEL#CORR, PRO#COND, PRO#CORR, PRO#SIZE\n",
165
+ "Matched Keywords: {'PRO#COND': ['item'], 'PRO#CORR': ['item'], 'PRO#SIZE': ['size'], 'PRO#GEN': ['item'], 'DEL#CORR': ['order']}\n",
166
+ "----------------------------------------\n",
167
+ "\n",
168
+ "[Review 3]\n",
169
+ "Text: 2l gray/black order ko. bakit 850ml lang po pinadala nyo. kung di na po available yung product dapat di nyo pinilit yung gusto nyo. kelangan ko po yun...\n",
170
+ "General Aspects: Delivery, Product, Service\n",
171
+ "Specific Aspects: DEL#CORR, PRO#COL, SER#HAND\n",
172
+ "Matched Keywords: {'PRO#COL': ['color'], 'PRO#GEN': ['product'], 'DEL#CORR': ['order'], 'SER#HAND': ['return']}\n",
173
+ "----------------------------------------\n",
174
+ "\n",
175
+ "[Review 4]\n",
176
+ "Text: walang silbing product.. bwesit. di gumagana dalawa pa...\n",
177
+ "General Aspects: Product\n",
178
+ "Specific Aspects: PRO#FUNC\n",
179
+ "Matched Keywords: {'PRO#FUNC': ['gumagana'], 'PRO#GEN': ['product']}\n",
180
+ "----------------------------------------\n",
181
+ "\n",
182
+ "[Review 5]\n",
183
+ "Text: d po maganda naman po yung neck fan, pero po napaka tagal po sya dumating dec,9 po dec, 23 napo sya dumating, chaka po pink po inorder ko bakit po whi...\n",
184
+ "General Aspects: Delivery, Product, Service\n",
185
+ "Specific Aspects: DEL#TIME, PRO#COL, PRO#FUNC, SER#TRU\n",
186
+ "Matched Keywords: {'PRO#COL': ['white'], 'PRO#FUNC': ['fan'], 'PRO#GEN': ['maganda'], 'DEL#TIME': ['pattern_match'], 'SER#TRU': ['seller'], 'SER#GEN': ['seller']}\n",
187
+ "----------------------------------------\n",
188
+ "\n",
189
+ "[Review 6]\n",
190
+ "Text: 0/10 sa effectiveness po nya 0/10 dahil kakalagay ko lang then wala man lang ilang kalahating minuto uminom lang ako wala na agad ung kulay sa lips ko...\n",
191
+ "General Aspects: Delivery, Product, Service\n",
192
+ "Specific Aspects: DEL#TIME, PRO#COL, SER#TRU\n",
193
+ "Matched Keywords: {'PRO#COL': ['kulay'], 'DEL#TIME': ['agad'], 'SER#TRU': ['scam']}\n",
194
+ "----------------------------------------\n",
195
+ "\n",
196
+ "[Review 7]\n",
197
+ "Text: mahina lng ang hangin khit nka maximum na. mabilis ma low batt.. pinapawisan p rin lalo n yung batok...\n",
198
+ "General Aspects: none\n",
199
+ "Specific Aspects: none\n",
200
+ "----------------------------------------\n",
201
+ "\n",
202
+ "[Review 8]\n",
203
+ "Text: maganda dn same sa picture maganda sya ,medyo maliit nga lang sya...\n",
204
+ "General Aspects: Product, Service\n",
205
+ "Specific Aspects: PRO#SIZE, SER#RES\n",
206
+ "Matched Keywords: {'PRO#SIZE': ['maliit'], 'PRO#GEN': ['maganda'], 'SER#RES': ['picture']}\n",
207
+ "----------------------------------------\n",
208
+ "\n",
209
+ "[Review 9]\n",
210
+ "Text: not worth it ang bilis n'ya ma-lowbat and ang tagal n'ya pang i-charge...\n",
211
+ "General Aspects: Price, Product\n",
212
+ "Specific Aspects: PRI#VOM, PRO#FUNC\n",
213
+ "Matched Keywords: {'PRO#FUNC': ['charge'], 'PRI#VOM': ['pattern_match']}\n",
214
+ "----------------------------------------\n",
215
+ "\n",
216
+ "[Review 10]\n",
217
+ "Text: mahirap buksan yung payong. need mo pa itulak. sana na check man lang bago ipadala....\n",
218
+ "General Aspects: Service\n",
219
+ "Specific Aspects: SER#HAND\n",
220
+ "Matched Keywords: {'SER#HAND': ['pattern_match']}\n",
221
+ "----------------------------------------\n",
222
+ "\n",
223
+ "✅ Annotated data saved to: annotated_reviews_rule_based.csv\n",
224
+ "Total reviews processed: 10510\n"
225
+ ]
226
+ }
227
+ ],
228
+ "source": [
229
+ "import pandas as pd\n",
230
+ "import re\n",
231
+ "import csv\n",
232
+ "from collections import defaultdict, Counter\n",
233
+ "import string\n",
234
+ "import nltk\n",
235
+ "from nltk.tokenize import word_tokenize\n",
236
+ "from nltk.corpus import stopwords\n",
237
+ "import warnings\n",
238
+ "warnings.filterwarnings('ignore')\n",
239
+ "\n",
240
+ "# Download required NLTK data\n",
241
+ "try:\n",
242
+ " nltk.data.find('tokenizers/punkt')\n",
243
+ "except LookupError:\n",
244
+ " nltk.download('punkt')\n",
245
+ " nltk.download('punkt_tab')\n",
246
+ "\n",
247
+ "try:\n",
248
+ " nltk.data.find('corpora/stopwords')\n",
249
+ "except LookupError:\n",
250
+ " nltk.download('stopwords')\n",
251
+ "\n",
252
+ "class RuleBasedKeywordAnnotator:\n",
253
+ " def __init__(self, stopwords_file='stopwords-new.txt'):\n",
254
+ " \"\"\"\n",
255
+ " Initialize the rule-based keyword aspect annotator\n",
256
+ " \"\"\"\n",
257
+ " # Load Filipino stopwords\n",
258
+ " self.filipino_stopwords = self.load_filipino_stopwords(stopwords_file)\n",
259
+ "\n",
260
+ " # English stopwords\n",
261
+ " self.english_stopwords = set(stopwords.words('english'))\n",
262
+ "\n",
263
+ " # Combined stopwords (lowercased)\n",
264
+ " self.all_stopwords = self.filipino_stopwords.union(self.english_stopwords)\n",
265
+ "\n",
266
+ " # Define general aspects based on codebook\n",
267
+ " self.general_aspects = {\n",
268
+ " 'Product': 'PRO',\n",
269
+ " 'Price': 'PRI',\n",
270
+ " 'Delivery': 'DEL',\n",
271
+ " 'Service': 'SER'\n",
272
+ " }\n",
273
+ "\n",
274
+ " # Define specific aspects with improved keywords and patterns\n",
275
+ " self.specific_aspects = {\n",
276
+ " 'PRO#COL': {\n",
277
+ " 'general': 'Product',\n",
278
+ " 'name': 'Color',\n",
279
+ " 'keywords': ['color', 'black', 'white', 'pink', 'blue', 'green', 'wrong color', 'colors', 'kulay', 'faded'],\n",
280
+ " 'patterns': [r'true to color', r'same as photo', r'iba.*kulay', r'mali.*color',\n",
281
+ " r'mas ma(pula|puti|itim)', r'pangit.*shade', r'maganda.*kulay']\n",
282
+ " },\n",
283
+ " 'PRO#COND': {\n",
284
+ " 'general': 'Product',\n",
285
+ " 'name': 'Condition',\n",
286
+ " 'keywords': ['sira', 'damage', 'item', 'damaged', 'items', 'box', 'yupi box', 'sira box', 'basag',\n",
287
+ " 'gasgas', 'leak', 'crack', 'butas butas', 'butas', 'old', 'stock', 'stocks', 'baligtad',\n",
288
+ " 'low quality', 'poor quality'],\n",
289
+ " 'patterns': [r'may gasgas', r'incomplete parts', r'factory defect', r'kulang.*parts',\n",
290
+ " r'may yupi.*box.*pero.*sealed', r'sealed.*pero.*may (gasgas|yupi|dent)']\n",
291
+ " },\n",
292
+ " 'PRO#CORR': {\n",
293
+ " 'general': 'Product',\n",
294
+ " 'name': 'Correctness',\n",
295
+ " 'keywords': ['advertisement', 'false', 'ads', 'fake', 'design', 'tugma', 'meters', 'description', 'item',\n",
296
+ " 'expectation', 'expectation vs reality', 'vs reality', 'reality', 'expectations', 'product description'],\n",
297
+ " 'patterns': [r'sabi.*pero', r'akala ko.*pero', r'advertised as.*pero', r'sa photo.*pero',\n",
298
+ " r'(dual|single)\\s*sim', r'walang.*charger', r'kasama.*charger']\n",
299
+ " },\n",
300
+ " 'PRO#DUR': {\n",
301
+ " 'general': 'Product',\n",
302
+ " 'name': 'Durability',\n",
303
+ " 'keywords': ['week', 'hours', 'minutes', 'one day', 'using', 'llast'],\n",
304
+ " 'patterns': [r'\\d+\\s*(araw|days|linggo|week|buwan|month).*[pa lang|palang].*sira',\n",
305
+ " r'madaling masira', r'hindi tumagal', r'namatay agad', r'sira na agad',\n",
306
+ " r'tatlong araw.*sira', r'after.*week.*sira', r'gumana.*nung una.*pero.*namatay']\n",
307
+ " },\n",
308
+ " 'PRO#EFF': {\n",
309
+ " 'general': 'Product',\n",
310
+ " 'name': 'Effectiveness',\n",
311
+ " 'keywords': ['dikit', 'glue', 'waterproof', 'water proof', 'long lasting', 'magstraight', 'hair removal'],\n",
312
+ " 'patterns': [r'hindi.*effective', r'sobrang effective', r'ang bilis.*effect',\n",
313
+ " r'walang.*effect', r'okay.*pero hindi.*effective']\n",
314
+ " },\n",
315
+ " 'PRO#FUNC': {\n",
316
+ " 'general': 'Product',\n",
317
+ " 'name': 'Functionality',\n",
318
+ " 'keywords': ['gumagana', 'battery', 'rechargeable', 'battery life', 'remote', 'sound', 'sounds', 'mic',\n",
319
+ " 'audio', 'music', 'tunog', 'charger', 'charge', 'charging', 'charging pin', 'working',\n",
320
+ " 'earphones', 'earphone', 'gumagana mic', 'bluetooth', 'disconnect', 'connect',\n",
321
+ " 'defective', 'defect', 'malfunction', 'wire', 'cable', 'cord', 'usb', 'pen', 'stylus',\n",
322
+ " 'low battery', 'fan', 'power button', 'power', 'remote control', 'side fan', 'gumagana left', 'automatic'],\n",
323
+ " 'patterns': [r'hindi gumagana', r'ayaw mag', r'not working', r'defective.*item',\n",
324
+ " r'walang.*tunog', r'hindi.*nag-o.*on', r'power button.*hindi']\n",
325
+ " },\n",
326
+ " 'PRO#MAT': {\n",
327
+ " 'general': 'Product',\n",
328
+ " 'name': 'Material',\n",
329
+ " 'keywords': ['manipis', 'rubber', 'plastic', 'material', 'sandal', 'leather', 'cotton', 'spandex',\n",
330
+ " 'cotton tela', 'fabric', 'texture', 'tint', 'sticky', 'matte', 'tela', 'thick'],\n",
331
+ " 'patterns': [r'good quality.*material', r'cheap.*fabric', r'makapal.*tela',\n",
332
+ " r'manipis.*tela', r'solid.*metal']\n",
333
+ " },\n",
334
+ " 'PRO#SENS': {\n",
335
+ " 'general': 'Product',\n",
336
+ " 'name': 'Sensory',\n",
337
+ " 'keywords': ['food', 'masarap', 'taste', 'foods', 'scent', 'perfume', 'smell', 'amoy', 'fragrance',\n",
338
+ " 'amoy', 'mabango', 'amoy goma', 'amoy cloud', 'alcohol', 'amoy alcohol', 'pabango', 'oil',\n",
339
+ " 'oil based', 'bango'],\n",
340
+ " 'patterns': [r'ang bango', r'may amoy', r'masarap.*lasa', r'satisfying.*sound',\n",
341
+ " r'amoy.*baby', r'mabaho.*amoy']\n",
342
+ " },\n",
343
+ " 'PRO#SIZE': {\n",
344
+ " 'general': 'Product',\n",
345
+ " 'name': 'Size/Measurement',\n",
346
+ " 'keywords': ['size', 'maliit size', 'sizes', 'add size', 'maliit', 'large', 'xl', 'kasya', 'liit',\n",
347
+ " 'strap', 'maiksi', 'inches', 'liters', 'liter'],\n",
348
+ " 'patterns': [r'true to size', r'ang liit.*large', r'parang.*size', r'hindi kasya',\n",
349
+ " r'masyadong.*maliit', r'oversized']\n",
350
+ " },\n",
351
+ " 'PRO#GEN': {\n",
352
+ " 'general': 'Product',\n",
353
+ " 'name': 'General',\n",
354
+ " 'keywords': ['item', 'product', 'maganda', 'good', 'quality', 'ganda', 'ok', 'super ganda', 'love',\n",
355
+ " 'thank', 'thankyou', 'nice', 'goods', 'comfortable'],\n",
356
+ " 'patterns': [r'okay.*quality', r'good product', r'maganda.*product', r'poor quality',\n",
357
+ " r'standard.*product']\n",
358
+ " },\n",
359
+ "\n",
360
+ " # Delivery aspects\n",
361
+ " 'DEL#COND': {\n",
362
+ " 'general': 'Delivery',\n",
363
+ " 'name': 'Condition',\n",
364
+ " 'keywords': ['bubble wrap', 'naka bubble wrap', 'box', 'fragile', 'yupi box', 'plastic', 'packaging', 'sealed', 'parcel'],\n",
365
+ " 'patterns': [r'yupi.*box', r'basang-basa', r'bubble wrap', r'damaged.*shipping',\n",
366
+ " r'dumating.*yupi', r'box.*yupi', r'parcel.*basa']\n",
367
+ " },\n",
368
+ " 'DEL#CORR': {\n",
369
+ " 'general': 'Delivery',\n",
370
+ " 'name': 'Correctness',\n",
371
+ " 'keywords': ['order', 'mali', 'complete orders', 'wrong item', 'pinadala'],\n",
372
+ " 'patterns': [r'ordered.*pero.*received', r'mali.*pinadala', r'iba.*dumating',\n",
373
+ " r'wrong.*item', r'(large|medium|small|XL|XS).*order.*pero.*(large|medium|small|XL|XS).*dumating',\n",
374
+ " r'inorder.*pero.*dumating', r'order ko.*pero.*pinadala']\n",
375
+ " },\n",
376
+ " 'DEL#TIME': {\n",
377
+ " 'general': 'Delivery',\n",
378
+ " 'name': 'Timeliness',\n",
379
+ " 'keywords': ['dumating', 'agad', 'tagal dumating', 'fast delivery', 'fast shipping', 'bilis dumating',\n",
380
+ " 'bilis shipping', 'shipped immediately', 'shipping', 'takes', 'weeks', 'shipped'],\n",
381
+ " 'patterns': [r'late.*delivery', r'tagal.*dumating', r'\\d+\\s*(days|araw|weeks|linggo).*delivery',\n",
382
+ " r'fast delivery', r'tagal.*shipping', r'bilis.*delivery']\n",
383
+ " },\n",
384
+ " 'DEL#GEN': {\n",
385
+ " 'general': 'Delivery',\n",
386
+ " 'name': 'General',\n",
387
+ " 'keywords': ['rider', 'kuya rider', 'shipping', 'parcel', 'ship', 'delivery', 'courier', 'secure'],\n",
388
+ " 'patterns': [r'okay.*delivery', r'smooth.*shipping', r'mait.*nag-deliver',\n",
389
+ " r'delivery.*fine']\n",
390
+ " },\n",
391
+ "\n",
392
+ " # Price aspects\n",
393
+ " 'PRI#AFF': {\n",
394
+ " 'general': 'Price',\n",
395
+ " 'name': 'Affordability',\n",
396
+ " 'keywords': ['affordable', 'mura'],\n",
397
+ " 'patterns': [r'sobrang mura', r'affordable.*price', r'budget.*friendly',\n",
398
+ " r'mura lang', r'nakakuha.*discount']\n",
399
+ " },\n",
400
+ " 'PRI#VOM': {\n",
401
+ " 'general': 'Price',\n",
402
+ " 'name': 'Value for Money',\n",
403
+ " 'keywords': ['price', 'worth', 'worth price', 'good price', 'quality', 'sulit', 'sayang pera', 'waste money'],\n",
404
+ " 'patterns': [r'sulit.*price', r'worth.*money', r'hindi worth', r'sayang.*pera',\n",
405
+ " r'not worth it', r'worth it', r'hindi.*sulit', r'sobrang.*sulit']\n",
406
+ " },\n",
407
+ " 'PRI#GEN': {\n",
408
+ " 'general': 'Price',\n",
409
+ " 'name': 'General',\n",
410
+ " 'keywords': ['price', 'sakto price', 'okay price', 'ok price'],\n",
411
+ " 'patterns': [r'okay.*presyo', r'standard.*price', r'sakto.*presyo']\n",
412
+ " },\n",
413
+ "\n",
414
+ " # Service aspects\n",
415
+ " 'SER#HAND': {\n",
416
+ " 'general': 'Service',\n",
417
+ " 'name': 'Handling',\n",
418
+ " 'keywords': ['refund', 'return', 'return refund', 'check', 'double check'],\n",
419
+ " 'patterns': [r'sana.*check', r'hindi.*check', r'double.*check', r'maingat.*pack',\n",
420
+ " r'need.*check.*bago.*padala']\n",
421
+ " },\n",
422
+ " 'SER#RES': {\n",
423
+ " 'general': 'Service',\n",
424
+ " 'name': 'Responsiveness',\n",
425
+ " 'keywords': ['picture', 'pictures', 'nag send', 'message', 'response', 'reply', 'chat', 'seller responsive'],\n",
426
+ " 'patterns': [r'tagal.*sagot', r'walang.*reply', r'responsive.*seller', r'seen.*zone',\n",
427
+ " r'hindi.*sumasagot', r'mabait.*kausap']\n",
428
+ " },\n",
429
+ " 'SER#TRU': {\n",
430
+ " 'general': 'Service',\n",
431
+ " 'name': 'Trustworthiness',\n",
432
+ " 'keywords': ['seller', 'scammer', 'scam'],\n",
433
+ " 'patterns': [r'scam.*shop', r'fake.*product', r'hindi legit', r'nanloko',\n",
434
+ " r'pinapalitan.*order']\n",
435
+ " },\n",
436
+ " 'SER#GEN': {\n",
437
+ " 'general': 'Service',\n",
438
+ " 'name': 'General',\n",
439
+ " 'keywords': ['seller', 'service', 'staff', 'shop', 'thank seller', 'order received', 'recommend shop',\n",
440
+ " 'poor service', 'rude staff', 'irresponsible'],\n",
441
+ " 'patterns': [r'good.*service', r'bad.*service', r'okay.*service', r'seller.*helpful']\n",
442
+ " }\n",
443
+ " }\n",
444
+ "\n",
445
+ " def load_filipino_stopwords(self, filepath):\n",
446
+ " \"\"\"Load Filipino stopwords from file\"\"\"\n",
447
+ " with open(filepath, 'r', encoding='utf-8') as f:\n",
448
+ " return set([line.strip().lower() for line in f if line.strip()])\n",
449
+ "\n",
450
+ " def preprocess_text(self, text):\n",
451
+ " \"\"\"Preprocess Taglish text while preserving context\"\"\"\n",
452
+ " if pd.isna(text):\n",
453
+ " return \"\"\n",
454
+ "\n",
455
+ " # Convert to string and lowercase\n",
456
+ " text = str(text).lower()\n",
457
+ "\n",
458
+ " # Remove URLs\n",
459
+ " text = re.sub(r'http\\S+|www.\\S+', '', text)\n",
460
+ "\n",
461
+ " # Remove email addresses\n",
462
+ " text = re.sub(r'\\S+@\\S+', '', text)\n",
463
+ "\n",
464
+ " # Keep the text mostly intact for pattern matching\n",
465
+ " # Just normalize multiple spaces\n",
466
+ " text = re.sub(r'\\s+', ' ', text)\n",
467
+ "\n",
468
+ " return text.strip()\n",
469
+ "\n",
470
+ " def check_patterns(self, text, patterns):\n",
471
+ " \"\"\"Check if any pattern matches in the text\"\"\"\n",
472
+ " for pattern in patterns:\n",
473
+ " if re.search(pattern, text, re.IGNORECASE):\n",
474
+ " return True\n",
475
+ " return False\n",
476
+ "\n",
477
+ " def check_keywords(self, text, keywords):\n",
478
+ " \"\"\"Improved keyword checking with better context awareness\"\"\"\n",
479
+ " text_lower = text.lower()\n",
480
+ " for keyword in keywords:\n",
481
+ " # Use word boundaries for more accurate matching\n",
482
+ " # But be flexible for compound words\n",
483
+ " if len(keyword.split()) > 1:\n",
484
+ " # Multi-word keywords\n",
485
+ " if keyword.lower() in text_lower:\n",
486
+ " return True\n",
487
+ " else:\n",
488
+ " # Single word keywords - use word boundaries\n",
489
+ " pattern = r'\\b' + re.escape(keyword.lower()) + r'\\b'\n",
490
+ " if re.search(pattern, text_lower):\n",
491
+ " return True\n",
492
+ " return False\n",
493
+ "\n",
494
+ " def apply_context_rules(self, text, initial_aspects):\n",
495
+ " \"\"\"Apply improved context-based rules from the codebook\"\"\"\n",
496
+ " text_lower = text.lower()\n",
497
+ " refined_aspects = initial_aspects.copy()\n",
498
+ "\n",
499
+ " # Rule 1: Color mentioned with ordering context -> DEL#CORR\n",
500
+ " if re.search(r'(order|inorder).*pero.*(dumating|natanggap|received)', text_lower):\n",
501
+ " if 'PRO#COL' in refined_aspects and any(color in text_lower for color in ['gray', 'blue', 'red', 'black', 'white', 'pink']):\n",
502
+ " refined_aspects.remove('PRO#COL')\n",
503
+ " refined_aspects.add('DEL#CORR')\n",
504
+ "\n",
505
+ " # Rule 2: Initial functionality then failure -> PRO#DUR\n",
506
+ " if re.search(r'(gumana|working).*(una|first|initially).*pero.*(namatay|sira|broken|stopped)', text_lower):\n",
507
+ " if 'PRO#FUNC' in refined_aspects:\n",
508
+ " refined_aspects.remove('PRO#FUNC')\n",
509
+ " refined_aspects.add('PRO#DUR')\n",
510
+ "\n",
511
+ " # Rule 3: Size ordering mismatch -> DEL#CORR\n",
512
+ " size_pattern = r'(ordered?|inorder).*(large|medium|small|xs|xl|xxl).*pero.*(received?|natanggap|dumating).*(large|medium|small|xs|xl|xxl)'\n",
513
+ " if re.search(size_pattern, text_lower):\n",
514
+ " if 'PRO#SIZE' in refined_aspects:\n",
515
+ " refined_aspects.remove('PRO#SIZE')\n",
516
+ " refined_aspects.add('DEL#CORR')\n",
517
+ "\n",
518
+ " # Rule 4: \"not worth it\" should always be PRI#VOM\n",
519
+ " if re.search(r'(not|hindi).*worth', text_lower) or 'sayang' in text_lower:\n",
520
+ " refined_aspects.add('PRI#VOM')\n",
521
+ " # Remove incorrect time references\n",
522
+ " if 'DEL#TIME' in refined_aspects and 'bilis' not in text_lower and 'tagal' not in text_lower:\n",
523
+ " refined_aspects.discard('DEL#TIME')\n",
524
+ "\n",
525
+ " # Rule 5: \"mura pero hindi worth it\" -> both PRI#AFF and PRI#VOM\n",
526
+ " if 'mura' in text_lower and ('hindi worth' in text_lower or 'not worth' in text_lower):\n",
527
+ " refined_aspects.add('PRI#AFF')\n",
528
+ " refined_aspects.add('PRI#VOM')\n",
529
+ "\n",
530
+ " # Rule 6: Time-based durability issues\n",
531
+ " durability_time_pattern = r'(\\d+|tatlong|dalawang|isang)\\s*(araw|days|linggo|week|buwan|month).*pa\\s*lang.*sira'\n",
532
+ " if re.search(durability_time_pattern, text_lower):\n",
533
+ " refined_aspects.add('PRO#DUR')\n",
534
+ " if 'PRO#FUNC' in refined_aspects:\n",
535
+ " refined_aspects.remove('PRO#FUNC')\n",
536
+ "\n",
537
+ " # Rule 7: Wrong variant/model delivered\n",
538
+ " if re.search(r'(mali|wrong|iba).*(model|variant|item).*pinadala', text_lower):\n",
539
+ " refined_aspects.add('DEL#CORR')\n",
540
+ "\n",
541
+ " # Rule 8: Packaging issues during shipping vs product condition\n",
542
+ " if 'yupi' in text_lower or 'basang-basa' in text_lower:\n",
543
+ " if 'box' in text_lower or 'parcel' in text_lower or 'package' in text_lower:\n",
544
+ " refined_aspects.add('DEL#COND')\n",
545
+ " # Remove product condition if it's clearly about shipping\n",
546
+ " if 'PRO#COND' in refined_aspects and 'sealed' not in text_lower:\n",
547
+ " refined_aspects.discard('PRO#COND')\n",
548
+ "\n",
549
+ " # Rule 9: \"bilis\" context disambiguation\n",
550
+ " if 'bilis' in text_lower:\n",
551
+ " # Check if it's about delivery\n",
552
+ " if any(word in text_lower for word in ['delivery', 'dumating', 'shipping', 'padala']):\n",
553
+ " refined_aspects.add('DEL#TIME')\n",
554
+ " refined_aspects.discard('PRO#EFF') # Remove if incorrectly added\n",
555
+ " # Check if it's about product effectiveness\n",
556
+ " elif any(word in text_lower for word in ['effect', 'epekto', 'resulta', 'gumana']):\n",
557
+ " refined_aspects.add('PRO#EFF')\n",
558
+ " refined_aspects.discard('DEL#TIME') # Remove if incorrectly added\n",
559
+ "\n",
560
+ " # Rule 10: Remove overly generic aspects if more specific ones exist\n",
561
+ " if len(refined_aspects) > 1:\n",
562
+ " # If we have specific product aspects, remove PRO#GEN\n",
563
+ " specific_product_aspects = {'PRO#COL', 'PRO#COND', 'PRO#CORR', 'PRO#DUR',\n",
564
+ " 'PRO#EFF', 'PRO#FUNC', 'PRO#MAT', 'PRO#SENS', 'PRO#SIZE'}\n",
565
+ " if refined_aspects.intersection(specific_product_aspects) and 'PRO#GEN' in refined_aspects:\n",
566
+ " refined_aspects.discard('PRO#GEN')\n",
567
+ "\n",
568
+ " # Similar for other general aspects\n",
569
+ " if {'DEL#COND', 'DEL#CORR', 'DEL#TIME'}.intersection(refined_aspects) and 'DEL#GEN' in refined_aspects:\n",
570
+ " refined_aspects.discard('DEL#GEN')\n",
571
+ "\n",
572
+ " if {'PRI#AFF', 'PRI#VOM'}.intersection(refined_aspects) and 'PRI#GEN' in refined_aspects:\n",
573
+ " refined_aspects.discard('PRI#GEN')\n",
574
+ "\n",
575
+ " if {'SER#HAND', 'SER#RES', 'SER#TRU'}.intersection(refined_aspects) and 'SER#GEN' in refined_aspects:\n",
576
+ " refined_aspects.discard('SER#GEN')\n",
577
+ "\n",
578
+ " return refined_aspects\n",
579
+ "\n",
580
+ " def annotate_aspects(self, text):\n",
581
+ " \"\"\"\n",
582
+ " Annotate aspects in the given text following the codebook rules\n",
583
+ " Returns: list of identified aspect tags and their details\n",
584
+ " \"\"\"\n",
585
+ " if pd.isna(text) or text.strip() == \"\":\n",
586
+ " return [], {}\n",
587
+ "\n",
588
+ " processed_text = self.preprocess_text(text)\n",
589
+ " identified_aspects = set()\n",
590
+ " aspect_details = defaultdict(list)\n",
591
+ "\n",
592
+ " # Check each specific aspect\n",
593
+ " for aspect_tag, aspect_info in self.specific_aspects.items():\n",
594
+ " matched = False\n",
595
+ "\n",
596
+ " # First check patterns (higher priority)\n",
597
+ " if 'patterns' in aspect_info:\n",
598
+ " if self.check_patterns(processed_text, aspect_info['patterns']):\n",
599
+ " matched = True\n",
600
+ " aspect_details[aspect_tag].append('pattern_match')\n",
601
+ "\n",
602
+ " # Then check keywords\n",
603
+ " if not matched and self.check_keywords(processed_text, aspect_info['keywords']):\n",
604
+ " matched = True\n",
605
+ " # Find which keywords matched\n",
606
+ " for keyword in aspect_info['keywords']:\n",
607
+ " if len(keyword.split()) > 1:\n",
608
+ " if keyword.lower() in processed_text:\n",
609
+ " aspect_details[aspect_tag].append(keyword)\n",
610
+ " break\n",
611
+ " else:\n",
612
+ " pattern = r'\\b' + re.escape(keyword.lower()) + r'\\b'\n",
613
+ " if re.search(pattern, processed_text):\n",
614
+ " aspect_details[aspect_tag].append(keyword)\n",
615
+ " break\n",
616
+ "\n",
617
+ " if matched:\n",
618
+ " identified_aspects.add(aspect_tag)\n",
619
+ "\n",
620
+ " # Apply context-based refinement rules\n",
621
+ " identified_aspects = self.apply_context_rules(processed_text, identified_aspects)\n",
622
+ "\n",
623
+ " # Convert to sorted list for consistent output\n",
624
+ " identified_aspects = sorted(list(identified_aspects))\n",
625
+ "\n",
626
+ " return identified_aspects, dict(aspect_details)\n",
627
+ "\n",
628
+ " def get_general_aspects(self, specific_aspects):\n",
629
+ " \"\"\"Get general aspects from specific aspect tags\"\"\"\n",
630
+ " general = set()\n",
631
+ " for aspect_tag in specific_aspects:\n",
632
+ " if aspect_tag in self.specific_aspects:\n",
633
+ " general.add(self.specific_aspects[aspect_tag]['general'])\n",
634
+ " return sorted(list(general))\n",
635
+ "\n",
636
def process_dataset(self, df, text_column='review'):
    """
    Annotate every review in *df* according to the codebook.

    Rows are processed in their original order. The result is a new
    DataFrame with the annotation columns first (review_id, review,
    general_aspects, specific_aspects, aspect_keywords) followed by the
    remaining original columns; any sentiment column is excluded.
    """
    n_reviews = len(df)
    print(f"Processing all {n_reviews} reviews...")

    skip_columns = ('sentiment', 'sentiments')
    annotated_rows = []

    # enumerate(start=1) doubles as both progress counter and 1-based id
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        if position % 100 == 0:
            print(f" Processed {position}/{n_reviews} reviews...")

        text = row[text_column]
        specific, details = self.annotate_aspects(text)
        general = self.get_general_aspects(specific)

        record = {
            'review_id': position,  # ids start at 1
            'review': text,
            'general_aspects': ', '.join(general) if general else 'none',
            'specific_aspects': ', '.join(specific) if specific else 'none',
            'aspect_keywords': str(details) if details else '{}',
        }

        # Carry over every original column except the text column and any
        # sentiment column (this annotator does not produce sentiment)
        for col in df.columns:
            if col != text_column and col.lower() not in skip_columns:
                record[col] = row[col]

        annotated_rows.append(record)

    print(f" Completed processing all {n_reviews} reviews!")
    return pd.DataFrame(annotated_rows)
672
+ "\n",
673
# Main execution
def main(
    input_csv='SentiTaglish_ProductsAndServices.csv',
    stopwords_file='stopwords-new.txt',
    output_csv='annotated_reviews_rule_based.csv',
):
    """
    Run the end-to-end rule-based aspect annotation pipeline.

    Parameters (all optional; defaults preserve the original hard-coded
    behavior, so existing zero-argument callers are unaffected):
        input_csv: path to the CSV of reviews to annotate.
        stopwords_file: path to the stopword list given to the annotator.
        output_csv: destination path for the annotated CSV.
    """
    print("=" * 60)
    print("RULE-BASED KEYWORD ASPECT ANNOTATOR")
    print("Automated aspect annotation using keywords and context rules")
    print("=" * 60)

    # Initialize annotator
    annotator = RuleBasedKeywordAnnotator(stopwords_file)

    # Read the dataset
    print("\nReading dataset...")
    df = pd.read_csv(input_csv, encoding='utf-8')
    print(f"Successfully read dataset with {len(df)} rows")

    # Identify the review column: first column whose name looks like free
    # text; fall back to the first column if nothing matches.
    review_column = next(
        (col for col in df.columns
         if any(hint in col.lower() for hint in ('review', 'text', 'comment'))),
        df.columns[0],
    )

    print(f"Using column '{review_column}' for reviews")
    print(f"Dataset shape: {df.shape}")

    # Drop sentiment/sentiments column if it exists — this annotator
    # produces aspect labels, not sentiment
    columns_to_drop = [col for col in df.columns if col.lower() in ('sentiment', 'sentiments')]
    if columns_to_drop:
        df = df.drop(columns=columns_to_drop)
        print(f"Dropped columns: {columns_to_drop}")

    # Process entire dataset
    annotated_df = annotator.process_dataset(df, text_column=review_column)

    # Display sample results
    print("\n" + "=" * 60)
    print("SAMPLE ANNOTATION RESULTS (First 10)")
    print("=" * 60)

    for _, row in annotated_df.head(10).iterrows():
        print(f"\n[Review {row['review_id']}]")
        # str() guards against non-string review cells (e.g. NaN produced
        # by read_csv), which would make the slice raise TypeError.
        print(f"Text: {str(row['review'])[:150]}...")
        print(f"General Aspects: {row['general_aspects']}")
        print(f"Specific Aspects: {row['specific_aspects']}")
        if row['aspect_keywords'] != '{}':
            print(f"Matched Keywords: {row['aspect_keywords']}")
        print("-" * 40)

    # Save to CSV
    annotated_df.to_csv(output_csv, index=False, encoding='utf-8')
    print(f"\n✅ Annotated data saved to: {output_csv}")
    print(f"Total reviews processed: {len(annotated_df)}")

if __name__ == "__main__":
    main()
733
+ ]
734
+ }
735
+ ]
736
+ }