aaron-rae-nicolas
/

Aspect-Identifcation-and-Extraction-Model

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "h4-1lYVz9trX",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "f3755b19-7a47-4327-e39c-7da6fd5eeab8"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "============================================================\n",
+            "RULE-BASED KEYWORD ASPECT ANNOTATOR\n",
+            "Automated aspect annotation using keywords and context rules\n",
+            "============================================================\n",
+            "\n",
+            "Reading dataset...\n",
+            "Successfully read dataset with 10510 rows\n",
+            "Using column 'review' for reviews\n",
+            "Dataset shape: (10510, 2)\n",
+            "Dropped columns: ['sentiment']\n",
+            "Processing all 10510 reviews...\n",
+            "  Processed 100/10510 reviews...\n",
+            "  Processed 200/10510 reviews...\n",
+            "  Processed 300/10510 reviews...\n",
+            "  Processed 400/10510 reviews...\n",
+            "  Processed 500/10510 reviews...\n",
+            "  Processed 600/10510 reviews...\n",
+            "  Processed 700/10510 reviews...\n",
+            "  Processed 800/10510 reviews...\n",
+            "  Processed 900/10510 reviews...\n",
+            "  Processed 1000/10510 reviews...\n",
+            "  Processed 1100/10510 reviews...\n",
+            "  Processed 1200/10510 reviews...\n",
+            "  Processed 1300/10510 reviews...\n",
+            "  Processed 1400/10510 reviews...\n",
+            "  Processed 1500/10510 reviews...\n",
+            "  Processed 1600/10510 reviews...\n",
+            "  Processed 1700/10510 reviews...\n",
+            "  Processed 1800/10510 reviews...\n",
+            "  Processed 1900/10510 reviews...\n",
+            "  Processed 2000/10510 reviews...\n",
+            "  Processed 2100/10510 reviews...\n",
+            "  Processed 2200/10510 reviews...\n",
+            "  Processed 2300/10510 reviews...\n",
+            "  Processed 2400/10510 reviews...\n",
+            "  Processed 2500/10510 reviews...\n",
+            "  Processed 2600/10510 reviews...\n",
+            "  Processed 2700/10510 reviews...\n",
+            "  Processed 2800/10510 reviews...\n",
+            "  Processed 2900/10510 reviews...\n",
+            "  Processed 3000/10510 reviews...\n",
+            "  Processed 3100/10510 reviews...\n",
+            "  Processed 3200/10510 reviews...\n",
+            "  Processed 3300/10510 reviews...\n",
+            "  Processed 3400/10510 reviews...\n",
+            "  Processed 3500/10510 reviews...\n",
+            "  Processed 3600/10510 reviews...\n",
+            "  Processed 3700/10510 reviews...\n",
+            "  Processed 3800/10510 reviews...\n",
+            "  Processed 3900/10510 reviews...\n",
+            "  Processed 4000/10510 reviews...\n",
+            "  Processed 4100/10510 reviews...\n",
+            "  Processed 4200/10510 reviews...\n",
+            "  Processed 4300/10510 reviews...\n",
+            "  Processed 4400/10510 reviews...\n",
+            "  Processed 4500/10510 reviews...\n",
+            "  Processed 4600/10510 reviews...\n",
+            "  Processed 4700/10510 reviews...\n",
+            "  Processed 4800/10510 reviews...\n",
+            "  Processed 4900/10510 reviews...\n",
+            "  Processed 5000/10510 reviews...\n",
+            "  Processed 5100/10510 reviews...\n",
+            "  Processed 5200/10510 reviews...\n",
+            "  Processed 5300/10510 reviews...\n",
+            "  Processed 5400/10510 reviews...\n",
+            "  Processed 5500/10510 reviews...\n",
+            "  Processed 5600/10510 reviews...\n",
+            "  Processed 5700/10510 reviews...\n",
+            "  Processed 5800/10510 reviews...\n",
+            "  Processed 5900/10510 reviews...\n",
+            "  Processed 6000/10510 reviews...\n",
+            "  Processed 6100/10510 reviews...\n",
+            "  Processed 6200/10510 reviews...\n",
+            "  Processed 6300/10510 reviews...\n",
+            "  Processed 6400/10510 reviews...\n",
+            "  Processed 6500/10510 reviews...\n",
+            "  Processed 6600/10510 reviews...\n",
+            "  Processed 6700/10510 reviews...\n",
+            "  Processed 6800/10510 reviews...\n",
+            "  Processed 6900/10510 reviews...\n",
+            "  Processed 7000/10510 reviews...\n",
+            "  Processed 7100/10510 reviews...\n",
+            "  Processed 7200/10510 reviews...\n",
+            "  Processed 7300/10510 reviews...\n",
+            "  Processed 7400/10510 reviews...\n",
+            "  Processed 7500/10510 reviews...\n",
+            "  Processed 7600/10510 reviews...\n",
+            "  Processed 7700/10510 reviews...\n",
+            "  Processed 7800/10510 reviews...\n",
+            "  Processed 7900/10510 reviews...\n",
+            "  Processed 8000/10510 reviews...\n",
+            "  Processed 8100/10510 reviews...\n",
+            "  Processed 8200/10510 reviews...\n",
+            "  Processed 8300/10510 reviews...\n",
+            "  Processed 8400/10510 reviews...\n",
+            "  Processed 8500/10510 reviews...\n",
+            "  Processed 8600/10510 reviews...\n",
+            "  Processed 8700/10510 reviews...\n",
+            "  Processed 8800/10510 reviews...\n",
+            "  Processed 8900/10510 reviews...\n",
+            "  Processed 9000/10510 reviews...\n",
+            "  Processed 9100/10510 reviews...\n",
+            "  Processed 9200/10510 reviews...\n",
+            "  Processed 9300/10510 reviews...\n",
+            "  Processed 9400/10510 reviews...\n",
+            "  Processed 9500/10510 reviews...\n",
+            "  Processed 9600/10510 reviews...\n",
+            "  Processed 9700/10510 reviews...\n",
+            "  Processed 9800/10510 reviews...\n",
+            "  Processed 9900/10510 reviews...\n",
+            "  Processed 10000/10510 reviews...\n",
+            "  Processed 10100/10510 reviews...\n",
+            "  Processed 10200/10510 reviews...\n",
+            "  Processed 10300/10510 reviews...\n",
+            "  Processed 10400/10510 reviews...\n",
+            "  Processed 10500/10510 reviews...\n",
+            "  Completed processing all 10510 reviews!\n",
+            "\n",
+            "============================================================\n",
+            "SAMPLE ANNOTATION RESULTS (First 10)\n",
+            "============================================================\n",
+            "\n",
+            "[Review 1]\n",
+            "Text: at first gumagana cya..pero pagnalowbat cya ndi na ya magamit kahit ilang oras mo cya icharge namamatay agad..poor quality..not for recommended.....\n",
+            "General Aspects: Delivery, Price, Product\n",
+            "Specific Aspects: DEL#TIME, PRI#VOM, PRO#COND, PRO#FUNC\n",
+            "Matched Keywords: {'PRO#COND': ['poor quality'], 'PRO#FUNC': ['gumagana'], 'PRO#GEN': ['pattern_match'], 'DEL#TIME': ['agad'], 'PRI#VOM': ['quality']}\n",
+            "----------------------------------------\n",
+            "\n",
+            "[Review 2]\n",
+            "Text: grabi pangalawa ko ng order sa shapee pero puro dismayado ako ang pangit ng tila subrang nipis mabilis ma punit at ang laki ng size niya alam niyo sho...\n",
+            "General Aspects: Delivery, Product\n",
+            "Specific Aspects: DEL#CORR, PRO#COND, PRO#CORR, PRO#SIZE\n",
+            "Matched Keywords: {'PRO#COND': ['item'], 'PRO#CORR': ['item'], 'PRO#SIZE': ['size'], 'PRO#GEN': ['item'], 'DEL#CORR': ['order']}\n",
+            "----------------------------------------\n",
+            "\n",
+            "[Review 3]\n",
+            "Text: 2l gray/black order ko. bakit 850ml lang po pinadala nyo. kung di na po available yung product dapat di nyo pinilit yung gusto nyo. kelangan ko po yun...\n",
+            "General Aspects: Delivery, Product, Service\n",
+            "Specific Aspects: DEL#CORR, PRO#COL, SER#HAND\n",
+            "Matched Keywords: {'PRO#COL': ['color'], 'PRO#GEN': ['product'], 'DEL#CORR': ['order'], 'SER#HAND': ['return']}\n",
+            "----------------------------------------\n",
+            "\n",
+            "[Review 4]\n",
+            "Text: walang silbing product.. bwesit. di gumagana dalawa pa...\n",
+            "General Aspects: Product\n",
+            "Specific Aspects: PRO#FUNC\n",
+            "Matched Keywords: {'PRO#FUNC': ['gumagana'], 'PRO#GEN': ['product']}\n",
+            "----------------------------------------\n",
+            "\n",
+            "[Review 5]\n",
+            "Text: d po maganda naman po yung neck fan, pero po napaka tagal po sya dumating dec,9 po dec, 23 napo sya dumating, chaka po pink po inorder ko bakit po whi...\n",
+            "General Aspects: Delivery, Product, Service\n",
+            "Specific Aspects: DEL#TIME, PRO#COL, PRO#FUNC, SER#TRU\n",
+            "Matched Keywords: {'PRO#COL': ['white'], 'PRO#FUNC': ['fan'], 'PRO#GEN': ['maganda'], 'DEL#TIME': ['pattern_match'], 'SER#TRU': ['seller'], 'SER#GEN': ['seller']}\n",
+            "----------------------------------------\n",
+            "\n",
+            "[Review 6]\n",
+            "Text: 0/10 sa effectiveness po nya 0/10 dahil kakalagay ko lang then wala man lang ilang kalahating minuto uminom lang ako wala na agad ung kulay sa lips ko...\n",
+            "General Aspects: Delivery, Product, Service\n",
+            "Specific Aspects: DEL#TIME, PRO#COL, SER#TRU\n",
+            "Matched Keywords: {'PRO#COL': ['kulay'], 'DEL#TIME': ['agad'], 'SER#TRU': ['scam']}\n",
+            "----------------------------------------\n",
+            "\n",
+            "[Review 7]\n",
+            "Text: mahina lng ang hangin khit nka maximum na. mabilis ma low batt.. pinapawisan p rin lalo n yung batok...\n",
+            "General Aspects: none\n",
+            "Specific Aspects: none\n",
+            "----------------------------------------\n",
+            "\n",
+            "[Review 8]\n",
+            "Text: maganda dn same sa picture maganda sya ,medyo maliit nga lang sya...\n",
+            "General Aspects: Product, Service\n",
+            "Specific Aspects: PRO#SIZE, SER#RES\n",
+            "Matched Keywords: {'PRO#SIZE': ['maliit'], 'PRO#GEN': ['maganda'], 'SER#RES': ['picture']}\n",
+            "----------------------------------------\n",
+            "\n",
+            "[Review 9]\n",
+            "Text: not worth it ang bilis n'ya ma-lowbat and ang tagal n'ya pang i-charge...\n",
+            "General Aspects: Price, Product\n",
+            "Specific Aspects: PRI#VOM, PRO#FUNC\n",
+            "Matched Keywords: {'PRO#FUNC': ['charge'], 'PRI#VOM': ['pattern_match']}\n",
+            "----------------------------------------\n",
+            "\n",
+            "[Review 10]\n",
+            "Text: mahirap buksan yung payong. need mo pa itulak. sana na check man lang bago ipadala....\n",
+            "General Aspects: Service\n",
+            "Specific Aspects: SER#HAND\n",
+            "Matched Keywords: {'SER#HAND': ['pattern_match']}\n",
+            "----------------------------------------\n",
+            "\n",
+            "✅ Annotated data saved to: annotated_reviews_rule_based.csv\n",
+            "Total reviews processed: 10510\n"
+          ]
+        }
+      ],
+      "source": [
+        "import pandas as pd\n",
+        "import re\n",
+        "import csv\n",
+        "from collections import defaultdict, Counter\n",
+        "import string\n",
+        "import nltk\n",
+        "from nltk.tokenize import word_tokenize\n",
+        "from nltk.corpus import stopwords\n",
+        "import warnings\n",
+        "warnings.filterwarnings('ignore')\n",
+        "\n",
+        "# Download required NLTK data\n",
+        "try:\n",
+        "    nltk.data.find('tokenizers/punkt')\n",
+        "except LookupError:\n",
+        "    nltk.download('punkt')\n",
+        "    nltk.download('punkt_tab')\n",
+        "\n",
+        "try:\n",
+        "    nltk.data.find('corpora/stopwords')\n",
+        "except LookupError:\n",
+        "    nltk.download('stopwords')\n",
+        "\n",
+        "class RuleBasedKeywordAnnotator:\n",
+        "    def __init__(self, stopwords_file='stopwords-new.txt'):\n",
+        "        \"\"\"\n",
+        "        Initialize the rule-based keyword aspect annotator\n",
+        "        \"\"\"\n",
+        "        # Load Filipino stopwords\n",
+        "        self.filipino_stopwords = self.load_filipino_stopwords(stopwords_file)\n",
+        "\n",
+        "        # English stopwords\n",
+        "        self.english_stopwords = set(stopwords.words('english'))\n",
+        "\n",
+        "        # Combined stopwords (lowercased)\n",
+        "        self.all_stopwords = self.filipino_stopwords.union(self.english_stopwords)\n",
+        "\n",
+        "        # Define general aspects based on codebook\n",
+        "        self.general_aspects = {\n",
+        "            'Product': 'PRO',\n",
+        "            'Price': 'PRI',\n",
+        "            'Delivery': 'DEL',\n",
+        "            'Service': 'SER'\n",
+        "        }\n",
+        "\n",
+        "        # Define specific aspects with improved keywords and patterns\n",
+        "        self.specific_aspects = {\n",
+        "            'PRO#COL': {\n",
+        "                'general': 'Product',\n",
+        "                'name': 'Color',\n",
+        "                'keywords': ['color', 'black', 'white', 'pink', 'blue', 'green', 'wrong color', 'colors', 'kulay', 'faded'],\n",
+        "                'patterns': [r'true to color', r'same as photo', r'iba.*kulay', r'mali.*color',\n",
+        "                             r'mas ma(pula|puti|itim)', r'pangit.*shade', r'maganda.*kulay']\n",
+        "            },\n",
+        "            'PRO#COND': {\n",
+        "                'general': 'Product',\n",
+        "                'name': 'Condition',\n",
+        "                'keywords': ['sira', 'damage', 'item', 'damaged', 'items', 'box', 'yupi box', 'sira box', 'basag',\n",
+        "                             'gasgas', 'leak', 'crack', 'butas butas', 'butas', 'old', 'stock', 'stocks', 'baligtad',\n",
+        "                             'low quality', 'poor quality'],\n",
+        "                'patterns': [r'may gasgas', r'incomplete parts', r'factory defect', r'kulang.*parts',\n",
+        "                             r'may yupi.*box.*pero.*sealed', r'sealed.*pero.*may (gasgas|yupi|dent)']\n",
+        "            },\n",
+        "            'PRO#CORR': {\n",
+        "                'general': 'Product',\n",
+        "                'name': 'Correctness',\n",
+        "                'keywords': ['advertisement', 'false', 'ads', 'fake', 'design', 'tugma', 'meters', 'description', 'item',\n",
+        "                             'expectation', 'expectation vs reality', 'vs reality', 'reality', 'expectations', 'product description'],\n",
+        "                'patterns': [r'sabi.*pero', r'akala ko.*pero', r'advertised as.*pero', r'sa photo.*pero',\n",
+        "                             r'(dual|single)\\s*sim', r'walang.*charger', r'kasama.*charger']\n",
+        "            },\n",
+        "            'PRO#DUR': {\n",
+        "                'general': 'Product',\n",
+        "                'name': 'Durability',\n",
+        "                'keywords': ['week', 'hours', 'minutes', 'one day', 'using', 'llast'],\n",
+        "                'patterns': [r'\\d+\\s*(araw|days|linggo|week|buwan|month).*[pa lang|palang].*sira',\n",
+        "                             r'madaling masira', r'hindi tumagal', r'namatay agad', r'sira na agad',\n",
+        "                             r'tatlong araw.*sira', r'after.*week.*sira', r'gumana.*nung una.*pero.*namatay']\n",
+        "            },\n",
+        "            'PRO#EFF': {\n",
+        "                'general': 'Product',\n",
+        "                'name': 'Effectiveness',\n",
+        "                'keywords': ['dikit', 'glue', 'waterproof', 'water proof', 'long lasting', 'magstraight', 'hair removal'],\n",
+        "                'patterns': [r'hindi.*effective', r'sobrang effective', r'ang bilis.*effect',\n",
+        "                             r'walang.*effect', r'okay.*pero hindi.*effective']\n",
+        "            },\n",
+        "            'PRO#FUNC': {\n",
+        "                'general': 'Product',\n",
+        "                'name': 'Functionality',\n",
+        "                'keywords': ['gumagana', 'battery', 'rechargeable', 'battery life', 'remote', 'sound', 'sounds', 'mic',\n",
+        "                             'audio', 'music', 'tunog', 'charger', 'charge', 'charging', 'charging pin', 'working',\n",
+        "                             'earphones', 'earphone', 'gumagana mic', 'bluetooth', 'disconnect', 'connect',\n",
+        "                             'defective', 'defect', 'malfunction', 'wire', 'cable', 'cord', 'usb', 'pen', 'stylus',\n",
+        "                             'low battery', 'fan', 'power button', 'power', 'remote control', 'side fan', 'gumagana left', 'automatic'],\n",
+        "                'patterns': [r'hindi gumagana', r'ayaw mag', r'not working', r'defective.*item',\n",
+        "                             r'walang.*tunog', r'hindi.*nag-o.*on', r'power button.*hindi']\n",
+        "            },\n",
+        "            'PRO#MAT': {\n",
+        "                'general': 'Product',\n",
+        "                'name': 'Material',\n",
+        "                'keywords': ['manipis', 'rubber', 'plastic', 'material', 'sandal', 'leather', 'cotton', 'spandex',\n",
+        "                             'cotton tela', 'fabric', 'texture', 'tint', 'sticky', 'matte', 'tela', 'thick'],\n",
+        "                'patterns': [r'good quality.*material', r'cheap.*fabric', r'makapal.*tela',\n",
+        "                             r'manipis.*tela', r'solid.*metal']\n",
+        "            },\n",
+        "            'PRO#SENS': {\n",
+        "                'general': 'Product',\n",
+        "                'name': 'Sensory',\n",
+        "                'keywords': ['food', 'masarap', 'taste', 'foods', 'scent', 'perfume', 'smell', 'amoy', 'fragrance',\n",
+        "                             'amoy', 'mabango', 'amoy goma', 'amoy cloud', 'alcohol', 'amoy alcohol', 'pabango', 'oil',\n",
+        "                             'oil based', 'bango'],\n",
+        "                'patterns': [r'ang bango', r'may amoy', r'masarap.*lasa', r'satisfying.*sound',\n",
+        "                             r'amoy.*baby', r'mabaho.*amoy']\n",
+        "            },\n",
+        "            'PRO#SIZE': {\n",
+        "                'general': 'Product',\n",
+        "                'name': 'Size/Measurement',\n",
+        "                'keywords': ['size', 'maliit size', 'sizes', 'add size', 'maliit', 'large', 'xl', 'kasya', 'liit',\n",
+        "                             'strap', 'maiksi', 'inches', 'liters', 'liter'],\n",
+        "                'patterns': [r'true to size', r'ang liit.*large', r'parang.*size', r'hindi kasya',\n",
+        "                             r'masyadong.*maliit', r'oversized']\n",
+        "            },\n",
+        "            'PRO#GEN': {\n",
+        "                'general': 'Product',\n",
+        "                'name': 'General',\n",
+        "                'keywords': ['item', 'product', 'maganda', 'good', 'quality', 'ganda', 'ok', 'super ganda', 'love',\n",
+        "                             'thank', 'thankyou', 'nice', 'goods', 'comfortable'],\n",
+        "                'patterns': [r'okay.*quality', r'good product', r'maganda.*product', r'poor quality',\n",
+        "                             r'standard.*product']\n",
+        "            },\n",
+        "\n",
+        "            # Delivery aspects\n",
+        "            'DEL#COND': {\n",
+        "                'general': 'Delivery',\n",
+        "                'name': 'Condition',\n",
+        "                'keywords': ['bubble wrap', 'naka bubble wrap', 'box', 'fragile', 'yupi box', 'plastic', 'packaging', 'sealed', 'parcel'],\n",
+        "                'patterns': [r'yupi.*box', r'basang-basa', r'bubble wrap', r'damaged.*shipping',\n",
+        "                             r'dumating.*yupi', r'box.*yupi', r'parcel.*basa']\n",
+        "            },\n",
+        "            'DEL#CORR': {\n",
+        "                'general': 'Delivery',\n",
+        "                'name': 'Correctness',\n",
+        "                'keywords': ['order', 'mali', 'complete orders', 'wrong item', 'pinadala'],\n",
+        "                'patterns': [r'ordered.*pero.*received', r'mali.*pinadala', r'iba.*dumating',\n",
+        "                             r'wrong.*item', r'(large|medium|small|XL|XS).*order.*pero.*(large|medium|small|XL|XS).*dumating',\n",
+        "                             r'inorder.*pero.*dumating', r'order ko.*pero.*pinadala']\n",
+        "            },\n",
+        "            'DEL#TIME': {\n",
+        "                'general': 'Delivery',\n",
+        "                'name': 'Timeliness',\n",
+        "                'keywords': ['dumating', 'agad', 'tagal dumating', 'fast delivery', 'fast shipping', 'bilis dumating',\n",
+        "                             'bilis shipping', 'shipped immediately', 'shipping', 'takes', 'weeks', 'shipped'],\n",
+        "                'patterns': [r'late.*delivery', r'tagal.*dumating', r'\\d+\\s*(days|araw|weeks|linggo).*delivery',\n",
+        "                             r'fast delivery', r'tagal.*shipping', r'bilis.*delivery']\n",
+        "            },\n",
+        "            'DEL#GEN': {\n",
+        "                'general': 'Delivery',\n",
+        "                'name': 'General',\n",
+        "                'keywords': ['rider', 'kuya rider', 'shipping', 'parcel', 'ship', 'delivery', 'courier', 'secure'],\n",
+        "                'patterns': [r'okay.*delivery', r'smooth.*shipping', r'mait.*nag-deliver',\n",
+        "                             r'delivery.*fine']\n",
+        "            },\n",
+        "\n",
+        "            # Price aspects\n",
+        "            'PRI#AFF': {\n",
+        "                'general': 'Price',\n",
+        "                'name': 'Affordability',\n",
+        "                'keywords': ['affordable', 'mura'],\n",
+        "                'patterns': [r'sobrang mura', r'affordable.*price', r'budget.*friendly',\n",
+        "                             r'mura lang', r'nakakuha.*discount']\n",
+        "            },\n",
+        "            'PRI#VOM': {\n",
+        "                'general': 'Price',\n",
+        "                'name': 'Value for Money',\n",
+        "                'keywords': ['price', 'worth', 'worth price', 'good price', 'quality', 'sulit', 'sayang pera', 'waste money'],\n",
+        "                'patterns': [r'sulit.*price', r'worth.*money', r'hindi worth', r'sayang.*pera',\n",
+        "                             r'not worth it', r'worth it', r'hindi.*sulit', r'sobrang.*sulit']\n",
+        "            },\n",
+        "            'PRI#GEN': {\n",
+        "                'general': 'Price',\n",
+        "                'name': 'General',\n",
+        "                'keywords': ['price', 'sakto price', 'okay price', 'ok price'],\n",
+        "                'patterns': [r'okay.*presyo', r'standard.*price', r'sakto.*presyo']\n",
+        "            },\n",
+        "\n",
+        "            # Service aspects\n",
+        "            'SER#HAND': {\n",
+        "                'general': 'Service',\n",
+        "                'name': 'Handling',\n",
+        "                'keywords': ['refund', 'return', 'return refund', 'check', 'double check'],\n",
+        "                'patterns': [r'sana.*check', r'hindi.*check', r'double.*check', r'maingat.*pack',\n",
+        "                             r'need.*check.*bago.*padala']\n",
+        "            },\n",
+        "            'SER#RES': {\n",
+        "                'general': 'Service',\n",
+        "                'name': 'Responsiveness',\n",
+        "                'keywords': ['picture', 'pictures', 'nag send', 'message', 'response', 'reply', 'chat', 'seller responsive'],\n",
+        "                'patterns': [r'tagal.*sagot', r'walang.*reply', r'responsive.*seller', r'seen.*zone',\n",
+        "                             r'hindi.*sumasagot', r'mabait.*kausap']\n",
+        "            },\n",
+        "            'SER#TRU': {\n",
+        "                'general': 'Service',\n",
+        "                'name': 'Trustworthiness',\n",
+        "                'keywords': ['seller', 'scammer', 'scam'],\n",
+        "                'patterns': [r'scam.*shop', r'fake.*product', r'hindi legit', r'nanloko',\n",
+        "                             r'pinapalitan.*order']\n",
+        "            },\n",
+        "            'SER#GEN': {\n",
+        "                'general': 'Service',\n",
+        "                'name': 'General',\n",
+        "                'keywords': ['seller', 'service', 'staff', 'shop', 'thank seller', 'order received', 'recommend shop',\n",
+        "                             'poor service', 'rude staff', 'irresponsible'],\n",
+        "                'patterns': [r'good.*service', r'bad.*service', r'okay.*service', r'seller.*helpful']\n",
+        "            }\n",
+        "        }\n",
+        "\n",
+        "    def load_filipino_stopwords(self, filepath):\n",
+        "        \"\"\"Load Filipino stopwords from file\"\"\"\n",
+        "        with open(filepath, 'r', encoding='utf-8') as f:\n",
+        "            return set([line.strip().lower() for line in f if line.strip()])\n",
+        "\n",
+        "    def preprocess_text(self, text):\n",
+        "        \"\"\"Preprocess Taglish text while preserving context\"\"\"\n",
+        "        if pd.isna(text):\n",
+        "            return \"\"\n",
+        "\n",
+        "        # Convert to string and lowercase\n",
+        "        text = str(text).lower()\n",
+        "\n",
+        "        # Remove URLs\n",
+        "        text = re.sub(r'http\\S+|www.\\S+', '', text)\n",
+        "\n",
+        "        # Remove email addresses\n",
+        "        text = re.sub(r'\\S+@\\S+', '', text)\n",
+        "\n",
+        "        # Keep the text mostly intact for pattern matching\n",
+        "        # Just normalize multiple spaces\n",
+        "        text = re.sub(r'\\s+', ' ', text)\n",
+        "\n",
+        "        return text.strip()\n",
+        "\n",
+        "    def check_patterns(self, text, patterns):\n",
+        "        \"\"\"Check if any pattern matches in the text\"\"\"\n",
+        "        for pattern in patterns:\n",
+        "            if re.search(pattern, text, re.IGNORECASE):\n",
+        "                return True\n",
+        "        return False\n",
+        "\n",
+        "    def check_keywords(self, text, keywords):\n",
+        "        \"\"\"Improved keyword checking with better context awareness\"\"\"\n",
+        "        text_lower = text.lower()\n",
+        "        for keyword in keywords:\n",
+        "            # Use word boundaries for more accurate matching\n",
+        "            # But be flexible for compound words\n",
+        "            if len(keyword.split()) > 1:\n",
+        "                # Multi-word keywords\n",
+        "                if keyword.lower() in text_lower:\n",
+        "                    return True\n",
+        "            else:\n",
+        "                # Single word keywords - use word boundaries\n",
+        "                pattern = r'\\b' + re.escape(keyword.lower()) + r'\\b'\n",
+        "                if re.search(pattern, text_lower):\n",
+        "                    return True\n",
+        "        return False\n",
+        "\n",
+        "    def apply_context_rules(self, text, initial_aspects):\n",
+        "        \"\"\"Apply improved context-based rules from the codebook\"\"\"\n",
+        "        text_lower = text.lower()\n",
+        "        refined_aspects = initial_aspects.copy()\n",
+        "\n",
+        "        # Rule 1: Color mentioned with ordering context -> DEL#CORR\n",
+        "        if re.search(r'(order|inorder).*pero.*(dumating|natanggap|received)', text_lower):\n",
+        "            if 'PRO#COL' in refined_aspects and any(color in text_lower for color in ['gray', 'blue', 'red', 'black', 'white', 'pink']):\n",
+        "                refined_aspects.remove('PRO#COL')\n",
+        "                refined_aspects.add('DEL#CORR')\n",
+        "\n",
+        "        # Rule 2: Initial functionality then failure -> PRO#DUR\n",
+        "        if re.search(r'(gumana|working).*(una|first|initially).*pero.*(namatay|sira|broken|stopped)', text_lower):\n",
+        "            if 'PRO#FUNC' in refined_aspects:\n",
+        "                refined_aspects.remove('PRO#FUNC')\n",
+        "                refined_aspects.add('PRO#DUR')\n",
+        "\n",
+        "        # Rule 3: Size ordering mismatch -> DEL#CORR\n",
+        "        size_pattern = r'(ordered?|inorder).*(large|medium|small|xs|xl|xxl).*pero.*(received?|natanggap|dumating).*(large|medium|small|xs|xl|xxl)'\n",
+        "        if re.search(size_pattern, text_lower):\n",
+        "            if 'PRO#SIZE' in refined_aspects:\n",
+        "                refined_aspects.remove('PRO#SIZE')\n",
+        "                refined_aspects.add('DEL#CORR')\n",
+        "\n",
+        "        # Rule 4: \"not worth it\" should always be PRI#VOM\n",
+        "        if re.search(r'(not|hindi).*worth', text_lower) or 'sayang' in text_lower:\n",
+        "            refined_aspects.add('PRI#VOM')\n",
+        "            # Remove incorrect time references\n",
+        "            if 'DEL#TIME' in refined_aspects and 'bilis' not in text_lower and 'tagal' not in text_lower:\n",
+        "                refined_aspects.discard('DEL#TIME')\n",
+        "\n",
+        "        # Rule 5: \"mura pero hindi worth it\" -> both PRI#AFF and PRI#VOM\n",
+        "        if 'mura' in text_lower and ('hindi worth' in text_lower or 'not worth' in text_lower):\n",
+        "            refined_aspects.add('PRI#AFF')\n",
+        "            refined_aspects.add('PRI#VOM')\n",
+        "\n",
+        "        # Rule 6: Time-based durability issues\n",
+        "        durability_time_pattern = r'(\\d+|tatlong|dalawang|isang)\\s*(araw|days|linggo|week|buwan|month).*pa\\s*lang.*sira'\n",
+        "        if re.search(durability_time_pattern, text_lower):\n",
+        "            refined_aspects.add('PRO#DUR')\n",
+        "            if 'PRO#FUNC' in refined_aspects:\n",
+        "                refined_aspects.remove('PRO#FUNC')\n",
+        "\n",
+        "        # Rule 7: Wrong variant/model delivered\n",
+        "        if re.search(r'(mali|wrong|iba).*(model|variant|item).*pinadala', text_lower):\n",
+        "            refined_aspects.add('DEL#CORR')\n",
+        "\n",
+        "        # Rule 8: Packaging issues during shipping vs product condition\n",
+        "        if 'yupi' in text_lower or 'basang-basa' in text_lower:\n",
+        "            if 'box' in text_lower or 'parcel' in text_lower or 'package' in text_lower:\n",
+        "                refined_aspects.add('DEL#COND')\n",
+        "                # Remove product condition if it's clearly about shipping\n",
+        "                if 'PRO#COND' in refined_aspects and 'sealed' not in text_lower:\n",
+        "                    refined_aspects.discard('PRO#COND')\n",
+        "\n",
+        "        # Rule 9: \"bilis\" context disambiguation\n",
+        "        if 'bilis' in text_lower:\n",
+        "            # Check if it's about delivery\n",
+        "            if any(word in text_lower for word in ['delivery', 'dumating', 'shipping', 'padala']):\n",
+        "                refined_aspects.add('DEL#TIME')\n",
+        "                refined_aspects.discard('PRO#EFF')  # Remove if incorrectly added\n",
+        "            # Check if it's about product effectiveness\n",
+        "            elif any(word in text_lower for word in ['effect', 'epekto', 'resulta', 'gumana']):\n",
+        "                refined_aspects.add('PRO#EFF')\n",
+        "                refined_aspects.discard('DEL#TIME')  # Remove if incorrectly added\n",
+        "\n",
+        "        # Rule 10: Remove overly generic aspects if more specific ones exist\n",
+        "        if len(refined_aspects) > 1:\n",
+        "            # If we have specific product aspects, remove PRO#GEN\n",
+        "            specific_product_aspects = {'PRO#COL', 'PRO#COND', 'PRO#CORR', 'PRO#DUR',\n",
+        "                                        'PRO#EFF', 'PRO#FUNC', 'PRO#MAT', 'PRO#SENS', 'PRO#SIZE'}\n",
+        "            if refined_aspects.intersection(specific_product_aspects) and 'PRO#GEN' in refined_aspects:\n",
+        "                refined_aspects.discard('PRO#GEN')\n",
+        "\n",
+        "            # Similar for other general aspects\n",
+        "            if {'DEL#COND', 'DEL#CORR', 'DEL#TIME'}.intersection(refined_aspects) and 'DEL#GEN' in refined_aspects:\n",
+        "                refined_aspects.discard('DEL#GEN')\n",
+        "\n",
+        "            if {'PRI#AFF', 'PRI#VOM'}.intersection(refined_aspects) and 'PRI#GEN' in refined_aspects:\n",
+        "                refined_aspects.discard('PRI#GEN')\n",
+        "\n",
+        "            if {'SER#HAND', 'SER#RES', 'SER#TRU'}.intersection(refined_aspects) and 'SER#GEN' in refined_aspects:\n",
+        "                refined_aspects.discard('SER#GEN')\n",
+        "\n",
+        "        return refined_aspects\n",
+        "\n",
+        "    def annotate_aspects(self, text):\n",
+        "        \"\"\"\n",
+        "        Annotate aspects in the given text following the codebook rules\n",
+        "        Returns: list of identified aspect tags and their details\n",
+        "        \"\"\"\n",
+        "        if pd.isna(text) or text.strip() == \"\":\n",
+        "            return [], {}\n",
+        "\n",
+        "        processed_text = self.preprocess_text(text)\n",
+        "        identified_aspects = set()\n",
+        "        aspect_details = defaultdict(list)\n",
+        "\n",
+        "        # Check each specific aspect\n",
+        "        for aspect_tag, aspect_info in self.specific_aspects.items():\n",
+        "            matched = False\n",
+        "\n",
+        "            # First check patterns (higher priority)\n",
+        "            if 'patterns' in aspect_info:\n",
+        "                if self.check_patterns(processed_text, aspect_info['patterns']):\n",
+        "                    matched = True\n",
+        "                    aspect_details[aspect_tag].append('pattern_match')\n",
+        "\n",
+        "            # Then check keywords\n",
+        "            if not matched and self.check_keywords(processed_text, aspect_info['keywords']):\n",
+        "                matched = True\n",
+        "                # Find which keywords matched\n",
+        "                for keyword in aspect_info['keywords']:\n",
+        "                    if len(keyword.split()) > 1:\n",
+        "                        if keyword.lower() in processed_text:\n",
+        "                            aspect_details[aspect_tag].append(keyword)\n",
+        "                            break\n",
+        "                    else:\n",
+        "                        pattern = r'\\b' + re.escape(keyword.lower()) + r'\\b'\n",
+        "                        if re.search(pattern, processed_text):\n",
+        "                            aspect_details[aspect_tag].append(keyword)\n",
+        "                            break\n",
+        "\n",
+        "            if matched:\n",
+        "                identified_aspects.add(aspect_tag)\n",
+        "\n",
+        "        # Apply context-based refinement rules\n",
+        "        identified_aspects = self.apply_context_rules(processed_text, identified_aspects)\n",
+        "\n",
+        "        # Convert to sorted list for consistent output\n",
+        "        identified_aspects = sorted(list(identified_aspects))\n",
+        "\n",
+        "        return identified_aspects, dict(aspect_details)\n",
+        "\n",
+        "    def get_general_aspects(self, specific_aspects):\n",
+        "        \"\"\"Get general aspects from specific aspect tags\"\"\"\n",
+        "        general = set()\n",
+        "        for aspect_tag in specific_aspects:\n",
+        "            if aspect_tag in self.specific_aspects:\n",
+        "                general.add(self.specific_aspects[aspect_tag]['general'])\n",
+        "        return sorted(list(general))\n",
+        "\n",
+        "    def process_dataset(self, df, text_column='review'):\n",
+        "        \"\"\"\n",
+        "        Process entire dataset and annotate aspects according to codebook\n",
+        "        \"\"\"\n",
+        "        total_rows = len(df)\n",
+        "        print(f\"Processing all {total_rows} reviews...\")\n",
+        "\n",
+        "        results = []\n",
+        "\n",
+        "        # Process all reviews in their original order\n",
+        "        for idx in range(total_rows):\n",
+        "            if (idx + 1) % 100 == 0:\n",
+        "                print(f\"  Processed {idx + 1}/{total_rows} reviews...\")\n",
+        "\n",
+        "            row = df.iloc[idx]\n",
+        "            review_text = row[text_column]\n",
+        "            specific_aspects, aspect_details = self.annotate_aspects(review_text)\n",
+        "            general_aspects = self.get_general_aspects(specific_aspects)\n",
+        "\n",
+        "            result = {\n",
+        "                'review_id': idx + 1,  # Start from 1\n",
+        "                'review': review_text,\n",
+        "                'general_aspects': ', '.join(general_aspects) if general_aspects else 'none',\n",
+        "                'specific_aspects': ', '.join(specific_aspects) if specific_aspects else 'none',\n",
+        "                'aspect_keywords': str(aspect_details) if aspect_details else '{}'\n",
+        "            }\n",
+        "\n",
+        "            # Add original columns (except sentiment if it exists)\n",
+        "            for col in df.columns:\n",
+        "                if col != text_column and col.lower() not in ['sentiment', 'sentiments']:\n",
+        "                    result[col] = row[col]\n",
+        "\n",
+        "            results.append(result)\n",
+        "\n",
+        "        print(f\"  Completed processing all {total_rows} reviews!\")\n",
+        "        return pd.DataFrame(results)\n",
+        "\n",
+        "# Main execution\n",
+        "def main():\n",
+        "    print(\"=\" * 60)\n",
+        "    print(\"RULE-BASED KEYWORD ASPECT ANNOTATOR\")\n",
+        "    print(\"Automated aspect annotation using keywords and context rules\")\n",
+        "    print(\"=\" * 60)\n",
+        "\n",
+        "    # Initialize annotator\n",
+        "    annotator = RuleBasedKeywordAnnotator('stopwords-new.txt')\n",
+        "\n",
+        "    # Read the dataset\n",
+        "    print(\"\\nReading dataset...\")\n",
+        "    df = pd.read_csv('SentiTaglish_ProductsAndServices.csv', encoding='utf-8')\n",
+        "    print(f\"Successfully read dataset with {len(df)} rows\")\n",
+        "\n",
+        "    # Identify review column\n",
+        "    review_column = None\n",
+        "    for col in df.columns:\n",
+        "        if 'review' in col.lower() or 'text' in col.lower() or 'comment' in col.lower():\n",
+        "            review_column = col\n",
+        "            break\n",
+        "\n",
+        "    if review_column is None:\n",
+        "        review_column = df.columns[0]  # Use first column as fallback\n",
+        "\n",
+        "    print(f\"Using column '{review_column}' for reviews\")\n",
+        "    print(f\"Dataset shape: {df.shape}\")\n",
+        "\n",
+        "    # Drop sentiment/sentiments column if exists\n",
+        "    columns_to_drop = [col for col in df.columns if col.lower() in ['sentiment', 'sentiments']]\n",
+        "    if columns_to_drop:\n",
+        "        df = df.drop(columns=columns_to_drop)\n",
+        "        print(f\"Dropped columns: {columns_to_drop}\")\n",
+        "\n",
+        "    # Process entire dataset\n",
+        "    annotated_df = annotator.process_dataset(df, text_column=review_column)\n",
+        "\n",
+        "    # Display sample results\n",
+        "    print(\"\\n\" + \"=\" * 60)\n",
+        "    print(\"SAMPLE ANNOTATION RESULTS (First 10)\")\n",
+        "    print(\"=\" * 60)\n",
+        "\n",
+        "    sample = annotated_df.head(10)\n",
+        "    for idx, row in sample.iterrows():\n",
+        "        print(f\"\\n[Review {row['review_id']}]\")\n",
+        "        print(f\"Text: {row['review'][:150]}...\")\n",
+        "        print(f\"General Aspects: {row['general_aspects']}\")\n",
+        "        print(f\"Specific Aspects: {row['specific_aspects']}\")\n",
+        "        if row['aspect_keywords'] != '{}':\n",
+        "            print(f\"Matched Keywords: {row['aspect_keywords']}\")\n",
+        "        print(\"-\" * 40)\n",
+        "\n",
+        "    # Save to CSV\n",
+        "    output_filename = 'annotated_reviews_rule_based.csv'\n",
+        "    annotated_df.to_csv(output_filename, index=False, encoding='utf-8')\n",
+        "    print(f\"\\n✅ Annotated data saved to: {output_filename}\")\n",
+        "    print(f\"Total reviews processed: {len(annotated_df)}\")\n",
+        "\n",
+        "if __name__ == \"__main__\":\n",
+        "    main()"
+      ]
+    }
+  ]
+}