File size: 13,687 Bytes
6832b36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Claim Classifier
----------------

Classifies claims based on priority index data, sentiment analysis, and content patterns.
Also provides functions for classifying claims into categories using a fine-tuned model.
"""

# Path to the locally fine-tuned model
LOCAL_MODEL_PATH = "./models/finetuned_classification"

# Hugging Face model name (fallback)
MODEL_NAME = "rmtariq/malay_classification"

# Categories from the new dataset.
# classify_claim() indexes this list by predicted class id, so the order of
# the entries matters and must not be changed casually.
CATEGORIES = [
    "Politik",
    "Perpaduan",
    "Keluarga",
    "Belia",
    "Perumahan",
    "Internet",
    "Pengguna",
    "Makanan",
    "Pekerjaan",
    "Pengangkutan",
    "Sukan",
    "Ekonomi",
    "Hiburan",
    "Jenayah",
    "Alam Sekitar",
    "Teknologi",
    "Pendidikan",
    "Agama",
    "Sosial",
    "Kesihatan",
    "Halal",
]

import json
import os
import re
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer


# Keyword rules for claims that the model might not handle correctly.
# Each entry is (compiled pattern, category, confidence). The patterns are
# compiled once at import time instead of being rebuilt on every call, and
# they are checked in order: the first rule that matches wins.
_SPECIFIC_CLAIM_RULES = tuple(
    (re.compile(pattern), category, confidence)
    for pattern, category, confidence in (
        (r"ketua polis|kpn|tan sri razarudin|saman|ugutan", "Jenayah", 0.95),
        (r"zakat fitrah|zakat|beras|dimakan", "Agama", 0.95),
        (r"kerajaan.+cukai|cukai.+minyak sawit|minyak sawit mentah", "Ekonomi", 0.95),
        (r"kanta lekap|dijual.+dalam talian|online", "Pengguna", 0.95),
        (r"kelongsong|peluru|dijajah|musuh", "Politik", 0.95),
    )
)


def classify_specific_claims(claim):
    """
    Classify specific claims that the model might not handle correctly.

    The claim is lowercased and searched against a fixed list of keyword
    patterns. Patterns match anywhere in the text (no word boundaries), and
    the first matching rule decides the result.

    Args:
        claim (str): The claim text to classify

    Returns:
        tuple: (category, confidence) or (None, None) if not a specific claim.
        Empty, None, or non-string input is treated as "not a specific claim"
        instead of raising.
    """
    # Guard against None / non-text input so callers get the documented
    # "no match" result rather than an AttributeError from .lower().
    if not isinstance(claim, str) or not claim:
        return None, None

    claim_lower = claim.lower()

    for pattern, category, confidence in _SPECIFIC_CLAIM_RULES:
        if pattern.search(claim_lower):
            return category, confidence

    # If no match, return None
    return None, None
def load_model():
    """
    Load the classification model and tokenizer.

    Sources are tried in order, moving on to the next one whenever loading
    fails:

    1. The local fine-tuned checkpoint at LOCAL_MODEL_PATH (only if the path
       exists).
    2. The Hugging Face model MODEL_NAME.
    3. "bert-base-multilingual-cased" with ``len(CATEGORIES)`` labels.

    Returns:
        tuple: (model, tokenizer)

    Raises:
        Exception: whatever the transformers loaders raise if even the final
        base-model fallback cannot be loaded.
    """
    # Build the ordered list of (source, progress message) pairs to attempt.
    # A local checkpoint that exists but fails to load must still fall back
    # to the Hugging Face model before resorting to the base model.
    if os.path.exists(LOCAL_MODEL_PATH):
        candidates = [
            (LOCAL_MODEL_PATH, f"Loading model from local path: {LOCAL_MODEL_PATH}"),
            (MODEL_NAME, f"Loading from Hugging Face: {MODEL_NAME}"),
        ]
    else:
        candidates = [
            (MODEL_NAME, f"Local model not found. Loading from Hugging Face: {MODEL_NAME}"),
        ]

    for source, message in candidates:
        print(message)
        try:
            tokenizer = AutoTokenizer.from_pretrained(source)
            model = AutoModelForSequenceClassification.from_pretrained(source)
        except Exception as e:
            # Deliberately broad: loading is best-effort and any failure
            # (e.g. missing files, no network, incompatible checkpoint)
            # should simply advance to the next source.
            print(f"Error loading model: {str(e)}")
        else:
            return model, tokenizer

    # Fall back to bert-base-multilingual-cased if all else fails.
    # NOTE(review): this base checkpoint is presumably not fine-tuned for
    # CATEGORIES, so its predictions should not be trusted - confirm whether
    # callers should be told that the fallback was used.
    print("Falling back to bert-base-multilingual-cased")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-multilingual-cased",
        num_labels=len(CATEGORIES)
    )
    return model, tokenizer


def _label_for_class_id(model, class_id):
    """Return the category name for *class_id*, or None if it cannot be resolved."""
    config = getattr(model, "config", None)
    id2label = getattr(config, "id2label", None) or {}

    # Serialized configs may carry string keys, so try both key forms.
    label = id2label.get(class_id, id2label.get(str(class_id)))

    # Transformers auto-generates placeholder names of the form "LABEL_<n>"
    # when a checkpoint has no real label names; treat those as "no label"
    # so the CATEGORIES fallback below is actually reachable.
    if label is not None and not re.fullmatch(r"LABEL_\d+", str(label)):
        return label

    # Fall back to our CATEGORIES list
    if 0 <= class_id < len(CATEGORIES):
        return CATEGORIES[class_id]

    return None


def classify_claim(claim, model=None, tokenizer=None):
    """
    Classify a claim into one of the categories.

    Keyword rules from classify_specific_claims() are consulted first; the
    model is only run when none of them match. If either ``model`` or
    ``tokenizer`` is missing, both are loaded via load_model().

    Args:
        claim (str): The claim text to classify
        model: Optional pre-loaded model
        tokenizer: Optional pre-loaded tokenizer

    Returns:
        tuple: (category, confidence). When the predicted class id cannot be
        mapped to a category name, ("Lain-lain", 0.0) is returned.
    """
    # First check if it's a specific claim
    category, confidence = classify_specific_claims(claim)
    if category is not None:
        return category, confidence

    # If not a specific claim, use the model
    if model is None or tokenizer is None:
        model, tokenizer = load_model()

    # Prepare the input (inputs longer than 128 tokens are truncated)
    inputs = tokenizer(claim, return_tensors="pt", truncation=True, max_length=128)

    # Get the prediction without tracking gradients
    with torch.no_grad():
        logits = model(**inputs).logits

    # Class probabilities for the single input in the batch
    probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]
    predicted_class_id = int(probabilities.argmax().item())
    confidence = probabilities[predicted_class_id].item()

    # Map to category
    category = _label_for_class_id(model, predicted_class_id)
    if category is None:
        # The predicted class ID is unknown to both the model's label map and
        # CATEGORIES, so fall back to a default category.
        return "Lain-lain", 0.0

    return category, confidence
def classify(priority_data):
    """
    Classify a claim based on priority data.

    Recognised keys of ``priority_data``:

    * ``priority_flags`` - dict of 0/1 flags (``economic_impact``,
      ``affects_government``, ``law_related``, ``cause_confusion``). When the
      key is absent, ``priority_data`` itself is treated as the flag dict.
    * ``priority_score`` - numeric score. When absent (or None) it defaults
      to the sum of the numeric flag values.
    * ``sentiment_counts`` - counts keyed by "positive"/"negative"/"neutral"
      or by 1/2/0 (ints or strings).
    * ``claim`` - the claim text, used for keyword-based special cases.

    Args:
        priority_data (dict): Dictionary containing priority flags and other data

    Returns:
        str: Classification verdict (TRUE, FALSE, PARTIALLY_TRUE, UNVERIFIED)

    Raises:
        ValueError: If ``priority_data`` is not a dictionary.
    """
    if not isinstance(priority_data, dict):
        raise ValueError("Input must be a dictionary containing priority flags.")

    # Flags are either nested under "priority_flags" or the dictionary itself
    # contains them directly.
    priority_flags = priority_data.get("priority_flags", priority_data) or {}

    # Normalise sentiment keys to strings so 1/2/0 and "1"/"2"/"0" both work.
    sentiment_counts = {
        str(key): value
        for key, value in (priority_data.get("sentiment_counts") or {}).items()
    }

    # Only compute the fallback score when no score was supplied, and only
    # over numeric values: when the flags live in the top-level dict it also
    # holds non-numeric entries such as "claim" or "sentiment_counts".
    priority_score = priority_data.get("priority_score")
    if priority_score is None:
        priority_score = sum(
            value for value in priority_flags.values()
            if isinstance(value, (int, float))
        )

    # A missing or None claim is treated as empty text.
    claim = (priority_data.get("claim") or "").lower()

    # Check for specific claim patterns
    is_azan_claim = any(word in claim for word in ["azan", "larang", "masjid", "pembesar suara"])
    is_religious_claim = any(word in claim for word in ["islam", "agama", "masjid", "surau", "sembahyang", "solat", "zakat"])

    economic_related = priority_flags.get("economic_impact", 0) == 1
    government_related = priority_flags.get("affects_government", 0) == 1
    law_related = priority_flags.get("law_related", 0) == 1
    causes_confusion = priority_flags.get("cause_confusion", 0) == 1

    # Negative sentiment dominates when it strictly exceeds both other counts.
    negative_dominant = False
    if sentiment_counts:
        pos = int(sentiment_counts.get("positive", sentiment_counts.get("1", 0)))
        neg = int(sentiment_counts.get("negative", sentiment_counts.get("2", 0)))
        neu = int(sentiment_counts.get("neutral", sentiment_counts.get("0", 0)))
        negative_dominant = neg > pos and neg > neu

    # Special case for azan claim: a claim about banning azan is false.
    # NOTE(review): "larangan" contains "larang", so is_azan_claim is always
    # true here and the rule fires for any religious claim mentioning
    # "larangan" - confirm whether an explicit "azan" keyword should be required.
    if is_azan_claim and is_religious_claim and "larangan" in claim:
        return "FALSE"

    # Determine verdict based on multiple factors
    if priority_score >= 7.0 and negative_dominant and (government_related or law_related):
        return "FALSE"
    if priority_score >= 5.0 and causes_confusion:
        return "PARTIALLY_TRUE"
    if priority_score <= 3.0 and not negative_dominant:
        return "TRUE"
    if economic_related and government_related:
        # Special case for economic policies by government
        if negative_dominant:
            return "FALSE"
        if causes_confusion:
            return "PARTIALLY_TRUE"
        return "TRUE"
    return "UNVERIFIED"

def get_verdict(priority_data):
    """
    Get verdict from priority data, which can be a file path or dictionary.

    A path (``str`` or ``os.PathLike``) is read as a UTF-8 JSON file. Any
    problem along the way - missing file, unreadable or invalid JSON, or
    content that is not a JSON object - is reported with a printed warning
    and yields "UNVERIFIED" rather than an exception.

    Args:
        priority_data (str or dict): File path to JSON or dictionary with priority data

    Returns:
        str: Classification verdict
    """
    if isinstance(priority_data, (str, os.PathLike)):
        if not os.path.exists(priority_data):
            print(f"⚠️ Warning: File not found: {priority_data}")
            return "UNVERIFIED"
        try:
            # JSON is UTF-8 by specification; do not depend on the platform's
            # locale encoding when reading it.
            with open(priority_data, "r", encoding="utf-8") as f:
                priority_data = json.load(f)
        except Exception as e:
            # Deliberately broad: reading is best-effort, so any I/O or
            # parsing failure degrades to an UNVERIFIED verdict.
            print(f"⚠️ Error reading file: {e}")
            return "UNVERIFIED"

    if not isinstance(priority_data, dict):
        print("⚠️ Warning: Input is not a dictionary")
        return "UNVERIFIED"

    return classify(priority_data)

def get_verdict_explanation(verdict):
    """
    Translate a verdict code into display text and a display colour.

    Args:
        verdict (str): Classification verdict

    Returns:
        tuple: (explanation text, color) where color is a hex colour string.
        Any verdict other than TRUE, FALSE or PARTIALLY_TRUE gets the
        UNVERIFIED explanation.
    """
    known_verdicts = (
        # Green
        ("TRUE", ("Claim appears to be factually accurate based on available data and sentiment analysis.", "#009933")),
        # Red
        ("FALSE", ("Claim appears to be false based on available data and sentiment analysis.", "#FF0000")),
        # Amber
        ("PARTIALLY_TRUE", ("Claim contains a mix of accurate and inaccurate information based on available data.", "#FFCC00")),
    )

    for code, explanation_and_color in known_verdicts:
        if verdict == code:
            return explanation_and_color

    # UNVERIFIED (Blue)
    return ("Insufficient data to verify this claim. More information is needed.", "#0099CC")

# Example CLI usage:
def _run_sample_claims():
    """Classify a few built-in sample claims and print the results."""
    print("\n[🧪] Testing classification model with sample claims:")
    test_claims = [
        "Projek mega kerajaan penuh dengan ketirisan.",
        "Harga barang keperluan naik setiap bulan.",
        "Program vaksinasi tidak mencakupi golongan luar bandar.",
        "Makanan di hotel lima bintang tidak jelas status halalnya."
    ]

    # Load once and reuse for every sample claim.
    model, tokenizer = load_model()

    for claim in test_claims:
        category, confidence = classify_claim(claim, model, tokenizer)
        print(f"Claim: {claim}")
        print(f"Category: {category}")
        print(f"Confidence: {confidence:.4f}")
        print("-" * 50)


def _verdict_for_claim_id(claim_id, db_path):
    """Compute the verdict for a stored claim; any failure yields "UNVERIFIED"."""
    try:
        # Import only if needed
        from priority_indexer import calculate_priority_from_db
    except ImportError:
        print("[⚠️] Warning: priority_indexer module not found")
        return "UNVERIFIED"

    print(f"[📥] Calculating priority for claim ID: {claim_id}")
    try:
        priority_data = calculate_priority_from_db(claim_id, db_path)
        return classify(priority_data) if priority_data else "UNVERIFIED"
    except Exception as e:
        # Best-effort CLI path: report the problem and fall back to UNVERIFIED.
        print(f"[❌] Error: {e}")
        return "UNVERIFIED"


def main(argv=None):
    """
    Command-line entry point.

    Modes, checked in this order:

    * ``--claim TEXT``: classify the claim text into a category.
    * ``--category`` without ``--claim``: run the built-in sample claims.
    * ``--json PATH``: compute a verdict from a priority JSON file.
    * ``--claim-id ID`` (with optional ``--db``): compute a verdict for a
      stored claim via the ``priority_indexer`` module.

    Args:
        argv (list[str] | None): Argument list; None means ``sys.argv[1:]``.

    Returns:
        int: Process exit code (0 on success, 1 when no mode was selected).
    """
    import argparse

    parser = argparse.ArgumentParser(description="Classify a claim based on priority data or category")
    parser.add_argument("--json", help="Path to priority JSON file")
    parser.add_argument("--claim-id", type=int, help="Claim ID to analyze")
    parser.add_argument("--db", default="data/claims.db", help="Path to database file")
    parser.add_argument("--claim", help="Claim text to classify into a category")
    parser.add_argument(
        "--category",
        action="store_true",
        help="Classify claim into a category (without --claim, runs the built-in sample claims)",
    )

    args = parser.parse_args(argv)

    if args.claim:
        # Use the new classification model
        print(f"[📥] Classifying claim: {args.claim}")
        category, confidence = classify_claim(args.claim)
        print(f"[🏁] Category: {category}")
        print(f"[📊] Confidence: {confidence:.4f}")
        return 0

    if args.category:
        # Previously this combination exited with an error before the sample
        # run could ever execute, leaving that code unreachable.
        _run_sample_claims()
        return 0

    if args.json:
        print(f"[📥] Reading priority flags from: {args.json}")
        verdict = get_verdict(args.json)
    elif args.claim_id is not None:
        # Compare against None so that a claim ID of 0 is still accepted.
        verdict = _verdict_for_claim_id(args.claim_id, args.db)
    else:
        print("[❌] Error: Either --json, --claim-id, or --claim with --category must be provided")
        return 1

    explanation, _color = get_verdict_explanation(verdict)
    print(f"[🏁] Final Verdict: {verdict}")
    print(f"[📝] Explanation: {explanation}")
    return 0


if __name__ == "__main__":
    import sys

    sys.exit(main())