# File size: 1,978 Bytes
# eb6b502
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
"""
test_multilingual_anomaly.py
Test the multilingual anomaly detection fix.
"""
import sys
from pathlib import Path

# On Windows, switch stdout to UTF-8 before anything is printed
# (presumably so non-ASCII output does not fail on a legacy console
# code page -- confirm against the environments this is run in).
if sys.platform == 'win32':
    sys.stdout.reconfigure(encoding='utf-8')

# Put the current working directory, resolved to an absolute path, at the
# front of the import path. The `src.` import that follows relies on it,
# so the script is expected to be launched from the directory holding `src`.
_working_dir = Path('.').resolve()
sys.path.insert(0, str(_working_dir))

from src.graphs.vectorizationAgentGraph import graph
from datetime import datetime

# Mixed-language fixture for one batch. The post_id prefixes mark the
# intended language of each post: EN_ = English, SI_ = Sinhala, TA_ = Tamil.
test_texts = [
    {"text": "URGENT: Massive landslide in Ratnapura!", "post_id": "EN_001"},
    {"text": "Normal stock market day", "post_id": "EN_002"},
    {"text": "ආර්ථික අර්බුදය නිසා ජනතාව දුෂ්කරතාවන්ට මුහුණ දෙයි", "post_id": "SI_001"},
    {"text": "கொழும்பில் பெரும் மழை பெய்தது", "post_id": "TA_001"},
    {"text": "Breaking news about corruption scandal", "post_id": "EN_003"},
]

# Run the whole fixture through the graph in a single invocation. The batch
# id is a local-time timestamp (YYYYMMDD_HHMMSS) taken at call time.
result = graph.invoke({
    "input_texts": test_texts,
    "batch_id": datetime.now().strftime("%Y%m%d_%H%M%S"),
})

print("=" * 60)
print("MULTILINGUAL ANOMALY DETECTION TEST")
print("=" * 60)

# `dict.get(key, default)` only applies the default when the key is absent.
# The `or ...` fallbacks below also cover a key that is present but holds
# None, so the report keeps printing instead of raising part-way through.
anomaly_results = result.get("anomaly_results") or {}
print(f"\nStatus: {anomaly_results.get('status')}")
print(f"Model: {anomaly_results.get('model_used')}")
print(f"Total analyzed: {anomaly_results.get('total_analyzed')}")

anomalies = anomaly_results.get("anomalies") or []
print(f"\nAnomalies found: {len(anomalies)}")
for a in anomalies:
    method = a.get("detection_method", "unknown")
    # A None score cannot be formatted with `.2f`; report it as 0.00.
    score = a.get("anomaly_score") or 0
    print(f"  - {a.get('post_id')}: {a.get('language')} | method: {method} | score: {score:.2f}")

lang_results = result.get("language_detection_results") or []
print("\nLanguage Detection:")
for lr in lang_results:
    # Same None guard as for the anomaly score above.
    confidence = lr.get("confidence") or 0
    print(f"  - {lr.get('post_id')}: {lr.get('language')} (conf: {confidence:.2f})")

# Summary
# Informational only: this states the intended routing per language. Nothing
# in this script asserts it; compare it with the "method" column above.
print("\n" + "=" * 60)
print("The fix ensures:")
print("  - English texts: Isolation Forest ML model")
print("  - Sinhala/Tamil: Magnitude-based heuristic (avoids false positives)")
print("=" * 60)