BDR-AI committed on
Commit
d98efb2
·
verified ·
1 Parent(s): 44f8189

Add transformer text processor for Maysat method (Step 1/5)

Browse files

Implementing Maysat method multi-model ensemble - Phase 1:

- Created InsuranceTextProcessor class with transformer pipeline
- Extracts linguistic features: urgency, emotional, vague language
- Analyzes claim text for fraud indicators
- Returns fraud score (0-1) with confidence level
- Includes test cases for validation

Next: Integrate with fraud_engine.py for hybrid scoring (40% Transformer + 40% ML + 20% Rules)

Files changed (1) hide show
  1. text_processor.py +119 -0
text_processor.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Text Processing Module for Maysat Method Implementation

This module implements NLP-based fraud detection using transformers.
Part of the multi-model ensemble: 40% Transformer + 40% ML + 20% Rules
"""

# NOTE(review): AutoTokenizer, AutoModelForSequenceClassification and torch
# are imported but not referenced in this file — presumably reserved for a
# later step of the Maysat integration; confirm before removing.
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import warnings
# NOTE(review): this blanket filter silences *every* warning category for any
# process that imports this module, not just transformers startup noise —
# consider narrowing it to specific warning categories.
warnings.filterwarnings('ignore')
class InsuranceTextProcessor:
    """Process insurance claim text using transformer models.

    Produces a fraud score in [0, 1] plus simple lexical features
    (urgency / emotional-manipulation / vague wording counts) extracted
    from the claim text. Part of the Maysat ensemble:
    40% Transformer + 40% ML + 20% Rules.
    """

    def __init__(self):
        """Load the transformer text-classification pipeline on CPU.

        On any load failure the processor degrades gracefully:
        ``self.fraud_classifier`` is set to None and analysis falls back
        to the neutral score 0.5 with lexical features only.
        """
        print("Initializing Text Processor...")
        try:
            # NOTE(review): "distilbert-base-uncased" is a *base* checkpoint;
            # a text-classification head stacked on it is randomly
            # initialized, so its scores are not meaningful fraud
            # predictions until the model is fine-tuned — replace with a
            # fine-tuned checkpoint before relying on fraud_score.
            self.fraud_classifier = pipeline(
                "text-classification",
                model="distilbert-base-uncased",
                device=-1  # -1 = run on CPU
            )
            print("✓ Fraud classifier loaded")
        except Exception as e:
            # Best-effort degradation (e.g. no network / missing weights):
            # downstream code checks for None and skips the model.
            print(f"Warning: Could not load fraud classifier: {e}")
            self.fraud_classifier = None

    def analyze_claim_text(self, claim_text):
        """Analyze claim text for fraud indicators.

        Args:
            claim_text (str): The claim description/notes. May be None
                or empty; inputs shorter than 10 stripped characters are
                treated as "too short to score".

        Returns:
            dict: ``fraud_score`` (float in [0, 1]; 0.5 = neutral/unknown),
            ``confidence`` (float), ``text_features`` (dict of lexical
            features) and, for full analyses, ``method`` ('transformer').
        """
        # Too little text to analyze -> neutral score, basic features only.
        if not claim_text or len(claim_text.strip()) < 10:
            return {
                'fraud_score': 0.5,
                'confidence': 0.0,
                'text_features': self._extract_basic_features(claim_text or "")
            }

        # Defaults used when the model is unavailable or errors out.
        fraud_score = 0.5
        confidence = 0.0

        if self.fraud_classifier:
            try:
                # Truncate to 512 *characters* (not tokens — the original
                # comment was wrong) to keep the input safely within the
                # model's sequence limit.
                result = self.fraud_classifier(claim_text[:512])[0]
                # Map the binary label to a 0-1 fraud probability.
                # NOTE(review): assumes the model emits LABEL_0/LABEL_1 with
                # LABEL_1 == fraud — verify against the deployed checkpoint,
                # which may use different label names.
                fraud_score = result['score'] if result['label'] == 'LABEL_1' else 1 - result['score']
                confidence = result['score']
            except Exception as e:
                # Best-effort: keep the neutral defaults set above.
                print(f"Prediction error: {e}")

        # Lexical features are computed regardless of model availability.
        text_features = self._extract_text_features(claim_text)

        return {
            'fraud_score': fraud_score,
            'confidence': confidence,
            'text_features': text_features,
            'method': 'transformer'
        }

    def _extract_text_features(self, text):
        """Extract linguistic features from text.

        Each *_score is a keyword-hit count scaled by 1/5 and capped at
        1.0. Matching is case-insensitive substring matching, so e.g.
        'urgently' also counts as a hit for 'urgent'.
        """
        text_lower = text.lower()

        # Urgency indicators (pressure to pay out fast).
        urgency_words = ['urgent', 'emergency', 'immediately', 'asap', 'critical']
        urgency_count = sum(1 for word in urgency_words if word in text_lower)

        # Emotional manipulation cues.
        emotional_words = ['desperate', 'suffering', 'dying', 'helpless', 'tragedy']
        emotional_count = sum(1 for word in emotional_words if word in text_lower)

        # Vague / non-committal language.
        vague_words = ['somehow', 'maybe', 'approximately', 'around', 'roughly']
        vague_count = sum(1 for word in vague_words if word in text_lower)

        return {
            'length': len(text),
            'word_count': len(text.split()),
            'urgency_score': min(urgency_count / 5.0, 1.0),
            'emotional_score': min(emotional_count / 5.0, 1.0),
            'vague_score': min(vague_count / 5.0, 1.0),
            'has_numbers': any(char.isdigit() for char in text)
        }

    def _extract_basic_features(self, text):
        """Extract basic features when text is too short to score.

        Linguistic scores are fixed at 0.0 (not enough signal), but
        length/word-count/digit-presence reflect the actual text.
        """
        return {
            'length': len(text),
            'word_count': len(text.split()) if text else 0,
            'urgency_score': 0.0,
            'emotional_score': 0.0,
            'vague_score': 0.0,
            # Bug fix: was hardcoded False even when the short text
            # contained digits, contradicting _extract_text_features.
            'has_numbers': any(char.isdigit() for char in text)
        }
if __name__ == "__main__":
    # Smoke-test the processor on three representative claim texts:
    # a plain factual claim, an urgency-laden one, and a vague one.
    text_processor = InsuranceTextProcessor()

    sample_claims = [
        "My car was damaged in an accident yesterday. The front bumper needs replacement.",
        "URGENT! I need immediate payment for my claim. This is an emergency situation!",
        "The incident happened somehow around 3pm. Maybe the damage is approximately $5000."
    ]

    case_number = 0
    for sample in sample_claims:
        case_number += 1
        print(f"\nTest {case_number}: {sample[:50]}...")
        analysis = text_processor.analyze_claim_text(sample)
        print(f"Fraud Score: {analysis['fraud_score']:.3f}")
        print(f"Confidence: {analysis['confidence']:.3f}")
        print(f"Text Features: {analysis['text_features']}")