File size: 4,493 Bytes
d98efb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""Text Processing Module for Maysat Method Implementation

This module implements NLP-based fraud detection using transformers.
Part of the multi-model ensemble: 40% Transformer + 40% ML + 20% Rules
"""

import re
import warnings

import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
warnings.filterwarnings('ignore')

class InsuranceTextProcessor:
    """Process insurance claim text using transformer models.

    Produces a fraud score in [0, 1] from a transformer text classifier and a
    set of simple lexical features (urgency / emotional / vague wording).
    Degrades gracefully: if the classifier cannot be loaded, only lexical
    features are produced and the fraud score stays at the neutral 0.5.
    """

    # Keyword lists for lexical scoring. Each score counts how many DISTINCT
    # keywords appear (whole-word matches) and is normalized by the list
    # length (5), capped at 1.0.
    URGENCY_WORDS = ('urgent', 'emergency', 'immediately', 'asap', 'critical')
    EMOTIONAL_WORDS = ('desperate', 'suffering', 'dying', 'helpless', 'tragedy')
    VAGUE_WORDS = ('somehow', 'maybe', 'approximately', 'around', 'roughly')

    def __init__(self):
        """Load the transformer classifier; fall back to None on failure."""
        print("Initializing Text Processor...")
        try:
            # NOTE(review): distilbert-base-uncased is a *base* checkpoint —
            # its classification head is randomly initialized, so LABEL_0 /
            # LABEL_1 outputs are not trained fraud predictions. Confirm the
            # intended fine-tuned model name.
            self.fraud_classifier = pipeline(
                "text-classification",
                model="distilbert-base-uncased",
                device=-1  # CPU
            )
            print("✓ Fraud classifier loaded")
        except Exception as e:
            # Best-effort: keep running with lexical features only.
            print(f"Warning: Could not load fraud classifier: {e}")
            self.fraud_classifier = None

    def analyze_claim_text(self, claim_text):
        """Analyze claim text for fraud indicators.

        Args:
            claim_text (str): The claim description/notes. May be None/empty.

        Returns:
            dict: 'fraud_score' (float, 0-1; 0.5 = neutral/unknown),
                  'confidence' (float, 0-1; 0.0 when no model prediction),
                  'text_features' (dict of lexical features), and — for
                  non-trivial input — 'method' set to 'transformer'.
        """
        # Too-short input carries no usable signal: return neutral defaults.
        if not claim_text or len(claim_text.strip()) < 10:
            return {
                'fraud_score': 0.5,
                'confidence': 0.0,
                'text_features': self._extract_basic_features(claim_text or "")
            }

        # Neutral defaults in case the classifier is missing or errors out.
        fraud_score = 0.5
        confidence = 0.0

        if self.fraud_classifier:
            try:
                # [:512] caps *characters* as a cheap cost guard; it does NOT
                # guarantee <=512 *tokens*, so truncation=True enforces the
                # model's real token limit and prevents overlong-input errors.
                result = self.fraud_classifier(claim_text[:512], truncation=True)[0]
                # Map the binary label to a 0-1 fraud score.
                # NOTE(review): assumes LABEL_1 == "fraud" — verify against
                # the deployed model's label mapping.
                fraud_score = result['score'] if result['label'] == 'LABEL_1' else 1 - result['score']
                confidence = result['score']
            except Exception as e:
                # Best-effort: fall back to the neutral defaults set above.
                print(f"Prediction error: {e}")

        text_features = self._extract_text_features(claim_text)

        return {
            'fraud_score': fraud_score,
            'confidence': confidence,
            'text_features': text_features,
            'method': 'transformer'
        }

    def _extract_text_features(self, text):
        """Extract linguistic fraud-indicator features from *text*."""
        # Tokenize into whole lowercase words so that e.g. 'around' does not
        # falsely match inside 'background' (the old substring test did).
        words = set(re.findall(r"[a-z]+", text.lower()))

        urgency_count = sum(word in words for word in self.URGENCY_WORDS)
        emotional_count = sum(word in words for word in self.EMOTIONAL_WORDS)
        vague_count = sum(word in words for word in self.VAGUE_WORDS)

        return {
            'length': len(text),
            'word_count': len(text.split()),
            'urgency_score': min(urgency_count / 5.0, 1.0),
            'emotional_score': min(emotional_count / 5.0, 1.0),
            'vague_score': min(vague_count / 5.0, 1.0),
            'has_numbers': any(char.isdigit() for char in text)
        }

    def _extract_basic_features(self, text):
        """Return zeroed features for empty/too-short text (no real signal)."""
        return {
            'length': len(text),
            'word_count': len(text.split()) if text else 0,
            'urgency_score': 0.0,
            'emotional_score': 0.0,
            'vague_score': 0.0,
            'has_numbers': False
        }

if __name__ == "__main__":
    # Smoke-test the processor against a few representative claim texts:
    # a plain claim, an urgency-laden one, and a vague one.
    sample_claims = (
        "My car was damaged in an accident yesterday. The front bumper needs replacement.",
        "URGENT! I need immediate payment for my claim. This is an emergency situation!",
        "The incident happened somehow around 3pm. Maybe the damage is approximately $5000.",
    )

    text_processor = InsuranceTextProcessor()
    for test_number, claim_text in enumerate(sample_claims, start=1):
        print(f"\nTest {test_number}: {claim_text[:50]}...")
        analysis = text_processor.analyze_claim_text(claim_text)
        print(f"Fraud Score: {analysis['fraud_score']:.3f}")
        print(f"Confidence: {analysis['confidence']:.3f}")
        print(f"Text Features: {analysis['text_features']}")