| | """Text Processing Module for Maysat Method Implementation |
| | |
| | This module implements NLP-based fraud detection using transformers. |
| | Part of the multi-model ensemble: 40% Transformer + 40% ML + 20% Rules |
| | """ |
| |
|
| | from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification |
| | import torch |
| | import warnings |
| | warnings.filterwarnings('ignore') |
| |
|
class InsuranceTextProcessor:
    """Process insurance claim text using transformer models.

    Combines a transformer text classifier (when it can be loaded) with
    hand-crafted lexical features (urgency / emotional / vague word counts).
    """

    def __init__(self):
        """Load the transformer classifier; fall back to None on failure."""
        print("Initializing Text Processor...")
        try:
            # NOTE(review): distilbert-base-uncased is a *base* checkpoint —
            # its classification head is randomly initialized, so
            # LABEL_0/LABEL_1 are not meaningful fraud labels. Swap in a
            # fine-tuned fraud/sentiment checkpoint before trusting scores.
            self.fraud_classifier = pipeline(
                "text-classification",
                model="distilbert-base-uncased",
                device=-1,  # force CPU
            )
            print("✓ Fraud classifier loaded")
        except Exception as e:
            # Best-effort: the rest of the pipeline (lexical features)
            # still works without the transformer.
            print(f"Warning: Could not load fraud classifier: {e}")
            self.fraud_classifier = None

    def analyze_claim_text(self, claim_text):
        """Analyze claim text for fraud indicators.

        Args:
            claim_text (str): The claim description/notes.

        Returns:
            dict: 'fraud_score' (0-1), 'confidence' (0-1) and
            'text_features'; full analyses also include 'method'.
        """
        # Too little text to classify: neutral score, zero confidence.
        if not claim_text or len(claim_text.strip()) < 10:
            return {
                'fraud_score': 0.5,
                'confidence': 0.0,
                'text_features': self._extract_basic_features(claim_text or "")
            }

        fraud_score = 0.5
        confidence = 0.0

        if self.fraud_classifier is not None:
            try:
                # BUG FIX: the original sliced 512 *characters*, which does
                # not bound the *token* count and could still exceed the
                # model's 512-token limit. Let the tokenizer truncate.
                result = self.fraud_classifier(
                    claim_text, truncation=True, max_length=512
                )[0]
                # Map the binary label onto a fraud probability.
                fraud_score = result['score'] if result['label'] == 'LABEL_1' else 1 - result['score']
                confidence = result['score']
            except Exception as e:
                # Keep the neutral defaults on prediction failure.
                print(f"Prediction error: {e}")

        text_features = self._extract_text_features(claim_text)

        return {
            'fraud_score': fraud_score,
            'confidence': confidence,
            'text_features': text_features,
            'method': 'transformer'
        }

    def _extract_text_features(self, text):
        """Extract linguistic features (lexicon hits + length stats) from text."""
        text_lower = text.lower()

        # Pressure language often accompanies fraudulent claims.
        urgency_words = ['urgent', 'emergency', 'immediately', 'asap', 'critical']
        urgency_count = sum(1 for word in urgency_words if word in text_lower)

        # Emotional appeals.
        emotional_words = ['desperate', 'suffering', 'dying', 'helpless', 'tragedy']
        emotional_count = sum(1 for word in emotional_words if word in text_lower)

        # Vague/hedging language.
        vague_words = ['somehow', 'maybe', 'approximately', 'around', 'roughly']
        vague_count = sum(1 for word in vague_words if word in text_lower)

        return {
            'length': len(text),
            'word_count': len(text.split()),
            # Each score is the fraction of its 5-word lexicon present, capped at 1.
            'urgency_score': min(urgency_count / 5.0, 1.0),
            'emotional_score': min(emotional_count / 5.0, 1.0),
            'vague_score': min(vague_count / 5.0, 1.0),
            'has_numbers': any(char.isdigit() for char in text)
        }

    def _extract_basic_features(self, text):
        """Extract basic features when text is too short to classify."""
        return {
            'length': len(text),
            'word_count': len(text.split()) if text else 0,
            'urgency_score': 0.0,
            'emotional_score': 0.0,
            'vague_score': 0.0,
            # BUG FIX: was hard-coded False, contradicting
            # _extract_text_features for short texts containing digits.
            'has_numbers': any(char.isdigit() for char in text)
        }
| |
|
if __name__ == "__main__":
    # Quick manual smoke test over a few representative claim texts.
    processor = InsuranceTextProcessor()

    test_claims = [
        # legitimate-sounding claim
        "My car was damaged in an accident yesterday. The front bumper needs replacement.",
        # high-pressure language
        "URGENT! I need immediate payment for my claim. This is an emergency situation!",
        # vague / hedging wording
        "The incident happened somehow around 3pm. Maybe the damage is approximately $5000."
    ]

    i = 0
    for claim in test_claims:
        i += 1
        print(f"\nTest {i}: {claim[:50]}...")
        result = processor.analyze_claim_text(claim)
        print(f"Fraud Score: {result['fraud_score']:.3f}")
        print(f"Confidence: {result['confidence']:.3f}")
        print(f"Text Features: {result['text_features']}")
| |
|