"""Text Processing Module for Maysat Method Implementation This module implements NLP-based fraud detection using transformers. Part of the multi-model ensemble: 40% Transformer + 40% ML + 20% Rules """ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification import torch import warnings warnings.filterwarnings('ignore') class InsuranceTextProcessor: """Process insurance claim text using transformer models""" def __init__(self): print("Initializing Text Processor...") try: # Load fraud detection model self.fraud_classifier = pipeline( "text-classification", model="distilbert-base-uncased", device=-1 # CPU ) print("✓ Fraud classifier loaded") except Exception as e: print(f"Warning: Could not load fraud classifier: {e}") self.fraud_classifier = None def analyze_claim_text(self, claim_text): """Analyze claim text for fraud indicators Args: claim_text (str): The claim description/notes Returns: dict: Analysis results with fraud score and features """ if not claim_text or len(claim_text.strip()) < 10: return { 'fraud_score': 0.5, 'confidence': 0.0, 'text_features': self._extract_basic_features(claim_text or "") } # Get transformer prediction fraud_score = 0.5 confidence = 0.0 if self.fraud_classifier: try: result = self.fraud_classifier(claim_text[:512])[0] # Limit to 512 tokens # Normalize score to 0-1 range fraud_score = result['score'] if result['label'] == 'LABEL_1' else 1 - result['score'] confidence = result['score'] except Exception as e: print(f"Prediction error: {e}") # Extract text features text_features = self._extract_text_features(claim_text) return { 'fraud_score': fraud_score, 'confidence': confidence, 'text_features': text_features, 'method': 'transformer' } def _extract_text_features(self, text): """Extract linguistic features from text""" text_lower = text.lower() # Urgency indicators urgency_words = ['urgent', 'emergency', 'immediately', 'asap', 'critical'] urgency_count = sum(1 for word in urgency_words if word in text_lower) # Emotional manipulation emotional_words = ['desperate', 'suffering', 'dying', 'helpless', 'tragedy'] emotional_count = sum(1 for word in emotional_words if word in text_lower) # Vague language vague_words = ['somehow', 'maybe', 'approximately', 'around', 'roughly'] vague_count = sum(1 for word in vague_words if word in text_lower) return { 'length': len(text), 'word_count': len(text.split()), 'urgency_score': min(urgency_count / 5.0, 1.0), 'emotional_score': min(emotional_count / 5.0, 1.0), 'vague_score': min(vague_count / 5.0, 1.0), 'has_numbers': any(char.isdigit() for char in text) } def _extract_basic_features(self, text): """Extract basic features when text is too short""" return { 'length': len(text), 'word_count': len(text.split()) if text else 0, 'urgency_score': 0.0, 'emotional_score': 0.0, 'vague_score': 0.0, 'has_numbers': False } if __name__ == "__main__": # Test the processor processor = InsuranceTextProcessor() test_claims = [ "My car was damaged in an accident yesterday. The front bumper needs replacement.", "URGENT! I need immediate payment for my claim. This is an emergency situation!", "The incident happened somehow around 3pm. Maybe the damage is approximately $5000." ] for i, claim in enumerate(test_claims, 1): print(f"\nTest {i}: {claim[:50]}...") result = processor.analyze_claim_text(claim) print(f"Fraud Score: {result['fraud_score']:.3f}") print(f"Confidence: {result['confidence']:.3f}") print(f"Text Features: {result['text_features']}")