# FraudSimulator-AI / text_processor.py
# Uploaded by BDR-AI — commit d98efb2 (verified):
# "Add transformer text processor for Maysat method (Step 1/5)"
"""Text Processing Module for Maysat Method Implementation
This module implements NLP-based fraud detection using transformers.
Part of the multi-model ensemble: 40% Transformer + 40% ML + 20% Rules
"""
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import warnings
warnings.filterwarnings('ignore')
class InsuranceTextProcessor:
    """Process insurance claim text using transformer models.

    Part of the Maysat multi-model ensemble (40% transformer + 40% ML +
    20% rules).  If the transformer cannot be loaded, the processor
    degrades gracefully to neutral scores plus hand-crafted text features.
    """

    # Keyword sets for the linguistic features, built once at class
    # definition time instead of on every call.
    _URGENCY_WORDS = frozenset({'urgent', 'emergency', 'immediately', 'asap', 'critical'})
    _EMOTIONAL_WORDS = frozenset({'desperate', 'suffering', 'dying', 'helpless', 'tragedy'})
    _VAGUE_WORDS = frozenset({'somehow', 'maybe', 'approximately', 'around', 'roughly'})

    def __init__(self):
        """Load the fraud-classification pipeline; fall back to None on failure."""
        print("Initializing Text Processor...")
        try:
            # NOTE(review): distilbert-base-uncased is a *base* checkpoint —
            # its sequence-classification head is randomly initialized, so
            # LABEL_0/LABEL_1 outputs are not meaningful fraud predictions
            # until a fine-tuned model is substituted here.
            self.fraud_classifier = pipeline(
                "text-classification",
                model="distilbert-base-uncased",
                device=-1  # CPU
            )
            print("✓ Fraud classifier loaded")
        except Exception as e:
            # Best-effort: keep the processor usable (feature extraction
            # still works) even when the model cannot be downloaded/loaded.
            print(f"Warning: Could not load fraud classifier: {e}")
            self.fraud_classifier = None

    def analyze_claim_text(self, claim_text):
        """Analyze claim text for fraud indicators.

        Args:
            claim_text (str): The claim description/notes.

        Returns:
            dict: 'fraud_score' (0-1), 'confidence' (0-1), 'text_features'
            (dict of linguistic features), and — for non-trivial input —
            'method': 'transformer'.
        """
        # Guard: missing or trivially short text gets a neutral score.
        if not claim_text or len(claim_text.strip()) < 10:
            return {
                'fraud_score': 0.5,
                'confidence': 0.0,
                'text_features': self._extract_basic_features(claim_text or "")
            }

        # Neutral defaults, used when no model is loaded or inference fails.
        fraud_score = 0.5
        confidence = 0.0
        if self.fraud_classifier:
            try:
                # Pre-trim to 512 *characters* to keep tokenization cheap;
                # truncation=True enforces the model's actual max token
                # length (the original comment claimed the slice limited
                # tokens — it limits characters).
                result = self.fraud_classifier(claim_text[:512], truncation=True)[0]
                # Map the binary label onto a 0-1 fraud score
                # (LABEL_1 = fraud, LABEL_0 = not fraud).
                fraud_score = result['score'] if result['label'] == 'LABEL_1' else 1 - result['score']
                confidence = result['score']
            except Exception as e:
                # Best-effort: fall back to the neutral defaults above.
                print(f"Prediction error: {e}")

        text_features = self._extract_text_features(claim_text)
        return {
            'fraud_score': fraud_score,
            'confidence': confidence,
            'text_features': text_features,
            'method': 'transformer'
        }

    def _extract_text_features(self, text):
        """Extract linguistic fraud-indicator features from text.

        Fix over the original: keyword hits are counted against whole
        (punctuation-stripped) words instead of raw substring containment,
        so e.g. 'turnaround' no longer counts as the vague word 'around'.
        Each score is the keyword hit count normalized by the list size
        (5), capped at 1.0.
        """
        # Whitespace-tokenize and strip common punctuation so that
        # 'emergency!' still matches the keyword 'emergency'.
        words = {w.strip('.,!?;:()"\'') for w in text.lower().split()}

        urgency_count = sum(1 for word in self._URGENCY_WORDS if word in words)
        emotional_count = sum(1 for word in self._EMOTIONAL_WORDS if word in words)
        vague_count = sum(1 for word in self._VAGUE_WORDS if word in words)

        return {
            'length': len(text),
            'word_count': len(text.split()),
            'urgency_score': min(urgency_count / 5.0, 1.0),
            'emotional_score': min(emotional_count / 5.0, 1.0),
            'vague_score': min(vague_count / 5.0, 1.0),
            'has_numbers': any(char.isdigit() for char in text)
        }

    def _extract_basic_features(self, text):
        """Return a zeroed feature dict for empty/too-short text."""
        return {
            'length': len(text),
            'word_count': len(text.split()) if text else 0,
            'urgency_score': 0.0,
            'emotional_score': 0.0,
            'vague_score': 0.0,
            'has_numbers': False
        }
if __name__ == "__main__":
    # Smoke-test: run a few representative claim texts through the
    # processor and print the resulting scores and features.
    processor = InsuranceTextProcessor()

    test_claims = (
        "My car was damaged in an accident yesterday. The front bumper needs replacement.",
        "URGENT! I need immediate payment for my claim. This is an emergency situation!",
        "The incident happened somehow around 3pm. Maybe the damage is approximately $5000.",
    )

    for i, claim in enumerate(test_claims, 1):
        print(f"\nTest {i}: {claim[:50]}...")
        analysis = processor.analyze_claim_text(claim)
        for label, key in (("Fraud Score", 'fraud_score'), ("Confidence", 'confidence')):
            print(f"{label}: {analysis[key]:.3f}")
        print(f"Text Features: {analysis['text_features']}")