# FraudSimulator-AI / text_processor.py
# Uploaded by BDR-AI — commit d98efb2 (verified):
# "Add transformer text processor for Maysat method (Step 1/5)"
"""Text Processing Module for Maysat Method Implementation
This module implements NLP-based fraud detection using transformers.
Part of the multi-model ensemble: 40% Transformer + 40% ML + 20% Rules
"""
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import warnings
warnings.filterwarnings('ignore')
class InsuranceTextProcessor:
    """Process insurance claim text using transformer models.

    Part of the Maysat multi-model ensemble (40% transformer + 40% ML +
    20% rules).  If the transformer cannot be loaded, the processor
    degrades gracefully to neutral scores plus hand-crafted text features.
    """

    # Keyword sets for the linguistic features, built once at class
    # definition time instead of on every call.
    _URGENCY_WORDS = frozenset({'urgent', 'emergency', 'immediately', 'asap', 'critical'})
    _EMOTIONAL_WORDS = frozenset({'desperate', 'suffering', 'dying', 'helpless', 'tragedy'})
    _VAGUE_WORDS = frozenset({'somehow', 'maybe', 'approximately', 'around', 'roughly'})

    def __init__(self):
        """Load the fraud-classification pipeline; fall back to None on failure."""
        print("Initializing Text Processor...")
        try:
            # NOTE(review): distilbert-base-uncased is a *base* checkpoint —
            # its sequence-classification head is randomly initialized, so
            # LABEL_0/LABEL_1 outputs are not meaningful fraud predictions
            # until a fine-tuned model is substituted here.
            self.fraud_classifier = pipeline(
                "text-classification",
                model="distilbert-base-uncased",
                device=-1  # CPU
            )
            print("✓ Fraud classifier loaded")
        except Exception as e:
            # Best-effort: keep the processor usable (feature extraction
            # still works) even when the model cannot be downloaded/loaded.
            print(f"Warning: Could not load fraud classifier: {e}")
            self.fraud_classifier = None

    def analyze_claim_text(self, claim_text):
        """Analyze claim text for fraud indicators.

        Args:
            claim_text (str): The claim description/notes.

        Returns:
            dict: 'fraud_score' (0-1), 'confidence' (0-1), 'text_features'
            (dict of linguistic features), and — for non-trivial input —
            'method': 'transformer'.
        """
        # Guard: missing or trivially short text gets a neutral score.
        if not claim_text or len(claim_text.strip()) < 10:
            return {
                'fraud_score': 0.5,
                'confidence': 0.0,
                'text_features': self._extract_basic_features(claim_text or "")
            }

        # Neutral defaults, used when no model is loaded or inference fails.
        fraud_score = 0.5
        confidence = 0.0
        if self.fraud_classifier:
            try:
                # Pre-trim to 512 *characters* to keep tokenization cheap;
                # truncation=True enforces the model's actual max token
                # length (the original comment claimed the slice limited
                # tokens — it limits characters).
                result = self.fraud_classifier(claim_text[:512], truncation=True)[0]
                # Map the binary label onto a 0-1 fraud score
                # (LABEL_1 = fraud, LABEL_0 = not fraud).
                fraud_score = result['score'] if result['label'] == 'LABEL_1' else 1 - result['score']
                confidence = result['score']
            except Exception as e:
                # Best-effort: fall back to the neutral defaults above.
                print(f"Prediction error: {e}")

        text_features = self._extract_text_features(claim_text)
        return {
            'fraud_score': fraud_score,
            'confidence': confidence,
            'text_features': text_features,
            'method': 'transformer'
        }

    def _extract_text_features(self, text):
        """Extract linguistic fraud-indicator features from text.

        Fix over the original: keyword hits are counted against whole
        (punctuation-stripped) words instead of raw substring containment,
        so e.g. 'turnaround' no longer counts as the vague word 'around'.
        Each score is the keyword hit count normalized by the list size
        (5), capped at 1.0.
        """
        # Whitespace-tokenize and strip common punctuation so that
        # 'emergency!' still matches the keyword 'emergency'.
        words = {w.strip('.,!?;:()"\'') for w in text.lower().split()}

        urgency_count = sum(1 for word in self._URGENCY_WORDS if word in words)
        emotional_count = sum(1 for word in self._EMOTIONAL_WORDS if word in words)
        vague_count = sum(1 for word in self._VAGUE_WORDS if word in words)

        return {
            'length': len(text),
            'word_count': len(text.split()),
            'urgency_score': min(urgency_count / 5.0, 1.0),
            'emotional_score': min(emotional_count / 5.0, 1.0),
            'vague_score': min(vague_count / 5.0, 1.0),
            'has_numbers': any(char.isdigit() for char in text)
        }

    def _extract_basic_features(self, text):
        """Return a zeroed feature dict for empty/too-short text."""
        return {
            'length': len(text),
            'word_count': len(text.split()) if text else 0,
            'urgency_score': 0.0,
            'emotional_score': 0.0,
            'vague_score': 0.0,
            'has_numbers': False
        }
if __name__ == "__main__":
    # Smoke-test: run a few representative claim texts through the
    # processor and print the resulting scores and features.
    processor = InsuranceTextProcessor()

    test_claims = (
        "My car was damaged in an accident yesterday. The front bumper needs replacement.",
        "URGENT! I need immediate payment for my claim. This is an emergency situation!",
        "The incident happened somehow around 3pm. Maybe the damage is approximately $5000.",
    )

    for i, claim in enumerate(test_claims, 1):
        print(f"\nTest {i}: {claim[:50]}...")
        analysis = processor.analyze_claim_text(claim)
        for label, key in (("Fraud Score", 'fraud_score'), ("Confidence", 'confidence')):
            print(f"{label}: {analysis[key]:.3f}")
        print(f"Text Features: {analysis['text_features']}")