"""Text Processing Module for Maysat Method Implementation

This module implements NLP-based fraud detection using transformers.
Part of the multi-model ensemble: 40% Transformer + 40% ML + 20% Rules
"""

import re
import warnings

import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
# Install a catch-all "ignore" filter: every Python warning raised after this
# call is suppressed process-wide, not only warnings from this module.
warnings.filterwarnings('ignore')

class InsuranceTextProcessor:
    """Score insurance claim text for fraud indicators.

    Combines the output of a Hugging Face ``text-classification`` pipeline
    with simple keyword-based linguistic features (urgency, emotional and
    vague wording). If the pipeline cannot be loaded or a prediction fails,
    a neutral score of 0.5 with confidence 0.0 is returned instead of raising.
    """

    # Label the classifier emits for the "fraud" class; instances may override
    # it through the ``positive_label`` constructor argument.
    positive_label = "LABEL_1"

    # Claims whose stripped text is shorter than this are not scored.
    MIN_TEXT_LENGTH = 10

    # Maximum sequence length (in tokens) handed to the tokenizer.
    MAX_TOKENS = 512

    # Keyword lists behind the linguistic scores.
    URGENCY_WORDS = ('urgent', 'emergency', 'immediately', 'asap', 'critical')
    EMOTIONAL_WORDS = ('desperate', 'suffering', 'dying', 'helpless', 'tragedy')
    VAGUE_WORDS = ('somehow', 'maybe', 'approximately', 'around', 'roughly')

    # Number of keyword hits that maps to a score of 1.0.
    _KEYWORD_SCORE_SCALE = 5.0

    # Splits text into word tokens so keywords match whole words only
    # ("studying" must not count as "dying", "background" not as "around").
    _WORD_RE = re.compile(r"\w+")

    def __init__(self, model_name="distilbert-base-uncased", device=-1,
                 positive_label="LABEL_1"):
        """Load the text-classification pipeline.

        Args:
            model_name (str): Hugging Face checkpoint to load.
                NOTE(review): the default is a general-purpose pretrained
                checkpoint, not one fine-tuned for fraud classification;
                pass a fine-tuned model for meaningful scores.
            device (int): Device index for the pipeline; -1 selects the CPU.
            positive_label (str): Label the model uses for the fraud class.
        """
        self.positive_label = positive_label
        print("Initializing Text Processor...")
        try:
            self.fraud_classifier = pipeline(
                "text-classification",
                model=model_name,
                device=device,
            )
            print("✓ Fraud classifier loaded")
        except Exception as e:
            # Best effort: without a model the processor still returns
            # neutral scores plus the keyword features.
            print(f"Warning: Could not load fraud classifier: {e}")
            self.fraud_classifier = None

    def analyze_claim_text(self, claim_text):
        """Analyze claim text for fraud indicators.

        Args:
            claim_text (str): The claim description/notes. ``None`` and
                non-string values (e.g. NaN from a data frame) are treated
                as missing text.

        Returns:
            dict: ``fraud_score`` (0-1, 0.5 when unknown), ``confidence``
            (0-1, 0.0 when no prediction was made), ``text_features`` (dict
            of linguistic features) and ``method``: ``'transformer'`` when
            the model produced the score, ``'fallback'`` when the model was
            unavailable or failed, ``'insufficient_text'`` when the text was
            too short to score.
        """
        if not isinstance(claim_text, str):
            claim_text = ""

        if len(claim_text.strip()) < self.MIN_TEXT_LENGTH:
            return {
                'fraud_score': 0.5,
                'confidence': 0.0,
                'text_features': self._extract_basic_features(claim_text),
                'method': 'insufficient_text',
            }

        # Neutral defaults, used when no prediction can be made.
        fraud_score = 0.5
        confidence = 0.0
        method = 'fallback'

        if self.fraud_classifier is not None:
            try:
                # Let the tokenizer truncate to the model limit; slicing the
                # string by characters would discard most of a long claim and
                # still would not guarantee a valid token count.
                result = self.fraud_classifier(
                    claim_text,
                    truncation=True,
                    max_length=self.MAX_TOKENS,
                )[0]
                score = result['score']
                # The pipeline reports the probability of the predicted label;
                # convert it to the probability of the fraud label.
                fraud_score = score if result['label'] == self.positive_label else 1 - score
                confidence = score
                method = 'transformer'
            except Exception as e:
                print(f"Prediction error: {e}")

        return {
            'fraud_score': fraud_score,
            'confidence': confidence,
            'text_features': self._extract_text_features(claim_text),
            'method': method,
        }

    def _keyword_score(self, tokens, keywords):
        """Return the share of *keywords* present in *tokens*, capped at 1.0."""
        hits = sum(1 for word in keywords if word in tokens)
        return min(hits / self._KEYWORD_SCORE_SCALE, 1.0)

    def _extract_text_features(self, text):
        """Extract linguistic features from text.

        Keywords are matched case-insensitively against whole words. Each
        score counts how many distinct keywords of its list occur, divided
        by 5 and capped at 1.0.

        Args:
            text (str): The claim text.

        Returns:
            dict: ``length``, ``word_count``, ``urgency_score``,
            ``emotional_score``, ``vague_score`` and ``has_numbers``.
        """
        tokens = set(self._WORD_RE.findall(text.lower()))

        return {
            'length': len(text),
            'word_count': len(text.split()),
            'urgency_score': self._keyword_score(tokens, self.URGENCY_WORDS),
            'emotional_score': self._keyword_score(tokens, self.EMOTIONAL_WORDS),
            'vague_score': self._keyword_score(tokens, self.VAGUE_WORDS),
            'has_numbers': any(char.isdigit() for char in text),
        }

    def _extract_basic_features(self, text):
        """Extract basic features when text is too short to analyze.

        Only size information is computed; keyword scores are reported as
        0.0 and ``has_numbers`` as False.

        Args:
            text (str): The (possibly empty) claim text.

        Returns:
            dict: Same keys as ``_extract_text_features``.
        """
        return {
            'length': len(text),
            'word_count': len(text.split()),
            'urgency_score': 0.0,
            'emotional_score': 0.0,
            'vague_score': 0.0,
            'has_numbers': False,
        }

if __name__ == "__main__":
    # Manual smoke test: run a few hand-written claims through the processor
    # and print the score, confidence and extracted features for each.
    demo_processor = InsuranceTextProcessor()

    sample_claims = (
        "My car was damaged in an accident yesterday. The front bumper needs replacement.",
        "URGENT! I need immediate payment for my claim. This is an emergency situation!",
        "The incident happened somehow around 3pm. Maybe the damage is approximately $5000.",
    )

    # Numeric fields shown with three decimals, in display order.
    numeric_fields = (
        ("Fraud Score", 'fraud_score'),
        ("Confidence", 'confidence'),
    )

    for case_no, sample in enumerate(sample_claims, start=1):
        preview = sample[:50]  # keep the header line short
        print(f"\nTest {case_no}: {preview}...")

        outcome = demo_processor.analyze_claim_text(sample)
        for caption, key in numeric_fields:
            print(f"{caption}: {outcome[key]:.3f}")
        print(f"Text Features: {outcome['text_features']}")