"""
FinEE Schema - Core data structures for financial entity extraction.

This module defines the data classes used throughout the extraction pipeline.
All fields are optional to support partial extraction and additive merging.
"""

from dataclasses import dataclass, field, asdict
from enum import Enum
from typing import Optional, Dict, Any, List
from datetime import date
import json


class TransactionType(str, Enum):
    """Transaction type enumeration.

    Subclasses ``str`` so members compare equal to their string values
    and serialize cleanly (e.g. via ``json.dumps``).
    """
    DEBIT = "debit"
    CREDIT = "credit"
    UNKNOWN = "unknown"  # direction could not be determined from the input


class Category(str, Enum):
    """Transaction category enumeration.

    A ``str`` subclass, so members compare equal to their string values
    and are directly JSON-serializable.
    """
    FOOD = "food"
    SHOPPING = "shopping"
    TRANSPORT = "transport"
    UTILITIES = "utilities"
    ENTERTAINMENT = "entertainment"
    TRANSFER = "transfer"
    SALARY = "salary"
    INVESTMENT = "investment"
    HEALTHCARE = "healthcare"
    EDUCATION = "education"
    OTHER = "other"  # fallback when no other category applies


class Confidence(str, Enum):
    """Extraction confidence levels (overall, per result).

    Distinct from the per-field float in ``FieldMeta.confidence``:
    this is a coarse label for the whole extraction.
    """
    HIGH = "high"        # All fields from regex/rules
    MEDIUM = "medium"    # Mix of regex + LLM
    LOW = "low"          # Mostly LLM or incomplete
    FAILED = "failed"    # Extraction failed


class ExtractionSource(str, Enum):
    """Source of each extracted field.

    Identifies which pipeline stage produced a field's value; stored
    per-field in ``FieldMeta.source``.
    """
    REGEX = "regex"
    RULES = "rules"
    LLM = "llm"
    CACHE = "cache"


@dataclass
class FieldMeta:
    """Metadata for a single extracted field."""
    source: ExtractionSource  # which pipeline stage produced the value
    confidence: float  # 0.0 to 1.0
    raw_value: Optional[str] = None  # Original value before normalization


@dataclass
class ExtractionResult:
    """
    Complete extraction result with all financial entities.

    All fields are optional to support partial extraction.
    The `meta` dict tracks the source and confidence of each field.
    """
    # Core fields
    amount: Optional[float] = None
    type: Optional[TransactionType] = None
    date: Optional[str] = None  # Normalized to DD-MM-YYYY

    # Transaction details
    account: Optional[str] = None
    reference: Optional[str] = None
    vpa: Optional[str] = None

    # Enrichment fields
    merchant: Optional[str] = None
    category: Optional[Category] = None
    payment_method: Optional[str] = None
    bank: Optional[str] = None

    # Metadata
    confidence: Confidence = Confidence.LOW
    confidence_score: float = 0.0
    processing_time_ms: float = 0.0
    from_cache: bool = False

    # Field-level metadata (field name -> FieldMeta)
    meta: Dict[str, FieldMeta] = field(default_factory=dict)

    # Raw data
    raw_input: Optional[str] = None
    raw_llm_output: Optional[str] = None

    # The entity fields that merge() copies over; keeps merge() and
    # serialization logic in one obvious place.  Unannotated on purpose so
    # the dataclass machinery does not treat these as instance fields.
    _MERGEABLE_FIELDS = ('amount', 'type', 'date', 'account', 'reference',
                         'vpa', 'merchant', 'category', 'payment_method', 'bank')
    _EXCLUDED_FROM_DICT = ('meta', 'raw_input', 'raw_llm_output')

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a plain dict, excluding None values and internal fields.

        Enum members are flattened to their ``.value`` string so the result
        is directly JSON-serializable.

        NOTE: the original implementation carried an unreachable
        ``elif k == 'meta'`` branch — 'meta' was already filtered by the
        enclosing condition; the dead branch has been removed.
        """
        result: Dict[str, Any] = {}
        for key, value in asdict(self).items():
            if value is None or key in self._EXCLUDED_FROM_DICT:
                continue
            result[key] = value.value if isinstance(value, Enum) else value
        return result

    def to_json(self, indent: int = 2) -> str:
        """Convert to a JSON string (see :meth:`to_dict` for the shape)."""
        return json.dumps(self.to_dict(), indent=indent)

    def get_missing_fields(self, required: List[str] = None, desired: List[str] = None) -> List[str]:
        """Return the names of required/desired fields that are still None.

        Args:
            required: field names that must be present (default: amount, type).
            desired: field names that should be present
                (default: merchant, category, date, reference).

        Returns:
            Missing field names in order, with duplicates removed in case
            the two lists overlap.
        """
        if required is None:
            required = ['amount', 'type']
        if desired is None:
            desired = ['merchant', 'category', 'date', 'reference']

        # dict.fromkeys deduplicates while preserving first-seen order.
        return [name for name in dict.fromkeys(required + desired)
                if getattr(self, name, None) is None]

    def is_complete(self) -> bool:
        """Check if all required fields (amount, type) are present."""
        return self.amount is not None and self.type is not None

    def merge(self, other: 'ExtractionResult', overwrite: bool = False) -> 'ExtractionResult':
        """
        Merge another result into this one (additive).

        By default, existing values are NOT overwritten.
        Set overwrite=True to prefer `other`'s values.

        Returns:
            ``self``, to allow chaining.
        """
        for field_name in self._MERGEABLE_FIELDS:
            other_value = getattr(other, field_name)
            if other_value is None:
                continue
            if getattr(self, field_name) is None or overwrite:
                setattr(self, field_name, other_value)
                # Carry over per-field provenance when the value moves.
                if field_name in other.meta:
                    self.meta[field_name] = other.meta[field_name]

        return self


@dataclass
class ExtractionConfig:
    """Configuration for the extraction pipeline.

    Pure data holder; all knobs have sensible defaults so
    ``ExtractionConfig()`` is a working configuration.
    """
    # Cache settings
    cache_enabled: bool = True
    cache_max_size: int = 1000  # max cached entries before eviction

    # LLM settings
    use_llm: bool = False  # Set to True to enable LLM (requires model download)
    llm_timeout_seconds: float = 10.0
    llm_max_tokens: int = 200
    llm_temperature: float = 0.1  # low temperature -> near-deterministic output

    # Model settings
    model_path: Optional[str] = None  # local path overrides model_id when set — TODO confirm in loader
    model_id: str = "Ranjit0034/finance-entity-extractor"

    # Pipeline settings
    required_fields: List[str] = field(default_factory=lambda: ['amount', 'type'])
    desired_fields: List[str] = field(default_factory=lambda: ['merchant', 'category', 'date', 'reference'])

    # Confidence thresholds
    # NOTE(review): presumably score cutoffs mapping to Confidence.HIGH /
    # Confidence.MEDIUM — confirm against the pipeline code that reads them.
    high_confidence_threshold: float = 0.9
    medium_confidence_threshold: float = 0.7


# Type aliases for clarity in pipeline signatures
RawText = str     # unprocessed input text handed to the extractor
JSONOutput = str  # JSON-encoded extraction result (see ExtractionResult.to_json)