"""
FinEE Schema - Core data structures for financial entity extraction.

This module defines the data classes used throughout the extraction pipeline.
All fields are optional to support partial extraction and additive merging.
"""

from dataclasses import dataclass, field, asdict
from enum import Enum
from typing import Optional, Dict, Any, List
from datetime import date
import json


class TransactionType(str, Enum):
    """Transaction type enumeration.

    Subclasses ``str`` so members compare equal to their string values
    and serialize cleanly (e.g. via ``json.dumps``).
    """
    DEBIT = "debit"
    CREDIT = "credit"
    UNKNOWN = "unknown"  # direction could not be determined from the input


class Category(str, Enum):
    """Transaction category enumeration.

    A ``str`` subclass, so members compare equal to their string values
    and are directly JSON-serializable.
    """
    FOOD = "food"
    SHOPPING = "shopping"
    TRANSPORT = "transport"
    UTILITIES = "utilities"
    ENTERTAINMENT = "entertainment"
    TRANSFER = "transfer"
    SALARY = "salary"
    INVESTMENT = "investment"
    HEALTHCARE = "healthcare"
    EDUCATION = "education"
    OTHER = "other"  # fallback when no other category applies


class Confidence(str, Enum):
    """Extraction confidence levels (overall, per result).

    Distinct from the per-field float in ``FieldMeta.confidence``:
    this is a coarse label for the whole extraction.
    """
    HIGH = "high"        # All fields from regex/rules
    MEDIUM = "medium"    # Mix of regex + LLM
    LOW = "low"          # Mostly LLM or incomplete
    FAILED = "failed"    # Extraction failed


class ExtractionSource(str, Enum):
    """Source of each extracted field.

    Identifies which pipeline stage produced a field's value; stored
    per-field in ``FieldMeta.source``.
    """
    REGEX = "regex"
    RULES = "rules"
    LLM = "llm"
    CACHE = "cache"


@dataclass
class FieldMeta:
    """Metadata for a single extracted field."""
    source: ExtractionSource  # which pipeline stage produced the value
    confidence: float  # 0.0 to 1.0
    raw_value: Optional[str] = None  # Original value before normalization


@dataclass
class ExtractionResult:
    """
    Complete extraction result with all financial entities.

    All fields are optional to support partial extraction.
    The `meta` dict tracks the source and confidence of each field.
    """
    # Core fields
    amount: Optional[float] = None
    type: Optional[TransactionType] = None
    date: Optional[str] = None  # Normalized to DD-MM-YYYY

    # Transaction details
    account: Optional[str] = None
    reference: Optional[str] = None
    vpa: Optional[str] = None

    # Enrichment fields
    merchant: Optional[str] = None
    category: Optional[Category] = None
    payment_method: Optional[str] = None
    bank: Optional[str] = None

    # Metadata
    confidence: Confidence = Confidence.LOW
    confidence_score: float = 0.0
    processing_time_ms: float = 0.0
    from_cache: bool = False

    # Field-level metadata (field name -> FieldMeta)
    meta: Dict[str, FieldMeta] = field(default_factory=dict)

    # Raw data
    raw_input: Optional[str] = None
    raw_llm_output: Optional[str] = None

    # The entity fields that merge() copies over; keeps merge() and
    # serialization logic in one obvious place.  Unannotated on purpose so
    # the dataclass machinery does not treat these as instance fields.
    _MERGEABLE_FIELDS = ('amount', 'type', 'date', 'account', 'reference',
                         'vpa', 'merchant', 'category', 'payment_method', 'bank')
    _EXCLUDED_FROM_DICT = ('meta', 'raw_input', 'raw_llm_output')

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a plain dict, excluding None values and internal fields.

        Enum members are flattened to their ``.value`` string so the result
        is directly JSON-serializable.

        NOTE: the original implementation carried an unreachable
        ``elif k == 'meta'`` branch — 'meta' was already filtered by the
        enclosing condition; the dead branch has been removed.
        """
        result: Dict[str, Any] = {}
        for key, value in asdict(self).items():
            if value is None or key in self._EXCLUDED_FROM_DICT:
                continue
            result[key] = value.value if isinstance(value, Enum) else value
        return result

    def to_json(self, indent: int = 2) -> str:
        """Convert to a JSON string (see :meth:`to_dict` for the shape)."""
        return json.dumps(self.to_dict(), indent=indent)

    def get_missing_fields(self, required: List[str] = None, desired: List[str] = None) -> List[str]:
        """Return the names of required/desired fields that are still None.

        Args:
            required: field names that must be present (default: amount, type).
            desired: field names that should be present
                (default: merchant, category, date, reference).

        Returns:
            Missing field names in order, with duplicates removed in case
            the two lists overlap.
        """
        if required is None:
            required = ['amount', 'type']
        if desired is None:
            desired = ['merchant', 'category', 'date', 'reference']

        # dict.fromkeys deduplicates while preserving first-seen order.
        return [name for name in dict.fromkeys(required + desired)
                if getattr(self, name, None) is None]

    def is_complete(self) -> bool:
        """Check if all required fields (amount, type) are present."""
        return self.amount is not None and self.type is not None

    def merge(self, other: 'ExtractionResult', overwrite: bool = False) -> 'ExtractionResult':
        """
        Merge another result into this one (additive).

        By default, existing values are NOT overwritten.
        Set overwrite=True to prefer `other`'s values.

        Returns:
            ``self``, to allow chaining.
        """
        for field_name in self._MERGEABLE_FIELDS:
            other_value = getattr(other, field_name)
            if other_value is None:
                continue
            if getattr(self, field_name) is None or overwrite:
                setattr(self, field_name, other_value)
                # Carry over per-field provenance when the value moves.
                if field_name in other.meta:
                    self.meta[field_name] = other.meta[field_name]

        return self


@dataclass
class ExtractionConfig:
    """Configuration for the extraction pipeline.

    Pure data holder; all knobs have sensible defaults so
    ``ExtractionConfig()`` is a working configuration.
    """
    # Cache settings
    cache_enabled: bool = True
    cache_max_size: int = 1000  # max cached entries before eviction

    # LLM settings
    use_llm: bool = False  # Set to True to enable LLM (requires model download)
    llm_timeout_seconds: float = 10.0
    llm_max_tokens: int = 200
    llm_temperature: float = 0.1  # low temperature -> near-deterministic output

    # Model settings
    model_path: Optional[str] = None  # local path overrides model_id when set — TODO confirm in loader
    model_id: str = "Ranjit0034/finance-entity-extractor"

    # Pipeline settings
    required_fields: List[str] = field(default_factory=lambda: ['amount', 'type'])
    desired_fields: List[str] = field(default_factory=lambda: ['merchant', 'category', 'date', 'reference'])

    # Confidence thresholds
    # NOTE(review): presumably score cutoffs mapping to Confidence.HIGH /
    # Confidence.MEDIUM — confirm against the pipeline code that reads them.
    high_confidence_threshold: float = 0.9
    medium_confidence_threshold: float = 0.7


# Type aliases for clarity in pipeline signatures
RawText = str     # unprocessed input text handed to the extractor
JSONOutput = str  # JSON-encoded extraction result (see ExtractionResult.to_json)