|
|
""" |
|
|
FinEE Schema - Core data structures for financial entity extraction. |
|
|
|
|
|
This module defines the data classes used throughout the extraction pipeline. |
|
|
All fields are optional to support partial extraction and additive merging. |
|
|
""" |
|
|
|
|
|
from dataclasses import dataclass, field, asdict |
|
|
from enum import Enum |
|
|
from typing import Optional, Dict, Any, List |
|
|
from datetime import date |
|
|
import json |
|
|
|
|
|
|
|
|
class TransactionType(str, Enum):
    """Direction of a transaction.

    The ``str`` mixin makes every member compare equal to its plain
    string value (e.g. ``TransactionType.DEBIT == "debit"``).
    """

    DEBIT = "debit"
    CREDIT = "credit"
    UNKNOWN = "unknown"
|
|
|
|
|
|
|
|
class Category(str, Enum):
    """Spending/income category assigned to a transaction.

    Members are string-valued, so they compare equal to their lowercase
    names (e.g. ``Category.FOOD == "food"``).
    """

    FOOD = "food"
    SHOPPING = "shopping"
    TRANSPORT = "transport"
    UTILITIES = "utilities"
    ENTERTAINMENT = "entertainment"
    TRANSFER = "transfer"
    SALARY = "salary"
    INVESTMENT = "investment"
    HEALTHCARE = "healthcare"
    EDUCATION = "education"
    OTHER = "other"
|
|
|
|
|
|
|
|
class Confidence(str, Enum):
    """Coarse confidence bucket for an extraction.

    String-valued: each member compares equal to its lowercase name.
    """

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
    FAILED = "failed"
|
|
|
|
|
|
|
|
class ExtractionSource(str, Enum):
    """Identifies which mechanism produced an extracted field.

    String-valued: each member compares equal to its lowercase name.
    """

    REGEX = "regex"
    RULES = "rules"
    LLM = "llm"
    CACHE = "cache"
|
|
|
|
|
|
|
|
@dataclass
class FieldMeta:
    """Provenance record attached to one extracted field.

    Attributes:
        source: Which extraction mechanism produced the value.
        confidence: Numeric confidence for this field.
        raw_value: Optional raw string associated with the field;
            defaults to ``None``.
    """

    source: ExtractionSource
    confidence: float
    raw_value: Optional[str] = None
|
|
|
|
|
|
|
|
@dataclass
class ExtractionResult:
    """
    Complete extraction result with all financial entities.

    All entity fields are optional to support partial extraction.
    The `meta` dict tracks the source and confidence of each field.
    """

    # Core transaction facts
    amount: Optional[float] = None
    type: Optional[TransactionType] = None
    date: Optional[str] = None

    # Identifiers
    account: Optional[str] = None
    reference: Optional[str] = None
    vpa: Optional[str] = None

    # Descriptive / enrichment fields
    merchant: Optional[str] = None
    category: Optional[Category] = None
    payment_method: Optional[str] = None
    bank: Optional[str] = None

    # Result-level bookkeeping
    confidence: Confidence = Confidence.LOW
    confidence_score: float = 0.0
    processing_time_ms: float = 0.0
    from_cache: bool = False

    # Per-field provenance, keyed by field name
    meta: Dict[str, FieldMeta] = field(default_factory=dict)

    # Raw payloads; never included in `to_dict()` / `to_json()` output
    raw_input: Optional[str] = None
    raw_llm_output: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a plain dictionary.

        Fields whose value is ``None`` are omitted, as are ``meta``,
        ``raw_input`` and ``raw_llm_output``. Enum members are replaced
        by their ``.value``.

        Returns:
            A dict mapping field names to their values.
        """
        excluded = ('meta', 'raw_input', 'raw_llm_output')
        result: Dict[str, Any] = {}
        for key, value in asdict(self).items():
            if value is None or key in excluded:
                continue
            result[key] = value.value if isinstance(value, Enum) else value
        return result

    def to_json(self, indent: int = 2) -> str:
        """Serialize the output of `to_dict()` as a JSON string.

        Args:
            indent: Indentation width passed to ``json.dumps``.

        Returns:
            The JSON document as a string.
        """
        return json.dumps(self.to_dict(), indent=indent)

    def get_missing_fields(
        self,
        required: Optional[List[str]] = None,
        desired: Optional[List[str]] = None,
    ) -> List[str]:
        """Get the names of fields that are currently unset.

        Args:
            required: Field names to check first. Defaults to
                ``['amount', 'type']`` when ``None``.
            desired: Further field names to check. Defaults to
                ``['merchant', 'category', 'date', 'reference']`` when
                ``None``.

        Returns:
            The names, required ones first, whose attribute is ``None``.
            A name that is not an attribute of this object also counts
            as missing.
        """
        if required is None:
            required = ['amount', 'type']
        if desired is None:
            desired = ['merchant', 'category', 'date', 'reference']

        return [
            field_name
            for field_name in required + desired
            if getattr(self, field_name, None) is None
        ]

    def is_complete(self) -> bool:
        """Check whether both ``amount`` and ``type`` are present."""
        return self.amount is not None and self.type is not None

    def merge(self, other: 'ExtractionResult', overwrite: bool = False) -> 'ExtractionResult':
        """
        Merge another result into this one (additive).

        By default, existing values are NOT overwritten.
        Set overwrite=True to prefer `other`'s values.

        Only the entity fields are merged; result-level bookkeeping
        (confidence, timings, cache flag, raw payloads) is left untouched.

        Args:
            other: The result whose non-``None`` values are merged in.
            overwrite: When true, `other`'s values replace existing ones.

        Returns:
            This instance, mutated in place.
        """
        mergeable = (
            'amount', 'type', 'date', 'account', 'reference',
            'vpa', 'merchant', 'category', 'payment_method', 'bank',
        )
        for field_name in mergeable:
            other_value = getattr(other, field_name)
            if other_value is None:
                continue

            if getattr(self, field_name) is None or overwrite:
                setattr(self, field_name, other_value)
                # Carry provenance over only when the value itself was
                # taken, so `meta` always describes the stored value.
                if field_name in other.meta:
                    self.meta[field_name] = other.meta[field_name]

        return self
|
|
|
|
|
|
|
|
@dataclass
class ExtractionConfig:
    """Tunable settings for the extraction pipeline.

    Every setting has a default, so ``ExtractionConfig()`` yields a
    usable configuration. The list-valued settings are created per
    instance through ``default_factory`` and are never shared.
    """

    # Caching
    cache_enabled: bool = True
    cache_max_size: int = 1000

    # LLM usage
    use_llm: bool = False
    llm_timeout_seconds: float = 10.0
    llm_max_tokens: int = 200
    llm_temperature: float = 0.1

    # Model selection
    model_path: Optional[str] = None
    model_id: str = "Ranjit0034/finance-entity-extractor"

    # Field expectations
    required_fields: List[str] = field(
        default_factory=lambda: ["amount", "type"]
    )
    desired_fields: List[str] = field(
        default_factory=lambda: ["merchant", "category", "date", "reference"]
    )

    # Confidence thresholds
    high_confidence_threshold: float = 0.9
    medium_confidence_threshold: float = 0.7
|
|
|
|
|
|
|
|
|
|
|
# Readability aliases for annotations; both are plain ``str`` at runtime.
RawText = str
JSONOutput = str
|
|
|