Ranjit Behera
Fix: Default to regex-only mode for instant usage
c876830
"""
FinEE Schema - Core data structures for financial entity extraction.
This module defines the data classes used throughout the extraction pipeline.
All fields are optional to support partial extraction and additive merging.
"""
from dataclasses import dataclass, field, asdict
from enum import Enum
from typing import Optional, Dict, Any, List
from datetime import date
import json
class TransactionType(str, Enum):
"""Transaction type enumeration."""
DEBIT = "debit"
CREDIT = "credit"
UNKNOWN = "unknown"
class Category(str, Enum):
"""Transaction category enumeration."""
FOOD = "food"
SHOPPING = "shopping"
TRANSPORT = "transport"
UTILITIES = "utilities"
ENTERTAINMENT = "entertainment"
TRANSFER = "transfer"
SALARY = "salary"
INVESTMENT = "investment"
HEALTHCARE = "healthcare"
EDUCATION = "education"
OTHER = "other"
class Confidence(str, Enum):
"""Extraction confidence levels."""
HIGH = "high" # All fields from regex/rules
MEDIUM = "medium" # Mix of regex + LLM
LOW = "low" # Mostly LLM or incomplete
FAILED = "failed" # Extraction failed
class ExtractionSource(str, Enum):
"""Source of each extracted field."""
REGEX = "regex"
RULES = "rules"
LLM = "llm"
CACHE = "cache"
@dataclass
class FieldMeta:
"""Metadata for a single extracted field."""
source: ExtractionSource
confidence: float # 0.0 to 1.0
raw_value: Optional[str] = None # Original value before normalization
@dataclass
class ExtractionResult:
"""
Complete extraction result with all financial entities.
All fields are optional to support partial extraction.
The `meta` dict tracks the source and confidence of each field.
"""
# Core fields
amount: Optional[float] = None
type: Optional[TransactionType] = None
date: Optional[str] = None # Normalized to DD-MM-YYYY
# Transaction details
account: Optional[str] = None
reference: Optional[str] = None
vpa: Optional[str] = None
# Enrichment fields
merchant: Optional[str] = None
category: Optional[Category] = None
payment_method: Optional[str] = None
bank: Optional[str] = None
# Metadata
confidence: Confidence = Confidence.LOW
confidence_score: float = 0.0
processing_time_ms: float = 0.0
from_cache: bool = False
# Field-level metadata
meta: Dict[str, FieldMeta] = field(default_factory=dict)
# Raw data
raw_input: Optional[str] = None
raw_llm_output: Optional[str] = None
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary, excluding None values and meta."""
result = {}
for k, v in asdict(self).items():
if v is not None and k not in ('meta', 'raw_input', 'raw_llm_output'):
if isinstance(v, Enum):
result[k] = v.value
elif k == 'meta':
continue
else:
result[k] = v
return result
def to_json(self, indent: int = 2) -> str:
"""Convert to JSON string."""
return json.dumps(self.to_dict(), indent=indent)
def get_missing_fields(self, required: List[str] = None, desired: List[str] = None) -> List[str]:
"""Get list of missing fields."""
if required is None:
required = ['amount', 'type']
if desired is None:
desired = ['merchant', 'category', 'date', 'reference']
missing = []
for field_name in required + desired:
if getattr(self, field_name, None) is None:
missing.append(field_name)
return missing
def is_complete(self) -> bool:
"""Check if all required fields are present."""
return self.amount is not None and self.type is not None
def merge(self, other: 'ExtractionResult', overwrite: bool = False) -> 'ExtractionResult':
"""
Merge another result into this one (additive).
By default, existing values are NOT overwritten.
Set overwrite=True to prefer `other`'s values.
"""
for field_name in ['amount', 'type', 'date', 'account', 'reference',
'vpa', 'merchant', 'category', 'payment_method', 'bank']:
current_value = getattr(self, field_name)
other_value = getattr(other, field_name)
if other_value is not None:
if current_value is None or overwrite:
setattr(self, field_name, other_value)
if field_name in other.meta:
self.meta[field_name] = other.meta[field_name]
return self
@dataclass
class ExtractionConfig:
"""Configuration for the extraction pipeline."""
# Cache settings
cache_enabled: bool = True
cache_max_size: int = 1000
# LLM settings
use_llm: bool = False # Set to True to enable LLM (requires model download)
llm_timeout_seconds: float = 10.0
llm_max_tokens: int = 200
llm_temperature: float = 0.1
# Model settings
model_path: Optional[str] = None
model_id: str = "Ranjit0034/finance-entity-extractor"
# Pipeline settings
required_fields: List[str] = field(default_factory=lambda: ['amount', 'type'])
desired_fields: List[str] = field(default_factory=lambda: ['merchant', 'category', 'date', 'reference'])
# Confidence thresholds
high_confidence_threshold: float = 0.9
medium_confidence_threshold: float = 0.7
# Type aliases for clarity
RawText = str
JSONOutput = str