finance-entity-extractor / src /finee /schema.py

Ranjit Behera

Fix: Default to regex-only mode for instant usage

c876830 23 days ago

5.69 kB

	"""
	FinEE Schema - Core data structures for financial entity extraction.

	This module defines the data classes used throughout the extraction pipeline.
	All `ExtractionResult` fields are optional to support partial extraction and additive merging.
	"""

	from dataclasses import dataclass, field, asdict
	from enum import Enum
	from typing import Optional, Dict, Any, List
	from datetime import date
	import json


	class TransactionType(str, Enum):
	    """Kind of transaction: debit, credit, or unknown.

	    The ``str`` mixin makes each member compare equal to its lowercase
	    string value.
	    """

	    DEBIT = "debit"
	    CREDIT = "credit"
	    UNKNOWN = "unknown"


	class Category(str, Enum):
	    """Closed set of spending/income categories a transaction can carry.

	    The ``str`` mixin makes each member compare equal to its lowercase
	    string value. ``OTHER`` is the catch-all member.
	    """

	    FOOD = "food"
	    SHOPPING = "shopping"
	    TRANSPORT = "transport"
	    UTILITIES = "utilities"
	    ENTERTAINMENT = "entertainment"
	    TRANSFER = "transfer"
	    SALARY = "salary"
	    INVESTMENT = "investment"
	    HEALTHCARE = "healthcare"
	    EDUCATION = "education"
	    OTHER = "other"


	class Confidence(str, Enum):
	    """Coarse confidence bucket assigned to an extraction result."""

	    # Every field came from regex/rules.
	    HIGH = "high"
	    # Fields are a mix of regex and LLM output.
	    MEDIUM = "medium"
	    # Mostly LLM-derived, or the result is incomplete.
	    LOW = "low"
	    # Extraction did not succeed.
	    FAILED = "failed"


	class ExtractionSource(str, Enum):
	    """Identifies which mechanism produced an extracted field's value.

	    The ``str`` mixin makes each member compare equal to its lowercase
	    string value.
	    """

	    REGEX = "regex"
	    RULES = "rules"
	    LLM = "llm"
	    CACHE = "cache"


	@dataclass
	class FieldMeta:
	    """Provenance record for one extracted field."""

	    # Mechanism that produced the value.
	    source: ExtractionSource
	    # Per-field confidence, intended to lie in 0.0 to 1.0 (not enforced here).
	    confidence: float
	    # Original value before normalization, if it was kept.
	    raw_value: Optional[str] = None


	@dataclass
	class ExtractionResult:
	    """
	    Complete extraction result with all financial entities.

	    Every field has a default, so a result may be only partially populated
	    and completed later through ``merge``. The ``meta`` dict maps a field
	    name to the ``FieldMeta`` recording that field's source and confidence.
	    """

	    # Core fields
	    amount: Optional[float] = None
	    type: Optional[TransactionType] = None
	    date: Optional[str] = None  # Normalized to DD-MM-YYYY

	    # Transaction details
	    account: Optional[str] = None
	    reference: Optional[str] = None
	    vpa: Optional[str] = None

	    # Enrichment fields
	    merchant: Optional[str] = None
	    category: Optional[Category] = None
	    payment_method: Optional[str] = None
	    bank: Optional[str] = None

	    # Metadata
	    confidence: Confidence = Confidence.LOW
	    confidence_score: float = 0.0
	    processing_time_ms: float = 0.0
	    from_cache: bool = False

	    # Field-level metadata
	    meta: Dict[str, FieldMeta] = field(default_factory=dict)

	    # Raw data
	    raw_input: Optional[str] = None
	    raw_llm_output: Optional[str] = None

	    def to_dict(self) -> Dict[str, Any]:
	        """Return a plain dict of the populated, user-facing fields.

	        Fields whose value is ``None`` are left out, and ``meta``,
	        ``raw_input`` and ``raw_llm_output`` are always left out. Enum
	        members are replaced by their ``.value``.

	        Returns:
	            A new dict keyed by field name, in field-declaration order.
	        """
	        excluded = ('meta', 'raw_input', 'raw_llm_output')
	        result: Dict[str, Any] = {}
	        for key, value in asdict(self).items():
	            if value is None or key in excluded:
	                continue
	            result[key] = value.value if isinstance(value, Enum) else value
	        return result

	    def to_json(self, indent: int = 2) -> str:
	        """Serialize ``to_dict()`` to a JSON string.

	        Args:
	            indent: Indentation width passed through to ``json.dumps``.
	        """
	        return json.dumps(self.to_dict(), indent=indent)

	    def get_missing_fields(
	        self,
	        required: Optional[List[str]] = None,
	        desired: Optional[List[str]] = None,
	    ) -> List[str]:
	        """List the requested field names whose value is currently ``None``.

	        Args:
	            required: Field names that must be present. Defaults to
	                ``['amount', 'type']``.
	            desired: Field names that are nice to have. Defaults to
	                ``['merchant', 'category', 'date', 'reference']``.

	        Returns:
	            Missing names, required ones first, each reported at most once.
	            A name that is not an attribute of this object counts as missing.
	        """
	        if required is None:
	            required = ['amount', 'type']
	        if desired is None:
	            desired = ['merchant', 'category', 'date', 'reference']

	        # dict.fromkeys drops duplicates while keeping first-seen order, so a
	        # name listed in both groups is reported once. Unpacking accepts any
	        # mix of lists and tuples.
	        candidates = dict.fromkeys([*required, *desired])
	        return [
	            field_name
	            for field_name in candidates
	            if getattr(self, field_name, None) is None
	        ]

	    def is_complete(self) -> bool:
	        """Return True when both ``amount`` and ``type`` are set."""
	        return self.amount is not None and self.type is not None

	    def merge(self, other: 'ExtractionResult', overwrite: bool = False) -> 'ExtractionResult':
	        """
	        Merge another result into this one (additive), mutating ``self``.

	        Only the entity fields are merged (amount, type, date, account,
	        reference, vpa, merchant, category, payment_method, bank). The
	        confidence, timing, cache and raw-data fields are left untouched.

	        By default, existing values are NOT overwritten.
	        Set overwrite=True to prefer `other`'s values.

	        Args:
	            other: Result whose non-None entity fields are copied in.
	            overwrite: When True, `other`'s non-None values replace
	                values already present on this result.

	        Returns:
	            ``self``, to allow chaining.
	        """
	        mergeable = (
	            'amount', 'type', 'date', 'account', 'reference',
	            'vpa', 'merchant', 'category', 'payment_method', 'bank',
	        )
	        for field_name in mergeable:
	            other_value = getattr(other, field_name)
	            if other_value is None:
	                continue
	            if getattr(self, field_name) is not None and not overwrite:
	                continue

	            setattr(self, field_name, other_value)
	            # NOTE(review): if `other` carries no meta for this field, any
	            # existing self.meta entry is kept and then describes the
	            # replaced value -- confirm whether it should be dropped.
	            if field_name in other.meta:
	                self.meta[field_name] = other.meta[field_name]

	        return self


	@dataclass
	class ExtractionConfig:
	    """Tunable settings for the extraction pipeline; every field has a default."""

	    # --- Cache ---
	    cache_enabled: bool = True
	    cache_max_size: int = 1000

	    # --- LLM ---
	    # Off by default; set to True to enable the LLM (requires model download).
	    use_llm: bool = False
	    llm_timeout_seconds: float = 10.0
	    llm_max_tokens: int = 200
	    llm_temperature: float = 0.1

	    # --- Model ---
	    model_path: Optional[str] = None
	    model_id: str = "Ranjit0034/finance-entity-extractor"

	    # --- Pipeline ---
	    # Factories give each instance its own list rather than a shared one.
	    required_fields: List[str] = field(
	        default_factory=lambda: ['amount', 'type'],
	    )
	    desired_fields: List[str] = field(
	        default_factory=lambda: ['merchant', 'category', 'date', 'reference'],
	    )

	    # --- Confidence thresholds ---
	    high_confidence_threshold: float = 0.9
	    medium_confidence_threshold: float = 0.7


	# Type aliases for clarity. Both are plain `str` at runtime; the names
	# presumably distinguish raw input text from JSON-encoded output in
	# signatures -- confirm against callers.
	RawText = str
	JSONOutput = str