finance-entity-extractor / src /finee /normalizer.py

Ranjit Behera

FinEE v1.0 - Finance Entity Extractor

dcc24f8 21 days ago

5.94 kB

	"""
	FinEE Normalizer - Data normalization utilities.

	Handles normalization of:
	- Amounts (₹2,500.00 → 2500.0)
	- Dates (various formats → DD-MM-YYYY)
	- Account numbers (digit extraction, optional masking)
	- Reference numbers (non-alphanumeric characters removed)
	- UPI VPAs, merchant names, and transaction types
	"""

	import re
	from datetime import datetime, date
	from typing import Optional, Union
	from dateutil import parser as date_parser


	def normalize_amount(amount_str: Union[str, float, int, None]) -> Optional[float]:
	    """
	    Normalize an amount to a float.

	    Handles:
	    - A leading currency marker (Rs., Rs, ₹, INR; case-insensitive)
	    - Commas (2,500.00)
	    - Spaces (Rs. 2 500)

	    Args:
	        amount_str: Amount in various formats. Numbers are converted
	            directly; strings are cleaned and then parsed.

	    Returns:
	        Float amount, or None if the input is None, is neither a number
	        nor a string, or cannot be parsed as a float after cleaning.
	    """
	    if amount_str is None:
	        return None

	    if isinstance(amount_str, (int, float)):
	        return float(amount_str)

	    if not isinstance(amount_str, str):
	        return None

	    cleaned = amount_str.strip()

	    # Strip one leading currency marker. The alternation bars must be
	    # unescaped: an escaped bar matches a literal pipe character, which
	    # would leave "Rs.", "INR" and "₹" in place and make float() fail.
	    cleaned = re.sub(r'^(?:Rs\.?|INR|₹)\s*', '', cleaned, flags=re.IGNORECASE)

	    # Drop thousands separators and any internal spacing ("2 500").
	    cleaned = cleaned.replace(',', '').replace(' ', '')

	    try:
	        return float(cleaned)
	    except ValueError:
	        return None


	def normalize_date(date_str: Union[str, date, None], output_format: str = '%d-%m-%Y') -> Optional[str]:
	    """
	    Normalize a date to a standard string format.

	    Handles:
	    - DD-MM-YY, DD-MM-YYYY
	    - DD/MM/YY, DD/MM/YYYY
	    - DD.MM.YY, DD.MM.YYYY
	    - DD Mon YYYY (28 Dec 2025), DD Month YYYY
	    - YYYY-MM-DD (ISO format)
	    - Anything else dateutil can parse (day-first interpretation)

	    Args:
	        date_str: Date in various string formats. ``date`` / ``datetime``
	            objects are also accepted and are formatted directly.
	        output_format: strftime format of the result (default: DD-MM-YYYY)

	    Returns:
	        Normalized date string, or None if the input is empty, is not a
	        string or date object, or cannot be parsed.
	    """
	    # Already a date/datetime: nothing to parse, just format it.
	    if isinstance(date_str, date):
	        return date_str.strftime(output_format)

	    # Anything else that is not a string (None, numbers, ...) cannot be
	    # parsed; return None instead of failing on .strip().
	    if not isinstance(date_str, str):
	        return None

	    date_str = date_str.strip()
	    if not date_str:
	        return None

	    # Common Indian date formats, tried in order before the generic parser.
	    formats = (
	        '%d-%m-%Y',  # 28-12-2025
	        '%d-%m-%y',  # 28-12-25
	        '%d/%m/%Y',  # 28/12/2025
	        '%d/%m/%y',  # 28/12/25
	        '%d %b %Y',  # 28 Dec 2025
	        '%d %b %y',  # 28 Dec 25
	        '%d %B %Y',  # 28 December 2025
	        '%d %B %y',  # 28 December 25
	        '%Y-%m-%d',  # 2025-12-28 (ISO)
	        '%d.%m.%Y',  # 28.12.2025
	        '%d.%m.%y',  # 28.12.25
	    )

	    for fmt in formats:
	        try:
	            parsed = datetime.strptime(date_str, fmt)

	            # strptime already expands %y itself (00-68 -> 20xx,
	            # 69-99 -> 19xx), so a year below 100 can only come from a
	            # zero-padded %Y value such as "0025". Map those into a
	            # plausible century: < 50 -> 20xx, otherwise 19xx.
	            if parsed.year < 100:
	                century = 2000 if parsed.year < 50 else 1900
	                parsed = parsed.replace(year=parsed.year + century)

	            return parsed.strftime(output_format)
	        except ValueError:
	            continue

	    # Fallback to dateutil parser. Besides ValueError/TypeError it can
	    # raise OverflowError for out-of-range numeric components.
	    try:
	        parsed = date_parser.parse(date_str, dayfirst=True)
	        return parsed.strftime(output_format)
	    except (ValueError, TypeError, OverflowError):
	        return None


	def normalize_account(account_str: Optional[str], mask: bool = False) -> Optional[str]:
	    """
	    Normalize account number.

	    Every non-digit character (spaces, dashes, existing mask characters,
	    letters) is discarded; only the digits are kept.

	    Args:
	        account_str: Account number string
	        mask: If True, replace all but the last 4 digits with '*'.
	            Numbers of 4 digits or fewer are returned unmasked.

	    Returns:
	        Digits-only account number (optionally masked), or None if the
	        input is empty or contains no digits.
	    """
	    if not account_str:
	        return None

	    # Extract digits only
	    digits = re.sub(r'\D', '', str(account_str))

	    if not digits:
	        return None

	    if mask and len(digits) > 4:
	        # One '*' per hidden digit so the masked value keeps its length.
	        return '*' * (len(digits) - 4) + digits[-4:]

	    return digits


	def normalize_reference(ref_str: Optional[str]) -> Optional[str]:
	    """
	    Reduce a transaction reference to its ASCII letters and digits.

	    Args:
	        ref_str: Raw reference value; it is converted with ``str()``
	            before cleaning.

	    Returns:
	        The reference with every character outside A-Z, a-z and 0-9
	        removed, or None when the input is falsy or nothing remains.
	    """
	    if ref_str:
	        alnum_only = re.sub(r'[^A-Za-z0-9]', '', str(ref_str))
	        if alnum_only:
	            return alnum_only
	    return None


	def normalize_vpa(vpa_str: Optional[str]) -> Optional[str]:
	    """
	    Normalize a UPI VPA (virtual payment address).

	    Args:
	        vpa_str: VPA string

	    Returns:
	        The VPA with surrounding whitespace removed and converted to
	        lowercase, or None when the input is falsy or has no '@'.
	    """
	    if not vpa_str:
	        return None

	    candidate = vpa_str.strip().lower()

	    # The only format check applied: a VPA must contain an '@'.
	    return candidate if '@' in candidate else None


	def normalize_merchant(merchant_str: Optional[str]) -> Optional[str]:
	    """
	    Normalize merchant name.

	    Trims surrounding whitespace and removes leading boilerplate
	    ("payment to", "paid to", "transfer to", "upi-", matched
	    case-insensitively). The casing of the name itself is left unchanged.

	    Args:
	        merchant_str: Merchant name string

	    Returns:
	        Cleaned merchant name, or None if the input is empty or nothing
	        remains after cleaning.
	    """
	    if not merchant_str:
	        return None

	    cleaned = merchant_str.strip()

	    # Prefixes are checked once each, in this order, so a later prefix can
	    # still be removed after an earlier one ("transfer to upi-X" -> "X").
	    prefixes = ('payment to', 'paid to', 'transfer to', 'upi-')
	    for prefix in prefixes:
	        if not cleaned.lower().startswith(prefix):
	            continue

	        remainder = cleaned[len(prefix):]

	        # A prefix ending in a word ("... to") must stop at a word
	        # boundary; otherwise names such as "Payment Tower Cafe" would be
	        # cut in the middle of a word. "upi-" ends in punctuation and
	        # needs no such check.
	        if prefix[-1].isalnum() and remainder[:1].isalnum():
	            continue

	        cleaned = remainder.strip()

	    return cleaned if cleaned else None


	def normalize_type(type_str: Optional[str]) -> Optional[str]:
	    """
	    Classify a transaction-type description as debit or credit.

	    The input is lowercased and searched for known keywords as plain
	    substrings. Debit keywords are checked before credit keywords, so a
	    text containing both kinds is reported as 'debit'.

	    Args:
	        type_str: Type string (debit/credit variants)

	    Returns:
	        'debit', 'credit', or None when the input is falsy or no keyword
	        is found.
	    """
	    if not type_str:
	        return None

	    text = str(type_str).lower().strip()

	    # Order matters: the first group with a matching keyword wins.
	    keyword_groups = (
	        ('debit', ['debit', 'debited', 'withdrawn', 'sent', 'paid', 'spent', 'purchase']),
	        ('credit', ['credit', 'credited', 'received', 'refund', 'cashback', 'reversed']),
	    )

	    for label, keywords in keyword_groups:
	        if any(keyword in text for keyword in keywords):
	            return label

	    return None