Spaces:

point9
/

Invoice_Digitization_Agent

Sleeping

App Files Files Community

Invoice_Digitization_Agent / backend /feature_builder /feature_builder.py

Dipan04

Deploy Invoice Digitization Agent

8a859a8 9 days ago

raw

history blame contribute delete

10.8 kB

	"""
	Feature builder that matches ML training pipeline exactly.
	Generates features for inference from invoice data + aggregates.
	FIXED: Handles None values properly with robust defaults.
	"""

	import math
	import zlib
	from datetime import datetime
	from typing import Dict, Optional

	import numpy as np
	import pandas as pd


	# Fallback aggregate values for new customers, i.e. when no history is
	# available for the lookup (values taken from training).
	DEFAULTS = {
	    # Customer-level payment history
	    'cust_avg_days': 18.0,
	    'cust_median_days': 15.0,
	    'cust_std_days': 0.0,
	    'cust_min_days': 12,
	    'cust_max_days': 25,
	    'cust_invoice_count': 1,
	    'cust_avg_amount': 30000.0,
	    'cust_total_amount': 30000.0,
	    'cust_pct_overdue': 0.0,

	    # Payment-terms-level aggregates
	    'payment_terms_avg_days': 15.0,
	    'payment_terms_median_days': 15.0,
	    'payment_terms_count': 100,

	    # Business-code-level aggregates
	    'business_avg_days': 17.0,
	    'business_median_days': 15.0,
	    'business_count': 1000,
	}


	def safe_float(value, default=0.0):
	    """Convert ``value`` to a finite float, falling back to ``default``.

	    Args:
	        value: Anything ``float()`` may accept (number, numeric string, ...)
	            or None.
	        default: Value returned (as a float) when ``value`` is None, cannot
	            be converted, or converts to NaN / +-infinity.

	    Returns:
	        ``float(value)`` when it is a finite number, otherwise
	        ``float(default)``.
	    """
	    if value is None:
	        return float(default)
	    try:
	        result = float(value)
	    except (ValueError, TypeError, OverflowError):
	        # OverflowError: integers too large to be represented as a float.
	        return float(default)
	    # NaN and infinity convert without error (also from the strings "nan" and
	    # "inf") but are not usable feature values, so treat them as missing.
	    if not math.isfinite(result):
	        return float(default)
	    return result


	def safe_int(value, default=0):
	    """Convert ``value`` to an int, falling back to ``default``.

	    Float-formatted strings such as ``"15.0"`` are accepted and truncated
	    toward zero, the same way ``int(15.0)`` truncates a float.

	    Args:
	        value: Anything ``int()`` or ``float()`` may accept, or None.
	        default: Value returned (as an int) when ``value`` is None, cannot be
	            converted, or is NaN / +-infinity.

	    Returns:
	        The integer value of ``value``, otherwise ``int(default)``.
	    """
	    if value is None:
	        return int(default)
	    try:
	        return int(value)
	    except (ValueError, TypeError, OverflowError):
	        # ValueError: non-integer strings and NaN; OverflowError: infinity.
	        pass
	    try:
	        # Second chance for numeric strings that int() rejects ("15.0", "1e3").
	        return int(float(value))
	    except (ValueError, TypeError, OverflowError):
	        return int(default)


	def parse_date(date_str: str) -> datetime:
	    """Parse a date value into a ``datetime``.

	    ``datetime`` instances are returned unchanged. Any other value is
	    converted with ``str()``, stripped of surrounding whitespace and matched
	    against ``%Y-%m-%d %H:%M:%S``, ``%Y-%m-%d`` and ``%Y%m%d`` in that order.
	    If none of them match, ``datetime.fromisoformat`` is tried so that
	    ISO-8601 timestamps (``T`` separator, fractional seconds) are accepted.

	    Args:
	        date_str: Date as a string, a ``datetime``, or any object whose
	            ``str()`` form is one of the supported formats (e.g. the
	            integer ``20240115``).

	    Returns:
	        The parsed ``datetime``.

	    Raises:
	        ValueError: If the value matches none of the supported formats.
	    """
	    if isinstance(date_str, datetime):
	        return date_str

	    text = str(date_str).strip()
	    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y%m%d"):
	        try:
	            return datetime.strptime(text, fmt)
	        except ValueError:
	            continue

	    try:
	        return datetime.fromisoformat(text)
	    except ValueError:
	        # Report the caller's original value, not the stripped copy.
	        raise ValueError(f"Cannot parse date: {date_str}") from None


	def build_features(
	    invoice_data: Dict,
	    customer_agg: Optional[Dict] = None,
	    payment_terms_agg: Optional[Dict] = None,
	    business_code_agg: Optional[Dict] = None
	) -> Dict:
	    """
	    Build the feature dict for a single invoice.

	    Missing or None invoice fields fall back to fixed defaults, and missing
	    or None aggregate values fall back to ``DEFAULTS``.

	    Args:
	        invoice_data: Invoice details. ``posting_date`` is required; all
	            other keys (``total_open_amount``, ``business_code``, ...) are
	            optional.
	        customer_agg: Customer aggregates from DB (or None for defaults)
	        payment_terms_agg: Payment terms aggregates from DB (or None)
	        business_code_agg: Business code aggregates from DB (or None)

	    Returns:
	        Dict mapping feature name to its numeric value.

	    Raises:
	        KeyError: If ``invoice_data`` has no ``posting_date`` key.
	        ValueError: If ``posting_date`` cannot be parsed.
	    """
	    posting_date = parse_date(invoice_data['posting_date'])
	    weekday = posting_date.weekday()
	    posting_yyyymmdd = int(posting_date.strftime('%Y%m%d'))

	    # ``or {}`` lets callers pass None for any aggregate they do not have;
	    # every lookup below then falls through to DEFAULTS.
	    cust_agg = customer_agg or {}
	    pmt_agg = payment_terms_agg or {}
	    biz_agg = business_code_agg or {}

	    features = {}

	    # ============================================
	    # Categorical Features (encoded as integers)
	    # ============================================

	    # Unknown business codes share code 0 with 'U001'.
	    business_code_map = {'U001': 0, 'U002': 1, 'U005': 2, 'U007': 3, 'U013': 4, 'CA02': 5}
	    features['business_code'] = business_code_map.get(
	        invoice_data.get('business_code', 'U001'), 0
	    )

	    # Payment terms are bucketed into 74 codes. The built-in hash() of a str
	    # is salted per process (unless PYTHONHASHSEED is pinned), so it would
	    # put the same terms in different buckets on every run; CRC32 is stable.
	    payment_terms = invoice_data.get('cust_payment_terms')
	    if payment_terms is None:
	        payment_terms = 'NAH4'
	    features['cust_payment_terms'] = zlib.crc32(str(payment_terms).encode('utf-8')) % 74

	    # Unknown currencies / document types share code 0 with the default.
	    currency_map = {'USD': 0, 'CAD': 1}
	    features['invoice_currency'] = currency_map.get(invoice_data.get('invoice_currency', 'USD'), 0)

	    doc_type_map = {'RV': 0, 'AB': 1}
	    features['document_type'] = doc_type_map.get(invoice_data.get('document_type', 'RV'), 0)

	    # Amount bucket: 0 small, 1 medium, 2 large, 3 very large.
	    amount = safe_float(invoice_data.get('total_open_amount'), 30000.0)
	    if amount < 5000:
	        amount_cat = 0
	    elif amount < 20000:
	        amount_cat = 1
	    elif amount < 50000:
	        amount_cat = 2
	    else:
	        amount_cat = 3
	    features['amount_category'] = amount_cat

	    # ============================================
	    # Numerical Features
	    # ============================================

	    # The key spelling 'buisness_year' is intentional: it is the column name
	    # features_to_dataframe expects. The posting year is passed as the
	    # default so an explicit ``business_year: None`` does not become 0.0.
	    features['buisness_year'] = safe_float(invoice_data.get('business_year'), posting_date.year)
	    features['total_open_amount'] = amount
	    features['amount_log'] = float(np.log1p(amount))

	    # Temporal features derived from the posting date
	    features['posting_year'] = posting_date.year
	    features['posting_month'] = posting_date.month
	    features['posting_quarter'] = (posting_date.month - 1) // 3 + 1
	    features['posting_day'] = posting_date.day
	    features['posting_dayofweek'] = weekday
	    features['posting_is_weekend'] = 1 if weekday >= 5 else 0
	    features['posting_is_month_end'] = 1 if posting_date.day >= 28 else 0
	    features['posting_is_month_start'] = 1 if posting_date.day <= 3 else 0

	    # Day differences are supplied by the caller, not computed here.
	    features['days_posting_to_due'] = safe_int(invoice_data.get('days_posting_to_due'), 15)
	    features['days_create_to_posting'] = safe_int(invoice_data.get('days_create_to_posting'), 0)
	    features['days_baseline_to_posting'] = safe_int(invoice_data.get('days_baseline_to_posting'), 0)

	    # Document create date as integer YYYYMMDD; falls back to the posting
	    # date when absent or not reducible to digits.
	    doc_create_alt = invoice_data.get('document_create_date_alt')
	    doc_create_int = posting_yyyymmdd
	    if doc_create_alt:
	        cleaned = str(doc_create_alt).replace('-', '').replace(' ', '').replace(':', '')[:8]
	        try:
	            doc_create_int = int(cleaned)
	        except (ValueError, TypeError):
	            doc_create_int = posting_yyyymmdd
	    features['document_create_date.1'] = doc_create_int

	    # ============================================
	    # Aggregates (customer, payment terms, business code)
	    # ============================================

	    # (source dict, feature key, converter); the feature key is also the
	    # lookup key in the source dict and in DEFAULTS.
	    aggregate_specs = (
	        (cust_agg, 'cust_avg_days', safe_float),
	        (cust_agg, 'cust_median_days', safe_float),
	        (cust_agg, 'cust_std_days', safe_float),
	        (cust_agg, 'cust_min_days', safe_int),
	        (cust_agg, 'cust_max_days', safe_int),
	        (cust_agg, 'cust_invoice_count', safe_int),
	        (cust_agg, 'cust_avg_amount', safe_float),
	        (cust_agg, 'cust_total_amount', safe_float),
	        (pmt_agg, 'payment_terms_avg_days', safe_float),
	        (pmt_agg, 'payment_terms_median_days', safe_float),
	        (pmt_agg, 'payment_terms_count', safe_int),
	        (biz_agg, 'business_avg_days', safe_float),
	        (biz_agg, 'business_median_days', safe_float),
	        (biz_agg, 'business_count', safe_int),
	    )
	    for source, key, convert in aggregate_specs:
	        features[key] = convert(source.get(key), DEFAULTS[key])

	    # ============================================
	    # Interaction Features
	    # ============================================

	    cust_avg_amt = features['cust_avg_amount']
	    if cust_avg_amt > 0:
	        features['amount_vs_cust_avg'] = float(amount / cust_avg_amt)
	    else:
	        # Avoid dividing by zero; 1.0 means "same as the customer average".
	        features['amount_vs_cust_avg'] = 1.0

	    features['is_large_for_customer'] = 1 if amount > cust_avg_amt * 1.5 else 0

	    # ============================================
	    # Other required fields
	    # ============================================

	    features['isOpen'] = safe_int(invoice_data.get('is_open'), 1)
	    features['posting_id'] = safe_float(invoice_data.get('posting_id'), 1.0)

	    return features


	def features_to_dataframe(features: Dict) -> pd.DataFrame:
	    """
	    Convert a feature dict to a one-row DataFrame in the training column order.

	    Columns missing from ``features`` are filled with 0.0 and keys that are
	    not in the column list are dropped. The input dict is not modified.

	    Args:
	        features: Mapping of feature name to value.

	    Returns:
	        DataFrame with a single row and the columns of ``COLUMN_ORDER``,
	        in that order.
	    """

	    # Expected column order from training
	    COLUMN_ORDER = [
	        'business_code', 'buisness_year', 'document_create_date.1',
	        'invoice_currency', 'document_type', 'total_open_amount',
	        'cust_payment_terms', 'isOpen', 'posting_year', 'posting_month',
	        'posting_quarter', 'posting_day', 'posting_dayofweek',
	        'posting_is_weekend', 'posting_is_month_end', 'posting_is_month_start',
	        'days_posting_to_due', 'days_create_to_posting', 'days_baseline_to_posting',
	        'amount_log', 'amount_category', 'cust_avg_days', 'cust_median_days',
	        'cust_std_days', 'cust_min_days', 'cust_max_days', 'cust_invoice_count',
	        'cust_avg_amount', 'cust_total_amount', 'payment_terms_avg_days',
	        'payment_terms_median_days', 'payment_terms_count', 'business_avg_days',
	        'business_median_days', 'business_count', 'amount_vs_cust_avg',
	        'is_large_for_customer'
	    ]

	    # Build a fresh row rather than inserting fallbacks into the caller's
	    # dict, so calling this function has no side effects on its argument.
	    row = {col: features.get(col, 0.0) for col in COLUMN_ORDER}

	    return pd.DataFrame([row], columns=COLUMN_ORDER)


	if __name__ == "__main__":
	    # Smoke test: a minimal invoice and no aggregates, so every aggregate
	    # feature is filled from DEFAULTS.
	    sample_invoice = {
	        'posting_date': '2024-01-15',
	        'total_open_amount': 50000.0,
	        'business_code': 'U001',
	        'cust_payment_terms': 'NAH4',
	        'invoice_currency': 'USD',
	        'document_type': 'RV',
	        'business_year': 2024,
	        'days_posting_to_due': 15,
	        'is_open': 1,
	    }

	    # The three aggregate arguments default to None, which selects defaults.
	    df = features_to_dataframe(build_features(sample_invoice))

	    print("✅ Features built successfully:")
	    print(f"Shape: {df.shape}")
	    print(f"Columns: {len(df.columns)}")
	    print(f"\nSample features:")
	    preview_columns = ['cust_avg_days', 'payment_terms_avg_days', 'business_avg_days']
	    print(df[preview_columns].T)