# Dipan04's picture
# Deploy Invoice Digitization Agent
# 8a859a8
"""
Feature builder that matches ML training pipeline exactly.
Generates features for inference from invoice data + aggregates.
FIXED: Handles None values properly with robust defaults.
"""
import math
import zlib
from datetime import datetime
from typing import Dict, Optional

import numpy as np
import pandas as pd
# Fallback aggregate values used when a lookup returns nothing (e.g. a
# customer with no history). Grouped by the aggregate they stand in for.
_CUSTOMER_DEFAULTS = {
    'cust_avg_days': 18.0,
    'cust_median_days': 15.0,
    'cust_std_days': 0.0,
    'cust_min_days': 12,
    'cust_max_days': 25,
    'cust_invoice_count': 1,
    'cust_avg_amount': 30000.0,
    'cust_total_amount': 30000.0,
    'cust_pct_overdue': 0.0,
}
_PAYMENT_TERMS_DEFAULTS = {
    'payment_terms_avg_days': 15.0,
    'payment_terms_median_days': 15.0,
    'payment_terms_count': 100,
}
_BUSINESS_CODE_DEFAULTS = {
    'business_avg_days': 17.0,
    'business_median_days': 15.0,
    'business_count': 1000,
}
# Single flat lookup table; key order is customer, payment terms, business code.
DEFAULTS = {
    **_CUSTOMER_DEFAULTS,
    **_PAYMENT_TERMS_DEFAULTS,
    **_BUSINESS_CODE_DEFAULTS,
}
def safe_float(value, default=0.0):
    """Convert ``value`` to a finite float, falling back to ``default``.

    Args:
        value: Anything ``float()`` accepts (number, numeric string, ...),
            or None.
        default: Value returned (converted with ``float()``) when ``value``
            cannot be used.

    Returns:
        ``float(value)`` when the conversion succeeds and the result is
        finite; otherwise ``float(default)``.
    """
    if value is None:
        return float(default)
    try:
        result = float(value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError: ints too large to be represented as a float.
        return float(default)
    # NaN / +-inf convert without error (also from the strings "nan"/"inf")
    # but are not usable numbers; treat them the same as a missing value.
    if not math.isfinite(result):
        return float(default)
    return result
def safe_int(value, default=0):
    """Convert ``value`` to an int, falling back to ``default``.

    Args:
        value: Anything ``int()`` accepts, a float-like string such as
            ``"15.0"``, or None.
        default: Value returned (converted with ``int()``) when ``value``
            cannot be used.

    Returns:
        ``int(value)`` when that works; otherwise ``int(float(value))``
        (truncating toward zero); otherwise ``int(default)``.
    """
    if value is None:
        return int(default)
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-integer strings and NaN; OverflowError: +-inf.
        pass
    try:
        # Second chance for strings like "15.0" or "1e3", which int()
        # rejects but float() understands.
        return int(float(value))
    except (ValueError, TypeError, OverflowError):
        return int(default)
def parse_date(date_str: str) -> datetime:
    """Parse a date value into a ``datetime``.

    Args:
        date_str: A ``datetime`` (returned unchanged) or a value whose
            ``str()`` form is one of ``YYYY-MM-DD HH:MM:SS``,
            ``YYYY-MM-DD``, ``YYYYMMDD``, or any other string accepted by
            ``datetime.fromisoformat`` (e.g. a ``T`` separator or
            fractional seconds). Surrounding whitespace is ignored.

    Returns:
        The parsed ``datetime``.

    Raises:
        ValueError: If no supported format matches.
    """
    if isinstance(date_str, datetime):
        return date_str

    text = str(date_str).strip()
    # Explicit formats first so inputs that always parsed keep parsing the
    # same way; fromisoformat is only a fallback for the remaining cases.
    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y%m%d"):
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    try:
        return datetime.fromisoformat(text)
    except ValueError:
        raise ValueError(f"Cannot parse date: {date_str}") from None
# Modulus applied to the payment-terms hash.
# NOTE(review): presumably the number of payment-term categories seen in
# training -- confirm against the training pipeline.
_PAYMENT_TERMS_BUCKETS = 74


def _stable_bucket(value, buckets):
    """Map ``value`` to an int in ``[0, buckets)`` that is identical in every process.

    The built-in ``hash()`` of a ``str`` is salted per interpreter process
    (unless PYTHONHASHSEED is pinned), so it cannot be used for a feature
    that must be reproducible between runs; CRC32 of the UTF-8 text is.
    """
    return zlib.crc32(str(value).encode("utf-8")) % buckets


def build_features(
    invoice_data: Dict,
    customer_agg: Optional[Dict] = None,
    payment_terms_agg: Optional[Dict] = None,
    business_code_agg: Optional[Dict] = None
) -> Dict:
    """
    Build the feature dictionary for a single invoice.

    Args:
        invoice_data: Invoice details. ``posting_date`` is required; every
            other key read here is optional and has a fallback.
        customer_agg: Customer aggregates (or None/empty to use DEFAULTS).
        payment_terms_agg: Payment-terms aggregates (or None/empty).
        business_code_agg: Business-code aggregates (or None/empty).

    Returns:
        Dict mapping feature name to a plain int/float value.

    Raises:
        KeyError: If ``invoice_data`` has no ``posting_date`` key.
        ValueError: If ``posting_date`` cannot be parsed by ``parse_date``.
    """
    posting_date = parse_date(invoice_data['posting_date'])
    posting_yyyymmdd = int(posting_date.strftime('%Y%m%d'))

    # Missing aggregates become empty dicts so every lookup below falls
    # through to DEFAULTS.
    cust_agg = customer_agg or {}
    pmt_agg = payment_terms_agg or {}
    biz_agg = business_code_agg or {}

    features = {}

    # ============================================
    # Categorical Features (encoded as integers)
    # ============================================
    # Unknown codes share index 0 with the first known code of each map.
    business_code_map = {'U001': 0, 'U002': 1, 'U005': 2, 'U007': 3, 'U013': 4, 'CA02': 5}
    features['business_code'] = business_code_map.get(
        invoice_data.get('business_code', 'U001'), 0
    )

    # Payment terms: hashed into a fixed number of buckets. A stable hash
    # is used so the same code yields the same feature in every process.
    # NOTE(review): this must agree with however the training pipeline
    # encoded payment terms, which is not visible here -- confirm.
    payment_terms = invoice_data.get('cust_payment_terms')
    if payment_terms is None:
        payment_terms = 'NAH4'
    features['cust_payment_terms'] = _stable_bucket(payment_terms, _PAYMENT_TERMS_BUCKETS)

    currency_map = {'USD': 0, 'CAD': 1}
    features['invoice_currency'] = currency_map.get(
        invoice_data.get('invoice_currency', 'USD'), 0
    )

    doc_type_map = {'RV': 0, 'AB': 1}
    features['document_type'] = doc_type_map.get(
        invoice_data.get('document_type', 'RV'), 0
    )

    # Amount category: 0 small, 1 medium, 2 large, 3 very_large.
    amount = safe_float(invoice_data.get('total_open_amount'), 30000.0)
    if amount < 5000:
        amount_cat = 0
    elif amount < 20000:
        amount_cat = 1
    elif amount < 50000:
        amount_cat = 2
    else:
        amount_cat = 3
    features['amount_category'] = amount_cat

    # ============================================
    # Numerical Features
    # ============================================
    # The 'buisness_year' spelling is intentional: it is the column name
    # expected downstream. The posting year is the fallback both when the
    # key is absent and when its value is None/unusable.
    features['buisness_year'] = safe_float(
        invoice_data.get('business_year'), posting_date.year
    )
    features['total_open_amount'] = amount
    features['amount_log'] = float(np.log1p(amount))

    # Temporal features derived from the posting date.
    features['posting_year'] = posting_date.year
    features['posting_month'] = posting_date.month
    features['posting_quarter'] = (posting_date.month - 1) // 3 + 1
    features['posting_day'] = posting_date.day
    features['posting_dayofweek'] = posting_date.weekday()  # Monday == 0
    features['posting_is_weekend'] = 1 if posting_date.weekday() >= 5 else 0
    # Month end/start are day-of-month thresholds, not calendar-exact.
    features['posting_is_month_end'] = 1 if posting_date.day >= 28 else 0
    features['posting_is_month_start'] = 1 if posting_date.day <= 3 else 0

    # Day gaps are taken as supplied by the caller, not computed here.
    features['days_posting_to_due'] = safe_int(invoice_data.get('days_posting_to_due'), 15)
    features['days_create_to_posting'] = safe_int(invoice_data.get('days_create_to_posting'), 0)
    features['days_baseline_to_posting'] = safe_int(invoice_data.get('days_baseline_to_posting'), 0)

    # Document create date as an integer YYYYMMDD; falls back to the
    # posting date when absent or not reducible to digits.
    doc_create_alt = invoice_data.get('document_create_date_alt')
    if doc_create_alt:
        try:
            cleaned = str(doc_create_alt).replace('-', '').replace(' ', '').replace(':', '')[:8]
            features['document_create_date.1'] = int(cleaned)
        except (ValueError, TypeError):
            features['document_create_date.1'] = posting_yyyymmdd
    else:
        features['document_create_date.1'] = posting_yyyymmdd

    # ============================================
    # Customer / Payment Terms / Business Code Aggregates
    # ============================================
    # (source dict, converter, feature keys) in output order; each value
    # falls back to the entry of the same name in DEFAULTS.
    aggregate_specs = (
        (cust_agg, safe_float, ('cust_avg_days', 'cust_median_days', 'cust_std_days')),
        (cust_agg, safe_int, ('cust_min_days', 'cust_max_days', 'cust_invoice_count')),
        (cust_agg, safe_float, ('cust_avg_amount', 'cust_total_amount')),
        (pmt_agg, safe_float, ('payment_terms_avg_days', 'payment_terms_median_days')),
        (pmt_agg, safe_int, ('payment_terms_count',)),
        (biz_agg, safe_float, ('business_avg_days', 'business_median_days')),
        (biz_agg, safe_int, ('business_count',)),
    )
    for source, convert, keys in aggregate_specs:
        for key in keys:
            features[key] = convert(source.get(key), DEFAULTS[key])

    # ============================================
    # Interaction Features
    # ============================================
    cust_avg_amt = features['cust_avg_amount']
    if cust_avg_amt > 0:
        features['amount_vs_cust_avg'] = float(amount / cust_avg_amt)
    else:
        # Avoid dividing by zero/negative averages; 1.0 means "typical".
        features['amount_vs_cust_avg'] = 1.0
    features['is_large_for_customer'] = 1 if amount > cust_avg_amt * 1.5 else 0

    # ============================================
    # Other required fields
    # ============================================
    features['isOpen'] = safe_int(invoice_data.get('is_open'), 1)
    features['posting_id'] = safe_float(invoice_data.get('posting_id'), 1.0)

    return features
def features_to_dataframe(features: Dict) -> pd.DataFrame:
    """
    Convert a feature dict to a one-row DataFrame with a fixed column order.

    Args:
        features: Mapping of feature name to value. It is not modified.

    Returns:
        A single-row DataFrame containing exactly the columns in
        ``COLUMN_ORDER``, in that order. Columns missing from ``features``
        are filled with 0.0; keys not listed in ``COLUMN_ORDER`` are
        dropped.
    """
    # Expected column order from training
    COLUMN_ORDER = [
        'business_code', 'buisness_year', 'document_create_date.1',
        'invoice_currency', 'document_type', 'total_open_amount',
        'cust_payment_terms', 'isOpen', 'posting_year', 'posting_month',
        'posting_quarter', 'posting_day', 'posting_dayofweek',
        'posting_is_weekend', 'posting_is_month_end', 'posting_is_month_start',
        'days_posting_to_due', 'days_create_to_posting', 'days_baseline_to_posting',
        'amount_log', 'amount_category', 'cust_avg_days', 'cust_median_days',
        'cust_std_days', 'cust_min_days', 'cust_max_days', 'cust_invoice_count',
        'cust_avg_amount', 'cust_total_amount', 'payment_terms_avg_days',
        'payment_terms_median_days', 'payment_terms_count', 'business_avg_days',
        'business_median_days', 'business_count', 'amount_vs_cust_avg',
        'is_large_for_customer'
    ]

    # Build a fresh row rather than back-filling into the caller's dict, so
    # the input is left untouched.
    row = {col: features.get(col, 0.0) for col in COLUMN_ORDER}

    return pd.DataFrame([row])[COLUMN_ORDER]
if __name__ == "__main__":
    # Smoke test: a minimal invoice and no aggregates, so every aggregate
    # feature comes from DEFAULTS.
    sample_invoice = {
        'posting_date': '2024-01-15',
        'total_open_amount': 50000.0,
        'business_code': 'U001',
        'cust_payment_terms': 'NAH4',
        'invoice_currency': 'USD',
        'document_type': 'RV',
        'business_year': 2024,
        'days_posting_to_due': 15,
        'is_open': 1,
    }

    sample_frame = features_to_dataframe(build_features(sample_invoice))

    print("✅ Features built successfully:")
    print(f"Shape: {sample_frame.shape}")
    print(f"Columns: {len(sample_frame.columns)}")

    # Show the three defaulted "average days" aggregates, one per line.
    preview_columns = ['cust_avg_days', 'payment_terms_avg_days', 'business_avg_days']
    print("\nSample features:")
    print(sample_frame[preview_columns].T)