"""
Feature builder that matches ML training pipeline exactly.

Generates features for inference from invoice data + aggregates.
FIXED: Handles None values properly with robust defaults.
"""

import math
import zlib
from datetime import datetime
from typing import Dict, Optional

import numpy as np
import pandas as pd

# Default values for new customers (from training)
DEFAULTS = {
    'cust_avg_days': 18.0,
    'cust_median_days': 15.0,
    'cust_std_days': 0.0,
    'cust_min_days': 12,
    'cust_max_days': 25,
    'cust_invoice_count': 1,
    'cust_avg_amount': 30000.0,
    'cust_total_amount': 30000.0,
    'cust_pct_overdue': 0.0,
    'payment_terms_avg_days': 15.0,
    'payment_terms_median_days': 15.0,
    'payment_terms_count': 100,
    'business_avg_days': 17.0,
    'business_median_days': 15.0,
    'business_count': 1000,
}

# Expected column order from training. 'buisness_year' is misspelled on
# purpose: build_features() emits the feature under exactly this key.
COLUMN_ORDER = [
    'business_code', 'buisness_year', 'document_create_date.1',
    'invoice_currency', 'document_type', 'total_open_amount',
    'cust_payment_terms', 'isOpen',
    'posting_year', 'posting_month', 'posting_quarter', 'posting_day',
    'posting_dayofweek', 'posting_is_weekend', 'posting_is_month_end',
    'posting_is_month_start',
    'days_posting_to_due', 'days_create_to_posting',
    'days_baseline_to_posting',
    'amount_log', 'amount_category',
    'cust_avg_days', 'cust_median_days', 'cust_std_days',
    'cust_min_days', 'cust_max_days', 'cust_invoice_count',
    'cust_avg_amount', 'cust_total_amount',
    'payment_terms_avg_days', 'payment_terms_median_days',
    'payment_terms_count',
    'business_avg_days', 'business_median_days', 'business_count',
    'amount_vs_cust_avg', 'is_large_for_customer',
]

# Number of buckets used for the payment-terms hash encoding.
_PAYMENT_TERMS_BUCKETS = 74


def safe_float(value, default=0.0):
    """Safely convert to float with default.

    Returns ``float(default)`` when *value* is None, cannot be converted,
    or converts to NaN/infinity (treated as missing, which keeps this
    helper consistent with ``safe_int``).
    """
    if value is None:
        return float(default)
    try:
        result = float(value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError: e.g. an int too large to be represented as float.
        return float(default)
    if not math.isfinite(result):
        return float(default)
    return result


def safe_int(value, default=0):
    """Safely convert to int with default.

    Returns ``int(default)`` when *value* is None or cannot be converted.
    ``int(nan)`` raises ValueError and ``int(inf)`` raises OverflowError;
    both fall back to the default.
    """
    if value is None:
        return int(default)
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        return int(default)


def parse_date(date_str: str) -> datetime:
    """Parse date string to datetime.

    ``datetime`` instances are returned unchanged. Strings are tried
    against ``%Y-%m-%d %H:%M:%S``, ``%Y-%m-%d`` and ``%Y%m%d`` in that
    order, then against ``datetime.fromisoformat`` (covers e.g. the
    ``T``-separated ISO form).

    Raises:
        ValueError: if no format matches.
    """
    if isinstance(date_str, datetime):
        return date_str

    text = str(date_str).strip()
    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y%m%d"):
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue

    try:
        return datetime.fromisoformat(text)
    except ValueError:
        pass

    raise ValueError(f"Cannot parse date: {date_str}")


def build_features(
    invoice_data: Dict,
    customer_agg: Optional[Dict] = None,
    payment_terms_agg: Optional[Dict] = None,
    business_code_agg: Optional[Dict] = None
) -> Dict:
    """
    Build feature vector matching ML training pipeline.

    Args:
        invoice_data: Invoice details (posting_date, amount, etc.)
        customer_agg: Customer aggregates from DB (or None for defaults)
        payment_terms_agg: Payment terms aggregates from DB
        business_code_agg: Business code aggregates from DB

    Returns:
        Dict of features ready for model.predict()

    Raises:
        KeyError: if ``invoice_data`` has no ``'posting_date'`` key.
        ValueError: if the posting date cannot be parsed.
    """
    # Parse dates
    posting_date = parse_date(invoice_data['posting_date'])
    posting_date_int = int(posting_date.strftime('%Y%m%d'))

    # Use provided aggregates or empty dicts (will use defaults)
    cust_agg = customer_agg or {}
    pmt_agg = payment_terms_agg or {}
    biz_agg = business_code_agg or {}

    # Build feature dictionary
    features = {}

    # ============================================
    # Categorical Features (encoded as integers)
    # ============================================

    # Business code mapping (unknown codes fall back to 0)
    business_code = invoice_data.get('business_code', 'U001')
    business_code_map = {
        'U001': 0, 'U002': 1, 'U005': 2, 'U007': 3, 'U013': 4, 'CA02': 5,
    }
    features['business_code'] = business_code_map.get(business_code, 0)

    # Payment terms (simplified hash encoding).
    # The built-in hash() of a str is salted per process (PYTHONHASHSEED),
    # so it would yield a different bucket on every run; crc32 is stable.
    # NOTE(review): confirm the training pipeline encodes payment terms the
    # same way, otherwise this feature will not line up with the model.
    payment_terms = invoice_data.get('cust_payment_terms')
    if payment_terms is None:
        payment_terms = 'NAH4'
    terms_digest = zlib.crc32(str(payment_terms).encode('utf-8'))
    features['cust_payment_terms'] = terms_digest % _PAYMENT_TERMS_BUCKETS

    # Currency
    currency_map = {'USD': 0, 'CAD': 1}
    features['invoice_currency'] = currency_map.get(
        invoice_data.get('invoice_currency', 'USD'), 0
    )

    # Document type
    doc_type_map = {'RV': 0, 'AB': 1}
    features['document_type'] = doc_type_map.get(
        invoice_data.get('document_type', 'RV'), 0
    )

    # Amount category
    amount = safe_float(invoice_data.get('total_open_amount'), 30000.0)
    if amount < 5000:
        amount_cat = 0  # small
    elif amount < 20000:
        amount_cat = 1  # medium
    elif amount < 50000:
        amount_cat = 2  # large
    else:
        amount_cat = 3  # very_large
    features['amount_category'] = amount_cat

    # ============================================
    # Numerical Features
    # ============================================

    # Pass the posting year as the safe_float default so that an explicit
    # None for 'business_year' falls back to it instead of becoming 0.0.
    features['buisness_year'] = safe_float(
        invoice_data.get('business_year'), posting_date.year
    )
    features['total_open_amount'] = amount
    # NOTE(review): log1p yields NaN/-inf for amounts <= -1; confirm how
    # training treated negative amounts.
    features['amount_log'] = float(np.log1p(amount))

    # Temporal features
    weekday = posting_date.weekday()
    features['posting_year'] = posting_date.year
    features['posting_month'] = posting_date.month
    features['posting_quarter'] = (posting_date.month - 1) // 3 + 1
    features['posting_day'] = posting_date.day
    features['posting_dayofweek'] = weekday
    features['posting_is_weekend'] = 1 if weekday >= 5 else 0
    features['posting_is_month_end'] = 1 if posting_date.day >= 28 else 0
    features['posting_is_month_start'] = 1 if posting_date.day <= 3 else 0

    # Days between dates
    features['days_posting_to_due'] = safe_int(
        invoice_data.get('days_posting_to_due'), 15
    )
    features['days_create_to_posting'] = safe_int(
        invoice_data.get('days_create_to_posting'), 0
    )
    features['days_baseline_to_posting'] = safe_int(
        invoice_data.get('days_baseline_to_posting'), 0
    )

    # Document create date alt (as integer YYYYMMDD); falls back to the
    # posting date when missing or not numeric after stripping separators.
    doc_create_alt = invoice_data.get('document_create_date_alt')
    doc_create_int = posting_date_int
    if doc_create_alt:
        cleaned = (
            str(doc_create_alt)
            .replace('-', '')
            .replace(' ', '')
            .replace(':', '')[:8]
        )
        try:
            doc_create_int = int(cleaned)
        except ValueError:
            doc_create_int = posting_date_int
    features['document_create_date.1'] = doc_create_int

    # ============================================
    # Aggregates (with robust defaults)
    # ============================================
    # Each entry: (source dict, feature key, converter). The feature key is
    # also the key looked up in the aggregate dict and in DEFAULTS.
    aggregate_specs = (
        # Customer aggregates
        (cust_agg, 'cust_avg_days', safe_float),
        (cust_agg, 'cust_median_days', safe_float),
        (cust_agg, 'cust_std_days', safe_float),
        (cust_agg, 'cust_min_days', safe_int),
        (cust_agg, 'cust_max_days', safe_int),
        (cust_agg, 'cust_invoice_count', safe_int),
        (cust_agg, 'cust_avg_amount', safe_float),
        (cust_agg, 'cust_total_amount', safe_float),
        # Payment terms aggregates
        (pmt_agg, 'payment_terms_avg_days', safe_float),
        (pmt_agg, 'payment_terms_median_days', safe_float),
        (pmt_agg, 'payment_terms_count', safe_int),
        # Business code aggregates
        (biz_agg, 'business_avg_days', safe_float),
        (biz_agg, 'business_median_days', safe_float),
        (biz_agg, 'business_count', safe_int),
    )
    for source, key, convert in aggregate_specs:
        features[key] = convert(source.get(key), DEFAULTS[key])

    # ============================================
    # Interaction Features
    # ============================================

    cust_avg_amt = features['cust_avg_amount']
    if cust_avg_amt > 0:
        features['amount_vs_cust_avg'] = float(amount / cust_avg_amt)
    else:
        # Avoid division by zero / negative ratios: treat as "on average".
        features['amount_vs_cust_avg'] = 1.0

    features['is_large_for_customer'] = (
        1 if amount > cust_avg_amt * 1.5 else 0
    )

    # ============================================
    # Other required fields
    # ============================================

    features['isOpen'] = safe_int(invoice_data.get('is_open'), 1)
    features['posting_id'] = safe_float(invoice_data.get('posting_id'), 1.0)

    return features


def features_to_dataframe(features: Dict) -> pd.DataFrame:
    """
    Convert feature dict to DataFrame with correct column order.

    Must match training feature order exactly. Columns missing from
    *features* are filled with 0.0; keys not listed in COLUMN_ORDER are
    dropped. The input dict is left unmodified.

    Args:
        features: Feature dict, typically from build_features().

    Returns:
        Single-row DataFrame whose columns are exactly COLUMN_ORDER.
    """
    # Build a fresh row instead of back-filling the caller's dict.
    row = {col: features.get(col, 0.0) for col in COLUMN_ORDER}
    return pd.DataFrame([row], columns=COLUMN_ORDER)


if __name__ == "__main__":
    # Test with minimal data
    test_invoice = {
        'posting_date': '2024-01-15',
        'total_open_amount': 50000.0,
        'business_code': 'U001',
        'cust_payment_terms': 'NAH4',
        'invoice_currency': 'USD',
        'document_type': 'RV',
        'business_year': 2024,
        'days_posting_to_due': 15,
        'is_open': 1,
    }

    # Test with no aggregates (should use defaults)
    features = build_features(test_invoice, None, None, None)
    df = features_to_dataframe(features)

    print("✅ Features built successfully:")
    print(f"Shape: {df.shape}")
    print(f"Columns: {len(df.columns)}")
    print("\nSample features:")
    print(df[['cust_avg_days', 'payment_terms_avg_days', 'business_avg_days']].T)