|
|
""" |
|
|
Feature builder that matches ML training pipeline exactly. |
|
|
Generates features for inference from invoice data + aggregates. |
|
|
FIXED: Handles None values properly with robust defaults. |
|
|
""" |
|
|
|
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
from datetime import datetime |
|
|
from typing import Dict, Optional |
|
|
|
|
|
|
|
|
|
|
|
DEFAULTS = { |
|
|
'cust_avg_days': 18.0, |
|
|
'cust_median_days': 15.0, |
|
|
'cust_std_days': 0.0, |
|
|
'cust_min_days': 12, |
|
|
'cust_max_days': 25, |
|
|
'cust_invoice_count': 1, |
|
|
'cust_avg_amount': 30000.0, |
|
|
'cust_total_amount': 30000.0, |
|
|
'cust_pct_overdue': 0.0, |
|
|
'payment_terms_avg_days': 15.0, |
|
|
'payment_terms_median_days': 15.0, |
|
|
'payment_terms_count': 100, |
|
|
'business_avg_days': 17.0, |
|
|
'business_median_days': 15.0, |
|
|
'business_count': 1000 |
|
|
} |
|
|
|
|
|
|
|
|
def safe_float(value, default=0.0): |
|
|
"""Safely convert to float with default.""" |
|
|
if value is None: |
|
|
return float(default) |
|
|
try: |
|
|
return float(value) |
|
|
except (ValueError, TypeError): |
|
|
return float(default) |
|
|
|
|
|
|
|
|
def safe_int(value, default=0): |
|
|
"""Safely convert to int with default.""" |
|
|
if value is None: |
|
|
return int(default) |
|
|
try: |
|
|
return int(value) |
|
|
except (ValueError, TypeError): |
|
|
return int(default) |
|
|
|
|
|
|
|
|
def parse_date(date_str: str) -> datetime: |
|
|
"""Parse date string to datetime.""" |
|
|
if isinstance(date_str, datetime): |
|
|
return date_str |
|
|
|
|
|
for fmt in ["%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y%m%d"]: |
|
|
try: |
|
|
return datetime.strptime(str(date_str), fmt) |
|
|
except ValueError: |
|
|
continue |
|
|
|
|
|
raise ValueError(f"Cannot parse date: {date_str}") |
|
|
|
|
|
|
|
|
def build_features( |
|
|
invoice_data: Dict, |
|
|
customer_agg: Optional[Dict] = None, |
|
|
payment_terms_agg: Optional[Dict] = None, |
|
|
business_code_agg: Optional[Dict] = None |
|
|
) -> Dict: |
|
|
""" |
|
|
Build feature vector matching ML training pipeline. |
|
|
|
|
|
Args: |
|
|
invoice_data: Invoice details (posting_date, amount, etc.) |
|
|
customer_agg: Customer aggregates from DB (or None for defaults) |
|
|
payment_terms_agg: Payment terms aggregates from DB |
|
|
business_code_agg: Business code aggregates from DB |
|
|
|
|
|
Returns: |
|
|
Dict of features ready for model.predict() |
|
|
""" |
|
|
|
|
|
|
|
|
posting_date = parse_date(invoice_data['posting_date']) |
|
|
|
|
|
|
|
|
cust_agg = customer_agg or {} |
|
|
pmt_agg = payment_terms_agg or {} |
|
|
biz_agg = business_code_agg or {} |
|
|
|
|
|
|
|
|
features = {} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
business_code = invoice_data.get('business_code', 'U001') |
|
|
business_code_map = {'U001': 0, 'U002': 1, 'U005': 2, 'U007': 3, 'U013': 4, 'CA02': 5} |
|
|
features['business_code'] = business_code_map.get(business_code, 0) |
|
|
|
|
|
|
|
|
payment_terms = invoice_data.get('cust_payment_terms', 'NAH4') |
|
|
features['cust_payment_terms'] = abs(hash(payment_terms)) % 74 |
|
|
|
|
|
|
|
|
currency_map = {'USD': 0, 'CAD': 1} |
|
|
features['invoice_currency'] = currency_map.get(invoice_data.get('invoice_currency', 'USD'), 0) |
|
|
|
|
|
|
|
|
doc_type_map = {'RV': 0, 'AB': 1} |
|
|
features['document_type'] = doc_type_map.get(invoice_data.get('document_type', 'RV'), 0) |
|
|
|
|
|
|
|
|
amount = safe_float(invoice_data.get('total_open_amount'), 30000.0) |
|
|
if amount < 5000: |
|
|
amount_cat = 0 |
|
|
elif amount < 20000: |
|
|
amount_cat = 1 |
|
|
elif amount < 50000: |
|
|
amount_cat = 2 |
|
|
else: |
|
|
amount_cat = 3 |
|
|
features['amount_category'] = amount_cat |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
features['buisness_year'] = safe_float(invoice_data.get('business_year', posting_date.year)) |
|
|
features['total_open_amount'] = amount |
|
|
features['amount_log'] = float(np.log1p(amount)) |
|
|
|
|
|
|
|
|
features['posting_year'] = posting_date.year |
|
|
features['posting_month'] = posting_date.month |
|
|
features['posting_quarter'] = (posting_date.month - 1) // 3 + 1 |
|
|
features['posting_day'] = posting_date.day |
|
|
features['posting_dayofweek'] = posting_date.weekday() |
|
|
features['posting_is_weekend'] = 1 if posting_date.weekday() >= 5 else 0 |
|
|
features['posting_is_month_end'] = 1 if posting_date.day >= 28 else 0 |
|
|
features['posting_is_month_start'] = 1 if posting_date.day <= 3 else 0 |
|
|
|
|
|
|
|
|
features['days_posting_to_due'] = safe_int(invoice_data.get('days_posting_to_due'), 15) |
|
|
features['days_create_to_posting'] = safe_int(invoice_data.get('days_create_to_posting'), 0) |
|
|
features['days_baseline_to_posting'] = safe_int(invoice_data.get('days_baseline_to_posting'), 0) |
|
|
|
|
|
|
|
|
doc_create_alt = invoice_data.get('document_create_date_alt') |
|
|
if doc_create_alt: |
|
|
try: |
|
|
cleaned = str(doc_create_alt).replace('-', '').replace(' ', '').replace(':', '')[:8] |
|
|
features['document_create_date.1'] = int(cleaned) |
|
|
except: |
|
|
features['document_create_date.1'] = int(posting_date.strftime('%Y%m%d')) |
|
|
else: |
|
|
features['document_create_date.1'] = int(posting_date.strftime('%Y%m%d')) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
features['cust_avg_days'] = safe_float( |
|
|
cust_agg.get('cust_avg_days'), |
|
|
DEFAULTS['cust_avg_days'] |
|
|
) |
|
|
features['cust_median_days'] = safe_float( |
|
|
cust_agg.get('cust_median_days'), |
|
|
DEFAULTS['cust_median_days'] |
|
|
) |
|
|
features['cust_std_days'] = safe_float( |
|
|
cust_agg.get('cust_std_days'), |
|
|
DEFAULTS['cust_std_days'] |
|
|
) |
|
|
features['cust_min_days'] = safe_int( |
|
|
cust_agg.get('cust_min_days'), |
|
|
DEFAULTS['cust_min_days'] |
|
|
) |
|
|
features['cust_max_days'] = safe_int( |
|
|
cust_agg.get('cust_max_days'), |
|
|
DEFAULTS['cust_max_days'] |
|
|
) |
|
|
features['cust_invoice_count'] = safe_int( |
|
|
cust_agg.get('cust_invoice_count'), |
|
|
DEFAULTS['cust_invoice_count'] |
|
|
) |
|
|
features['cust_avg_amount'] = safe_float( |
|
|
cust_agg.get('cust_avg_amount'), |
|
|
DEFAULTS['cust_avg_amount'] |
|
|
) |
|
|
features['cust_total_amount'] = safe_float( |
|
|
cust_agg.get('cust_total_amount'), |
|
|
DEFAULTS['cust_total_amount'] |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
features['payment_terms_avg_days'] = safe_float( |
|
|
pmt_agg.get('payment_terms_avg_days'), |
|
|
DEFAULTS['payment_terms_avg_days'] |
|
|
) |
|
|
features['payment_terms_median_days'] = safe_float( |
|
|
pmt_agg.get('payment_terms_median_days'), |
|
|
DEFAULTS['payment_terms_median_days'] |
|
|
) |
|
|
features['payment_terms_count'] = safe_int( |
|
|
pmt_agg.get('payment_terms_count'), |
|
|
DEFAULTS['payment_terms_count'] |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
features['business_avg_days'] = safe_float( |
|
|
biz_agg.get('business_avg_days'), |
|
|
DEFAULTS['business_avg_days'] |
|
|
) |
|
|
features['business_median_days'] = safe_float( |
|
|
biz_agg.get('business_median_days'), |
|
|
DEFAULTS['business_median_days'] |
|
|
) |
|
|
features['business_count'] = safe_int( |
|
|
biz_agg.get('business_count'), |
|
|
DEFAULTS['business_count'] |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cust_avg_amt = features['cust_avg_amount'] |
|
|
if cust_avg_amt > 0: |
|
|
features['amount_vs_cust_avg'] = float(amount / cust_avg_amt) |
|
|
else: |
|
|
features['amount_vs_cust_avg'] = 1.0 |
|
|
|
|
|
features['is_large_for_customer'] = 1 if amount > cust_avg_amt * 1.5 else 0 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
features['isOpen'] = safe_int(invoice_data.get('is_open'), 1) |
|
|
features['posting_id'] = safe_float(invoice_data.get('posting_id'), 1.0) |
|
|
|
|
|
return features |
|
|
|
|
|
|
|
|
def features_to_dataframe(features: Dict) -> pd.DataFrame: |
|
|
""" |
|
|
Convert feature dict to DataFrame with correct column order. |
|
|
Must match training feature order exactly. |
|
|
""" |
|
|
|
|
|
|
|
|
COLUMN_ORDER = [ |
|
|
'business_code', 'buisness_year', 'document_create_date.1', |
|
|
'invoice_currency', 'document_type', 'total_open_amount', |
|
|
'cust_payment_terms', 'isOpen', 'posting_year', 'posting_month', |
|
|
'posting_quarter', 'posting_day', 'posting_dayofweek', |
|
|
'posting_is_weekend', 'posting_is_month_end', 'posting_is_month_start', |
|
|
'days_posting_to_due', 'days_create_to_posting', 'days_baseline_to_posting', |
|
|
'amount_log', 'amount_category', 'cust_avg_days', 'cust_median_days', |
|
|
'cust_std_days', 'cust_min_days', 'cust_max_days', 'cust_invoice_count', |
|
|
'cust_avg_amount', 'cust_total_amount', 'payment_terms_avg_days', |
|
|
'payment_terms_median_days', 'payment_terms_count', 'business_avg_days', |
|
|
'business_median_days', 'business_count', 'amount_vs_cust_avg', |
|
|
'is_large_for_customer' |
|
|
] |
|
|
|
|
|
|
|
|
for col in COLUMN_ORDER: |
|
|
if col not in features: |
|
|
features[col] = 0.0 |
|
|
|
|
|
|
|
|
df = pd.DataFrame([features])[COLUMN_ORDER] |
|
|
|
|
|
return df |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
|
|
test_invoice = { |
|
|
'posting_date': '2024-01-15', |
|
|
'total_open_amount': 50000.0, |
|
|
'business_code': 'U001', |
|
|
'cust_payment_terms': 'NAH4', |
|
|
'invoice_currency': 'USD', |
|
|
'document_type': 'RV', |
|
|
'business_year': 2024, |
|
|
'days_posting_to_due': 15, |
|
|
'is_open': 1 |
|
|
} |
|
|
|
|
|
|
|
|
features = build_features(test_invoice, None, None, None) |
|
|
df = features_to_dataframe(features) |
|
|
|
|
|
print("✅ Features built successfully:") |
|
|
print(f"Shape: {df.shape}") |
|
|
print(f"Columns: {len(df.columns)}") |
|
|
print(f"\nSample features:") |
|
|
print(df[['cust_avg_days', 'payment_terms_avg_days', 'business_avg_days']].T) |