Spaces:
Sleeping
Sleeping
| """ | |
| Feature builder that matches ML training pipeline exactly. | |
| Generates features for inference from invoice data + aggregates. | |
| FIXED: Handles None values properly with robust defaults. | |
| """ | |
import math
import zlib
from datetime import datetime
from typing import Dict, Optional

import numpy as np
import pandas as pd
# Default values for new customers (from training).
# build_features() falls back to these whenever an aggregate is missing.
DEFAULTS = dict(
    # Customer-level aggregates
    cust_avg_days=18.0,
    cust_median_days=15.0,
    cust_std_days=0.0,
    cust_min_days=12,
    cust_max_days=25,
    cust_invoice_count=1,
    cust_avg_amount=30000.0,
    cust_total_amount=30000.0,
    cust_pct_overdue=0.0,
    # Payment-terms aggregates
    payment_terms_avg_days=15.0,
    payment_terms_median_days=15.0,
    payment_terms_count=100,
    # Business-code aggregates
    business_avg_days=17.0,
    business_median_days=15.0,
    business_count=1000,
)
def safe_float(value, default=0.0):
    """Convert ``value`` to a finite float, falling back to ``default``.

    Args:
        value: Anything ``float()`` may accept (number, numeric string, ...).
            ``None`` is treated as missing.
        default: Returned (as a float) when ``value`` is missing, cannot be
            converted, or converts to NaN / +inf / -inf.

    Returns:
        A finite float taken from ``value``, otherwise ``float(default)``.
    """
    try:
        number = float(value)
    except (ValueError, TypeError, OverflowError):
        # TypeError covers None and other non-numeric objects; OverflowError
        # covers integers too large to be represented as a float.
        return float(default)
    # NaN / inf are treated as "missing" so they cannot leak into features.
    if not math.isfinite(number):
        return float(default)
    return number
def safe_int(value, default=0):
    """Convert ``value`` to an int, falling back to ``default``.

    Numeric strings with a fractional part (e.g. ``"15.0"``) are accepted and
    truncated toward zero, the same way ``int(15.0)`` behaves.

    Args:
        value: Anything ``int()`` or ``float()`` may accept. ``None`` is
            treated as missing.
        default: Returned (as an int) when ``value`` is missing, cannot be
            converted, or is NaN / infinite.

    Returns:
        The integer form of ``value``, otherwise ``int(default)``.
    """
    if value is None:
        return int(default)
    conversion_errors = (ValueError, TypeError, OverflowError)
    try:
        # Direct conversion first so large integers keep full precision.
        return int(value)
    except conversion_errors:
        pass
    try:
        # Second chance for strings such as "15.0" that int() rejects.
        # NaN raises ValueError and inf raises OverflowError here.
        return int(float(value))
    except conversion_errors:
        return int(default)
def parse_date(date_str: str) -> datetime:
    """Parse a date value into a ``datetime``.

    Args:
        date_str: A ``datetime`` (returned unchanged) or a value whose string
            form, after stripping surrounding whitespace, matches one of the
            supported layouts: ``YYYY-MM-DD HH:MM:SS``, ``YYYY-MM-DD``,
            ``YYYYMMDD``, or the first layout with a ``T`` separator and/or
            fractional seconds.

    Returns:
        The parsed ``datetime``.

    Raises:
        ValueError: If no supported layout matches.
    """
    if isinstance(date_str, datetime):
        return date_str

    text = str(date_str).strip()
    supported_formats = (
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%d",
        "%Y%m%d",
        # ISO-8601 style variants.
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%d %H:%M:%S.%f",
        "%Y-%m-%dT%H:%M:%S.%f",
    )
    for fmt in supported_formats:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    raise ValueError(f"Cannot parse date: {date_str}")
def build_features(
    invoice_data: Dict,
    customer_agg: Optional[Dict] = None,
    payment_terms_agg: Optional[Dict] = None,
    business_code_agg: Optional[Dict] = None
) -> Dict:
    """
    Build the feature dict for a single invoice.

    Args:
        invoice_data: Invoice details. ``posting_date`` is required; every
            other key read here (``total_open_amount``, ``business_code``,
            ``cust_payment_terms``, ``invoice_currency``, ``document_type``,
            ``business_year``, ``days_*``, ``document_create_date_alt``,
            ``is_open``, ``posting_id``) is optional and has a fallback.
        customer_agg: Customer aggregates, or None/empty to use DEFAULTS.
        payment_terms_agg: Payment-terms aggregates, or None/empty.
        business_code_agg: Business-code aggregates, or None/empty.

    Returns:
        Dict mapping feature name to numeric value.

    Raises:
        KeyError: If ``invoice_data`` has no ``posting_date`` key.
        ValueError: If ``posting_date`` cannot be parsed.
    """
    posting_date = parse_date(invoice_data['posting_date'])
    posting_yyyymmdd = int(posting_date.strftime('%Y%m%d'))

    # Missing aggregate dicts become empty so every lookup hits DEFAULTS.
    cust_agg = customer_agg or {}
    pmt_agg = payment_terms_agg or {}
    biz_agg = business_code_agg or {}

    features = {}

    # ============================================
    # Categorical Features (encoded as integers)
    # ============================================
    # Unknown codes fall back to 0, the same code as 'U001'.
    business_code_map = {'U001': 0, 'U002': 1, 'U005': 2, 'U007': 3, 'U013': 4, 'CA02': 5}
    features['business_code'] = business_code_map.get(
        invoice_data.get('business_code', 'U001'), 0
    )

    # Payment terms are bucketed into 74 slots. The built-in hash() of a str
    # is salted per interpreter process, so it would yield a different bucket
    # on every run; CRC32 gives the same bucket everywhere.
    # NOTE(review): confirm how the training pipeline encoded this column;
    # this only guarantees a stable value, not the training-time mapping.
    payment_terms = invoice_data.get('cust_payment_terms')
    if payment_terms is None:
        payment_terms = 'NAH4'
    features['cust_payment_terms'] = zlib.crc32(str(payment_terms).encode('utf-8')) % 74

    currency_map = {'USD': 0, 'CAD': 1}
    features['invoice_currency'] = currency_map.get(invoice_data.get('invoice_currency', 'USD'), 0)

    doc_type_map = {'RV': 0, 'AB': 1}
    features['document_type'] = doc_type_map.get(invoice_data.get('document_type', 'RV'), 0)

    # Amount bucket: 0 small, 1 medium, 2 large, 3 very large.
    amount = safe_float(invoice_data.get('total_open_amount'), 30000.0)
    if amount < 5000:
        amount_cat = 0
    elif amount < 20000:
        amount_cat = 1
    elif amount < 50000:
        amount_cat = 2
    else:
        amount_cat = 3
    features['amount_category'] = amount_cat

    # ============================================
    # Numerical Features
    # ============================================
    # The feature key is spelled 'buisness_year' on purpose: it must agree
    # with COLUMN_ORDER in features_to_dataframe(). A missing, None or
    # unparsable business_year falls back to the posting year.
    features['buisness_year'] = safe_float(invoice_data.get('business_year'), posting_date.year)
    features['total_open_amount'] = amount
    features['amount_log'] = float(np.log1p(amount))

    # Temporal features derived from the posting date.
    features['posting_year'] = posting_date.year
    features['posting_month'] = posting_date.month
    features['posting_quarter'] = (posting_date.month - 1) // 3 + 1
    features['posting_day'] = posting_date.day
    features['posting_dayofweek'] = posting_date.weekday()
    features['posting_is_weekend'] = 1 if posting_date.weekday() >= 5 else 0
    # NOTE(review): month end/start are approximated as day >= 28 / day <= 3;
    # confirm the training pipeline used the same thresholds.
    features['posting_is_month_end'] = 1 if posting_date.day >= 28 else 0
    features['posting_is_month_start'] = 1 if posting_date.day <= 3 else 0

    # Day differences are supplied precomputed by the caller.
    features['days_posting_to_due'] = safe_int(invoice_data.get('days_posting_to_due'), 15)
    features['days_create_to_posting'] = safe_int(invoice_data.get('days_create_to_posting'), 0)
    features['days_baseline_to_posting'] = safe_int(invoice_data.get('days_baseline_to_posting'), 0)

    # Document create date as an integer YYYYMMDD; falls back to the posting
    # date when absent, empty, or not numeric after stripping separators.
    doc_create_value = posting_yyyymmdd
    doc_create_alt = invoice_data.get('document_create_date_alt')
    if doc_create_alt:
        digits = str(doc_create_alt).replace('-', '').replace(' ', '').replace(':', '')[:8]
        try:
            doc_create_value = int(digits)
        except (ValueError, TypeError):
            doc_create_value = posting_yyyymmdd
    features['document_create_date.1'] = doc_create_value

    # ============================================
    # Aggregates (customer, payment terms, business code)
    # ============================================
    # (source dict, feature/DEFAULTS key, converter), in output order.
    aggregate_specs = (
        (cust_agg, 'cust_avg_days', safe_float),
        (cust_agg, 'cust_median_days', safe_float),
        (cust_agg, 'cust_std_days', safe_float),
        (cust_agg, 'cust_min_days', safe_int),
        (cust_agg, 'cust_max_days', safe_int),
        (cust_agg, 'cust_invoice_count', safe_int),
        (cust_agg, 'cust_avg_amount', safe_float),
        (cust_agg, 'cust_total_amount', safe_float),
        (pmt_agg, 'payment_terms_avg_days', safe_float),
        (pmt_agg, 'payment_terms_median_days', safe_float),
        (pmt_agg, 'payment_terms_count', safe_int),
        (biz_agg, 'business_avg_days', safe_float),
        (biz_agg, 'business_median_days', safe_float),
        (biz_agg, 'business_count', safe_int),
    )
    for source, key, convert in aggregate_specs:
        features[key] = convert(source.get(key), DEFAULTS[key])

    # ============================================
    # Interaction Features
    # ============================================
    cust_avg_amt = features['cust_avg_amount']
    # Guard the division; a non-positive customer average yields ratio 1.0.
    if cust_avg_amt > 0:
        features['amount_vs_cust_avg'] = float(amount / cust_avg_amt)
    else:
        features['amount_vs_cust_avg'] = 1.0
    features['is_large_for_customer'] = 1 if amount > cust_avg_amt * 1.5 else 0

    # ============================================
    # Other required fields
    # ============================================
    features['isOpen'] = safe_int(invoice_data.get('is_open'), 1)
    features['posting_id'] = safe_float(invoice_data.get('posting_id'), 1.0)

    return features
def features_to_dataframe(features: Dict) -> pd.DataFrame:
    """
    Convert a feature dict to a one-row DataFrame with a fixed column order.

    Features missing from ``features`` are filled with 0.0; keys that are not
    in the column list are dropped. The input dict is not modified.

    Args:
        features: Mapping of feature name to value, e.g. the result of
            build_features().

    Returns:
        DataFrame with exactly one row and the columns of COLUMN_ORDER.
    """
    # Expected column order from training
    COLUMN_ORDER = [
        'business_code', 'buisness_year', 'document_create_date.1',
        'invoice_currency', 'document_type', 'total_open_amount',
        'cust_payment_terms', 'isOpen', 'posting_year', 'posting_month',
        'posting_quarter', 'posting_day', 'posting_dayofweek',
        'posting_is_weekend', 'posting_is_month_end', 'posting_is_month_start',
        'days_posting_to_due', 'days_create_to_posting', 'days_baseline_to_posting',
        'amount_log', 'amount_category', 'cust_avg_days', 'cust_median_days',
        'cust_std_days', 'cust_min_days', 'cust_max_days', 'cust_invoice_count',
        'cust_avg_amount', 'cust_total_amount', 'payment_terms_avg_days',
        'payment_terms_median_days', 'payment_terms_count', 'business_avg_days',
        'business_median_days', 'business_count', 'amount_vs_cust_avg',
        'is_large_for_customer'
    ]

    # Build a fresh row so the caller's dict is left untouched.
    row = {col: features.get(col, 0.0) for col in COLUMN_ORDER}
    return pd.DataFrame([row], columns=COLUMN_ORDER)
if __name__ == "__main__":
    # Smoke test: a minimal invoice with no aggregates supplied.
    sample_invoice = dict(
        posting_date='2024-01-15',
        total_open_amount=50000.0,
        business_code='U001',
        cust_payment_terms='NAH4',
        invoice_currency='USD',
        document_type='RV',
        business_year=2024,
        days_posting_to_due=15,
        is_open=1,
    )

    # All three aggregate arguments are None, so DEFAULTS should be used.
    built = build_features(
        sample_invoice,
        customer_agg=None,
        payment_terms_agg=None,
        business_code_agg=None,
    )
    frame = features_to_dataframe(built)

    preview_columns = ['cust_avg_days', 'payment_terms_avg_days', 'business_avg_days']
    print("✅ Features built successfully:")
    print(f"Shape: {frame.shape}")
    print(f"Columns: {len(frame.columns)}")
    print("\nSample features:")
    print(frame[preview_columns].T)