"""
Feature builder that matches ML training pipeline exactly.
Generates features for inference from invoice data + aggregates.
FIXED: Handles None values properly with robust defaults.
"""

import pandas as pd
import numpy as np
from datetime import datetime
from typing import Dict, Optional


# Fallback aggregate values (from training) substituted when an aggregate
# is missing, e.g. for a new customer with no payment history.
# Integer-valued entries are counts / whole-day bounds; the rest are floats.
DEFAULTS = dict(
    # Customer-level aggregates
    cust_avg_days=18.0,
    cust_median_days=15.0,
    cust_std_days=0.0,
    cust_min_days=12,
    cust_max_days=25,
    cust_invoice_count=1,
    cust_avg_amount=30000.0,
    cust_total_amount=30000.0,
    cust_pct_overdue=0.0,
    # Payment-terms-level aggregates
    payment_terms_avg_days=15.0,
    payment_terms_median_days=15.0,
    payment_terms_count=100,
    # Business-code-level aggregates
    business_avg_days=17.0,
    business_median_days=15.0,
    business_count=1000,
)


def safe_float(value, default=0.0):
    """Convert ``value`` to ``float``, falling back to ``default``.

    The fallback is used when ``value`` is ``None``, cannot be converted
    (wrong type, unparseable string, integer too large for a float), or
    converts to NaN. NaN is treated as "missing" so that it is replaced by
    the default instead of leaking into the feature vector.

    Args:
        value: Anything ``float()`` may accept, or ``None``.
        default: Value returned (as ``float``) when ``value`` is unusable.

    Returns:
        The converted float, or ``float(default)``.
    """
    if value is None:
        return float(default)
    try:
        converted = float(value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError: e.g. float(10**400) does not fit in a double.
        return float(default)
    # NaN is the only float that compares unequal to itself.
    if converted != converted:
        return float(default)
    return converted


def safe_int(value, default=0):
    """Convert ``value`` to ``int``, falling back to ``default``.

    Conversion is attempted directly with ``int()`` first. If that fails,
    the value is retried through ``float()`` so that decimal strings such
    as ``"15.0"`` are truncated to ``15`` rather than discarded. ``None``,
    NaN, infinities and anything else unconvertible yield the default.

    Args:
        value: Anything ``int()``/``float()`` may accept, or ``None``.
        default: Value returned (as ``int``) when ``value`` is unusable.

    Returns:
        The converted integer, or ``int(default)``.
    """
    if value is None:
        return int(default)
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-integer string or NaN; OverflowError: +/-inf.
        pass
    try:
        # Second chance for decimal strings like "15.0".
        return int(float(value))
    except (ValueError, TypeError, OverflowError):
        return int(default)


def parse_date(date_str: str) -> datetime:
    """Parse a date value into a ``datetime``.

    ``datetime`` instances are returned unchanged. Any other value is
    converted with ``str()``, stripped of surrounding whitespace and tried
    against ``YYYY-MM-DD HH:MM:SS``, ``YYYY-MM-DD`` and ``YYYYMMDD`` in that
    order. As a last resort ``datetime.fromisoformat`` is tried, which
    additionally covers ISO 8601 variants such as a ``T`` separator or
    fractional seconds.

    Args:
        date_str: Date string (or an already-parsed ``datetime``).

    Returns:
        The parsed ``datetime``.

    Raises:
        ValueError: If no supported format matches.
    """
    if isinstance(date_str, datetime):
        return date_str

    text = str(date_str).strip()

    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y%m%d"):
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue

    # Fallback for ISO 8601 variants the explicit formats do not cover.
    try:
        return datetime.fromisoformat(text)
    except ValueError:
        pass

    raise ValueError(f"Cannot parse date: {date_str}")


def build_features(
    invoice_data: Dict,
    customer_agg: Optional[Dict] = None,
    payment_terms_agg: Optional[Dict] = None,
    business_code_agg: Optional[Dict] = None
) -> Dict:
    """
    Build the feature dictionary for a single invoice.

    Args:
        invoice_data: Invoice details. ``posting_date`` is required; the
            optional keys read are ``business_code``, ``cust_payment_terms``,
            ``invoice_currency``, ``document_type``, ``total_open_amount``,
            ``business_year``, ``days_posting_to_due``,
            ``days_create_to_posting``, ``days_baseline_to_posting``,
            ``document_create_date_alt``, ``is_open`` and ``posting_id``.
        customer_agg: Customer aggregates (or None / empty for defaults).
        payment_terms_agg: Payment terms aggregates (or None for defaults).
        business_code_agg: Business code aggregates (or None for defaults).

    Returns:
        Dict mapping feature name to its numeric value. Missing or
        unconvertible inputs are replaced by the values in ``DEFAULTS`` or
        by the per-field fallbacks below.

    Raises:
        KeyError: If ``invoice_data`` has no ``posting_date`` key.
        ValueError: If ``posting_date`` cannot be parsed.
    """
    # Function-scope import: zlib is only needed for the stable hash below.
    import zlib

    posting_date = parse_date(invoice_data['posting_date'])
    posting_yyyymmdd = int(posting_date.strftime('%Y%m%d'))

    # A missing aggregate dict behaves like an empty one: every lookup
    # falls through to DEFAULTS.
    cust_agg = customer_agg or {}
    pmt_agg = payment_terms_agg or {}
    biz_agg = business_code_agg or {}

    features = {}

    # ============================================
    # Categorical Features (encoded as integers)
    # ============================================

    # Unknown codes map to 0, the same slot as the first known category.
    business_code_map = {'U001': 0, 'U002': 1, 'U005': 2, 'U007': 3, 'U013': 4, 'CA02': 5}
    features['business_code'] = business_code_map.get(
        invoice_data.get('business_code', 'U001'), 0
    )

    # Payment terms are bucketed into 74 slots. The built-in hash() is
    # randomized per process for strings, so it would yield a different
    # bucket on every run; CRC32 is stable across processes and machines.
    # NOTE(review): the training-side encoding must use this same function
    # for the buckets to line up - confirm against the training pipeline.
    payment_terms = invoice_data.get('cust_payment_terms')
    if payment_terms is None:
        payment_terms = 'NAH4'
    features['cust_payment_terms'] = (
        zlib.crc32(str(payment_terms).encode('utf-8')) % 74
    )

    currency_map = {'USD': 0, 'CAD': 1}
    features['invoice_currency'] = currency_map.get(
        invoice_data.get('invoice_currency', 'USD'), 0
    )

    doc_type_map = {'RV': 0, 'AB': 1}
    features['document_type'] = doc_type_map.get(
        invoice_data.get('document_type', 'RV'), 0
    )

    # Amount bucket: 0 small, 1 medium, 2 large, 3 very large.
    amount = safe_float(invoice_data.get('total_open_amount'), 30000.0)
    if amount < 5000:
        amount_cat = 0
    elif amount < 20000:
        amount_cat = 1
    elif amount < 50000:
        amount_cat = 2
    else:
        amount_cat = 3
    features['amount_category'] = amount_cat

    # ============================================
    # Numerical Features
    # ============================================

    # The posting year is the fallback both when the key is absent and when
    # it is present but None/unparseable. The feature name keeps the
    # 'buisness' spelling expected by the column order used downstream.
    features['buisness_year'] = safe_float(
        invoice_data.get('business_year'), posting_date.year
    )
    features['total_open_amount'] = amount
    features['amount_log'] = float(np.log1p(amount))

    # Temporal features derived from the posting date.
    weekday = posting_date.weekday()  # Monday == 0 ... Sunday == 6
    features['posting_year'] = posting_date.year
    features['posting_month'] = posting_date.month
    features['posting_quarter'] = (posting_date.month - 1) // 3 + 1
    features['posting_day'] = posting_date.day
    features['posting_dayofweek'] = weekday
    features['posting_is_weekend'] = 1 if weekday >= 5 else 0
    # Day-of-month thresholds, not exact calendar month boundaries.
    # NOTE(review): presumably mirrors the training definition - confirm.
    features['posting_is_month_end'] = 1 if posting_date.day >= 28 else 0
    features['posting_is_month_start'] = 1 if posting_date.day <= 3 else 0

    # Day differences are supplied by the caller, not computed here.
    features['days_posting_to_due'] = safe_int(invoice_data.get('days_posting_to_due'), 15)
    features['days_create_to_posting'] = safe_int(invoice_data.get('days_create_to_posting'), 0)
    features['days_baseline_to_posting'] = safe_int(invoice_data.get('days_baseline_to_posting'), 0)

    # Alternate document create date as an integer YYYYMMDD; falls back to
    # the posting date when absent or not numeric after stripping separators.
    create_date_int = posting_yyyymmdd
    doc_create_alt = invoice_data.get('document_create_date_alt')
    if doc_create_alt:
        digits = str(doc_create_alt).replace('-', '').replace(' ', '').replace(':', '')[:8]
        try:
            create_date_int = int(digits)
        except ValueError:
            pass  # keep the posting-date fallback
    features['document_create_date.1'] = create_date_int

    # ============================================
    # Aggregates (customer, payment terms, business code)
    # ============================================

    # (source dict, ((feature name, converter), ...)); the feature name is
    # also the key in both the aggregate dict and DEFAULTS.
    aggregate_specs = (
        (cust_agg, (
            ('cust_avg_days', safe_float),
            ('cust_median_days', safe_float),
            ('cust_std_days', safe_float),
            ('cust_min_days', safe_int),
            ('cust_max_days', safe_int),
            ('cust_invoice_count', safe_int),
            ('cust_avg_amount', safe_float),
            ('cust_total_amount', safe_float),
        )),
        (pmt_agg, (
            ('payment_terms_avg_days', safe_float),
            ('payment_terms_median_days', safe_float),
            ('payment_terms_count', safe_int),
        )),
        (biz_agg, (
            ('business_avg_days', safe_float),
            ('business_median_days', safe_float),
            ('business_count', safe_int),
        )),
    )
    for source, fields in aggregate_specs:
        for name, convert in fields:
            features[name] = convert(source.get(name), DEFAULTS[name])

    # ============================================
    # Interaction Features
    # ============================================

    cust_avg_amt = features['cust_avg_amount']
    if cust_avg_amt > 0:
        features['amount_vs_cust_avg'] = float(amount / cust_avg_amt)
    else:
        # Avoid division by zero / negative ratios: treat as "typical".
        features['amount_vs_cust_avg'] = 1.0

    features['is_large_for_customer'] = 1 if amount > cust_avg_amt * 1.5 else 0

    # ============================================
    # Other required fields
    # ============================================

    features['isOpen'] = safe_int(invoice_data.get('is_open'), 1)
    features['posting_id'] = safe_float(invoice_data.get('posting_id'), 1.0)

    return features


def features_to_dataframe(features: Dict) -> pd.DataFrame:
    """
    Convert a feature dict to a single-row DataFrame in a fixed column order.

    Only the columns listed in ``COLUMN_ORDER`` are kept; any other keys in
    ``features`` are dropped. Columns missing from ``features`` are filled
    with ``0.0``. The input dict is not modified.

    Args:
        features: Mapping of feature name to value.

    Returns:
        A one-row DataFrame whose columns are exactly ``COLUMN_ORDER``.
    """

    # Expected column order from training
    COLUMN_ORDER = [
        'business_code', 'buisness_year', 'document_create_date.1',
        'invoice_currency', 'document_type', 'total_open_amount',
        'cust_payment_terms', 'isOpen', 'posting_year', 'posting_month',
        'posting_quarter', 'posting_day', 'posting_dayofweek',
        'posting_is_weekend', 'posting_is_month_end', 'posting_is_month_start',
        'days_posting_to_due', 'days_create_to_posting', 'days_baseline_to_posting',
        'amount_log', 'amount_category', 'cust_avg_days', 'cust_median_days',
        'cust_std_days', 'cust_min_days', 'cust_max_days', 'cust_invoice_count',
        'cust_avg_amount', 'cust_total_amount', 'payment_terms_avg_days',
        'payment_terms_median_days', 'payment_terms_count', 'business_avg_days',
        'business_median_days', 'business_count', 'amount_vs_cust_avg',
        'is_large_for_customer'
    ]

    # Build a fresh row instead of inserting fallbacks into the caller's
    # dict, so the argument is left untouched.
    row = {col: features.get(col, 0.0) for col in COLUMN_ORDER}

    return pd.DataFrame([row], columns=COLUMN_ORDER)


if __name__ == "__main__":
    # Smoke test: build features for a minimal invoice with no aggregates,
    # so every aggregate feature comes from DEFAULTS.
    sample_invoice = dict(
        posting_date='2024-01-15',
        total_open_amount=50000.0,
        business_code='U001',
        cust_payment_terms='NAH4',
        invoice_currency='USD',
        document_type='RV',
        business_year=2024,
        days_posting_to_due=15,
        is_open=1,
    )

    built = build_features(sample_invoice, None, None, None)
    frame = features_to_dataframe(built)

    # One aggregate from each group is enough to eyeball the defaults.
    preview_columns = ['cust_avg_days', 'payment_terms_avg_days', 'business_avg_days']

    print("✅ Features built successfully:")
    print(f"Shape: {frame.shape}")
    print(f"Columns: {len(frame.columns)}")
    print("\nSample features:")
    print(frame[preview_columns].T)