#!/usr/bin/env python3
"""Debug preprocessing pipeline"""

import pickle
import pandas as pd
import numpy as np

# Load the pickled pipeline artifacts from the local models/ directory.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so
# these artifacts must come from a trusted source.
def _load_artifact(path):
    """Unpickle and return the object stored at *path*."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


model = _load_artifact('models/rf_model.pkl')
scaler = _load_artifact('models/scaler.pkl')
le_dict = _load_artifact('models/le_dict.pkl')
feature_names = _load_artifact('models/feature_names.pkl')

# Dump what the artifacts say about the expected input layout.
print("Expected feature names:", feature_names)
print("\nLE Dict keys:", list(le_dict))
print("Scaler n_features:", scaler.n_features_in_)

# Sample record pushed through the pipeline: every A1..A10 item is 0,
# followed by the remaining fields. Key insertion order is kept because
# it becomes the DataFrame column order.
test_input = dict.fromkeys(
    (
        'A1_prefer_detail_not_big_picture',
        'A2_must_have_sameness',
        'A3_prefer_reading_systematically',
        'A4_feel_anxious_in_social',
        'A5_prefer_talking_one_to_one',
        'A6_notice_small_changes',
        'A7_trouble_focus_on_changing',
        'A8_often_daydream',
        'A9_focused_on_one_topic',
        'A10_difficult_small_talk',
    ),
    0,
)
test_input.update({
    'age': 30,
    'gender': 'M',
    'ethnicity': 'White',
    'jundice': 'no',
    'autism_family_member': 'no',
    'country': 'USA',
    'used_app_before': 'no',
    'screening_type': 'Questionnaire',
})

# STEP 1: wrap the single sample dict in a one-row DataFrame.
print(f"\n{'=' * 70}")
print("STEP 1: Create DataFrame")
df = pd.DataFrame(data=[test_input])
print("Columns:", df.columns.tolist())
print("Shape:", df.shape)

# STEP 2: run each stored encoder over its matching column, reporting the
# raw -> encoded mapping (or the error) for every column it touches.
print(f"\n{'=' * 70}")
print("STEP 2: Encode categorical variables")
df_encoded = df.copy()
for column, encoder in le_dict.items():
    # Encoders whose column is absent from the sample are skipped silently.
    if column not in df_encoded.columns:
        continue
    raw = df_encoded[column].values[0]
    print(f"  {column}: '{raw}' ->", end=" ")
    try:
        df_encoded[column] = encoder.transform([raw])[0]
        print(f"{df_encoded[column].values[0]} ✓")
    except Exception as exc:
        # Best-effort: the column keeps its raw value and the run continues.
        print(f"ERROR: {exc}")

print("\nEncoded DataFrame:")
print(df_encoded)

# STEP 3: scale 'age' plus every feature whose name starts with 'A'.
print(f"\n{'=' * 70}")
print("STEP 3: Scale numeric features")
numeric_cols = ['age']
numeric_cols.extend(name for name in feature_names if name.startswith('A'))
print("Numeric columns for scaling:", numeric_cols)

# Report, column by column, whether the frame has what the scaler needs.
for name in numeric_cols:
    if name in df_encoded.columns:
        print(f"  {name}: {df_encoded[name].values[0]} ✓")
    else:
        print(f"  ERROR: {name} not in DataFrame!")

print("\nScaling...")
df_scaled = df_encoded.copy()
try:
    df_scaled[numeric_cols] = scaler.transform(df_encoded[numeric_cols])
    print("Scaling successful ✓")
except Exception as exc:
    # On failure df_scaled stays an unscaled copy; show what the scaler
    # reports about its features to help spot the mismatch.
    print(f"Scaling ERROR: {exc}")
    if hasattr(scaler, 'get_feature_names_out'):
        expected = scaler.get_feature_names_out()
    else:
        expected = "N/A"
    print("  Scaler expects these features:", expected)

# STEP 4: reorder/select the columns exactly as listed in feature_names.
print("\n" + "="*70)
print("STEP 4: Select features in exact order")
print("Required feature order:", feature_names)

# df_final stays None when selection fails so that STEP 5 can skip cleanly
# instead of hitting a NameError that would be misreported as a model error.
df_final = None
try:
    df_final = df_scaled[feature_names].copy()
except Exception as e:
    print(f"Feature selection ERROR: {e}")
    print("  Available columns:", list(df_scaled.columns))
    # Spell out exactly which required columns are absent.
    missing = [c for c in feature_names if c not in df_scaled.columns]
    print("  Missing columns:", missing)
else:
    print("Feature selection successful ✓")
    print("Final shape:", df_final.shape)
    print("Final columns:", list(df_final.columns))

# STEP 5: ask the model for class probabilities of the single sample.
print("\n" + "="*70)
print("STEP 5: Predict")
if df_final is None:
    print("Prediction SKIPPED: feature selection failed in STEP 4")
else:
    try:
        pred = model.predict_proba(df_final)[0]
    except Exception as e:
        print(f"Prediction ERROR: {e}")
    else:
        print("Prediction successful ✓")
        # NOTE(review): assumes the model's class order is
        # [no autism, autism] -- confirm against model.classes_.
        print(f"  No Autism: {pred[0]:.2%}")
        print(f"  Autism: {pred[1]:.2%}")