#!/usr/bin/env python3
"""Debug preprocessing pipeline.

Walks one hard-coded sample through the same steps used at inference time
and prints the intermediate state after each step:

1. build a one-row DataFrame from the sample,
2. encode the columns that have an entry in ``le_dict``,
3. scale the numeric columns with the saved scaler,
4. select the columns listed in ``feature_names``, in that order,
5. call ``model.predict_proba`` on the result.

Every step is best-effort: a failure is printed and the script keeps going
so that later steps can still be inspected.
"""
import pickle
import pandas as pd
import numpy as np

SEPARATOR = "\n" + "=" * 70


def _load_pickle(path):
    """Return the object unpickled from *path*."""
    # NOTE(security): pickle.load executes arbitrary code embedded in the
    # file. Only point this script at artifacts you produced yourself.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Load models
model = _load_pickle('models/rf_model.pkl')
scaler = _load_pickle('models/scaler.pkl')
le_dict = _load_pickle('models/le_dict.pkl')
feature_names = _load_pickle('models/feature_names.pkl')

print("Expected feature names:", feature_names)
print("\nLE Dict keys:", list(le_dict.keys()))
print("Scaler n_features:", scaler.n_features_in_)

# Test input
# NOTE(review): the key 'jundice' is kept exactly as written; presumably it
# matches the column name used when the artifacts were trained - confirm
# before "fixing" the spelling.
test_input = {
    'A1_prefer_detail_not_big_picture': 0,
    'A2_must_have_sameness': 0,
    'A3_prefer_reading_systematically': 0,
    'A4_feel_anxious_in_social': 0,
    'A5_prefer_talking_one_to_one': 0,
    'A6_notice_small_changes': 0,
    'A7_trouble_focus_on_changing': 0,
    'A8_often_daydream': 0,
    'A9_focused_on_one_topic': 0,
    'A10_difficult_small_talk': 0,
    'age': 30,
    'gender': 'M',
    'ethnicity': 'White',
    'jundice': 'no',
    'autism_family_member': 'no',
    'country': 'USA',
    'used_app_before': 'no',
    'screening_type': 'Questionnaire',
}

print(SEPARATOR)
print("STEP 1: Create DataFrame")
df = pd.DataFrame([test_input])
print("Columns:", list(df.columns))
print("Shape:", df.shape)

print(SEPARATOR)
print("STEP 2: Encode categorical variables")
df_encoded = df.copy()
for col, encoder in le_dict.items():
    if col not in df_encoded.columns:
        continue
    val = df_encoded[col].values[0]
    print(f" {col}: '{val}' ->", end=" ")
    try:
        df_encoded[col] = encoder.transform([val])[0]
    except Exception as e:
        # Best-effort: the raw value stays in the column so the remaining
        # columns can still be checked.
        print(f"ERROR: {e}")
    else:
        print(f"{df_encoded[col].values[0]} ✓")

print("\nEncoded DataFrame:")
print(df_encoded)

print(SEPARATOR)
print("STEP 3: Scale numeric features")
# 'age' first, then every feature whose name starts with an upper-case 'A'
# (the A1..A10 answers in the sample above).
# NOTE(review): this assumes the scaler was fitted on exactly these columns
# in exactly this order - confirm against the training code.
numeric_cols = ['age'] + [c for c in feature_names if c.startswith('A')]
print("Numeric columns for scaling:", numeric_cols)

# Check if all numeric cols exist
for col in numeric_cols:
    if col not in df_encoded.columns:
        print(f" ERROR: {col} not in DataFrame!")
    else:
        print(f" {col}: {df_encoded[col].values[0]} ✓")

print("\nScaling...")
df_scaled = df_encoded.copy()
scaling_ok = False
try:
    df_scaled[numeric_cols] = scaler.transform(df_encoded[numeric_cols])
except Exception as e:
    print(f"Scaling ERROR: {e}")
    print(" Scaler expects these features:",
          scaler.get_feature_names_out()
          if hasattr(scaler, 'get_feature_names_out') else "N/A")
else:
    scaling_ok = True
    print("Scaling successful ✓")

print(SEPARATOR)
print("STEP 4: Select features in exact order")
print("Required feature order:", feature_names)
# Bound up front so STEP 5 can tell "selection failed" apart from a genuine
# prediction error instead of tripping over an undefined name.
df_final = None
try:
    df_final = df_scaled[feature_names].copy()
except Exception as e:
    print(f"Feature selection ERROR: {e}")
    print(" Available columns:", list(df_scaled.columns))
else:
    print("Feature selection successful ✓")
    print("Final shape:", df_final.shape)
    print("Final columns:", list(df_final.columns))

print(SEPARATOR)
print("STEP 5: Predict")
if df_final is None:
    print("Prediction skipped: feature selection failed in STEP 4")
else:
    if not scaling_ok:
        # The numeric columns still hold their unscaled values here.
        print("WARNING: scaling failed in STEP 3; predicting on unscaled data")
    try:
        pred = model.predict_proba(df_final)[0]
        print("Prediction successful ✓")
        print(f" No Autism: {pred[0]:.2%}")
        print(f" Autism: {pred[1]:.2%}")
    except Exception as e:
        print(f"Prediction ERROR: {e}")