apexherbert200 commited on
Commit
a2dbe57
·
0 Parent(s):

Initial clean commit (no runtime data)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .claude/settings.local.json +8 -0
  2. .dockerignore +28 -0
  3. .env.example +17 -0
  4. .gitattributes +35 -0
  5. .gitignore +13 -0
  6. DEPLOYMENT_GUIDE.md +570 -0
  7. Dockerfile +33 -0
  8. README.md +12 -0
  9. credily.egg-info/PKG-INFO +33 -0
  10. credily.egg-info/SOURCES.txt +18 -0
  11. credily.egg-info/dependency_links.txt +1 -0
  12. credily.egg-info/entry_points.txt +2 -0
  13. credily.egg-info/requires.txt +11 -0
  14. credily.egg-info/top_level.txt +1 -0
  15. credily/__init__.py +38 -0
  16. credily/__pycache__/__init__.cpython-314.pyc +0 -0
  17. credily/__pycache__/agnostic_pipeline.cpython-314.pyc +0 -0
  18. credily/__pycache__/analyzer.cpython-314.pyc +0 -0
  19. credily/__pycache__/automl.cpython-314.pyc +0 -0
  20. credily/__pycache__/balancing.cpython-314.pyc +0 -0
  21. credily/__pycache__/cleaning.cpython-314.pyc +0 -0
  22. credily/__pycache__/cli.cpython-314.pyc +0 -0
  23. credily/__pycache__/profiler.cpython-314.pyc +0 -0
  24. credily/__pycache__/reporting.cpython-314.pyc +0 -0
  25. credily/__pycache__/safety.cpython-314.pyc +0 -0
  26. credily/__pycache__/utils.cpython-314.pyc +0 -0
  27. credily/agnostic_pipeline.py +537 -0
  28. credily/analyzer.py +214 -0
  29. credily/api/__init__.py +8 -0
  30. credily/api/__pycache__/__init__.cpython-314.pyc +0 -0
  31. credily/api/__pycache__/database.cpython-314.pyc +0 -0
  32. credily/api/__pycache__/errors.cpython-314.pyc +0 -0
  33. credily/api/__pycache__/main.cpython-314.pyc +0 -0
  34. credily/api/__pycache__/schemas.cpython-314.pyc +0 -0
  35. credily/api/database.py +368 -0
  36. credily/api/errors.py +232 -0
  37. credily/api/main.py +1035 -0
  38. credily/api/schemas.py +229 -0
  39. credily/automl.py +1073 -0
  40. credily/balancing.py +375 -0
  41. credily/cleaning.py +643 -0
  42. credily/cli.py +367 -0
  43. credily/metrics.py +49 -0
  44. credily/model.py +240 -0
  45. credily/preprocessing.py +63 -0
  46. credily/profiler.py +184 -0
  47. credily/reporting.py +257 -0
  48. credily/safety.py +634 -0
  49. credily/utils.py +199 -0
  50. debug_output/model.pkl +3 -0
.claude/settings.local.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(python -c \"from credily.cli import cli; cli\\(\\)\":*)",
5
+ "Bash(pip install:*)"
6
+ ]
7
+ }
8
+ }
.dockerignore ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+
7
+ # Virtual env
8
+ venv/
9
+ .venv/
10
+
11
+ # ML artifacts
12
+ credily_data/
13
+ credily_models/
14
+ credily_output/
15
+ debug_output/
16
+ debug_output_smote/
17
+ *.pkl
18
+ *.zip
19
+
20
+ # Build artifacts
21
+ credily.egg-info/
22
+
23
+ # Git
24
+ .git/
25
+ .gitignore
26
+
27
+ # Env
28
+ .env
.env.example ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Credily Backend Environment Configuration
2
+ # Copy this file to .env and fill in your values
3
+
4
+ # Database Configuration
5
+ # For development: Leave unset to use SQLite (default)
6
+ # For production: Set to your PostgreSQL connection string
7
+ # DATABASE_URL=postgresql://user:password@host:port/database
8
+
9
+ # Production PostgreSQL with PgBouncer (Supabase example)
10
+ # DATABASE_URL=postgresql://postgres.xxxxx:password@aws-1-eu-central-2.pooler.supabase.com:6543/postgres?pgbouncer=true
11
+
12
+ # API Configuration (optional)
13
+ # HOST=0.0.0.0
14
+ # PORT=8000
15
+
16
+ # CORS Origins (comma-separated for production)
17
+ # CORS_ORIGINS=https://your-frontend.com,https://api.your-domain.com
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ credily_data/
2
+ credily_models/
3
+ credily_output/
4
+
5
+ *.db
6
+ *.pkl
7
+ *.zip
8
+
9
+ .env
10
+ venv/
11
+ .venv/
12
+ __pycache__/
13
+ credily.egg-info/
DEPLOYMENT_GUIDE.md ADDED
@@ -0,0 +1,570 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Credily Backend Deployment Guide
2
+
3
+ A comprehensive guide to deploying and using the Credily credit risk prediction backend.
4
+
5
+ ---
6
+
7
+ ## Table of Contents
8
+
9
+ 1. [Prerequisites](#prerequisites)
10
+ 2. [Installation](#installation)
11
+ 3. [Configuration](#configuration)
12
+ 4. [Running the API Server](#running-the-api-server)
13
+ 5. [Testing the Deployment](#testing-the-deployment)
14
+ 6. [API Endpoints](#api-endpoints)
15
+ 7. [Model Training Workflow](#model-training-workflow)
16
+ 8. [Prediction Workflow](#prediction-workflow)
17
+ 9. [Data Requirements](#data-requirements)
18
+ 10. [Troubleshooting](#troubleshooting)
19
+
20
+ ---
21
+
22
+ ## Prerequisites
23
+
24
+ ### System Requirements
25
+ - Python 3.10+ (tested with Python 3.14)
26
+ - 4GB+ RAM recommended
27
+ - Windows/Linux/macOS
28
+
29
+ ### Required Python Packages
30
+ ```
31
+ fastapi
32
+ uvicorn
33
+ pandas
34
+ numpy
35
+ scikit-learn
36
+ joblib
37
+ imbalanced-learn
38
+ xgboost (optional)
39
+ lightgbm (optional)
40
+ ```
41
+
42
+ ---
43
+
44
+ ## Installation
45
+
46
+ ### 1. Clone/Download the Repository
47
+ ```bash
48
+ cd Credily_backend
49
+ ```
50
+
51
+ ### 2. Create Virtual Environment (Recommended)
52
+ ```bash
53
+ python -m venv venv
54
+
55
+ # Windows
56
+ .\venv\Scripts\activate
57
+
58
+ # Linux/macOS
59
+ source venv/bin/activate
60
+ ```
61
+
62
+ ### 3. Install Dependencies
63
+ ```bash
64
+ pip install -r requirements.txt
65
+ ```
66
+
67
+ ### 4. Verify Installation
68
+ ```bash
69
+ python debug_pipeline.py
70
+ ```
71
+
72
+ Expected output:
73
+ ```
74
+ ALL TESTS PASSED - Pipeline ready for deployment!
75
+ ```
76
+
77
+ ---
78
+
79
+ ## Configuration
80
+
81
+ ### Environment Variables
82
+ Create a `.env` file in the project root (see `.env.example`):
83
+
84
+ ```env
85
+ # Database Configuration
86
+ # For development: Leave unset to use SQLite (stored in credily_data/credily.db)
87
+ # For production: Set to your PostgreSQL connection string
88
+ DATABASE_URL=postgresql://user:password@host:port/database
89
+
90
+ # Production PostgreSQL with PgBouncer (Supabase example)
91
+ # DATABASE_URL=postgresql://postgres.xxxxx:password@aws-1-eu-central-2.pooler.supabase.com:6543/postgres?pgbouncer=true
92
+
93
+ # Server settings (optional)
94
+ HOST=0.0.0.0
95
+ PORT=8000
96
+
97
+ # CORS Origins (comma-separated for production)
98
+ # CORS_ORIGINS=https://your-frontend.com
99
+ ```
100
+
101
+ ### Database Configuration
102
+
103
+ The API supports two database backends:
104
+
105
+ | Environment | Database | Configuration |
106
+ |-------------|----------|---------------|
107
+ | Development | SQLite | Default - no config needed |
108
+ | Production | PostgreSQL | Set `DATABASE_URL` environment variable |
109
+
110
+ **SQLite (Development)**
111
+ - Used automatically when `DATABASE_URL` is not set
112
+ - Data stored in `credily_data/credily.db`
113
+ - No additional setup required
114
+
115
+ **PostgreSQL (Production)**
116
+ - Set the `DATABASE_URL` environment variable
117
+ - Supports connection pooling (e.g., PgBouncer)
118
+ - Tables are created automatically on first run
119
+
120
+ ### API Configuration
121
+ The API can be configured via `credily/api/main.py`:
122
+
123
+ | Setting | Default | Description |
124
+ |---------|---------|-------------|
125
+ | `host` | `0.0.0.0` | Server host |
126
+ | `port` | `8000` | Server port |
127
+ | `reload` | `True` | Auto-reload on code changes (dev only) |
128
+
129
+ ---
130
+
131
+ ## Running the API Server
132
+
133
+ ### Development Mode
134
+ ```bash
135
+ cd Credily_backend
136
+ python -m uvicorn credily.api.main:app --reload --host 0.0.0.0 --port 8000
137
+ ```
138
+
139
+ ### Production Mode
140
+ ```bash
141
+ python -m uvicorn credily.api.main:app --host 0.0.0.0 --port 8000 --workers 4
142
+ ```
143
+
144
+ ### Using the CLI
145
+ ```bash
146
+ # Start server via CLI
147
+ python -m credily.cli serve --port 8000
148
+ ```
149
+
150
+ ### Verify Server is Running
151
+ Open browser: http://localhost:8000/docs
152
+
153
+ You should see the Swagger UI with all available endpoints.
154
+
155
+ ---
156
+
157
+ ## Testing the Deployment
158
+
159
+ ### 1. Health Check
160
+ ```bash
161
+ curl http://localhost:8000/health
162
+ ```
163
+
164
+ Expected response:
165
+ ```json
166
+ {"status": "healthy"}
167
+ ```
168
+
169
+ ### 2. Run Debug Tests
170
+ ```bash
171
+ python debug_pipeline.py
172
+ ```
173
+
174
+ ### 3. Test with Sample Data
175
+ ```bash
176
+ curl -X POST "http://localhost:8000/api/profile" \
177
+ -H "Content-Type: application/json" \
178
+ -d '[{"age": 25, "income": 50000, "target": 0}]'
179
+ ```
180
+
181
+ ---
182
+
183
+ ## API Endpoints
184
+
185
+ ### Core Endpoints
186
+
187
+ | Method | Endpoint | Description |
188
+ |--------|----------|-------------|
189
+ | `GET` | `/health` | Health check |
190
+ | `POST` | `/api/train` | Train a new model |
191
+ | `POST` | `/api/predict` | Make predictions |
192
+ | `POST` | `/api/predict/single` | Single record prediction |
193
+ | `POST` | `/api/profile` | Profile dataset |
194
+ | `GET` | `/api/models` | List saved models |
195
+
196
+ ### Training Endpoint
197
+
198
+ **POST `/api/train`**
199
+
200
+ Request body:
201
+ ```json
202
+ {
203
+ "data": [
204
+ {"feature1": 1, "feature2": "a", "target": 0},
205
+ {"feature1": 2, "feature2": "b", "target": 1}
206
+ ],
207
+ "target_column": "target",
208
+ "clean_data": true,
209
+ "clean_mode": "thorough",
210
+ "balance_data": true,
211
+ "balance_method": "smote",
212
+ "calibrate": true,
213
+ "optimize_threshold": true
214
+ }
215
+ ```
216
+
217
+ Response:
218
+ ```json
219
+ {
220
+ "success": true,
221
+ "model_path": "credily_output/model.pkl",
222
+ "download_url": "/download/model_20240101_120000.zip",
223
+ "results": {
224
+ "best_model": "RandomForest",
225
+ "test_auc": 0.85,
226
+ "optimal_threshold": 0.42
227
+ }
228
+ }
229
+ ```
230
+
231
+ ### Prediction Endpoint
232
+
233
+ **POST `/api/predict`**
234
+
235
+ Request body:
236
+ ```json
237
+ {
238
+ "data": [
239
+ {"feature1": 1, "feature2": "a"},
240
+ {"feature1": 2, "feature2": "b"}
241
+ ],
242
+ "model_path": "C:/path/to/model.pkl",
243
+ "include_proba": true,
244
+ "threshold": null,
245
+ "save_results": false
246
+ }
247
+ ```
248
+
249
+ Response:
250
+ ```json
251
+ {
252
+ "success": true,
253
+ "predictions": [
254
+ {"index": 0, "prediction": 0, "probability": 0.23, "risk_level": "low"},
255
+ {"index": 1, "prediction": 1, "probability": 0.78, "risk_level": "high"}
256
+ ],
257
+ "summary": {
258
+ "total_records": 2,
259
+ "predicted_positive": 1,
260
+ "positive_rate": 0.5
261
+ }
262
+ }
263
+ ```
264
+
265
+ ---
266
+
267
+ ## Model Training Workflow
268
+
269
+ ### Step 1: Prepare Your Data
270
+
271
+ Your training data should be a CSV or JSON with:
272
+ - Feature columns (numeric and/or categorical)
273
+ - Target column (binary: 0/1 or string labels)
274
+
275
+ **Example CSV:**
276
+ ```csv
277
+ age,income,employment,education,target
278
+ 25,50000,employed,bachelor,0
279
+ 45,80000,self-employed,master,1
280
+ ```
281
+
282
+ ### Step 2: Data Cleaning (Automatic)
283
+
284
+ The pipeline automatically:
285
+ - Removes ID columns (`id`, `customer_id`, `loan_id`, etc.)
286
+ - Removes unnamed columns
287
+ - Replaces invalid values (`?`, `N/A`, `NULL`) with NaN
288
+ - Handles missing values (creates `_missing` indicator columns)
289
+ - Handles outliers (IQR capping)
290
+ - Removes low-variance features
291
+ - Removes highly correlated features
292
+ - Standardizes categorical values
293
+
294
+ ### Step 3: Train via API
295
+
296
+ ```python
297
+ import requests
298
+ import pandas as pd
299
+
300
+ # Load your data
301
+ df = pd.read_csv('your_data.csv')
302
+ data = df.to_dict(orient='records')
303
+
304
+ # Train model
305
+ response = requests.post(
306
+ 'http://localhost:8000/api/train',
307
+ json={
308
+ 'data': data,
309
+ 'target_column': 'target',
310
+ 'clean_data': True,
311
+ 'balance_data': True,
312
+ 'calibrate': True
313
+ }
314
+ )
315
+
316
+ result = response.json()
317
+ print(f"Model saved to: {result['model_path']}")
318
+ print(f"Test AUC: {result['results']['test_auc']}")
319
+ ```
320
+
321
+ ### Step 4: Download Model
322
+
323
+ The trained model is saved as a `.pkl` file containing:
324
+ - Trained sklearn pipeline
325
+ - Feature names
326
+ - Expected columns (for prediction alignment)
327
+ - Optimal threshold
328
+ - Model metadata
329
+
330
+ ---
331
+
332
+ ## Prediction Workflow
333
+
334
+ ### Step 1: Prepare Prediction Data
335
+
336
+ Your prediction data should have the same features as training data.
337
+
338
+ **Important:**
339
+ - Target column is NOT required
340
+ - ID columns will be automatically ignored
341
+ - Missing columns will be filled with NaN (imputed)
342
+ - Extra columns will be removed
343
+
344
+ ### Step 2: Make Predictions
345
+
346
+ ```python
347
+ import requests
348
+ import pandas as pd
349
+
350
+ # Load prediction data
351
+ df = pd.read_csv('new_data.csv')
352
+ data = df.to_dict(orient='records')
353
+
354
+ # Predict
355
+ response = requests.post(
356
+ 'http://localhost:8000/api/predict',
357
+ json={
358
+ 'data': data,
359
+ 'model_path': 'C:/path/to/model.pkl',
360
+ 'include_proba': True
361
+ }
362
+ )
363
+
364
+ result = response.json()
365
+ for pred in result['predictions']:
366
+ print(f"Record {pred['index']}: {pred['risk_level']} ({pred['probability']:.2%})")
367
+ ```
368
+
369
+ ### Step 3: Interpret Results
370
+
371
+ | Risk Level | Probability Range | Interpretation |
372
+ |------------|------------------|----------------|
373
+ | `very_low` | 0.00 - 0.25 | Low risk of default |
374
+ | `low` | 0.25 - 0.50 | Below average risk |
375
+ | `medium` | 0.50 - 0.75 | Above average risk |
376
+ | `high` | 0.75 - 1.00 | High risk of default |
377
+
378
+ ---
379
+
380
+ ## Data Requirements
381
+
382
+ ### Supported Data Types
383
+
384
+ | Type | Examples | Handling |
385
+ |------|----------|----------|
386
+ | Numeric | `age`, `income`, `score` | StandardScaler + median imputation |
387
+ | Categorical | `employment`, `education` | OneHotEncoder + mode imputation |
388
+ | Binary target | `0/1`, `yes/no`, `+/-` | Auto-converted to 0/1 |
389
+
390
+ ### Columns Automatically Removed
391
+
392
+ The following columns are automatically detected and removed:
393
+
394
+ 1. **ID columns** (by name pattern):
395
+ - `id`, `ID`, `_id`
396
+ - `customer_id`, `user_id`, `account_id`
397
+ - `loan_id`, `application_id`, `transaction_id`
398
+ - `index`, `idx`, `key`, `pk`
399
+ - `uuid`, `guid`
400
+
401
+ 2. **ID columns** (by characteristics):
402
+ - 100% unique values (object type)
403
+ - Sequential integers (auto-increment pattern)
404
+
405
+ 3. **Other:**
406
+ - Unnamed columns (`Unnamed: 0`, etc.)
407
+ - High-missing columns (>50% missing)
408
+ - Low-variance columns (<0.01 variance)
409
+ - Highly correlated columns (>0.95 correlation)
410
+
411
+ ### Missing Value Handling
412
+
413
+ | Column Type | Missing < 5% | Missing 5-50% | Missing > 50% |
414
+ |-------------|--------------|---------------|---------------|
415
+ | Numeric | Median impute | Median impute + `_missing` flag | Drop column |
416
+ | Categorical | Mode impute | Mode impute + `_missing` flag | Drop column |
417
+
418
+ ---
419
+
420
+ ## Troubleshooting
421
+
422
+ ### Common Issues
423
+
424
+ #### 1. "Model not trained" Error
425
+ ```
426
+ ValueError: Model not trained. Call train() first or load a saved model.
427
+ ```
428
+ **Solution:** Ensure you're passing the correct `model_path` to the predict endpoint.
429
+
430
+ #### 2. Missing Columns Warning
431
+ ```
432
+ Column alignment applied: {'missing_columns': ['feature_x'], ...}
433
+ ```
434
+ **This is normal.** Missing columns are automatically filled with NaN and imputed. The model will still make predictions.
435
+
436
+ #### 3. SMOTE Fails
437
+ ```
438
+ Warning: SMOTE failed (...). Using random oversampling instead.
439
+ ```
440
+ **This can happen when:**
441
+ - Minority class has too few samples (< 6)
442
+ - All numeric columns have NaN
443
+
444
+ **Solution:** Use `balance_method='random_oversample'` or increase data size.
445
+
446
+ #### 4. Import Errors
447
+ ```
448
+ ModuleNotFoundError: No module named 'imblearn'
449
+ ```
450
+ **Solution:**
451
+ ```bash
452
+ pip install imbalanced-learn
453
+ ```
454
+
455
+ #### 5. Memory Errors
456
+ ```
457
+ MemoryError: Unable to allocate...
458
+ ```
459
+ **Solution:**
460
+ - Reduce dataset size
461
+ - Use `clean_mode='aggressive'` to drop more columns
462
+ - Increase system RAM
463
+
464
+ ### Debug Commands
465
+
466
+ ```bash
467
+ # Test full pipeline
468
+ python debug_pipeline.py
469
+
470
+ # Check installed packages
471
+ pip list | grep -E "sklearn|pandas|imblearn"
472
+
473
+ # Verify API is running
474
+ curl http://localhost:8000/health
475
+
476
+ # Check model contents
477
+ python -c "import joblib; m = joblib.load('model.pkl'); print(m.keys())"
478
+ ```
479
+
480
+ ### Log Files
481
+
482
+ API logs are printed to stdout. For production, redirect to a file:
483
+ ```bash
484
+ python -m uvicorn credily.api.main:app > api.log 2>&1
485
+ ```
486
+
487
+ ---
488
+
489
+ ## Production Deployment
490
+
491
+ ### Using Docker
492
+
493
+ ```dockerfile
494
+ FROM python:3.11-slim
495
+
496
+ WORKDIR /app
497
+ COPY requirements.txt .
498
+ RUN pip install --no-cache-dir -r requirements.txt
499
+
500
+ COPY . .
501
+
502
+ EXPOSE 8000
503
+ CMD ["uvicorn", "credily.api.main:app", "--host", "0.0.0.0", "--port", "8000"]
504
+ ```
505
+
506
+ Build and run:
507
+ ```bash
508
+ docker build -t credily-backend .
509
+ docker run -p 8000:8000 credily-backend
510
+ ```
511
+
512
+ ### Using Gunicorn (Linux)
513
+
514
+ ```bash
515
+ pip install gunicorn
516
+ gunicorn credily.api.main:app -w 4 -k uvicorn.workers.UvicornWorker -b 0.0.0.0:8000
517
+ ```
518
+
519
+ ### Nginx Reverse Proxy
520
+
521
+ ```nginx
522
+ server {
523
+ listen 80;
524
+ server_name your-domain.com;
525
+
526
+ location / {
527
+ proxy_pass http://127.0.0.1:8000;
528
+ proxy_set_header Host $host;
529
+ proxy_set_header X-Real-IP $remote_addr;
530
+ }
531
+ }
532
+ ```
533
+
534
+ ---
535
+
536
+ ## Quick Reference
537
+
538
+ ### CLI Commands
539
+ ```bash
540
+ # Train model
541
+ python -m credily.cli train --data data.csv --target target
542
+
543
+ # Predict
544
+ python -m credily.cli predict --model model.pkl --data new_data.csv
545
+
546
+ # Start API server
547
+ python -m credily.cli serve --port 8000
548
+ ```
549
+
550
+ ### Python SDK
551
+ ```python
552
+ from credily.automl import CredilyPipeline
553
+
554
+ # Train
555
+ pipeline = CredilyPipeline(target_column='target')
556
+ results = pipeline.train(df)
557
+
558
+ # Load and predict
559
+ pipeline = CredilyPipeline.load('model.pkl')
560
+ predictions = pipeline.predict(new_df, include_proba=True)
561
+ ```
562
+
563
+ ---
564
+
565
+ ## Support
566
+
567
+ For issues and feature requests, please check:
568
+ - Debug tests: `python debug_pipeline.py`
569
+ - Swagger UI (API docs): http://localhost:8000/docs
570
+ - ReDoc: http://localhost:8000/redoc
Dockerfile ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ # System deps for ML + Postgres
4
+ RUN apt-get update && apt-get install -y \
5
+ build-essential \
6
+ gcc \
7
+ g++ \
8
+ libpq-dev \
9
+ curl \
10
+ && rm -rf /var/lib/apt/lists/*
11
+
12
+ ENV PYTHONDONTWRITEBYTECODE=1
13
+ ENV PYTHONUNBUFFERED=1
14
+
15
+ WORKDIR /app
16
+
17
+ # Install deps first (better caching)
18
+ COPY requirements.txt .
19
+ RUN pip install --upgrade pip && pip install -r requirements.txt
20
+
21
+ # Copy app code
22
+ COPY . .
23
+
24
+ # Runtime directories (mounted as volumes)
25
+ RUN mkdir -p credily_data credily_models credily_output
26
+
27
+ # Non-root user (important for prod)
28
+ RUN useradd -m credilyuser && chown -R credilyuser:credilyuser /app
29
+ USER credilyuser
30
+
31
+ EXPOSE 7860
32
+
33
+ CMD ["uvicorn", "credily.api.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "4"]
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Credily Backend Test
3
+ emoji: 📚
4
+ colorFrom: yellow
5
+ colorTo: green
6
+ sdk: docker
7
+ pinned: false
8
+ license: apache-2.0
9
+ short_description: 'A backend test to try out the engine'
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
credily.egg-info/PKG-INFO ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: credily
3
+ Version: 0.1.0
4
+ Summary: Fast, Explainable AutoML for Tabular Data (Finance Focus)
5
+ Author: Your Name
6
+ Author-email: your.email@example.com
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: Intended Audience :: Financial and Insurance Industry
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.8
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Requires-Python: >=3.8
17
+ Requires-Dist: pandas>=1.3.0
18
+ Requires-Dist: numpy>=1.21.0
19
+ Requires-Dist: scikit-learn>=1.0.0
20
+ Requires-Dist: matplotlib>=3.4.0
21
+ Requires-Dist: joblib>=1.0.0
22
+ Requires-Dist: click>=8.0.0
23
+ Provides-Extra: full
24
+ Requires-Dist: xgboost>=1.5.0; extra == "full"
25
+ Requires-Dist: lightgbm>=3.3.0; extra == "full"
26
+ Requires-Dist: imbalanced-learn>=0.9.0; extra == "full"
27
+ Dynamic: author
28
+ Dynamic: author-email
29
+ Dynamic: classifier
30
+ Dynamic: provides-extra
31
+ Dynamic: requires-dist
32
+ Dynamic: requires-python
33
+ Dynamic: summary
credily.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ setup.py
2
+ credily/__init__.py
3
+ credily/analyzer.py
4
+ credily/automl.py
5
+ credily/balancing.py
6
+ credily/cleaning.py
7
+ credily/cli.py
8
+ credily/metrics.py
9
+ credily/model.py
10
+ credily/preprocessing.py
11
+ credily/profiler.py
12
+ credily/reporting.py
13
+ credily.egg-info/PKG-INFO
14
+ credily.egg-info/SOURCES.txt
15
+ credily.egg-info/dependency_links.txt
16
+ credily.egg-info/entry_points.txt
17
+ credily.egg-info/requires.txt
18
+ credily.egg-info/top_level.txt
credily.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
credily.egg-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [console_scripts]
2
+ credily = credily.cli:main
credily.egg-info/requires.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas>=1.3.0
2
+ numpy>=1.21.0
3
+ scikit-learn>=1.0.0
4
+ matplotlib>=3.4.0
5
+ joblib>=1.0.0
6
+ click>=8.0.0
7
+
8
+ [full]
9
+ xgboost>=1.5.0
10
+ lightgbm>=3.3.0
11
+ imbalanced-learn>=0.9.0
credily.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ credily
credily/__init__.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Credily - Fast, Explainable AutoML for Tabular Data (Finance Focus)
3
+
4
+ A command-line tool for automated machine learning that:
5
+ - Profiles your data automatically
6
+ - Infers the ML task type (classification/regression)
7
+ - Preprocesses data (imputation, scaling, encoding)
8
+ - Trains multiple models (Logistic Regression, Random Forest, XGBoost, LightGBM)
9
+ - Selects the best performer using cross-validation
10
+ - Exports model (.pkl) and reports (HTML/JSON)
11
+ """
12
+
13
+ import os
14
+ import sys
15
+ import warnings
16
+
17
+ # Suppress joblib resource tracker warnings on Windows
18
+ if sys.platform == 'win32':
19
+ os.environ.setdefault('LOKY_PICKLER', 'pickle')
20
+ warnings.filterwarnings('ignore', category=UserWarning, module='joblib')
21
+
22
+ from .automl import CredilyPipeline
23
+ from .profiler import DataProfiler
24
+ from .analyzer import BusinessAnalyzer
25
+ from .cleaning import DataCleaner
26
+ from .balancing import DataBalancer
27
+ from .agnostic_pipeline import AgnosticPipeline, QuickPipeline
28
+
29
+ __version__ = '0.1.0'
30
+ __all__ = [
31
+ 'CredilyPipeline',
32
+ 'DataProfiler',
33
+ 'BusinessAnalyzer',
34
+ 'DataCleaner',
35
+ 'DataBalancer',
36
+ 'AgnosticPipeline',
37
+ 'QuickPipeline'
38
+ ]
credily/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (1.41 kB). View file
 
credily/__pycache__/agnostic_pipeline.cpython-314.pyc ADDED
Binary file (27.1 kB). View file
 
credily/__pycache__/analyzer.cpython-314.pyc ADDED
Binary file (8.61 kB). View file
 
credily/__pycache__/automl.cpython-314.pyc ADDED
Binary file (43.7 kB). View file
 
credily/__pycache__/balancing.cpython-314.pyc ADDED
Binary file (22.4 kB). View file
 
credily/__pycache__/cleaning.cpython-314.pyc ADDED
Binary file (35.2 kB). View file
 
credily/__pycache__/cli.cpython-314.pyc ADDED
Binary file (23.6 kB). View file
 
credily/__pycache__/profiler.cpython-314.pyc ADDED
Binary file (9.67 kB). View file
 
credily/__pycache__/reporting.cpython-314.pyc ADDED
Binary file (11.8 kB). View file
 
credily/__pycache__/safety.cpython-314.pyc ADDED
Binary file (28.1 kB). View file
 
credily/__pycache__/utils.cpython-314.pyc ADDED
Binary file (8.19 kB). View file
 
credily/agnostic_pipeline.py ADDED
@@ -0,0 +1,537 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Agnostic ML Preprocessing Pipeline for Credily.
3
+ A flexible, data-agnostic pipeline for preprocessing any dataset for ML tasks.
4
+ """
5
+
6
+ import pandas as pd
7
+ import numpy as np
8
+ from sklearn.preprocessing import LabelEncoder
9
+ from typing import Optional, Callable, Dict, Any, List, Tuple, Union
10
+
11
+
12
class AgnosticPipeline:
    """
    Agnostic Pipeline to preprocess any dataset for ML tasks (binary classification or regression).

    Features:
    - Dynamic target handling (numeric, categorical, or already binary)
    - Automatic missing value handling with indicator flags
    - Automatic encoding for categorical variables
    - Automatic dropping of high-cardinality identifiers
    - Returns X (features) and y (target) ready for modeling

    This pipeline is designed to work with ANY dataset without domain-specific assumptions.

    Typical usage: call fit_transform(df, target_column) once on training data,
    then transform(new_df) on inference data; fitted state (medians, encoders,
    column lists) is stored on the instance.
    """

    def __init__(
        self,
        binary_threshold: Optional[float] = None,
        binary_rule: Optional[Callable[[pd.Series], int]] = None,
        positive_classes: Optional[List[str]] = None,
        task_type: str = 'binary',
        id_uniqueness_threshold: float = 0.9,
        low_cardinality_threshold: int = 20,
        flag_missing: bool = True,
        verbose: bool = True
    ):
        """
        Initialize the AgnosticPipeline.

        Args:
            binary_threshold: Numeric threshold to convert target to binary.
                Values BELOW threshold become 1 (positive/default class).
                Example: binary_threshold=600 for credit score → score<600 = default
            binary_rule: Custom function to convert target to binary.
                Takes a row (pd.Series) and returns 0 or 1.
                Example: lambda row: 1 if row['score'] < 600 else 0
            positive_classes: List of class labels to treat as positive (1) for binary grouping.
                All other classes become negative (0).
                Example: ['Poor', 'Standard'] → these become 1, 'Good' becomes 0
            task_type: 'binary' for binary classification, 'multiclass' for multi-class, 'regression' for numeric
            id_uniqueness_threshold: Columns with >threshold unique ratio are dropped as IDs
            low_cardinality_threshold: Max unique values for one-hot encoding (others get label encoded)
            flag_missing: Whether to create _missing indicator columns
            verbose: Print processing steps

        Note:
            For task_type='binary', at most one conversion setting is applied,
            with precedence binary_rule > binary_threshold > positive_classes
            (see _handle_target).
        """
        self.binary_threshold = binary_threshold
        self.binary_rule = binary_rule
        self.positive_classes = positive_classes
        self.task_type = task_type
        self.id_uniqueness_threshold = id_uniqueness_threshold
        self.low_cardinality_threshold = low_cardinality_threshold
        self.flag_missing = flag_missing
        self.verbose = verbose

        # Storage for fitted state (for transform on new data)
        self.id_cols: List[str] = []                 # identifier columns dropped during fit
        self.num_cols: List[str] = []                # detected numeric feature columns
        self.cat_cols: List[str] = []                # detected categorical feature columns
        self.low_card_cols: List[str] = []           # categorical columns that were one-hot encoded
        self.high_card_cols: List[str] = []          # categorical columns that were label encoded
        self.label_encoders: Dict[str, LabelEncoder] = {}   # per-column encoders for high-cardinality features
        self.target_label_encoder: Optional[LabelEncoder] = None  # For multiclass targets
        self.numeric_medians: Dict[str, float] = {}  # fill values learned during fit
        self.feature_columns: List[str] = []         # final training feature schema (order matters)
        self.class_names: List[str] = []  # For multiclass
        self.n_classes: int = 2  # Number of classes
        self.is_fitted: bool = False
        self.processing_report: Dict[str, Any] = {}

    def _log(self, message: str):
        """Print message if verbose mode is enabled."""
        if self.verbose:
            print(message)
84
+
85
+ def _detect_columns(self, df: pd.DataFrame, target_column: Optional[str] = None) -> pd.DataFrame:
86
+ """
87
+ Detect column types and drop high-cardinality identifier columns.
88
+
89
+ Args:
90
+ df: Input dataframe
91
+ target_column: Name of target column (excluded from ID detection)
92
+
93
+ Returns:
94
+ DataFrame with ID columns removed
95
+ """
96
+ df = df.copy()
97
+ n_rows = len(df)
98
+ nunique = df.nunique()
99
+
100
+ # Identify ID columns (>threshold unique values ratio)
101
+ self.id_cols = []
102
+ for col in df.columns:
103
+ if col == target_column:
104
+ continue
105
+ unique_ratio = nunique[col] / n_rows
106
+ if unique_ratio > self.id_uniqueness_threshold:
107
+ self.id_cols.append(col)
108
+
109
+ if self.id_cols:
110
+ df = df.drop(columns=self.id_cols, errors='ignore')
111
+ self._log(f" [1] Dropped {len(self.id_cols)} ID columns: {self.id_cols}")
112
+
113
+ # Detect column types (excluding target)
114
+ feature_df = df.drop(columns=[target_column], errors='ignore') if target_column else df
115
+ self.num_cols = feature_df.select_dtypes(include=['number']).columns.tolist()
116
+ self.cat_cols = feature_df.select_dtypes(include=['object', 'category']).columns.tolist()
117
+
118
+ self._log(f" [2] Detected {len(self.num_cols)} numeric columns")
119
+ self._log(f" [3] Detected {len(self.cat_cols)} categorical columns")
120
+
121
+ return df
122
+
123
    def _handle_target(
        self,
        df: pd.DataFrame,
        target_column: str
    ) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Create y (target) based on task_type and user settings.

        For task_type='binary' the conversion rule precedence is:
        binary_rule > binary_threshold > positive_classes > auto-map
        (auto-map only applies when the target already has exactly 2 values).

        Args:
            df: Input dataframe with target column
            target_column: Name of target column

        Returns:
            Tuple of (X features dataframe, y target series)

        Raises:
            ValueError: If task_type is unknown, or if a binary task has a
                non-binary target and no conversion setting was provided.
        """
        df = df.copy()

        if self.task_type == 'regression':
            # For regression, keep target as-is (must be numeric)
            y = df[target_column].astype(float)
            self._log(f"   [4] Target '{target_column}' kept as numeric for regression")
            self.n_classes = 0  # Regression has no classes

        elif self.task_type == 'multiclass':
            # Multi-class classification: encode categorical target to integers
            unique_vals = df[target_column].dropna().unique()
            self.n_classes = len(unique_vals)
            self.class_names = sorted([str(v) for v in unique_vals])

            # Create label encoder for target
            self.target_label_encoder = LabelEncoder()
            y = pd.Series(
                self.target_label_encoder.fit_transform(df[target_column].astype(str)),
                index=df.index
            )

            label_map = dict(zip(self.target_label_encoder.classes_, range(len(self.target_label_encoder.classes_))))
            self._log(f"   [4] Target '{target_column}' encoded for multiclass: {label_map}")
            self._log(f"       Number of classes: {self.n_classes}")

        elif self.task_type == 'binary':
            self.n_classes = 2

            if self.binary_rule is not None:
                # Apply custom rule (function takes row, returns 0 or 1)
                y = df.apply(self.binary_rule, axis=1)
                self._log(f"   [4] Target created using custom binary_rule")

            elif self.binary_threshold is not None:
                # Apply threshold: values BELOW threshold = 1 (positive/default)
                # NOTE(review): NaN targets compare False here and silently map to 0 — confirm intended
                y = df[target_column].apply(lambda x: 1 if x < self.binary_threshold else 0)
                self._log(f"   [4] Target '{target_column}' binarized: < {self.binary_threshold} → 1 (positive)")

            elif self.positive_classes is not None:
                # Binary grouping: specified classes become positive (1), all others negative (0)
                y = df[target_column].apply(
                    lambda x: 1 if str(x) in [str(c) for c in self.positive_classes] else 0
                )
                self._log(f"   [4] Target '{target_column}' grouped: {self.positive_classes} → 1 (positive), others → 0")

            else:
                # Check if already binary
                unique_vals = df[target_column].dropna().unique()
                if len(unique_vals) == 2:
                    # Auto-convert to 0/1 (by lexicographic order of string form)
                    sorted_vals = sorted(unique_vals, key=lambda x: str(x))
                    label_map = {sorted_vals[0]: 0, sorted_vals[1]: 1}
                    y = df[target_column].map(label_map)
                    self.class_names = [str(sorted_vals[0]), str(sorted_vals[1])]
                    self._log(f"   [4] Target '{target_column}' auto-mapped: {label_map}")
                else:
                    raise ValueError(
                        f"For binary task with non-binary target, provide binary_threshold, positive_classes, or binary_rule. "
                        f"Target has {len(unique_vals)} unique values: {list(unique_vals)[:5]}"
                    )
        else:
            raise ValueError(f"task_type must be 'binary', 'multiclass', or 'regression', got '{self.task_type}'")

        X = df.drop(columns=[target_column])

        # Log class distribution
        if self.task_type in ['binary', 'multiclass']:
            class_counts = y.value_counts()
            self._log(f"       Class distribution: {dict(class_counts)}")

        return X, y
209
+
210
    def _preprocess_features(self, df: pd.DataFrame, fit: bool = True) -> pd.DataFrame:
        """
        Handle missing values and encode categorical variables.

        In fit mode this learns and stores state (numeric medians, the
        low/high cardinality split, label encoders); in transform mode it
        reuses that stored state. Dummy columns produced by get_dummies can
        differ between fit and transform; transform() re-aligns the column
        set afterwards.

        Args:
            df: Features dataframe (X)
            fit: Whether to fit encoders (True for training, False for inference)

        Returns:
            Preprocessed dataframe
        """
        df = df.copy()
        missing_flags_created = []

        # ===== NUMERIC COLUMNS =====
        for col in self.num_cols:
            if col not in df.columns:
                continue

            # Create missing flag (only when this frame actually has NaNs in the column)
            if self.flag_missing:
                missing_count = df[col].isna().sum()
                if missing_count > 0:
                    df[col + "_missing"] = df[col].isna().astype(int)
                    missing_flags_created.append(col + "_missing")

            # Fill missing with median (learned in fit mode; 0 fallback for unseen columns)
            if fit:
                self.numeric_medians[col] = df[col].median()
            median_val = self.numeric_medians.get(col, 0)
            df[col] = df[col].fillna(median_val)

        if missing_flags_created:
            self._log(f"   [5] Created {len(missing_flags_created)} numeric missing flags")

        # ===== CATEGORICAL COLUMNS =====
        cat_missing_flags = []
        for col in self.cat_cols:
            if col not in df.columns:
                continue

            # Create missing flag
            if self.flag_missing:
                missing_count = df[col].isna().sum()
                if missing_count > 0:
                    df[col + "_missing"] = df[col].isna().astype(int)
                    cat_missing_flags.append(col + "_missing")

            # Fill missing with "Missing" placeholder
            df[col] = df[col].fillna("Missing")

        if cat_missing_flags:
            self._log(f"   [6] Created {len(cat_missing_flags)} categorical missing flags")

        # ===== ENCODING =====
        if fit:
            # Determine low vs high cardinality (stored for transform-time reuse)
            self.low_card_cols = [c for c in self.cat_cols if c in df.columns and df[c].nunique() <= self.low_cardinality_threshold]
            self.high_card_cols = [c for c in self.cat_cols if c in df.columns and df[c].nunique() > self.low_cardinality_threshold]

        # One-hot encode low-cardinality columns (runs in both fit and transform mode)
        if self.low_card_cols:
            existing_low_card = [c for c in self.low_card_cols if c in df.columns]
            if existing_low_card:
                df = pd.get_dummies(df, columns=existing_low_card, drop_first=True, dtype=int)
                self._log(f"   [7] One-hot encoded {len(existing_low_card)} low-cardinality columns")

        # Label encode high-cardinality columns
        if self.high_card_cols:
            for col in self.high_card_cols:
                if col not in df.columns:
                    continue
                if fit:
                    le = LabelEncoder()
                    # Fit on all values including "Missing"
                    df[col] = le.fit_transform(df[col].astype(str))
                    self.label_encoders[col] = le
                else:
                    le = self.label_encoders.get(col)
                    if le:
                        # Handle unseen categories by mapping them to -1
                        df[col] = df[col].astype(str).apply(
                            lambda x: le.transform([x])[0] if x in le.classes_ else -1
                        )
        if self.high_card_cols:
            self._log(f"   [8] Label encoded {len(self.high_card_cols)} high-cardinality columns")

        return df
298
+
299
    def fit_transform(
        self,
        df: pd.DataFrame,
        target_column: str
    ) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Main method to preprocess the dataset for training.

        Pipeline: detect/drop ID columns → build y from the target column →
        impute + encode features → record the feature schema and a report.
        Fitted state is stored on the instance for later transform() calls.

        Args:
            df: Raw dataframe with features and target
            target_column: Name of the target column

        Returns:
            Tuple of (X features, y target) ready for ML modeling
        """
        self._log(f"\n{'='*60}")
        self._log("AGNOSTIC PIPELINE - FIT TRANSFORM")
        self._log(f"{'='*60}")
        self._log(f"Input shape: {df.shape[0]} rows, {df.shape[1]} columns")

        # Reset state so re-fitting the same instance starts clean
        self.is_fitted = False
        self.processing_report = {'input_shape': df.shape}

        # Step 1: Detect columns & drop IDs
        df = self._detect_columns(df, target_column)

        # Step 2: Handle target
        X, y = self._handle_target(df, target_column)

        # Step 3: Preprocess features
        X = self._preprocess_features(X, fit=True)

        # Store feature columns for transform (order is the training schema)
        self.feature_columns = X.columns.tolist()
        self.is_fitted = True

        # Summary
        if self.task_type == 'regression':
            target_info = {'mean': float(y.mean()), 'std': float(y.std())}
        else:
            target_info = y.value_counts().to_dict()

        self.processing_report.update({
            'output_shape': X.shape,
            'task_type': self.task_type,
            'n_classes': self.n_classes,
            'class_names': self.class_names if self.class_names else None,
            'target_distribution': target_info,
            'positive_classes': self.positive_classes,
            'binary_threshold': self.binary_threshold,
            'id_columns_dropped': self.id_cols,
            'numeric_columns': self.num_cols,
            'categorical_columns': self.cat_cols,
            'low_cardinality_encoded': self.low_card_cols,
            'high_cardinality_encoded': self.high_card_cols
        })

        self._log(f"\n{'='*60}")
        self._log("PREPROCESSING COMPLETE")
        self._log(f"   Output shape: {X.shape[0]} rows, {X.shape[1]} features")
        self._log(f"{'='*60}\n")

        return X, y
363
+
364
+ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
365
+ """
366
+ Transform new data using fitted preprocessing.
367
+
368
+ Args:
369
+ df: New dataframe with features only (no target)
370
+
371
+ Returns:
372
+ Preprocessed features dataframe
373
+ """
374
+ if not self.is_fitted:
375
+ raise ValueError("Pipeline not fitted. Call fit_transform() first.")
376
+
377
+ df = df.copy()
378
+
379
+ # Drop ID columns
380
+ df = df.drop(columns=self.id_cols, errors='ignore')
381
+
382
+ # Preprocess features
383
+ df = self._preprocess_features(df, fit=False)
384
+
385
+ # Align columns with training
386
+ for col in self.feature_columns:
387
+ if col not in df.columns:
388
+ df[col] = 0 # Default value for missing columns
389
+
390
+ # Keep only expected columns in correct order
391
+ df = df[self.feature_columns]
392
+
393
+ return df
394
+
395
    def get_report(self) -> Dict[str, Any]:
        """Return the processing report populated by the last fit_transform() call.

        Empty dict until fit_transform() has been run. The same dict object
        is returned (not a copy).
        """
        return self.processing_report
398
+
399
+
400
class QuickPipeline:
    """
    Convenience class that combines AgnosticPipeline with DataCleaner for full preprocessing.

    fit_transform() optionally runs DataCleaner first, then delegates to an
    internal AgnosticPipeline for target handling and feature encoding.
    """

    def __init__(
        self,
        target_column: str,
        binary_threshold: Optional[float] = None,
        binary_rule: Optional[Callable[[pd.Series], int]] = None,
        task_type: str = 'binary',
        clean_data: bool = True,
        clean_mode: str = 'thorough',
        verbose: bool = True
    ):
        """
        Initialize QuickPipeline.

        Args:
            target_column: Name of target column
            binary_threshold: Threshold for binary classification
            binary_rule: Custom rule for binary classification
            task_type: 'binary', 'multiclass', or 'regression' (forwarded to AgnosticPipeline)
            clean_data: Whether to apply DataCleaner first
            clean_mode: Cleaning mode ('basic', 'thorough', 'aggressive')
            verbose: Print processing steps
        """
        self.target_column = target_column
        self.clean_data = clean_data
        self.clean_mode = clean_mode
        self.verbose = verbose

        # Initialize sub-pipelines
        # NOTE(review): positive_classes is not exposed here — callers needing
        # class-grouped binary targets must use AgnosticPipeline directly.
        self.agnostic_pipeline = AgnosticPipeline(
            binary_threshold=binary_threshold,
            binary_rule=binary_rule,
            task_type=task_type,
            verbose=verbose
        )

        # Populated by fit_transform() only when clean_data is True
        self.cleaner = None
        self.cleaning_report = None

    def fit_transform(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Full preprocessing: Clean → AgnosticPipeline.

        Args:
            df: Raw dataframe

        Returns:
            Tuple of (X features, y target)
        """
        # Step 1: Clean data (optional)
        if self.clean_data:
            # Imported lazily so the cleaning dependency is only needed when used
            from .cleaning import DataCleaner
            self.cleaner = DataCleaner(
                target_column=self.target_column,
                clean_mode=self.clean_mode
            )
            df = self.cleaner.clean(df, verbose=self.verbose)
            self.cleaning_report = self.cleaner.get_report()

        # Step 2: Agnostic preprocessing
        X, y = self.agnostic_pipeline.fit_transform(df, self.target_column)

        return X, y

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform new data using the fitted agnostic pipeline."""
        return self.agnostic_pipeline.transform(df)

    def get_full_report(self) -> Dict[str, Any]:
        """Get combined report from both stages.

        'cleaning' is None when clean_data was False or fit_transform()
        has not run yet.
        """
        return {
            'cleaning': self.cleaning_report,
            'preprocessing': self.agnostic_pipeline.get_report()
        }
478
+
479
+
480
+ # ============================================================
481
+ # Example usage
482
+ # ============================================================
483
+
484
if __name__ == "__main__":
    # Demo script exercising the three main usage patterns.
    # Example 1: Basic usage with credit score threshold
    print("=" * 70)
    print("EXAMPLE 1: Credit Score Binary Classification")
    print("=" * 70)

    # Create sample data (random, so outputs differ between runs)
    sample_data = pd.DataFrame({
        'customer_id': range(1000, 1100),  # Will be dropped as ID
        'age': np.random.randint(18, 70, 100),
        'income': np.random.randint(20000, 150000, 100),
        'employment_type': np.random.choice(['Employed', 'Self-Employed', 'Unemployed', None], 100),
        'loan_amount': np.random.randint(5000, 50000, 100),
        'credit_score': np.random.randint(400, 850, 100)  # Target
    })

    # Initialize pipeline: credit score < 600 = default (1)
    pipeline = AgnosticPipeline(
        binary_threshold=600,
        task_type='binary'
    )

    # Preprocess
    X, y = pipeline.fit_transform(sample_data, target_column='credit_score')

    print(f"\nFeatures shape: {X.shape}")
    print(f"Target distribution:\n{y.value_counts()}")
    print(f"\nFeature columns: {X.columns.tolist()}")

    # Example 2: Custom binary rule
    print("\n" + "=" * 70)
    print("EXAMPLE 2: Custom Binary Rule")
    print("=" * 70)

    # Custom rule: default if score < 600 AND income < 50000
    custom_rule = lambda row: 1 if (row['credit_score'] < 600 and row['income'] < 50000) else 0

    pipeline2 = AgnosticPipeline(
        binary_rule=custom_rule,
        task_type='binary'
    )

    X2, y2 = pipeline2.fit_transform(sample_data, target_column='credit_score')
    print(f"\nTarget distribution with custom rule:\n{y2.value_counts()}")

    # Example 3: Regression task
    print("\n" + "=" * 70)
    print("EXAMPLE 3: Regression Task")
    print("=" * 70)

    pipeline3 = AgnosticPipeline(task_type='regression')
    X3, y3 = pipeline3.fit_transform(sample_data, target_column='credit_score')

    print(f"\nTarget stats: mean={y3.mean():.2f}, std={y3.std():.2f}")
credily/analyzer.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Business context analyzer for TabulaML.
3
+ Analyzes model performance in finance-specific contexts.
4
+ """
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from typing import Dict, Any, List
9
+ from sklearn.metrics import precision_score, recall_score, confusion_matrix
10
+
11
+
12
class BusinessAnalyzer:
    """
    Analyzes model performance in business contexts.
    Optimized for finance use cases like credit scoring, fraud detection, etc.

    Each context carries cost/revenue figures for the four confusion-matrix
    outcomes; analyze() sweeps decision thresholds to maximize expected profit
    under those figures.
    """

    # Per-context economics. Amounts are illustrative per-case values in
    # arbitrary currency units — assumed placeholders to be tuned per client.
    CONTEXTS = {
        'credit_scoring': {
            'description': 'Loan default prediction',
            'positive_label': 'default',
            'cost_fp': 100,    # Cost of rejecting good customer (lost revenue)
            'cost_fn': 1000,   # Cost of approving bad customer (default loss)
            'revenue_tp': 50,  # Revenue from correctly rejecting bad customer
            'revenue_tn': 200, # Revenue from correctly approving good customer
        },
        'fraud_detection': {
            'description': 'Transaction fraud detection',
            'positive_label': 'fraud',
            'cost_fp': 10,     # Cost of blocking legitimate transaction
            'cost_fn': 500,    # Cost of missing fraud
            'revenue_tp': 500, # Savings from catching fraud
            'revenue_tn': 0,   # Normal transaction
        },
        'churn_prediction': {
            'description': 'Customer churn prediction',
            'positive_label': 'churn',
            'cost_fp': 50,     # Cost of unnecessary retention effort
            'cost_fn': 300,    # Cost of losing customer
            'revenue_tp': 250, # Value of retained customer
            'revenue_tn': 0,   # No action needed
        },
        'insurance_claims': {
            'description': 'Insurance claims prediction',
            'positive_label': 'claim',
            'cost_fp': 20,     # Cost of extra investigation
            'cost_fn': 1000,   # Cost of missing fraudulent claim
            'revenue_tp': 800, # Savings from detecting bad claim
            'revenue_tn': 0,   # Normal claim processing
        },
        'collections': {
            'description': 'Debt collection prioritization',
            'positive_label': 'will_pay',
            'cost_fp': 30,     # Cost of unnecessary collection effort
            'cost_fn': 200,    # Lost recovery
            'revenue_tp': 150, # Successful recovery
            'revenue_tn': 0,   # No action
        },
    }

    def __init__(self, context: str = 'credit_scoring'):
        """Select the business context whose economics drive the analysis.

        Raises:
            ValueError: If the context name is not in CONTEXTS.
        """
        if context not in self.CONTEXTS:
            raise ValueError(f"Unknown context: {context}. Available: {list(self.CONTEXTS.keys())}")
        self.context = context
        self.config = self.CONTEXTS[context]
66
+
67
+ def analyze(
68
+ self,
69
+ pipeline,
70
+ df: pd.DataFrame,
71
+ target_column: str
72
+ ) -> Dict[str, Any]:
73
+ """
74
+ Analyze model performance in business context.
75
+
76
+ Args:
77
+ pipeline: Trained TabulaMLPipeline
78
+ df: Test dataframe with features and target
79
+ target_column: Name of target column
80
+
81
+ Returns:
82
+ dict: Business analysis report
83
+ """
84
+ X = df.drop(columns=[target_column])
85
+ y_true = df[target_column].values
86
+
87
+ y_proba = pipeline.best_model.predict_proba(X)[:, 1]
88
+
89
+ # Find optimal threshold
90
+ thresholds = np.arange(0.1, 0.9, 0.05)
91
+ best_threshold = 0.5
92
+ best_profit = float('-inf')
93
+
94
+ threshold_analysis = []
95
+ for thresh in thresholds:
96
+ y_pred = (y_proba >= thresh).astype(int)
97
+ profit = self._calculate_profit(y_true, y_pred)
98
+ threshold_analysis.append({
99
+ 'threshold': thresh,
100
+ 'profit': profit,
101
+ 'precision': precision_score(y_true, y_pred, zero_division=0),
102
+ 'recall': recall_score(y_true, y_pred, zero_division=0)
103
+ })
104
+ if profit > best_profit:
105
+ best_profit = profit
106
+ best_threshold = thresh
107
+
108
+ # Calculate metrics at optimal threshold
109
+ y_pred_optimal = (y_proba >= best_threshold).astype(int)
110
+ cm = confusion_matrix(y_true, y_pred_optimal)
111
+ tn, fp, fn, tp = cm.ravel()
112
+
113
+ # Financial calculations
114
+ expected_profit = self._calculate_profit(y_true, y_pred_optimal)
115
+ risk_exposure = fn * self.config['cost_fn']
116
+
117
+ # Generate recommendations
118
+ recommendations = self._generate_recommendations(
119
+ precision_score(y_true, y_pred_optimal, zero_division=0),
120
+ recall_score(y_true, y_pred_optimal, zero_division=0),
121
+ best_threshold
122
+ )
123
+
124
+ return {
125
+ 'context': self.context,
126
+ 'context_description': self.config['description'],
127
+ 'optimal_threshold': best_threshold,
128
+ 'expected_profit': expected_profit,
129
+ 'risk_exposure': risk_exposure,
130
+ 'precision': precision_score(y_true, y_pred_optimal, zero_division=0),
131
+ 'recall': recall_score(y_true, y_pred_optimal, zero_division=0),
132
+ 'confusion_matrix': {
133
+ 'true_negatives': int(tn),
134
+ 'false_positives': int(fp),
135
+ 'false_negatives': int(fn),
136
+ 'true_positives': int(tp)
137
+ },
138
+ 'threshold_analysis': threshold_analysis,
139
+ 'recommendations': recommendations
140
+ }
141
+
142
+ def _calculate_profit(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
143
+ """Calculate expected profit based on confusion matrix."""
144
+ cm = confusion_matrix(y_true, y_pred)
145
+ tn, fp, fn, tp = cm.ravel()
146
+
147
+ profit = (
148
+ tp * self.config['revenue_tp'] +
149
+ tn * self.config['revenue_tn'] -
150
+ fp * self.config['cost_fp'] -
151
+ fn * self.config['cost_fn']
152
+ )
153
+ return profit
154
+
155
+ def _generate_recommendations(
156
+ self,
157
+ precision: float,
158
+ recall: float,
159
+ threshold: float
160
+ ) -> List[str]:
161
+ """Generate business recommendations based on metrics."""
162
+ recommendations = []
163
+
164
+ if self.context == 'credit_scoring':
165
+ if precision < 0.7:
166
+ recommendations.append(
167
+ "Low precision: Consider raising the approval threshold to reduce bad debt"
168
+ )
169
+ if recall < 0.6:
170
+ recommendations.append(
171
+ "Low recall: Many defaulters are being approved. Review underwriting criteria"
172
+ )
173
+ if threshold > 0.6:
174
+ recommendations.append(
175
+ f"High threshold ({threshold:.2f}): May be rejecting too many good applicants"
176
+ )
177
+
178
+ elif self.context == 'fraud_detection':
179
+ if recall < 0.8:
180
+ recommendations.append(
181
+ "Critical: Low fraud detection rate. Lower threshold or add features"
182
+ )
183
+ if precision < 0.5:
184
+ recommendations.append(
185
+ "High false positive rate causing customer friction. Review flagging rules"
186
+ )
187
+
188
+ elif self.context == 'churn_prediction':
189
+ if recall < 0.7:
190
+ recommendations.append(
191
+ "Missing too many churners. Expand retention campaigns"
192
+ )
193
+ if precision < 0.5:
194
+ recommendations.append(
195
+ "Retention budget being wasted on non-churners. Refine targeting"
196
+ )
197
+
198
+ # General recommendations
199
+ if precision > 0.8 and recall > 0.8:
200
+ recommendations.append(
201
+ "Model performing well. Consider A/B testing in production"
202
+ )
203
+
204
+ if not recommendations:
205
+ recommendations.append(
206
+ "Model metrics are within acceptable range for this context"
207
+ )
208
+
209
+ return recommendations
210
+
211
+ @classmethod
212
+ def list_contexts(cls) -> Dict[str, str]:
213
+ """List all available business contexts."""
214
+ return {k: v['description'] for k, v in cls.CONTEXTS.items()}
credily/api/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Credily FastAPI REST API module.
3
+ Exposes ML functionality as HTTP endpoints.
4
+ """
5
+
6
+ from .main import app
7
+
8
+ __all__ = ['app']
credily/api/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (335 Bytes). View file
 
credily/api/__pycache__/database.cpython-314.pyc ADDED
Binary file (20.2 kB). View file
 
credily/api/__pycache__/errors.cpython-314.pyc ADDED
Binary file (13.1 kB). View file
 
credily/api/__pycache__/main.cpython-314.pyc ADDED
Binary file (48.1 kB). View file
 
credily/api/__pycache__/schemas.cpython-314.pyc ADDED
Binary file (18.3 kB). View file
 
credily/api/database.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Database module for storing prediction history and reports.
Supports both SQLite (development) and PostgreSQL (production).
"""

import json
import os
from datetime import datetime
from pathlib import Path
from typing import Optional, List, Dict, Any
from contextlib import contextmanager
from urllib.parse import urlparse  # NOTE(review): appears unused in this module — confirm before removing

# Database configuration: a postgresql:// DATABASE_URL selects PostgreSQL,
# anything else (including unset) falls back to a local SQLite file.
DATABASE_URL = os.environ.get("DATABASE_URL")

# Determine database type
if DATABASE_URL and DATABASE_URL.startswith("postgresql"):
    DB_TYPE = "postgresql"
    try:
        import psycopg2
        from psycopg2.extras import RealDictCursor
    except ImportError:
        # Fail fast at import time with an actionable message
        raise ImportError(
            "psycopg2 is required for PostgreSQL. Install with: pip install psycopg2-binary"
        )
else:
    DB_TYPE = "sqlite"
    import sqlite3
    # Path is relative to the process working directory
    DB_PATH = Path("credily_data") / "credily.db"
32
+
33
def init_db():
    """Initialize the database with required tables.

    Creates three tables if they do not already exist:
    - prediction_sessions: one row per batch-prediction run
    - prediction_results: one row per predicted record, linked to a session
    - training_reports: serialized training/evaluation artifacts

    The DDL is duplicated per backend because type names (REAL vs DOUBLE
    PRECISION) and auto-increment syntax (AUTOINCREMENT vs SERIAL) differ.
    Idempotent: safe to call on every startup.
    """
    if DB_TYPE == "sqlite":
        # Ensure the on-disk directory exists before the first connect
        DB_PATH.parent.mkdir(parents=True, exist_ok=True)

    with get_db() as conn:
        cursor = conn.cursor()

        if DB_TYPE == "postgresql":
            # PostgreSQL syntax
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS prediction_sessions (
                    id TEXT PRIMARY KEY,
                    model_path TEXT NOT NULL,
                    model_name TEXT,
                    threshold_used DOUBLE PRECISION,
                    total_records INTEGER,
                    predicted_positive INTEGER,
                    predicted_negative INTEGER,
                    positive_rate DOUBLE PRECISION,
                    avg_probability DOUBLE PRECISION,
                    created_at TEXT NOT NULL
                )
            """)

            cursor.execute("""
                CREATE TABLE IF NOT EXISTS prediction_results (
                    id SERIAL PRIMARY KEY,
                    session_id TEXT NOT NULL,
                    record_index INTEGER,
                    prediction INTEGER,
                    probability DOUBLE PRECISION,
                    risk_level TEXT,
                    input_data TEXT,
                    FOREIGN KEY (session_id) REFERENCES prediction_sessions(id) ON DELETE CASCADE
                )
            """)

            cursor.execute("""
                CREATE TABLE IF NOT EXISTS training_reports (
                    id TEXT PRIMARY KEY,
                    model_name TEXT,
                    best_model TEXT,
                    best_score DOUBLE PRECISION,
                    test_auc DOUBLE PRECISION,
                    test_pr_auc DOUBLE PRECISION,
                    optimal_threshold DOUBLE PRECISION,
                    model_scores TEXT,
                    classification_report TEXT,
                    confusion_matrix TEXT,
                    feature_importances TEXT,
                    cleaning_report TEXT,
                    balancing_report TEXT,
                    created_at TEXT NOT NULL
                )
            """)
        else:
            # SQLite syntax
            # NOTE(review): unlike the PostgreSQL DDL, this FK has no
            # ON DELETE CASCADE, and SQLite does not enforce foreign keys
            # unless PRAGMA foreign_keys=ON is set per connection — confirm
            # whether cascade deletes are expected on SQLite.
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS prediction_sessions (
                    id TEXT PRIMARY KEY,
                    model_path TEXT NOT NULL,
                    model_name TEXT,
                    threshold_used REAL,
                    total_records INTEGER,
                    predicted_positive INTEGER,
                    predicted_negative INTEGER,
                    positive_rate REAL,
                    avg_probability REAL,
                    created_at TEXT NOT NULL
                )
            """)

            cursor.execute("""
                CREATE TABLE IF NOT EXISTS prediction_results (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    session_id TEXT NOT NULL,
                    record_index INTEGER,
                    prediction INTEGER,
                    probability REAL,
                    risk_level TEXT,
                    input_data TEXT,
                    FOREIGN KEY (session_id) REFERENCES prediction_sessions(id)
                )
            """)

            cursor.execute("""
                CREATE TABLE IF NOT EXISTS training_reports (
                    id TEXT PRIMARY KEY,
                    model_name TEXT,
                    best_model TEXT,
                    best_score REAL,
                    test_auc REAL,
                    test_pr_auc REAL,
                    optimal_threshold REAL,
                    model_scores TEXT,
                    classification_report TEXT,
                    confusion_matrix TEXT,
                    feature_importances TEXT,
                    cleaning_report TEXT,
                    balancing_report TEXT,
                    created_at TEXT NOT NULL
                )
            """)

        conn.commit()
139
+
140
+
141
@contextmanager
def get_db():
    """Yield a backend-appropriate database connection, closing it on exit.

    PostgreSQL connections are opened with autocommit disabled; SQLite
    connections get a Row factory so rows behave like mappings. Callers
    are responsible for commit(); close() happens unconditionally.
    """
    if DB_TYPE == "postgresql":
        conn = psycopg2.connect(DATABASE_URL)
        conn.autocommit = False
    else:
        conn = sqlite3.connect(str(DB_PATH))
        conn.row_factory = sqlite3.Row
    try:
        yield conn
    finally:
        conn.close()
158
+
159
+
160
def _dict_from_row(row, cursor_description=None):
    """Convert a fetched database row into a plain dict."""
    if DB_TYPE != "postgresql":
        # sqlite3.Row supports the mapping protocol directly
        return dict(row)
    if cursor_description:
        # Plain tuple row: pair values with column names from the cursor
        keys = [col[0] for col in cursor_description]
        return dict(zip(keys, row))
    # Best effort when no description is available (e.g. RealDictCursor rows)
    return dict(row) if hasattr(row, 'keys') else row
169
+
170
+
171
def _p(query: str) -> str:
    """Convert SQLite-style ? placeholders to PostgreSQL %s if needed."""
    if DB_TYPE != "postgresql":
        return query
    # Assumes queries contain no literal '?' characters outside placeholders
    return query.replace("?", "%s")
176
+
177
+
178
+ # ============== Prediction Sessions ==============
179
+
180
def save_prediction_session(
    session_id: str,
    model_path: str,
    model_name: Optional[str],
    threshold_used: float,
    total_records: int,
    predicted_positive: int,
    predicted_negative: int,
    positive_rate: float,
    avg_probability: Optional[float] = None
) -> str:
    """Save a prediction session.

    Inserts one summary row into prediction_sessions; created_at is set to
    the current local time in ISO-8601 form.

    Args:
        session_id: Caller-supplied unique id (also the primary key)
        model_path: Filesystem path of the model used
        model_name: Optional display name of the model
        threshold_used: Decision threshold applied to probabilities
        total_records: Number of records scored in the batch
        predicted_positive: Count of positive predictions
        predicted_negative: Count of negative predictions
        positive_rate: predicted_positive / total_records
        avg_probability: Mean predicted probability, if available

    Returns:
        The session_id that was stored.
    """
    with get_db() as conn:
        cursor = conn.cursor()
        cursor.execute(_p("""
            INSERT INTO prediction_sessions
            (id, model_path, model_name, threshold_used, total_records,
             predicted_positive, predicted_negative, positive_rate, avg_probability, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """), (
            session_id, model_path, model_name, threshold_used, total_records,
            predicted_positive, predicted_negative, positive_rate, avg_probability,
            datetime.now().isoformat()
        ))
        conn.commit()
    return session_id
206
+
207
+
208
def save_prediction_results(session_id: str, results: List[Dict[str, Any]]):
    """Save individual prediction results for a session.

    Rows are inserted in a single executemany() batch (one commit) instead
    of one execute() per record, which is both faster and atomic: either
    all rows land or none do.

    Args:
        session_id: Id of the owning prediction session
        results: One dict per record; recognized keys are 'index',
            'prediction', 'probability', 'risk_level', 'input_data'
            (input_data is JSON-serialized for storage).
    """
    query = _p("""
        INSERT INTO prediction_results
        (session_id, record_index, prediction, probability, risk_level, input_data)
        VALUES (?, ?, ?, ?, ?, ?)
    """)
    rows = [
        (
            session_id,
            result.get('index'),
            result.get('prediction'),
            result.get('probability'),
            result.get('risk_level'),
            json.dumps(result.get('input_data', {}))
        )
        for result in results
    ]
    with get_db() as conn:
        cursor = conn.cursor()
        cursor.executemany(query, rows)
        conn.commit()
226
+
227
+
228
def get_prediction_sessions(limit: int = 50, offset: int = 0) -> List[Dict[str, Any]]:
    """Page through prediction-session history, newest first."""
    query = _p("""
        SELECT * FROM prediction_sessions
        ORDER BY created_at DESC
        LIMIT ? OFFSET ?
    """)
    with get_db() as conn:
        cursor = conn.cursor()
        cursor.execute(query, (limit, offset))
        return [_dict_from_row(r, cursor.description) for r in cursor.fetchall()]
239
+
240
+
241
def get_prediction_session(session_id: str) -> Optional[Dict[str, Any]]:
    """Fetch a single prediction session as a dict, or None when absent."""
    with get_db() as conn:
        cursor = conn.cursor()
        cursor.execute(_p("SELECT * FROM prediction_sessions WHERE id = ?"), (session_id,))
        row = cursor.fetchone()
        return _dict_from_row(row, cursor.description) if row else None
250
+
251
+
252
def get_prediction_results(session_id: str) -> List[Dict[str, Any]]:
    """Return all prediction rows of a session, ordered by record index."""
    query = _p("""
        SELECT * FROM prediction_results
        WHERE session_id = ?
        ORDER BY record_index
    """)
    with get_db() as conn:
        cursor = conn.cursor()
        cursor.execute(query, (session_id,))
        out = []
        for raw in cursor.fetchall():
            record = _dict_from_row(raw, cursor.description)
            # input_data is persisted as a JSON string; decode it for callers.
            if record.get('input_data'):
                record['input_data'] = json.loads(record['input_data'])
            out.append(record)
        return out
269
+
270
+
271
def delete_prediction_session(session_id: str) -> bool:
    """Remove a session and its result rows; True if the session existed.

    Children are deleted first, then the parent; ``rowcount`` reflects the
    final DELETE (the session row itself).
    """
    with get_db() as conn:
        cursor = conn.cursor()
        cursor.execute(_p("DELETE FROM prediction_results WHERE session_id = ?"), (session_id,))
        cursor.execute(_p("DELETE FROM prediction_sessions WHERE id = ?"), (session_id,))
        conn.commit()
        removed = cursor.rowcount > 0
    return removed
279
+
280
+
281
+ # ============== Training Reports ==============
282
+
283
def save_training_report(
    report_id: str,
    model_name: str,
    results: Dict[str, Any]
) -> str:
    """Persist a training report row.

    Nested result structures (scores, reports, matrices) are serialized to
    JSON text columns; scalar metrics are stored directly.
    """
    params = (
        report_id,
        model_name,
        results.get('best_model'),
        results.get('best_score'),
        results.get('test_auc'),
        results.get('test_pr_auc'),
        results.get('optimal_threshold'),
        json.dumps(results.get('model_scores', {})),
        json.dumps(results.get('classification_report', {})),
        json.dumps(results.get('confusion_matrix', [])),
        json.dumps(results.get('feature_importances', {})),
        json.dumps(results.get('cleaning_report')),
        json.dumps(results.get('balancing_report')),
        datetime.now().isoformat(),
    )
    sql = _p("""
        INSERT INTO training_reports
        (id, model_name, best_model, best_score, test_auc, test_pr_auc,
         optimal_threshold, model_scores, classification_report, confusion_matrix,
         feature_importances, cleaning_report, balancing_report, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """)
    with get_db() as conn:
        conn.cursor().execute(sql, params)
        conn.commit()
    return report_id
315
+
316
+
317
def get_training_reports(limit: int = 50, offset: int = 0) -> List[Dict[str, Any]]:
    """Page through training-report summaries (scalar columns only), newest first."""
    query = _p("""
        SELECT id, model_name, best_model, best_score, test_auc,
               test_pr_auc, optimal_threshold, created_at
        FROM training_reports
        ORDER BY created_at DESC
        LIMIT ? OFFSET ?
    """)
    with get_db() as conn:
        cursor = conn.cursor()
        cursor.execute(query, (limit, offset))
        return [_dict_from_row(r, cursor.description) for r in cursor.fetchall()]
330
+
331
+
332
def get_training_report(report_id: str) -> Optional[Dict[str, Any]]:
    """Fetch one training report with its JSON text columns decoded."""
    with get_db() as conn:
        cursor = conn.cursor()
        cursor.execute(_p("SELECT * FROM training_reports WHERE id = ?"), (report_id,))
        row = cursor.fetchone()
        if row is None:
            return None
        report = _dict_from_row(row, cursor.description)
        # These columns are stored as JSON strings by save_training_report().
        json_fields = ('model_scores', 'classification_report', 'confusion_matrix',
                       'feature_importances', 'cleaning_report', 'balancing_report')
        for name in json_fields:
            if report.get(name):
                report[name] = json.loads(report[name])
        return report
347
+
348
+
349
def delete_training_report(report_id: str) -> bool:
    """Delete one training report; True when a row was actually removed."""
    with get_db() as conn:
        cursor = conn.cursor()
        cursor.execute(_p("DELETE FROM training_reports WHERE id = ?"), (report_id,))
        conn.commit()
        removed = cursor.rowcount > 0
    return removed
356
+
357
+
358
def get_db_info() -> Dict[str, Any]:
    """Summarize the active database backend for health checks and debugging."""
    # Only a truncated URL is exposed.
    # NOTE(review): the first 30 chars of a DSN can still contain
    # credentials (user:pass@host) — confirm this is acceptable to expose.
    truncated_url = DATABASE_URL[:30] + "..." if DATABASE_URL else None
    return {
        "type": DB_TYPE,
        "url": truncated_url,
        "path": str(DB_PATH) if DB_TYPE == "sqlite" else None,
    }
365
+
366
+
367
# Initialize database on module import.
# NOTE(review): import-time side effect — init_db() runs whenever this
# module is imported (e.g. by test collection); confirm that is intended.
init_db()
credily/api/errors.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Error handling module for Credily API.
3
+ Provides user-friendly error messages while logging detailed errors for developers.
4
+ """
5
+
6
+ import logging
7
+ import traceback
8
+ from typing import Optional, Dict, Any
9
+ from fastapi import HTTPException
10
+
11
+ # Configure logging for developer errors
12
+ logging.basicConfig(
13
+ level=logging.INFO,
14
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
15
+ )
16
+ logger = logging.getLogger('credily.api')
17
+
18
+
19
class UserFriendlyError(Exception):
    """Exception carrying a user-facing message plus optional developer detail.

    Attributes:
        user_message: Safe-to-display message for API clients.
        detail: Optional technical detail intended for logs, never for users.
        status_code: HTTP status to respond with (default 400).
    """

    # Fix: `detail` defaulted to None but was annotated as plain `str`.
    def __init__(self, user_message: str, detail: Optional[str] = None, status_code: int = 400):
        self.user_message = user_message
        self.detail = detail
        self.status_code = status_code
        super().__init__(user_message)
27
+
28
+
29
# Error message mappings for common exceptions.
# Matching is a case-insensitive substring search over str(error) — see
# get_user_friendly_message(); dict insertion order decides which pattern
# wins when several would match.
ERROR_MAPPINGS = {
    # Data errors
    "No data provided": "Please provide data to process. The data field cannot be empty.",
    "Target column": "The target column was not found in your data. Please check the column name.",
    "not found": "The requested resource was not found. Please check the path and try again.",

    # Model errors
    "Model not trained": "No trained model available. Please train a model first or provide a valid model path.",
    "Failed to load model": "Unable to load the model file. Please ensure the file is valid and not corrupted.",
    "model.pkl": "Invalid model file format. Please provide a valid Credily model file.",

    # Data quality errors
    "Binary classification requires 2 classes": "Your data must have exactly 2 classes in the target column for binary classification.",
    "less than 2 classes": "Your target column has only one class. Binary classification requires at least 2 different classes.",
    "more than 2 classes": "Your target column has more than 2 classes. This tool currently supports binary classification only.",

    # File errors
    "not found at": "The specified file could not be found. Please check the path and try again.",
    "Permission denied": "Unable to access the file. Please check file permissions.",

    # Memory errors
    "MemoryError": "The dataset is too large to process. Please try with a smaller dataset or contact support.",

    # Connection errors
    "Connection": "Unable to connect to the service. Please try again later.",

    # Validation errors
    "validation error": "Invalid data format. Please check your input data matches the required format.",
}
59
+
60
+
61
def get_user_friendly_message(error: Exception) -> str:
    """
    Translate a technical exception into a message safe to show end users.

    Resolution order:
      1. Substring patterns in ERROR_MAPPINGS (first match wins).
      2. Well-known exception types (FileNotFoundError, ValueError, ...).
      3. Keyword checks ("memory", "timeout").
      4. A generic fallback.

    Fix over the original: removed the unused local ``error_type``.

    Args:
        error: The exception that occurred.

    Returns:
        A user-friendly error message.
    """
    error_str = str(error).lower()

    # 1. Pattern table lookup (insertion order decides precedence).
    for pattern, friendly_message in ERROR_MAPPINGS.items():
        if pattern.lower() in error_str:
            return friendly_message

    # 2. Specific exception types.
    if isinstance(error, FileNotFoundError):
        return "The requested file was not found. Please check the path and try again."

    if isinstance(error, PermissionError):
        return "Unable to access the file due to permission restrictions."

    if isinstance(error, ValueError):
        # Refine common ValueError phrasings before the generic message.
        if "column" in error_str:
            return "There was an issue with the data columns. Please check your data format."
        if "shape" in error_str:
            return "The data dimensions are incorrect. Please ensure your data is properly formatted."
        if "dtype" in error_str or "type" in error_str:
            return "There was a data type mismatch. Please ensure all values are in the correct format."
        return "Invalid value provided. Please check your input data."

    if isinstance(error, KeyError):
        return "A required field is missing from your data. Please check the data format."

    if isinstance(error, TypeError):
        return "Invalid data type provided. Please check your input format."

    # 3. Keyword fallbacks.
    if "memory" in error_str:
        return "The operation requires more memory than available. Please try with a smaller dataset."

    if "timeout" in error_str:
        return "The operation timed out. Please try again with a smaller dataset or later."

    # 4. Default message for unknown errors.
    return "An unexpected error occurred. Please try again or contact support if the problem persists."
110
+
111
+
112
def handle_api_error(
    error: Exception,
    operation: str = "operation",
    context: Optional[Dict[str, Any]] = None
) -> HTTPException:
    """
    Log an exception in developer detail and build a client-safe HTTPException.

    Args:
        error: The exception that occurred.
        operation: Description of what operation was being performed.
        context: Optional additional context for logging.

    Returns:
        HTTPException carrying a user-friendly message.
    """
    if isinstance(error, UserFriendlyError):
        # Already curated: trust its message and status code.
        user_message = error.user_message
        status_code = error.status_code
    elif isinstance(error, HTTPException):
        # Keep the status, but soften the detail text if it is technical.
        user_message = get_user_friendly_message(Exception(error.detail))
        status_code = error.status_code
    else:
        user_message = get_user_friendly_message(error)
        # Client-input error types map to 400; everything else is a server fault.
        status_code = 400 if isinstance(error, (ValueError, KeyError, TypeError)) else 500

    # Detailed record for developers; the client only sees user_message.
    logger.error(
        f"Error during {operation}: {type(error).__name__}: {str(error)}",
        extra={'context': context or {}}
    )
    logger.debug(f"Full traceback:\n{traceback.format_exc()}")

    return HTTPException(status_code=status_code, detail=user_message)
148
+
149
+
150
def log_warning(message: str, context: Optional[Dict[str, Any]] = None):
    """Emit a warning through the module logger, attaching optional context."""
    logger.warning(message, extra={'context': {} if context is None else context})
153
+
154
+
155
def log_info(message: str, context: Optional[Dict[str, Any]] = None):
    """Emit an info message through the module logger, attaching optional context."""
    logger.info(message, extra={'context': {} if context is None else context})
158
+
159
+
160
# Pre-defined user-friendly error responses
class APIErrors:
    """Factory methods producing common, pre-worded HTTPException responses."""

    @staticmethod
    def no_data() -> HTTPException:
        """400: request carried an empty data payload."""
        return HTTPException(
            status_code=400,
            detail="Please provide data to process. The data field cannot be empty."
        )

    @staticmethod
    def model_not_found(path: Optional[str] = None) -> HTTPException:
        """404: model file missing; the path itself is not echoed to clients."""
        msg = "Model not found."
        if path:
            msg = "Model not found at the specified location. Please check the path and try again."
        return HTTPException(status_code=404, detail=msg)

    @staticmethod
    def invalid_model() -> HTTPException:
        """400: uploaded/referenced file is not a usable Credily model."""
        return HTTPException(
            status_code=400,
            detail="Invalid model file. Please ensure you're using a valid Credily model."
        )

    @staticmethod
    def target_not_found(column: str) -> HTTPException:
        """400: the named target column is absent from the supplied data."""
        return HTTPException(
            status_code=400,
            detail=f"Target column '{column}' not found in your data. Please check the column name and try again."
        )

    @staticmethod
    def insufficient_classes() -> HTTPException:
        """400: target column does not contain exactly two classes."""
        return HTTPException(
            status_code=400,
            detail="Your data must have exactly 2 classes for binary classification. Please check your target column."
        )

    @staticmethod
    def training_failed() -> HTTPException:
        """500: model training did not complete."""
        return HTTPException(
            status_code=500,
            detail="Model training failed. Please check your data format and try again."
        )

    @staticmethod
    def prediction_failed() -> HTTPException:
        """500: inference failed, typically from a train/predict schema mismatch."""
        return HTTPException(
            status_code=500,
            detail="Prediction failed. Please ensure your data matches the format used for training."
        )

    @staticmethod
    def file_not_found(filename: Optional[str] = None) -> HTTPException:
        """404: a served artifact is gone (expired or cleaned up).

        Bug fix: the original interpolated a literal placeholder instead of
        the supplied ``filename``, so the message never named the file.
        """
        msg = "The requested file was not found."
        if filename:
            msg = f"File '{filename}' not found. It may have expired or been removed."
        return HTTPException(status_code=404, detail=msg)

    @staticmethod
    def invalid_data_format() -> HTTPException:
        """400: payload is not a list of JSON records."""
        return HTTPException(
            status_code=400,
            detail="Invalid data format. Please provide data as a list of records (JSON objects)."
        )

    @staticmethod
    def server_error() -> HTTPException:
        """500: generic internal failure."""
        return HTTPException(
            status_code=500,
            detail="An internal error occurred. Please try again later."
        )
+ )
credily/api/main.py ADDED
@@ -0,0 +1,1035 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI application for Credily Credit Scoring API.
3
+ """
4
+
5
+ import os
6
+ import io
7
+ import uuid
8
+ import json
9
+ import shutil
10
+ import zipfile
11
+ import tempfile
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from typing import Optional
15
+
16
+ import numpy as np
17
+ import pandas as pd
18
+ from fastapi import FastAPI, HTTPException, Query, File, UploadFile, Form
19
+ from fastapi.middleware.cors import CORSMiddleware
20
+ from fastapi.responses import StreamingResponse
21
+
22
+ from .schemas import (
23
+ TrainRequest, TrainResponse, TrainConfig,
24
+ PredictRequest, PredictResponse, PredictionResult,
25
+ SinglePredictRequest,
26
+ ProfileRequest, ProfileResponse, ColumnProfile,
27
+ PredictionHistoryResponse, PredictionSessionInfo, PredictionSessionDetailResponse,
28
+ TrainingReportsResponse, TrainingReportInfo, TrainingReportDetailResponse,
29
+ HealthResponse, SafetyReportSchema
30
+ )
31
+ from .database import (
32
+ init_db, save_prediction_session, save_prediction_results,
33
+ get_prediction_sessions, get_prediction_session, get_prediction_results,
34
+ delete_prediction_session, save_training_report, get_training_reports,
35
+ get_training_report, delete_training_report, get_db_info, DB_TYPE
36
+ )
37
+ from .errors import handle_api_error, APIErrors, logger, log_info
38
+ from ..automl import CredilyPipeline
39
+ from ..profiler import DataProfiler
40
+ from .. import __version__
41
+
42
+
43
# ============== App Configuration ==============

app = FastAPI(
    title="Credily Credit Scoring API",
    description="REST API for AI-powered credit scoring and risk assessment",
    version=__version__,
    docs_url="/docs",
    redoc_url="/redoc"
)

# CORS configuration - allow frontend to communicate.
# An explicit origin allow-list is required here because
# allow_credentials=True is incompatible with a wildcard origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        # Local development servers
        "http://localhost:5173",
        "http://localhost:8080",
        "http://localhost:8081",
        "http://localhost:8082",
        "http://localhost:3000",
        "http://127.0.0.1:5173",
        "http://127.0.0.1:3000",
        "http://127.0.0.1:8080",
        "http://127.0.0.1:8081",
        "http://127.0.0.1:8082",
        # Network IP access
        "http://172.20.10.3:8080",
        "http://172.20.10.3:5173",
        "http://172.20.10.3:3000",
        # Production frontend URLs (Lovable)
        "https://id-preview--449e5a89-ee15-4a86-a93c-cf19ebb9c17e.lovable.app",
        "https://credily-credit-scoring-ai.lovable.app",
        "https://credily-six.vercel.app",
    ],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Temporary storage for model downloads (cleaned up after download)
TEMP_MODELS_DIR = Path(tempfile.gettempdir()) / "credily_temp_models"
TEMP_MODELS_DIR.mkdir(exist_ok=True)

# Storage for uploaded models (model_id -> model_path mapping)
UPLOADED_MODELS_DIR = TEMP_MODELS_DIR / "uploaded_models"
UPLOADED_MODELS_DIR.mkdir(exist_ok=True)
# In-memory registry of uploaded models.
# NOTE(review): this dict is process-local and lost on restart, leaving the
# uploaded files on disk orphaned — confirm the intended cleanup strategy.
uploaded_models: dict[str, str] = {}  # model_id -> model.pkl path
89
+
90
+
91
+ # ============== Helper Functions ==============
92
+
93
def convert_numpy_types(obj):
    """Recursively convert numpy scalars/arrays into plain Python values.

    Dicts and lists are rebuilt element-wise; ndarrays become lists; numpy
    ints/floats/bools become their builtin equivalents; anything else is
    returned untouched.
    """
    if isinstance(obj, dict):
        return {key: convert_numpy_types(val) for key, val in obj.items()}
    if isinstance(obj, list):
        return [convert_numpy_types(elem) for elem in obj]
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, (np.int64, np.int32, np.int16, np.int8)):
        return int(obj)
    if isinstance(obj, (np.float64, np.float32, np.float16)):
        return float(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    return obj
+ return obj
109
+
110
+
111
def classify_risk(probability: float, threshold: float) -> str:
    """Bucket a predicted probability relative to the decision threshold.

    >= threshold + 0.2  -> "high"
    >= threshold        -> "medium"
    >= threshold - 0.15 -> "low"
    otherwise           -> "very_low"
    """
    bands = (
        (threshold + 0.2, "high"),
        (threshold, "medium"),
        (threshold - 0.15, "low"),
    )
    for cutoff, label in bands:
        if probability >= cutoff:
            return label
    return "very_low"
121
+
122
+
123
def create_model_zip(model_dir: Path, model_name: str) -> Path:
    """Bundle every file under ``model_dir`` into TEMP_MODELS_DIR/<name>.zip.

    Archive entries are stored relative to ``model_dir`` so unzipping
    recreates the original directory layout.
    """
    zip_path = TEMP_MODELS_DIR / f"{model_name}.zip"
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        for entry in model_dir.rglob('*'):
            if entry.is_file():
                archive.write(entry, entry.relative_to(model_dir))
    return zip_path
135
+
136
+
137
def load_model_from_path(model_path: str) -> CredilyPipeline:
    """Load a CredilyPipeline from a .pkl path or a .zip containing model.pkl.

    Raises:
        HTTPException 404 when the path does not exist, 400 when the zip is
        invalid, contains no model.pkl, or the pickle fails to load.

    NOTE(review): for zip inputs the extraction directory is only removed on
    the error paths; after a successful load it stays on disk — confirm
    whether the pipeline still reads from it or whether this leaks temp dirs.
    """
    path = Path(model_path)

    if not path.exists():
        logger.warning(f"Model file not found: {model_path}")
        # APIErrors.model_not_found returns an HTTPException, which is raised here.
        raise APIErrors.model_not_found(model_path)

    # If it's a zip file, extract it first into a unique temp subdirectory.
    if path.suffix == '.zip':
        extract_dir = TEMP_MODELS_DIR / f"extract_{uuid.uuid4().hex[:8]}"
        extract_dir.mkdir(exist_ok=True)

        try:
            with zipfile.ZipFile(path, 'r') as zipf:
                zipf.extractall(extract_dir)
        except zipfile.BadZipFile:
            shutil.rmtree(extract_dir, ignore_errors=True)
            logger.error(f"Invalid zip file: {model_path}")
            raise APIErrors.invalid_model()

        # Find the model.pkl file anywhere inside the extracted tree.
        pkl_files = list(extract_dir.rglob('model.pkl'))
        if not pkl_files:
            shutil.rmtree(extract_dir)
            logger.error(f"No model.pkl in zip: {model_path}")
            raise APIErrors.invalid_model()

        # From here on, operate on the extracted pickle instead of the zip.
        model_path = str(pkl_files[0])

    try:
        pipeline = CredilyPipeline.load(model_path)
        log_info(f"Model loaded successfully: {model_path}")
        return pipeline
    except Exception as e:
        logger.error(f"Failed to load model: {e}", exc_info=True)
        raise HTTPException(
            status_code=400,
            detail="Unable to load the model file. Please ensure it's a valid Credily model."
        )
177
+
178
+
179
def process_uploaded_model(file_content: bytes, filename: str) -> tuple[str, str]:
    """
    Process an uploaded model file and return (model_id, model_path).

    Supports .zip files containing model.pkl or direct .pkl files.
    The file is written under UPLOADED_MODELS_DIR/<model_id>/, validated by
    actually loading it, and registered in the in-memory `uploaded_models`
    map. On any failure the per-model directory is removed before raising.

    Raises:
        HTTPException 400 for bad zip / missing model.pkl / unsupported
        extension / unloadable model; 500 for unexpected failures.
    """
    model_id = f"model_{uuid.uuid4().hex[:12]}"
    model_dir = UPLOADED_MODELS_DIR / model_id
    model_dir.mkdir(exist_ok=True)

    try:
        if filename.endswith('.zip'):
            # Persist the zip, then extract it alongside itself.
            zip_path = model_dir / filename
            zip_path.write_bytes(file_content)

            try:
                with zipfile.ZipFile(zip_path, 'r') as zipf:
                    zipf.extractall(model_dir)
            except zipfile.BadZipFile:
                shutil.rmtree(model_dir, ignore_errors=True)
                raise HTTPException(
                    status_code=400,
                    detail="Invalid zip file. Please upload a valid model zip file."
                )

            # Find model.pkl anywhere in the extracted tree.
            pkl_files = list(model_dir.rglob('model.pkl'))
            if not pkl_files:
                shutil.rmtree(model_dir, ignore_errors=True)
                raise HTTPException(
                    status_code=400,
                    detail="No model.pkl found in the zip file. Please upload a valid Credily model."
                )

            model_path = str(pkl_files[0])

        elif filename.endswith('.pkl'):
            # Direct pkl file: store it under a canonical name.
            model_path = str(model_dir / 'model.pkl')
            Path(model_path).write_bytes(file_content)
        else:
            shutil.rmtree(model_dir, ignore_errors=True)
            raise HTTPException(
                status_code=400,
                detail="Unsupported file format. Please upload a .zip or .pkl file."
            )

        # Validate the model can actually be loaded; the loaded object
        # itself is discarded — this call is purely a sanity check.
        try:
            pipeline = CredilyPipeline.load(model_path)
            log_info(f"Uploaded model validated: {model_id}")
        except Exception as e:
            shutil.rmtree(model_dir, ignore_errors=True)
            logger.error(f"Invalid model file: {e}")
            raise HTTPException(
                status_code=400,
                detail="Invalid model file. Please upload a valid Credily model."
            )

        # Register the model so prediction endpoints can resolve model_id.
        uploaded_models[model_id] = model_path

        return model_id, model_path

    except HTTPException:
        raise
    except Exception as e:
        shutil.rmtree(model_dir, ignore_errors=True)
        logger.error(f"Failed to process uploaded model: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail="Failed to process the uploaded model file."
        )
253
+
254
+
255
+ # ============== Startup Event ==============
256
+
257
+ @app.on_event("startup")
258
+ async def startup_event():
259
+ """Initialize database on startup."""
260
+ init_db()
261
+
262
+
263
+ # ============== API Endpoints ==============
264
+
265
+ @app.get("/", response_model=HealthResponse)
266
+ async def health_check():
267
+ """Health check endpoint."""
268
+ db_info = get_db_info()
269
+ db_status = f"healthy ({db_info['type']})"
270
+ return HealthResponse(
271
+ status="healthy",
272
+ version=__version__,
273
+ database_status=db_status
274
+ )
275
+
276
+
277
+ @app.get("/api/health", response_model=HealthResponse)
278
+ async def api_health():
279
+ """API health check endpoint."""
280
+ return await health_check()
281
+
282
+
283
+ # ============== Training Endpoints ==============
284
+
285
+ @app.post("/api/train", response_model=TrainResponse)
286
+ async def train_model(request: TrainRequest):
287
+ """
288
+ Train a new credit scoring model.
289
+
290
+ - Accepts training data as JSON records
291
+ - Automatically profiles, cleans, and balances data
292
+ - Trains multiple models and selects the best performer
293
+ - Returns model as a downloadable zip file
294
+ - Saves training report to database
295
+ """
296
+ try:
297
+ # Convert request data to DataFrame
298
+ df = pd.DataFrame(request.data)
299
+
300
+ if df.empty:
301
+ raise APIErrors.no_data()
302
+
303
+ # Get config or use defaults
304
+ config = request.config or TrainConfig()
305
+
306
+ # Generate unique model name/ID
307
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
308
+ model_name = request.model_name or f"credily_model_{timestamp}"
309
+ model_name = model_name.replace(' ', '_')
310
+ report_id = f"report_{timestamp}_{uuid.uuid4().hex[:8]}"
311
+
312
+ # Create temporary output directory
313
+ temp_output_dir = TEMP_MODELS_DIR / f"train_{uuid.uuid4().hex[:8]}"
314
+
315
+ # Create pipeline with config
316
+ pipeline = CredilyPipeline(
317
+ target_column=config.target_column,
318
+ output_dir=str(temp_output_dir),
319
+ test_size=config.test_size,
320
+ cv_folds=config.cv_folds,
321
+ clean_data=config.clean_data,
322
+ clean_mode=config.clean_mode,
323
+ flag_missing=config.flag_missing,
324
+ balance_data=config.balance_data,
325
+ balance_method=config.balance_method,
326
+ calibrate=config.calibrate,
327
+ optimize_threshold=config.optimize_threshold,
328
+ conservative_mode=config.conservative_mode,
329
+ binary_threshold=config.binary_threshold,
330
+ positive_classes=config.positive_classes
331
+ )
332
+
333
+ # Profile and train
334
+ pipeline.profile(df)
335
+ results = pipeline.train(df)
336
+
337
+ # Convert numpy types to native Python types
338
+ results = convert_numpy_types(results)
339
+
340
+ # Save training report to database
341
+ save_training_report(report_id, model_name, results)
342
+
343
+ # Create zip file for download
344
+ zip_path = create_model_zip(temp_output_dir, model_name)
345
+
346
+ # Clean up temp training directory
347
+ shutil.rmtree(temp_output_dir)
348
+
349
+ # Prepare test predictions data for visualization
350
+ test_preds = results.get("test_predictions")
351
+ test_predictions_data = None
352
+ if test_preds:
353
+ test_predictions_data = {
354
+ "y_true": test_preds["y_true"],
355
+ "y_pred": test_preds["y_pred"],
356
+ "y_proba": test_preds["y_proba"],
357
+ "n_samples": test_preds["n_samples"]
358
+ }
359
+
360
+ # Prepare ROC curve data
361
+ roc_data = results.get("roc_curve")
362
+ roc_curve_data = None
363
+ if roc_data:
364
+ roc_curve_data = {
365
+ "x": roc_data["fpr"],
366
+ "y": roc_data["tpr"]
367
+ }
368
+
369
+ # Prepare PR curve data
370
+ pr_data = results.get("pr_curve")
371
+ pr_curve_data = None
372
+ if pr_data:
373
+ pr_curve_data = {
374
+ "x": pr_data["recall"],
375
+ "y": pr_data["precision"]
376
+ }
377
+
378
+ # Prepare safety report data
379
+ safety_report_data = None
380
+ model_valid = True # Default to valid if no safety report
381
+ raw_safety = results.get("safety_report")
382
+ if raw_safety:
383
+ safety_report_data = SafetyReportSchema(
384
+ status=raw_safety.get("status", "PASS"),
385
+ model_valid=raw_safety.get("model_valid", True),
386
+ dropped_features=raw_safety.get("dropped_features", {}),
387
+ warnings=raw_safety.get("warnings", []),
388
+ errors=raw_safety.get("errors", []),
389
+ leakage_detected=raw_safety.get("leakage_detected", {}),
390
+ redundant_features=raw_safety.get("redundant_features", []),
391
+ feature_dominance=raw_safety.get("feature_dominance", {}),
392
+ overfitting_metrics=raw_safety.get("overfitting_metrics", {})
393
+ )
394
+ model_valid = raw_safety.get("model_valid", True)
395
+
396
+ # Convert model_test_metrics to serializable format
397
+ model_test_metrics = results.get("model_test_metrics")
398
+ if model_test_metrics:
399
+ model_test_metrics = {
400
+ name: {
401
+ "pr_auc": float(m["pr_auc"]),
402
+ "roc_auc": float(m["roc_auc"]),
403
+ "default_recall": float(m["default_recall"]),
404
+ "fp_count": int(m["fp_count"]),
405
+ "threshold": float(m["threshold"]),
406
+ "cv_score": float(m["cv_score"])
407
+ }
408
+ for name, m in model_test_metrics.items()
409
+ }
410
+
411
+ return TrainResponse(
412
+ success=True,
413
+ report_id=report_id,
414
+ model_name=model_name,
415
+ best_model=results["best_model"],
416
+ best_score=float(results["best_score"]),
417
+ test_auc=float(results["test_auc"]),
418
+ test_pr_auc=float(results["test_pr_auc"]),
419
+ optimal_threshold=float(results["optimal_threshold"]),
420
+ model_scores={k: float(v) for k, v in results["model_scores"].items()},
421
+ model_test_metrics=model_test_metrics,
422
+ model_ranking=results.get("model_ranking"),
423
+ classification_report=results["classification_report"],
424
+ confusion_matrix=results["confusion_matrix"],
425
+ feature_importances={k: float(v) for k, v in results["feature_importances"].items()},
426
+ download_url=f"/api/train/download/{model_name}",
427
+ message=f"Model trained successfully. Best model: {results['best_model']} with ROC-AUC: {float(results['test_auc']):.4f}. Download your model using the download URL.",
428
+ model_valid=model_valid,
429
+ safety_report=safety_report_data,
430
+ test_predictions=test_predictions_data,
431
+ roc_curve=roc_curve_data,
432
+ pr_curve=pr_curve_data,
433
+ sanity_warnings=results.get("sanity_warnings", [])
434
+ )
435
+
436
+ except HTTPException:
437
+ raise
438
+ except ValueError as e:
439
+ logger.error(f"Training validation error: {e}")
440
+ raise handle_api_error(e, "model training")
441
+ except Exception as e:
442
+ logger.error(f"Training failed: {e}", exc_info=True)
443
+ raise handle_api_error(e, "model training")
444
+
445
+
446
@app.get("/api/train/download/{model_name}")
async def download_model(model_name: str):
    """
    Download a trained model zip file.

    The zip is streamed to the client and deleted from the temp directory
    afterwards, so each trained model can be downloaded only once.
    The model should be extracted and the model.pkl path used for predictions.

    Raises:
        HTTPException: 404 if the zip no longer exists (expired or already
            downloaded).
    """
    zip_path = TEMP_MODELS_DIR / f"{model_name}.zip"

    if not zip_path.exists():
        raise HTTPException(
            status_code=404,
            detail=f"Model '{model_name}' not found. It may have expired or been downloaded already."
        )

    def iterfile():
        # Stream the file in chunks rather than loading it into memory.
        with open(zip_path, 'rb') as f:
            yield from f
        # Best-effort cleanup after a complete download. Only suppress
        # filesystem errors — a bare `except:` would also hide
        # KeyboardInterrupt/SystemExit and real bugs.
        try:
            zip_path.unlink()
        except OSError:
            pass

    return StreamingResponse(
        iterfile(),
        media_type="application/zip",
        headers={
            "Content-Disposition": f"attachment; filename={model_name}.zip"
        }
    )
477
+
478
+
479
+ # ============== Model Upload Endpoints ==============
480
+
481
@app.post("/api/models/upload")
async def upload_model(model_file: UploadFile = File(...)):
    """
    Upload a trained model file for predictions.

    Accepts .zip files (produced by model training) or raw .pkl files,
    stores the model temporarily on the server, and returns a model_id
    that can be used for prediction requests.
    """
    filename = model_file.filename
    if not filename:
        raise HTTPException(status_code=400, detail="No file provided")

    # Only the two serialization formats produced by training are accepted.
    if not filename.endswith(('.zip', '.pkl')):
        raise HTTPException(
            status_code=400,
            detail="Unsupported file format. Please upload a .zip or .pkl file."
        )

    raw_bytes = await model_file.read()

    # Validation, extraction and registration happen inside the helper.
    model_id, model_path = process_uploaded_model(raw_bytes, filename)

    return {
        "success": True,
        "model_id": model_id,
        "filename": filename,
        "message": "Model uploaded successfully. Use the model_id for predictions."
    }
512
+
513
+
514
@app.get("/api/models/{model_id}")
async def get_uploaded_model_info(model_id: str):
    """Return summary information about a previously uploaded model."""
    if model_id not in uploaded_models:
        raise HTTPException(status_code=404, detail=f"Model '{model_id}' not found")

    model_path = uploaded_models[model_id]

    try:
        pipeline = CredilyPipeline.load(model_path)
        # Preview only the first 10 feature names to keep the payload small.
        feature_preview = (
            pipeline.feature_columns[:10]
            if hasattr(pipeline, 'feature_columns')
            else []
        )
        return {
            "success": True,
            "model_id": model_id,
            "best_model": pipeline.best_model_name,
            "optimal_threshold": pipeline.optimal_threshold,
            "features": feature_preview
        }
    except Exception as e:
        logger.error(f"Failed to load model info: {e}")
        raise HTTPException(status_code=500, detail="Failed to load model information")
534
+
535
+
536
@app.delete("/api/models/{model_id}")
async def delete_uploaded_model(model_id: str):
    """Remove an uploaded model's files from disk and deregister it."""
    if model_id not in uploaded_models:
        raise HTTPException(status_code=404, detail=f"Model '{model_id}' not found")

    model_path = uploaded_models[model_id]
    target_dir = UPLOADED_MODELS_DIR / model_id

    try:
        # Remove the extracted model directory first, then the registry entry.
        if target_dir.exists():
            shutil.rmtree(target_dir)
        uploaded_models.pop(model_id)
        return {"success": True, "message": f"Model '{model_id}' deleted"}
    except Exception as e:
        logger.error(f"Failed to delete model: {e}")
        raise HTTPException(status_code=500, detail="Failed to delete model")
553
+
554
+
555
+ # ============== Prediction Endpoints ==============
556
+
557
@app.post("/api/predict/with-upload", response_model=PredictResponse)
async def predict_with_upload(
    data_file: UploadFile = File(...),
    model_file: UploadFile = File(...),
    threshold: Optional[float] = Form(None),
    save_results: bool = Form(True)
):
    """
    Make predictions by uploading both model and data files.

    - model_file: .zip or .pkl model file
    - data_file: .csv file with data to predict
    - threshold: optional custom classification threshold (0.0-1.0)
    - save_results: whether to save the session and results to the database

    Raises:
        HTTPException: 400 on bad file types, plus errors translated by
            handle_api_error for prediction failures.
    """
    # Validate file types before reading any content.
    if not model_file.filename or not (model_file.filename.endswith('.zip') or model_file.filename.endswith('.pkl')):
        raise HTTPException(status_code=400, detail="Model must be a .zip or .pkl file")

    if not data_file.filename or not data_file.filename.endswith('.csv'):
        raise HTTPException(status_code=400, detail="Data must be a .csv file")

    try:
        # Persist/validate the uploaded model; returns an id and a loadable path.
        model_content = await model_file.read()
        model_id, model_path = process_uploaded_model(model_content, model_file.filename)

        # Read and parse CSV data
        data_content = await data_file.read()
        df = pd.read_csv(io.BytesIO(data_content))

        if df.empty:
            raise APIErrors.no_data()

        # Load model and make predictions
        pipeline = CredilyPipeline.load(model_path)
        result_df = pipeline.predict(df, include_proba=True, threshold=threshold)

        # Fall back to the model's stored optimal threshold only when the
        # caller did not supply one. (`threshold or ...` would wrongly treat
        # an explicit 0.0 as "not provided".)
        actual_threshold = threshold if threshold is not None else pipeline.optimal_threshold

        # Build per-row response objects.
        predictions = []
        for idx, row in result_df.iterrows():
            prob = row.get("proba_1", None)
            pred_result = PredictionResult(
                index=int(idx),
                prediction=int(row["prediction"]),
                probability=float(prob) if prob is not None else None,
                risk_level=classify_risk(prob, actual_threshold) if prob is not None else None
            )
            predictions.append(pred_result)

        # Aggregate summary statistics over the whole batch.
        pred_series = result_df["prediction"]
        total_records = len(predictions)
        predicted_positive = int(pred_series.sum())
        predicted_negative = int((pred_series == 0).sum())
        positive_rate = float(pred_series.mean())

        summary = {
            "total_records": total_records,
            "predicted_positive": predicted_positive,
            "predicted_negative": predicted_negative,
            "positive_rate": positive_rate,
        }

        avg_probability = None
        if "proba_1" in result_df.columns:
            avg_probability = float(result_df["proba_1"].mean())
            summary["avg_probability"] = avg_probability
            summary["risk_distribution"] = {
                "very_low": sum(1 for p in predictions if p.risk_level == "very_low"),
                "low": sum(1 for p in predictions if p.risk_level == "low"),
                "medium": sum(1 for p in predictions if p.risk_level == "medium"),
                "high": sum(1 for p in predictions if p.risk_level == "high"),
            }

        # Save to database if requested
        session_id = None
        if save_results:
            session_id = f"pred_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}"
            save_prediction_session(
                session_id=session_id,
                model_path=f"uploaded:{model_id}",
                model_name=pipeline.best_model_name,
                threshold_used=actual_threshold,
                total_records=total_records,
                predicted_positive=predicted_positive,
                predicted_negative=predicted_negative,
                positive_rate=positive_rate,
                avg_probability=avg_probability
            )
            # Save individual results, keeping the original input row alongside
            # each prediction for later export.
            results_to_save = []
            for i, pred in enumerate(predictions):
                results_to_save.append({
                    'index': pred.index,
                    'prediction': pred.prediction,
                    'probability': pred.probability,
                    'risk_level': pred.risk_level,
                    'input_data': df.iloc[i].to_dict() if i < len(df) else {}
                })
            save_prediction_results(session_id, results_to_save)

        return PredictResponse(
            success=True,
            session_id=session_id,
            model_path=f"uploaded:{model_id}",
            threshold_used=actual_threshold,
            predictions=predictions,
            summary=summary
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Prediction with upload failed: {e}", exc_info=True)
        raise handle_api_error(e, "prediction")
675
+
676
+
677
@app.post("/api/predict", response_model=PredictResponse)
async def predict(request: PredictRequest):
    """
    Make credit risk predictions on new data.

    - Requires the absolute path to a trained model file (.pkl or .zip)
    - Returns predictions with probability scores
    - Optionally saves results to database for history tracking

    Raises:
        HTTPException: translated by handle_api_error on failure.
    """
    try:
        # Load model from path
        pipeline = load_model_from_path(request.model_path)

        # Convert data to DataFrame
        df = pd.DataFrame(request.data)
        if df.empty:
            raise APIErrors.no_data()

        # Make predictions
        result_df = pipeline.predict(
            df,
            include_proba=request.include_proba,
            threshold=request.threshold
        )

        # Fall back to the model's stored optimal threshold only when the
        # caller did not supply one. (`request.threshold or ...` would
        # wrongly treat an explicit 0.0 as "not provided".)
        threshold = request.threshold if request.threshold is not None else pipeline.optimal_threshold

        # Build per-row response objects.
        predictions = []
        for idx, row in result_df.iterrows():
            prob = row.get("proba_1", None)
            pred_result = PredictionResult(
                index=int(idx),
                prediction=int(row["prediction"]),
                probability=float(prob) if prob is not None else None,
                risk_level=classify_risk(prob, threshold) if prob is not None else None
            )
            predictions.append(pred_result)

        # Summary statistics
        pred_series = result_df["prediction"]
        total_records = len(predictions)
        predicted_positive = int(pred_series.sum())
        predicted_negative = int((pred_series == 0).sum())
        positive_rate = float(pred_series.mean())

        summary = {
            "total_records": total_records,
            "predicted_positive": predicted_positive,
            "predicted_negative": predicted_negative,
            "positive_rate": positive_rate,
        }

        avg_probability = None
        if request.include_proba and "proba_1" in result_df.columns:
            avg_probability = float(result_df["proba_1"].mean())
            summary["avg_probability"] = avg_probability
            summary["risk_distribution"] = {
                "very_low": sum(1 for p in predictions if p.risk_level == "very_low"),
                "low": sum(1 for p in predictions if p.risk_level == "low"),
                "medium": sum(1 for p in predictions if p.risk_level == "medium"),
                "high": sum(1 for p in predictions if p.risk_level == "high"),
            }

        # Save to database if requested
        session_id = None
        if request.save_results:
            session_id = f"pred_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}"

            # Save session
            save_prediction_session(
                session_id=session_id,
                model_path=request.model_path,
                model_name=pipeline.best_model_name,
                threshold_used=threshold,
                total_records=total_records,
                predicted_positive=predicted_positive,
                predicted_negative=predicted_negative,
                positive_rate=positive_rate,
                avg_probability=avg_probability
            )

            # Save individual results with input data
            results_to_save = []
            for i, (pred, input_row) in enumerate(zip(predictions, request.data)):
                results_to_save.append({
                    'index': pred.index,
                    'prediction': pred.prediction,
                    'probability': pred.probability,
                    'risk_level': pred.risk_level,
                    'input_data': input_row
                })
            save_prediction_results(session_id, results_to_save)

        return PredictResponse(
            success=True,
            session_id=session_id,
            model_path=request.model_path,
            threshold_used=threshold,
            predictions=predictions,
            summary=summary
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Prediction failed: {e}", exc_info=True)
        raise handle_api_error(e, "prediction")
785
+
786
+
787
@app.post("/api/predict/single")
async def predict_single(request: SinglePredictRequest):
    """
    Make a single prediction (convenience endpoint).

    Wraps the record in a one-element batch request, delegates to the
    batch /api/predict handler, and unwraps the first result.
    """
    batch_request = PredictRequest(
        data=[request.data],
        model_path=request.model_path,
        include_proba=True,
        threshold=request.threshold,
        save_results=False  # single predictions are not persisted by default
    )
    batch_response = await predict(batch_request)

    if not batch_response.predictions:
        raise APIErrors.prediction_failed()

    first = batch_response.predictions[0]
    return {
        "prediction": first.prediction,
        "probability": first.probability,
        "risk_level": first.risk_level,
        "threshold": batch_response.threshold_used
    }
813
+
814
+
815
+ # ============== Profiling Endpoints ==============
816
+
817
@app.post("/api/profile", response_model=ProfileResponse)
async def profile_data(request: ProfileRequest):
    """
    Profile a dataset to understand its structure and quality.

    - Analyzes column types and distributions
    - Identifies missing values and anomalies
    - Provides recommendations for preprocessing

    Raises:
        HTTPException: via APIErrors.no_data for empty input, or via
            handle_api_error for unexpected failures.
    """
    try:
        df = pd.DataFrame(request.data)
        if df.empty:
            raise APIErrors.no_data()

        # Create profiler (its detailed output is computed but the response
        # below is built from the raw DataFrame).
        profiler = DataProfiler(target_column=request.target_column)
        profile = profiler.profile(df)

        # Build per-column profiles: dtype, null stats, cardinality, samples.
        columns = []
        for col in df.columns:
            col_data = df[col]
            columns.append(ColumnProfile(
                name=col,
                dtype=str(col_data.dtype),
                non_null_count=int(col_data.notna().sum()),
                null_count=int(col_data.isna().sum()),
                null_percentage=float(col_data.isna().mean() * 100),
                unique_count=int(col_data.nunique()),
                sample_values=col_data.dropna().head(5).tolist()
            ))

        # Numeric summary (describe() on numeric columns only).
        # NOTE(review): describe() can contain NaN (e.g. std of one row) —
        # confirm JSON serialization handles that upstream.
        numeric_cols = df.select_dtypes(include=["number"])
        numeric_summary = None
        if not numeric_cols.empty:
            numeric_summary = numeric_cols.describe().to_dict()

        # Target analysis: class counts, balance of the rarest class, binarity.
        # NOTE(review): `target.unique()` includes NaN, so a binary column
        # with missing values reports is_binary=False — confirm intended.
        target_analysis = None
        if request.target_column and request.target_column in df.columns:
            target = df[request.target_column]
            target_analysis = {
                "class_distribution": target.value_counts().to_dict(),
                "class_balance": float(target.value_counts(normalize=True).min()),
                "is_binary": len(target.unique()) == 2
            }

        # Generate heuristic preprocessing recommendations.
        recommendations = []
        for col_profile in columns:
            # Heavy missingness: suggest dropping or imputing.
            if col_profile.null_percentage > 30:
                recommendations.append(f"Column '{col_profile.name}' has {col_profile.null_percentage:.1f}% missing values - consider removing or imputing")
            # Constant column: no predictive value.
            if col_profile.unique_count == 1:
                recommendations.append(f"Column '{col_profile.name}' has only one unique value - consider removing (no predictive value)")
            # All-unique string column: almost certainly an identifier.
            if col_profile.unique_count == len(df) and col_profile.dtype == "object":
                recommendations.append(f"Column '{col_profile.name}' appears to be an ID column - consider removing")

        # Minority class under 20% triggers a balancing recommendation.
        if target_analysis and target_analysis["class_balance"] < 0.2:
            recommendations.append("Target class is highly imbalanced - SMOTE or other balancing techniques recommended")

        return ProfileResponse(
            success=True,
            n_rows=len(df),
            n_columns=len(df.columns),
            columns=columns,
            numeric_summary=numeric_summary,
            target_analysis=target_analysis,
            recommendations=recommendations
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Profiling failed: {e}", exc_info=True)
        raise handle_api_error(e, "data profiling")
893
+
894
+
895
+ # ============== Prediction History Endpoints ==============
896
+
897
@app.get("/api/predictions", response_model=PredictionHistoryResponse)
async def list_prediction_history(
    limit: int = Query(50, ge=1, le=100),
    offset: int = Query(0, ge=0)
):
    """List prediction history sessions (paginated via limit/offset)."""
    rows = get_prediction_sessions(limit=limit, offset=offset)
    session_infos = [PredictionSessionInfo(**row) for row in rows]
    return PredictionHistoryResponse(
        success=True,
        sessions=session_infos,
        total=len(session_infos)
    )
909
+
910
+
911
@app.get("/api/predictions/{session_id}", response_model=PredictionSessionDetailResponse)
async def get_prediction_detail(session_id: str):
    """Return a prediction session together with all of its stored results."""
    session = get_prediction_session(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Prediction session '{session_id}' not found")

    return PredictionSessionDetailResponse(
        success=True,
        session=PredictionSessionInfo(**session),
        results=get_prediction_results(session_id)
    )
925
+
926
+
927
@app.delete("/api/predictions/{session_id}")
async def delete_prediction(session_id: str):
    """Delete a prediction session and its results."""
    deleted = delete_prediction_session(session_id)
    if not deleted:
        raise HTTPException(status_code=404, detail=f"Prediction session '{session_id}' not found")
    return {"success": True, "message": f"Prediction session '{session_id}' deleted"}
933
+
934
+
935
@app.get("/api/predictions/{session_id}/export")
async def export_prediction_results(session_id: str, format: str = Query("csv", enum=["csv", "json"])):
    """Export a session's prediction results as a CSV download or JSON body."""
    session = get_prediction_session(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Prediction session '{session_id}' not found")

    results = get_prediction_results(session_id)

    # JSON export is just the raw session + results.
    if format != "csv":
        return {
            "session": session,
            "results": results
        }

    # CSV export: flatten the stored input_data dicts into ordinary columns.
    df = pd.DataFrame(results)
    if 'input_data' in df.columns:
        expanded = pd.DataFrame(df['input_data'].tolist())
        df = pd.concat([df.drop('input_data', axis=1), expanded], axis=1)

    buffer = io.StringIO()
    df.to_csv(buffer, index=False)
    buffer.seek(0)

    return StreamingResponse(
        iter([buffer.getvalue()]),
        media_type="text/csv",
        headers={"Content-Disposition": f"attachment; filename=predictions_{session_id}.csv"}
    )
966
+
967
+
968
+ # ============== Training Reports Endpoints ==============
969
+
970
@app.get("/api/reports", response_model=TrainingReportsResponse)
async def list_training_reports(
    limit: int = Query(50, ge=1, le=100),
    offset: int = Query(0, ge=0)
):
    """List training report history (paginated via limit/offset)."""
    rows = get_training_reports(limit=limit, offset=offset)
    report_infos = [TrainingReportInfo(**row) for row in rows]
    return TrainingReportsResponse(
        success=True,
        reports=report_infos,
        total=len(report_infos)
    )
982
+
983
+
984
@app.get("/api/reports/{report_id}", response_model=TrainingReportDetailResponse)
async def get_report_detail(report_id: str):
    """Return the full stored training report."""
    report = get_training_report(report_id)
    if not report:
        raise HTTPException(status_code=404, detail=f"Training report '{report_id}' not found")

    return TrainingReportDetailResponse(success=True, report=report)
995
+
996
+
997
@app.delete("/api/reports/{report_id}")
async def delete_report(report_id: str):
    """Delete a training report."""
    deleted = delete_training_report(report_id)
    if not deleted:
        raise HTTPException(status_code=404, detail=f"Training report '{report_id}' not found")
    return {"success": True, "message": f"Training report '{report_id}' deleted"}
1003
+
1004
+
1005
+ # ============== File Upload Endpoint ==============
1006
+
1007
@app.post("/api/upload")
async def upload_csv(file_content: str, filename: str):
    """
    Upload CSV data as base64 encoded string.

    Decodes and parses the CSV, then returns the data as JSON records
    for further processing.

    Args:
        file_content: Base64-encoded CSV bytes.
        filename: Original file name, echoed back in the response.

    Raises:
        HTTPException: 400 if the content is not valid base64/CSV.
    """
    import base64

    try:
        # Decode base64 content
        decoded = base64.b64decode(file_content)

        # Read as CSV
        df = pd.read_csv(io.BytesIO(decoded))

        # NaN/NaT are not valid JSON and break response serialization, so
        # replace every missing value with None before building the records.
        records = [
            {key: (None if pd.isna(value) else value) for key, value in row.items()}
            for row in df.to_dict(orient="records")
        ]

        return {
            "success": True,
            "filename": filename,
            "rows": len(df),
            "columns": list(df.columns),
            "data": records
        }

    except Exception as e:
        logger.error(f"CSV parsing failed: {e}", exc_info=True)
        raise HTTPException(
            status_code=400,
            detail="Unable to parse the CSV file. Please ensure it's a valid CSV format."
        )
credily/api/schemas.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pydantic schemas for API request/response validation.
3
+ """
4
+
5
+ from typing import Dict, List, Any, Optional
6
+ from pydantic import BaseModel, Field
7
+
8
+
9
+ # ============== Training Schemas ==============
10
+
11
class TrainConfig(BaseModel):
    """Configuration for model training (all fields optional with defaults)."""
    target_column: str = Field(default="target", description="Name of the target column")
    test_size: float = Field(default=0.2, ge=0.1, le=0.5, description="Test set proportion")
    cv_folds: int = Field(default=5, ge=2, le=10, description="Cross-validation folds")
    clean_data: bool = Field(default=True, description="Whether to clean data before training")
    clean_mode: str = Field(default="thorough", description="Cleaning mode: basic, thorough, aggressive")
    flag_missing: bool = Field(default=True, description="Create missing value indicator columns (set False if *_missing features dominate)")
    balance_data: bool = Field(default=True, description="Whether to balance imbalanced classes")
    balance_method: str = Field(default="smote", description="Balancing method: smote, random_oversample, random_undersample, smote_tomek, tomek, nearmiss, none")
    calibrate: bool = Field(default=True, description="Whether to calibrate probabilities")
    optimize_threshold: bool = Field(default=True, description="Whether to optimize classification threshold")
    conservative_mode: str = Field(default="auto", description="Regularization mode: auto (detect small datasets), always, never")
    # Agnostic pipeline options: control how an arbitrary target column is
    # mapped onto a classification problem.
    task_type: str = Field(default="binary", description="Classification type: 'binary' for binary classification, 'multiclass' for multi-class classification")
    binary_threshold: Optional[float] = Field(default=None, description="Threshold to convert numeric target to binary (values BELOW threshold = positive class). Example: 600 for credit score means score < 600 = default")
    positive_classes: Optional[List[str]] = Field(default=None, description="List of class labels to treat as positive (1) for binary grouping. All other classes become negative (0). Example: ['Poor', 'Standard'] → these become 1, 'Good' becomes 0")
28
+
29
+
30
class TrainRequest(BaseModel):
    """Request body for training a model via /api/train."""
    data: List[Dict[str, Any]] = Field(..., description="Training data as list of records")
    config: Optional[TrainConfig] = Field(default=None, description="Training configuration")
    model_name: Optional[str] = Field(default=None, description="Custom name for the model")
35
+
36
+
37
class TestPredictions(BaseModel):
    """Test set predictions returned for client-side visualization."""
    y_true: List[int]  # ground-truth labels
    y_pred: List[int]  # thresholded predictions
    y_proba: List[float]  # predicted probabilities for the positive class
    n_samples: int  # number of test rows
43
+
44
+
45
class CurveData(BaseModel):
    """Data points for ROC or PR curves (parallel x/y coordinate lists)."""
    x: List[float]  # FPR for ROC, Recall for PR
    y: List[float]  # TPR for ROC, Precision for PR
49
+
50
+
51
class SafetyReportSchema(BaseModel):
    """Safety validation report produced during training."""
    # NOTE: pydantic deep-copies these mutable defaults per instance,
    # so the usual shared-mutable-default pitfall does not apply here.
    status: str  # PASS, WARN, FAIL
    model_valid: bool  # False when safety checks should block model use
    dropped_features: Dict[str, str] = {}  # feature: reason it was dropped
    warnings: List[str] = []
    errors: List[str] = []
    leakage_detected: Dict[str, float] = {}  # feature: correlation with target
    redundant_features: List[Dict[str, Any]] = []
    feature_dominance: Dict[str, float] = {}  # feature: importance share
    overfitting_metrics: Dict[str, Any] = {}
62
+
63
+
64
class ModelTestMetrics(BaseModel):
    """Test set metrics for a single candidate model."""
    pr_auc: float  # average precision on the test set
    roc_auc: float
    default_recall: float  # recall on the positive (default) class
    fp_count: int  # false positives at the chosen threshold
    threshold: float  # classification threshold used for these metrics
    cv_score: float  # cross-validation score from model selection
72
+
73
+
74
class TrainResponse(BaseModel):
    """Response from model training (/api/train)."""
    success: bool
    report_id: str  # id of the persisted training report
    model_name: str
    best_model: str  # name of the winning model
    best_score: float
    test_auc: float
    test_pr_auc: float
    optimal_threshold: float
    model_scores: Dict[str, float]  # CV score per candidate model
    model_test_metrics: Optional[Dict[str, ModelTestMetrics]] = None  # Full test metrics for each model
    model_ranking: Optional[List[str]] = None  # Ordered list of model names by rank
    classification_report: Dict[str, Any]
    confusion_matrix: List[List[int]]
    feature_importances: Dict[str, float]
    message: str
    download_url: str  # URL to download the model zip file
    # Safety validation
    model_valid: bool = True  # Whether model passes all safety checks
    safety_report: Optional[SafetyReportSchema] = None
    # Model performance visualization data
    test_predictions: Optional[TestPredictions] = None
    roc_curve: Optional[CurveData] = None
    pr_curve: Optional[CurveData] = None
    sanity_warnings: Optional[List[str]] = None
100
+
101
+
102
+ # ============== Prediction Schemas ==============
103
+
104
class PredictRequest(BaseModel):
    """Request body for batch predictions (/api/predict)."""
    data: List[Dict[str, Any]] = Field(..., description="Data to predict on as list of records")
    model_path: str = Field(..., description="Absolute path to the trained model file (.pkl)")
    include_proba: bool = Field(default=True, description="Whether to include probability scores")
    threshold: Optional[float] = Field(default=None, ge=0.0, le=1.0, description="Custom threshold")
    save_results: bool = Field(default=True, description="Whether to save results to database")
111
+
112
+
113
class PredictionResult(BaseModel):
    """Single prediction result for one input row."""
    index: int  # row index from the input data
    prediction: int  # thresholded class label
    probability: Optional[float] = None  # positive-class probability, if requested
    risk_level: Optional[str] = None  # very_low / low / medium / high bucket
119
+
120
+
121
class PredictResponse(BaseModel):
    """Response from a batch prediction request."""
    success: bool
    session_id: Optional[str] = None  # set only when results were persisted
    model_path: str
    threshold_used: float
    predictions: List[PredictionResult]
    summary: Dict[str, Any]  # aggregate counts, positive rate, risk distribution
129
+
130
+
131
class SinglePredictRequest(BaseModel):
    """Request for a single-record prediction (/api/predict/single)."""
    data: Dict[str, Any] = Field(..., description="Single record to predict")
    model_path: str = Field(..., description="Absolute path to the trained model file (.pkl)")
    threshold: Optional[float] = Field(default=None, ge=0.0, le=1.0, description="Custom threshold")
136
+
137
+
138
+ # ============== Profiling Schemas ==============
139
+
140
class ProfileRequest(BaseModel):
    """Request body for data profiling (/api/profile)."""
    data: List[Dict[str, Any]] = Field(..., description="Data to profile as list of records")
    target_column: Optional[str] = Field(default=None, description="Target column name if known")
144
+
145
+
146
class ColumnProfile(BaseModel):
    """Profile of a single column in the uploaded dataset."""
    name: str
    dtype: str  # pandas dtype as a string
    non_null_count: int
    null_count: int
    null_percentage: float  # 0-100 scale
    unique_count: int
    sample_values: List[Any]  # up to 5 non-null example values
155
+
156
+
157
class ProfileResponse(BaseModel):
    """Response from data profiling."""
    success: bool
    n_rows: int
    n_columns: int
    columns: List[ColumnProfile]
    numeric_summary: Optional[Dict[str, Any]] = None  # describe() of numeric columns
    target_analysis: Optional[Dict[str, Any]] = None  # class distribution/balance
    recommendations: List[str]  # heuristic preprocessing suggestions
166
+
167
+
168
+ # ============== History Schemas ==============
169
+
170
class PredictionSessionInfo(BaseModel):
    """Summary info about a persisted prediction session."""
    id: str
    model_path: str  # filesystem path or "uploaded:<model_id>"
    model_name: Optional[str]
    threshold_used: float
    total_records: int
    predicted_positive: int
    predicted_negative: int
    positive_rate: float
    avg_probability: Optional[float]  # None when probabilities were not computed
    created_at: str  # timestamp string as stored in the database
182
+
183
+
184
class PredictionHistoryResponse(BaseModel):
    """Response listing prediction history sessions."""
    success: bool
    sessions: List[PredictionSessionInfo]
    total: int  # number of sessions in this page
189
+
190
+
191
class PredictionSessionDetailResponse(BaseModel):
    """Detailed response for one prediction session, including all results."""
    success: bool
    session: PredictionSessionInfo
    results: List[Dict[str, Any]]  # per-row predictions with stored input data
196
+
197
+
198
class TrainingReportInfo(BaseModel):
    """Summary info about a persisted training report."""
    id: str
    model_name: Optional[str]
    best_model: str
    best_score: float
    test_auc: float
    test_pr_auc: float
    optimal_threshold: float
    created_at: str  # timestamp string as stored in the database
208
+
209
+
210
class TrainingReportsResponse(BaseModel):
    """Response listing training reports."""
    success: bool
    reports: List[TrainingReportInfo]
    total: int  # number of reports in this page
215
+
216
+
217
class TrainingReportDetailResponse(BaseModel):
    """Detailed response for a training report."""
    success: bool
    report: Dict[str, Any]  # full stored report payload
221
+
222
+
223
+ # ============== Health Check Schemas ==============
224
+
225
class HealthResponse(BaseModel):
    """Health check response."""
    status: str  # overall service status
    version: str  # API/package version string
    database_status: str  # connectivity status of the backing database
credily/automl.py ADDED
@@ -0,0 +1,1073 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AutoML pipeline module for Credily.
3
+ Trains and compares multiple models with cross-validation.
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import json
9
+ import warnings
10
+ import joblib
11
+ import pandas as pd
12
+ import numpy as np
13
+
14
# Suppress joblib resource tracker warnings on Windows.
# Why: joblib's loky backend emits noisy resource_tracker / "Cannot register"
# warnings on win32; forcing the plain pickle serializer and filtering the
# messages keeps CLI output readable. Must run before sklearn/joblib spawn
# any worker processes.
if sys.platform == 'win32':
    os.environ.setdefault('LOKY_PICKLER', 'pickle')
    warnings.filterwarnings('ignore', message='.*resource_tracker.*')
    warnings.filterwarnings('ignore', message='.*Cannot register.*')
+ from pathlib import Path
20
+ from typing import Optional, Dict, Any, List
21
+
22
+ from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
23
+ from sklearn.pipeline import Pipeline
24
+ from sklearn.compose import ColumnTransformer
25
+ from sklearn.impute import SimpleImputer
26
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
27
+ from sklearn.linear_model import LogisticRegression
28
+ from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
29
+ from sklearn.metrics import (
30
+ classification_report, roc_auc_score, precision_recall_curve,
31
+ confusion_matrix, f1_score, average_precision_score
32
+ )
33
+ from sklearn.calibration import CalibratedClassifierCV
34
+
35
+ from .profiler import DataProfiler
36
+ from .reporting import ReportGenerator
37
+ from .cleaning import DataCleaner
38
+ from .balancing import DataBalancer
39
+ from .agnostic_pipeline import AgnosticPipeline
40
+ from .safety import SafetyValidator, SafetyConfig, check_perfect_score_warning
41
+
42
+
43
# ============== Sanity Check Thresholds ==============
LEAKAGE_CORRELATION_THRESHOLD = 0.95  # Features with >95% correlation to target are suspicious
CV_TEST_DROP_THRESHOLD = 0.05         # Warn if test score drops >5% from CV score
FEATURE_DOMINANCE_THRESHOLD = 0.50    # Warn if a single feature has >50% importance
MIN_MINORITY_SAMPLES = 50             # Minimum minority class samples for reliable PR-AUC

# ============== Small Dataset Thresholds ==============
SMALL_DATASET_THRESHOLD = 10000       # Datasets below this trigger conservative mode
VERY_SMALL_DATASET_THRESHOLD = 5000   # Datasets below this use extra regularization

# ============== Model Selection Thresholds ==============
RANDOM_FOREST_MIN_SAMPLES = 20000     # Only train RF for datasets > 20k rows
RF_PRAUC_TOLERANCE = 0.02             # Discard RF if its score trails GB by this margin
+
57
+
58
+ class CredilyPipeline:
59
+ """
60
+ AutoML pipeline for binary classification tasks.
61
+ """
62
+
63
    def __init__(
        self,
        target_column: str = 'target',
        output_dir: str = 'credily_output',
        test_size: float = 0.2,
        cv_folds: int = 5,
        random_state: int = 42,
        # Cleaning options
        clean_data: bool = True,
        clean_mode: str = 'thorough',
        outlier_method: str = 'iqr',
        flag_missing: bool = True,  # Create missing value indicator columns
        # Balancing options
        balance_data: bool = True,
        balance_method: str = 'smote',
        # Parallel processing (default=1 to avoid Windows joblib issues)
        n_jobs: int = 1,
        # Advanced options
        calibrate: bool = True,
        calibration_method: str = 'isotonic',
        optimize_threshold: bool = True,
        threshold_metric: str = 'f1',
        # Regularization options (auto-tuned for small datasets)
        conservative_mode: str = 'auto',  # 'auto', 'always', 'never'
        # Agnostic pipeline options (for dynamic target handling)
        binary_threshold: Optional[float] = None,  # Threshold to convert numeric target to binary
        binary_rule: Optional[callable] = None,  # Custom function for target conversion
        positive_classes: Optional[list] = None  # Classes to treat as positive for binary grouping
    ):
        """
        Configure the pipeline; no data is touched until train() is called.

        Args:
            target_column: Name of the label column in the training dataframe.
            output_dir: Directory where model.pkl and reports are written.
            test_size: Held-out fraction for the stratified test split.
            cv_folds: Number of StratifiedKFold folds for model comparison.
            random_state: Seed used for splitting, CV, and all estimators.
            clean_data: Run DataCleaner before training.
            clean_mode: Cleaning intensity passed to DataCleaner.
            outlier_method: Outlier detection strategy passed to DataCleaner.
            flag_missing: Add missing-value indicator columns during cleaning.
            balance_data: Rebalance the training split (never the test split).
            balance_method: Strategy passed to DataBalancer (e.g. 'smote').
            n_jobs: Parallelism for estimators/CV; 1 avoids Windows joblib issues.
            calibrate: Wrap the fitted pipeline in CalibratedClassifierCV.
            calibration_method: 'isotonic' or 'sigmoid' calibration.
            optimize_threshold: Search a decision threshold instead of 0.5.
            threshold_metric: Objective for the threshold search
                ('f1', 'precision_recall_balance', 'youden', 'cost').
            conservative_mode: 'auto' regularizes small datasets,
                'always'/'never' force the behavior.
            binary_threshold / binary_rule / positive_classes: If any is set,
                AgnosticPipeline converts the target to binary dynamically.
        """
        self.target_column = target_column
        self.output_dir = Path(output_dir)
        self.test_size = test_size
        self.cv_folds = cv_folds
        self.random_state = random_state

        # Cleaning and balancing options
        self.clean_data = clean_data
        self.clean_mode = clean_mode
        self.outlier_method = outlier_method
        self.flag_missing = flag_missing
        self.balance_data = balance_data
        self.balance_method = balance_method
        self.n_jobs = n_jobs

        # Advanced options
        self.calibrate = calibrate
        self.calibration_method = calibration_method
        self.optimize_threshold = optimize_threshold
        self.threshold_metric = threshold_metric
        self.optimal_threshold = 0.5  # Default cutoff until train() optimizes it
        self.conservative_mode = conservative_mode
        self.is_small_dataset = False  # Set during training

        # Agnostic pipeline options
        self.binary_threshold = binary_threshold
        self.binary_rule = binary_rule
        self.positive_classes = positive_classes
        self.agnostic_pipeline = None  # Initialized when needed

        # Fitted-state attributes; all populated by train().
        self.preprocessor = None
        self.best_model = None
        self.best_model_name = None
        self.best_score = None
        self.feature_names = None
        self.numeric_columns = None
        self.categorical_columns = None
        self.expected_columns = None  # All expected input columns (for prediction alignment)
        self.profiler = DataProfiler(target_column=target_column)
        self.profile_report = None
        self.training_results = None
        self.cleaning_report = None
        self.balancing_report = None
        self.agnostic_report = None  # Report from AgnosticPipeline
        self.class_ratio = None
        self.sanity_warnings = []  # Sanity check warnings
        self.safety_report = None  # Safety validation report

        # Models will be initialized after we know the class ratio
        self.models = None
+
143
    def _init_models(self, class_ratio: float = 1.0, n_samples: int = 0) -> Dict[str, Any]:
        """
        Initialize all candidate models with proper n_jobs and class weight settings.

        Always builds LogisticRegression (baseline) and GradientBoosting
        (champion); adds RandomForest only for large datasets, and
        XGBoost/LightGBM when those optional packages are importable.
        Hyperparameters are regularized for small datasets.

        Args:
            class_ratio: Ratio of negative to positive samples for scale_pos_weight
            n_samples: Number of samples in dataset (for conservative mode tuning)

        Returns:
            Mapping of model name -> unfitted estimator.
        """
        # Determine if we should use conservative mode
        use_conservative = False
        if self.conservative_mode == 'always':
            use_conservative = True
        elif self.conservative_mode == 'auto' and n_samples < SMALL_DATASET_THRESHOLD:
            use_conservative = True
            self.is_small_dataset = True

        # Extra conservative for very small datasets
        very_small = n_samples < VERY_SMALL_DATASET_THRESHOLD

        if use_conservative:
            print(f"\n[CONSERVATIVE MODE] Dataset has {n_samples} samples - using regularized hyperparameters")

        # Configure hyperparameters based on dataset size
        if use_conservative:
            # Conservative settings to prevent overfitting
            rf_depth = 6 if very_small else 8
            rf_estimators = 100
            rf_min_samples_leaf = 10 if very_small else 5

            gb_depth = 3 if very_small else 4
            gb_estimators = 100
            gb_learning_rate = 0.05 if very_small else 0.1
            gb_subsample = 0.8
            gb_min_samples_leaf = 10 if very_small else 5

            xgb_depth = 4 if very_small else 5
            xgb_estimators = 100
            xgb_learning_rate = 0.05 if very_small else 0.1
            xgb_subsample = 0.8
            xgb_colsample = 0.8

            lgb_depth = 4 if very_small else 5
            lgb_estimators = 100
            lgb_learning_rate = 0.05 if very_small else 0.1
        else:
            # Standard settings for larger datasets
            rf_depth = 10
            rf_estimators = 200
            rf_min_samples_leaf = 1

            gb_depth = 5
            gb_estimators = 100
            gb_learning_rate = 0.1
            gb_subsample = 1.0
            gb_min_samples_leaf = 1

            xgb_depth = 6
            xgb_estimators = 200
            xgb_learning_rate = 0.1
            xgb_subsample = 1.0
            xgb_colsample = 1.0

            lgb_depth = 6
            lgb_estimators = 200
            lgb_learning_rate = 0.1

        # Core models: Logistic Regression (baseline) + Gradient Boosting (champion)
        models = {
            'LogisticRegression': LogisticRegression(
                max_iter=1000,
                class_weight='balanced',
                random_state=self.random_state
                # Note: n_jobs removed - deprecated in sklearn 1.8+
            ),
            'GradientBoosting': GradientBoostingClassifier(
                n_estimators=gb_estimators,
                max_depth=gb_depth,
                learning_rate=gb_learning_rate,
                subsample=gb_subsample,
                min_samples_leaf=gb_min_samples_leaf,
                random_state=self.random_state
            ),
        }

        # RandomForest: Only train for larger datasets (>20k rows) with strict regularization
        # RF tends to overfit on smaller datasets - GB is more reliable
        if n_samples >= RANDOM_FOREST_MIN_SAMPLES:
            models['RandomForest'] = RandomForestClassifier(
                n_estimators=rf_estimators,
                max_depth=min(rf_depth, 8),  # Strict regularization: max_depth <= 8
                min_samples_leaf=max(rf_min_samples_leaf, 5),
                class_weight='balanced',
                random_state=self.random_state,
                n_jobs=self.n_jobs
            )

        # Try to import optional models; silently skip when not installed.
        try:
            from xgboost import XGBClassifier
            # Use scale_pos_weight = (negatives / positives) for imbalanced data
            # NOTE(review): use_label_encoder is deprecated/removed in recent
            # XGBoost releases - confirm installed version tolerates it.
            models['XGBoost'] = XGBClassifier(
                n_estimators=xgb_estimators,
                max_depth=xgb_depth,
                learning_rate=xgb_learning_rate,
                subsample=xgb_subsample,
                colsample_bytree=xgb_colsample,
                scale_pos_weight=class_ratio,
                random_state=self.random_state,
                n_jobs=self.n_jobs,
                use_label_encoder=False,
                eval_metric='logloss'
            )
        except ImportError:
            pass

        try:
            from lightgbm import LGBMClassifier
            models['LightGBM'] = LGBMClassifier(
                n_estimators=lgb_estimators,
                max_depth=lgb_depth,
                learning_rate=lgb_learning_rate,
                class_weight='balanced',
                random_state=self.random_state,
                n_jobs=self.n_jobs,
                verbose=-1
            )
        except ImportError:
            pass

        return models
+
274
+ def profile(self, df: pd.DataFrame) -> Dict[str, Any]:
275
+ """Profile the dataset."""
276
+ self.profile_report = self.profiler.profile(df)
277
+ return self.profile_report
278
+
279
    def train(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Train multiple models and select the best performer.

        Pipeline stages: clean -> pre-training safety checks -> optional
        dynamic target transform -> stratified split -> optional balancing
        (train split only) -> CV comparison -> per-model test-set evaluation
        and ranking -> threshold optimization -> sanity/safety validation ->
        result compilation and saving.

        Args:
            df: Training dataframe with features and target

        Returns:
            dict: Training results with best model and scores

        Raises:
            ValueError: If the target column is missing, or fewer than two
                classes remain after cleaning.
        """
        if self.target_column not in df.columns:
            raise ValueError(f"Target column '{self.target_column}' not found")

        # Create output directory
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Step 1: Clean data
        if self.clean_data:
            cleaner = DataCleaner(
                target_column=self.target_column,
                clean_mode=self.clean_mode,
                outlier_method=self.outlier_method,
                flag_missing=self.flag_missing  # Control missing value indicators
            )
            df = cleaner.clean(df)
            self.cleaning_report = cleaner.get_report()

        # Step 1.2: Safety Validation - Pre-training checks
        print("\n[SAFETY] Running pre-training safety validation...")
        safety_validator = SafetyValidator(verbose=True)
        df, self.safety_report = safety_validator.run_pre_training_checks(df, self.target_column)

        # Add dropped features to sanity warnings
        if self.safety_report.dropped_features:
            self.sanity_warnings.append(
                f"Safety: Dropped {len(self.safety_report.dropped_features)} features due to leakage/redundancy"
            )
        for warning in self.safety_report.warnings:
            self.sanity_warnings.append(f"Safety: {warning}")

        # Step 1.5: Apply AgnosticPipeline for dynamic target handling (if configured)
        if self.binary_threshold is not None or self.binary_rule is not None or self.positive_classes is not None:
            print("\n[AGNOSTIC PIPELINE] Dynamic target transformation enabled")
            self.agnostic_pipeline = AgnosticPipeline(
                binary_threshold=self.binary_threshold,
                binary_rule=self.binary_rule,
                positive_classes=self.positive_classes,
                task_type='binary',
                flag_missing=self.flag_missing,
                verbose=True
            )
            X, y = self.agnostic_pipeline.fit_transform(df, self.target_column)
            self.agnostic_report = self.agnostic_pipeline.get_report()
        else:
            X = df.drop(columns=[self.target_column])
            y = df[self.target_column]

        n_samples = len(X)

        # Validate we have exactly 2 classes for binary classification
        class_counts = y.value_counts()
        n_classes = len(class_counts)
        if n_classes < 2:
            raise ValueError(
                f"Binary classification requires 2 classes, but found {n_classes}. "
                f"Classes: {list(class_counts.index)}. "
                "Check if data cleaning removed one class or if target column has issues."
            )
        elif n_classes > 2:
            print(f"Warning: Found {n_classes} classes. Treating as multi-class problem.")

        # Calculate class ratio for XGBoost scale_pos_weight.
        # NOTE(review): assumes labels 0/1; for other label values this falls
        # back to positional counts, which may invert neg/pos -- confirm.
        if len(class_counts) == 2:
            n_negative = class_counts.get(0, class_counts.iloc[0])
            n_positive = class_counts.get(1, class_counts.iloc[1])
            self.class_ratio = n_negative / n_positive if n_positive > 0 else 1.0
            print(f"Class ratio (neg/pos): {self.class_ratio:.2f}")
        else:
            self.class_ratio = 1.0

        # Initialize models with class ratio and sample count for proper tuning
        self.models = self._init_models(class_ratio=self.class_ratio, n_samples=n_samples)

        # Identify column types
        self.numeric_columns = X.select_dtypes(include=[np.number]).columns.tolist()
        self.categorical_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()

        # Store all expected columns for prediction alignment
        self.expected_columns = X.columns.tolist()

        # Create preprocessor
        self.preprocessor = self._create_preprocessor()

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=self.test_size,
            stratify=y,
            random_state=self.random_state
        )

        # Step 2: Balance training data (only on training set to avoid data leakage)
        if self.balance_data:
            balancer = DataBalancer(
                method=self.balance_method,
                random_state=self.random_state
            )
            X_train, y_train = balancer.balance(X_train, y_train)
            self.balancing_report = balancer.get_report()

        # Train and evaluate models
        model_scores = {}
        cv = StratifiedKFold(n_splits=self.cv_folds, shuffle=True, random_state=self.random_state)

        print(f"\nTraining {len(self.models)} models with {self.cv_folds}-fold CV...")
        print("-" * 50)

        for name, model in self.models.items():
            pipeline = Pipeline([
                ('preprocessor', self.preprocessor),
                ('classifier', model)
            ])

            # A model that fails CV gets score 0.0 so the run continues.
            try:
                scores = cross_val_score(
                    pipeline, X_train, y_train,
                    cv=cv, scoring='roc_auc', n_jobs=self.n_jobs
                )
                mean_score = scores.mean()
                std_score = scores.std()
                model_scores[name] = mean_score
                print(f" {name}: ROC-AUC = {mean_score:.4f} (+/- {std_score:.4f})")
            except Exception as e:
                print(f" {name}: Failed - {str(e)}")
                model_scores[name] = 0.0

        # Auto-discard RandomForest if it underperforms GradientBoosting
        # This prevents RF from being selected when it overfits
        if 'RandomForest' in model_scores and 'GradientBoosting' in model_scores:
            rf_score = model_scores['RandomForest']
            gb_score = model_scores['GradientBoosting']
            if rf_score < gb_score - RF_PRAUC_TOLERANCE:
                print(f" [AUTO-DISCARD] RandomForest ({rf_score:.4f}) underperforms GradientBoosting ({gb_score:.4f}) - removing from candidates")
                del model_scores['RandomForest']

        print("-" * 50)
        print(f"\nTraining and evaluating all candidate models on test set...")
        print("-" * 50)

        # Train ALL candidate models on full training data and evaluate on test set
        # This enables proper ranking based on test set metrics
        trained_models = {}
        model_test_metrics = {}

        for name in model_scores.keys():
            print(f"\n Training {name}...")
            pipeline = Pipeline([
                ('preprocessor', self.preprocessor),
                ('classifier', self.models[name])
            ])
            pipeline.fit(X_train, y_train)

            # Temporarily store feature names for this model
            # (helpers read self.best_model, so swap it in and restore after).
            temp_model = self.best_model
            self.best_model = pipeline
            self._extract_feature_names()

            # Apply calibration if enabled
            if self.calibrate:
                pipeline = self._calibrate_model(X_train, y_train)

            self.best_model = temp_model  # Restore

            trained_models[name] = pipeline

            # Evaluate on test set
            y_proba = pipeline.predict_proba(X_test)[:, 1]

            # Find optimal threshold for this model
            if self.optimize_threshold:
                threshold = self._find_optimal_threshold(y_test, y_proba)
            else:
                threshold = 0.5

            y_pred = (y_proba >= threshold).astype(int)

            # Compute metrics for ranking
            pr_auc = average_precision_score(y_test, y_proba)
            roc_auc = roc_auc_score(y_test, y_proba)

            # Compute confusion matrix to get Default Recall and FP count
            # Default class is 1 (positive class), recall = TP / (TP + FN)
            # NOTE(review): the 4-way ravel() unpack assumes a binary
            # confusion matrix; the multi-class path warned about above
            # would fail here -- confirm intended scope.
            cm = confusion_matrix(y_test, y_pred)
            tn, fp, fn, tp = cm.ravel()
            default_recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
            fp_count = fp

            model_test_metrics[name] = {
                'pr_auc': pr_auc,
                'roc_auc': roc_auc,
                'default_recall': default_recall,
                'fp_count': fp_count,
                'threshold': threshold,
                'cv_score': model_scores[name]
            }

            print(f" CV ROC-AUC: {model_scores[name]:.4f}")
            print(f" Test PR-AUC: {pr_auc:.4f} | Test ROC-AUC: {roc_auc:.4f}")
            print(f" Default Recall: {default_recall:.4f} | FP Count: {fp_count}")

        # Rank models: PR-AUC (desc) → Default Recall (desc) → FP Count (asc)
        print("\n" + "-" * 50)
        print("Model Ranking (PR-AUC → Default Recall → FP Count):")
        print("-" * 50)

        def model_ranking_key(name):
            metrics = model_test_metrics[name]
            # Higher PR-AUC is better (negate for descending)
            # Higher Default Recall is better (negate for descending)
            # Lower FP Count is better (keep positive for ascending)
            return (-metrics['pr_auc'], -metrics['default_recall'], metrics['fp_count'])

        ranked_models = sorted(model_test_metrics.keys(), key=model_ranking_key)

        for rank, name in enumerate(ranked_models, 1):
            m = model_test_metrics[name]
            marker = " ★" if rank == 1 else ""
            print(f" {rank}. {name}: PR-AUC={m['pr_auc']:.4f}, Recall={m['default_recall']:.4f}, FP={m['fp_count']}{marker}")

        # Select the best model based on ranking
        self.best_model_name = ranked_models[0]
        self.best_model = trained_models[self.best_model_name]
        self.best_score = model_scores[self.best_model_name]  # CV score for consistency
        self.optimal_threshold = model_test_metrics[self.best_model_name]['threshold']

        # Store all model test metrics for reporting
        self.model_test_metrics = model_test_metrics

        print("-" * 50)
        print(f"Selected model: {self.best_model_name}")
        print(f" CV ROC-AUC: {self.best_score:.4f}")
        print(f" Test PR-AUC: {model_test_metrics[self.best_model_name]['pr_auc']:.4f}")
        print(f" Default Recall: {model_test_metrics[self.best_model_name]['default_recall']:.4f}")
        print(f" FP Count: {model_test_metrics[self.best_model_name]['fp_count']}")

        # Re-extract feature names for the selected model
        self._extract_feature_names()

        # Get test metrics for the selected model
        y_proba = self.best_model.predict_proba(X_test)[:, 1]
        test_auc = roc_auc_score(y_test, y_proba)
        test_pr_auc = average_precision_score(y_test, y_proba)

        # Optimize threshold if enabled
        if self.optimize_threshold:
            self.optimal_threshold = self._find_optimal_threshold(y_test, y_proba)
            print(f"Optimal threshold ({self.threshold_metric}): {self.optimal_threshold:.3f}")

        # Use optimal threshold for predictions
        y_pred = (y_proba >= self.optimal_threshold).astype(int)

        print(f"\nTest set evaluation (threshold={self.optimal_threshold:.3f}):")
        print(classification_report(y_test, y_pred))
        print(f"Test ROC-AUC: {test_auc:.4f}")
        print(f"Test PR-AUC: {test_pr_auc:.4f}")

        # Get feature importances
        feature_importances = self._get_feature_importances()

        # Run sanity checks
        self.sanity_warnings = self._run_sanity_checks(
            X=X,
            y=y,
            cv_score=self.best_score,
            test_score=test_auc,
            importances=feature_importances
        )
        self._print_sanity_warnings()

        # Step 6: Post-training safety validation
        print("\n[SAFETY] Running post-training safety validation...")
        self.safety_report = safety_validator.run_post_training_checks(
            feature_importances=feature_importances,
            cv_score=self.best_score,
            test_auc=test_auc
        )

        # Check for perfect scores (likely leakage)
        perfect_score_warnings = check_perfect_score_warning({
            'Test AUC': test_auc,
            'Test PR-AUC': test_pr_auc,
            'CV Score': self.best_score
        })
        for warning in perfect_score_warnings:
            self.sanity_warnings.append(warning)
            self.safety_report.add_warning(warning)

        # Add post-training safety warnings/errors to sanity warnings
        for error in self.safety_report.errors:
            self.sanity_warnings.append(f"SAFETY FAIL: {error}")
        for warning in self.safety_report.warnings:
            if warning not in self.sanity_warnings:
                self.sanity_warnings.append(f"Safety: {warning}")

        # Compute ROC curve points for visualization
        from sklearn.metrics import roc_curve, precision_recall_curve
        fpr, tpr, roc_thresholds = roc_curve(y_test, y_proba)
        precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_test, y_proba)

        # Compile results
        self.training_results = {
            'best_model': self.best_model_name,
            'best_score': self.best_score,
            'test_auc': test_auc,
            'test_pr_auc': test_pr_auc,
            'optimal_threshold': self.optimal_threshold,
            'class_ratio': self.class_ratio,
            'model_scores': model_scores,
            'model_test_metrics': model_test_metrics,  # Full test metrics for all models
            'model_ranking': ranked_models,  # Ordered list of model names by rank
            'classification_report': classification_report(y_test, y_pred, output_dict=True),
            'confusion_matrix': confusion_matrix(y_test, y_pred).tolist(),
            'feature_importances': feature_importances,
            'cleaning_report': self.cleaning_report,
            'balancing_report': self.balancing_report,
            'agnostic_report': self.agnostic_report,
            'safety_report': self.safety_report.to_dict() if self.safety_report else None,
            'model_valid': self.safety_report.model_valid if self.safety_report else True,
            'calibrated': self.calibrate,
            'sanity_warnings': self.sanity_warnings,  # Include warnings in results
            # Test predictions for visualization
            'test_predictions': {
                'y_true': y_test.tolist(),
                'y_pred': y_pred.tolist(),
                'y_proba': y_proba.tolist(),
                'n_samples': len(y_test)
            },
            # ROC and PR curve data for charting
            'roc_curve': {
                'fpr': fpr.tolist(),
                'tpr': tpr.tolist()
            },
            'pr_curve': {
                'precision': precision_curve.tolist(),
                'recall': recall_curve.tolist()
            }
        }

        # Save outputs
        self._save_outputs()

        return self.training_results
+
632
    def predict(
        self,
        df: pd.DataFrame,
        include_proba: bool = False,
        threshold: Optional[float] = None
    ) -> pd.DataFrame:
        """
        Make predictions on new data using the optimal threshold.

        The target column, if present, is dropped before scoring. Input is
        either transformed by the fitted AgnosticPipeline (when one was used
        in training) or column-aligned against the training schema.

        Args:
            df: Input dataframe with features
            include_proba: Whether to include prediction probabilities
            threshold: Custom threshold (uses optimal if None)

        Returns:
            Copy of ``df`` with a 'prediction' column, plus 'proba_0',
            'proba_1' and 'threshold_used' when ``include_proba`` is True.

        Raises:
            ValueError: If no trained model is available.
        """
        if self.best_model is None:
            raise ValueError("Model not trained. Call train() first or load a saved model.")

        # Drop the label if the caller passed labeled data.
        if self.target_column in df.columns:
            X = df.drop(columns=[self.target_column])
        else:
            X = df.copy()

        # Use agnostic pipeline transform if it was used during training
        if self.agnostic_pipeline is not None:
            X = self.agnostic_pipeline.transform(X)
            alignment_info = {}  # Agnostic pipeline handles alignment
        else:
            # Align prediction data with expected columns from training
            X, alignment_info = self._align_prediction_data(X)

        result = df.copy()

        # Get probabilities
        probas = self.best_model.predict_proba(X)

        # Use optimal threshold unless the caller supplied one.
        thresh = threshold if threshold is not None else self.optimal_threshold
        result['prediction'] = (probas[:, 1] >= thresh).astype(int)

        if include_proba:
            result['proba_0'] = probas[:, 0]
            result['proba_1'] = probas[:, 1]
            result['threshold_used'] = thresh

        # Add alignment info as metadata if there were issues
        if alignment_info.get('missing_columns') or alignment_info.get('extra_columns'):
            print(f"Column alignment applied: {alignment_info}")

        return result
+
685
+ def _align_prediction_data(self, X: pd.DataFrame) -> tuple:
686
+ """
687
+ Align prediction data to match the expected columns from training.
688
+
689
+ This handles:
690
+ 1. Missing columns: Added with NaN (will be imputed by the pipeline)
691
+ 2. Extra columns: Removed (not needed for prediction)
692
+ 3. Column order: Reordered to match training order
693
+
694
+ Args:
695
+ X: Input dataframe for prediction
696
+
697
+ Returns:
698
+ Tuple of (aligned dataframe, alignment info dict)
699
+ """
700
+ alignment_info = {
701
+ 'missing_columns': [],
702
+ 'extra_columns': [],
703
+ 'columns_added_with_nan': []
704
+ }
705
+
706
+ if self.expected_columns is None:
707
+ # Fallback for older models without expected_columns
708
+ # Use numeric_columns + categorical_columns
709
+ if self.numeric_columns is not None and self.categorical_columns is not None:
710
+ self.expected_columns = self.numeric_columns + self.categorical_columns
711
+
712
+ if self.expected_columns is None:
713
+ # No column info available, return as-is
714
+ return X, alignment_info
715
+
716
+ input_cols = set(X.columns)
717
+ expected_cols = set(self.expected_columns)
718
+
719
+ # Find missing and extra columns
720
+ missing = expected_cols - input_cols
721
+ extra = input_cols - expected_cols
722
+
723
+ alignment_info['missing_columns'] = list(missing)
724
+ alignment_info['extra_columns'] = list(extra)
725
+
726
+ # Add missing columns with NaN (they will be imputed by SimpleImputer)
727
+ for col in missing:
728
+ # Determine the appropriate dtype for the missing column
729
+ if col in self.numeric_columns:
730
+ X[col] = np.nan
731
+ else:
732
+ # For categorical columns, use NaN (will be imputed with most_frequent)
733
+ X[col] = np.nan
734
+ alignment_info['columns_added_with_nan'].append(col)
735
+
736
+ # Remove extra columns (not needed for prediction)
737
+ X = X.drop(columns=list(extra), errors='ignore')
738
+
739
+ # Reorder columns to match training order
740
+ X = X[self.expected_columns]
741
+
742
+ return X, alignment_info
743
+
744
+ def _create_preprocessor(self) -> ColumnTransformer:
745
+ """Create the preprocessing pipeline."""
746
+ numeric_transformer = Pipeline([
747
+ ('imputer', SimpleImputer(strategy='median')),
748
+ ('scaler', StandardScaler())
749
+ ])
750
+
751
+ categorical_transformer = Pipeline([
752
+ ('imputer', SimpleImputer(strategy='most_frequent')),
753
+ ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
754
+ ])
755
+
756
+ return ColumnTransformer(
757
+ transformers=[
758
+ ('num', numeric_transformer, self.numeric_columns),
759
+ ('cat', categorical_transformer, self.categorical_columns)
760
+ ],
761
+ remainder='drop'
762
+ )
763
+
764
+ def _extract_feature_names(self):
765
+ """Extract feature names from fitted preprocessor."""
766
+ preprocessor = self.best_model.named_steps['preprocessor']
767
+ feature_names = list(self.numeric_columns)
768
+
769
+ if self.categorical_columns:
770
+ cat_encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']
771
+ cat_names = cat_encoder.get_feature_names_out(self.categorical_columns)
772
+ feature_names.extend(cat_names.tolist())
773
+
774
+ self.feature_names = feature_names
775
+
776
+ def _get_feature_importances(self) -> Dict[str, float]:
777
+ """Get feature importances from the trained model."""
778
+ # Handle calibrated models
779
+ if hasattr(self.best_model, 'calibrated_classifiers_'):
780
+ # CalibratedClassifierCV wraps the original pipeline
781
+ base_classifier = self.best_model.calibrated_classifiers_[0].estimator
782
+ if hasattr(base_classifier, 'named_steps'):
783
+ classifier = base_classifier.named_steps['classifier']
784
+ else:
785
+ classifier = base_classifier
786
+ elif hasattr(self.best_model, 'named_steps'):
787
+ classifier = self.best_model.named_steps['classifier']
788
+ else:
789
+ return {}
790
+
791
+ if hasattr(classifier, 'feature_importances_'):
792
+ importances = classifier.feature_importances_
793
+ elif hasattr(classifier, 'coef_'):
794
+ importances = np.abs(classifier.coef_[0])
795
+ else:
796
+ return {}
797
+
798
+ return dict(zip(self.feature_names, importances.tolist()))
799
+
800
+ def _calibrate_model(self, X_train: pd.DataFrame, y_train: pd.Series) -> CalibratedClassifierCV:
801
+ """
802
+ Apply probability calibration using isotonic regression or Platt scaling.
803
+
804
+ Args:
805
+ X_train: Training features
806
+ y_train: Training labels
807
+
808
+ Returns:
809
+ Calibrated model
810
+ """
811
+ calibrated = CalibratedClassifierCV(
812
+ self.best_model,
813
+ method=self.calibration_method,
814
+ cv=3
815
+ )
816
+ calibrated.fit(X_train, y_train)
817
+ return calibrated
818
+
819
+ def _find_optimal_threshold(self, y_true: pd.Series, y_proba: np.ndarray) -> float:
820
+ """
821
+ Find optimal classification threshold based on specified metric.
822
+
823
+ Args:
824
+ y_true: True labels
825
+ y_proba: Predicted probabilities
826
+
827
+ Returns:
828
+ Optimal threshold value
829
+ """
830
+ thresholds = np.arange(0.1, 0.9, 0.01)
831
+ best_threshold = 0.5
832
+ best_score = 0
833
+
834
+ if self.threshold_metric == 'f1':
835
+ for thresh in thresholds:
836
+ y_pred = (y_proba >= thresh).astype(int)
837
+ score = f1_score(y_true, y_pred, zero_division=0)
838
+ if score > best_score:
839
+ best_score = score
840
+ best_threshold = thresh
841
+
842
+ elif self.threshold_metric == 'precision_recall_balance':
843
+ # Find threshold where precision and recall are closest
844
+ precision, recall, pr_thresholds = precision_recall_curve(y_true, y_proba)
845
+ # Find index where precision and recall are most balanced
846
+ diff = np.abs(precision[:-1] - recall[:-1])
847
+ best_idx = np.argmin(diff)
848
+ best_threshold = pr_thresholds[best_idx]
849
+
850
+ elif self.threshold_metric == 'youden':
851
+ # Youden's J statistic: sensitivity + specificity - 1
852
+ for thresh in thresholds:
853
+ y_pred = (y_proba >= thresh).astype(int)
854
+ tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
855
+ sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
856
+ specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
857
+ score = sensitivity + specificity - 1
858
+ if score > best_score:
859
+ best_score = score
860
+ best_threshold = thresh
861
+
862
+ elif self.threshold_metric == 'cost':
863
+ # Cost-sensitive: penalize false negatives more (missing defaults is expensive)
864
+ # Default: FN costs 5x more than FP
865
+ fn_cost = 5.0
866
+ fp_cost = 1.0
867
+ best_cost = float('inf')
868
+ for thresh in thresholds:
869
+ y_pred = (y_proba >= thresh).astype(int)
870
+ tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
871
+ total_cost = fn * fn_cost + fp * fp_cost
872
+ if total_cost < best_cost:
873
+ best_cost = total_cost
874
+ best_threshold = thresh
875
+
876
+ return best_threshold
877
+
878
+ def _save_outputs(self):
879
+ """Save model and reports."""
880
+ # Save model
881
+ model_path = self.output_dir / 'model.pkl'
882
+ model_data = {
883
+ 'pipeline': self.best_model,
884
+ 'model_name': self.best_model_name,
885
+ 'feature_names': self.feature_names,
886
+ 'numeric_columns': self.numeric_columns,
887
+ 'categorical_columns': self.categorical_columns,
888
+ 'expected_columns': self.expected_columns, # For prediction alignment
889
+ 'target_column': self.target_column,
890
+ 'optimal_threshold': self.optimal_threshold,
891
+ 'class_ratio': self.class_ratio,
892
+ 'calibrated': self.calibrate,
893
+ # Agnostic pipeline info
894
+ 'agnostic_pipeline': self.agnostic_pipeline,
895
+ 'binary_threshold': self.binary_threshold,
896
+ 'binary_rule': self.binary_rule,
897
+ 'positive_classes': self.positive_classes
898
+ }
899
+ joblib.dump(model_data, model_path)
900
+ print(f"\nModel saved to: {model_path}")
901
+
902
+ # Save JSON report
903
+ report_path = self.output_dir / 'report.json'
904
+ with open(report_path, 'w') as f:
905
+ json.dump(self.training_results, f, indent=2, default=str)
906
+ print(f"Report saved to: {report_path}")
907
+
908
+ # Generate HTML report
909
+ reporter = ReportGenerator(self.output_dir)
910
+ reporter.generate_html_report(
911
+ self.training_results,
912
+ self.profile_report
913
+ )
914
+
915
+ # ============== Sanity Checks ==============
916
+
917
+ def _check_leakage(self, X: pd.DataFrame, y: pd.Series) -> List[str]:
918
+ """
919
+ Check for potential feature leakage by detecting features highly correlated with target.
920
+
921
+ Returns:
922
+ List of warning messages for suspicious features
923
+ """
924
+ warnings = []
925
+ numeric_cols = X.select_dtypes(include=[np.number]).columns
926
+
927
+ for col in numeric_cols:
928
+ try:
929
+ # Calculate point-biserial correlation for numeric features
930
+ corr = np.abs(X[col].corr(y))
931
+ if corr >= LEAKAGE_CORRELATION_THRESHOLD:
932
+ warnings.append(
933
+ f"LEAKAGE WARNING: Feature '{col}' has {corr:.2%} correlation with target. "
934
+ f"This may indicate data leakage - investigate if this feature contains future information."
935
+ )
936
+ except Exception:
937
+ pass # Skip columns that can't be correlated
938
+
939
+ return warnings
940
+
941
+ def _check_minority_samples(self, y: pd.Series) -> List[str]:
942
+ """
943
+ Check if minority class has enough samples for reliable metrics.
944
+
945
+ Returns:
946
+ List of warning messages
947
+ """
948
+ warnings = []
949
+ class_counts = y.value_counts()
950
+ min_count = class_counts.min()
951
+
952
+ if min_count < MIN_MINORITY_SAMPLES:
953
+ warnings.append(
954
+ f"MINORITY CLASS WARNING: Only {min_count} samples in minority class. "
955
+ f"PR-AUC and other metrics may be unreliable. Consider collecting more data "
956
+ f"or using techniques like SMOTE cautiously."
957
+ )
958
+
959
+ return warnings
960
+
961
+ def _check_cv_test_gap(self, cv_score: float, test_score: float) -> List[str]:
962
+ """
963
+ Check for significant gap between CV and test scores (indicates overfitting).
964
+
965
+ Returns:
966
+ List of warning messages
967
+ """
968
+ warnings = []
969
+ gap = cv_score - test_score
970
+
971
+ if gap > CV_TEST_DROP_THRESHOLD:
972
+ warnings.append(
973
+ f"OVERFITTING WARNING: Test AUC ({test_score:.4f}) is {gap:.4f} lower than "
974
+ f"CV AUC ({cv_score:.4f}). This indicates potential overfitting. "
975
+ f"Consider: more regularization, simpler model, or more training data."
976
+ )
977
+
978
+ return warnings
979
+
980
+ def _check_feature_dominance(self, importances: Dict[str, float]) -> List[str]:
981
+ """
982
+ Check if a single feature dominates the model (potential leakage or oversensitivity).
983
+
984
+ Returns:
985
+ List of warning messages
986
+ """
987
+ warnings = []
988
+ if not importances:
989
+ return warnings
990
+
991
+ total = sum(importances.values())
992
+ if total == 0:
993
+ return warnings
994
+
995
+ for feature, importance in importances.items():
996
+ pct = importance / total
997
+ if pct > FEATURE_DOMINANCE_THRESHOLD:
998
+ warnings.append(
999
+ f"FEATURE DOMINANCE WARNING: Feature '{feature}' accounts for {pct:.1%} "
1000
+ f"of model importance. Investigate for potential leakage or consider "
1001
+ f"if the model is too dependent on a single variable."
1002
+ )
1003
+
1004
+ return warnings
1005
+
1006
+ def _run_sanity_checks(
1007
+ self,
1008
+ X: pd.DataFrame,
1009
+ y: pd.Series,
1010
+ cv_score: float,
1011
+ test_score: float,
1012
+ importances: Dict[str, float]
1013
+ ) -> List[str]:
1014
+ """
1015
+ Run all sanity checks and collect warnings.
1016
+
1017
+ Returns:
1018
+ List of all warning messages
1019
+ """
1020
+ all_warnings = []
1021
+
1022
+ # Check for leakage
1023
+ all_warnings.extend(self._check_leakage(X, y))
1024
+
1025
+ # Check minority class samples
1026
+ all_warnings.extend(self._check_minority_samples(y))
1027
+
1028
+ # Check CV vs test gap
1029
+ all_warnings.extend(self._check_cv_test_gap(cv_score, test_score))
1030
+
1031
+ # Check feature dominance
1032
+ all_warnings.extend(self._check_feature_dominance(importances))
1033
+
1034
+ return all_warnings
1035
+
1036
+ def _print_sanity_warnings(self):
1037
+ """Print all sanity warnings."""
1038
+ if self.sanity_warnings:
1039
+ print(f"\n{'='*60}")
1040
+ print("SANITY CHECK WARNINGS")
1041
+ print(f"{'='*60}")
1042
+ for i, warning in enumerate(self.sanity_warnings, 1):
1043
+ print(f"\n[{i}] {warning}")
1044
+ print(f"\n{'='*60}")
1045
+
1046
+ @classmethod
1047
+ def load(cls, model_path: str) -> 'CredilyPipeline':
1048
+ """Load a trained pipeline from disk."""
1049
+ model_data = joblib.load(model_path)
1050
+
1051
+ instance = cls(target_column=model_data['target_column'])
1052
+ instance.best_model = model_data['pipeline']
1053
+ instance.best_model_name = model_data['model_name']
1054
+ instance.feature_names = model_data['feature_names']
1055
+ instance.numeric_columns = model_data['numeric_columns']
1056
+ instance.categorical_columns = model_data['categorical_columns']
1057
+ instance.expected_columns = model_data.get('expected_columns', None)
1058
+ instance.optimal_threshold = model_data.get('optimal_threshold', 0.5)
1059
+ instance.class_ratio = model_data.get('class_ratio', 1.0)
1060
+ instance.calibrate = model_data.get('calibrated', False)
1061
+
1062
+ # Agnostic pipeline info
1063
+ instance.agnostic_pipeline = model_data.get('agnostic_pipeline', None)
1064
+ instance.binary_threshold = model_data.get('binary_threshold', None)
1065
+ instance.binary_rule = model_data.get('binary_rule', None)
1066
+ instance.positive_classes = model_data.get('positive_classes', None)
1067
+
1068
+ # Fallback for older models without expected_columns
1069
+ if instance.expected_columns is None:
1070
+ if instance.numeric_columns is not None and instance.categorical_columns is not None:
1071
+ instance.expected_columns = instance.numeric_columns + instance.categorical_columns
1072
+
1073
+ return instance
credily/balancing.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data balancing module for Credily.
3
+ Handles imbalanced datasets for binary classification.
4
+ """
5
+
6
+ import pandas as pd
7
+ import numpy as np
8
+ from typing import Tuple, Optional, Dict, Any
9
+
10
+
11
class DataBalancer:
    """
    Balances imbalanced datasets for binary classification.

    Supports oversampling (SMOTE/SMOTENC, random), undersampling (random,
    Tomek links, NearMiss), and combined strategies via imbalanced-learn.
    When imbalanced-learn is not installed, a seeded manual random
    resampler is used as a fallback so results remain reproducible.
    """

    # Accepted values for the `method` constructor argument.
    METHODS = ['smote', 'random_oversample', 'random_undersample', 'smote_tomek', 'tomek', 'nearmiss', 'none']

    def __init__(
        self,
        method: str = 'smote',
        sampling_strategy: str = 'auto',
        random_state: int = 42
    ):
        """
        Initialize the DataBalancer.

        Args:
            method: Balancing method ('smote', 'random_oversample', 'random_undersample', 'smote_tomek', 'none')
            sampling_strategy: Sampling strategy ('auto', 'minority', or float ratio)
            random_state: Random seed for reproducibility

        Raises:
            ValueError: If `method` is not one of METHODS.
        """
        if method not in self.METHODS:
            raise ValueError(f"Unknown method: {method}. Choose from {self.METHODS}")

        self.method = method
        self.sampling_strategy = sampling_strategy
        self.random_state = random_state
        self.balancing_report = {}

    def balance(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        verbose: bool = True
    ) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Balance the dataset using the configured method.

        Skips resampling entirely when method is 'none' or when the
        majority/minority ratio is already below 1.5. Populates
        self.balancing_report with before/after statistics.

        Args:
            X: Feature dataframe
            y: Target series
            verbose: Print balancing steps

        Returns:
            Balanced (X, y) tuple
        """
        if self.method == 'none':
            return X, y

        original_counts = y.value_counts().to_dict()

        if verbose:
            print(f"\n{'='*50}")
            print("DATA BALANCING")
            print(f"{'='*50}")
            print(f"Original class distribution: {original_counts}")
            print(f"Method: {self.method}")

        # Skip if the classes are already close to balanced.
        majority = max(original_counts.values())
        minority = min(original_counts.values())
        imbalance_ratio = majority / minority if minority > 0 else float('inf')

        if imbalance_ratio < 1.5:
            if verbose:
                print("Dataset is already balanced (ratio < 1.5). Skipping.")
            return X, y

        # Dispatch to the selected resampling strategy.
        if self.method == 'smote':
            X_bal, y_bal = self._apply_smote(X, y)
        elif self.method == 'random_oversample':
            X_bal, y_bal = self._apply_random_oversample(X, y)
        elif self.method == 'random_undersample':
            X_bal, y_bal = self._apply_random_undersample(X, y)
        elif self.method == 'smote_tomek':
            X_bal, y_bal = self._apply_smote_tomek(X, y)
        elif self.method == 'tomek':
            X_bal, y_bal = self._apply_tomek_links(X, y)
        elif self.method == 'nearmiss':
            X_bal, y_bal = self._apply_nearmiss(X, y)
        else:
            X_bal, y_bal = X, y

        final_counts = y_bal.value_counts().to_dict()

        if verbose:
            print(f"Final class distribution: {final_counts}")
            print(f"Samples before: {len(y)}, after: {len(y_bal)}")
            print(f"{'='*50}\n")

        self.balancing_report = {
            'method': self.method,
            'original_counts': original_counts,
            'final_counts': final_counts,
            'original_imbalance_ratio': imbalance_ratio,
            'samples_before': len(y),
            'samples_after': len(y_bal)
        }

        return X_bal, y_bal

    def _apply_smote(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Apply SMOTE oversampling.
        Uses SMOTENC for mixed numeric/categorical data, regular SMOTE for all-numeric data.
        Falls back to random oversampling if imbalanced-learn is missing or SMOTE fails.
        """
        try:
            categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

            if categorical_cols:
                # Mixed data types require SMOTENC.
                return self._apply_smotenc(X, y, categorical_cols)

            from imblearn.over_sampling import SMOTE
            smote = SMOTE(
                sampling_strategy=self.sampling_strategy,
                random_state=self.random_state,
                # k_neighbors must be < minority count; cap at 5.
                k_neighbors=min(5, y.value_counts().min() - 1)
            )
            X_res, y_res = smote.fit_resample(X, y)
            return pd.DataFrame(X_res, columns=X.columns), pd.Series(y_res, name=y.name)
        except ImportError:
            print("  Warning: imbalanced-learn not installed. Using random oversampling instead.")
            print("  Install with: pip install imbalanced-learn")
            return self._apply_random_oversample(X, y)
        except Exception as e:
            print(f"  Warning: SMOTE failed ({e}). Using random oversampling instead.")
            return self._apply_random_oversample(X, y)

    def _apply_smotenc(self, X: pd.DataFrame, y: pd.Series, categorical_cols: list) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Apply SMOTENC for datasets with mixed numeric and categorical features.
        SMOTENC properly handles categorical columns by preserving their discrete nature.
        Categorical columns are integer-encoded for SMOTENC and decoded afterwards.
        """
        try:
            from imblearn.over_sampling import SMOTENC

            # SMOTENC needs positional indices of the categorical features.
            categorical_indices = [X.columns.get_loc(col) for col in categorical_cols]
            numeric_cols = [col for col in X.columns if col not in categorical_cols]

            X_encoded = X.copy()

            # Track which numeric values were NaN (SMOTE cannot handle NaN).
            numeric_nan_masks = {}
            for col in numeric_cols:
                if X_encoded[col].isna().any():
                    numeric_nan_masks[col] = X_encoded[col].isna()
                    # Fill with median for SMOTE (synthetic samples keep the fill).
                    X_encoded[col] = X_encoded[col].fillna(X_encoded[col].median())

            # Integer-encode categoricals; keep a decoder per column.
            encoders = {}
            for col in categorical_cols:
                X_encoded[col] = X_encoded[col].fillna('__MISSING__')
                X_encoded[col] = X_encoded[col].astype('category')
                encoders[col] = dict(enumerate(X_encoded[col].cat.categories))
                X_encoded[col] = X_encoded[col].cat.codes

            smotenc = SMOTENC(
                categorical_features=categorical_indices,
                sampling_strategy=self.sampling_strategy,
                random_state=self.random_state,
                k_neighbors=min(5, y.value_counts().min() - 1)
            )

            X_res, y_res = smotenc.fit_resample(X_encoded, y)
            X_res = pd.DataFrame(X_res, columns=X.columns)

            # Decode categorical codes back to their original string labels.
            for col in categorical_cols:
                reverse_encoder = {v: k for k, v in encoders[col].items()}
                X_res[col] = X_res[col].round().astype(int).map(reverse_encoder)
                # String dtype is required for sklearn's categorical handling.
                X_res[col] = X_res[col].astype(str)
                # Restore NaN for the placeholder category.
                X_res.loc[X_res[col] == '__MISSING__', col] = np.nan

            # Note: synthetic samples never get their NaNs restored because
            # SMOTE interpolates from the (filled) values. Expected behavior.
            return X_res, pd.Series(y_res, name=y.name)

        except ImportError:
            print("  Warning: SMOTENC not available. Using random oversampling instead.")
            return self._apply_random_oversample(X, y)
        except Exception as e:
            print(f"  Warning: SMOTENC failed ({e}). Using random oversampling instead.")
            return self._apply_random_oversample(X, y)

    def _apply_random_oversample(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
        """Apply random oversampling (imbalanced-learn, or a seeded manual fallback)."""
        try:
            from imblearn.over_sampling import RandomOverSampler
            ros = RandomOverSampler(
                sampling_strategy=self.sampling_strategy,
                random_state=self.random_state
            )
            X_res, y_res = ros.fit_resample(X, y)
            return pd.DataFrame(X_res, columns=X.columns), pd.Series(y_res, name=y.name)
        except ImportError:
            return self._manual_oversample(X, y)

    def _apply_random_undersample(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
        """Apply random undersampling (imbalanced-learn, or a seeded manual fallback)."""
        try:
            from imblearn.under_sampling import RandomUnderSampler
            rus = RandomUnderSampler(
                sampling_strategy=self.sampling_strategy,
                random_state=self.random_state
            )
            X_res, y_res = rus.fit_resample(X, y)
            return pd.DataFrame(X_res, columns=X.columns), pd.Series(y_res, name=y.name)
        except ImportError:
            return self._manual_undersample(X, y)

    def _apply_smote_tomek(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
        """Apply SMOTE + Tomek links (combined over/undersampling)."""
        try:
            from imblearn.combine import SMOTETomek
            smt = SMOTETomek(
                sampling_strategy=self.sampling_strategy,
                random_state=self.random_state
            )
            X_res, y_res = smt.fit_resample(X, y)
            return pd.DataFrame(X_res, columns=X.columns), pd.Series(y_res, name=y.name)
        except ImportError:
            print("  Warning: imbalanced-learn not installed. Using SMOTE instead.")
            return self._apply_smote(X, y)

    def _apply_tomek_links(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Apply Tomek Links for cleaning noisy/borderline samples.
        Removes majority class samples that form Tomek links with minority class.
        """
        try:
            from imblearn.under_sampling import TomekLinks
            tomek = TomekLinks(sampling_strategy='majority')
            X_res, y_res = tomek.fit_resample(X, y)
            return pd.DataFrame(X_res, columns=X.columns), pd.Series(y_res, name=y.name)
        except ImportError:
            print("  Warning: imbalanced-learn not installed. Skipping Tomek Links.")
            return X, y

    def _apply_nearmiss(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Apply NearMiss undersampling.
        Selects majority samples closest to minority samples (NearMiss-1).
        """
        try:
            from imblearn.under_sampling import NearMiss
            nm = NearMiss(
                sampling_strategy=self.sampling_strategy,
                version=1  # NearMiss-1: closest to minority samples
            )
            X_res, y_res = nm.fit_resample(X, y)
            return pd.DataFrame(X_res, columns=X.columns), pd.Series(y_res, name=y.name)
        except ImportError:
            print("  Warning: imbalanced-learn not installed. Using random undersampling instead.")
            return self._apply_random_undersample(X, y)

    def _manual_oversample(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
        """Manual random oversampling without imbalanced-learn.

        FIX: draws from a RandomState seeded with self.random_state instead of
        the global numpy RNG, so the fallback honors the reproducibility
        contract implied by the random_state constructor argument.
        """
        rng = np.random.RandomState(self.random_state)
        counts = y.value_counts()
        majority_count = counts.max()

        dfs = []
        for class_val in counts.index:
            class_mask = y == class_val
            class_df = X[class_mask]
            class_y = y[class_mask]

            if len(class_df) < majority_count:
                # Sample (with replacement) enough rows to match the majority.
                n_samples = majority_count - len(class_df)
                sampled_idx = rng.choice(class_df.index, size=n_samples, replace=True)
                extra_X = class_df.loc[sampled_idx].reset_index(drop=True)
                extra_y = class_y.loc[sampled_idx].reset_index(drop=True)
                dfs.append((pd.concat([class_df, extra_X], ignore_index=True),
                            pd.concat([class_y, extra_y], ignore_index=True)))
            else:
                dfs.append((class_df.reset_index(drop=True), class_y.reset_index(drop=True)))

        X_res = pd.concat([d[0] for d in dfs], ignore_index=True)
        y_res = pd.concat([d[1] for d in dfs], ignore_index=True)
        return X_res, y_res

    def _manual_undersample(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
        """Manual random undersampling without imbalanced-learn.

        FIX: seeded RandomState (see _manual_oversample) for reproducibility.
        """
        rng = np.random.RandomState(self.random_state)
        counts = y.value_counts()
        minority_count = counts.min()

        dfs = []
        for class_val in counts.index:
            class_mask = y == class_val
            class_df = X[class_mask]
            class_y = y[class_mask]

            if len(class_df) > minority_count:
                # Keep a without-replacement sample the size of the minority class.
                sampled_idx = rng.choice(class_df.index, size=minority_count, replace=False)
                dfs.append((class_df.loc[sampled_idx].reset_index(drop=True),
                            class_y.loc[sampled_idx].reset_index(drop=True)))
            else:
                dfs.append((class_df.reset_index(drop=True), class_y.reset_index(drop=True)))

        X_res = pd.concat([d[0] for d in dfs], ignore_index=True)
        y_res = pd.concat([d[1] for d in dfs], ignore_index=True)
        return X_res, y_res

    def get_report(self) -> Dict[str, Any]:
        """Get the balancing report (empty dict before balance() has run)."""
        return self.balancing_report
336
+
337
+
338
def check_imbalance(y: pd.Series) -> Dict[str, Any]:
    """
    Summarize class imbalance in a target variable.

    Args:
        y: Target series

    Returns:
        Dict with class counts, majority/minority labels, the imbalance
        ratio, an is_imbalanced flag (ratio > 1.5), and a textual
        rebalancing recommendation.
    """
    counts = y.value_counts()
    biggest = counts.max()
    smallest = counts.min()
    ratio = biggest / smallest if smallest > 0 else float('inf')

    return {
        'class_counts': counts.to_dict(),
        'majority_class': counts.idxmax(),
        'minority_class': counts.idxmin(),
        'imbalance_ratio': ratio,
        'is_imbalanced': ratio > 1.5,
        'recommendation': _get_recommendation(ratio, len(y))
    }


def _get_recommendation(ratio: float, n_samples: int) -> str:
    """Map an imbalance ratio (and sample size) to a human-readable suggestion."""
    if ratio < 1.5:
        return "Dataset is balanced. No action needed."
    if ratio < 3:
        return "Mild imbalance. Consider using class_weight='balanced' or SMOTE."
    if ratio < 10:
        return "Moderate imbalance. Use SMOTE or random oversampling."
    # Severe imbalance: the right tool depends on how much data we have.
    if n_samples > 10000:
        return "Severe imbalance. Use SMOTE-Tomek or random undersampling."
    return "Severe imbalance with limited data. Use SMOTE carefully to avoid overfitting."
credily/cleaning.py ADDED
@@ -0,0 +1,643 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data cleaning module for Credily.
3
+ Comprehensive data cleaning for ML readiness with thorough defaults.
4
+ """
5
+
6
+ import pandas as pd
7
+ import numpy as np
8
+ import re
9
+ from typing import Optional, Dict, Any, List, Tuple
10
+
11
+
12
+ class DataCleaner:
13
+ """
14
+ Comprehensive data cleaner for tabular datasets.
15
+ Default mode is 'thorough' which applies all best practices automatically.
16
+ """
17
+
18
+ CLEAN_MODES = ['basic', 'thorough', 'aggressive']
19
+
20
+ def __init__(
21
+ self,
22
+ target_column: Optional[str] = None,
23
+ clean_mode: str = 'thorough',
24
+
25
+ # Outlier settings
26
+ outlier_method: str = 'iqr',
27
+ outlier_threshold: float = 3.0, # was 1.5 → preserve tail risk (defaults live here)
28
+
29
+ # Missing value settings
30
+ max_missing_threshold: float = 0.5,
31
+ flag_missing: bool = True,
32
+
33
+ # Duplicate settings
34
+ remove_duplicates: bool = True,
35
+
36
+ # Feature settings
37
+ remove_low_variance: bool = True,
38
+ variance_threshold: float = 0.001, # was 0.01 → keep rare but important risk flags
39
+ remove_high_correlation: bool = True,
40
+ correlation_threshold: float = 0.95,
41
+ max_cardinality: int = 100, # was 50 → preserve categorical risk segmentation
42
+
43
+ # Validation
44
+ validate_negative: bool = True
45
+
46
+ ):
47
+ """
48
+ Initialize the DataCleaner.
49
+
50
+ Args:
51
+ target_column: Name of target column (excluded from cleaning)
52
+ clean_mode: 'basic', 'thorough' (default), or 'aggressive'
53
+ outlier_method: 'iqr', 'zscore', or 'none'
54
+ outlier_threshold: Threshold for outlier detection
55
+ max_missing_threshold: Drop columns with more than this % missing
56
+ flag_missing: Create indicator columns for missing values
57
+ remove_duplicates: Whether to remove duplicate rows
58
+ remove_low_variance: Remove near-constant features
59
+ variance_threshold: Minimum variance to keep feature
60
+ remove_high_correlation: Remove highly correlated features
61
+ correlation_threshold: Max correlation allowed
62
+ max_cardinality: Max unique values for categorical encoding
63
+ validate_negative: Flag unexpected negative values
64
+ """
65
+ if clean_mode not in self.CLEAN_MODES:
66
+ raise ValueError(f"clean_mode must be one of {self.CLEAN_MODES}")
67
+
68
+ self.target_column = target_column
69
+ self.clean_mode = clean_mode
70
+ self.outlier_method = outlier_method
71
+ self.outlier_threshold = outlier_threshold
72
+ self.max_missing_threshold = max_missing_threshold
73
+ self.flag_missing = flag_missing
74
+ self.remove_duplicates = remove_duplicates
75
+ self.remove_low_variance = remove_low_variance
76
+ self.variance_threshold = variance_threshold
77
+ self.remove_high_correlation = remove_high_correlation
78
+ self.correlation_threshold = correlation_threshold
79
+ self.max_cardinality = max_cardinality
80
+ self.validate_negative = validate_negative
81
+ self.cleaning_report = {}
82
+
83
+ # Adjust settings based on mode
84
+ self._apply_mode_settings()
85
+
86
+ def _apply_mode_settings(self):
87
+ """Adjust settings based on clean_mode."""
88
+ if self.clean_mode == 'basic':
89
+ self.flag_missing = False
90
+ self.remove_low_variance = False
91
+ self.remove_high_correlation = False
92
+ self.validate_negative = False
93
+ elif self.clean_mode == 'aggressive':
94
+ self.max_missing_threshold = 0.3
95
+ self.variance_threshold = 0.05
96
+ self.correlation_threshold = 0.9
97
+ self.max_cardinality = 30
98
+
99
+ def clean(self, df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
100
+ """
101
+ Clean the dataset comprehensively.
102
+
103
+ Args:
104
+ df: Input dataframe
105
+ verbose: Print cleaning steps
106
+
107
+ Returns:
108
+ Cleaned dataframe
109
+ """
110
+ original_shape = df.shape
111
+ df_clean = df.copy()
112
+
113
+ # Initialize tracking
114
+ report = {
115
+ 'original_shape': original_shape,
116
+ 'steps_applied': [],
117
+ 'warnings': []
118
+ }
119
+
120
+ if verbose:
121
+ print(f"\n{'='*60}")
122
+ print(f"DATA CLEANING (Mode: {self.clean_mode})")
123
+ print(f"{'='*60}")
124
+ print(f"Original shape: {original_shape[0]} rows, {original_shape[1]} columns")
125
+
126
+ # ===== STEP 0: Drop Unnamed columns (common pandas artifact) =====
127
+ unnamed_cols = [col for col in df_clean.columns if col.startswith('Unnamed')]
128
+ if unnamed_cols:
129
+ df_clean = df_clean.drop(columns=unnamed_cols)
130
+ report['unnamed_dropped'] = unnamed_cols
131
+ report['steps_applied'].append('drop_unnamed')
132
+ if verbose:
133
+ print(f" [0] Dropped {len(unnamed_cols)} 'Unnamed' columns")
134
+
135
+ # ===== STEP 0.5: Drop ID columns =====
136
+ df_clean, id_cols_dropped = self._drop_id_columns(df_clean)
137
+ if id_cols_dropped:
138
+ report['id_columns_dropped'] = id_cols_dropped
139
+ report['steps_applied'].append('drop_id_columns')
140
+ if verbose:
141
+ print(f" [0.5] Dropped {len(id_cols_dropped)} ID columns: {id_cols_dropped}")
142
+
143
+ # ===== STEP 1: Remove exact duplicates =====
144
+ if self.remove_duplicates:
145
+ before = len(df_clean)
146
+ df_clean = df_clean.drop_duplicates()
147
+ removed = before - len(df_clean)
148
+ if removed > 0:
149
+ report['duplicates_removed'] = removed
150
+ report['steps_applied'].append('remove_duplicates')
151
+ if verbose:
152
+ print(f" [1] Removed {removed} duplicate rows")
153
+
154
+ # ===== STEP 2: Handle invalid/placeholder values =====
155
+ df_clean, invalid_count = self._replace_invalid_values(df_clean)
156
+ if invalid_count > 0:
157
+ report['invalid_values_replaced'] = invalid_count
158
+ report['steps_applied'].append('replace_invalid_values')
159
+ if verbose:
160
+ print(f" [2] Replaced {invalid_count} invalid/placeholder values")
161
+
162
+ # ===== STEP 3: Fix data types =====
163
+ df_clean, type_fixes = self._fix_data_types(df_clean)
164
+ if type_fixes:
165
+ report['type_fixes'] = type_fixes
166
+ report['steps_applied'].append('fix_data_types')
167
+ if verbose:
168
+ print(f" [3] Fixed data types for {len(type_fixes)} columns")
169
+
170
+ # ===== STEP 4: Handle infinite values =====
171
+ df_clean, inf_count = self._handle_infinite_values(df_clean)
172
+ if inf_count > 0:
173
+ report['infinite_replaced'] = inf_count
174
+ report['steps_applied'].append('handle_infinite')
175
+ if verbose:
176
+ print(f" [4] Replaced {inf_count} infinite values")
177
+
178
+ # ===== STEP 5: Drop high-missing columns =====
179
+ df_clean, cols_dropped = self._drop_high_missing_columns(df_clean)
180
+ if cols_dropped:
181
+ report['high_missing_dropped'] = cols_dropped
182
+ report['steps_applied'].append('drop_high_missing')
183
+ if verbose:
184
+ print(f" [5] Dropped {len(cols_dropped)} high-missing columns: {cols_dropped[:5]}{'...' if len(cols_dropped) > 5 else ''}")
185
+
186
+ # ===== STEP 6: Flag missing values (create indicator columns) =====
187
+ if self.flag_missing:
188
+ df_clean, missing_flags = self._flag_missing_values(df_clean)
189
+ if missing_flags:
190
+ report['missing_flags_created'] = missing_flags
191
+ report['steps_applied'].append('flag_missing')
192
+ if verbose:
193
+ print(f" [6] Created {len(missing_flags)} missing value indicator columns")
194
+
195
+ # ===== STEP 7: Handle outliers =====
196
+ if self.outlier_method != 'none':
197
+ df_clean, outlier_count = self._handle_outliers(df_clean)
198
+ if outlier_count > 0:
199
+ report['outliers_capped'] = outlier_count
200
+ report['steps_applied'].append('handle_outliers')
201
+ if verbose:
202
+ print(f" [7] Capped {outlier_count} outlier values (method: {self.outlier_method})")
203
+
204
+ # ===== STEP 8: Validate negative values =====
205
+ if self.validate_negative:
206
+ df_clean, neg_issues = self._validate_negative_values(df_clean)
207
+ if neg_issues:
208
+ report['negative_value_issues'] = neg_issues
209
+ report['warnings'].append(f"Unexpected negative values in: {list(neg_issues.keys())}")
210
+ if verbose:
211
+ print(f" [8] Found unexpected negatives in {len(neg_issues)} columns")
212
+
213
+ # ===== STEP 9: Fix categorical inconsistencies =====
214
+ df_clean, cat_fixes = self._fix_categorical_inconsistencies(df_clean)
215
+ if cat_fixes:
216
+ report['categorical_fixes'] = cat_fixes
217
+ report['steps_applied'].append('fix_categorical')
218
+ if verbose:
219
+ print(f" [9] Standardized {len(cat_fixes)} categorical columns")
220
+
221
+ # ===== STEP 10: Handle high cardinality categoricals =====
222
+ df_clean, high_card = self._handle_high_cardinality(df_clean)
223
+ if high_card:
224
+ report['high_cardinality_handled'] = high_card
225
+ report['steps_applied'].append('handle_high_cardinality')
226
+ if verbose:
227
+ print(f" [10] Handled {len(high_card)} high-cardinality columns")
228
+
229
+ # ===== STEP 11: Remove low variance features =====
230
+ if self.remove_low_variance:
231
+ df_clean, low_var_removed = self._remove_low_variance_features(df_clean)
232
+ if low_var_removed:
233
+ report['low_variance_removed'] = low_var_removed
234
+ report['steps_applied'].append('remove_low_variance')
235
+ if verbose:
236
+ print(f" [11] Removed {len(low_var_removed)} low-variance columns")
237
+
238
+ # ===== STEP 12: Remove highly correlated features =====
239
+ if self.remove_high_correlation:
240
+ df_clean, corr_removed = self._remove_correlated_features(df_clean)
241
+ if corr_removed:
242
+ report['correlated_removed'] = corr_removed
243
+ report['steps_applied'].append('remove_correlated')
244
+ if verbose:
245
+ print(f" [12] Removed {len(corr_removed)} highly correlated columns")
246
+
247
+ # ===== STEP 13: Validate target labels =====
248
+ if self.target_column and self.target_column in df_clean.columns:
249
+ df_clean, label_issues = self._validate_labels(df_clean)
250
+ report['label_validation'] = label_issues
251
+ report['steps_applied'].append('validate_labels')
252
+ if verbose:
253
+ print(f" [13] Target validation: {label_issues.get('status', 'complete')}")
254
+
255
+ # ===== STEP 14: Final missing value check =====
256
+ missing_summary = df_clean.isnull().sum()
257
+ cols_with_missing = missing_summary[missing_summary > 0]
258
+ if len(cols_with_missing) > 0:
259
+ report['remaining_missing'] = cols_with_missing.to_dict()
260
+ if verbose:
261
+ print(f" [14] Remaining missing values in {len(cols_with_missing)} columns (will be imputed during training)")
262
+
263
+ # Final summary
264
+ final_shape = df_clean.shape
265
+ report['final_shape'] = final_shape
266
+ report['rows_removed'] = original_shape[0] - final_shape[0]
267
+ report['columns_removed'] = original_shape[1] - final_shape[1]
268
+
269
+ if verbose:
270
+ print(f"\n{'='*60}")
271
+ print(f"CLEANING COMPLETE")
272
+ print(f" Final shape: {final_shape[0]} rows, {final_shape[1]} columns")
273
+ print(f" Rows removed: {report['rows_removed']}")
274
+ print(f" Columns removed: {report['columns_removed']}")
275
+ print(f" Steps applied: {len(report['steps_applied'])}")
276
+ if report['warnings']:
277
+ print(f" Warnings: {len(report['warnings'])}")
278
+ print(f"{'='*60}\n")
279
+
280
+ self.cleaning_report = report
281
+ return df_clean
282
+
283
+ def _drop_id_columns(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
284
+ """
285
+ Detect and drop ID columns that provide no predictive value.
286
+
287
+ ID columns are identified by:
288
+ 1. Column name patterns (id, index, key, etc.)
289
+ 2. Unique value count equals row count (for non-numeric columns)
290
+ 3. Sequential integer patterns
291
+ """
292
+ id_cols = []
293
+ n_rows = len(df)
294
+
295
+ # Common ID column name patterns (case-insensitive)
296
+ id_patterns = [
297
+ r'^id$', r'^_id$', r'_id$', r'^index$', r'^idx$',
298
+ r'^row_?num', r'^row_?id', r'^record_?id', r'^key$',
299
+ r'^pk$', r'^primary_?key', r'^unique_?id', r'^uuid$',
300
+ r'^guid$', r'^serial', r'^sequence', r'^customer_?id$',
301
+ r'^user_?id$', r'^account_?id$', r'^transaction_?id$',
302
+ r'^loan_?id$', r'^application_?id$', r'^case_?id$',
303
+ r'^member_?id$', r'^client_?id$', r'^order_?id$',
304
+ r'^sk_id', r'^member_id$'
305
+ ]
306
+
307
+ for col in df.columns:
308
+ # Skip target column
309
+ if col == self.target_column:
310
+ continue
311
+
312
+ col_lower = col.lower()
313
+
314
+ # Check 1: Name-based detection
315
+ is_id_name = any(re.match(pattern, col_lower) for pattern in id_patterns)
316
+
317
+ # Check 2: All unique values (non-numeric columns with 100% unique)
318
+ is_all_unique = False
319
+ if df[col].dtype == 'object':
320
+ n_unique = df[col].nunique()
321
+ if n_unique == n_rows:
322
+ is_all_unique = True
323
+
324
+ # Check 3: Sequential integers (likely auto-increment ID)
325
+ is_sequential = False
326
+ if pd.api.types.is_integer_dtype(df[col]):
327
+ sorted_vals = df[col].dropna().sort_values()
328
+ if len(sorted_vals) > 1:
329
+ diffs = sorted_vals.diff().dropna()
330
+ # Check if mostly sequential (increments of 1)
331
+ if (diffs == 1).mean() > 0.95:
332
+ # Also check if it spans nearly the full range
333
+ if sorted_vals.iloc[-1] - sorted_vals.iloc[0] >= n_rows * 0.9:
334
+ is_sequential = True
335
+
336
+ # Drop if matches any criteria
337
+ if is_id_name or is_all_unique or is_sequential:
338
+ id_cols.append(col)
339
+
340
+ if id_cols:
341
+ df = df.drop(columns=id_cols)
342
+
343
+ return df, id_cols
344
+
345
+ def _replace_invalid_values(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, int]:
346
+ """Replace common invalid/placeholder values with NaN."""
347
+ # Note: '-' removed from invalid patterns as it can be a valid label (e.g., +/- classification)
348
+ invalid_patterns = ['?', 'N/A', 'n/a', 'NA', 'na', 'NULL', 'null',
349
+ 'None', 'none', 'NaN', 'nan', '', ' ', '--',
350
+ 'missing', 'Missing', 'MISSING', 'unknown', 'Unknown']
351
+
352
+ count = 0
353
+ for col in df.columns:
354
+ # Skip target column to preserve labels
355
+ if col == self.target_column:
356
+ continue
357
+ if df[col].dtype == 'object':
358
+ mask = df[col].isin(invalid_patterns)
359
+ count += mask.sum()
360
+ df.loc[mask, col] = np.nan
361
+
362
+ return df, count
363
+
364
+ def _fix_data_types(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
365
+ """Fix common data type issues."""
366
+ fixed_cols = []
367
+
368
+ for col in df.columns:
369
+ if col == self.target_column:
370
+ continue
371
+
372
+ if df[col].dtype == 'object':
373
+ # Try to convert to numeric
374
+ numeric_converted = pd.to_numeric(df[col], errors='coerce')
375
+ non_null_original = df[col].notna().sum()
376
+ non_null_converted = numeric_converted.notna().sum()
377
+
378
+ # Only convert if we don't lose too much data
379
+ if non_null_converted >= non_null_original * 0.9:
380
+ df[col] = numeric_converted
381
+ fixed_cols.append(col)
382
+
383
+ return df, fixed_cols
384
+
385
+ def _handle_infinite_values(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, int]:
386
+ """Replace infinite values with NaN."""
387
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
388
+ inf_count = 0
389
+ for col in numeric_cols:
390
+ mask = np.isinf(df[col])
391
+ inf_count += mask.sum()
392
+ df.loc[mask, col] = np.nan
393
+ return df, inf_count
394
+
395
+ def _drop_high_missing_columns(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
396
+ """Drop columns with too many missing values."""
397
+ missing_pct = df.isnull().sum() / len(df)
398
+ cols_to_drop = missing_pct[missing_pct > self.max_missing_threshold].index.tolist()
399
+
400
+ if self.target_column in cols_to_drop:
401
+ cols_to_drop.remove(self.target_column)
402
+
403
+ df = df.drop(columns=cols_to_drop)
404
+ return df, cols_to_drop
405
+
406
+ def _flag_missing_values(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
407
+ """Create indicator columns for features with significant missing values."""
408
+ flags_created = []
409
+ missing_pct = df.isnull().sum() / len(df)
410
+
411
+ # Only flag columns with 5-50% missing (meaningful missingness)
412
+ cols_to_flag = missing_pct[(missing_pct >= 0.05) & (missing_pct <= 0.5)].index.tolist()
413
+
414
+ if self.target_column in cols_to_flag:
415
+ cols_to_flag.remove(self.target_column)
416
+
417
+ for col in cols_to_flag:
418
+ flag_col = f"{col}_missing"
419
+ df[flag_col] = df[col].isnull().astype(int)
420
+ flags_created.append(flag_col)
421
+
422
+ return df, flags_created
423
+
424
+ def _handle_outliers(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, int]:
425
+ """Handle outliers using specified method."""
426
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
427
+
428
+ if self.target_column in numeric_cols:
429
+ numeric_cols.remove(self.target_column)
430
+
431
+ # Also exclude flag columns
432
+ numeric_cols = [c for c in numeric_cols if not c.endswith('_missing')]
433
+
434
+ outliers_count = 0
435
+
436
+ for col in numeric_cols:
437
+ if df[col].nunique() < 3: # Skip near-constant columns
438
+ continue
439
+
440
+ if self.outlier_method == 'iqr':
441
+ Q1 = df[col].quantile(0.25)
442
+ Q3 = df[col].quantile(0.75)
443
+ IQR = Q3 - Q1
444
+ if IQR == 0:
445
+ continue
446
+ lower = Q1 - self.outlier_threshold * IQR
447
+ upper = Q3 + self.outlier_threshold * IQR
448
+ elif self.outlier_method == 'zscore':
449
+ mean = df[col].mean()
450
+ std = df[col].std()
451
+ if std == 0:
452
+ continue
453
+ lower = mean - self.outlier_threshold * std
454
+ upper = mean + self.outlier_threshold * std
455
+ else:
456
+ continue
457
+
458
+ outliers = ((df[col] < lower) | (df[col] > upper)).sum()
459
+ outliers_count += outliers
460
+ df[col] = df[col].clip(lower=lower, upper=upper)
461
+
462
+ return df, outliers_count
463
+
464
+ def _validate_negative_values(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, int]]:
465
+ """Check for unexpected negative values in typically positive columns."""
466
+ issues = {}
467
+ positive_keywords = ['age', 'income', 'salary', 'amount', 'balance', 'count',
468
+ 'quantity', 'price', 'rate', 'score', 'years', 'months']
469
+
470
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
471
+
472
+ for col in numeric_cols:
473
+ if col == self.target_column:
474
+ continue
475
+ col_lower = col.lower()
476
+ if any(kw in col_lower for kw in positive_keywords):
477
+ neg_count = (df[col] < 0).sum()
478
+ if neg_count > 0:
479
+ issues[col] = neg_count
480
+ # Optionally clip to 0
481
+ df[col] = df[col].clip(lower=0)
482
+
483
+ return df, issues
484
+
485
+ def _fix_categorical_inconsistencies(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
486
+ """Fix common inconsistencies in categorical columns."""
487
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
488
+
489
+ # Skip target column - labels should be handled separately
490
+ if self.target_column in categorical_cols:
491
+ categorical_cols.remove(self.target_column)
492
+
493
+ fixed_cols = []
494
+
495
+ for col in categorical_cols:
496
+ original_unique = df[col].nunique()
497
+
498
+ if df[col].dtype == 'object':
499
+ # Strip whitespace and standardize case
500
+ df[col] = df[col].astype(str).str.strip().str.lower()
501
+
502
+ # Common replacements (for features, not labels)
503
+ replacements = {
504
+ 'yes': 'yes', 'y': 'yes', 'true': 'yes',
505
+ 'no': 'no', 'n': 'no', 'false': 'no',
506
+ 'male': 'male', 'm': 'male', 'man': 'male',
507
+ 'female': 'female', 'f': 'female', 'woman': 'female',
508
+ 'nan': np.nan, 'none': np.nan, 'null': np.nan, '': np.nan
509
+ }
510
+ df[col] = df[col].replace(replacements)
511
+
512
+ new_unique = df[col].nunique()
513
+ if new_unique < original_unique:
514
+ fixed_cols.append(col)
515
+
516
+ return df, fixed_cols
517
+
518
+ def _handle_high_cardinality(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
519
+ """Handle categorical columns with too many unique values."""
520
+ handled = []
521
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
522
+
523
+ if self.target_column in categorical_cols:
524
+ categorical_cols.remove(self.target_column)
525
+
526
+ for col in categorical_cols:
527
+ n_unique = df[col].nunique()
528
+ if n_unique > self.max_cardinality:
529
+ # Keep top categories, group rest as 'other'
530
+ top_cats = df[col].value_counts().head(self.max_cardinality - 1).index.tolist()
531
+ df[col] = df[col].apply(lambda x: x if x in top_cats else 'other')
532
+ handled.append(col)
533
+
534
+ return df, handled
535
+
536
+ def _remove_low_variance_features(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
537
+ """Remove features with very low variance (near-constant)."""
538
+ removed = []
539
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
540
+
541
+ if self.target_column in numeric_cols:
542
+ numeric_cols.remove(self.target_column)
543
+
544
+ # Exclude flag columns
545
+ numeric_cols = [c for c in numeric_cols if not c.endswith('_missing')]
546
+
547
+ for col in numeric_cols:
548
+ variance = df[col].var()
549
+ if variance is not None and variance < self.variance_threshold:
550
+ df = df.drop(columns=[col])
551
+ removed.append(col)
552
+
553
+ return df, removed
554
+
555
+ def _remove_correlated_features(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
556
+ """Remove highly correlated features (keep first, remove duplicates)."""
557
+ removed = []
558
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
559
+
560
+ if self.target_column in numeric_cols:
561
+ numeric_cols.remove(self.target_column)
562
+
563
+ # Exclude flag columns
564
+ numeric_cols = [c for c in numeric_cols if not c.endswith('_missing')]
565
+
566
+ if len(numeric_cols) < 2:
567
+ return df, removed
568
+
569
+ corr_matrix = df[numeric_cols].corr().abs()
570
+ upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
571
+
572
+ to_drop = [col for col in upper.columns if any(upper[col] > self.correlation_threshold)]
573
+ df = df.drop(columns=to_drop)
574
+ removed = to_drop
575
+
576
+ return df, removed
577
+
578
+ def _validate_labels(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, Any]]:
579
+ """
580
+ Validate and standardize target labels for binary classification.
581
+ Data-agnostic: simply maps any two classes to 0 and 1.
582
+ """
583
+ issues = {}
584
+ target = df[self.target_column]
585
+
586
+ unique_values = target.dropna().unique()
587
+ n_classes = len(unique_values)
588
+
589
+ issues['n_classes'] = n_classes
590
+ issues['unique_values'] = [str(v) for v in unique_values]
591
+
592
+ # Check for missing labels
593
+ missing_labels = target.isnull().sum()
594
+ if missing_labels > 0:
595
+ issues['missing_labels'] = int(missing_labels)
596
+ df = df.dropna(subset=[self.target_column])
597
+ # Recalculate unique values after dropping nulls
598
+ unique_values = df[self.target_column].dropna().unique()
599
+ n_classes = len(unique_values)
600
+
601
+ # Standardize binary labels to 0/1
602
+ if n_classes == 2:
603
+ # Data-agnostic approach: sort values and map first to 0, second to 1
604
+ sorted_vals = sorted(unique_values, key=lambda x: str(x))
605
+ label_map = {sorted_vals[0]: 0, sorted_vals[1]: 1}
606
+
607
+ df[self.target_column] = df[self.target_column].map(label_map)
608
+ issues['label_mapping'] = {str(k): v for k, v in label_map.items()}
609
+ issues['status'] = 'standardized to 0/1'
610
+ elif n_classes > 2:
611
+ issues['status'] = 'warning: more than 2 classes'
612
+ elif n_classes < 2:
613
+ issues['status'] = 'error: less than 2 classes'
614
+
615
+ return df, issues
616
+
617
def get_report(self) -> Dict[str, Any]:
    """Return the report dict produced by the most recent clean() run."""
    return self.cleaning_report
620
+
621
+
622
def detect_outliers(df: pd.DataFrame, column: str, method: str = 'iqr') -> pd.Series:
    """
    Detect outliers in a column.

    Args:
        df: Input dataframe
        column: Column name to check
        method: Detection method ('iqr' or 'zscore')

    Returns:
        Boolean series indicating outliers. Under 'zscore', a column with
        zero or undefined standard deviation yields all-False (the previous
        version divided by zero there).

    Raises:
        ValueError: If method is neither 'iqr' nor 'zscore'
    """
    series = df[column]

    if method == 'iqr':
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        # Tukey fences with the conventional 1.5 multiplier.
        return (series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)

    if method == 'zscore':
        std = series.std()
        # Guard: constant or all-NaN columns have std 0/NaN -> no outliers.
        if std == 0 or pd.isna(std):
            return pd.Series(False, index=series.index)
        z_scores = ((series - series.mean()) / std).abs()
        return z_scores > 3

    raise ValueError(f"Unknown method: {method}")
credily/cli.py ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Command Line Interface for Credily.
"""

import os
import sys
import warnings
import multiprocessing

# Suppress joblib resource tracker warnings on Windows (must be set before ANY imports)
# NOTE(review): the env-var and warning-filter statements below are read here as
# living inside this win32 guard — confirm against the original indentation.
if sys.platform == 'win32':
    # Force spawn method to avoid fork-related issues
    try:
        multiprocessing.set_start_method('spawn', force=True)
    except RuntimeError:
        pass # Already set

    # Steer joblib/loky away from process-based parallelism on Windows,
    # where worker cleanup is the source of the spurious warnings.
    os.environ['LOKY_PICKLER'] = 'pickle'
    os.environ['JOBLIB_MULTIPROCESSING'] = '0'
    os.environ['LOKY_MAX_CPU_COUNT'] = '1'

    # Suppress all joblib/multiprocessing warnings
    warnings.filterwarnings('ignore', category=UserWarning, module='joblib')
    warnings.filterwarnings('ignore', category=UserWarning, module='loky')
    warnings.filterwarnings('ignore', message='.*resource_tracker.*')
    warnings.filterwarnings('ignore', message='.*Cannot register.*')
    warnings.filterwarnings('ignore', message='.*leaked.*')

# Imported after the environment tweaks above on purpose.
import click
from pathlib import Path
31
+
32
+
33
@click.group()
@click.version_option(version='0.1.0', prog_name='Credily')
def cli():
    """
    Credily — Data-Agnostic Credit Risk Modeling Engine for Tabular Data

    Credily is a command-line tool for building, evaluating, and deploying
    binary classification models for credit risk and default prediction.

    It is designed to adapt to diverse tabular datasets without hard-coded
    assumptions, while exposing interpretable risk signals and tunable
    decision policies.

    \b
    Key capabilities:
    - Automatically profiles tabular datasets and detects imbalance
    - Trains and evaluates multiple tree-based and linear models
    - Selects the best model using cross-validated ROC-AUC
    - Calibrates predicted probabilities for decision-making
    - Optimizes decision thresholds for imbalanced data
    - Generates credit-ready reports (ROC, PR-AUC, confusion matrix)
    - Exports trained models and structured reports (HTML / JSON)

    \b
    Typical use cases:
    - Credit scoring and default risk assessment
    - Loan approval / rejection systems
    - Risk-based customer segmentation
    - Decision support for financial products

    \b
    Quick Start:
    credily train data.csv
    credily train # Interactive mode
    """
    # Root command group: the docstring above IS the `credily --help` output
    # (click's \b markers prevent re-wrapping of the lists), so it must not be
    # edited casually. Subcommands are registered below via @cli.command().
    pass
69
+
70
+
71
@cli.command()
@click.argument('data', type=click.Path(exists=True), required=False)
@click.option('-t', '--target', help='Target column name')
@click.option('-o', '--output', default='credily_output', help='Output directory')
@click.option('--test-size', type=float, default=0.2, help='Test split ratio (default: 0.2)')
@click.option('--cv', type=int, default=5, help='Cross-validation folds (default: 5)')
@click.option('--no-profile', is_flag=True, help='Skip data profiling')
@click.option('--no-clean', is_flag=True, help='Skip data cleaning')
@click.option('--clean-mode', type=click.Choice(['basic', 'thorough', 'aggressive']), default='thorough', help='Cleaning mode: basic, thorough (default), or aggressive')
@click.option('--outlier-method', type=click.Choice(['iqr', 'zscore', 'none']), default='iqr', help='Outlier detection method')
@click.option('--no-balance', is_flag=True, help='Skip data balancing')
@click.option('--balance-method', type=click.Choice(['smote', 'random_oversample', 'random_undersample', 'smote_tomek', 'tomek', 'nearmiss', 'none']), default='smote', help='Balancing method')
@click.option('--parallel', is_flag=True, help='Enable parallel processing (may cause warnings on Windows)')
@click.option('--no-calibrate', is_flag=True, help='Skip probability calibration')
@click.option('--calibration-method', type=click.Choice(['isotonic', 'sigmoid']), default='isotonic', help='Calibration method (isotonic or sigmoid/Platt)')
@click.option('--threshold-metric', type=click.Choice(['f1', 'cost', 'youden', 'precision_recall_balance']), default='f1', help='Metric for threshold optimization')
@click.option('--no-threshold-opt', is_flag=True, help='Skip threshold optimization (use 0.5)')
@click.option('--binary-threshold', type=float, default=None, help='Threshold to convert numeric target to binary (values BELOW threshold = positive class)')
def train(data, target, output, test_size, cv, no_profile, no_clean, clean_mode, outlier_method, no_balance, balance_method, parallel, no_calibrate, calibration_method, threshold_metric, no_threshold_opt, binary_threshold):
    """
    Train a model on your dataset.

    Supported file formats: CSV (.csv), Text (.txt, .tsv), Excel (.xlsx, .xls)
    """
    # Deferred imports keep `credily --help` fast by not loading the ML stack.
    from .automl import CredilyPipeline
    from .utils import load_data, get_supported_formats

    if data is None:
        # Interactive mode
        click.echo(f"\n{get_supported_formats()}")
        data = click.prompt('Enter path to data file')
        if not Path(data).exists():
            click.echo(click.style(f"Error: File '{data}' not found", fg='red'))
            sys.exit(1)

    click.echo(click.style(f"\n📊 Loading data from: {data}", fg='cyan'))
    try:
        df = load_data(data)
    except ValueError as e:
        # Unsupported/invalid file content.
        click.echo(click.style(f"Error: {e}", fg='red'))
        sys.exit(1)
    except ImportError as e:
        # Missing optional reader dependency (e.g. Excel engine).
        click.echo(click.style(f"Error: {e}", fg='red'))
        sys.exit(1)
    click.echo(f" Shape: {df.shape[0]} rows, {df.shape[1]} columns")

    if target is None:
        # Interactive fallback: list the columns, then prompt.
        click.echo(f"\n Columns: {', '.join(df.columns.tolist())}")
        target = click.prompt('Enter target column name')

    if target not in df.columns:
        click.echo(click.style(f"Error: Target '{target}' not found in data", fg='red'))
        sys.exit(1)

    # CLI flags are negative ('--no-X') while the pipeline takes positive
    # booleans — hence the `not` inversions below.
    pipeline = CredilyPipeline(
        target_column=target,
        output_dir=output,
        test_size=test_size,
        cv_folds=cv,
        clean_data=not no_clean,
        clean_mode=clean_mode,
        outlier_method=outlier_method,
        balance_data=not no_balance,
        balance_method=balance_method,
        n_jobs=-1 if parallel else 1,
        calibrate=not no_calibrate,
        calibration_method=calibration_method,
        optimize_threshold=not no_threshold_opt,
        threshold_metric=threshold_metric,
        binary_threshold=binary_threshold
    )

    if not no_profile:
        click.echo(click.style("\n🔍 Profiling data...", fg='cyan'))
        pipeline.profile(df)

    click.echo(click.style("\n🚀 Training models...", fg='cyan'))
    results = pipeline.train(df)

    click.echo(click.style(f"\n✅ Training complete!", fg='green'))
    click.echo(f" Best model: {results['best_model']}")
    click.echo(f" ROC-AUC: {results['best_score']:.4f}")
    click.echo(f" Output saved to: {output}/")
154
+
155
+
156
@cli.command()
@click.argument('data', type=click.Path(exists=True))
@click.option('-m', '--model', default='credily_output/model.pkl', help='Path to trained model')
@click.option('-o', '--output', default='predictions', help='Output file path (without extension)')
@click.option('--format', 'output_format', type=click.Choice(['excel', 'csv', 'both']), default='excel', help='Output format: excel (default), csv, or both')
@click.option('--proba', is_flag=True, default=True, help='Include prediction probabilities (default: True)')
@click.option('--no-proba', is_flag=True, help='Exclude prediction probabilities')
def predict(data, model, output, output_format, proba, no_proba):
    """
    Make predictions using a trained model.

    Outputs the FULL dataset with predictions and probabilities.
    Supported input formats: CSV (.csv), Text (.txt, .tsv), Excel (.xlsx, .xls)
    """
    from .automl import CredilyPipeline
    from .utils import load_data, save_to_excel

    # --proba defaults to True, so --no-proba is the effective off switch.
    include_proba = proba and not no_proba

    click.echo(click.style(f"\n📦 Loading model from: {model}", fg='cyan'))
    pipeline = CredilyPipeline.load(model)

    click.echo(click.style(f"📊 Loading data from: {data}", fg='cyan'))
    try:
        df = load_data(data)
    except (ValueError, ImportError) as e:
        click.echo(click.style(f"Error: {e}", fg='red'))
        sys.exit(1)
    click.echo(f" Shape: {df.shape[0]} rows, {df.shape[1]} columns")

    click.echo(click.style("\n🔮 Making predictions...", fg='cyan'))
    predictions = pipeline.predict(df, include_proba=include_proba)

    # Show prediction summary
    pred_counts = predictions['prediction'].value_counts().sort_index()
    click.echo(f"\n Prediction distribution:")
    for pred_val, count in pred_counts.items():
        pct = count / len(predictions) * 100
        click.echo(f" - Class {pred_val}: {count} ({pct:.1f}%)")

    # Save outputs
    saved_files = []

    if output_format in ['excel', 'both']:
        try:
            excel_path = save_to_excel(predictions, output)
            saved_files.append(excel_path)
            click.echo(click.style(f"\n✅ Excel report saved to: {excel_path}", fg='green'))
        except ImportError as e:
            # Excel writer dependency missing — degrade gracefully to CSV.
            click.echo(click.style(f"Warning: {e}", fg='yellow'))
            if output_format == 'excel':
                output_format = 'csv'
                click.echo(" Falling back to CSV format...")

    if output_format in ['csv', 'both']:
        csv_path = f"{output}.csv"
        predictions.to_csv(csv_path, index=False)
        saved_files.append(csv_path)
        click.echo(click.style(f"\n✅ CSV saved to: {csv_path}", fg='green'))

    click.echo(f"\n Total records: {len(predictions)}")
217
+
218
+
219
@cli.command()
@click.argument('data', type=click.Path(exists=True))
@click.option('-t', '--target', help='Target column name (optional)')
def profile(data, target):
    """
    Profile a dataset without training.

    Supported formats: CSV (.csv), Text (.txt, .tsv), Excel (.xlsx, .xls)
    """
    from .profiler import DataProfiler
    from .utils import load_data

    click.echo(click.style(f"\n📊 Loading data from: {data}", fg='cyan'))
    try:
        df = load_data(data)
    except (ValueError, ImportError) as e:
        click.echo(click.style(f"Error: {e}", fg='red'))
        sys.exit(1)

    # Delegate the actual analysis; this command only formats the report.
    profiler = DataProfiler(target_column=target)
    report = profiler.profile(df)

    click.echo(click.style("\n" + "=" * 60, fg='cyan'))
    click.echo(click.style("DATA PROFILE REPORT", fg='cyan', bold=True))
    click.echo(click.style("=" * 60, fg='cyan'))

    click.echo(f"\n📋 Basic Info:")
    click.echo(f" Rows: {report['n_rows']}")
    click.echo(f" Columns: {report['n_cols']}")
    click.echo(f" Memory: {report['memory_mb']:.2f} MB")

    click.echo(f"\n📊 Column Types:")
    click.echo(f" Numeric: {report['n_numeric']}")
    click.echo(f" Categorical: {report['n_categorical']}")

    click.echo(f"\n⚠️ Data Quality:")
    click.echo(f" Missing values: {report['missing_pct']:.1f}%")
    click.echo(f" Duplicate rows: {report['duplicate_rows']}")

    # Target section only when a target was supplied AND the profiler produced it.
    if target and 'target_info' in report:
        click.echo(f"\n🎯 Target Analysis:")
        click.echo(f" Task type: {report['target_info']['task_type']}")
        click.echo(f" Classes: {report['target_info']['n_classes']}")
        click.echo(f" Class balance: {report['target_info']['balance']}")
263
+
264
+
265
@cli.command()
@click.argument('report_path', type=click.Path(exists=True))
def show(report_path):
    """Display a training report."""
    import json
    from pathlib import Path

    path = Path(report_path)

    # Dispatch on file extension: JSON is summarised inline, HTML opens in
    # the default browser, anything else is rejected.
    if path.suffix == '.json':
        with open(path) as f:
            report = json.load(f)

        click.echo(click.style("\n" + "=" * 60, fg='cyan'))
        click.echo(click.style("TRAINING REPORT", fg='cyan', bold=True))
        click.echo(click.style("=" * 60, fg='cyan'))

        click.echo(f"\n🏆 Best Model: {report.get('best_model', 'N/A')}")
        click.echo(f" ROC-AUC: {report.get('best_score', 0):.4f}")

        if 'model_scores' in report:
            click.echo(f"\n📊 All Models:")
            for model, score in report['model_scores'].items():
                click.echo(f" {model}: {score:.4f}")

    elif path.suffix == '.html':
        import webbrowser
        webbrowser.open(f'file://{path.absolute()}')
        click.echo(click.style(f"Opening report in browser...", fg='cyan'))
    else:
        click.echo(click.style(f"Error: Unsupported format '{path.suffix}'", fg='red'))
296
+
297
+
298
@cli.command()
@click.argument('data', type=click.Path(exists=True))
@click.option('-m', '--model', default='credily_output/model.pkl', help='Path to trained model')
@click.option('-t', '--target', required=True, help='Target column name')
@click.option('-c', '--context', default='credit_scoring', help='Business context')
def analyze(data, model, target, context):
    """
    Analyze model performance in different business contexts.

    Supported formats: CSV (.csv), Text (.txt, .tsv), Excel (.xlsx, .xls)
    """
    from .automl import CredilyPipeline
    from .analyzer import BusinessAnalyzer
    from .utils import load_data

    click.echo(click.style(f"\n📦 Loading model from: {model}", fg='cyan'))
    pipeline = CredilyPipeline.load(model)

    click.echo(click.style(f"📊 Loading data from: {data}", fg='cyan'))
    try:
        df = load_data(data)
    except (ValueError, ImportError) as e:
        click.echo(click.style(f"Error: {e}", fg='red'))
        sys.exit(1)

    # Valid context names are listed by `credily list-contexts`.
    analyzer = BusinessAnalyzer(context=context)
    report = analyzer.analyze(pipeline, df, target)

    click.echo(click.style("\n" + "=" * 60, fg='cyan'))
    click.echo(click.style(f"BUSINESS ANALYSIS: {context.upper()}", fg='cyan', bold=True))
    click.echo(click.style("=" * 60, fg='cyan'))

    click.echo(f"\n💰 Financial Impact:")
    click.echo(f" Expected profit: ${report['expected_profit']:,.2f}")
    click.echo(f" Risk exposure: ${report['risk_exposure']:,.2f}")

    click.echo(f"\n📊 Threshold Analysis:")
    click.echo(f" Optimal threshold: {report['optimal_threshold']:.2f}")
    click.echo(f" Precision at threshold: {report['precision']:.2%}")
    click.echo(f" Recall at threshold: {report['recall']:.2%}")

    click.echo(f"\n🎯 Recommendations:")
    for rec in report['recommendations']:
        click.echo(f" • {rec}")
342
+
343
+
344
@cli.command('list-contexts')
def list_contexts():
    """List all available business contexts with descriptions."""
    # Static catalogue — presumably mirrors BusinessAnalyzer's supported
    # contexts; keep the two in sync (TODO confirm).
    contexts = {
        'credit_scoring': 'Loan default prediction - optimizes for minimizing bad debt',
        'fraud_detection': 'Transaction fraud - optimizes for catching fraud with low false positives',
        'churn_prediction': 'Customer churn - optimizes for retention ROI',
        'insurance_claims': 'Claims prediction - optimizes for loss ratio',
        'collections': 'Debt collection - optimizes for recovery rate',
    }

    click.echo(click.style("\n📋 Available Business Contexts:", fg='cyan', bold=True))
    click.echo()
    for name, desc in contexts.items():
        click.echo(f" {click.style(name, fg='green', bold=True)}")
        click.echo(f" {desc}\n")
360
+
361
+
362
def main():
    # Console-script entry point: delegates to the click group so the
    # installed `credily` command and `python -m` execution behave identically.
    cli()


if __name__ == '__main__':
    main()
credily/metrics.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Metrics module for TabulaML.
3
+ Handles model evaluation and visualization.
4
+ """
5
+
6
+ import numpy as np
7
+ import matplotlib.pyplot as plt
8
+ from sklearn.metrics import classification_report, roc_auc_score
9
+
10
+
11
def print_classification_metrics(y_true, y_pred, y_proba=None):
    """
    Print classification metrics including precision, recall, F1-score.

    Args:
        y_true: True labels
        y_pred: Predicted labels
        y_proba: Predicted probabilities for positive class (optional)
    """
    divider = "=" * 50
    print("\n" + divider)
    print("CLASSIFICATION REPORT")
    print(divider)
    print(classification_report(y_true, y_pred))

    # ROC-AUC needs scores/probabilities, so it is only shown when provided.
    if y_proba is not None:
        print(f"ROC-AUC Score: {roc_auc_score(y_true, y_proba):.4f}")
    print(divider + "\n")
29
+
30
+
31
def plot_feature_importance(feature_names: list, importances: np.ndarray, top_n: int = 20):
    """
    Generate and display a feature importance chart.

    Args:
        feature_names: List of feature names
        importances: Array of feature importances
        top_n: Number of top features to display
    """
    # Indices of the top_n most important features, highest first.
    top_idx = np.argsort(importances)[::-1][:top_n]
    n_shown = min(top_n, len(feature_names))

    plt.figure(figsize=(10, 8))
    plt.title("Feature Importances (Top {})".format(n_shown))
    # Reverse so the most important feature appears at the top of the barh chart.
    plt.barh(range(len(top_idx)), importances[top_idx][::-1], align='center')
    plt.yticks(range(len(top_idx)), [feature_names[i] for i in top_idx][::-1])
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.tight_layout()
    plt.show()
credily/model.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Core TabulaML model module.
3
+ Contains the TabulaMLModel class for training, prediction, and evaluation.
4
+ """
5
+
6
+ import os
7
+ import joblib
8
+ import pandas as pd
9
+ import numpy as np
10
+ from sklearn.ensemble import RandomForestClassifier
11
+ from sklearn.model_selection import train_test_split
12
+ from sklearn.pipeline import Pipeline
13
+
14
+ from .preprocessing import identify_column_types, create_preprocessor
15
+ from .metrics import print_classification_metrics, plot_feature_importance
16
+
17
+
18
class TabulaMLModel:
    """
    A data-generic binary classification model for credit scoring.

    This class provides methods to train, predict, and evaluate
    a Random Forest model on any tabular dataset with numeric
    and categorical features.

    The fitted preprocessing + classifier Pipeline is written to
    ``model_path`` immediately after training, and is lazily reloaded
    from disk by predict / predict_proba / evaluate when no pipeline
    is held in memory.
    """

    def __init__(
        self,
        target_column: str = 'target',
        model_path: str = 'credit_model.pkl',
        n_estimators: int = 200,
        max_depth: int = 10,
        random_state: int = 42
    ):
        """
        Initialize the TabulaMLModel.

        Args:
            target_column: Name of the binary target column
            model_path: Path to save/load the trained model
            n_estimators: Number of trees in the forest
            max_depth: Maximum depth of trees
            random_state: Random seed for reproducibility
        """
        self.target_column = target_column
        self.model_path = model_path
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state

        # All of these stay None until train() runs or _load_model()
        # restores a previously saved bundle.
        self.pipeline = None
        self.feature_names = None
        self.numeric_columns = None
        self.categorical_columns = None

    def train(self, dataframe: pd.DataFrame, test_size: float = 0.2):
        """
        Train the model on the provided dataframe.

        Args:
            dataframe: Input dataframe with features and target column
            test_size: Proportion of data for testing (default 0.2)

        Returns:
            dict: Training results including test metrics

        Raises:
            ValueError: If the configured target column is missing.
        """
        if self.target_column not in dataframe.columns:
            raise ValueError(f"Target column '{self.target_column}' not found in dataframe")

        X = dataframe.drop(columns=[self.target_column])
        y = dataframe[self.target_column]

        self.numeric_columns, self.categorical_columns = identify_column_types(
            dataframe, self.target_column
        )

        print(f"Numeric features: {len(self.numeric_columns)}")
        print(f"Categorical features: {len(self.categorical_columns)}")

        # Stratified split preserves the class ratio in both partitions.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=test_size,
            stratify=y,
            random_state=self.random_state
        )

        preprocessor = create_preprocessor(
            self.numeric_columns,
            self.categorical_columns
        )

        # class_weight='balanced' compensates for label imbalance; n_jobs=-1
        # parallelizes tree building over all cores.
        model = RandomForestClassifier(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            class_weight='balanced',
            random_state=self.random_state,
            n_jobs=-1
        )

        self.pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])

        print("\nTraining model...")
        self.pipeline.fit(X_train, y_train)

        # Feature names must be extracted after fit (the encoder learns its
        # categories during fit), and before saving so they are persisted.
        self._extract_feature_names()

        self._save_model()

        print("\nEvaluating on test set...")
        y_pred = self.pipeline.predict(X_test)
        # Column 1 is the probability of the positive class.
        y_proba = self.pipeline.predict_proba(X_test)[:, 1]

        print_classification_metrics(y_test, y_pred, y_proba)

        return {
            'X_test': X_test,
            'y_test': y_test,
            'y_pred': y_pred,
            'y_proba': y_proba
        }

    def predict(self, dataframe: pd.DataFrame) -> np.ndarray:
        """
        Make predictions on new data.

        Args:
            dataframe: Input dataframe with features (no target column needed)

        Returns:
            np.ndarray: Predicted class labels
        """
        # Lazily restore a previously trained pipeline from disk.
        if self.pipeline is None:
            self._load_model()

        # drop() returns a copy, so the caller's dataframe is untouched.
        if self.target_column in dataframe.columns:
            dataframe = dataframe.drop(columns=[self.target_column])

        return self.pipeline.predict(dataframe)

    def predict_proba(self, dataframe: pd.DataFrame) -> np.ndarray:
        """
        Get prediction probabilities for new data.

        Args:
            dataframe: Input dataframe with features

        Returns:
            np.ndarray: Predicted probabilities for each class
        """
        if self.pipeline is None:
            self._load_model()

        if self.target_column in dataframe.columns:
            dataframe = dataframe.drop(columns=[self.target_column])

        return self.pipeline.predict_proba(dataframe)

    def evaluate(self, dataframe: pd.DataFrame, show_feature_importance: bool = True):
        """
        Evaluate the model on a dataset and print metrics.

        Args:
            dataframe: Input dataframe with features and target column
            show_feature_importance: Whether to display feature importance chart

        Raises:
            ValueError: If the target column is absent (labels are required).
        """
        if self.pipeline is None:
            self._load_model()

        if self.target_column not in dataframe.columns:
            raise ValueError(f"Target column '{self.target_column}' required for evaluation")

        X = dataframe.drop(columns=[self.target_column])
        y = dataframe[self.target_column]

        y_pred = self.pipeline.predict(X)
        y_proba = self.pipeline.predict_proba(X)[:, 1]

        print_classification_metrics(y, y_pred, y_proba)

        if show_feature_importance and self.feature_names is not None:
            importances = self.pipeline.named_steps['classifier'].feature_importances_
            plot_feature_importance(self.feature_names, importances)

    def _extract_feature_names(self):
        """Extract feature names from the fitted preprocessor.

        Names are assembled in the same order the ColumnTransformer emits
        columns: numeric columns first, then the one-hot-encoded categorical
        expansions (remainder columns are dropped by the preprocessor).
        """
        preprocessor = self.pipeline.named_steps['preprocessor']

        feature_names = []

        feature_names.extend(self.numeric_columns)

        if self.categorical_columns:
            cat_encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']
            cat_feature_names = cat_encoder.get_feature_names_out(self.categorical_columns)
            feature_names.extend(cat_feature_names.tolist())

        self.feature_names = feature_names

    def _save_model(self):
        """Save the trained model to disk.

        The joblib bundle carries the fitted pipeline plus the column
        metadata needed so that a bare instance can be fully restored.
        """
        model_data = {
            'pipeline': self.pipeline,
            'feature_names': self.feature_names,
            'numeric_columns': self.numeric_columns,
            'categorical_columns': self.categorical_columns,
            'target_column': self.target_column
        }
        joblib.dump(model_data, self.model_path)
        print(f"\nModel saved to: {self.model_path}")

    def _load_model(self):
        """Load a trained model from disk.

        Raises:
            FileNotFoundError: If no saved bundle exists at ``model_path``.
        """
        if not os.path.exists(self.model_path):
            raise FileNotFoundError(f"No model found at: {self.model_path}")

        model_data = joblib.load(self.model_path)
        self.pipeline = model_data['pipeline']
        self.feature_names = model_data['feature_names']
        self.numeric_columns = model_data['numeric_columns']
        self.categorical_columns = model_data['categorical_columns']
        # NOTE: the saved target_column overrides whatever was passed to
        # __init__, keeping the instance consistent with the trained bundle.
        self.target_column = model_data['target_column']
        print(f"Model loaded from: {self.model_path}")

    @classmethod
    def load(cls, model_path: str = 'credit_model.pkl'):
        """
        Class method to load a pre-trained model.

        Args:
            model_path: Path to the saved model file

        Returns:
            TabulaMLModel: Loaded model instance
        """
        instance = cls(model_path=model_path)
        instance._load_model()
        return instance
credily/preprocessing.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Preprocessing module for TabulaML.
3
+ Handles missing value imputation and categorical encoding.
4
+ """
5
+
6
+ import pandas as pd
7
+ import numpy as np
8
+ from sklearn.impute import SimpleImputer
9
+ from sklearn.preprocessing import OneHotEncoder
10
+ from sklearn.compose import ColumnTransformer
11
+ from sklearn.pipeline import Pipeline
12
+
13
+
14
def identify_column_types(df: pd.DataFrame, target_column: str = None):
    """
    Identify numeric and categorical columns in the dataframe.

    Args:
        df: Input dataframe
        target_column: Name of target column to exclude from features

    Returns:
        tuple: (numeric_columns, categorical_columns)
    """
    # Exclude the target (if any) before splitting by dtype.
    feature_cols = [c for c in df.columns if c != target_column]
    features = df[feature_cols]

    numeric_cols = list(features.select_dtypes(include=[np.number]).columns)
    categorical_cols = list(features.select_dtypes(include=['object', 'category']).columns)

    return numeric_cols, categorical_cols
33
+
34
+
35
def create_preprocessor(numeric_columns: list, categorical_columns: list):
    """
    Create a preprocessing pipeline for numeric and categorical features.

    Args:
        numeric_columns: List of numeric column names
        categorical_columns: List of categorical column names

    Returns:
        ColumnTransformer: Preprocessing pipeline
    """
    # Numeric: median imputation only (robust to outliers).
    num_steps = [('imputer', SimpleImputer(strategy='median'))]

    # Categorical: mode imputation, then dense one-hot encoding that
    # tolerates categories unseen during fit.
    cat_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]

    # Any column not listed is dropped from the model input.
    return ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=num_steps), numeric_columns),
            ('cat', Pipeline(steps=cat_steps), categorical_columns),
        ],
        remainder='drop'
    )
credily/profiler.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data profiling module for TabulaML.
3
+ Analyzes datasets and infers ML task types.
4
+ """
5
+
6
+ import pandas as pd
7
+ import numpy as np
8
+ from typing import Optional, Dict, Any
9
+
10
+
11
+ class DataProfiler:
12
+ """
13
+ Profiles tabular datasets for ML readiness.
14
+ """
15
+
16
+ def __init__(self, target_column: Optional[str] = None):
17
+ self.target_column = target_column
18
+
19
+ def profile(self, df: pd.DataFrame) -> Dict[str, Any]:
20
+ """
21
+ Generate a comprehensive profile of the dataset.
22
+
23
+ Args:
24
+ df: Input dataframe
25
+
26
+ Returns:
27
+ dict: Profile report with statistics and recommendations
28
+ """
29
+ report = {}
30
+
31
+ # Basic info
32
+ report['n_rows'] = len(df)
33
+ report['n_cols'] = len(df.columns)
34
+ report['memory_mb'] = df.memory_usage(deep=True).sum() / (1024 * 1024)
35
+
36
+ # Column types
37
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
38
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
39
+
40
+ if self.target_column:
41
+ if self.target_column in numeric_cols:
42
+ numeric_cols.remove(self.target_column)
43
+ if self.target_column in categorical_cols:
44
+ categorical_cols.remove(self.target_column)
45
+
46
+ report['n_numeric'] = len(numeric_cols)
47
+ report['n_categorical'] = len(categorical_cols)
48
+ report['numeric_columns'] = numeric_cols
49
+ report['categorical_columns'] = categorical_cols
50
+
51
+ # Missing values
52
+ missing_total = df.isnull().sum().sum()
53
+ total_cells = df.shape[0] * df.shape[1]
54
+ report['missing_pct'] = (missing_total / total_cells) * 100 if total_cells > 0 else 0
55
+ report['missing_by_column'] = df.isnull().sum().to_dict()
56
+
57
+ # Duplicates
58
+ report['duplicate_rows'] = df.duplicated().sum()
59
+
60
+ # Column statistics
61
+ report['column_stats'] = self._get_column_stats(df, numeric_cols, categorical_cols)
62
+
63
+ # Target analysis
64
+ if self.target_column and self.target_column in df.columns:
65
+ report['target_info'] = self._analyze_target(df[self.target_column])
66
+
67
+ # Data quality warnings
68
+ report['warnings'] = self._generate_warnings(df, report)
69
+
70
+ return report
71
+
72
+ def _get_column_stats(
73
+ self,
74
+ df: pd.DataFrame,
75
+ numeric_cols: list,
76
+ categorical_cols: list
77
+ ) -> Dict[str, Dict]:
78
+ """Get statistics for each column."""
79
+ stats = {}
80
+
81
+ for col in numeric_cols:
82
+ stats[col] = {
83
+ 'type': 'numeric',
84
+ 'mean': df[col].mean(),
85
+ 'std': df[col].std(),
86
+ 'min': df[col].min(),
87
+ 'max': df[col].max(),
88
+ 'missing': df[col].isnull().sum(),
89
+ 'zeros': (df[col] == 0).sum(),
90
+ 'unique': df[col].nunique()
91
+ }
92
+
93
+ for col in categorical_cols:
94
+ value_counts = df[col].value_counts()
95
+ stats[col] = {
96
+ 'type': 'categorical',
97
+ 'unique': df[col].nunique(),
98
+ 'missing': df[col].isnull().sum(),
99
+ 'top_value': value_counts.index[0] if len(value_counts) > 0 else None,
100
+ 'top_freq': value_counts.iloc[0] if len(value_counts) > 0 else 0
101
+ }
102
+
103
+ return stats
104
+
105
+ def _analyze_target(self, target: pd.Series) -> Dict[str, Any]:
106
+ """Analyze the target variable."""
107
+ # Drop NaN values for analysis
108
+ target_clean = target.dropna()
109
+ n_unique = target_clean.nunique()
110
+
111
+ if n_unique == 0:
112
+ return {
113
+ 'task_type': 'unknown',
114
+ 'n_classes': 0,
115
+ 'value_counts': {},
116
+ 'balance': 'N/A (no valid values)',
117
+ 'imbalance_ratio': 0,
118
+ 'is_imbalanced': False,
119
+ 'warning': 'Target column has no valid values'
120
+ }
121
+
122
+ if n_unique == 2:
123
+ task_type = 'binary_classification'
124
+ elif n_unique <= 10:
125
+ task_type = 'multiclass_classification'
126
+ elif target.dtype in [np.float64, np.float32]:
127
+ task_type = 'regression'
128
+ else:
129
+ task_type = 'multiclass_classification' if n_unique <= 50 else 'regression'
130
+
131
+ value_counts = target_clean.value_counts()
132
+ majority_class = value_counts.iloc[0] if len(value_counts) > 0 else 0
133
+ minority_class = value_counts.iloc[-1] if len(value_counts) > 0 else 0
134
+ imbalance_ratio = majority_class / minority_class if minority_class > 0 else float('inf')
135
+
136
+ return {
137
+ 'task_type': task_type,
138
+ 'n_classes': n_unique,
139
+ 'value_counts': value_counts.to_dict(),
140
+ 'balance': f"{minority_class}:{majority_class} (1:{imbalance_ratio:.1f})",
141
+ 'imbalance_ratio': imbalance_ratio,
142
+ 'is_imbalanced': imbalance_ratio > 3
143
+ }
144
+
145
+ def _generate_warnings(self, df: pd.DataFrame, report: Dict) -> list:
146
+ """Generate data quality warnings."""
147
+ warnings = []
148
+
149
+ if report['missing_pct'] > 20:
150
+ warnings.append(f"High missing value rate: {report['missing_pct']:.1f}%")
151
+
152
+ if report['duplicate_rows'] > 0:
153
+ warnings.append(f"Found {report['duplicate_rows']} duplicate rows")
154
+
155
+ # Check for high cardinality categoricals
156
+ for col, stats in report['column_stats'].items():
157
+ if stats['type'] == 'categorical' and stats['unique'] > 50:
158
+ warnings.append(f"High cardinality in '{col}': {stats['unique']} unique values")
159
+
160
+ # Check target imbalance
161
+ if 'target_info' in report and report['target_info'].get('is_imbalanced'):
162
+ warnings.append(f"Target is imbalanced (ratio: 1:{report['target_info']['imbalance_ratio']:.1f})")
163
+
164
+ return warnings
165
+
166
+ def infer_task_type(self, df: pd.DataFrame) -> str:
167
+ """
168
+ Infer the ML task type from the target column.
169
+
170
+ Returns:
171
+ str: 'binary_classification', 'multiclass_classification', or 'regression'
172
+ """
173
+ if not self.target_column or self.target_column not in df.columns:
174
+ raise ValueError("Target column must be specified and present in dataframe")
175
+
176
+ target = df[self.target_column]
177
+ n_unique = target.nunique()
178
+
179
+ if n_unique == 2:
180
+ return 'binary_classification'
181
+ elif n_unique <= 10 or target.dtype == 'object':
182
+ return 'multiclass_classification'
183
+ else:
184
+ return 'regression'
credily/reporting.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Report generation module for Credily.
3
+ Generates HTML and JSON reports.
4
+ """
5
+
6
+ import json
7
+ from pathlib import Path
8
+ from typing import Dict, Any, Optional
9
+ from datetime import datetime
10
+
11
+
12
class ReportGenerator:
    """Generates training reports in various formats.

    Currently renders a single self-contained HTML report (inline CSS,
    no external assets) from a training-results dict plus an optional
    data-profile dict.
    """

    def __init__(self, output_dir: str):
        # All reports are written under this directory; create it eagerly.
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def generate_html_report(
        self,
        training_results: Dict[str, Any],
        profile_report: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Generate an HTML report.

        Args:
            training_results: Results from training
            profile_report: Optional data profile report

        Returns:
            Path to generated HTML file
        """
        html = self._build_html(training_results, profile_report)
        report_path = self.output_dir / 'report.html'

        # utf-8 is required: the template contains non-ASCII characters.
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(html)

        print(f"HTML report saved to: {report_path}")
        return str(report_path)

    def _build_html(
        self,
        results: Dict[str, Any],
        profile: Optional[Dict[str, Any]]
    ) -> str:
        """Build HTML content."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Extract label mapping to show original class names
        label_mapping = {}
        cleaning_report = results.get('cleaning_report', {})
        if cleaning_report:
            label_validation = cleaning_report.get('label_validation', {})
            label_mapping = label_validation.get('label_mapping', {})

        # Create reverse mapping: {0: 'original_negative', 1: 'original_positive'}
        reverse_label_map = {str(v): k for k, v in label_mapping.items()} if label_mapping else {}

        # Model comparison table; the winning model's row is highlighted.
        model_rows = ""
        for model, score in results.get('model_scores', {}).items():
            is_best = model == results.get('best_model', '')
            highlight = 'style="background-color: #d4edda; font-weight: bold;"' if is_best else ''
            model_rows += f"<tr {highlight}><td>{model}</td><td>{score:.4f}</td></tr>"

        # Feature importance table (top 20, descending importance)
        feature_rows = ""
        importances = results.get('feature_importances', {})
        sorted_features = sorted(importances.items(), key=lambda x: x[1], reverse=True)[:20]
        for feature, importance in sorted_features:
            feature_rows += f"<tr><td>{feature}</td><td>{importance:.4f}</td></tr>"

        # Classification report with original class labels
        clf_report = results.get('classification_report', {})
        clf_rows = ""
        for label, metrics in clf_report.items():
            # Scalar entries (e.g. overall accuracy) are skipped; only dict
            # rows carry precision/recall/f1/support.
            if isinstance(metrics, dict):
                # Use original class name if available, otherwise use the label
                display_label = reverse_label_map.get(str(label), label)
                # For aggregate metrics like 'macro avg', keep the original name
                if label in ['macro avg', 'weighted avg', 'accuracy']:
                    display_label = label
                clf_rows += f"""<tr>
                    <td>{display_label}</td>
                    <td>{metrics.get('precision', 0):.3f}</td>
                    <td>{metrics.get('recall', 0):.3f}</td>
                    <td>{metrics.get('f1-score', 0):.3f}</td>
                    <td>{metrics.get('support', 0)}</td>
                </tr>"""

        # Profile section (only rendered when a profile dict was provided)
        profile_section = ""
        if profile:
            warnings_html = "".join([f"<li>{w}</li>" for w in profile.get('warnings', [])])
            profile_section = f"""
            <section class="card">
                <h2>Data Profile</h2>
                <div class="stats-grid">
                    <div class="stat">
                        <span class="stat-value">{profile.get('n_rows', 0):,}</span>
                        <span class="stat-label">Rows</span>
                    </div>
                    <div class="stat">
                        <span class="stat-value">{profile.get('n_cols', 0)}</span>
                        <span class="stat-label">Columns</span>
                    </div>
                    <div class="stat">
                        <span class="stat-value">{profile.get('n_numeric', 0)}</span>
                        <span class="stat-label">Numeric</span>
                    </div>
                    <div class="stat">
                        <span class="stat-value">{profile.get('n_categorical', 0)}</span>
                        <span class="stat-label">Categorical</span>
                    </div>
                    <div class="stat">
                        <span class="stat-value">{profile.get('missing_pct', 0):.1f}%</span>
                        <span class="stat-label">Missing</span>
                    </div>
                </div>
                {f'<h3>Warnings</h3><ul class="warnings">{warnings_html}</ul>' if warnings_html else ''}
            </section>
            """

        # NOTE: this is an f-string, so literal CSS braces are doubled ({{ }}).
        html = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Credily Training Report</title>
    <style>
        * {{ box-sizing: border-box; margin: 0; padding: 0; }}
        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, sans-serif;
            line-height: 1.6;
            color: #333;
            background: #f5f7fa;
            padding: 2rem;
        }}
        .container {{ max-width: 1200px; margin: 0 auto; }}
        header {{
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 2rem;
            border-radius: 12px;
            margin-bottom: 2rem;
        }}
        header h1 {{ font-size: 2rem; margin-bottom: 0.5rem; }}
        header p {{ opacity: 0.9; }}
        .card {{
            background: white;
            border-radius: 12px;
            padding: 1.5rem;
            margin-bottom: 1.5rem;
            box-shadow: 0 2px 8px rgba(0,0,0,0.08);
        }}
        .card h2 {{
            color: #667eea;
            margin-bottom: 1rem;
            padding-bottom: 0.5rem;
            border-bottom: 2px solid #f0f0f0;
        }}
        .stats-grid {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
            gap: 1rem;
        }}
        .stat {{
            text-align: center;
            padding: 1rem;
            background: #f8f9fa;
            border-radius: 8px;
        }}
        .stat-value {{
            display: block;
            font-size: 1.5rem;
            font-weight: bold;
            color: #667eea;
        }}
        .stat-label {{ color: #666; font-size: 0.9rem; }}
        .best-model {{
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 1.5rem;
            border-radius: 8px;
            text-align: center;
            margin-bottom: 1rem;
        }}
        .best-model h3 {{ font-size: 1.2rem; margin-bottom: 0.5rem; }}
        .best-model .score {{ font-size: 2rem; font-weight: bold; }}
        table {{
            width: 100%;
            border-collapse: collapse;
            margin-top: 1rem;
        }}
        th, td {{
            padding: 0.75rem;
            text-align: left;
            border-bottom: 1px solid #eee;
        }}
        th {{ background: #f8f9fa; font-weight: 600; }}
        tr:hover {{ background: #f8f9fa; }}
        .warnings {{ color: #856404; background: #fff3cd; padding: 1rem; border-radius: 8px; margin-top: 1rem; }}
        .warnings li {{ margin-left: 1.5rem; }}
        footer {{ text-align: center; color: #666; margin-top: 2rem; font-size: 0.9rem; }}
    </style>
</head>
<body>
    <div class="container">
        <header>
            <h1>Credily Training Report</h1>
            <p>Generated: {timestamp}</p>
        </header>

        {profile_section}

        <section class="card">
            <h2>Model Performance</h2>
            <div class="best-model">
                <h3>Best Model: {results.get('best_model', 'N/A')}</h3>
                <div class="score">ROC-AUC: {results.get('best_score', 0):.4f}</div>
                <p>Test AUC: {results.get('test_auc', 0):.4f}</p>
            </div>
            <h3>Model Comparison</h3>
            <table>
                <thead><tr><th>Model</th><th>CV ROC-AUC</th></tr></thead>
                <tbody>{model_rows}</tbody>
            </table>
        </section>

        <section class="card">
            <h2>Classification Report</h2>
            <table>
                <thead>
                    <tr><th>Class</th><th>Precision</th><th>Recall</th><th>F1-Score</th><th>Support</th></tr>
                </thead>
                <tbody>{clf_rows}</tbody>
            </table>
        </section>

        <section class="card">
            <h2>Feature Importance (Top 20)</h2>
            <table>
                <thead><tr><th>Feature</th><th>Importance</th></tr></thead>
                <tbody>{feature_rows}</tbody>
            </table>
        </section>

        <footer>
            <p>Generated by Credily - Fast, Explainable AutoML for Tabular Data</p>
        </footer>
    </div>
</body>
</html>"""

        return html
credily/safety.py ADDED
@@ -0,0 +1,634 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AutoML Safety & Leakage Prevention Module for Credily.
3
+
4
+ This module implements comprehensive checks to detect and prevent:
5
+ - Data leakage (features that directly encode the target)
6
+ - Feature dominance (single feature explaining too much variance)
7
+ - Overfitting (CV vs Test performance gap)
8
+ - Feature redundancy (highly correlated features)
9
+
10
+ These protections ensure production-safe, reliable models.
11
+ """
12
+
13
+ import pandas as pd
14
+ import numpy as np
15
+ from typing import Dict, List, Tuple, Optional, Any
16
+ from scipy import stats
17
+ from sklearn.feature_selection import mutual_info_classif
18
+ import warnings
19
+
20
+ # Suppress warnings for cleaner output
21
+ warnings.filterwarnings('ignore')
22
+
23
+
24
+ # ============== Configuration Thresholds ==============
25
+
26
class SafetyConfig:
    """Tunable thresholds used by the AutoML safety checks."""

    # --- Feature-target leakage ---
    LEAKAGE_DROP_THRESHOLD = 0.95   # auto-drop a feature when |corr with target| >= this
    LEAKAGE_WARN_THRESHOLD = 0.90   # warn (but keep the feature) when |corr| >= this

    # --- Feature-feature redundancy ---
    REDUNDANCY_THRESHOLD = 0.98     # drop one of a correlated pair when |corr| >= this

    # --- Post-training feature dominance ---
    DOMINANCE_INVALID_THRESHOLD = 0.85  # mark model INVALID when max importance >= this
    DOMINANCE_WARN_THRESHOLD = 0.70     # warn when max importance >= this

    # --- Overfitting guard (CV score vs. test score gap) ---
    OVERFIT_INVALID_GAP = 0.10      # mark model INVALID when gap >= this
    OVERFIT_WARN_GAP = 0.05         # warn when gap >= this

    # Minimum acceptable test AUC: the model must beat a random baseline.
    MIN_TEST_AUC = 0.60

    # Substrings that mark a column name as target-related (to be excluded
    # from the feature set as potential leaks).
    TARGET_NAME_PATTERNS = [
        'target', 'label', 'outcome', 'result',
        'default', 'churn', 'flag', 'class', 'status'
    ]
52
+
53
+
54
class SafetyReport:
    """Accumulates the outcome of every safety check for one pipeline run.

    Status starts at PASS; warnings escalate it to WARN, and errors force
    FAIL and invalidate the model.
    """

    def __init__(self):
        self.status = "PASS"  # PASS, WARN, FAIL
        self.dropped_features: Dict[str, str] = {}  # feature name -> drop reason
        self.warnings: List[str] = []
        self.errors: List[str] = []
        self.leakage_report: Dict[str, float] = {}
        self.redundancy_report: List[Tuple[str, str, float]] = []
        self.dominance_report: Dict[str, float] = {}
        self.overfitting_report: Dict[str, float] = {}
        self.model_valid = True

    def add_dropped_feature(self, feature: str, reason: str):
        """Record that *feature* was removed, and why."""
        self.dropped_features[feature] = reason

    def add_warning(self, message: str):
        """Record a non-fatal finding; escalates PASS to WARN."""
        self.warnings.append(message)
        if self.status == "PASS":
            self.status = "WARN"

    def add_error(self, message: str):
        """Record a fatal finding; forces FAIL and invalidates the model."""
        self.errors.append(message)
        self.status = "FAIL"
        self.model_valid = False

    def to_dict(self) -> Dict[str, Any]:
        """Convert report to dictionary for JSON serialization."""
        # Flatten the (f1, f2, corr) tuples into JSON-friendly dicts.
        redundant = []
        for first, second, corr in self.redundancy_report:
            redundant.append({'feature1': first, 'feature2': second, 'correlation': corr})

        return {
            'status': self.status,
            'model_valid': self.model_valid,
            'dropped_features': self.dropped_features,
            'warnings': self.warnings,
            'errors': self.errors,
            'leakage_detected': self.leakage_report,
            'redundant_features': redundant,
            'feature_dominance': self.dominance_report,
            'overfitting_metrics': self.overfitting_report
        }
100
+
101
+
102
class SafetyValidator:
    """
    Main safety validation class for AutoML pipelines.

    Implements comprehensive checks to prevent data leakage,
    feature dominance, and overfitting.

    Typical flow: ``run_pre_training_checks`` before fitting (column hygiene,
    leakage, redundancy) and ``run_post_training_checks`` after fitting
    (dominance, overfitting, acceptance). All findings accumulate in
    ``self.report``.
    """

    def __init__(self, config: Optional[SafetyConfig] = None, verbose: bool = True):
        """
        Initialize the safety validator.

        Args:
            config: Safety configuration thresholds
            verbose: Print detailed logs
        """
        self.config = config or SafetyConfig()
        self.verbose = verbose
        # Shared report, mutated by every check method below.
        self.report = SafetyReport()

    def _log(self, message: str):
        """Print message if verbose mode is enabled."""
        if self.verbose:
            print(message)

    # ============== Step 1: Column Hygiene ==============

    def check_column_hygiene(
        self,
        df: pd.DataFrame,
        target_column: str
    ) -> Tuple[pd.DataFrame, List[str]]:
        """
        Check and clean column names, removing potential target leaks.

        Args:
            df: Input dataframe
            target_column: Name of target column

        Returns:
            Tuple of (cleaned dataframe, list of dropped columns)
        """
        self._log("\n" + "="*60)
        self._log("[SAFETY] Step 1: Column Hygiene Check")
        self._log("="*60)

        dropped = []
        # Work on a copy so the caller's dataframe is never mutated.
        df = df.copy()

        for col in df.columns:
            if col == target_column:
                continue

            col_lower = col.lower()

            # Check for target-related names.
            # NOTE: substring match, so e.g. 'age_class' is dropped because it
            # contains 'class' — intentionally aggressive in favour of safety.
            for pattern in self.config.TARGET_NAME_PATTERNS:
                if pattern in col_lower:
                    dropped.append(col)
                    self.report.add_dropped_feature(
                        col,
                        f"Column name contains target-related pattern: '{pattern}'"
                    )
                    self._log(f" [DROP] '{col}' - contains pattern '{pattern}'")
                    break

        if dropped:
            df = df.drop(columns=dropped, errors='ignore')
            self._log(f" Dropped {len(dropped)} columns with target-related names")
        else:
            self._log(" No target-related column names detected")

        return df, dropped

    # ============== Step 2: Feature-Target Leakage Detection ==============

    def _compute_correlation(
        self,
        feature: pd.Series,
        target: pd.Series
    ) -> float:
        """
        Compute correlation between a feature and binary/numeric target.

        Uses Point-Biserial for numeric features, Cramér's V for categorical.

        Returns an absolute association strength in [0, 1]; 0.0 is also used
        as the "cannot estimate" fallback.
        """
        # Handle missing values: keep only rows where both sides are present.
        mask = ~(feature.isna() | target.isna())
        feature = feature[mask]
        target = target[mask]

        if len(feature) < 10:
            # Too few paired observations for a meaningful estimate.
            return 0.0

        try:
            # Check if feature is numeric
            if pd.api.types.is_numeric_dtype(feature):
                # Point-biserial or Pearson correlation
                corr, _ = stats.pearsonr(feature.astype(float), target.astype(float))
                return abs(corr) if not np.isnan(corr) else 0.0
            else:
                # Cramér's V for categorical
                return self._cramers_v(feature, target)
        except Exception:
            # Any numerical failure (e.g. constant series) is treated as
            # "no detectable association" rather than aborting the pipeline.
            return 0.0

    def _cramers_v(self, x: pd.Series, y: pd.Series) -> float:
        """Compute Cramér's V statistic for categorical-categorical association."""
        try:
            confusion_matrix = pd.crosstab(x, y)
            chi2 = stats.chi2_contingency(confusion_matrix)[0]
            n = confusion_matrix.sum().sum()
            min_dim = min(confusion_matrix.shape) - 1
            if min_dim == 0 or n == 0:
                # Degenerate table (a single category on one side).
                return 0.0
            return np.sqrt(chi2 / (n * min_dim))
        except Exception:
            return 0.0

    def detect_leakage(
        self,
        X: pd.DataFrame,
        y: pd.Series
    ) -> Tuple[pd.DataFrame, List[str]]:
        """
        Detect and remove features with high correlation to target.

        Args:
            X: Feature dataframe
            y: Target series

        Returns:
            Tuple of (cleaned X, list of dropped features)
        """
        self._log("\n" + "="*60)
        self._log("[SAFETY] Step 2: Feature-Target Leakage Detection")
        self._log("="*60)

        dropped = []
        # Local cache of all computed correlations (only flagged ones end up
        # in the report's leakage_report).
        correlations = {}

        for col in X.columns:
            corr = self._compute_correlation(X[col], y)
            correlations[col] = corr

            if corr >= self.config.LEAKAGE_DROP_THRESHOLD:
                dropped.append(col)
                self.report.add_dropped_feature(
                    col,
                    f"High correlation with target: {corr:.4f} (threshold: {self.config.LEAKAGE_DROP_THRESHOLD})"
                )
                self.report.leakage_report[col] = corr
                self._log(f" [DROP] '{col}' - correlation: {corr:.4f} >= {self.config.LEAKAGE_DROP_THRESHOLD}")

            elif corr >= self.config.LEAKAGE_WARN_THRESHOLD:
                # High but below the drop threshold: keep the feature, warn.
                self.report.add_warning(
                    f"Feature '{col}' has high correlation with target: {corr:.4f}"
                )
                self.report.leakage_report[col] = corr
                self._log(f" [WARN] '{col}' - correlation: {corr:.4f} (high-risk)")

        if dropped:
            X = X.drop(columns=dropped, errors='ignore')
            self._log(f"\n Dropped {len(dropped)} leaky features")
        else:
            self._log(" No leakage detected")

        return X, dropped

    # ============== Step 3: Feature-Feature Redundancy ==============

    def remove_redundant_features(
        self,
        X: pd.DataFrame,
        y: pd.Series
    ) -> Tuple[pd.DataFrame, List[str]]:
        """
        Remove highly correlated feature pairs, keeping the more informative one.

        Only numeric columns are compared (pairwise absolute Pearson
        correlation); categorical columns are never dropped here.

        Args:
            X: Feature dataframe
            y: Target series

        Returns:
            Tuple of (cleaned X, list of dropped features)
        """
        self._log("\n" + "="*60)
        self._log("[SAFETY] Step 3: Feature-Feature Redundancy Check")
        self._log("="*60)

        dropped = []

        # Only check numeric columns for correlation
        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

        if len(numeric_cols) < 2:
            self._log(" Not enough numeric features to check redundancy")
            return X, dropped

        # Compute correlation matrix
        try:
            corr_matrix = X[numeric_cols].corr().abs()
        except Exception as e:
            # Best-effort check: give up rather than crash the pipeline.
            self._log(f" Could not compute correlation matrix: {e}")
            return X, dropped

        # Find highly correlated pairs
        pairs_checked = set()

        for i, col1 in enumerate(numeric_cols):
            for col2 in numeric_cols[i+1:]:
                if (col1, col2) in pairs_checked or (col2, col1) in pairs_checked:
                    continue

                pairs_checked.add((col1, col2))

                corr = corr_matrix.loc[col1, col2]

                # NaN correlations (e.g. constant columns) compare False here.
                if corr >= self.config.REDUNDANCY_THRESHOLD:
                    # Decide which to drop based on:
                    # 1. Lower missingness
                    # 2. Higher mutual information with target

                    miss1 = X[col1].isna().sum()
                    miss2 = X[col2].isna().sum()

                    # Keep the one with less missing
                    if miss1 != miss2:
                        to_drop = col1 if miss1 > miss2 else col2
                    else:
                        # Use mutual information as tiebreaker
                        try:
                            mi1 = mutual_info_classif(
                                X[[col1]].fillna(0), y, random_state=42
                            )[0]
                            mi2 = mutual_info_classif(
                                X[[col2]].fillna(0), y, random_state=42
                            )[0]
                            to_drop = col1 if mi1 < mi2 else col2
                        except Exception:
                            to_drop = col2  # Default to dropping second

                    if to_drop not in dropped:
                        dropped.append(to_drop)
                        # The message names the *kept* partner of the pair.
                        self.report.add_dropped_feature(
                            to_drop,
                            f"Redundant with '{col1 if to_drop == col2 else col2}' (corr: {corr:.4f})"
                        )
                        self.report.redundancy_report.append((col1, col2, corr))
                        self._log(f" [DROP] '{to_drop}' - redundant with '{col1 if to_drop == col2 else col2}' (corr: {corr:.4f})")

        if dropped:
            X = X.drop(columns=dropped, errors='ignore')
            self._log(f"\n Dropped {len(dropped)} redundant features")
        else:
            self._log(" No redundant feature pairs detected")

        return X, dropped

    # ============== Step 4: Feature Dominance Validation ==============

    def validate_feature_dominance(
        self,
        feature_importances: Dict[str, float]
    ) -> bool:
        """
        Check if any single feature dominates the model.

        Args:
            feature_importances: Dictionary of feature -> importance
                (assumed non-negative — TODO confirm with callers; raw values
                are normalized to shares of the total here)

        Returns:
            True if model passes validation, False otherwise
        """
        self._log("\n" + "="*60)
        self._log("[SAFETY] Step 4: Feature Dominance Validation")
        self._log("="*60)

        if not feature_importances:
            # Nothing to check — treated as a pass, not an error.
            self._log(" No feature importances provided")
            return True

        # Normalize importances
        total = sum(feature_importances.values())
        if total == 0:
            return True

        normalized = {k: v/total for k, v in feature_importances.items()}
        self.report.dominance_report = normalized

        # Find max importance
        max_feature = max(normalized, key=normalized.get)
        max_importance = normalized[max_feature]

        self._log(f" Top feature: '{max_feature}' with {max_importance:.2%} importance")

        if max_importance >= self.config.DOMINANCE_INVALID_THRESHOLD:
            self.report.add_error(
                f"Feature dominance detected: '{max_feature}' has {max_importance:.2%} importance. "
                f"Model likely learned the target indirectly. Threshold: {self.config.DOMINANCE_INVALID_THRESHOLD:.0%}"
            )
            self._log(f" [FAIL] Feature dominance violation - model INVALID")
            return False

        elif max_importance >= self.config.DOMINANCE_WARN_THRESHOLD:
            self.report.add_warning(
                f"High feature importance: '{max_feature}' explains {max_importance:.2%} of predictions. "
                f"Consider investigating this feature."
            )
            self._log(f" [WARN] High feature importance detected")

        else:
            self._log(" Feature importances are well-distributed")

        return True

    # ============== Step 5: Overfitting Guard ==============

    def validate_overfitting(
        self,
        cv_score: float,
        test_score: float,
        metric_name: str = "AUC"
    ) -> bool:
        """
        Check for overfitting by comparing CV and test performance.

        Only a positive gap (CV better than test) can trigger warnings or
        failures; a test score above CV always passes.

        Args:
            cv_score: Cross-validation score
            test_score: Held-out test score
            metric_name: Name of the metric being compared

        Returns:
            True if model passes validation, False otherwise
        """
        self._log("\n" + "="*60)
        self._log("[SAFETY] Step 5: Overfitting Guard")
        self._log("="*60)

        gap = cv_score - test_score

        self.report.overfitting_report = {
            'cv_score': cv_score,
            'test_score': test_score,
            'gap': gap,
            'metric': metric_name
        }

        self._log(f" CV {metric_name}: {cv_score:.4f}")
        self._log(f" Test {metric_name}: {test_score:.4f}")
        self._log(f" Gap: {gap:.4f}")

        if gap >= self.config.OVERFIT_INVALID_GAP:
            self.report.add_error(
                f"Overfitting detected: CV-Test gap is {gap:.4f} "
                f"(threshold: {self.config.OVERFIT_INVALID_GAP}). "
                f"Model performance will not generalize."
            )
            self._log(f" [FAIL] Overfitting violation - model INVALID")
            return False

        elif gap >= self.config.OVERFIT_WARN_GAP:
            self.report.add_warning(
                f"Potential overfitting: CV-Test gap is {gap:.4f}. "
                f"Monitor model performance closely in production."
            )
            self._log(f" [WARN] Potential overfitting detected")

        else:
            self._log(" No significant overfitting detected")

        return True

    # ============== Step 6: Model Acceptance Criteria ==============

    def validate_model_acceptance(
        self,
        test_auc: float,
        feature_importances: Dict[str, float],
        cv_score: float
    ) -> bool:
        """
        Final validation to determine if model can be exported.

        Runs the minimum-performance, dominance (Step 4) and overfitting
        (Step 5) checks; all three must pass.

        Args:
            test_auc: Test set AUC score
            feature_importances: Feature importance dictionary
            cv_score: Cross-validation score

        Returns:
            True if model passes all criteria, False otherwise
        """
        self._log("\n" + "="*60)
        self._log("[SAFETY] Step 6: Model Acceptance Criteria")
        self._log("="*60)

        passed = True

        # Check minimum performance
        if test_auc < self.config.MIN_TEST_AUC:
            self.report.add_error(
                f"Test AUC ({test_auc:.4f}) is below minimum threshold "
                f"({self.config.MIN_TEST_AUC}). Model does not beat baseline."
            )
            passed = False
            self._log(f" [FAIL] Test AUC below minimum threshold")

        # Check feature dominance
        dominance_ok = self.validate_feature_dominance(feature_importances)
        if not dominance_ok:
            passed = False

        # Check overfitting
        overfit_ok = self.validate_overfitting(cv_score, test_auc)
        if not overfit_ok:
            passed = False

        # Final status: an earlier FAIL recorded on the report also blocks
        # acceptance, even if the three checks above passed.
        if passed and self.report.status != "FAIL":
            self._log("\n [PASS] Model meets all acceptance criteria")
        else:
            self._log("\n [FAIL] Model does NOT meet acceptance criteria")
            self.report.model_valid = False

        return passed

    # ============== Main Validation Pipeline ==============

    def run_pre_training_checks(
        self,
        df: pd.DataFrame,
        target_column: str
    ) -> Tuple[pd.DataFrame, SafetyReport]:
        """
        Run all pre-training safety checks.

        Args:
            df: Input dataframe with features and target
            target_column: Name of target column

        Returns:
            Tuple of (cleaned dataframe, safety report)
        """
        self._log("\n" + "#"*60)
        self._log("# SAFETY VALIDATION - PRE-TRAINING CHECKS")
        self._log("#"*60)

        # Step 1: Column hygiene
        df, _ = self.check_column_hygiene(df, target_column)

        # Extract X and y
        X = df.drop(columns=[target_column])
        y = df[target_column]

        # Step 2: Leakage detection
        X, _ = self.detect_leakage(X, y)

        # Step 3: Redundancy removal
        X, _ = self.remove_redundant_features(X, y)

        # Reconstruct dataframe (target re-attached as the last column)
        df_clean = pd.concat([X, y], axis=1)

        self._log("\n" + "#"*60)
        self._log(f"# PRE-TRAINING CHECKS COMPLETE")
        self._log(f"# Status: {self.report.status}")
        self._log(f"# Features dropped: {len(self.report.dropped_features)}")
        self._log(f"# Warnings: {len(self.report.warnings)}")
        self._log("#"*60 + "\n")

        return df_clean, self.report

    def run_post_training_checks(
        self,
        feature_importances: Dict[str, float],
        cv_score: float,
        test_auc: float
    ) -> SafetyReport:
        """
        Run all post-training safety checks.

        Args:
            feature_importances: Feature importance dictionary
            cv_score: Cross-validation score
            test_auc: Test set AUC

        Returns:
            Updated safety report
        """
        self._log("\n" + "#"*60)
        self._log("# SAFETY VALIDATION - POST-TRAINING CHECKS")
        self._log("#"*60)

        # Run acceptance validation
        self.validate_model_acceptance(test_auc, feature_importances, cv_score)

        self._log("\n" + "#"*60)
        self._log(f"# POST-TRAINING CHECKS COMPLETE")
        self._log(f"# Final Status: {self.report.status}")
        self._log(f"# Model Valid: {self.report.model_valid}")
        self._log("#"*60 + "\n")

        return self.report
605
+
606
+
607
+ # ============== Utility Functions ==============
608
+
609
def check_perfect_score_warning(metrics: Dict[str, float]) -> List[str]:
    """
    Check for suspiciously perfect scores that may indicate leakage.

    Args:
        metrics: Dictionary of metric names to scores

    Returns:
        List of warning messages
    """

    def _message_for(name: str, score: float) -> Optional[str]:
        # A near-1.0 score is treated as a near-certain leakage signal;
        # a merely very high score gets a softer warning.
        if score >= 0.999:
            return (
                f"CRITICAL: {name} = {score:.4f} is suspiciously perfect. "
                f"This almost certainly indicates data leakage. "
                f"Do NOT trust this model."
            )
        if score >= 0.98:
            return (
                f"WARNING: {name} = {score:.4f} is very high. "
                f"Verify there is no data leakage before using this model."
            )
        return None

    candidates = (_message_for(name, score) for name, score in metrics.items())
    return [message for message in candidates if message is not None]
credily/utils.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for Credily.
3
+ File loading, format detection, and export utilities.
4
+ """
5
+
6
+ import pandas as pd
7
+ from pathlib import Path
8
+ from typing import Optional, Union
9
+
10
+
11
def load_data(file_path: Union[str, Path], **kwargs) -> pd.DataFrame:
    """
    Load data from various file formats (CSV, TXT, Excel).

    Args:
        file_path: Path to the data file
        **kwargs: Additional arguments passed to the reader

    Returns:
        DataFrame with loaded data

    Supported formats:
        - .csv: Comma-separated values
        - .txt: Tab or comma-separated text files
        - .xlsx, .xls: Excel files
        - .tsv: Tab-separated values
    """
    path = Path(file_path)
    suffix = path.suffix.lower()

    if suffix == '.csv':
        return pd.read_csv(path, **kwargs)

    if suffix == '.tsv':
        return pd.read_csv(path, delimiter='\t', **kwargs)

    if suffix == '.txt':
        # Sniff the delimiter from the first line; checked in priority order
        # (tab, then semicolon, then pipe) with comma as the fallback.
        with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
            sample = handle.readline()
        delimiter = next(
            (cand for cand in ('\t', ';', '|') if cand in sample), ','
        )
        return pd.read_csv(path, delimiter=delimiter, **kwargs)

    if suffix in ('.xlsx', '.xls'):
        # openpyxl is an optional dependency, only required for .xlsx.
        try:
            if suffix == '.xlsx':
                return pd.read_excel(path, engine='openpyxl', **kwargs)
            return pd.read_excel(path, **kwargs)
        except ImportError:
            raise ImportError(
                "Excel support requires 'openpyxl' package. "
                "Install it with: pip install openpyxl"
            )

    raise ValueError(
        f"Unsupported file format: '{suffix}'. "
        f"Supported formats: .csv, .txt, .tsv, .xlsx, .xls"
    )
71
+ )
72
+
73
+
74
def save_to_excel(
    df: pd.DataFrame,
    file_path: Union[str, Path],
    sheet_name: str = 'Predictions',
    include_summary: bool = True
) -> str:
    """
    Save DataFrame to Excel with optional summary sheet.

    Args:
        df: DataFrame to save
        file_path: Output path (will add .xlsx if needed)
        sheet_name: Name of the main data sheet
        include_summary: Whether to include a summary sheet (only written
            when a 'prediction' column is present)

    Returns:
        Path to saved file

    Raises:
        ImportError: If 'openpyxl' is not installed.
    """
    # Import lazily so openpyxl stays an optional dependency.
    try:
        from openpyxl import Workbook
        from openpyxl.utils.dataframe import dataframe_to_rows
        from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
    except ImportError:
        raise ImportError(
            "Excel export requires 'openpyxl' package. "
            "Install it with: pip install openpyxl"
        )

    path = Path(file_path)
    # Force an Excel suffix so the workbook is always saved as .xlsx/.xls.
    if path.suffix.lower() not in ['.xlsx', '.xls']:
        path = path.with_suffix('.xlsx')

    wb = Workbook()
    ws = wb.active
    ws.title = sheet_name

    # Style definitions (white-on-purple header, thin grid borders)
    header_font = Font(bold=True, color='FFFFFF')
    header_fill = PatternFill(start_color='667eea', end_color='667eea', fill_type='solid')
    thin_border = Border(
        left=Side(style='thin'),
        right=Side(style='thin'),
        top=Side(style='thin'),
        bottom=Side(style='thin')
    )

    # Write data: row 1 is the header, subsequent rows are the frame values.
    for r_idx, row in enumerate(dataframe_to_rows(df, index=False, header=True), 1):
        for c_idx, value in enumerate(row, 1):
            cell = ws.cell(row=r_idx, column=c_idx, value=value)
            cell.border = thin_border

            if r_idx == 1:  # Header row
                cell.font = header_font
                cell.fill = header_fill
                cell.alignment = Alignment(horizontal='center')

    # Auto-adjust column widths to the longest cell value (capped at 50).
    for column in ws.columns:
        max_length = 0
        column_letter = column[0].column_letter
        for cell in column:
            try:
                if len(str(cell.value)) > max_length:
                    max_length = len(str(cell.value))
            except:
                # NOTE(review): bare except deliberately ignores cells whose
                # values cannot be stringified for the width calculation.
                pass
        adjusted_width = min(max_length + 2, 50)
        ws.column_dimensions[column_letter].width = adjusted_width

    # Add summary sheet if predictions exist
    if include_summary and 'prediction' in df.columns:
        summary_ws = wb.create_sheet('Summary')

        # Prediction distribution
        pred_counts = df['prediction'].value_counts().sort_index()

        # Summary rows are built as a list-of-rows, then written cell by cell.
        summary_data = [
            ['Prediction Summary Report'],
            [''],
            ['Total Records', len(df)],
            [''],
            ['Prediction Distribution'],
        ]

        for pred_val, count in pred_counts.items():
            pct = count / len(df) * 100
            summary_data.append([f'Class {pred_val}', count, f'{pct:.1f}%'])

        # Add probability stats if available
        if 'proba_1' in df.columns:
            summary_data.extend([
                [''],
                ['Probability Statistics (Class 1)'],
                ['Mean', f"{df['proba_1'].mean():.4f}"],
                ['Median', f"{df['proba_1'].median():.4f}"],
                ['Min', f"{df['proba_1'].min():.4f}"],
                ['Max', f"{df['proba_1'].max():.4f}"],
            ])

        if 'threshold_used' in df.columns:
            # Assumes the threshold is constant across rows; only the first
            # value is reported.
            summary_data.extend([
                [''],
                ['Threshold Used', df['threshold_used'].iloc[0]],
            ])

        for r_idx, row in enumerate(summary_data, 1):
            for c_idx, value in enumerate(row, 1):
                cell = summary_ws.cell(row=r_idx, column=c_idx, value=value)
                if r_idx == 1:
                    cell.font = Font(bold=True, size=14)
                elif value in ['Prediction Distribution', 'Probability Statistics (Class 1)']:
                    cell.font = Font(bold=True)

        # Adjust column widths
        summary_ws.column_dimensions['A'].width = 30
        summary_ws.column_dimensions['B'].width = 15
        summary_ws.column_dimensions['C'].width = 15

    wb.save(path)
    return str(path)
195
+
196
+
197
def get_supported_formats() -> str:
    """Return a human-readable listing of the file formats load_data accepts."""
    formats = "CSV (.csv), Text (.txt, .tsv), Excel (.xlsx, .xls)"
    return "Supported formats: " + formats
debug_output/model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1df29e3b079f840118439eec948ecbafe781fc11d53971fbfe288fef7d919aab
3
+ size 2274753