"""
Prepare Milk Spoilage Classification Model for Hugging Face Deployment

This script:
1. Loads training data from CSV files
2. Trains a RandomForest model with tuned hyperparameters
3. Exports model artifacts (model.joblib, config.json, requirements.txt, README.md)
"""

import json
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split


def load_and_prepare_data(data_dir="data"):
    """Load the train/test CSV files and split them into features and labels.

    Reads ``train_df.csv`` and ``test_df.csv`` from ``data_dir``, keeps only
    the six SPC/TGN feature columns plus the ``spoilagetype`` target, and
    drops every row that has a missing value in any of those columns.

    Args:
        data_dir: Directory containing ``train_df.csv`` and ``test_df.csv``.
            Defaults to ``"data"``, the location the script has always used.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test, feature_cols)`` where the
        ``X_*`` values are DataFrames restricted to ``feature_cols``, the
        ``y_*`` values are the matching ``spoilagetype`` Series, and
        ``feature_cols`` is the list of feature column names in model order.

    Raises:
        FileNotFoundError: If either CSV file is missing (raised by pandas).
        KeyError: If a required column is absent from either CSV file
            (raised by the pandas column selection).
    """
    print("Loading training data...")
    data_path = Path(data_dir)
    train_df = pd.read_csv(data_path / "train_df.csv")
    test_df = pd.read_csv(data_path / "test_df.csv")

    feature_cols = ['SPC_D7', 'SPC_D14', 'SPC_D21', 'TGN_D7', 'TGN_D14', 'TGN_D21']
    target_col = 'spoilagetype'
    selected_cols = feature_cols + [target_col]

    # Drop incomplete rows only after narrowing to the columns we use, so a
    # NaN in an unrelated column does not discard an otherwise usable sample.
    train_set = train_df[selected_cols].dropna()
    test_set = test_df[selected_cols].dropna()

    X_train = train_set[feature_cols]
    y_train = train_set[target_col]
    X_test = test_set[feature_cols]
    y_test = test_set[target_col]

    print(f"Training samples: {len(X_train)}")
    print(f"Test samples: {len(X_test)}")

    return X_train, y_train, X_test, y_test, feature_cols


def train_model(X_train, y_train):
    """Fit a RandomForestClassifier on the training data and return it.

    The hyperparameters are fixed to the values recorded as the best
    GridSearchCV result in the accompanying notebook; ``random_state`` is
    pinned so repeated runs build the same forest.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with ``X_train``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    print("\nTraining RandomForest model...")

    # Best hyperparameters from GridSearchCV in notebook
    best_params = {
        "n_estimators": 100,
        "max_depth": None,
        "min_samples_split": 5,
        "min_samples_leaf": 1,
        "random_state": 42,
    }
    classifier = RandomForestClassifier(**best_params)
    classifier.fit(X_train, y_train)

    print("Model training complete!")
    return classifier


def evaluate_model(model, X_test, y_test):
    """Score the model on the test set, print a report, and return accuracy.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True labels aligned with ``X_test``.

    Returns:
        The value computed by ``accuracy_score`` for the test predictions.
    """
    print("\nEvaluating model on test set...")
    predictions = model.predict(X_test)

    test_accuracy = accuracy_score(y_test, predictions)
    print(f"Test Accuracy: {test_accuracy:.4f}")

    # Per-class precision/recall/F1, printed with four decimal places.
    print("\nClassification Report:")
    report = classification_report(y_test, predictions, digits=4)
    print(report)

    return test_accuracy


def save_model(model, filepath="model/model.joblib"):
    """Serialize the trained model to ``filepath`` with joblib.

    The parent directory is created when it does not exist yet, so a fresh
    checkout without a ``model/`` folder no longer fails with
    ``FileNotFoundError`` at the very end of a training run.

    Args:
        model: The fitted model object to persist.
        filepath: Destination file. Defaults to ``"model/model.joblib"``.
    """
    print(f"\nSaving model to {filepath}...")
    # joblib.dump does not create missing directories itself.
    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, filepath)
    print("Model saved!")


def create_config(model, feature_cols, filepath="model/config.json"):
    """Write ``config.json`` describing the model, its inputs, and its classes.

    The hyperparameters are read from the estimator via ``get_params()`` so
    the config cannot drift from what was actually trained; any value the
    model does not expose falls back to the script's training defaults.
    Class labels are converted to plain Python values before serialization,
    so numeric label arrays are written correctly as well.

    Args:
        model: Fitted classifier exposing ``classes_`` (and normally
            ``get_params()``).
        feature_cols: Ordered feature column names the model expects.
        filepath: Destination file. Its parent directory is created when
            missing. Defaults to ``"model/config.json"``.
    """
    print(f"\nCreating {filepath}...")

    # Values used by the training step; only a fallback for models that do
    # not report a given parameter.
    default_hyperparameters = {
        "n_estimators": 100,
        "max_depth": None,
        "min_samples_split": 5,
        "min_samples_leaf": 1,
        "random_state": 42,
    }
    model_params = model.get_params() if hasattr(model, "get_params") else {}
    hyperparameters = {
        name: model_params.get(name, default)
        for name, default in default_hyperparameters.items()
    }

    config = {
        "model_type": "RandomForestClassifier",
        "framework": "sklearn",
        "task": "classification",
        "features": list(feature_cols),
        "feature_descriptions": {
            "SPC_D7": "Standard Plate Count at Day 7 (log CFU/mL)",
            "SPC_D14": "Standard Plate Count at Day 14 (log CFU/mL)",
            "SPC_D21": "Standard Plate Count at Day 21 (log CFU/mL)",
            "TGN_D7": "Total Gram-Negative count at Day 7 (log CFU/mL)",
            "TGN_D14": "Total Gram-Negative count at Day 14 (log CFU/mL)",
            "TGN_D21": "Total Gram-Negative count at Day 21 (log CFU/mL)"
        },
        # .tolist() turns NumPy scalars into native str/int values that the
        # json module can serialize; list() alone would keep NumPy scalars.
        "classes": np.asarray(model.classes_).tolist(),
        "class_descriptions": {
            "PPC": "Post-Pasteurization Contamination",
            "no spoilage": "No spoilage detected",
            "spore spoilage": "Spore-forming bacteria spoilage"
        },
        "hyperparameters": hyperparameters,
    }

    target = Path(filepath)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open('w', encoding="utf-8") as f:
        json.dump(config, f, indent=2)

    print("Config saved!")


def create_requirements(filepath="requirements.txt"):
    """Write the minimum-version package requirements needed for inference.

    Args:
        filepath: Destination file. Defaults to ``"requirements.txt"``.
    """
    print(f"\nCreating {filepath}...")

    minimum_versions = (
        "scikit-learn>=1.0",
        "joblib>=1.0",
        "numpy>=1.20",
        "pandas>=1.3",
    )
    # One requirement per line, each terminated by a newline.
    file_body = "".join(f"{spec}\n" for spec in minimum_versions)

    with open(filepath, 'w') as handle:
        handle.write(file_body)

    print("Requirements saved!")


def create_readme(model, accuracy, feature_cols, filepath="README.md"):
    """Write the Hugging Face model card (``README.md``).

    The card consists of YAML front matter, a description of the classes and
    input features, the measured test accuracy, usage snippets, and a
    "Model Details" section. The hyperparameters in that section are read
    from the estimator via ``get_params()`` so the card reflects the model
    that was actually trained; any value the model does not expose falls back
    to the script's training defaults.

    Args:
        model: The trained estimator. Only ``get_params()`` is used, and only
            when the object provides it.
        accuracy: Test accuracy as a fraction (e.g. ``0.9123``); rendered as
            a percentage with two decimals.
        feature_cols: Feature column names. Currently unused: the feature
            table in the card is static text. Kept for interface stability.
        filepath: Destination file. Defaults to ``"README.md"``.
    """
    print(f"\nCreating {filepath}...")

    # Prefer the estimator's own settings over hard-coded text so the card
    # cannot silently go stale when the training hyperparameters change.
    model_params = model.get_params() if hasattr(model, "get_params") else {}
    n_estimators = model_params.get("n_estimators", 100)
    max_depth = model_params.get("max_depth")
    min_samples_split = model_params.get("min_samples_split", 5)
    min_samples_leaf = model_params.get("min_samples_leaf", 1)
    max_depth_text = "None (unlimited)" if max_depth is None else str(max_depth)

    # NOTE: literal braces inside the embedded code samples are doubled
    # ({{ }}) because this is an f-string.
    readme_content = f"""---
license: mit
library_name: sklearn
tags:
  - sklearn
  - classification
  - random-forest
  - food-science
  - milk-quality
pipeline_tag: tabular-classification
---

# Milk Spoilage Classification Model

A Random Forest classifier for predicting milk spoilage type based on microbial count data.

## Model Description

This model classifies milk samples into three spoilage categories based on Standard Plate Count (SPC) and Total Gram-Negative (TGN) bacterial counts measured at days 7, 14, and 21 of shelf life.

### Classes

- **PPC**: Post-Pasteurization Contamination
- **no spoilage**: No spoilage detected
- **spore spoilage**: Spore-forming bacteria spoilage

### Input Features

| Feature | Description |
|---------|-------------|
| SPC_D7 | Standard Plate Count at Day 7 (log CFU/mL) |
| SPC_D14 | Standard Plate Count at Day 14 (log CFU/mL) |
| SPC_D21 | Standard Plate Count at Day 21 (log CFU/mL) |
| TGN_D7 | Total Gram-Negative count at Day 7 (log CFU/mL) |
| TGN_D14 | Total Gram-Negative count at Day 14 (log CFU/mL) |
| TGN_D21 | Total Gram-Negative count at Day 21 (log CFU/mL) |

## Performance

- **Test Accuracy**: {accuracy:.2%}

## Usage

### Using the Inference API

```python
import requests

API_URL = "https://api-inference.huggingface.co/models/chenhaoq87/MilkSpoilageClassifier"
headers = {{"Authorization": "Bearer YOUR_HF_TOKEN"}}

# Input: [SPC_D7, SPC_D14, SPC_D21, TGN_D7, TGN_D14, TGN_D21]
payload = {{"inputs": [[4.5, 5.2, 6.1, 3.2, 4.0, 4.8]]}}

response = requests.post(API_URL, headers=headers, json=payload)
print(response.json())
```

### Local Usage

```python
import joblib
import numpy as np

# Load the model
model = joblib.load("model.joblib")

# Prepare input features
# [SPC_D7, SPC_D14, SPC_D21, TGN_D7, TGN_D14, TGN_D21]
features = np.array([[4.5, 5.2, 6.1, 3.2, 4.0, 4.8]])

# Make prediction
prediction = model.predict(features)
probabilities = model.predict_proba(features)

print(f"Predicted class: {{prediction[0]}}")
print(f"Class probabilities: {{dict(zip(model.classes_, probabilities[0]))}}")
```

## Model Details

- **Model Type**: Random Forest Classifier
- **Framework**: scikit-learn
- **Number of Estimators**: {n_estimators}
- **Max Depth**: {max_depth_text}
- **Min Samples Split**: {min_samples_split}
- **Min Samples Leaf**: {min_samples_leaf}

## Citation

If you use this model, please cite the original research on milk spoilage classification.

## License

MIT License
"""

    # Model cards are read as UTF-8, so do not depend on the platform default.
    with open(filepath, 'w', encoding="utf-8") as f:
        f.write(readme_content)

    print("README saved!")


def main():
    """Run the whole pipeline: load data, train, evaluate, write artifacts.

    Every step uses its default paths, so the script expects to be run from
    the directory that contains ``data/``.
    """
    banner = "=" * 60
    print(banner)
    print("Milk Spoilage Classification Model - Artifact Preparation")
    print(banner)

    # Data -> fitted model -> held-out accuracy.
    X_train, y_train, X_test, y_test, feature_cols = load_and_prepare_data()
    model = train_model(X_train, y_train)
    accuracy = evaluate_model(model, X_test, y_test)

    # Persist everything the Hugging Face repository needs.
    save_model(model)
    create_config(model, feature_cols)
    create_requirements()
    create_readme(model, accuracy, feature_cols)

    print("\n" + banner)
    print("All artifacts created successfully!")
    print(banner)

    print("\nGenerated files:")
    generated_files = (
        "model/model.joblib",
        "model/config.json",
        "requirements.txt",
        "README.md",
    )
    for artifact in generated_files:
        print(f"  - {artifact}")

    print("\nNext step: Run 'python scripts/upload_to_hf.py' to upload to Hugging Face")


# Run the artifact pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()