from fastapi import FastAPI, Query, HTTPException
from pydantic import BaseModel
import numpy as np
import pandas as pd
import xgboost as xgb
import joblib
import pickle
import os
from typing import List, Optional

# ASGI application object that every route and the startup hook in this
# module attach to. The keyword arguments are FastAPI's API metadata.
app = FastAPI(
    version="1.0.0",
    title="TerraForm AI",
    description="AI model for predicting locations suitable for afforestation",
)

# Model artifacts live in a scratch directory; make sure it exists before
# anything tries to read from or write to it.
_MODEL_DIR = '/tmp/models'
os.makedirs(_MODEL_DIR, exist_ok=True)

# Pickle files holding the trained classifier and its feature scaler.
MODEL_PATH = f'{_MODEL_DIR}/model.pkl'
SCALER_PATH = f'{_MODEL_DIR}/scaler.pkl'

# Module-level state shared by the request handlers. Each name stays None
# until the corresponding object has been loaded or trained.
data = model = scaler = None

# Dataset columns used as model inputs, in the order used for both training
# and prediction.
features = [
    'Average Annual Rainfall (inches)',
    'Soil Suitability (0 to 1)',
    'Wildlife Benefit Potential (0 to 1)',
    'Population',
    'Area available for afforestation (acres)',
    'Lack of tree cover',
]

# Best-effort eager load at import time. Whatever is missing on disk is
# simply skipped, and the first failure aborts the remaining loads; in both
# cases the affected globals keep their previous value and the startup event
# gets another chance to load or build them.
# NOTE: pickle.load runs code embedded in the file, so the model directory
# must only be writable by trusted processes.
try:
    if os.path.exists('data.csv'):
        data = pd.read_csv('data.csv')

    if os.path.exists(MODEL_PATH):
        with open(MODEL_PATH, 'rb') as model_file:
            model = pickle.load(model_file)

    if os.path.exists(SCALER_PATH):
        with open(SCALER_PATH, 'rb') as scaler_file:
            scaler = pickle.load(scaler_file)
except Exception as load_error:
    print(f"Error loading data or model: {load_error}")

# Request body for POST /predict: the six raw measurements describing one
# candidate location. Each field is only declared as a float; the ranges
# noted below are not enforced by this model.
class LocationInput(BaseModel):
    rainfall: float  # average annual rainfall, in inches
    soil_suitability: float  # soil suitability rating, nominally 0 to 1
    wildlife_potential: float  # wildlife benefit potential, nominally 0 to 1
    population: float  # population count
    area: float  # area available for afforestation, in acres
    lack_of_tree_cover: float  # measure of lack of tree cover, nominally 0 to 1

# Response body for POST /predict.
class LocationResponse(BaseModel):
    suitability: str  # "Good" when the model predicts class 1, otherwise "Not Good"
    probability: float  # model's predicted probability for class 1

@app.get("/")
def root():
    """Return a short status message that points clients at the /docs page."""
    status_message = "TerraForm AI API is running. Use /docs to explore the API."
    return {"message": status_message}

@app.post("/predict", response_model=LocationResponse)
async def predict_suitability(location: LocationInput):
    """
    Predict the suitability of a location for afforestation based on provided parameters.

    - **rainfall**: Average annual rainfall in inches
    - **soil_suitability**: Soil suitability rating (0 to 1)
    - **wildlife_potential**: Wildlife benefit potential (0 to 1)
    - **population**: Population count
    - **area**: Area available for afforestation in acres
    - **lack_of_tree_cover**: Measure of lack of tree cover (0 to 1)

    Returns the predicted label ("Good" or "Not Good") together with the
    model's probability for the "Good" class.

    Responds with 503 when no model has been loaded or trained yet, and with
    500 when the prediction itself fails.
    """
    # An unloaded model is an availability problem, not an internal error, so
    # report it the same way the other endpoints do. The check sits outside
    # the try block so the broad handler below cannot re-wrap it as a 500.
    if model is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        # Single-row frame whose columns match the training feature names.
        new_location = pd.DataFrame([[
            location.rainfall,
            location.soil_suitability,
            location.wildlife_potential,
            location.population,
            location.area,
            location.lack_of_tree_cover
        ]], columns=features)

        # Apply the same scaling used at training time when a scaler exists;
        # otherwise hand the raw values to the model.
        if scaler is not None:
            new_location_scaled = scaler.transform(new_location)
        else:
            new_location_scaled = new_location.values

        # Hard label plus the probability of the positive class (column 1).
        prediction = model.predict(new_location_scaled)[0]
        probability = model.predict_proba(new_location_scaled)[0][1]

        suitability = "Good" if prediction == 1 else "Not Good"

        return LocationResponse(
            suitability=suitability,
            probability=float(probability)
        )
    except Exception as e:
        # Top-level boundary for this endpoint: log the full traceback to the
        # server output and surface a generic 500 to the client.
        import traceback
        print(f"Error in prediction: {str(e)}")
        print(traceback.format_exc())
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}") from e

@app.get("/locations/{state}", response_model=List[dict])
async def get_state_locations(state: str):
    """
    List the locations in the given state that the model rates as suitable.

    Each entry carries the city name and the model's probability for the
    suitable class, ordered from most to least probable. The list is empty
    when the state has no rows in the dataset or none of its rows is
    predicted suitable. Responds with 503 while the dataset or the model is
    not loaded.
    """
    if not (data is not None and model is not None):
        raise HTTPException(status_code=503, detail="Data or model not loaded")

    # Work on a private copy so the prediction columns added below never
    # touch the shared dataset.
    rows = data.loc[data["State"] == state].copy()
    if rows.empty:
        return []

    # Model inputs: scaled when a scaler is available, raw values otherwise.
    inputs = rows[features]
    model_input = inputs.values if scaler is None else scaler.transform(inputs)

    # Hard labels, plus the probability of the positive class (column 1).
    rows["Prediction"] = model.predict(model_input)
    rows["Probability"] = model.predict_proba(model_input)[:, 1]

    # Keep only the rows predicted as suitable (label 1).
    suitable = rows[rows["Prediction"] == 1]
    if suitable.empty:
        return []

    ranked = suitable[["City", "Probability"]].sort_values(
        by="Probability", ascending=False
    )
    return ranked.to_dict(orient="records")

@app.get("/states", response_model=List[str])
async def get_states():
    """
    Get a list of all available states in the dataset.

    The names are de-duplicated and sorted. Rows whose State value is
    missing are ignored. Responds with 503 while the dataset is not loaded.
    """
    if data is None:
        raise HTTPException(status_code=503, detail="Data not loaded")

    # Blank State cells are read from CSV as NaN (a float). Sorting them
    # together with strings raises TypeError, and NaN is not a valid item for
    # the List[str] response, so drop them first.
    return sorted(data["State"].dropna().unique().tolist())

def _min_max_normalize(column):
    """Rescale a numeric pandas Series linearly by its own minimum and maximum."""
    lowest = column.min()
    return (column - lowest) / (column.max() - lowest)


def _label_training_data(frame):
    """Return a copy of ``frame`` with the heuristic score and 0/1 label columns added."""
    # Copy first so the helper columns never leak into the caller's frame.
    labelled = frame.copy()

    labelled['Normalized Rainfall'] = _min_max_normalize(
        labelled['Average Annual Rainfall (inches)']
    )
    labelled['Normalized Population'] = _min_max_normalize(labelled['Population'])
    labelled['Normalized Area'] = _min_max_normalize(
        labelled['Area available for afforestation (acres)']
    )

    # Weighted heuristic score; population is the only term that counts
    # against a location.
    labelled['afforestation_score'] = (
        0.3 * labelled['Normalized Rainfall'] +
        0.35 * labelled['Soil Suitability (0 to 1)'] +
        0.1 * labelled['Wildlife Benefit Potential (0 to 1)'] -
        0.08 * np.sqrt(labelled['Normalized Population']) +
        0.07 * labelled['Normalized Area'] +
        0.1 * labelled['Lack of tree cover']
    )

    # Locations scoring above the threshold are labelled 1 (good), the rest 0.
    raw_threshold = 0.5
    labelled["good_for_afforestation"] = (
        labelled["afforestation_score"] > raw_threshold
    ).astype(int)
    return labelled


def _train_model_and_scaler(frame):
    """Fit a MinMaxScaler and an XGBoost classifier on ``frame``; return ``(model, scaler)``."""
    # Imported here, as in the original code, so scikit-learn is only needed
    # when training actually runs.
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler

    print(f"Data shape: {frame.shape}")
    print("Preparing data for model training...")
    labelled = _label_training_data(frame)

    print("Splitting data for training...")
    # Only the 80% training portion is used; the held-out 20% is discarded.
    # The split is kept so the fitted model does not change.
    X_train, _, y_train, _ = train_test_split(
        labelled[features],
        labelled['good_for_afforestation'],
        test_size=0.2,
        random_state=42,
    )

    print("Scaling data...")
    fitted_scaler = MinMaxScaler()
    X_train_scaled = fitted_scaler.fit_transform(X_train)

    print("Training model...")
    classifier = xgb.XGBClassifier(
        max_depth=4,
        learning_rate=0.1,
        n_estimators=100,
        subsample=0.9,
        objective="binary:logistic",
        random_state=42
    )
    classifier.fit(X_train_scaled, y_train)

    return classifier, fitted_scaler


@app.on_event("startup")
async def startup_event():
    """Load the dataset, then load or train the model and scaler.

    Steps, each of which logs to stdout and stops on failure without raising:

    1. Read ``data.csv`` unless it was already loaded at import time.
    2. If either pickle file is missing, train a fresh model and scaler,
       publish them to the module globals and save both to disk.
    3. Otherwise load whichever of the two is not yet in memory.

    Training works on a copy of the dataset, so the served ``data`` frame has
    the same columns whether or not training ran. The globals are replaced
    only after fitting succeeds, so a failed run never leaves a half-built
    model visible to the request handlers.
    """
    global data, model, scaler

    print("Starting up TerraForm AI application...")

    # Check if data file exists
    if not os.path.exists('data.csv'):
        print("Error: data.csv file not found!")
        return

    # Load data if not loaded yet
    if data is None:
        try:
            data = pd.read_csv('data.csv')
            print("Successfully loaded data.csv")
        except Exception as e:
            print(f"Failed to load data.csv: {e}")
            return

    # Check if model and scaler need to be created
    if not os.path.exists(MODEL_PATH) or not os.path.exists(SCALER_PATH):
        print("Training model and creating necessary files...")

        try:
            trained_model, trained_scaler = _train_model_and_scaler(data)

            # Publish before saving: if writing the files fails, the service
            # can still answer requests with the in-memory objects.
            model, scaler = trained_model, trained_scaler

            print(f"Saving model to {MODEL_PATH} and scaler to {SCALER_PATH}...")
            # Save the model and scaler to the temporary directory
            with open(MODEL_PATH, 'wb') as f:
                pickle.dump(model, f)
            with open(SCALER_PATH, 'wb') as f:
                pickle.dump(scaler, f)

            print("Model and scaler saved successfully!")
        except Exception as e:
            print(f"Error during model training: {e}")
            import traceback
            traceback.print_exc()
            return
    else:
        # Load model and scaler if they exist but weren't loaded.
        # NOTE(security): pickle.load runs code embedded in the file, so the
        # model directory must only be writable by trusted processes.
        if model is None:
            try:
                with open(MODEL_PATH, 'rb') as f:
                    model = pickle.load(f)
                print(f"Successfully loaded existing model from {MODEL_PATH}")
            except Exception as e:
                print(f"Failed to load model: {e}")

        if scaler is None:
            try:
                with open(SCALER_PATH, 'rb') as f:
                    scaler = pickle.load(f)
                print(f"Successfully loaded existing scaler from {SCALER_PATH}")
            except Exception as e:
                print(f"Failed to load scaler: {e}")

    print("Application startup completed successfully!")