import pandas as pd
import numpy as np
import joblib
import os

# Resolve sibling files relative to this module rather than the current
# working directory.
_MODULE_DIR = os.path.dirname(__file__)

# Load the trained model once, at import time.
rf_model = joblib.load(
    os.path.join(_MODULE_DIR, 'random_forest_model.joblib')
)

# Load the historical data set that supplies the per-year feature values.
data_path = os.path.join(_MODULE_DIR, 'sidama_data_2020_2024_combined.csv')
df_merged = pd.read_csv(data_path)

# Columns of df_merged that are passed to the model, in the order used when
# building its input frame.
features = [
    'ndvi_mean_mean_flowering',
    'ndvi_mean_mean_fruit',
    'ndvi_mean_max_season',
    'soil_moisture_mean_mean_season',
    'Rainfall_mean_fruit',
    'Humidity_mean_season',
    'elevation_mean',
]

# Feature values from the first 2024 row; reused for any year that has no row
# of its own in df_merged.
_rows_2024 = df_merged.loc[df_merged['year'] == 2024, features]
features_2024 = _rows_2024.iloc[0].to_dict()

def get_features(year):
    """Return the model feature values to use for ``year`` as a dict.

    If ``df_merged`` has at least one row for ``year``, the values of the
    first such row are returned, keyed by the names in ``features``.
    Otherwise (e.g. a future year) a copy of the 2024 values is returned.
    """
    matching_rows = df_merged.loc[df_merged['year'] == year, features]
    if matching_rows.empty:
        # No data for this year: fall back to the 2024 values. A copy is
        # returned so callers cannot mutate the shared module-level dict.
        return features_2024.copy()
    return matching_rows.iloc[0].to_dict()

def predict_yield(year, hectares, *, z_score=1.96, error_std=173.0):
    """Predict yield per hectare and in total, with an interval around it.

    Args:
        year: Year to predict for; passed to ``get_features`` to choose the
            model inputs.
        hectares: Area used to scale the per-hectare figures to totals.
            Must not be negative.
        z_score: Multiplier applied to ``error_std`` to get the interval
            half-width. The default 1.96 is the two-sided 95% quantile of
            the standard normal distribution.
        error_std: Per-hectare spread used for the interval. The default
            173.0 is the value that used to be hard-coded here; presumably
            the model's residual standard deviation in kg/ha -- TODO confirm
            against the training report.

    Returns:
        A dict with the keys ``year``, ``hectares``,
        ``predicted_yield_per_ha_kg``, ``total_predicted_yield_kg``,
        ``confidence_interval_per_ha_lower``,
        ``confidence_interval_per_ha_upper``,
        ``total_confidence_interval_lower`` and
        ``total_confidence_interval_upper``.

    Raises:
        ValueError: If ``hectares`` is negative. A negative area would
            otherwise produce a total interval whose lower bound is above
            its upper bound.
    """
    if hectares < 0:
        raise ValueError(f"hectares must be non-negative, got {hectares!r}")

    # Get feature values for the year
    feature_values = get_features(year)

    # Build the single-row input from the dict itself so each value is placed
    # under its own column name, independent of the dict's iteration order.
    input_data = pd.DataFrame([feature_values], columns=features)

    # Predict yield per hectare (first and only row of the prediction)
    yield_per_ha = rf_model.predict(input_data)[0]

    # Calculate total yield
    total_yield_kg = yield_per_ha * hectares

    # Symmetric interval around the per-hectare prediction
    half_width = z_score * error_std
    lower_bound = yield_per_ha - half_width
    upper_bound = yield_per_ha + half_width

    # The total interval is the per-hectare interval scaled by the area
    total_lower_bound = lower_bound * hectares
    total_upper_bound = upper_bound * hectares

    return {
        'year': year,
        'hectares': hectares,
        'predicted_yield_per_ha_kg': yield_per_ha,
        'total_predicted_yield_kg': total_yield_kg,
        'confidence_interval_per_ha_lower': lower_bound,
        'confidence_interval_per_ha_upper': upper_bound,
        'total_confidence_interval_lower': total_lower_bound,
        'total_confidence_interval_upper': total_upper_bound
    }

# Public API of this module: `from <module> import *` exports only
# predict_yield. Other names stay reachable through normal attribute access.
__all__ = ['predict_yield']

if __name__ == "__main__":
    # Demo run when executed as a script: predict for one example year and
    # area, then print the per-hectare and total figures with their intervals.
    year = 2025
    hectares = 2.5
    result = predict_yield(year, hectares)
    print(f"Predicted coffee yield for {result['year']} (per hectare): {result['predicted_yield_per_ha_kg']:.2f} kg/ha")
    print(f"Total predicted yield for {result['hectares']} hectares: {result['total_predicted_yield_kg']:.2f} kg")
    # The range separator is plain ASCII "to": the original character was
    # mis-encoded, and a dash would be ambiguous next to a negative bound.
    print(f"95% Confidence Interval (per hectare): {result['confidence_interval_per_ha_lower']:.2f} to {result['confidence_interval_per_ha_upper']:.2f} kg/ha")
    print(f"95% Confidence Interval (total): {result['total_confidence_interval_lower']:.2f} to {result['total_confidence_interval_upper']:.2f} kg")