from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from category_encoders import OneHotEncoder
import gradio as gr

# Load the dataset
dataset = load_dataset("ombhojane/ckv5")
df = pd.DataFrame(dataset['train'])

# Preprocessing
# One-hot encode the two categorical inputs. With use_cat_names=True the
# generated columns are named "<feature>_<category>", which is the naming
# predict_service relies on when it builds a row for inference.
encoder = OneHotEncoder(cols=['Biodiversity', 'Existing Infrastructure'], use_cat_names=True)
scaler = StandardScaler()
numeric_cols = ['Land Size (acres)', 'Budget (INR)']

df_encoded = encoder.fit_transform(df)

# Splitting features and target
X = df_encoded.drop('Service', axis=1)
y = df_encoded['Service']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same transform to
# the held-out rows. Fitting on the full dataset before splitting would let
# test-set statistics leak into preprocessing.
# Explicit copies keep the column assignments below from writing into (or
# warning about) slices of X.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Model Training
# Seed the forest so the grid search selects the same model on every run,
# matching the seeded train/test split above.
model = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Exhaustive search over param_grid with 5-fold cross-validation on the
# training split, ranked by accuracy.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# Report accuracy on the rows that were held out of training and tuning.
test_accuracy = accuracy_score(y_test, best_model.predict(X_test))
print(f"Held-out accuracy: {test_accuracy:.3f}")

def predict_service(land_size, biodiversity, budget, infrastructure):
    """Predict the service for one farm description.

    Builds a single-row feature frame with the same columns, in the same
    order, as the training matrix ``X_train``, scales the two numeric
    features with the fitted ``scaler`` and asks ``best_model`` for a
    prediction.

    Args:
        land_size: Land size in acres (number).
        biodiversity: Category value of the 'Biodiversity' feature.
        budget: Budget in INR (number).
        infrastructure: Category value of the 'Existing Infrastructure'
            feature.

    Returns:
        The first (and only) element of ``best_model.predict`` for the row.

    Raises:
        ValueError: If any of the four inputs is ``None``, or a numeric
            input cannot be converted to ``float``.
    """
    provided = {
        'land_size': land_size,
        'biodiversity': biodiversity,
        'budget': budget,
        'infrastructure': infrastructure,
    }
    missing = [name for name, value in provided.items() if value is None]
    if missing:
        # Fail early with a readable message instead of an opaque error
        # from string matching or from the scaler.
        raise ValueError(f"Missing input value(s): {', '.join(missing)}")

    numeric_cols = ['Land Size (acres)', 'Budget (INR)']

    # Start from an all-zero float row keyed by the training columns so every
    # one-hot indicator defaults to "off".
    row = dict.fromkeys(X_train.columns, 0.0)
    row['Land Size (acres)'] = float(land_size)
    row['Budget (INR)'] = float(budget)

    # Switch on exactly one indicator per categorical feature, addressed by
    # its full "<feature>_<category>" column name. Exact names (rather than
    # substring tests) keep the two features independent even when they share
    # a label such as "High", or when one label is a substring of another.
    # A category with no matching column leaves that feature's indicators
    # at 0.
    for feature, category in (
        ('Biodiversity', biodiversity),
        ('Existing Infrastructure', infrastructure),
    ):
        indicator = f"{feature}_{category}"
        if indicator in row:
            row[indicator] = 1.0

    input_df = pd.DataFrame([row], columns=X_train.columns)

    # Apply the scaling fitted during preprocessing; passing a DataFrame
    # keeps the feature names the scaler was fitted with.
    input_df[numeric_cols] = scaler.transform(input_df[numeric_cols])

    prediction = best_model.predict(input_df)
    return prediction[0]

def gradio_interface(land_size, biodiversity, budget, infrastructure):
    """Gradio callback: run the prediction and wrap it in a display sentence.

    The four arguments are forwarded unchanged to ``predict_service``; the
    returned string is what the output text component shows.
    """
    prediction = predict_service(
        land_size=land_size,
        biodiversity=biodiversity,
        budget=budget,
        infrastructure=infrastructure,
    )
    return f"The predicted service is: {prediction}"


# Dropdown options are the distinct category values present in the dataset.
biodiversity_choices = df['Biodiversity'].unique().tolist()
infrastructure_choices = df['Existing Infrastructure'].unique().tolist()

# Widgets are listed in the positional order gradio_interface expects:
# land size, biodiversity, budget, infrastructure.
input_components = [
    gr.Number(label="Land Size (acres)"),
    gr.Dropdown(label="Biodiversity", choices=biodiversity_choices),
    gr.Number(label="Budget (INR)"),
    gr.Dropdown(label="Existing Infrastructure", choices=infrastructure_choices),
]

iface = gr.Interface(
    fn=gradio_interface,
    inputs=input_components,
    outputs=gr.Text(label="Predicted Service"),
    title="Agrotourism Service Planner",
    description="Please give land size, available budget, and existing infrastructure to find best suitable agrotourism service on your farm!",
)

# Launch the interface.
iface.launch()