import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import gradio as gr

# ----------------------------
# Load dataset
CSV_PATH = "expanded_test_score_dataset_filled.csv"
df = pd.read_csv(CSV_PATH)

# Make the headers identifier-like: trim surrounding whitespace, then turn
# spaces and hyphens into underscores.
df.columns = [
    header.strip().replace(" ", "_").replace("-", "_") for header in df.columns
]

# Echo the normalized names so they can be compared with feature_cols below.
print("Columns in dataset:", list(df.columns))

# Define features EXACTLY as they appear after normalization.
# The order of this list matters: it determines the column order of X and the
# order in which the UI sliders hand their values to predict().
# NOTE(review): several names ("Weighted", "Credits_Ea", "Field_Average_F",
# "HW_Hour") look truncated — confirm them against the "Columns in dataset"
# line printed at startup. Names that do not match are skipped with a warning.
feature_cols = [
    "Weighted",           # presumably the weighted GPA column — TODO confirm the real header
    "Credits_Ea",
    "AP",
    "Honors",
    "Teacher_Experience",
    "Study_Hours",
    "Confidence",
    "Procrastination",
    "Field_Average_F",
    "Study_Quality",
    "HW_Hour",
    "Participation",
    "Attendance",
    "Workday_Positivity",
    "Incentive",
    "Field_Proficiency"
]

# Ensure numeric
# Every feature column that exists is coerced to numbers; entries that cannot
# be parsed (or are missing) become 0. Columns absent from the CSV are
# reported and left out of the model.
for col in feature_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
    else:
        print(f"⚠️ Warning: column {col} not found in dataset")

# Target
if "Predicted_Test_Score" in df.columns:
    y = pd.to_numeric(df["Predicted_Test_Score"], errors="coerce")
    # Rows without a usable label are dropped instead of being labelled 0:
    # a made-up score of 0 would be learned as if it were a real result.
    labelled = y.notna()
    if not labelled.all():
        print(
            f"⚠️ Warning: dropping {int((~labelled).sum())} rows with a "
            "missing or non-numeric Predicted_Test_Score"
        )
else:
    # Keep the all-zero placeholder so the app still starts, but say so: a
    # model trained on it predicts 0 for every input.
    print(
        "⚠️ Warning: column Predicted_Test_Score not found in dataset; "
        "using an all-zero placeholder target"
    )
    y = pd.Series(0, index=df.index)
    labelled = y.notna()

# Features and target are filtered with the same mask so they stay aligned.
X = df.loc[labelled, [c for c in feature_cols if c in df.columns]]
y = y[labelled]

# Train/test split
# 20% of the rows are held out for evaluation; the fixed seed makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train model (fit() returns the fitted estimator itself)
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Accuracy: R² on the training rows first, then on the held-out rows
train_r2, test_r2 = (
    r2_score(split_y, model.predict(split_X))
    for split_X, split_y in ((X_train, y_train), (X_test, y_test))
)

# ----------------------------
# Prediction function
def predict(*vals):
    """Predict a test score for one set of feature values.

    Args:
        *vals: One value per model feature, in the same order as the columns
            the model was trained on.

    Returns:
        A tuple ``(prediction, train_r2, test_r2)``: the predicted score
        rounded to 2 decimals, followed by the module-level training and
        testing R² scores rounded to 3 decimals.
    """
    row = [list(vals)]
    # The model was fitted on a DataFrame with named columns. Passing a bare
    # list makes scikit-learn (>= 1.0) warn on every call that the input has
    # no valid feature names, so rebuild a one-row frame with the fitted
    # names. Older versions have no feature_names_in_; keep the plain list.
    fitted_names = getattr(model, "feature_names_in_", None)
    features = row if fitted_names is None else pd.DataFrame(row, columns=fitted_names)

    # Cast the NumPy scalar to a plain float before handing it to the UI.
    prediction = float(model.predict(features)[0])
    return round(prediction, 2), round(train_r2, 3), round(test_r2, 3)

# ----------------------------
# Gradio UI
def _slider_for(col):
    """Return a slider for feature ``col`` that spans the values seen in ``df``.

    A fixed 1-10 integer scale cannot represent features whose data lies
    outside that range or is fractional, so the range and step are derived
    from the column itself. The original 1-10 scale is kept as a fallback when
    the column has no usable spread (empty, constant or non-finite values).
    """
    import math  # local import: only this helper needs it

    values = pd.to_numeric(df[col], errors="coerce").dropna()
    # min()/max() of an empty series are NaN, which the finiteness check rejects.
    lo, hi = float(values.min()), float(values.max())
    if not (math.isfinite(lo) and math.isfinite(hi) and lo < hi):
        return gr.Slider(1, 10, step=1, label=col)

    if (values % 1 == 0).all():
        # Whole-number data: keep integer steps.
        return gr.Slider(int(lo), int(hi), step=1, label=col)

    # Fractional data: largest power of ten that still yields at least ~100
    # slider positions across the observed range.
    step = 10 ** math.floor(math.log10((hi - lo) / 100))
    return gr.Slider(lo, hi, step=step, label=col)


with gr.Blocks() as demo:
    gr.Markdown("# 📊 Expanded Test Score Predictor")

    # One slider per feature that exists in the data, in feature_cols order,
    # so the values reach predict() in the same order as the columns of X.
    inputs = [_slider_for(col) for col in feature_cols if col in df.columns]

    output_score = gr.Number(label="Predicted Test Score")
    output_train = gr.Number(label="Training R² Score")
    output_test = gr.Number(label="Testing R² Score")

    btn = gr.Button("Predict")
    btn.click(predict, inputs=inputs, outputs=[output_score, output_train, output_test])

demo.launch()