import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Read the feature-extracted corpus into a DataFrame
df = pd.read_csv("Feature_Extracted_Corpus.csv")

# Columns kept out of the feature matrix (this includes the target itself)
non_feature_cols = ["id", "text", "group", "grade"]

# Feature matrix: every remaining column of the corpus
X = df.drop(columns=non_feature_cols)

# Classification target
y = df["group"]

# Categorical columns that are one-hot encoded before modelling
categorical_cols = ["sentence_construction_type", "sentence_type"]

# Preprocessor: one-hot encode the categorical columns and pass every other
# column of X through unchanged.
# - handle_unknown="ignore": a category that was not seen while fitting on the
#   training folds is encoded as all zeros instead of raising at predict time.
# - sparse_threshold=0: always emit a dense array. OneHotEncoder produces a
#   sparse matrix, and with the default threshold (0.3) the stacked output can
#   stay sparse, which GaussianNB cannot consume.
# NOTE(review): the passthrough columns are not scaled; LogisticRegression and
# LinearSVC may converge slowly on unscaled features -- confirm this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
    ],
    remainder="passthrough",
    sparse_threshold=0,
)

# Metrics reported for every model; each key shows up as "test_<key>" in the
# dictionary returned by cross_validate
scoring = dict(
    accuracy="accuracy",
    f1_macro=make_scorer(f1_score, average="macro"),
)

# Stratified 5-fold splitter, shuffled with a fixed seed so that every model
# is evaluated on the same folds
skf = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

# Models to compare, keyed by the display name used in the final report.
# Every estimator that accepts a seed and can behave stochastically is pinned
# to random_state=42 so repeated runs give the same scores (the CV splitter is
# seeded the same way).
models = {
    "Gaussian NB": GaussianNB(),
    # max_iter raised above the library default to give the solver more room
    # to converge.
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # LinearSVC shuffles the data when it uses the dual solver; seeding it
    # keeps results reproducible regardless of which solver is selected.
    "Linear SVM": LinearSVC(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
}

# Per-model summary: display name -> mean / std of each metric across folds
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Preprocessing is part of the pipeline, so it is re-fitted on the
    # training portion of every fold.
    pipeline = Pipeline(
        steps=[("preprocessing", preprocessor), ("classifier", model)]
    )

    scores = cross_validate(pipeline, X, y, cv=skf, scoring=scoring)

    # Per-fold test scores for the two metrics declared in `scoring`
    acc_folds = scores["test_accuracy"]
    f1_folds = scores["test_f1_macro"]

    results[name] = {
        "Accuracy Mean": np.mean(acc_folds),
        "Accuracy Std": np.std(acc_folds),
        "F1 Macro Mean": np.mean(f1_folds),
        "F1 Macro Std": np.std(f1_folds),
    }

# Print the summary of every model, one metric per line with four decimals
print("\n=== FINAL RESULTS ===")
for model_name in results:
    metrics = results[model_name]
    print(f"\n{model_name}")
    for metric in metrics:
        value = metrics[metric]
        print(f"{metric}: {value:.4f}")