"""Compare four classifiers on the extracted-feature corpus with 5-fold CV.

Reads ``Feature_Extracted_Corpus.csv``, predicts the ``group`` column from the
remaining feature columns, and prints the mean and standard deviation of
accuracy and macro-F1 across folds for each model.
"""

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import LinearSVC

# Single seed shared by the CV splitter and every estimator that accepts one,
# so repeated runs report the same numbers.
RANDOM_STATE = 42

# Load dataset
df = pd.read_csv("Feature_Extracted_Corpus.csv")

# Target
y = df["group"]

# Drop non-feature columns (identifiers, raw text, and both label columns)
X = df.drop(columns=["id", "text", "group", "grade"])

# Categorical columns that need one-hot encoding; every other column is
# passed through to the classifier unchanged.
categorical_cols = ["sentence_construction_type", "sentence_type"]

# Preprocessor
# OneHotEncoder emits a sparse matrix by default, and ColumnTransformer keeps
# the stacked result sparse whenever its density is below `sparse_threshold`.
# GaussianNB only accepts dense input, so force a dense result for all models.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ],
    remainder="passthrough",
    sparse_threshold=0,
)

# 5-fold stratified CV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Scoring
scoring = {
    'accuracy': 'accuracy',
    'f1_macro': make_scorer(f1_score, average='macro'),
}

# Models
# NOTE(review): the passthrough features are not scaled; LogisticRegression
# and LinearSVC may emit convergence warnings on unscaled data — consider
# adding a scaler step if that shows up in the logs.
models = {
    "Gaussian NB": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # Seeded for reproducibility: the dual solver shuffles the data randomly.
    "Linear SVM": LinearSVC(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
}

results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Preprocessing lives inside the pipeline so the encoder is fitted on the
    # training part of each fold only.
    pipeline = Pipeline([
        ("preprocessing", preprocessor),
        ("classifier", model),
    ])

    scores = cross_validate(
        pipeline,
        X,
        y,
        cv=skf,
        scoring=scoring,
    )

    # np.std uses its default ddof=0, i.e. the population standard deviation
    # over the five fold scores.
    results[name] = {
        "Accuracy Mean": np.mean(scores['test_accuracy']),
        "Accuracy Std": np.std(scores['test_accuracy']),
        "F1 Macro Mean": np.mean(scores['test_f1_macro']),
        "F1 Macro Std": np.std(scores['test_f1_macro']),
    }

print("\n=== FINAL RESULTS ===")
for model_name, metrics in results.items():
    print(f"\n{model_name}")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")