"""Compare several classifiers on the feature-extracted corpus.

Reads ``Feature_Extracted_Corpus.csv``, predicts the ``group`` column from the
remaining feature columns, evaluates each model with stratified 5-fold
cross-validation, and prints the mean and standard deviation of accuracy and
macro-averaged F1 per model.
"""

import pandas as pd
import numpy as np


from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# --- Data -------------------------------------------------------------------
df = pd.read_csv("Feature_Extracted_Corpus.csv")

# Target label.
y = df["group"]

# Everything except the listed columns is used as a feature.
# NOTE(review): presumably "id"/"text" are identifiers/raw input and "grade"
# is excluded to avoid leaking label information -- confirm with the dataset.
X = df.drop(columns=["id", "text", "group", "grade"])

# Columns that need one-hot encoding; all other columns are passed through
# unchanged.
categorical_cols = ["sentence_construction_type", "sentence_type"]

# --- Preprocessing ----------------------------------------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ],
    remainder="passthrough",
    # OneHotEncoder emits a sparse matrix by default, and ColumnTransformer
    # keeps the stacked result sparse when its density is low. GaussianNB
    # does not accept sparse input, so always densify the output.
    sparse_threshold=0,
)

# --- Evaluation protocol ----------------------------------------------------
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = {
    "accuracy": "accuracy",
    "f1_macro": make_scorer(f1_score, average="macro"),
}

# --- Candidate models -------------------------------------------------------
models = {
    "Gaussian NB": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # Seeded like the splitter and the forest so repeated runs of the script
    # report the same scores.
    "Linear SVM": LinearSVC(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
}

# Maps model name -> {metric label: value}.
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # The preprocessor is part of the pipeline, so it is fitted on the
    # training portion of each fold only.
    pipeline = Pipeline([
        ("preprocessing", preprocessor),
        ("classifier", model),
    ])

    scores = cross_validate(
        pipeline,
        X,
        y,
        cv=skf,
        scoring=scoring,
    )

    results[name] = {
        "Accuracy Mean": np.mean(scores["test_accuracy"]),
        "Accuracy Std": np.std(scores["test_accuracy"]),
        "F1 Macro Mean": np.mean(scores["test_f1_macro"]),
        "F1 Macro Std": np.std(scores["test_f1_macro"]),
    }

# --- Report -----------------------------------------------------------------
print("\n=== FINAL RESULTS ===")
for model_name, metrics in results.items():
    print(f"\n{model_name}")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")