# Alalay / models / train_models.py
# Jandayl's picture
# Deploy Filipino NLP
# b052258
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Read the feature table; the path is resolved relative to the current
# working directory.
df = pd.read_csv("Feature_Extracted_Corpus.csv")

# Feature matrix: every column except "id", "text", "group" and "grade".
# NOTE(review): "grade" is presumably dropped because it is tied to the
# target -- confirm against the corpus schema.
X = df.drop(["id", "text", "group", "grade"], axis="columns")

# Classification target.
y = df.loc[:, "group"]

# Categorical feature columns (one-hot encoded by the preprocessor).
categorical_cols = ["sentence_construction_type", "sentence_type"]
# Preprocessor: one-hot encode the categorical columns and pass every other
# column through unchanged.
#
# handle_unknown="ignore" encodes a category that was not seen during fit as
# all zeros instead of raising, so a validation fold containing a rare
# category still transforms.
#
# sparse_threshold=0 forces a dense output array. OneHotEncoder produces a
# sparse matrix by default, and ColumnTransformer keeps the stacked result
# sparse whenever its overall density is below the default threshold (0.3);
# GaussianNB only accepts dense input and would fail to fit in that case.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ],
    remainder="passthrough",
    sparse_threshold=0,
)
# Stratified 5-fold splitter; shuffling uses a fixed seed so the fold
# assignment is the same on every run.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics evaluated on each held-out fold. The dictionary keys become the
# "test_<key>" entries of the cross_validate() result.
scoring = dict(
    accuracy="accuracy",
    f1_macro=make_scorer(f1_score, average="macro"),
)
# Candidate classifiers, keyed by the label used in the printed report.
#
# Every estimator that takes a seed gets random_state=42, matching the CV
# splitter, so repeated runs produce the same scores. LinearSVC shuffles the
# data in its dual coordinate-descent solver and is therefore seeded too.
#
# NOTE(review): the non-categorical features are passed through unscaled;
# if LogisticRegression / LinearSVC emit convergence warnings, consider
# standardizing them -- confirm on the actual data.
models = {
    "Gaussian NB": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Linear SVM": LinearSVC(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
}
# Cross-validate every candidate model and keep, per model, the mean and
# standard deviation of each metric across the folds.
results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")

    # The preprocessing step is part of the pipeline, so it is fitted on the
    # training portion of each fold only.
    pipeline = Pipeline(
        steps=[
            ("preprocessing", preprocessor),
            ("classifier", model),
        ]
    )
    scores = cross_validate(pipeline, X, y, cv=skf, scoring=scoring)

    # Per-fold test scores, one value per split.
    fold_accuracy = scores["test_accuracy"]
    fold_f1_macro = scores["test_f1_macro"]

    results[name] = {
        "Accuracy Mean": np.mean(fold_accuracy),
        "Accuracy Std": np.std(fold_accuracy),
        "F1 Macro Mean": np.mean(fold_f1_macro),
        "F1 Macro Std": np.std(fold_f1_macro),
    }
# Print the summary: one section per model, one "<metric>: <value>" line per
# statistic, with values rounded to four decimal places.
print("\n=== FINAL RESULTS ===")
for model_name in results:
    print(f"\n{model_name}")
    for metric, value in results[model_name].items():
        print(f"{metric}: {value:.4f}")