Mastouri
committed on
Commit
·
8d416da
1
Parent(s):
98f1ca3
Improved Logistic Regression with hyperparameter tuning and TF-IDF enhancements
Browse files- logistic_reg.py +38 -36
logistic_reg.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
| 1 |
from datasets import load_dataset
|
| 2 |
import pandas as pd
|
| 3 |
-
import numpy as np
|
| 4 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 5 |
from sklearn.preprocessing import MultiLabelBinarizer
|
| 6 |
-
from sklearn.
|
| 7 |
-
|
| 8 |
-
from
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# Step 1: Load the Dataset Repository
|
| 11 |
dataset = load_dataset("meriemm6/commit-classification-dataset", data_files={"train": "training.csv", "validation": "validation.csv"})
|
|
@@ -27,43 +28,44 @@ validation_data['Ground truth'] = validation_data['Ground truth'].fillna("mainte
|
|
| 27 |
train_data['Ground truth'] = train_data['Ground truth'].apply(lambda x: x.split(', '))
|
| 28 |
validation_data['Ground truth'] = validation_data['Ground truth'].apply(lambda x: x.split(', '))
|
| 29 |
|
| 30 |
-
#
|
| 31 |
-
|
| 32 |
-
y_train_encoded = mlb.fit_transform(train_data['Ground truth'])
|
| 33 |
-
y_val_encoded = mlb.transform(validation_data['Ground truth'])
|
| 34 |
-
|
| 35 |
-
# Step 3: TF-IDF Vectorization (Increased Features)
|
| 36 |
-
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words="english")
|
| 37 |
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['Message'])
|
| 38 |
X_val_tfidf = tfidf_vectorizer.transform(validation_data['Message'])
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
-
# Step
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
# Step
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
objective="binary:logistic",
|
| 54 |
-
use_label_encoder=False,
|
| 55 |
-
eval_metric="logloss",
|
| 56 |
-
scale_pos_weight=scale_pos_weight[i], # Class weights
|
| 57 |
-
max_depth=6, # Reduced to prevent overfitting
|
| 58 |
-
learning_rate=0.03, # Lower learning rate for better generalization
|
| 59 |
-
n_estimators=300, # Increased estimators for better performance
|
| 60 |
-
subsample=0.8,
|
| 61 |
-
colsample_bytree=0.8,
|
| 62 |
-
min_child_weight=1 # Prevents overfitting on small datasets
|
| 63 |
-
)
|
| 64 |
-
model.fit(X_train_tfidf, y_train_encoded[:, i])
|
| 65 |
-
models.append(model)
|
| 66 |
|
| 67 |
-
|
| 68 |
-
for idx, model in enumerate(models):
|
| 69 |
-
dump(model, f"xgboost_model_label_{idx}.joblib")
|
|
|
|
| 1 |
from datasets import load_dataset
|
| 2 |
import pandas as pd
|
|
|
|
| 3 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 4 |
from sklearn.preprocessing import MultiLabelBinarizer
|
| 5 |
+
from sklearn.linear_model import LogisticRegression
|
| 6 |
+
from sklearn.multiclass import OneVsRestClassifier
|
| 7 |
+
from sklearn.metrics import classification_report, hamming_loss
|
| 8 |
+
from sklearn.model_selection import GridSearchCV
|
| 9 |
+
from joblib import dump
|
| 10 |
|
| 11 |
# Step 1: Load the Dataset Repository
|
| 12 |
dataset = load_dataset("meriemm6/commit-classification-dataset", data_files={"train": "training.csv", "validation": "validation.csv"})
|
|
|
|
| 28 |
train_data['Ground truth'] = train_data['Ground truth'].apply(lambda x: x.split(', '))
|
| 29 |
validation_data['Ground truth'] = validation_data['Ground truth'].apply(lambda x: x.split(', '))
|
| 30 |
|
| 31 |
+
# Step 3: TF-IDF Vectorization (Enhanced Features)
|
| 32 |
+
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1, 2))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['Message'])
|
| 34 |
X_val_tfidf = tfidf_vectorizer.transform(validation_data['Message'])
|
| 35 |
|
| 36 |
+
# Step 4: MultiLabel Encoding
|
| 37 |
+
mlb = MultiLabelBinarizer()
|
| 38 |
+
y_train_encoded = mlb.fit_transform(train_data['Ground truth'])
|
| 39 |
+
y_val_encoded = mlb.transform(validation_data['Ground truth'])
|
| 40 |
|
| 41 |
+
# Step 5: Hyperparameter Tuning for Logistic Regression
|
| 42 |
+
log_reg = LogisticRegression(class_weight='balanced', max_iter=5000, random_state=42)
|
| 43 |
+
multi_log_reg = OneVsRestClassifier(log_reg)
|
| 44 |
|
| 45 |
+
param_grid = {
|
| 46 |
+
'estimator__C': [0.1, 1, 10], # Regularization strength
|
| 47 |
+
'estimator__solver': ['lbfgs', 'liblinear'], # Optimizers
|
| 48 |
+
}
|
| 49 |
+
grid_search = GridSearchCV(
|
| 50 |
+
estimator=multi_log_reg,
|
| 51 |
+
param_grid=param_grid,
|
| 52 |
+
scoring='f1_weighted',
|
| 53 |
+
cv=3,
|
| 54 |
+
verbose=2,
|
| 55 |
+
n_jobs=-1
|
| 56 |
+
)
|
| 57 |
+
grid_search.fit(X_train_tfidf, y_train_encoded)
|
| 58 |
+
best_model = grid_search.best_estimator_
|
| 59 |
|
| 60 |
+
# Step 6: Validation Metrics
|
| 61 |
+
y_val_pred = best_model.predict(X_val_tfidf)
|
| 62 |
+
print("Validation Metrics:")
|
| 63 |
+
print(f"F1 Score: {classification_report(y_val_encoded, y_val_pred, target_names=mlb.classes_, zero_division=0)}")
|
| 64 |
+
print(f"Hamming Loss: {hamming_loss(y_val_encoded, y_val_pred):.4f}")
|
| 65 |
|
| 66 |
+
# Step 7: Save the Model and Preprocessing Artifacts
|
| 67 |
+
dump(best_model, "optimized_logistic_model.joblib") # Save the optimized Logistic Regression model
|
| 68 |
+
dump(tfidf_vectorizer, "tfidf_vectorizer.joblib") # Save the TF-IDF vectorizer
|
| 69 |
+
dump(mlb, "label_binarizer.joblib") # Save the MultiLabelBinarizer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
+
print("Optimized model and preprocessing files saved successfully.")
|
|
|
|
|
|