"""Train a multi-label commit-message classifier and save its artifacts.

Steps performed, in order:

1. Load the train/validation CSV splits of the commit-classification dataset.
2. Fill missing messages and labels, then split each label string into a list.
3. Vectorize messages with TF-IDF and binarize the label lists.
4. Grid-search a one-vs-rest logistic regression over ``C`` and ``solver``.
5. Print validation metrics and dump the model, vectorizer and binarizer.
"""
from datasets import load_dataset
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, hamming_loss
from sklearn.model_selection import GridSearchCV
from joblib import dump

# Column names and the placeholder values used for missing entries.
TEXT_COLUMN = "Message"
LABEL_COLUMN = "Ground truth"
DEFAULT_MESSAGE = "unknown"
DEFAULT_LABEL = "maintenance/other"


def _split_labels(raw: str) -> list:
    """Return the individual labels contained in a comma-separated string.

    Whitespace around each label is stripped and empty fragments are dropped,
    so ``"a, b"``, ``"a,b"`` and ``"a,  b "`` all yield ``["a", "b"]`` instead
    of producing spurious classes such as ``"a,b"``, ``" b"`` or ``""``.
    """
    return [part.strip() for part in raw.split(",") if part.strip()]


# Load both splits from the Hugging Face Hub.
dataset = load_dataset(
    "meriemm6/commit-classification-dataset",
    data_files={"train": "training.csv", "validation": "validation.csv"},
)

# Convert to pandas for column-wise cleaning.
train_data = dataset["train"].to_pandas()
validation_data = dataset["validation"].to_pandas()

# Apply identical cleaning to both splits: fill missing values, then turn the
# label string into a list of labels for the multi-label binarizer.
for frame in (train_data, validation_data):
    frame[TEXT_COLUMN] = frame[TEXT_COLUMN].fillna(DEFAULT_MESSAGE)
    frame[LABEL_COLUMN] = (
        frame[LABEL_COLUMN].fillna(DEFAULT_LABEL).apply(_split_labels)
    )

# TF-IDF features over unigrams and bigrams; the vocabulary is learned from
# the training split only and reused for validation.
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,
    stop_words='english',
    ngram_range=(1, 2),
)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data[TEXT_COLUMN])
X_val_tfidf = tfidf_vectorizer.transform(validation_data[TEXT_COLUMN])

# Encode label lists as a binary indicator matrix; classes come from the
# training split.
mlb = MultiLabelBinarizer()
y_train_encoded = mlb.fit_transform(train_data[LABEL_COLUMN])
y_val_encoded = mlb.transform(validation_data[LABEL_COLUMN])

# One independent, class-balanced logistic regression per label.
log_reg = LogisticRegression(
    class_weight='balanced',
    max_iter=5000,
    random_state=42,
)
multi_log_reg = OneVsRestClassifier(log_reg)

# The ``estimator__`` prefix routes each parameter to the wrapped
# LogisticRegression inside OneVsRestClassifier.
param_grid = {
    'estimator__C': [0.1, 1, 10],
    'estimator__solver': ['lbfgs', 'liblinear'],
}
# NOTE(review): the TF-IDF vectorizer above was fitted on the whole training
# split before cross-validation, so each CV fold has seen vocabulary/IDF
# statistics from its held-out part; CV scores may be slightly optimistic.
grid_search = GridSearchCV(
    estimator=multi_log_reg,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=3,
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train_tfidf, y_train_encoded)
best_model = grid_search.best_estimator_

# Evaluate the refitted best model on the held-out validation split.
y_val_pred = best_model.predict(X_val_tfidf)
validation_report = classification_report(
    y_val_encoded,
    y_val_pred,
    target_names=mlb.classes_,
    zero_division=0,
)
print("Validation Metrics:")
print(f"F1 Score: {validation_report}")
print(f"Hamming Loss: {hamming_loss(y_val_encoded, y_val_pred):.4f}")

# Persist everything needed to reproduce predictions at inference time: the
# model plus the fitted vectorizer and label binarizer.
dump(best_model, "logistic_model.joblib")
dump(tfidf_vectorizer, "tfidf_vectorizer.joblib")
dump(mlb, "label_binarizer.joblib")

print("Optimized model and preprocessing files saved successfully.")