|
|
"""Module for training different types of models for code comment classification."""
|
|
|
|
|
|
import argparse
|
|
|
import logging
|
|
|
import os
|
|
|
|
|
|
import dagshub
|
|
|
from datasets import Dataset
|
|
|
import mlflow
|
|
|
import yaml
|
|
|
|
|
|
from .utils import load_dataset_splits, parse_labels_column
|
|
|
|
|
|
# Configure the root logger at import time: INFO level, with timestamp,
# logger name and level prepended to every message.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Initialise the DagsHub integration for the project repository at import
# time. NOTE(review): with ``mlflow=True`` this presumably points MLflow
# tracking at the DagsHub repo - confirm against the dagshub client docs.
dagshub.init(
    mlflow=True,
    repo_name="TheClouds",
    repo_owner="se4ai2526-uniba",
)
|
|
|
|
|
|
|
|
|
def _train_setfit(
    lang, model_type, train_dataset, eval_dataset, model_output_path, params
):
    """Fine-tune a SetFit model, log the run to MLflow and save the model.

    ``params`` is forwarded verbatim to ``setfit.TrainingArguments``.
    """
    # Imported lazily: setfit is only used by this model type.
    from setfit import SetFitModel, Trainer, TrainingArguments

    mlflow.set_experiment("SetFit Training")
    # The context manager ends the run on exit, so no explicit
    # ``mlflow.end_run()`` call is needed afterwards.
    with mlflow.start_run(run_name=f"train-{lang}-{model_type}"):
        mlflow.log_param("language", lang)
        mlflow.log_param("model_type", model_type)

        model = SetFitModel.from_pretrained(
            "sentence-transformers/paraphrase-MiniLM-L6-v2",
            multi_target_strategy="multi-output",
        )
        args = TrainingArguments(**params)
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            # Rename the dataset columns to the names the trainer is given.
            column_mapping={"combo": "text", "labels": "label"},
        )

        mlflow.log_param("num_epochs", args.num_epochs)
        mlflow.log_param("num_iterations", args.num_iterations)

        trainer.train()

        eval_metrics = trainer.evaluate()
        for metric_name, metric_value in eval_metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        trainer.model.save_pretrained(model_output_path)

        mlflow.transformers.log_model(
            transformers_model=model_output_path,
            artifact_path=f"{lang}_setfit_model",
            task="text-classification",
        )


def _train_random_forest(
    lang, model_type, train_dataset, eval_dataset, model_output_path, params
):
    """Fit a TF-IDF + multi-output random forest pipeline and persist it.

    The keys ``ngram_range``, ``max_features``, ``min_df`` and ``max_df`` of
    ``params`` configure the vectorizer; every remaining key is passed to
    ``RandomForestClassifier``. The pipeline is written to
    ``f"{model_output_path}.joblib"`` and also logged to MLflow.
    """
    # Imported lazily: these packages are only used by this model type.
    import joblib
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.multioutput import MultiOutputClassifier
    from sklearn.pipeline import Pipeline

    mlflow.set_experiment("Random Forest Training")
    # The context manager ends the run on exit, so no explicit
    # ``mlflow.end_run()`` call is needed afterwards.
    with mlflow.start_run(run_name=f"train-{lang}-{model_type}"):
        mlflow.log_param("language", lang)
        mlflow.log_param("model_type", model_type)
        mlflow.log_params(params)

        # Work on a copy so popping the vectorizer keys never mutates the
        # dictionary owned by the caller.
        rf_params = dict(params)
        tfidf_params = {
            "ngram_range": tuple(rf_params.pop("ngram_range", (1, 1))),
            "max_features": rf_params.pop("max_features", None),
            "min_df": rf_params.pop("min_df", 1),
            "max_df": rf_params.pop("max_df", 1.0),
        }

        # Defaults first, so ``params`` may override ``random_state`` or
        # ``class_weight`` instead of triggering a duplicate-keyword TypeError.
        forest_kwargs = {
            "random_state": 42,
            "class_weight": "balanced",
            **rf_params,
        }

        pipeline = Pipeline(
            [
                ("tfidf", TfidfVectorizer(**tfidf_params)),
                (
                    "clf",
                    MultiOutputClassifier(RandomForestClassifier(**forest_kwargs)),
                ),
            ]
        )

        X_train = train_dataset["combo"]
        y_train = np.array(train_dataset["labels"])
        pipeline.fit(X_train, y_train)

        X_test = eval_dataset["combo"]
        y_test = np.array(eval_dataset["labels"])
        # The pipeline's own ``score`` is what gets logged as "accuracy".
        score = pipeline.score(X_test, y_test)
        mlflow.log_metric("accuracy", score)

        # ``dirname`` is empty for a bare file name; ``os.makedirs("")`` would
        # raise, so only create the directory when there is one.
        output_dir = os.path.dirname(model_output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        joblib.dump(pipeline, f"{model_output_path}.joblib")

        mlflow.sklearn.log_model(
            sk_model=pipeline, artifact_path=f"{lang}_random_forest_model"
        )


def _train_transformer(
    lang, model_type, train_dataset, eval_dataset, model_output_path, params
):
    """Train via the project's ``TransformerTrainer`` and log final metrics.

    ``train_dataset`` and ``eval_dataset`` are accepted only to keep the same
    signature as the other trainers; they are not used here because the
    trainer is configured with its own data directories.
    """
    # Imported lazily: the transformer module is only used by this model type.
    from .transformer import (
        TransformerConfig,
        TransformerTrainer,
    )

    mlflow.set_experiment("Transformer Training")
    # The context manager ends the run on exit, so no explicit
    # ``mlflow.end_run()`` call is needed afterwards.
    with mlflow.start_run(run_name=f"train-{lang}-{model_type}"):
        mlflow.log_param("language", lang)
        mlflow.log_param("model_type", model_type)
        mlflow.log_params(params)

        cfg = TransformerConfig(
            lang=lang,
            raw_data_dir="data/raw",
            processed_data_dir="data/processed/transformer",
            model_output_path=model_output_path,
            pretrained_model_name=params.get(
                "pretrained_model_name", "microsoft/codebert-base"
            ),
            max_length=params.get("max_length", 128),
            batch_size=params.get("batch_size", 16),
            lr=params.get("lr", 2e-5),
            num_epochs=params.get("num_epochs", 5),
            warmup_ratio=params.get("warmup_ratio", 0.1),
            pos_weight_cap=params.get("pos_weight_cap", 30.0),
            threshold=params.get("threshold", 0.5),
            preprocessing=params.get("preprocessing", False),
            preprocessing_factor=params.get("preprocessing_factor", 1.0),
        )

        logger.info(
            "Starting transformer training for language '%s' with config: %s",
            lang,
            cfg,
        )

        trainer = TransformerTrainer(cfg)
        metrics = trainer.run()

        logger.info("Final transformer metrics for %s: %s", lang, metrics)

        for name, value in metrics.items():
            mlflow.log_metric(f"final_{name}", value)


# Maps each supported ``model_type`` to the helper that trains it.
_TRAINERS = {
    "setfit": _train_setfit,
    "random_forest": _train_random_forest,
    "transformer": _train_transformer,
}


def train_model(lang, model_type, data_path, model_output_path, params):
    """Train and save a model for a specific language and model type.

    Args:
        lang: Language key; the splits ``f"{lang}_train"`` and
            ``f"{lang}_test"`` are read from the loaded dataset.
        model_type: One of ``"setfit"``, ``"random_forest"`` or
            ``"transformer"``.
        data_path: Location handed to ``load_dataset_splits``.
        model_output_path: Destination for the trained model. The random
            forest trainer appends ``".joblib"`` to it.
        params: Hyperparameters for the chosen model type. The mapping is
            copied and never modified; ``None`` is treated as empty.

    Raises:
        ValueError: If ``model_type`` is not supported. This is checked
            before any data is loaded.
    """
    # Fail fast: reject an unknown model type before doing any expensive work.
    trainer_fn = _TRAINERS.get(model_type)
    if trainer_fn is None:
        raise ValueError(f"Unsupported model_type: {model_type}")

    # Shallow copy so no trainer can mutate the caller's dictionary.
    params = dict(params or {})

    print(f"--- Starting training for language: {lang} with model: {model_type} ---")

    # NOTE(review): the splits are loaded for every model type, including
    # "transformer", which does not consume them. Kept as-is in case
    # ``load_dataset_splits`` has side effects that later steps rely on.
    ds = load_dataset_splits(data_path)

    train_df = ds[f"{lang}_train"]
    eval_df = ds[f"{lang}_test"]

    train_df = parse_labels_column(train_df)
    eval_df = parse_labels_column(eval_df)

    train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
    eval_dataset = Dataset.from_pandas(eval_df, preserve_index=False)

    trainer_fn(
        lang, model_type, train_dataset, eval_dataset, model_output_path, params
    )

    print(f"Model for {lang}-{model_type} saved to {model_output_path}")
|
|
|
|
|
|
|
|
|
def main():
    """Command-line entry point: read hyperparameters and train one model.

    Parses ``--lang`` and ``--model_type``, loads the matching section of
    ``params.yaml`` from the current working directory and calls
    ``train_model`` with the fixed ``data/raw`` input directory and a
    ``models/<lang>/<model_type>`` output path.

    Raises:
        KeyError: If ``params.yaml`` has no section for the model type.
    """
    parser = argparse.ArgumentParser(
        description="Train a code comment classification model."
    )
    parser.add_argument(
        "--lang",
        type=str,
        required=True,
        help="Language whose train/test splits are used.",
    )
    parser.add_argument(
        "--model_type",
        type=str,
        required=True,
        # Reject unknown model types at the CLI instead of failing later
        # with a bare KeyError on the params lookup.
        choices=("setfit", "random_forest", "transformer"),
        help="Kind of model to train.",
    )
    args = parser.parse_args()

    # Explicit encoding so parsing does not depend on the platform default.
    with open("params.yaml", "r", encoding="utf-8") as f:
        # An empty file parses to None; treat it as having no sections.
        all_params = yaml.safe_load(f) or {}

    if args.model_type not in all_params:
        raise KeyError(f"No '{args.model_type}' section found in params.yaml")

    # A section that is present but empty parses to None; treat it as "no
    # overrides". Copy so the parsed YAML structure is left untouched.
    model_params = dict(all_params[args.model_type] or {})

    train_model(
        lang=args.lang,
        model_type=args.model_type,
        data_path="data/raw",
        model_output_path=f"models/{args.lang}/{args.model_type}",
        params=model_params,
    )


if __name__ == "__main__":
    main()
|
|
|
|