File size: 7,613 Bytes

04c2975

"""
Loads best params from optimization_summary.txt, refits the model once on the
train split, and appends a wall-time record to <MM_DD>_wall_clock_ml.jsonl in
the given logs directory.
"""
import json
import time
import joblib
import argparse
import re
import numpy as np
from pathlib import Path
from datetime import datetime
# Classification trainers
from train_ml import (
    load_split_data as load_split_cls,
    train_cuml_svc,
    train_cuml_elastic_net,
    train_xgb,
    train_svm,
)
# Regression trainers
from train_ml_regression import (
    load_split_data as load_split_reg,
    train_cuml_elasticnet_reg,
    train_svr_reg,
    train_xgb_reg,
)

# Lookup table used by detect_model_type: one (artifact filename, model_type,
# task) tuple per supported model.
# Entries are probed in list order and the first file that exists in the model
# directory wins, so the order matters when a directory holds several artifacts.
# A task of "auto" is not final: detect_model_type resolves it to "regression"
# when scaler.joblib sits next to the artifact, otherwise to "classification".
MODEL_FILE_MAP = [
    ("best_model_cuml_svc.joblib",  "svm_gpu",  "classification"),
    ("best_model_cuml_enet.joblib", "enet_gpu",  "auto"),
    ("best_model_svr.joblib",       "svr",       "regression"),
    ("best_model.joblib",           "svm",       "classification"),
    ("best_model.json",             "xgb",       "auto"),
]

def detect_model_type(model_dir: Path) -> tuple:
    """
    Identify the saved model in model_dir from its artifact filename.

    MODEL_FILE_MAP is scanned in order and the first artifact that exists
    decides the result. Entries whose task is "auto" are resolved by the
    presence of scaler.joblib: with a scaler the task is "regression" (and
    "xgb" becomes "xgb_reg"), without one it is "classification".

    Returns:
        (model_type, task)

    Raises:
        FileNotFoundError: none of the known artifact files exist in model_dir.
    """
    for filename, kind, task in MODEL_FILE_MAP:
        if not (model_dir / filename).exists():
            continue

        # Fixed-task entries need no further inspection.
        if task != "auto":
            return kind, task

        # "auto": a saved scaler marks a regression run.
        if not (model_dir / "scaler.joblib").exists():
            return kind, "classification"
        return ("xgb_reg" if kind == "xgb" else kind), "regression"

    raise FileNotFoundError(
        f"No recognised model file in {model_dir}. "
        f"Expected one of: {[f for f, _, _ in MODEL_FILE_MAP]}"
    )


def parse_best_params(model_dir: Path) -> dict:
    """
    Extracts the JSON object that follows 'Best params:' in optimization_summary.txt.

    The object is decoded in place, starting at the first '{' after the
    marker, so nested objects are handled and whatever comes after the block
    (a '=====' separator line, further text, or end of file) is ignored.

    Returns:
        The decoded best-params dict.

    Raises:
        FileNotFoundError: optimization_summary.txt is missing from model_dir.
        ValueError: no 'Best params:' marker followed by a '{' was found.
            json.JSONDecodeError (a ValueError subclass) is raised when the
            block after the marker is not valid JSON.
    """
    summary_path = model_dir / "optimization_summary.txt"
    if not summary_path.exists():
        raise FileNotFoundError(f"optimization_summary.txt not found in {model_dir}")

    text = summary_path.read_text()
    # Only locate where the object starts; the JSON decoder finds where it ends.
    marker = re.search(r"Best params:\s*(?=\{)", text)
    if not marker:
        raise ValueError(
            f"Could not find 'Best params:' JSON block in {summary_path}.\n"
            f"File contents:\n{text}"
        )
    # raw_decode parses exactly one JSON value at the given offset and does
    # not require anything specific (such as a separator line) after it.
    best_params, _end = json.JSONDecoder().raw_decode(text, marker.end())
    return best_params

def parse_objective_and_wt(model_dir: Path) -> tuple:
    """
    Derive (objective, wt) from the last two components of model_dir.

    Expected layout: .../training_classifiers/<objective>/<model>_<wt>/
    Example: hemolysis/svm_gpu_smiles -> objective=hemolysis, wt=smiles

    The folder name is matched case-insensitively against the suffixes
    "_chemberta", "_smiles" and "_wt"; when none matches, wt falls back to "wt".
    The objective is returned exactly as it appears in the path.
    """
    components = model_dir.parts
    folder_name = components[-1].lower()
    objective = components[-2]

    known_tags = ("chemberta", "smiles", "wt")
    embedding = next(
        (tag for tag in known_tags if folder_name.endswith(f"_{tag}")),
        "wt",  # default when the folder carries no recognised suffix
    )
    return objective, embedding

def refit_and_time(model_dir: Path, dataset_path: str) -> tuple:
    """
    Refit the model found in model_dir once with its stored best params and time it.

    The model type and task come from detect_model_type, the hyperparameters
    from parse_best_params. Loading the scaler and the dataset happens before
    the clock starts; the timed section covers the optional scaler transform
    of the train/val features plus the trainer call.

    Returns:
        (wall_s, model_type): elapsed seconds measured with time.perf_counter
        and the detected model type.

    Raises:
        ValueError: the detected model_type/task combination has no trainer.
    """
    model_type, task = detect_model_type(model_dir)
    best_params = parse_best_params(model_dir)

    print(f"  Model type : {model_type}  ({task})")
    print(f"  Best params: {best_params}")

    # The scaler is optional; it is only applied when the file exists.
    scaler_file = model_dir / "scaler.joblib"
    scaler = None
    if scaler_file.exists():
        scaler = joblib.load(scaler_file)

    if task == "regression":
        data = load_split_reg(dataset_path)
    else:
        data = load_split_cls(dataset_path)
    print(f"  Train: {data.X_train.shape}  Val: {data.X_val.shape}")

    # XGBoost variants: (objective, eval_metric, tuned keys copied from
    # best_params in this order, trainer).
    xgb_variants = {
        "xgb": (
            "binary:logistic",
            "logloss",
            ("lambda", "alpha", "colsample_bytree", "subsample",
             "learning_rate", "max_depth", "min_child_weight", "gamma"),
            train_xgb,
        ),
        "xgb_reg": (
            "reg:squarederror",
            "rmse",
            ("lambda", "alpha", "gamma", "max_depth", "min_child_weight",
             "subsample", "colsample_bytree", "learning_rate"),
            train_xgb_reg,
        ),
    }
    # Models that receive best_params unchanged, whatever the task.
    plain_trainers = {
        "svm_gpu": train_cuml_svc,
        "svm": train_svm,
        "svr": train_svr_reg,
    }
    # Elastic net picks its trainer by task.
    enet_trainers = {
        "classification": train_cuml_elastic_net,
        "regression": train_cuml_elasticnet_reg,
    }

    if model_type in xgb_variants:
        objective, metric, tuned_keys, train_fn = xgb_variants[model_type]
        params = {"objective": objective, "eval_metric": metric}
        for key in tuned_keys:
            params[key] = best_params[key]
        params["tree_method"] = "hist"
        params["device"] = "cuda"
        for key in ("num_boost_round", "early_stopping_rounds"):
            params[key] = best_params[key]
    elif model_type in plain_trainers:
        params = best_params
        train_fn = plain_trainers[model_type]
    elif model_type == "enet_gpu" and task in enet_trainers:
        params = best_params
        train_fn = enet_trainers[task]
    else:
        raise ValueError(f"Unhandled model_type={model_type}, task={task}")

    # Timed block: feature scaling (if any) + training.
    started = time.perf_counter()

    X_train, X_val = data.X_train, data.X_val
    if scaler is not None:
        X_train = scaler.transform(X_train).astype(np.float32)
        X_val = scaler.transform(X_val).astype(np.float32)

    train_fn(X_train, data.y_train, X_val, data.y_val, params)

    wall_s = time.perf_counter() - started
    print(f"  Wall time: {wall_s:.1f}s")
    return wall_s, model_type

def write_wall_time(logs_dir: Path, objective: str, wt: str,
                    model_type: str, wall_s: float):
    """
    Append one wall-time record to <MM_DD>_wall_clock_ml.jsonl in logs_dir.

    logs_dir is created if needed (including parents). The file name is
    prefixed with the current local month and day. Each record is a single
    JSON line with the keys model, objective, wt and wall_s, where wall_s is
    rounded to a whole number of seconds with round().
    """
    logs_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%m_%d")
    out_file = logs_dir / f"{stamp}_wall_clock_ml.jsonl"

    entry = dict(
        model=model_type,
        objective=objective,
        wt=wt,
        wall_s=round(wall_s),
    )
    # Append mode keeps earlier records from the same day.
    with open(out_file, "a") as handle:
        handle.write(f"{json.dumps(entry)}\n")
    print(f"  Appended to {out_file}: {entry}")

if __name__ == "__main__":
    # CLI entry point: derive objective/embedding from the model directory
    # path, refit the model once, and append the timing to the log directory.
    cli = argparse.ArgumentParser()
    # All three options are required strings; only the flag and help differ.
    for flag, help_text in (
        ("--model_dir", "e.g. .../hemolysis/svm_gpu_smiles"),
        ("--dataset_path", "HuggingFace dataset path for this objective/embedding"),
        ("--logs_dir", "Directory to write *_wall_clock_ml.jsonl"),
    ):
        cli.add_argument(flag, type=str, required=True, help=help_text)
    opts = cli.parse_args()

    model_dir = Path(opts.model_dir)
    objective, wt = parse_objective_and_wt(model_dir)
    print(f"\nObjective: {objective}  Embedding: {wt}")

    wall_s, model_type = refit_and_time(model_dir, opts.dataset_path)
    write_wall_time(Path(opts.logs_dir), objective, wt, model_type, wall_s)