File size: 2,423 Bytes
094ac5e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from __future__ import annotations

from pathlib import Path
from typing import Any

import pandas as pd

from .paths import ensure_dir
from .utils import model_file_size_mb, save_json


# Metric columns that rank_models coerces to numeric (unparseable or missing
# values become 0.0) before the leaderboard is sorted.
RANK_COLUMNS: list[str] = [
    "f1",
    "roc_auc",
    "balanced_accuracy",
]


def rank_models(metrics_df: pd.DataFrame, config: dict[str, Any]) -> pd.DataFrame:
    """Rank evaluated models and write the leaderboard artifacts.

    Rows with ``split == "test"`` are ranked; when there are none, rows with
    ``split == "val"`` are used instead. Models are ordered by the metrics in
    ``RANK_COLUMNS`` (higher is better), then by ``model_size_mb`` and
    ``avg_inference_ms`` (lower is better).

    Args:
        metrics_df: One row per (model, split) evaluation. Must contain a
            ``split`` column. Metric, size and latency columns are optional;
            absent or unparseable values receive worst-case defaults.
        config: Configuration mapping; ``config["paths"]["output_dir"]`` is
            the directory the leaderboard files are written to.

    Returns:
        The sorted leaderboard with a 1-based ``rank`` column inserted first,
        or an empty ``DataFrame`` when ``metrics_df`` is empty (in which case
        nothing is written).

    Raises:
        KeyError: If ``metrics_df`` has rows but no ``split`` column, or the
            config lacks ``paths.output_dir``.
    """
    if metrics_df.empty:
        return pd.DataFrame()

    # Prefer held-out test rows; fall back to validation rows only when no
    # test rows exist at all.
    candidates = metrics_df[metrics_df["split"] == "test"].copy()
    if candidates.empty:
        candidates = metrics_df[metrics_df["split"] == "val"].copy()

    # A metric that was never computed is scored exactly like one that could
    # not be parsed: 0.0, the worst value for a higher-is-better metric.
    # Previously an absent metric column raised KeyError.
    for col in RANK_COLUMNS:
        if col not in candidates:
            candidates[col] = 0.0
        candidates[col] = pd.to_numeric(candidates[col], errors="coerce").fillna(0.0)

    # 1e9 is the "unknown" sentinel for the lower-is-better tie-breakers, so
    # rows without a usable size or latency sort after rows that have one.
    if "model_size_mb" not in candidates:
        if "model_path" in candidates:
            candidates["model_size_mb"] = candidates["model_path"].apply(model_file_size_mb)
        else:
            # No size and no path to measure: treat the size as unknown.
            candidates["model_size_mb"] = 1e9
    candidates["model_size_mb"] = pd.to_numeric(candidates["model_size_mb"], errors="coerce").fillna(1e9)
    if "avg_inference_ms" not in candidates:
        candidates["avg_inference_ms"] = 1e9
    candidates["avg_inference_ms"] = pd.to_numeric(candidates["avg_inference_ms"], errors="coerce").fillna(1e9)

    # Sort keys are derived from RANK_COLUMNS so the coercion above and the
    # ordering below cannot drift apart.
    sort_keys = [*RANK_COLUMNS, "model_size_mb", "avg_inference_ms"]
    sort_ascending = [False] * len(RANK_COLUMNS) + [True, True]
    leaderboard = candidates.sort_values(
        by=sort_keys,
        ascending=sort_ascending,
    ).reset_index(drop=True)
    leaderboard.insert(0, "rank", range(1, len(leaderboard) + 1))

    output_dir = ensure_dir(config["paths"]["output_dir"])
    leaderboard_path = output_dir / "leaderboard.csv"
    leaderboard.to_csv(leaderboard_path, index=False)
    save_json(leaderboard.to_dict(orient="records"), output_dir / "leaderboard.json")
    # best_model.json is only (re)written when there is a top-ranked row.
    if not leaderboard.empty:
        best = leaderboard.iloc[0].to_dict()
        save_json(best, output_dir / "best_model.json")
    return leaderboard


def load_best_model_record(config: dict[str, Any]) -> dict[str, Any]:
    """Load ``best_model.json`` from the configured output directory.

    If the record's ``model_path`` does not exist on disk, a file with the
    same name is looked up under ``config["paths"]["model_dir"]``; when that
    file exists, ``model_path`` in the returned record is rewritten to point
    at it. Otherwise the record is returned unchanged.

    Args:
        config: Configuration mapping providing ``paths.output_dir`` and,
            when the stored model path is missing, ``paths.model_dir``.

    Returns:
        The loaded record, with ``model_path`` possibly redirected.

    Raises:
        FileNotFoundError: If ``best_model.json`` is not in the output dir.
    """
    from .utils import load_json

    paths_cfg = config["paths"]
    record_file = Path(paths_cfg["output_dir"]) / "best_model.json"
    if not record_file.exists():
        raise FileNotFoundError(
            f"Best model record not found at {record_file}. Run training and evaluation first."
        )

    best = load_json(record_file)
    stored = best.get("model_path")
    if not stored:
        # Nothing recorded, so there is no path to repair.
        return best

    stored_path = Path(str(stored))
    if stored_path.exists():
        return best

    # The recorded location is gone; try the same file name in model_dir.
    relocated = Path(paths_cfg["model_dir"]) / stored_path.name
    if relocated.exists():
        best["model_path"] = str(relocated)
    return best