File size: 3,654 Bytes
4d2d78e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import json
import logging

import polars as pl

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


# Catalogue of known features.
#
# Outer keys are group names; UI_FEATURE_GROUPS is derived from them, so the
# insertion order of groups and of features within a group is preserved there.
# Each inner dict maps a dotted feature name to a type tag, one of:
# "continuous", "categorical", "text" or "boolean".
# Feature names that do not appear here are reported as "categorical" by
# get_feature_type().
# NOTE(review): the dotted names presumably match the keys of the records
# read by load_data() -- confirm against the data file.
FEATURES = {
    "Performance": {
        "metrics.result": "continuous",
        "metrics.result_per_accelerator": "continuous",
        "metrics.accuracy": "continuous",
    },
    "Model": {
        "model.name": "categorical",
        "model.mlperf_name": "categorical",
        "model.architecture": "categorical",
        "model.number_of_parameters": "continuous",
        "model.weight_data_types": "categorical",
    },
    "Accelerator": {
        "system.accelerator.vendor": "categorical",
        "system.accelerator.name": "categorical",
        "system.accelerator.count_per_node": "continuous",
        "system.accelerator.total_count": "continuous",
        "system.accelerator.memory_capacity": "continuous",
        "system.accelerator.memory_config": "text",
        "system.interconnect.accelerator": "categorical",
    },
    "CPU": {
        "system.cpu.vendor": "categorical",
        "system.cpu.model": "categorical",
        "system.cpu.core_count": "continuous",
        "system.cpu.count_per_node": "continuous",
        "system.cpu.frequency": "continuous",
        "system.cpu.caches": "text",
        "system.cpu.vcpu_count": "continuous",
    },
    "System": {
        "system.name": "text",
        "system.type": "categorical",
        "system.cooling": "categorical",
        "system.number_of_nodes": "continuous",
        "system.memory.capacity": "continuous",
        "system.memory.configuration": "text",
        "system.interconnect.accelerator_host": "categorical",
    },
    "Software": {
        "software.framework": "categorical",
        "software.version": "categorical",
        "software.operating_system": "categorical",
    },
    "Submission": {
        "submission.organization": "categorical",
        "submission.division": "categorical",
        "submission.scenario": "categorical",
        "submission.availability": "boolean",
    },
}


def get_features_by_type(feature_type: str) -> list[str]:
    """Return the names of all features in FEATURES tagged with ``feature_type``.

    Groups are scanned in the order they appear in FEATURES, and features in
    the order they appear within each group. An empty list is returned when
    no feature carries the requested tag.
    """
    return [
        name
        for members in FEATURES.values()
        for name, declared_type in members.items()
        if declared_type == feature_type
    ]


# Reverse index of FEATURES: type tag -> list of feature names with that tag.
FEATURE_TYPES = {
    type_tag: get_features_by_type(type_tag)
    for type_tag in ("continuous", "categorical", "boolean", "text")
}

# Group name -> ordered list of the feature names declared in that group.
UI_FEATURE_GROUPS = {
    group_name: list(members) for group_name, members in FEATURES.items()
}


def get_feature_type(feature_name: str) -> str:
    """Look up the type tag declared for ``feature_name`` in FEATURES.

    The first group that contains the name wins. Names that are not declared
    in any group fall back to ``"categorical"``.
    """
    matches = (
        members[feature_name]
        for members in FEATURES.values()
        if feature_name in members
    )
    return next(matches, "categorical")


def _coerce_numeric_string(value: str) -> "int | float | str":
    """Convert a digits-only string to int/float; return anything else unchanged."""
    try:
        if value.isdigit():
            return int(value)
        # Allow exactly one "." so strings such as "3.14" become floats.
        if value.replace(".", "", 1).isdigit():
            return float(value)
    except ValueError:
        # str.isdigit() also accepts characters such as superscript digits
        # ("²") that int()/float() reject. Keep those values as text instead
        # of letting one odd string abort the whole load.
        pass
    return value


def load_data(file_path: str = "data.json") -> pl.DataFrame:
    """Load processed benchmark data from a JSON file into a DataFrame.

    The file is expected to contain a JSON array of objects, one per
    benchmark result. Before the frame is built, every string value made up
    only of digits (optionally with a single ".") is converted to ``int`` or
    ``float``; signed or exponent-style strings are left as text.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A DataFrame with one row per record. If the file cannot be read,
        parsed or converted, the error is logged and an empty DataFrame is
        returned instead of raising.
    """
    logger.info(f"Loading processed data from {file_path}")

    try:
        # Read as UTF-8 explicitly rather than relying on the platform's
        # locale-dependent default encoding.
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        for item in data:
            # Only existing keys are reassigned, so the dict does not change
            # size while it is being iterated.
            for key, value in item.items():
                if isinstance(value, str):
                    item[key] = _coerce_numeric_string(value)

        # infer_schema_length=None makes polars inspect every row when
        # inferring column types.
        df = pl.DataFrame(data, infer_schema_length=None)
        logger.info(f"Loaded {len(df)} benchmark results")
        return df

    except Exception as e:
        # Deliberate best-effort boundary: callers receive an empty frame.
        logger.error(f"Error loading data: {e}")
        return pl.DataFrame()