Grigori Fursin committed
Commit 4d2d78e · unverified · 1 parent: 37e9e3e

first commit

Files changed (10)
  1. .python-version +1 -0
  2. README.md +31 -2
  3. __init__.py +0 -0
  4. app.py +1445 -0
  5. cost_calculator.py +137 -0
  6. data.json +0 -0
  7. predictor.py +900 -0
  8. recommender.py +97 -0
  9. requirements.txt +11 -0
  10. utils.py +115 -0
.python-version ADDED
@@ -0,0 +1 @@
+ 3.12
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
  title: FlexBoard
- emoji: 🏢
+ emoji: 🐢
  colorFrom: blue
+ colorTo: indigo
- colorTo: yellow
  sdk: gradio
  sdk_version: 5.30.0
  app_file: app.py
@@ -12,3 +12,32 @@ short_description: FlexBoard to analyze FlexBench and MLPerf results
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # FlexBoard
+
+ ## Installation
+
+ ```bash
+ # Create a virtual environment
+ python -m venv .venv
+
+ # Activate the virtual environment
+ source .venv/bin/activate
+
+ # Install the required packages
+ pip install -r requirements.txt
+
+ # Run the application
+ python -m app
+ ```
+
+ ## License and Copyright
+
+ This project is licensed under the [Apache License 2.0](LICENSE.md).
+
+ © 2025 FlexAI
+
+ ## Authors and maintainers
+
+ [Daniel Altunay](https://www.linkedin.com/in/daltunay) and [Grigori Fursin](https://cKnowledge.org/gfursin) (FCS Labs)
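The app added in this commit matches continuous features (model size, memory capacity, etc.) within a ±10% tolerance rather than requiring exact values. As a minimal, self-contained sketch of that filtering idea (`within_tolerance` is an illustrative name, not the repository's API):

```python
import pandas as pd


def within_tolerance(
    df: pd.DataFrame, column: str, value: float, tolerance: float = 0.1
) -> pd.DataFrame:
    """Keep rows whose `column` lies within ±tolerance of `value`."""
    lower, upper = value * (1 - tolerance), value * (1 + tolerance)
    return df[(df[column] >= lower) & (df[column] <= upper)]


# Hypothetical data: a 70B query matches 65B-72B configs but not 7B or 180B.
configs = pd.DataFrame({"model.number_of_parameters": [7.0, 65.0, 70.0, 72.0, 180.0]})
matched = within_tolerance(configs, "model.number_of_parameters", 70.0)
print(matched["model.number_of_parameters"].tolist())  # [65.0, 70.0, 72.0]
```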
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,1445 @@
+ """MLPerf Hardware Configuration Finder application."""
+
+ import logging
+ import os
+
+ import gradio as gr
+ import pandas as pd
+ import plotly.graph_objects as go
+ import polars as pl
+ from cost_calculator import (
+     calculate_costs,
+     get_device_costs,
+     initialize_device_costs,
+     update_device_costs,
+ )
+ from plotly.subplots import make_subplots
+ from predictor import PerformancePredictor
+ from recommender import ConfigurationFinder
+
+ from utils import get_feature_type, load_data
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ logger.info("Loading benchmark data...")
+ df = load_data()
+ pd_df = df.to_pandas() if not df.is_empty() else pd.DataFrame()
+ logger.info(f"Loaded {len(pd_df)} benchmark records total")
+
+ initialize_device_costs(pd_df)
+
+ predictor = PerformancePredictor(pd_df) if not pd_df.empty else None
+ config_finder = ConfigurationFinder(pd_df) if not pd_df.empty else None
+
+
+ def extract_metadata(df: pl.DataFrame) -> dict:
+     """Extract metadata for UI filters from dataset."""
+     metadata = {}
+     if df.is_empty():
+         return metadata
+
+     metadata["architectures"] = sorted(
+         df.filter(pl.col("model.architecture").is_not_null())
+         .get_column("model.architecture")
+         .unique()
+         .to_list()
+     )
+
+     model_sizes = sorted(
+         df.filter(pl.col("model.number_of_parameters").is_not_null())
+         .get_column("model.number_of_parameters")
+         .unique()
+         .to_list()
+     )
+     if model_sizes:
+         metadata["model_sizes"] = model_sizes
+         metadata["model_size_min"] = min(model_sizes)
+         metadata["model_size_max"] = max(model_sizes)
+         metadata["model_size_values"] = sorted(model_sizes)
+
+     metadata["weight_data_types"] = sorted(
+         df.filter(pl.col("model.weight_data_types").is_not_null())
+         .get_column("model.weight_data_types")
+         .unique()
+         .to_list()
+     )
+
+     metadata["accelerator_vendors"] = sorted(
+         df.filter(pl.col("system.accelerator.vendor").is_not_null())
+         .get_column("system.accelerator.vendor")
+         .unique()
+         .to_list()
+     )
+
+     metadata["cpu_vendors"] = sorted(
+         df.filter(pl.col("system.cpu.vendor").is_not_null())
+         .get_column("system.cpu.vendor")
+         .unique()
+         .to_list()
+     )
+
+     metadata["accelerator_models"] = sorted(
+         df.filter(pl.col("system.accelerator.name").is_not_null())
+         .get_column("system.accelerator.name")
+         .unique()
+         .to_list()
+     )
+
+     metadata["cpu_models"] = sorted(
+         df.filter(pl.col("system.cpu.model").is_not_null())
+         .get_column("system.cpu.model")
+         .unique()
+         .to_list()
+     )
+
+     memory_values = df.filter(
+         pl.col("system.accelerator.memory_capacity").is_not_null()
+     )
+     metadata["gpu_memory_min"] = max(
+         1,
+         round(
+             float(memory_values.get_column("system.accelerator.memory_capacity").min())
+         ),
+     )
+     metadata["gpu_memory_max"] = min(
+         1024,
+         round(
+             float(memory_values.get_column("system.accelerator.memory_capacity").max())
+         ),
+     )
+
+     memory_values = df.filter(pl.col("system.memory.capacity").is_not_null())
+     metadata["cpu_memory_min"] = max(
+         1, round(float(memory_values.get_column("system.memory.capacity").min()))
+     )
+     metadata["cpu_memory_max"] = min(
+         16384, round(float(memory_values.get_column("system.memory.capacity").max()))
+     )
+
+     metadata["interconnect_types"] = sorted(
+         df.filter(pl.col("system.interconnect.accelerator").is_not_null())
+         .get_column("system.interconnect.accelerator")
+         .unique()
+         .to_list()
+     )
+
+     acc_counts = sorted(
+         df.filter(pl.col("system.accelerator.total_count").is_not_null())
+         .get_column("system.accelerator.total_count")
+         .unique()
+         .cast(pl.Int64)
+         .to_list()
+     )
+     metadata["accelerator_counts"] = acc_counts
+     metadata["min_accelerators"] = min(acc_counts)
+     metadata["max_accelerators"] = max(acc_counts)
+
+     metadata["node_counts"] = sorted(
+         df.filter(pl.col("system.number_of_nodes").is_not_null())
+         .get_column("system.number_of_nodes")
+         .unique()
+         .cast(pl.Int64)
+         .to_list()
+     )
+
+     frameworks = []
+     for col in df.columns:
+         if col.startswith("software.framework.") and col != "software.framework":
+             framework_name = col.replace("software.framework.", "")
+             frameworks.append(framework_name)
+             versions = (
+                 df.filter(pl.col(col).is_not_null()).get_column(col).unique().to_list()
+             )
+             if versions:
+                 metadata[f"{framework_name}_versions"] = sorted(versions)
+
+     metadata["frameworks"] = sorted(frameworks)
+
+     metadata["operating_systems"] = sorted(
+         df.filter(pl.col("software.operating_system").is_not_null())
+         .get_column("software.operating_system")
+         .unique()
+         .to_list()
+     )
+
+     result_per_acc = df.filter(pl.col("metrics.result_per_accelerator").is_not_null())
+     metadata["result_per_accelerator_ranges"] = {
+         "min": float(result_per_acc.get_column("metrics.result_per_accelerator").min()),
+         "max": float(result_per_acc.get_column("metrics.result_per_accelerator").max()),
+         "median": float(
+             result_per_acc.get_column("metrics.result_per_accelerator").median()
+         ),
+     }
+
+     return metadata
+
+
+ metadata = extract_metadata(df)
+
+
+ def apply_continuous_feature_tolerance(
+     df: pd.DataFrame, feature: str, value: float, tolerance: float = 0.1
+ ) -> pd.DataFrame:
+     """Apply tolerance for continuous feature searches."""
+     lower_bound = value * (1 - tolerance)
+     upper_bound = value * (1 + tolerance)
+     return df[(df[feature] >= lower_bound) & (df[feature] <= upper_bound)]
+
+
+ def find_best_configs(
+     workload_specs: dict,
+     constraints: dict,
+     include_predictions: bool = True,
+     optimization_metric: str = "performance",
+ ) -> pd.DataFrame:
+     """Find best hardware configurations for workload."""
+     if pd_df.empty:
+         return pd.DataFrame()
+
+     filtered_df = pd_df.copy()
+
+     if workload_specs.get("model_size") is not None:
+         filtered_df = apply_continuous_feature_tolerance(
+             filtered_df,
+             "model.number_of_parameters",
+             float(workload_specs["model_size"]),
+         )
+
+     if (
+         workload_specs.get("weight_data_type")
+         and workload_specs["weight_data_type"] != "Any"
+     ):
+         filtered_df = filtered_df[
+             filtered_df["model.weight_data_types"] == workload_specs["weight_data_type"]
+         ]
+
+     if workload_specs.get("architecture") and workload_specs["architecture"] != "Any":
+         filtered_df = filtered_df[
+             filtered_df["model.architecture"] == workload_specs["architecture"]
+         ]
+
+     clean_constraints = {k: v for k, v in constraints.items() if v and v != "Any"}
+
+     for feature, value in clean_constraints.items():
+         if feature in filtered_df.columns:
+             if get_feature_type(feature) == "continuous":
+                 filtered_df = apply_continuous_feature_tolerance(
+                     filtered_df, feature, float(value)
+                 )
+             else:
+                 filtered_df = filtered_df[filtered_df[feature] == value]
+
+     if constraints.get("min_gpu_memory") is not None:
+         filtered_df = filtered_df[
+             filtered_df["system.accelerator.memory_capacity"]
+             >= constraints["min_gpu_memory"]
+         ]
+
+     if constraints.get("max_gpu_memory") is not None:
+         filtered_df = filtered_df[
+             filtered_df["system.accelerator.memory_capacity"]
+             <= constraints["max_gpu_memory"]
+         ]
+
+     if constraints.get("min_cpu_memory") is not None:
+         filtered_df = filtered_df[
+             filtered_df["system.memory.capacity"] >= constraints["min_cpu_memory"]
+         ]
+
+     if constraints.get("max_cpu_memory") is not None:
+         filtered_df = filtered_df[
+             filtered_df["system.memory.capacity"] <= constraints["max_cpu_memory"]
+         ]
+
+     if constraints.get("min_accelerators") is not None:
+         filtered_df = filtered_df[
+             filtered_df["system.accelerator.total_count"]
+             >= constraints["min_accelerators"]
+         ]
+
+     if constraints.get("max_accelerators") is not None:
+         filtered_df = filtered_df[
+             filtered_df["system.accelerator.total_count"]
+             <= constraints["max_accelerators"]
+         ]
+
+     if (
+         include_predictions
+         and predictor
+         and workload_specs.get("model_size")
+         and workload_specs.get("architecture")
+     ):
+         predicted_df = predictor.generate_predictions(
+             architecture=workload_specs["architecture"],
+             parameters=float(workload_specs["model_size"]),
+             constraints=clean_constraints,
+             num_configs=20,
+         )
+
+         if not predicted_df.empty:
+             predicted_df = calculate_costs(predicted_df)
+
+             if not filtered_df.empty:
+                 filtered_df = calculate_costs(filtered_df)
+                 filtered_df["predicted"] = False
+                 combined_df = pd.concat([filtered_df, predicted_df], ignore_index=True)
+             else:
+                 combined_df = predicted_df
+
+             sort_col = (
+                 "cost_per_million_tokens"
+                 if optimization_metric == "cost"
+                 else "metrics.result_per_accelerator"
+             )
+             asc = optimization_metric == "cost"
+             return combined_df.sort_values(by=sort_col, ascending=asc)
+
+     if not filtered_df.empty:
+         filtered_df = calculate_costs(filtered_df)
+         filtered_df["predicted"] = False
+
+         sort_col = (
+             "cost_per_million_tokens"
+             if optimization_metric == "cost"
+             else "metrics.result_per_accelerator"
+         )
+         asc = optimization_metric == "cost"
+         return filtered_df.sort_values(by=sort_col, ascending=asc)
+
+     return pd.DataFrame()
+
+
+ def format_recommendations(configs_df: pd.DataFrame) -> pd.DataFrame:
+     """Format recommendations for display."""
+     if configs_df.empty:
+         return pd.DataFrame(
+             columns=[
+                 "System",
+                 "Accelerator",
+                 "Count",
+                 "Nodes",
+                 "GPU Memory (GB)",
+                 "Model",
+                 "Architecture",
+                 "Parameters (B)",
+                 "Weight Data Type",
+                 "Total Performance (Tokens/s)",
+                 "Per-GPU Performance (Tokens/s)",
+                 "Hourly Cost ($)",
+                 "Cost/Million Tokens",
+                 "Predicted",
+             ]
+         )
+
+     display_columns = {
+         "system.name": "System",
+         "system.accelerator.name": "Accelerator",
+         "system.accelerator.total_count": "Count",
+         "system.number_of_nodes": "Nodes",
+         "system.accelerator.memory_capacity": "GPU Memory (GB)",
+         "model.name": "Model",
+         "model.architecture": "Architecture",
+         "model.number_of_parameters": "Parameters (B)",
+         "model.weight_data_types": "Weight Data Type",
+         "metrics.result": "Total Performance (Tokens/s)",
+         "metrics.result_per_accelerator": "Per-GPU Performance (Tokens/s)",
+         "hourly_cost": "Hourly Cost ($)",
+         "cost_per_million_tokens": "Cost/Million Tokens",
+         "predicted": "Predicted",
+     }
+
+     result_df = pd.DataFrame()
+     for col_name, display_name in display_columns.items():
+         if col_name in configs_df.columns:
+             result_df[display_name] = configs_df[col_name]
+         else:
+             result_df[display_name] = "N/A" if col_name != "predicted" else "No"
+
+     numeric_columns = [
+         "Count",
+         "Nodes",
+         "GPU Memory (GB)",
+         "Parameters (B)",
+         "Total Performance (Tokens/s)",
+         "Per-GPU Performance (Tokens/s)",
+         "Hourly Cost ($)",
+         "Cost/Million Tokens",
+     ]
+
+     for col in numeric_columns:
+         if col in result_df.columns:
+             result_df[col] = pd.to_numeric(result_df[col], errors="coerce")
+
+     result_df["Total Performance (Tokens/s)"] = result_df[
+         "Total Performance (Tokens/s)"
+     ].round(4)
+     result_df["Per-GPU Performance (Tokens/s)"] = result_df[
+         "Per-GPU Performance (Tokens/s)"
+     ].round(4)
+     result_df["GPU Memory (GB)"] = result_df["GPU Memory (GB)"].round(2)
+     result_df["Cost/Million Tokens"] = result_df["Cost/Million Tokens"].round(4)
+     result_df["Hourly Cost ($)"] = result_df["Hourly Cost ($)"].round(4)
+
+     if "Parameters (B)" in result_df.columns:
+         result_df["Parameters (B)"] = result_df["Parameters (B)"].round(2)
+
+     if "Predicted" in result_df.columns:
+         result_df["Predicted"] = result_df["Predicted"].map(
+             lambda x: "Yes" if x is True else "No"
+         )
+
+     result_df = result_df.drop_duplicates()
+
+     return result_df
+
+
+ def get_top_config_details(configs_df: pd.DataFrame) -> pd.DataFrame:
+     """Extract details for the top recommendation."""
+     if configs_df.empty:
+         return pd.DataFrame(columns=["Feature", "Value"])
+
+     top_config = configs_df.iloc[0]
+     is_predicted = "predicted" in top_config and top_config["predicted"]
+
+     details = {
+         "Feature": [
+             "System",
+             "Accelerator",
+             "Accelerator Count",
+             "Accelerator Vendor",
+             "Memory Capacity",
+             "CPU",
+             "CPU Vendor",
+             "Nodes",
+             "Devices per Node",
+             "Interconnect",
+             "Total Performance (Tokens/s)",
+             "Per-Accelerator Performance (Tokens/s)",
+             "Hourly Cost (estimated)",
+             "Cost per Million Tokens",
+             "Prediction Status",
+         ],
+         "Value": [
+             top_config.get("system.name", "N/A"),
+             top_config.get("system.accelerator.name", "N/A"),
+             top_config.get("system.accelerator.total_count", "N/A"),
+             top_config.get("system.accelerator.vendor", "N/A"),
+             (
+                 f"{float(top_config.get('system.accelerator.memory_capacity', 0)):.1f}GB"
+                 if top_config.get("system.accelerator.memory_capacity") is not None
+                 else "N/A"
+             ),
+             top_config.get("system.cpu.model", "N/A"),
+             top_config.get("system.cpu.vendor", "N/A"),
+             top_config.get("system.number_of_nodes", "N/A"),
+             top_config.get("system.accelerator.count_per_node", "N/A"),
+             top_config.get("system.interconnect.accelerator", "N/A"),
+             (
+                 f"{float(top_config.get('metrics.result', 0)):.4f}"
+                 if top_config.get("metrics.result") is not None
+                 else "N/A"
+             ),
+             (
+                 f"{float(top_config.get('metrics.result_per_accelerator', 0)):.4f}"
+                 if top_config.get("metrics.result_per_accelerator") is not None
+                 else "N/A"
+             ),
+             (
+                 f"${float(top_config.get('hourly_cost', 0)):.4f}"
+                 if top_config.get("hourly_cost") is not None
+                 else "N/A"
+             ),
+             (
+                 f"${float(top_config.get('cost_per_million_tokens', 0)):.4f}"
+                 if top_config.get("cost_per_million_tokens") is not None
+                 else "N/A"
+             ),
+             "Predicted" if is_predicted else "Actual data",
+         ],
+     }
+
+     return pd.DataFrame(details)
+
+
+ def create_top_configs_plot(
+     configs_df: pd.DataFrame, optimization_metric: str = "performance", top_n: int = 10
+ ) -> go.Figure:
+     """Create a bar plot of top configurations based on the optimization metric."""
+     if configs_df.empty:
+         fig = go.Figure()
+         fig.update_layout(
+             title="No configurations found",
+             xaxis_title="Value",
+             yaxis_title="Rank",
+             template="plotly_white",
+             height=600,
+         )
+         return fig
+
+     if optimization_metric == "cost":
+         sort_col = "cost_per_million_tokens"
+         display_col = "Cost/Million Tokens ($)"
+         configs_df = configs_df.sort_values(by=sort_col, ascending=True)
+     else:
+         sort_col = "metrics.result_per_accelerator"
+         display_col = "Performance (Tokens/s per device)"
+         configs_df = configs_df.sort_values(by=sort_col, ascending=False)
+
+     top_configs = configs_df.head(top_n)
+
+     ranks = [f"#{i + 1}" for i in range(len(top_configs))]
+
+     if optimization_metric == "cost":
+         x_values = top_configs["cost_per_million_tokens"]
+         color = "crimson"
+     else:
+         x_values = top_configs["metrics.result_per_accelerator"]
+         color = "royalblue"
+
+     hover_text = []
+     for _, row in top_configs.iterrows():
+         system = row.get("system.name", "Unknown")
+         acc_name = row.get("system.accelerator.name", "Unknown")
+         acc_count = row.get("system.accelerator.total_count", "?")
+         total_perf = row.get("metrics.result", 0)
+         per_acc_perf = row.get("metrics.result_per_accelerator", 0)
+         cost = row.get("hourly_cost", 0)
+         cost_per_million = row.get("cost_per_million_tokens", 0) or 0
+         predicted = "Yes" if row.get("predicted", False) else "No"
+
+         info = f"System: {system}<br>"
+         info += f"Config: {acc_count}× {acc_name}<br>"
+         info += f"Tokens/s (total): {total_perf:.4f}<br>"
+         info += f"Tokens/s (per device): {per_acc_perf:.4f}<br>"
+         info += f"Hourly cost: ${cost:.4f}<br>"
+         info += f"Cost per million tokens: ${cost_per_million:.4f}<br>"
+         info += f"Predicted: {predicted}"
+         hover_text.append(info)
+
+     fig = go.Figure()
+     fig.add_trace(
+         go.Bar(
+             y=ranks,
+             x=x_values,
+             text=x_values.apply(lambda x: f"{x:.4f}"),
+             textposition="auto",
+             marker=dict(color=color),
+             hovertext=hover_text,
+             hoverinfo="text",
+             orientation="h",
+         )
+     )
+
+     title = f"Top {len(ranks)} Configurations by {'Cost' if optimization_metric == 'cost' else 'Performance'}"
+     fig.update_layout(
+         title=title,
+         xaxis_title=display_col,
+         yaxis_title="Rank",
+         template="plotly_white",
+         height=max(400, min(20 * len(ranks), 800)),
+         margin=dict(l=50),
+     )
+
+     return fig
+
+
+ def recommend_hardware(
+     model_size: float,
+     weight_data_type: str,
+     architecture: str,
+     accelerator_vendor: str,
+     accelerator_model: str,
+     min_gpu_memory: float | None,
+     max_gpu_memory: float | None,
+     interconnect: str,
+     min_accelerators: int | None,
+     max_accelerators: int | None,
+     cpu_vendor: str,
+     cpu_model: str,
+     nodes: str,
+     min_cpu_memory: float | None,
+     max_cpu_memory: float | None,
+     os: str,
+     include_predictions: bool = True,
+     optimization_metric: str = "performance",
+     top_n_configs: int = 10,
+     **framework_versions,
+ ) -> tuple[pd.DataFrame, pd.DataFrame, str, go.Figure]:
+     """Find hardware configurations matching requirements."""
+     workload_specs = {
+         "model_size": model_size,
+         "weight_data_type": weight_data_type,
+         "architecture": architecture,
+     }
+
+     constraints = {
+         "system.accelerator.vendor": accelerator_vendor,
+         "system.accelerator.name": accelerator_model,
+         "system.interconnect.accelerator": interconnect,
+         "system.cpu.vendor": cpu_vendor,
+         "system.cpu.model": cpu_model,
+         "system.number_of_nodes": nodes if nodes != "Any" else None,
+         "software.operating_system": os,
+         "min_gpu_memory": min_gpu_memory,
+         "max_gpu_memory": max_gpu_memory,
+         "min_cpu_memory": min_cpu_memory,
+         "max_cpu_memory": max_cpu_memory,
+         "min_accelerators": min_accelerators,
+         "max_accelerators": max_accelerators,
+     }
+
+     for fw_name, version in framework_versions.items():
+         if version != "Any":
+             constraints[f"software.framework.{fw_name}"] = version
+
+     best_configs = find_best_configs(
+         workload_specs, constraints, include_predictions, optimization_metric
+     )
+     recommendations_df = format_recommendations(best_configs)
+     details_df = get_top_config_details(best_configs)
+
+     top_configs_chart = create_top_configs_plot(
+         best_configs, optimization_metric, top_n_configs
+     )
+
+     if best_configs.empty:
+         summary = "No matching configurations found. Try relaxing some constraints or changing the model parameters."
+     else:
+         actual_count = (
+             sum(~best_configs["predicted"])
+             if "predicted" in best_configs.columns
+             else len(best_configs)
+         )
+         predicted_count = (
+             sum(best_configs["predicted"]) if "predicted" in best_configs.columns else 0
+         )
+
+         top_config = best_configs.iloc[0]
+         is_predicted = "predicted" in top_config and top_config["predicted"]
+
+         if optimization_metric == "cost":
+             metric_value = f"${float(top_config.get('cost_per_million_tokens', 0)):.4f} per million tokens"
+             metric_name = "cost"
+         else:
+             metric_value = f"{float(top_config.get('metrics.result_per_accelerator', 0)):.4f} tokens/s per device"
+             metric_name = "performance"
+
+         acc = top_config.get("system.accelerator.name", "Unknown")
+         count = top_config.get("system.accelerator.total_count", "Unknown")
+
+         summary = f"Found {actual_count} actual and {predicted_count} predicted configurations. "
+         summary += f"\nTop recommendation optimized for {metric_name}: {count}× {acc} with {metric_value}"
+         if is_predicted:
+             summary += " (Predicted)"
+
+     return recommendations_df, details_df, summary, top_configs_chart
+
+
+ def create_model_performance_plot(
+     predictor: PerformancePredictor,
+ ) -> tuple[go.Figure, dict, pd.DataFrame]:
+     """Create performance visualization for the ML model using Plotly."""
+     logger.info("Starting to create model performance plot")
+
+     empty_metrics = {"rmse": 0, "mae": 0, "r2": 0, "mape": 0}
+     empty_df = pd.DataFrame(columns=["Feature", "Importance"])
+
+     empty_fig = make_subplots(
+         rows=2,
+         cols=2,
+         subplot_titles=(
+             "Predicted vs Actual Performance",
+             "Residual Plot (% Error)",
+             "Distribution of Prediction Errors",
+             "Top 10 Feature Importance",
+         ),
+     )
+     empty_fig.update_layout(
+         height=800,
+         width=1200,
+         showlegend=False,
+         title_text="No Model Evaluation Data Available",
+         annotations=[
+             dict(
+                 text="Train the model with test data to see evaluation metrics",
+                 showarrow=False,
+                 xref="paper",
+                 yref="paper",
+                 x=0.5,
+                 y=0.5,
+             )
+         ],
+     )
+
+     if predictor is None:
+         logger.warning("No predictor available for performance plot")
+         return empty_fig, empty_metrics, empty_df
+
+     if (
+         not hasattr(predictor, "evaluation_data")
+         or predictor.evaluation_data is None
+         or predictor.evaluation_data.empty
+     ):
+         logger.warning("Evaluation data not found, attempting to re-train model")
+         try:
+             predictor._train_model()
+         except Exception as e:
+             logger.error(f"Error re-training model: {e}")
+
+     eval_data = predictor.get_evaluation_data()
+     metrics = predictor.get_evaluation_metrics()
+     feature_importance = predictor.get_feature_importance()
+
+     logger.info(f"Retrieved evaluation data: {type(eval_data)}")
+     if eval_data is not None:
+         logger.info(
+             f"Evaluation data shape: {eval_data.shape if not eval_data.empty else 'empty'}"
+         )
+
+     if eval_data is None or eval_data.empty:
+         logger.warning("Evaluation data is not available")
+         return (
+             empty_fig,
+             empty_metrics,
+             feature_importance if feature_importance is not None else empty_df,
+         )
+
+     logger.info(f"First few rows of evaluation data: {eval_data.head(3).to_dict()}")
+
+     fig = make_subplots(
+         rows=2,
+         cols=2,
+         subplot_titles=(
+             "Predicted vs Actual Performance",
+             "Residual Plot (% Error)",
+             "Distribution of Prediction Errors",
+             "Top 10 Feature Importance",
+         ),
+     )
+
+     hover_text = [
+         f"Accelerator: {acc}<br>"
+         f"Vendor: {vendor}<br>"
+         f"Count: {count}<br>"
+         f"Actual: {actual:.4f}<br>"
+         f"Predicted: {pred:.4f}<br>"
+         f"Error: {error:.2f} ({err_pct:.2f}%)"
+         for acc, vendor, count, actual, pred, error, err_pct in zip(
+             eval_data["system.accelerator.name"],
+             eval_data["system.accelerator.vendor"],
+             eval_data["system.accelerator.total_count"],
+             eval_data["actual"],
+             eval_data["predicted"],
+             eval_data["error"],
+             eval_data["error_percent"],
+         )
+     ]
+
+     fig.add_trace(
+         go.Scatter(
+             x=eval_data["actual"],
+             y=eval_data["predicted"],
+             mode="markers",
+             marker=dict(
+                 opacity=0.6,
+                 color=eval_data["error_percent"],
+                 colorscale="RdBu_r",
+                 colorbar=dict(title="Error %"),
+                 cmin=-30,
+                 cmax=30,
+             ),
+             text=hover_text,
+             hoverinfo="text",
+             name="Predictions",
+         ),
+         row=1,
+         col=1,
+     )
+
+     max_val = max(eval_data["actual"].max(), eval_data["predicted"].max())
+     min_val = min(eval_data["actual"].min(), eval_data["predicted"].min())
+
+     fig.add_trace(
+         go.Scatter(
+             x=[min_val, max_val],
+             y=[min_val, max_val],
+             mode="lines",
+             line=dict(color="red", dash="dash"),
+             name="Perfect Prediction",
+             hoverinfo="none",
+         ),
+         row=1,
+         col=1,
+     )
+
+     fig.add_trace(
+         go.Scatter(
+             x=eval_data["predicted"],
+             y=eval_data["error_percent"],
+             mode="markers",
+             marker=dict(
+                 opacity=0.6,
+                 color=eval_data["error_percent"],
+                 colorscale="RdBu_r",
+                 colorbar=dict(title="Error %"),
+                 showscale=False,
+                 cmin=-30,
+                 cmax=30,
+             ),
+             text=hover_text,
+             hoverinfo="text",
+             name="Errors",
+         ),
+         row=1,
+         col=2,
+     )
+
+     fig.add_trace(
+         go.Histogram(
+             x=eval_data["error_percent"],
+             nbinsx=20,
+             marker=dict(color="blue", opacity=0.7, line=dict(color="black", width=1)),
+             name="Error Distribution",
+         ),
+         row=2,
+         col=1,
+     )
+
+     fig.add_vline(x=0, line_dash="dash", line_color="red", row=2, col=1)
+
+     top_features = feature_importance.head(10).sort_values("Importance")
+
+     fig.add_trace(
+         go.Bar(
+             y=top_features["Feature"],
+             x=top_features["Importance"],
+             orientation="h",
+             marker=dict(color="blue"),
+             name="Feature Importance",
+         ),
+         row=2,
+         col=2,
+     )
+
+     fig.update_xaxes(title_text="Actual Performance (tokens/s)", row=1, col=1)
+     fig.update_yaxes(title_text="Predicted Performance (tokens/s)", row=1, col=1)
+
+     fig.update_xaxes(title_text="Predicted Value", row=1, col=2)
+     fig.update_yaxes(title_text="Error (%)", row=1, col=2)
+
+     fig.update_xaxes(title_text="Prediction Error (%)", row=2, col=1)
+     fig.update_yaxes(title_text="Frequency", row=2, col=1)
+
+     fig.update_xaxes(title_text="Importance", row=2, col=2)
+
+     fig.update_layout(
+         height=800,
+         width=1200,
+         autosize=True,
+         showlegend=False,
+         title_text="Model Performance Analysis",
+     )
+
+     logger.info("Successfully created model performance plot")
+     return fig, metrics, feature_importance.head(10)
+
+
+ with gr.Blocks(title="MLPerf Configuration Finder") as interface:
+     gr.Markdown(
+         """
+         # 🔍 MLPerf Configuration Finder (ongoing preliminary work)
+
+         Find the optimal configurations for your AI workloads by specifying your model and constraints.
+         Results are ranked by performance and include both real benchmark data and AI-generated predictions.
+
+         *All configurations include a ±10% tolerance for continuous features like model size, memory capacity, etc.*
+         """
+     )
+
+     with gr.Row():
+         status_msg = gr.Markdown(
+             "*Ready to search. Enter your criteria and click 'Search Configurations'.*"
+         )
+
+     with gr.Tabs():
+         with gr.TabItem("Workload Specifications"):
+             with gr.Accordion("Model Specifications", open=True):
+                 with gr.Row():
+                     architecture = gr.Dropdown(
+                         choices=["Any"] + metadata.get("architectures", []),
+                         label="Architecture",
+                         value="LLM",
+                         info="Model architecture type",
+                     )
+                     weight_data_type = gr.Dropdown(
+                         choices=["Any"] + metadata.get("weight_data_types", []),
+                         label="Weight Data Type",
+                         value="Any",
+                         info="Precision format for model weights",
+                     )
+
+                 model_size = gr.Slider(
+                     minimum=metadata.get("model_size_min"),
+                     maximum=metadata.get("model_size_max"),
+                     value=70,
+                     step=1,
+                     label="Model Size (billions of parameters)",
+                     info="Number of parameters in billions",
+                 )
+
+             with gr.Accordion("Accelerator (GPU/TPU) Specifications", open=False):
+                 with gr.Row():
+                     accelerator_vendor = gr.Dropdown(
+                         choices=["Any"] + metadata.get("accelerator_vendors", []),
+                         label="Vendor",
+                         value="Any",
+                         info="Hardware manufacturer",
+                     )
+                     accelerator_model = gr.Dropdown(
+                         choices=["Any"] + metadata.get("accelerator_models", []),
+                         label="Model",
+                         value="Any",
+                         info="Specific accelerator model",
+                     )
+
+                 with gr.Row():
+                     min_gpu_memory = gr.Slider(
+                         minimum=metadata.get("gpu_memory_min"),
+                         maximum=metadata.get("gpu_memory_max"),
+                         value=metadata.get("gpu_memory_min"),
911
+ step=1,
912
+ label="Min GPU Memory (GB)",
913
+ info="Minimum GPU memory capacity needed",
914
+ )
915
+ max_gpu_memory = gr.Slider(
916
+ minimum=metadata.get("gpu_memory_min"),
917
+ maximum=metadata.get("gpu_memory_max"),
918
+ value=metadata.get("gpu_memory_max"),
919
+ step=1,
920
+ label="Max GPU Memory (GB)",
921
+ info="Maximum GPU memory capacity to consider",
922
+ )
923
+
924
+ with gr.Row():
925
+ interconnect = gr.Dropdown(
926
+ choices=["Any"] + metadata.get("interconnect_types", []),
927
+ label="Interconnect",
928
+ value="Any",
929
+ info="GPU-to-GPU connection type",
930
+ )
931
+
932
+ with gr.Row():
933
+ min_accelerators = gr.Slider(
934
+ minimum=metadata.get("min_accelerators"),
935
+ maximum=metadata.get("max_accelerators"),
936
+ value=metadata.get("min_accelerators"),
937
+ step=1,
938
+ label="Minimum Accelerators",
939
+ info="Minimum number of accelerators needed",
940
+ )
941
+ max_accelerators = gr.Slider(
942
+ minimum=metadata.get("min_accelerators"),
943
+ maximum=metadata.get("max_accelerators"),
944
+ value=metadata.get("max_accelerators"),
945
+ step=1,
946
+ label="Maximum Accelerators",
947
+ info="Maximum number of accelerators to consider",
948
+ )
949
+
950
+ with gr.Accordion("CPU & System Specifications", open=False):
951
+ with gr.Row():
952
+ cpu_vendor = gr.Dropdown(
953
+ choices=["Any"] + metadata.get("cpu_vendors", []),
954
+ label="CPU Vendor",
955
+ value="Any",
956
+ info="CPU manufacturer",
957
+ )
958
+ cpu_model = gr.Dropdown(
959
+ choices=["Any"] + metadata.get("cpu_models", []),
960
+ label="CPU Model",
961
+ value="Any",
962
+ info="Specific CPU model",
963
+ )
964
+
965
+ nodes = gr.Dropdown(
966
+ choices=["Any"] + [str(n) for n in metadata.get("node_counts", [])],
967
+ label="Number of Nodes",
968
+ value="Any",
969
+ info="Number of physical servers in the system",
970
+ )
971
+
972
+ with gr.Row():
973
+ min_cpu_memory = gr.Slider(
974
+ minimum=metadata.get("cpu_memory_min"),
975
+ maximum=metadata.get("cpu_memory_max"),
976
+ value=metadata.get("cpu_memory_min"),
977
+ step=1,
978
+ label="Min System Memory (GB)",
979
+ info="Minimum system RAM needed",
980
+ )
981
+ max_cpu_memory = gr.Slider(
982
+ minimum=metadata.get("cpu_memory_min"),
983
+ maximum=metadata.get("cpu_memory_max"),
984
+ value=metadata.get("cpu_memory_max"),
985
+ step=1,
986
+ label="Max System Memory (GB)",
987
+ info="Maximum system RAM to consider",
988
+ )
989
+
990
+ with gr.Accordion("Software Environment", open=False):
991
+ os = gr.Dropdown(
992
+ choices=["Any"] + metadata.get("operating_systems", []),
993
+ label="Operating System",
994
+ value="Any",
995
+ info="Host operating system",
996
+ )
997
+
998
+ frameworks = [
999
+ fw
1000
+ for fw in metadata.get("frameworks", [])
1001
+ if f"{fw}_versions" in metadata
1002
+ ]
1003
+ n_frameworks = len(frameworks)
1004
+ column_size = (n_frameworks + 1) // 2
1005
+
1006
+ framework_dropdowns = []
1007
+ with gr.Row():
1008
+ for i in range(0, 2):
1009
+ with gr.Column():
1010
+ start_idx = i * column_size
1011
+ end_idx = min((i + 1) * column_size, n_frameworks)
1012
+
1013
+ if start_idx < n_frameworks:
1014
+ column_frameworks = frameworks[start_idx:end_idx]
1015
+ for fw in column_frameworks:
1016
+ version_key = f"{fw}_versions"
1017
+ dropdown = gr.Dropdown(
1018
+ choices=["Any"] + metadata.get(version_key),
1019
+ label=fw,
1020
+ value="Any",
1021
+ info=f"Select {fw} framework version",
1022
+ )
1023
+ framework_dropdowns.append((fw, dropdown))
1024
+
1025
+ with gr.TabItem("Device Cost Settings 💰"):
1026
+ gr.Markdown(
1027
+ """
1028
+ ## Configure Device Hourly Costs
1029
+
1030
+ Customize the hourly cost (in USD) for each accelerator type. These values will be used to
1031
+ calculate the cost metrics for hardware configurations.
1032
+
1033
+ Default values may not reflect actual current market prices. Please adjust them according to your needs.
1034
+ """
1035
+ )
1036
+
1037
+ with gr.Column():
1038
+ with gr.Row():
1039
+ save_costs_button = gr.Button(
1040
+ "💾 Save Cost Settings", variant="primary"
1041
+ )
1042
+ reset_costs_button = gr.Button("↻ Reset to Defaults")
1043
+
1044
+ current_costs = get_device_costs()
1045
+ cost_data = pd.DataFrame(
1046
+ {
1047
+ "Device": list(current_costs.keys()),
1048
+ "Hourly Cost ($)": list(current_costs.values()),
1049
+ }
1050
+ ).sort_values("Device")
1051
+
1052
+ device_costs_df = gr.DataFrame(
1053
+ value=cost_data,
1054
+ datatype=["str", "number"],
1055
+ col_count=(2, "fixed"),
1056
+ interactive=True,
1057
+ wrap=True,
1058
+ show_copy_button=True,
1059
+ show_search="filter",
1060
+ )
1061
+
1062
+ costs_status = gr.Markdown("*Device costs ready for customization*")
1063
+
1064
+ def update_costs_callback(df):
1065
+ """Update device costs with values from dataframe."""
1066
+ if isinstance(df, list):
1067
+ new_costs = {
1068
+ row[0]: float(row[1]) for row in df if len(row) >= 2
1069
+ }
1070
+ else:
1071
+ new_costs = {
1072
+ df.loc[i, "Device"]: float(df.loc[i, "Hourly Cost ($)"])
1073
+ for i in range(len(df))
1074
+ }
1075
+
1076
+ update_device_costs(new_costs)
1077
+ return "*Device costs successfully updated!*"
1078
+
1079
+ def reset_costs_callback():
1080
+ """Reset all costs to defaults."""
1081
+ initialize_device_costs(pd_df)
1082
+ current_costs = get_device_costs()
1083
+ cost_data = pd.DataFrame(
1084
+ {
1085
+ "Device": list(current_costs.keys()),
1086
+ "Hourly Cost ($)": list(current_costs.values()),
1087
+ }
1088
+ ).sort_values("Device")
1089
+ return cost_data, "*Device costs reset to defaults*"
1090
+
1091
+ save_costs_button.click(
1092
+ fn=update_costs_callback,
1093
+ inputs=device_costs_df,
1094
+ outputs=costs_status,
1095
+ )
1096
+
1097
+ reset_costs_button.click(
1098
+ fn=reset_costs_callback,
1099
+ inputs=[],
1100
+ outputs=[device_costs_df, costs_status],
1101
+ )
1102
+
1103
+ with gr.Row():
1104
+ with gr.Accordion("Options", open=True):
1105
+ with gr.Row():
1106
+ include_predictions = gr.Checkbox(
1107
+ label="Include AI-generated predictions",
1108
+ value=True,
1109
+ info="When enabled, AI will predict performance for configurations not in the benchmark database",
1110
+ )
1111
+ optimization_metric = gr.Radio(
1112
+ choices=["performance", "cost"],
1113
+ label="Optimization Target",
1114
+ value="performance",
1115
+ info="Choose whether to optimize for highest performance or lowest cost per token",
1116
+ )
1117
+
1118
+ with gr.Row():
1119
+ search_button = gr.Button(
1120
+ "🔍 Search Configurations", variant="primary", scale=3
1121
+ )
1122
+
1123
+ with gr.Group():
1124
+ summary = gr.Markdown(
1125
+ "Enter your requirements and click 'Search Configurations' to find suitable hardware.",
1126
+ label="Summary",
1127
+ )
1128
+
1129
+ with gr.Tabs():
1130
+ with gr.TabItem("Top Configuration Details 🏆"):
1131
+ details = gr.DataFrame(
1132
+ headers=["Feature", "Value"],
1133
+ datatype=["str", "str"],
1134
+ label="Configuration Details",
1135
+ )
1136
+
1137
+ with gr.TabItem("All Matching Configurations 📊"):
1138
+ recommendations = gr.DataFrame(
1139
+ headers=[
1140
+ "System",
1141
+ "Accelerator",
1142
+ "Count",
1143
+ "Nodes",
1144
+ "GPU Memory (GB)",
1145
+ "Model",
1146
+ "Architecture",
1147
+ "Parameters (B)",
1148
+ "Weight Data Type",
1149
+ "Total Performance (Tokens/s)",
1150
+ "Per-GPU Performance (Tokens/s)",
1151
+ "Hourly Cost ($)",
1152
+ "Cost/Million Tokens",
1153
+ "Predicted",
1154
+ ],
1155
+ datatype=[
1156
+ "str",
1157
+ "str",
1158
+ "number",
1159
+ "number",
1160
+ "number",
1161
+ "str",
1162
+ "str",
1163
+ "number",
1164
+ "str",
1165
+ "number",
1166
+ "number",
1167
+ "number",
1168
+ "number",
1169
+ "str",
1170
+ ],
1171
+ label="Hardware Configurations",
1172
+ )
1173
+
1174
+ with gr.TabItem("ML Model Performance 📈"):
1175
+ gr.Markdown(
1176
+ """
1177
+ ## Model Performance Analysis
1178
+ This tab shows how well our machine learning model can predict performance for unseen hardware configurations.
1179
+ The evaluation is based on a test set that was not used to train the model.
1180
+
1181
+ **Hover over data points in the plots to see detailed information about each prediction.**
1182
+ """
1183
+ )
1184
+
1185
+ model_metrics = gr.Dataframe(
1186
+ headers=["Metric", "Value"],
1187
+ value=[
1188
+ ["Root Mean Squared Error (RMSE)", 0],
1189
+ ["Mean Absolute Error (MAE)", 0],
1190
+ ["R² Score", 0],
1191
+ ["Mean Absolute Percentage Error (MAPE)", 0],
1192
+ ],
1193
+ label="Model Performance Metrics",
1194
+ )
1195
+
1196
+ feature_importance_df = gr.Dataframe(
1197
+ headers=["Feature", "Importance"], label="Feature Importance"
1198
+ )
1199
+
1200
+ performance_plot = gr.Plot(
1201
+ label="Performance Visualization", elem_id="performance_plot"
1202
+ )
1203
+
1204
+ with gr.Row():
1205
+ gr.Markdown("## Top Configurations Comparison")
1206
+
1207
+ with gr.Row():
1208
+ top_n_configs = gr.Slider(
1209
+ minimum=1,
1210
+ maximum=100,
1211
+ value=10,
1212
+ step=1,
1213
+ label="Number of configurations to show",
1214
+ info="Adjust to see more or fewer configurations in the chart",
1215
+ )
1216
+
1217
+ with gr.Row():
1218
+ top_configs_chart = gr.Plot(label="")
1219
+
1220
+ current_configs_state = gr.State(pd.DataFrame())
1221
+
1222
+ all_inputs = [
1223
+ model_size,
1224
+ weight_data_type,
1225
+ architecture,
1226
+ accelerator_vendor,
1227
+ accelerator_model,
1228
+ min_gpu_memory,
1229
+ max_gpu_memory,
1230
+ interconnect,
1231
+ min_accelerators,
1232
+ max_accelerators,
1233
+ cpu_vendor,
1234
+ cpu_model,
1235
+ nodes,
1236
+ min_cpu_memory,
1237
+ max_cpu_memory,
1238
+ os,
1239
+ include_predictions,
1240
+ optimization_metric,
1241
+ top_n_configs,
1242
+ ]
1243
+
1244
+ framework_input_components = [dropdown for _, dropdown in framework_dropdowns]
1245
+
1246
+ def process_framework_inputs(*args):
1247
+ base_args = args[: -len(framework_dropdowns)]
1248
+ framework_args = args[-len(framework_dropdowns) :]
1249
+
1250
+ framework_versions = {}
1251
+ for (framework_name, _), version in zip(framework_dropdowns, framework_args):
1252
+ if version != "Any":
1253
+ framework_versions[framework_name] = version
1254
+
1255
+         opt_metric = base_args[17]  # optimization_metric is at index 17 of all_inputs
+
+         results = recommend_hardware(*base_args, **framework_versions)
+         recommendations_df, details_df, summary, top_chart = results
+
+         best_configs = find_best_configs(
+             {
+                 "model_size": base_args[0],
+                 "weight_data_type": base_args[1],
+                 "architecture": base_args[2],
+             },
+             constraints=get_constraints_from_args(*base_args),
+             include_predictions=base_args[16],  # include_predictions is at index 16 of all_inputs
+             optimization_metric=opt_metric,
+         )
+
+         return (
+             recommendations_df,
+             details_df,
+             summary,
+             top_chart,
+             best_configs,
+         )
+
+     def get_constraints_from_args(*args):
+         """Helper function to convert args to constraints dict."""
+         return {
+             "system.accelerator.vendor": args[3],
+             "system.accelerator.name": args[4],
+             "system.interconnect.accelerator": args[7],
+             "system.cpu.vendor": args[10],
+             "system.cpu.model": args[11],
+             "system.number_of_nodes": args[12] if args[12] != "Any" else None,
+             "software.operating_system": args[15],
+             "min_gpu_memory": args[5],
+             "max_gpu_memory": args[6],
+             "min_cpu_memory": args[13],
+             "max_cpu_memory": args[14],
+             "min_accelerators": args[8],
+             "max_accelerators": args[9],
+         }
+
+     def update_chart(n: int, configs_df: pd.DataFrame, metric: str) -> go.Figure:
+         """Update the configurations chart based on the slider value."""
+         return create_top_configs_plot(configs_df, metric, n)
+
+     search_button.click(
+         fn=process_framework_inputs,
+         inputs=all_inputs + framework_input_components,
+         outputs=[
+             recommendations,
+             details,
+             summary,
+             top_configs_chart,
+             current_configs_state,
+         ],
+         show_progress="full",
+     )
+
+     top_n_configs.change(
+         fn=update_chart,
+         inputs=[top_n_configs, current_configs_state, optimization_metric],
+         outputs=top_configs_chart,
+     )
+
+     def initial_load():
+         logger.info("Starting initial load of app")
+         default_values = []
+         for input_component in all_inputs:
+             default_values.append(input_component.value)
+
+         for _, dropdown in framework_dropdowns:
+             default_values.append(dropdown.value)
+
+         (
+             recommendations_df,
+             details_df,
+             summary_text,
+             top_chart,
+             best_configs,
+         ) = process_framework_inputs(*default_values)
+
+         if not recommendations_df.empty:
+             top_n_configs.maximum = min(100, len(recommendations_df))
+
+         if predictor:
+             logger.info("Predictor available, generating performance visualization")
+             try:
+                 plot_fig, metrics, feature_importance = create_model_performance_plot(
+                     predictor
+                 )
+
+                 metrics_df = pd.DataFrame(
+                     {
+                         "Metric": [
+                             "Root Mean Squared Error (RMSE)",
+                             "Mean Absolute Error (MAE)",
+                             "R² Score",
+                             "Mean Absolute Percentage Error (MAPE)",
+                         ],
+                         "Value": [
+                             f"{metrics.get('rmse', 0):.4f}",
+                             f"{metrics.get('mae', 0):.4f}",
+                             f"{metrics.get('r2', 0):.4f}",
+                             f"{metrics.get('mape', 0):.2f}%",
+                         ],
+                     }
+                 )
+                 logger.info(f"Created metrics_df with values: {metrics_df.to_dict()}")
+             except Exception as e:
+                 logger.error(f"Error creating performance plot: {e}", exc_info=True)
+                 plot_fig = go.Figure()
+                 metrics_df = pd.DataFrame(
+                     {
+                         "Metric": [
+                             "Root Mean Squared Error (RMSE)",
+                             "Mean Absolute Error (MAE)",
+                             "R² Score",
+                             "Mean Absolute Percentage Error (MAPE)",
+                         ],
+                         "Value": ["N/A", "N/A", "N/A", "N/A"],
+                     }
+                 )
+                 feature_importance = pd.DataFrame(columns=["Feature", "Importance"])
+         else:
+             logger.warning("No predictor available for initial load")
+             plot_fig = go.Figure()
+             plot_fig.update_layout(
+                 title="No model available",
+                 annotations=[
+                     dict(
+                         text="No prediction model available",
+                         showarrow=False,
+                         xref="paper",
+                         yref="paper",
+                         x=0.5,
+                         y=0.5,
+                     )
+                 ],
+             )
+
+             metrics_df = pd.DataFrame(
+                 {
+                     "Metric": [
+                         "Root Mean Squared Error (RMSE)",
+                         "Mean Absolute Error (MAE)",
+                         "R² Score",
+                         "Mean Absolute Percentage Error (MAPE)",
+                     ],
+                     "Value": ["N/A", "N/A", "N/A", "N/A"],
+                 }
+             )
+             feature_importance = pd.DataFrame(columns=["Feature", "Importance"])
+
+         logger.info("Completed initial load")
+         return (
+             recommendations_df,
+             details_df,
+             summary_text,
+             plot_fig,
+             metrics_df,
+             feature_importance,
+             top_chart,
+             best_configs,
+         )
+
+     interface.load(
+         fn=initial_load,
+         outputs=[
+             recommendations,
+             details,
+             summary,
+             performance_plot,
+             model_metrics,
+             feature_importance_df,
+             top_configs_chart,
+             current_configs_state,
+         ],
+         api_name=False,
+     )
+
+     gr.Markdown("---")
+     gr.HTML("""
+     <div style="text-align: center;">
+         Authors: <a href="https://www.linkedin.com/in/daltunay">Daniel Altunay</a> and
+         <a href="https://cKnowledge.org/gfursin">Grigori Fursin</a> (FCS Labs)
+     </div>
+     """)
+
+ if __name__ == "__main__":
+     interface.launch(share=False)
cost_calculator.py ADDED
@@ -0,0 +1,137 @@
+ """
+ Cost calculation module for MLPerf configurations.
+ """
+
+ import logging
+
+ import pandas as pd
+
+ logger = logging.getLogger(__name__)
+
+ DEFAULT_HOURLY_COST = 1.0
+
+ DEFAULT_DEVICE_COSTS = {
+     "NVIDIA H100": 3.00,
+     "NVIDIA H200": 4.00,
+     "NVIDIA GH200": 5.00,
+     "NVIDIA B200/GB200": 7.00,
+     "AMD MI300X": 3.50,
+     "AMD MI325X": 4.50,
+     "NVIDIA RTX 4090": 1.20,
+     "NVIDIA L40S": 1.80,
+     "NVIDIA Jetson AGX": 0.30,
+ }
+
+ device_costs = {}
+
+
+ def normalize_gpu_name(name: str) -> str:
+     """Normalize GPU names by identifying common patterns for the same device families."""
+     if not name:
+         return name
+
+     name_upper = name.upper()
+
+     gpu_families = {
+         "GH200": "NVIDIA GH200",  # checked before "H200", which is a substring of "GH200"
+         "GRACE HOPPER": "NVIDIA GH200",
+         "H100": "NVIDIA H100",
+         "H200": "NVIDIA H200",
+         "B200": "NVIDIA B200/GB200",
+         "GB200": "NVIDIA B200/GB200",
+         "MI300X": "AMD MI300X",
+         "MI325X": "AMD MI325X",
+         "RTX 4090": "NVIDIA RTX 4090",
+         "L40S": "NVIDIA L40S",
+     }
+
+     if "JETSON" in name_upper and ("ORIN" in name_upper or "THOR" in name_upper):
+         return "NVIDIA Jetson AGX"
+
+     for keyword, normalized_name in gpu_families.items():
+         if keyword in name_upper:
+             return normalized_name
+
+     return name
+
+
+ def initialize_device_costs(df: pd.DataFrame) -> None:
+     """Initialize device costs from dataset with default values."""
+     global device_costs
+
+     accelerators = set()
+
+     if df is not None and not df.empty and "system.accelerator.name" in df.columns:
+         for acc in df["system.accelerator.name"].dropna().unique():
+             normalized_name = normalize_gpu_name(acc)
+             accelerators.add(normalized_name)
+
+     device_costs = {}
+     for device in accelerators:
+         if device in DEFAULT_DEVICE_COSTS:
+             device_costs[device] = DEFAULT_DEVICE_COSTS[device]
+         else:
+             device_costs[device] = DEFAULT_HOURLY_COST
+
+     logger.info(f"Initialized costs for {len(device_costs)} unique device families")
+
+
+ def get_device_costs() -> dict[str, float]:
+     """Return a copy of the current device costs."""
+     return device_costs.copy()
+
+
+ def update_device_costs(new_costs: dict[str, float]) -> None:
+     """Update device costs with new values."""
+     global device_costs
+     device_costs.update(new_costs)
+     logger.info(f"Updated costs for {len(new_costs)} devices")
+
+
+ def calculate_costs(df: pd.DataFrame) -> pd.DataFrame:
+     """Add cost metrics to the DataFrame."""
+     if df is None or df.empty:
+         return df
+
+     result_df = df.copy()
+
+     result_df["hourly_cost"] = None
+     result_df["cost_per_million_tokens"] = None
+
+     for idx, row in result_df.iterrows():
+         hourly_cost = estimate_hourly_cost(row)
+         result_df.at[idx, "hourly_cost"] = hourly_cost
+
+         if hourly_cost and "metrics.result" in row and row["metrics.result"]:
+             tokens_per_hour = row["metrics.result"] * 3600
+             if tokens_per_hour > 0:
+                 cost_per_million = (hourly_cost / tokens_per_hour) * 1000000
+                 result_df.at[idx, "cost_per_million_tokens"] = cost_per_million
+
+     return result_df
+
+
+ def estimate_hourly_cost(row: pd.Series) -> float | None:
+ """Estimate hourly cost for a single configuration."""
116
+ try:
117
+ acc_name = row.get("system.accelerator.name")
118
+ acc_vendor = row.get("system.accelerator.vendor")
119
+ acc_count = row.get("system.accelerator.total_count")
120
+
121
+         if not acc_count or pd.isna(acc_count):  # guard against missing or NaN counts
+             return None
+
+         base_cost = DEFAULT_HOURLY_COST
+
+         if acc_name:
+             normalized_name = normalize_gpu_name(acc_name)
+             if normalized_name in device_costs:
+                 base_cost = device_costs[normalized_name]
+         elif acc_vendor and acc_vendor in device_costs:
+             base_cost = device_costs[acc_vendor]
+
+         return base_cost * acc_count
+
+     except Exception as e:
+         logger.warning(f"Error calculating cost: {e}")
+         return None
data.json ADDED
The diff for this file is too large to render. See raw diff
 
predictor.py ADDED
@@ -0,0 +1,900 @@
1
+ """Simplified performance predictor for MLPerf configurations using XGBoost."""
2
+
3
+ import logging
4
+ import random
5
+ from collections import Counter, defaultdict
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ import xgboost as xgb
10
+ from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
11
+ from sklearn.model_selection import train_test_split
12
+ from utils import FEATURE_TYPES
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ class PerformancePredictor:
18
+ """Predicts performance for hardware configurations."""
19
+
20
+ def __init__(self, dataset: pd.DataFrame, test_size: float = 0.2):
21
+ """Initialize with benchmark dataset."""
22
+ self.df = dataset
23
+ self.model = None
24
+ self.target = "metrics.result_per_accelerator"
25
+ self.features = []
26
+ self.test_size = test_size
27
+
28
+ self.evaluation_data = pd.DataFrame()
29
+ self.evaluation_metrics = {}
30
+ self.feature_importance = pd.DataFrame(columns=["Feature", "Importance"])
31
+
32
+ self.excluded_features = {
33
+ "model.name",
34
+ "model.mlperf_name",
35
+ "software.framework",
36
+ "system.name",
37
+ }
38
+
39
+ self.excluded_features.update(
40
+ {
41
+ col
42
+ for col in dataset.columns
43
+ if col.startswith("submission.") or col.startswith("metrics.")
44
+ }
45
+ )
46
+
47
+ self.distributions = {}
48
+
49
+ self.max_accelerators = int(dataset["system.accelerator.total_count"].max())
50
+ self.max_gpu_memory = float(dataset["system.accelerator.memory_capacity"].max())
51
+ self.max_cpu_memory = float(dataset["system.memory.capacity"].max())
52
+
53
+ self.frameworks = sorted(
54
+ list(
55
+ set(
56
+ col.replace("software.framework.", "")
57
+ for col in dataset.columns
58
+ if col.startswith("software.framework.")
59
+ and col != "software.framework"
60
+ )
61
+ )
62
+ )
63
+ logger.info(
64
+ f"Found {len(self.frameworks)} unique frameworks: {', '.join(self.frameworks)}"
65
+ )
66
+
67
+ self._identify_features()
68
+ self._analyze_data_distributions()
69
+ self._train_model()
70
+
71
+ def _identify_features(self):
72
+ """Identify features for model training."""
73
+ all_columns = set(self.df.columns)
74
+ available_features = all_columns - self.excluded_features - {self.target}
75
+ self.features = [f for f in available_features if not self.df[f].isna().all()]
76
+ logger.info(f"Identified {len(self.features)} features for model training")
77
+
78
+ def _analyze_data_distributions(self):
79
+ """Analyze feature distributions for realistic data generation."""
80
+ categorical_features = {
81
+ col
82
+ for col in self.df.columns
83
+ if self.df[col].dtype == "object"
84
+ or col in FEATURE_TYPES.get("categorical", [])
85
+ }
86
+
87
+ for feature in categorical_features:
88
+ values = self.df[feature].dropna().tolist()
89
+ if values:
90
+ counter = Counter(values)
91
+ total = sum(counter.values())
92
+ self.distributions[feature] = {
93
+ value: count / total for value, count in counter.items()
94
+ }
95
+
96
+ continuous_features = {
97
+ col
98
+ for col in self.df.columns
99
+ if col in FEATURE_TYPES.get("continuous", [])
100
+ or pd.api.types.is_numeric_dtype(self.df[col].dtype)
101
+ if col not in categorical_features and not col.startswith("metrics.")
102
+ }
103
+
104
+ for feature in continuous_features:
105
+ values = self.df[feature].dropna()
106
+ if len(values) > 0:
107
+ self.distributions[feature] = {
108
+ "min": float(values.min()),
109
+ "max": float(values.max()),
110
+ "mean": float(values.mean()),
111
+ "std": float(values.std()),
112
+ "median": float(values.median()),
113
+ "values": sorted(values.unique().tolist()),
114
+ }
115
+
116
+ self._analyze_feature_relationships()
117
+ logger.info(f"Analyzed distributions for {len(self.distributions)} features")
118
+
119
+ def _analyze_feature_relationships(self):
120
+ """Analyze relationships between related features."""
121
+ self._analyze_vendor_accelerator_relations()
122
+ self._analyze_vendor_cpu_relations()
123
+ self._analyze_accelerator_memory_relations()
124
+ self._analyze_interconnect_relations()
125
+ self._analyze_software_relations()
126
+ self._analyze_device_counts()
127
+
128
+ def _analyze_vendor_accelerator_relations(self):
129
+ """Map vendors to their accelerators."""
130
+ vendor_accelerators = defaultdict(list)
131
+ for _, row in self.df.iterrows():
132
+ vendor = row.get("system.accelerator.vendor")
133
+ acc = row.get("system.accelerator.name")
134
+ if vendor and acc:
135
+ vendor_accelerators[vendor].append(acc)
136
+
137
+ self.distributions["vendor_accelerators"] = {}
138
+ for vendor, accelerators in vendor_accelerators.items():
139
+ counter = Counter(accelerators)
140
+ total = sum(counter.values())
141
+ self.distributions["vendor_accelerators"][vendor] = {
142
+ acc: count / total for acc, count in counter.items()
143
+ }
144
+
145
+ def _analyze_vendor_cpu_relations(self):
146
+ """Map CPU vendors to their models."""
147
+ vendor_cpus = defaultdict(list)
148
+ for _, row in self.df.iterrows():
149
+ vendor = row.get("system.cpu.vendor")
150
+ model = row.get("system.cpu.model")
151
+ if vendor and model:
152
+ vendor_cpus[vendor].append(model)
153
+
154
+ self.distributions["vendor_cpus"] = {}
155
+ for vendor, models in vendor_cpus.items():
156
+ counter = Counter(models)
157
+ total = sum(counter.values())
158
+ self.distributions["vendor_cpus"][vendor] = {
159
+ model: count / total for model, count in counter.items()
160
+ }
161
+
162
+ def _analyze_accelerator_memory_relations(self):
163
+ """Map accelerator models to memory capacities."""
164
+ acc_memory = defaultdict(list)
165
+ for _, row in self.df.iterrows():
166
+ acc = row.get("system.accelerator.name")
167
+ memory = row.get("system.accelerator.memory_capacity")
168
+ if acc and memory:
169
+ acc_memory[acc].append(memory)
170
+
171
+ self.distributions["accelerator_memory"] = {}
172
+ for acc, memories in acc_memory.items():
173
+ if memories:
174
+             counter = Counter(memories)
+             most_common = counter.most_common(1)[0][0] if counter else None
+             self.distributions["accelerator_memory"][acc] = {
+                 "min": min(memories),
+                 "max": max(memories),
+                 "mean": sum(memories) / len(memories),
+                 "most_common": most_common,
+                 "values": sorted(set(memories)),
+             }
+
+     def _analyze_interconnect_relations(self):
+         """Map vendors to interconnect types."""
+         vendor_interconnects = defaultdict(list)
+         for _, row in self.df.iterrows():
+             vendor = row.get("system.accelerator.vendor")
+             interconnect = row.get("system.interconnect.accelerator")
+             if vendor and interconnect:
+                 vendor_interconnects[vendor].append(interconnect)
+
+         self.distributions["vendor_interconnects"] = {}
+         for vendor, interconnects in vendor_interconnects.items():
+             counter = Counter(interconnects)
+             total = sum(counter.values())
+             self.distributions["vendor_interconnects"][vendor] = {
+                 ic: count / total for ic, count in counter.items()
+             }
+
+     def _analyze_software_relations(self):
+         """Map vendors to software stacks."""
+         vendor_software = defaultdict(lambda: defaultdict(list))
+         for _, row in self.df.iterrows():
+             vendor = row.get("system.accelerator.vendor")
+             if not vendor:
+                 continue
+
+             os = row.get("software.operating_system")
+             if os:
+                 vendor_software[vendor]["os"].append(os)
+
+             for col in self.df.columns:
+                 if (
+                     col.startswith("software.framework.")
+                     and col != "software.framework"
+                 ):
+                     framework = col.replace("software.framework.", "")
+                     version = row.get(col)
+                     if version:
+                         vendor_software[vendor][framework].append(version)
+
+         self.distributions["vendor_software"] = {}
+         for vendor, software_dict in vendor_software.items():
+             self.distributions["vendor_software"][vendor] = {}
+             for software_type, values in software_dict.items():
+                 counter = Counter(values)
+                 total = sum(counter.values())
+                 self.distributions["vendor_software"][vendor][software_type] = {
+                     value: count / total for value, count in counter.items()
+                 }
+
+     def _analyze_device_counts(self):
+         """Analyze distribution of device counts."""
+         counts = self.df["system.accelerator.total_count"].dropna().astype(int).tolist()
+         if counts:
+             counter = Counter(counts)
+             total = sum(counter.values())
+             self.distributions["device_count"] = {
+                 count: freq / total for count, freq in counter.items()
+             }
+             self.distributions["device_count_values"] = sorted(set(counts))
+
+         node_counts = self.df["system.number_of_nodes"].dropna().astype(int).tolist()
+         if node_counts:
+             counter = Counter(node_counts)
+             total = sum(counter.values())
+             self.distributions["node_count"] = {
+                 count: freq / total for count, freq in counter.items()
+             }
+             self.distributions["node_count_values"] = sorted(set(node_counts))
+
+         device_node_pairs = [
+             (
+                 int(row["system.number_of_nodes"]),
+                 int(row["system.accelerator.total_count"]),
+             )
+             for _, row in self.df.iterrows()
+             if row.get("system.number_of_nodes")
+             and row.get("system.accelerator.total_count")
+         ]
+
+         node_to_devices = defaultdict(list)
+         for nodes, devices in device_node_pairs:
+             node_to_devices[nodes].append(devices)
+
+         self.distributions["node_device_relation"] = {}
+         for node_count, device_counts in node_to_devices.items():
+             counter = Counter(device_counts)
+             total = sum(counter.values())
+             self.distributions["node_device_relation"][node_count] = {
+                 count: freq / total for count, freq in counter.items()
+             }
+
+     def _train_model(self):
+         """Train XGBoost model on available data with train/test split."""
+         df_clean = self.df.dropna(subset=[self.target])
+
+         X = df_clean[self.features]
+         y = df_clean[self.target]
+
+         for col in X.select_dtypes(include=["object"]).columns:
+             with pd.option_context("mode.chained_assignment", None):
+                 X[col] = X[col].astype("category")
+
+         try:
+             strat_column = df_clean["system.accelerator.name"].fillna("unknown")
+             X_train, X_test, y_train, y_test = train_test_split(
+                 X, y, test_size=self.test_size, stratify=strat_column, random_state=42
+             )
+             logger.info(
+                 f"Created stratified train/test split ({100 - self.test_size * 100:.0f}%/{self.test_size * 100:.0f}%) with {len(X_train)} training and {len(X_test)} test samples"
+             )
+         except ValueError:
+             X_train, X_test, y_train, y_test = train_test_split(
+                 X, y, test_size=self.test_size, random_state=42
+             )
+             logger.info(
+                 f"Created regular train/test split with {len(X_train)} training and {len(X_test)} test samples"
+             )
+
+         self.model = xgb.XGBRegressor(
+             objective="reg:squarederror",
+             n_estimators=100,
+             max_depth=6,
+             learning_rate=0.1,
+             subsample=0.8,
+             enable_categorical=True,
+         )
+
+         self.model.fit(X_train, y_train)
+         logger.info(f"Trained XGBoost model on {len(X_train)} rows")
+
+         self._evaluate_model(X_test, y_test, df_clean.loc[X_test.index])
+
+     def _evaluate_model(self, X_test, y_test, test_df):
+         """Evaluate model performance on test set."""
+         if X_test.empty:
+             logger.warning("No test data available for evaluation")
+             return
+
+         y_pred = self.model.predict(X_test)
+
+         rmse = np.sqrt(mean_squared_error(y_test, y_pred))
+         mae = mean_absolute_error(y_test, y_pred)
+         r2 = r2_score(y_test, y_pred)
+
+         mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
+
+         self.evaluation_metrics = {
+             "rmse": rmse,
+             "mae": mae,
+             "r2": r2,
+             "mape": mape,
+             "test_size": len(y_test),
+             "training_size": len(self.df) - len(y_test),
+         }
+
+         eval_data = test_df[
+             [
+                 "system.accelerator.name",
+                 "system.accelerator.vendor",
+                 "system.accelerator.total_count",
+             ]
+         ].copy()
+         eval_data["actual"] = y_test
+         eval_data["predicted"] = y_pred
+         eval_data["error"] = y_pred - y_test
+         eval_data["error_percent"] = (y_pred - y_test) / y_test * 100
+
+         self.evaluation_data = eval_data.copy()
+
+         logger.info(
+             f"Model evaluation - RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.3f}, MAPE: {mape:.2f}%"
+         )
+         logger.info(
+             f"Evaluation data shape: {eval_data.shape}, with columns: {list(eval_data.columns)}"
+         )
+         logger.info(f"Evaluation data sample: {eval_data.head(2).to_dict()}")
+         logger.info(
+             f"Evaluation data stored as class attribute with shape: {self.evaluation_data.shape}"
+         )
+
+         importance = self.model.feature_importances_
+         feature_importance = pd.DataFrame(
+             {"Feature": self.model.feature_names_in_, "Importance": importance}
+         ).sort_values("Importance", ascending=False)
+         self.feature_importance = feature_importance.head(10).copy()
+
+         logger.info(
+             f"Top 5 important features: {', '.join(self.feature_importance['Feature'].head(5).tolist())}"
+         )
+
+     def get_evaluation_metrics(self) -> dict:
+         """Return model evaluation metrics."""
+         logger.info(f"Getting evaluation metrics: {self.evaluation_metrics}")
+         return self.evaluation_metrics.copy() if self.evaluation_metrics else {}
+
+     def get_evaluation_data(self) -> pd.DataFrame | None:
+         """Return evaluation data for visualization, or None if empty."""
+         data_shape = (
+             "empty" if self.evaluation_data.empty else self.evaluation_data.shape
+         )
+         logger.info(f"Getting evaluation data with shape: {data_shape}")
+         return self.evaluation_data.copy() if not self.evaluation_data.empty else None
+
+     def get_feature_importance(self) -> pd.DataFrame:
+         """Return feature importance data."""
+         logger.info(
+             f"Getting feature importance with shape: {self.feature_importance.shape}"
+         )
+         return (
+             self.feature_importance.copy()
+             if not self.feature_importance.empty
+             else pd.DataFrame(columns=["Feature", "Importance"])
+         )
+
+     def generate_predictions(
+         self,
+         architecture: str,
+         parameters: float,
+         constraints: dict = None,
+         num_configs: int = 10,
+     ) -> pd.DataFrame:
+         """Generate and predict performance for hardware configurations."""
+         constraints = constraints or {}
+         logger.info(
+             f"Generating {num_configs} predictions for {architecture} model with {parameters}B parameters"
+         )
+
+         configs = self._generate_configs(
+             architecture, parameters, constraints, num_configs
+         )
+         if not configs:
+             return pd.DataFrame()
+
+         configs_df = pd.DataFrame(configs)
+         model_features = getattr(self.model, "feature_names_in_", self.features)
+
+         for feature in model_features:
+             if feature not in configs_df.columns:
+                 configs_df[feature] = None
+
+         X_pred = configs_df[model_features]
+         for col in X_pred.select_dtypes(include=["object"]).columns:
+             with pd.option_context("mode.chained_assignment", None):
+                 X_pred[col] = X_pred[col].astype("category")
+
+         configs_df[self.target] = self.model.predict(X_pred)
+         configs_df["predicted"] = True
+         configs_df["metrics.result"] = (
+             configs_df[self.target] * configs_df["system.accelerator.total_count"]
+         )
+         configs_df["system.name"] = "Hypothetical system - ongoing work"
+
+         logger.info(
+             f"Performance range: {configs_df[self.target].min():.2f} - {configs_df[self.target].max():.2f} tokens/s per accelerator"
+         )
+         return configs_df
+
+     def _sample_from_distribution(self, distribution: dict) -> any:
+         """Sample a value from a categorical distribution."""
+         items = list(distribution.keys())
+         probabilities = list(distribution.values())
+         return np.random.choice(items, p=probabilities)
+
+     def _sample_continuous_value(self, feature: str) -> float:
+         """Sample a continuous value from the feature distribution."""
+         dist = self.distributions[feature]
+
+         if "values" in dist and dist["values"]:
+             if len(dist["values"]) > 3:
+                 value = np.random.normal(dist["mean"], max(dist["std"], 1.0))
+                 value = max(dist["min"], min(dist["max"], value))
+                 closest_idx = min(
+                     range(len(dist["values"])),
+                     key=lambda i: abs(dist["values"][i] - value),
+                 )
+                 return dist["values"][closest_idx]
+             else:
+                 return random.choice(dist["values"])
+
+         elif all(k in dist for k in ["min", "max", "mean", "std"]):
+             value = np.random.normal(dist["mean"], max(dist["std"], 1.0))
+             return max(dist["min"], min(dist["max"], value))
+
+         return np.random.uniform(dist["min"], dist["max"])
+
+     def _get_device_count(self, min_devices=None, max_devices=None) -> int:
+         """Get a realistic device count based on distribution and constraints."""
+         valid_counts = [
+             count
+             for count in self.distributions["device_count_values"]
+             if (min_devices is None or count >= min_devices)
+             and (max_devices is None or count <= max_devices)
+         ]
+
+         if valid_counts:
+             probs = {
+                 count: self.distributions["device_count"][count]
+                 for count in valid_counts
+                 if count in self.distributions["device_count"]
+             }
+
+             if probs:
+                 total = sum(probs.values())
+                 items = list(probs.keys())
+                 weights = [probs[item] / total for item in items]
+                 return np.random.choice(items, p=weights)
+
+             return random.choice(valid_counts)
+
+         if min_devices is not None and max_devices is not None:
+             valid_powers = [
+                 2**i for i in range(10) if min_devices <= 2**i <= max_devices
+             ]
+             if valid_powers:
+                 return random.choice(valid_powers)
+             return random.randint(min_devices, max_devices)
+
+         return random.choice([1, 2, 4, 8, 16])
+
+     def _get_vendor_accelerator(self, vendor=None) -> tuple:
+         """Get a vendor and accelerator pair."""
+         if vendor is None or vendor == "Any":
+             vendor = self._sample_from_distribution(
+                 self.distributions["system.accelerator.vendor"]
+             )
+
+         if vendor in self.distributions["vendor_accelerators"]:
+             accelerator = self._sample_from_distribution(
+                 self.distributions["vendor_accelerators"][vendor]
+             )
+         else:
+             accelerator = self._sample_from_distribution(
+                 self.distributions["system.accelerator.name"]
+             )
+
+         return vendor, accelerator
+
+     def _get_memory_for_accelerator(
+         self, vendor: str, accelerator: str, min_memory=None, max_memory=None
+     ) -> float:
+         """Get appropriate memory capacity for a given accelerator."""
+         if accelerator in self.distributions["accelerator_memory"]:
+             mem_dist = self.distributions["accelerator_memory"][accelerator]
+
+             if "values" in mem_dist:
+                 valid_values = [
+                     m
+                     for m in mem_dist["values"]
+                     if (min_memory is None or m >= min_memory)
+                     and (max_memory is None or m <= max_memory)
+                 ]
+                 if valid_values:
+                     return random.choice(valid_values)
+
+             if "most_common" in mem_dist:
+                 most_common = mem_dist["most_common"]
+                 if (min_memory is None or most_common >= min_memory) and (
+                     max_memory is None or most_common <= max_memory
+                 ):
+                     return most_common
+
+         dist = self.distributions["system.accelerator.memory_capacity"]
+         valid_values = [
+             m
+             for m in dist["values"]
+             if (min_memory is None or m >= min_memory)
+             and (max_memory is None or m <= max_memory)
+         ]
+
+         if valid_values:
+             return random.choice(valid_values)
+
+         min_val = max(dist["min"], min_memory or dist["min"])
+         max_val = min(dist["max"], max_memory or dist["max"])
+
+         if min_val <= max_val:
+             mean = min(max(dist["mean"], min_val), max_val)
+             std = max(dist["std"], 1.0)
+
+             for _ in range(5):
+                 value = np.random.normal(mean, std)
+                 if min_val <= value <= max_val:
+                     return value
+
+             return np.random.uniform(min_val, max_val)
+
+         return None
+
+     def _get_node_config(self, total_devices: int) -> tuple:
+         """Determine number of nodes and devices per node."""
+         VALID_GPUS_PER_NODE = [1, 2, 4, 8]
+
+         for gpus_per_node in sorted(VALID_GPUS_PER_NODE, reverse=True):
+             if total_devices % gpus_per_node == 0:
+                 return total_devices // gpus_per_node, gpus_per_node
+
+         for gpus_per_node in sorted(VALID_GPUS_PER_NODE, reverse=True):
+             if gpus_per_node <= total_devices:
+                 nodes = total_devices // gpus_per_node
+                 return nodes, gpus_per_node
+
+         return 1, 1
+
+     def _get_cpu_config(self) -> dict:
+         """Generate a CPU configuration."""
+         cpu_config = {}
+         cpu_config["system.cpu.vendor"] = self._sample_from_distribution(
+             self.distributions["system.cpu.vendor"]
+         )
+
+         cpu_vendor = cpu_config["system.cpu.vendor"]
+         if cpu_vendor in self.distributions["vendor_cpus"]:
+             cpu_config["system.cpu.model"] = self._sample_from_distribution(
+                 self.distributions["vendor_cpus"][cpu_vendor]
+             )
+         else:
+             cpu_config["system.cpu.model"] = self._sample_from_distribution(
+                 self.distributions["system.cpu.model"]
+             )
+
+         for feature in [
+             "system.cpu.core_count",
+             "system.cpu.count_per_node",
+             "system.cpu.frequency",
+         ]:
+             value = self._sample_continuous_value(feature)
+             if value is not None:
+                 if feature in ["system.cpu.core_count", "system.cpu.count_per_node"]:
+                     value = int(value)
+                 cpu_config[feature] = value
+
+         if "system.cpu.caches" in self.distributions:
+             cpu_config["system.cpu.caches"] = self._sample_from_distribution(
+                 self.distributions["system.cpu.caches"]
+             )
+
+         return cpu_config
+
+     def _get_software_config(self, vendor: str, constraints=None) -> dict:
+         """Generate a software configuration based on hardware vendor."""
+         constraints = constraints or {}
+         software_config = {}
+
+         if vendor in self.distributions["vendor_software"]:
+             vendor_sw = self.distributions["vendor_software"][vendor]
+
+             if "os" in vendor_sw:
+                 os_constraint = constraints.get("software.operating_system")
+                 if os_constraint and os_constraint != "Any":
+                     software_config["software.operating_system"] = os_constraint
+                 else:
+                     software_config["software.operating_system"] = (
+                         self._sample_from_distribution(vendor_sw["os"])
+                     )
+
+             for framework, versions in vendor_sw.items():
+                 if framework != "os":
+                     framework_key = f"software.framework.{framework}"
+                     version_constraint = constraints.get(framework_key)
+                     if version_constraint and version_constraint != "Any":
+                         software_config[framework_key] = version_constraint
+                     else:
+                         software_config[framework_key] = self._sample_from_distribution(
+                             versions
+                         )
+
+         if (
+             "software.operating_system" not in software_config
+             and "software.operating_system" in self.distributions
+         ):
+             os_constraint = constraints.get("software.operating_system")
+             if os_constraint and os_constraint != "Any":
+                 software_config["software.operating_system"] = os_constraint
+             else:
+                 software_config["software.operating_system"] = (
+                     self._sample_from_distribution(
+                         self.distributions["software.operating_system"]
+                     )
+                 )
+
+         for framework in self.frameworks:
+             framework_key = f"software.framework.{framework}"
+             if (
+                 framework_key not in software_config
+                 and framework_key in self.distributions
+             ):
+                 version_constraint = constraints.get(framework_key)
+                 if version_constraint and version_constraint != "Any":
+                     software_config[framework_key] = version_constraint
+                 else:
+                     software_config[framework_key] = self._sample_from_distribution(
+                         self.distributions[framework_key]
+                     )
+
+         return software_config
+
+     def _get_memory_config(self, min_memory=None, max_memory=None) -> dict:
+         """Generate a memory configuration."""
+         memory_config = {}
+         dist = self.distributions["system.memory.capacity"]
+
+         if "values" in dist:
+             valid_values = [
+                 m
+                 for m in dist["values"]
+                 if (min_memory is None or m >= min_memory)
+                 and (max_memory is None or m <= max_memory)
+             ]
+             if valid_values:
+                 memory_config["system.memory.capacity"] = random.choice(valid_values)
+
+         if "system.memory.capacity" not in memory_config:
+             min_val = max(dist["min"], min_memory or dist["min"])
+             max_val = min(dist["max"], max_memory or dist["max"])
+
+             if min_val <= max_val:
+                 mean = min(max(dist["mean"], min_val), max_val)
+                 std = max(dist["std"], (max_val - min_val) / 6.0)
+
+                 value = np.random.normal(mean, std)
+                 if min_val <= value <= max_val:
+                     memory_config["system.memory.capacity"] = value
+                 else:
+                     memory_config["system.memory.capacity"] = np.random.uniform(
+                         min_val, max_val
+                     )
+
+         if "system.memory.configuration" in self.distributions:
+             memory_config["system.memory.configuration"] = (
+                 self._sample_from_distribution(
+                     self.distributions["system.memory.configuration"]
+                 )
+             )
+
+         return memory_config
+
+     def _get_interconnect_config(self, vendor: str) -> dict:
+         """Generate interconnect configuration based on vendor."""
+         interconnect_config = {}
+
+         if vendor in self.distributions["vendor_interconnects"]:
+             interconnect_config["system.interconnect.accelerator"] = (
+                 self._sample_from_distribution(
+                     self.distributions["vendor_interconnects"][vendor]
+                 )
+             )
+         elif "system.interconnect.accelerator" in self.distributions:
+             interconnect_config["system.interconnect.accelerator"] = (
+                 self._sample_from_distribution(
+                     self.distributions["system.interconnect.accelerator"]
+                 )
+             )
+
+         if "system.interconnect.accelerator_host" in self.distributions:
+             interconnect_config["system.interconnect.accelerator_host"] = (
+                 self._sample_from_distribution(
+                     self.distributions["system.interconnect.accelerator_host"]
+                 )
+             )
+
+         return interconnect_config
+
+     def _generate_configs(
+         self, architecture: str, parameters: float, constraints=None, count: int = 10
+     ) -> list:
+         """Generate configurations respecting user constraints."""
+         constraints = constraints or {}
+         configs = []
+
+         vendor = constraints.get("system.accelerator.vendor")
+         acc_name = constraints.get("system.accelerator.name")
+
+         def apply_margin(value, is_min=True):
+             if value is None or not isinstance(value, (int, float)) or value == "Any":
+                 return None
+             return value * (0.9 if is_min else 1.1)
+
+         min_gpu_memory = apply_margin(constraints.get("min_gpu_memory"), is_min=True)
+         max_gpu_memory = apply_margin(
+             constraints.get("max_gpu_memory"), is_min=False
+         ) or (self.max_gpu_memory * 1.1)
+
+         min_cpu_memory = apply_margin(constraints.get("min_cpu_memory"), is_min=True)
+         max_cpu_memory = apply_margin(
+             constraints.get("max_cpu_memory"), is_min=False
+         ) or (self.max_cpu_memory * 1.1)
+
+         min_devices = apply_margin(constraints.get("min_accelerators"), is_min=True)
+         max_devices = (
+             apply_margin(constraints.get("max_accelerators"), is_min=False)
+             or self.max_accelerators
+         )
+
+         interconnect = constraints.get("system.interconnect.accelerator")
+         nodes = constraints.get("system.number_of_nodes")
+
+         VALID_GPUS_PER_NODE = [1, 2, 4, 8]
+
+         for _ in range(count * 3):
+             if len(configs) >= count:
+                 break
+
+             device_count = self._get_device_count(min_devices, max_devices)
+             acc_vendor, acc_model = self._get_vendor_accelerator(vendor)
+
+             if acc_name and acc_name != "Any":
+                 acc_model = acc_name
+
+             if nodes and nodes != "Any":
+                 node_count = int(nodes)
+                 valid_device_counts = []
+                 for gpus in VALID_GPUS_PER_NODE:
+                     if node_count * gpus >= (
+                         min_devices or 1
+                     ) and node_count * gpus <= (max_devices or float("inf")):
+                         valid_device_counts.append(gpus)
+
+                 if not valid_device_counts:
+                     continue
+
+                 devices_per_node = random.choice(valid_device_counts)
+                 device_count = node_count * devices_per_node
+             else:
+                 valid_count = False
+                 for gpus_per_node in sorted(VALID_GPUS_PER_NODE, reverse=True):
+                     if device_count % gpus_per_node == 0:
+                         valid_count = True
+                         break
+
+                 if not valid_count:
+                     node_count, devices_per_node = self._get_node_config(device_count)
+                     device_count = node_count * devices_per_node
+                 else:
+                     node_count, devices_per_node = (
+                         device_count // gpus_per_node,
+                         gpus_per_node,
+                     )
+
+             config = {
+                 "model.architecture": architecture,
+                 "model.number_of_parameters": parameters,
+                 "system.accelerator.vendor": acc_vendor,
+                 "system.accelerator.name": acc_model,
+                 "system.accelerator.total_count": device_count,
+                 "system.number_of_nodes": node_count,
+                 "system.accelerator.count_per_node": devices_per_node,
+             }
+
+             gpu_memory = self._get_memory_for_accelerator(
+                 acc_vendor,
+                 acc_model,
+                 min_memory=min_gpu_memory,
+                 max_memory=max_gpu_memory,
+             )
+
+             if gpu_memory is None:
+                 continue
+
+             config["system.accelerator.memory_capacity"] = gpu_memory
+
+             if "system.accelerator.memory_config" in self.distributions:
+                 config["system.accelerator.memory_config"] = (
+                     self._sample_from_distribution(
+                         self.distributions["system.accelerator.memory_config"]
+                     )
+                 )
+
+             interconnect_config = self._get_interconnect_config(acc_vendor)
+
+             if interconnect and interconnect != "Any":
+                 interconnect_config["system.interconnect.accelerator"] = interconnect
+
+             config.update(interconnect_config)
+             config.update(self._get_cpu_config())
+
+             memory_config = self._get_memory_config(
+                 min_memory=min_cpu_memory, max_memory=max_cpu_memory
+             )
+             if "system.memory.capacity" not in memory_config:
+                 continue
+
+             config.update(memory_config)
+
+             for feature_name in [
+                 "system.type",
+                 "system.cooling",
+                 "model.weight_data_types",
+             ]:
+                 if feature_name in self.distributions:
+                     config[feature_name] = self._sample_from_distribution(
+                         self.distributions[feature_name]
+                     )
+
+             config.update(self._get_software_config(acc_vendor, constraints))
+
+             for key, value in constraints.items():
+                 if (
+                     not key.startswith("software.framework.")
+                     and key != "software.operating_system"
+                     and key
+                     not in [
+                         "min_gpu_memory",
+                         "max_gpu_memory",
+                         "min_cpu_memory",
+                         "max_cpu_memory",
+                         "min_accelerators",
+                         "max_accelerators",
+                     ]
+                     and key not in config
+                     and value != "Any"
+                     and value is not None
+                 ):
+                     config[key] = value
+
+             configs.append(config)
+
+         return configs[:count]
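The predictor samples categorical features (vendors, accelerator names, OS versions) from empirical frequency tables via `np.random.choice`. A stdlib-only sketch of the same idea, using `random.choices` instead of NumPy (the function name and example values here are illustrative, not part of the repository):

```python
import random


def sample_from_distribution(distribution: dict) -> object:
    """Draw one key from a {value: probability} mapping,
    mirroring the weighted sampling in _sample_from_distribution."""
    items = list(distribution.keys())
    weights = list(distribution.values())
    return random.choices(items, weights=weights, k=1)[0]


# A degenerate distribution always returns its only key.
print(sample_from_distribution({"NVIDIA H100": 1.0}))
```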
recommender.py ADDED
@@ -0,0 +1,97 @@
+ """Configuration recommendation module for MLPerf benchmarks."""
+
+ import logging
+
+ import pandas as pd
+ from utils import get_feature_type
+
+ logger = logging.getLogger(__name__)
+
+
+ class ConfigurationFinder:
+     """Finds optimal hardware configurations based on user requirements."""
+
+     def __init__(self, dataset: pd.DataFrame):
+         """Initialize with benchmark dataset."""
+         self.df = dataset
+         self.perf_metric = "metrics.result_per_accelerator"
+         self.cost_metric = "cost_per_million_tokens"
+         self.total_perf_metric = "metrics.result"
+
+     def is_within_tolerance(
+         self, value1: float, value2: float, tolerance: float = 0.1
+     ) -> bool:
+         """Check if two values are within a specified percentage tolerance."""
+         if value1 is None or value2 is None:
+             return False
+
+         try:
+             if value1 == 0 or value2 == 0:
+                 return value1 == value2
+             percentage_diff = abs(value1 - value2) / max(abs(value1), abs(value2))
+             return percentage_diff <= tolerance
+         except (TypeError, ValueError):
+             return False
+
+     def find_configurations(
+         self, constraints: dict, tolerance: float = 0.1
+     ) -> pd.DataFrame:
+         """Find configurations matching the given constraints."""
+         if self.df.empty:
+             return pd.DataFrame()
+
+         filtered_df = self.df.copy()
+
+         for feature, value in constraints.items():
+             if feature not in filtered_df.columns or value is None or value == "Any":
+                 continue
+
+             if get_feature_type(feature) == "continuous":
+                 try:
+                     target_value = float(value)
+                     lower_bound = target_value * (1 - tolerance)
+                     upper_bound = target_value * (1 + tolerance)
+                     filtered_df = filtered_df[
+                         (filtered_df[feature] >= lower_bound)
+                         & (filtered_df[feature] <= upper_bound)
+                     ]
+                 except (TypeError, ValueError):
+                     filtered_df = filtered_df[filtered_df[feature] == value]
+             else:
+                 filtered_df = filtered_df[filtered_df[feature] == value]
+
+         if "min_accelerators" in constraints and constraints["min_accelerators"]:
+             min_acc = constraints["min_accelerators"]
+             filtered_df = filtered_df[
+                 filtered_df["system.accelerator.total_count"] >= min_acc
+             ]
+
+         if "max_accelerators" in constraints and constraints["max_accelerators"]:
+             max_acc = constraints["max_accelerators"]
+             filtered_df = filtered_df[
+                 filtered_df["system.accelerator.total_count"] <= max_acc
+             ]
+
+         return filtered_df
+
+     def rank_configurations(
+         self,
+         df: pd.DataFrame,
+         metric: str = "metrics.result_per_accelerator",
+         ascending: bool = False,
+     ) -> pd.DataFrame:
+         """Rank configurations by the specified metric."""
+         if df.empty or metric not in df.columns:
+             return df
+         return df.sort_values(by=metric, ascending=ascending)
+
+     def recommend(self, constraints: dict, top_n: int = 10) -> pd.DataFrame:
+         """Find and rank configurations based on constraints."""
+         filtered_configs = self.find_configurations(constraints)
+         ranked_configs = self.rank_configurations(
+             filtered_configs, metric=self.perf_metric, ascending=False
+         )
+
+         if len(ranked_configs) > top_n:
+             return ranked_configs.head(top_n)
+         return ranked_configs
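`ConfigurationFinder.is_within_tolerance` treats two values as matching when their relative difference, measured against the larger magnitude, stays within the tolerance. A standalone sketch of that check (this re-states the logic above outside the class, purely for illustration):

```python
def is_within_tolerance(value1, value2, tolerance=0.1):
    """Relative-difference check mirroring ConfigurationFinder.is_within_tolerance."""
    if value1 is None or value2 is None:
        return False
    if value1 == 0 or value2 == 0:
        # With a zero on either side, only an exact match counts.
        return value1 == value2
    return abs(value1 - value2) / max(abs(value1), abs(value2)) <= tolerance


print(is_within_tolerance(100, 105))  # ~4.8% apart -> True
print(is_within_tolerance(100, 150))  # ~33% apart -> False
```

Dividing by the larger of the two magnitudes keeps the check symmetric: `is_within_tolerance(a, b)` and `is_within_tolerance(b, a)` always agree.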
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ datasets
+ gradio
+ nbformat
+ numpy
+ pandas
+ plotly
+ polars
+ pyarrow
+ scikit-learn
+ xgboost
+ cmind
utils.py ADDED
@@ -0,0 +1,115 @@
+ import json
+ import logging
+
+ import polars as pl
+
+ logger = logging.getLogger(__name__)
+
+
+ FEATURES = {
+     "Performance": {
+         "metrics.result": "continuous",
+         "metrics.result_per_accelerator": "continuous",
+         "metrics.accuracy": "continuous",
+     },
+     "Model": {
+         "model.name": "categorical",
+         "model.mlperf_name": "categorical",
+         "model.architecture": "categorical",
+         "model.number_of_parameters": "continuous",
+         "model.weight_data_types": "categorical",
+     },
+     "Accelerator": {
+         "system.accelerator.vendor": "categorical",
+         "system.accelerator.name": "categorical",
+         "system.accelerator.count_per_node": "continuous",
+         "system.accelerator.total_count": "continuous",
+         "system.accelerator.memory_capacity": "continuous",
+         "system.accelerator.memory_config": "text",
+         "system.interconnect.accelerator": "categorical",
+     },
+     "CPU": {
+         "system.cpu.vendor": "categorical",
+         "system.cpu.model": "categorical",
+         "system.cpu.core_count": "continuous",
+         "system.cpu.count_per_node": "continuous",
+         "system.cpu.frequency": "continuous",
+         "system.cpu.caches": "text",
+         "system.cpu.vcpu_count": "continuous",
+     },
+     "System": {
+         "system.name": "text",
+         "system.type": "categorical",
+         "system.cooling": "categorical",
+         "system.number_of_nodes": "continuous",
+         "system.memory.capacity": "continuous",
+         "system.memory.configuration": "text",
+         "system.interconnect.accelerator_host": "categorical",
+     },
+     "Software": {
+         "software.framework": "categorical",
+         "software.version": "categorical",
+         "software.operating_system": "categorical",
+     },
+     "Submission": {
+         "submission.organization": "categorical",
+         "submission.division": "categorical",
+         "submission.scenario": "categorical",
+         "submission.availability": "boolean",
+     },
+ }
+
+
+ def get_features_by_type(feature_type: str) -> list[str]:
+     """Get all features of a specific type."""
+     result = []
+     for group in FEATURES.values():
+         for feature, typ in group.items():
+             if typ == feature_type:
+                 result.append(feature)
+     return result
+
+
+ FEATURE_TYPES = {
+     "continuous": get_features_by_type("continuous"),
+     "categorical": get_features_by_type("categorical"),
+     "boolean": get_features_by_type("boolean"),
+     "text": get_features_by_type("text"),
+ }
+
+ UI_FEATURE_GROUPS = {
+     group: list(features.keys()) for group, features in FEATURES.items()
+ }
+
+
+ def get_feature_type(feature_name: str) -> str:
+     """Get the type of a feature from the FEATURES dictionary."""
+     for group in FEATURES.values():
+         if feature_name in group:
+             return group[feature_name]
+     return "categorical"
+
+
+ def load_data(file_path: str = "data.json") -> pl.DataFrame:
+     """Load processed benchmark data."""
+     logger.info(f"Loading processed data from {file_path}")
+
+     try:
+         with open(file_path, "r") as f:
+             data = json.load(f)
+
+         for item in data:
+             for key, value in item.items():
+                 if isinstance(value, str):
+                     if value.isdigit():
+                         item[key] = int(value)
+                     elif value.replace(".", "", 1).isdigit():
+                         item[key] = float(value)
+
+         df = pl.DataFrame(data, infer_schema_length=None)
+         logger.info(f"Loaded {len(df)} benchmark results")
+         return df
+
+     except Exception as e:
+         logger.error(f"Error loading data: {e}")
+         return pl.DataFrame()
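`load_data` coerces digit-like strings into numbers before building the Polars frame: pure digits become `int`, and strings with at most one decimal point become `float`; everything else is left untouched. The same rule as a standalone sketch (the `coerce_numeric` name is illustrative, not from the repository):

```python
def coerce_numeric(value):
    """Convert digit-like strings to int/float, as load_data does per JSON field."""
    if isinstance(value, str):
        if value.isdigit():
            return int(value)
        # Removing one "." and re-testing accepts "3.5" but rejects "1.2.3".
        if value.replace(".", "", 1).isdigit():
            return float(value)
    return value


print(coerce_numeric("42"))     # 42
print(coerce_numeric("3.5"))    # 3.5
print(coerce_numeric("1.2.3"))  # stays the string "1.2.3"
```

Note that `str.isdigit()` is False for a leading minus sign, so negative numbers encoded as strings (e.g. `"-7"`) are left as strings by this rule.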