| import gradio as gr |
| import csv |
| import os |
| import numpy as np |
|
|
def load_gpu_data():
    """Load the GPU throughput table from gpus.csv (expected next to this file).

    The CSV is expected to have columns ``gpu_model`` (underscore-separated
    name) and ``sparce_tflops`` — the misspelled header appears to match the
    CSV on disk; confirm against gpus.csv before renaming.

    Returns:
        dict[str, float]: Display name (underscores replaced by spaces) mapped
        to TFLOPs. Returns an empty dict on load failure — the original
        fallback ``{"Custom": 0}`` caused a duplicate "Custom" entry in the
        dropdown, because the caller always prepends "Custom" to the choices.
    """
    gpu_data = {}
    csv_path = os.path.join(os.path.dirname(__file__), 'gpus.csv')

    try:
        # newline='' is the documented way to open files for the csv module.
        with open(csv_path, 'r', newline='') as file:
            for row in csv.DictReader(file):
                try:
                    gpu_name = row['gpu_model'].replace('_', ' ')
                    gpu_data[gpu_name] = float(row['sparce_tflops'])
                except (KeyError, TypeError, ValueError):
                    # Skip malformed rows instead of discarding the whole
                    # table (the original aborted on the first bad value).
                    continue
    except (OSError, csv.Error) as e:
        print(f"Error loading GPU data: {e}")
        gpu_data = {}

    return gpu_data
|
|
def calculate_training_time(model_size_billions, tflops_per_gpu, num_gpus, tokens_millions, mfu_percentage):
    """Estimate wall-clock training time for a dense transformer.

    Uses the standard compute estimate:
        total FLOPs        = 6 * params * tokens
        delivered FLOPs/s  = tflops_per_gpu * num_gpus * 1e12 * (MFU / 100)
        time (s)           = total FLOPs / delivered FLOPs/s

    Args:
        model_size_billions: Parameter count, in billions.
        tflops_per_gpu: Effective (non-sparsity) BF16 TFLOPs per GPU.
        num_gpus: Number of GPUs in the training job.
        tokens_millions: Token count, in millions.
        mfu_percentage: Model FLOPs Utilization, as a percentage.

    Returns:
        Estimated training time in hours.
    """
    parameter_count = model_size_billions * 1e9
    token_count = tokens_millions * 1e6

    # Forward + backward pass costs roughly 6 FLOPs per parameter per token.
    required_flops = 6 * parameter_count * token_count

    # Aggregate sustained throughput across the whole cluster.
    delivered_flops_per_second = tflops_per_gpu * num_gpus * 1e12 * (mfu_percentage / 100)

    seconds = required_flops / delivered_flops_per_second
    return seconds / 3600
|
|
def format_output(hours):
    """Render a duration in hours as hours, days, or months (30-day months)."""
    if hours >= 24:
        days = hours / 24
        if days >= 30:
            months = days / 30
            return f"{months:.2f} months ({days:.1f} days, {hours:.0f} hours)"
        return f"{days:.2f} days ({hours:.1f} hours)"
    return f"{hours:.2f} hours"
|
|
def slider_to_model_size(value):
    """Map a linear 0-100 slider position to a model size in billions.

    The mapping is logarithmic over [0.1B, 1000B] so small and huge models
    share one slider comfortably.
    """
    low_exp = np.log10(0.1)
    high_exp = np.log10(1000)
    exponent = low_exp + (high_exp - low_exp) * value / 100
    return 10 ** exponent
|
|
def model_size_to_slider(size_billions):
    """Inverse of the log-scale slider mapping: billions -> 0-100 position."""
    low_exp = np.log10(0.1)
    high_exp = np.log10(1000)
    # Linear position of log10(size) within the [log10(0.1), log10(1000)] span.
    return 100 * (np.log10(size_billions) - low_exp) / (high_exp - low_exp)
|
|
def format_model_size(size_billions):
    """Human-readable parameter count: M below 1B, T at or above 1000B."""
    if size_billions >= 1000:
        return f"{size_billions / 1000:.1f}T"
    if size_billions >= 1:
        return f"{size_billions:.1f}B"
    return f"{size_billions * 1000:.0f}M"
|
|
def update_calculation(model_size_value, model_size_unit, use_gpu_model, gpu_model, custom_tflops, num_gpus, tokens_value, tokens_unit, mfu_percentage):
    """Build the markdown results panel from the current UI state.

    Args:
        model_size_value: Model size number from the UI.
        model_size_unit: "B" (billions) or "T" (trillions).
        use_gpu_model: True when a GPU is picked from the dropdown list.
        gpu_model: Selected dropdown entry (may be "Custom").
        custom_tflops: Per-GPU TFLOPs used when no listed GPU is selected,
            or as a fallback when the selection is missing from the table.
        num_gpus: Number of GPUs.
        tokens_value: Token count number from the UI.
        tokens_unit: "M", "B", or "T".
        mfu_percentage: Model FLOPs Utilization percentage.

    Returns:
        Markdown string with the calculation breakdown and training time.
    """
    # Normalize model size to billions of parameters.
    if model_size_unit == "B":
        model_size_billions = model_size_value
    else:  # "T"
        model_size_billions = model_size_value * 1000

    # Normalize token count to millions.
    if tokens_unit == "M":
        tokens_millions = tokens_value
    elif tokens_unit == "B":
        tokens_millions = tokens_value * 1000
    else:  # "T"
        tokens_millions = tokens_value * 1000000

    # Resolve per-GPU throughput. Use the module-level gpu_data table loaded
    # once at import instead of re-reading gpus.csv from disk on every UI
    # event (the original called load_gpu_data() here, on each slider tick).
    if use_gpu_model and gpu_model != "Custom":
        tflops_per_gpu = gpu_data.get(gpu_model, custom_tflops)
        gpu_info = f"{gpu_model} ({tflops_per_gpu} TFLOPs)"
    else:
        tflops_per_gpu = custom_tflops
        gpu_info = f"Custom ({tflops_per_gpu} TFLOPs)"

    hours = calculate_training_time(model_size_billions, tflops_per_gpu, num_gpus, tokens_millions, mfu_percentage)

    # Recompute intermediates for display only (same formulas as the
    # calculation itself).
    total_flops = 6 * (model_size_billions * 1e9) * (tokens_millions * 1e6)
    effective_tflops = tflops_per_gpu * num_gpus * (mfu_percentage / 100)

    breakdown = f"""
### Calculation Breakdown:
- **GPU Selection**: {gpu_info}
- **Model Size**: {format_model_size(model_size_billions)} parameters ({model_size_billions:.2f}B)
- **Training Tokens**: {tokens_value}{tokens_unit} tokens ({tokens_millions:.0f}M)
- **Total FLOPs**: {total_flops:.2e} FLOPs
- **Formula**: 6 × {model_size_billions:.2f}B params × {tokens_millions:.0f}M tokens
- **Effective TFLOPs**: {effective_tflops:.2f} TFLOPs/s
- **Formula**: {tflops_per_gpu} TFLOPs/GPU × {num_gpus} GPUs × {mfu_percentage}% MFU

### Training Time:
**{format_output(hours)}**
"""

    return breakdown
|
|
| |
# Load the GPU throughput table once at import time; "Custom" is always the
# first dropdown choice so manual TFLOPs entry stays available.
gpu_data = load_gpu_data()
gpu_choices = ["Custom", *gpu_data]
|
|
| |
# Gradio UI: inputs on the left column, the live markdown breakdown on the
# right. Every input change re-runs update_calculation.
with gr.Blocks(title="Model Training Time Calculator") as demo:
    gr.Markdown("# Model Training Time Calculator")
    gr.Markdown("Calculate the time required to train a model based on model size, hardware specs, and token count.")

    with gr.Row():
        with gr.Column():
            with gr.Row():
                model_size_value = gr.Number(
                    minimum=0.5,
                    maximum=1000,
                    value=7,
                    step=0.1,
                    label="Model Size",
                    info="Enter model size (0.5-1000)"
                )
                model_size_unit = gr.Radio(
                    choices=["B", "T"],
                    value="B",
                    label="Unit",
                    info="Model size unit"
                )

            use_gpu_model = gr.Checkbox(
                value=True,
                label="Use GPU Model from List",
                info="Check to select a GPU model, uncheck to input custom TFLOPs"
            )

            gpu_model = gr.Dropdown(
                choices=gpu_choices,
                value="H100" if "H100" in gpu_choices else gpu_choices[0],
                label="GPU Model",
                info="Select a GPU model from the list",
                visible=True
            )

            custom_tflops = gr.Slider(
                minimum=10,
                maximum=2000,
                value=300,
                step=10,
                label="Custom BF16 TFLOPs per GPU",
                info="Effective (non-sparsity) TFLOPs per GPU",
                visible=False
            )

            num_gpus = gr.Slider(
                minimum=1,
                maximum=1024,
                value=8,
                step=1,
                label="Number of GPUs",
                info="Total number of GPUs for training"
            )

            with gr.Row():
                tokens_value = gr.Slider(
                    minimum=1,
                    maximum=1000,
                    value=100,
                    step=1,
                    label="Training Tokens",
                    info="Number of training tokens"
                )
                tokens_unit = gr.Radio(
                    choices=["M", "B", "T"],
                    value="B",
                    label="Unit",
                    info="Token count unit"
                )

            mfu = gr.Slider(
                minimum=10,
                maximum=100,
                value=50,
                step=5,
                label="Model FLOPs Utilization (MFU) %",
                info="Efficiency of hardware utilization (50% is typical for low-end estimate)"
            )

        with gr.Column():
            output = gr.Markdown(label="Results")

    # Show the dropdown when using the GPU list; show the custom slider when
    # not using the list OR when "Custom" is selected in the dropdown.
    #
    # BUG FIX: the original read `gpu_model.value` inside the handler, which
    # is the component's *initial* value captured at build time — never the
    # user's live selection. The current dropdown value must be passed in as
    # an event input instead.
    def toggle_gpu_input(use_gpu, gpu_model_value):
        return (
            gr.update(visible=use_gpu),
            gr.update(visible=(not use_gpu) or gpu_model_value == "Custom"),
        )

    use_gpu_model.change(
        fn=toggle_gpu_input,
        inputs=[use_gpu_model, gpu_model],
        outputs=[gpu_model, custom_tflops]
    )

    # Reveal the custom-TFLOPs slider whenever "Custom" is picked.
    def check_custom_selected(gpu_model_value):
        return gr.update(visible=gpu_model_value == "Custom")

    gpu_model.change(
        fn=check_custom_selected,
        inputs=[gpu_model],
        outputs=[custom_tflops]
    )

    # Any input change recomputes the breakdown.
    all_inputs = [model_size_value, model_size_unit, use_gpu_model, gpu_model, custom_tflops, num_gpus, tokens_value, tokens_unit, mfu]

    for input_component in all_inputs:
        input_component.change(
            fn=update_calculation,
            inputs=all_inputs,
            outputs=output
        )

    # Populate the results panel immediately on page load.
    demo.load(
        fn=update_calculation,
        inputs=all_inputs,
        outputs=output
    )
|
|
# Launch the Gradio web server only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()