Spaces:
Running
Running
| from estimator import GPU_SPECS, estimate_tps_with_calibration, total_tokens | |
| from scaling import scale_tps | |
| from cost import estimate_cost | |
| from memory_estimator import estimate_memory, suggest_batch | |
# Cloud names searched by default when the caller passes no explicit selection.
CLOUDS = ["AWS", "Azure", "GCP"]

# GPU names searched by default: every key of GPU_SPECS, in its iteration order.
GPUS = list(GPU_SPECS)
def find_best_config(
    params,
    dataset,
    epochs,
    seq_len,
    precision,
    efficiency,
    correction,
    budget,
    pricing_mode,
    training_mode="QLoRA",
    gradient_checkpointing=True,
    selected_gpus=None,
    selected_clouds=None,
    max_gpus=8,
    calibration=None,
    dataset_mode="Examples / sequences",
):
    """Return the fastest GPU/cloud/GPU-count combination within budget.

    Every combination of candidate GPU, candidate cloud and GPU count
    (1, 2, 4 or 8, capped by ``max_gpus``) is evaluated. The winner is the
    one with the lowest estimated training time; an exact tie on time is
    broken by the lower cost, and a tie on both keeps the combination that
    was encountered first.

    Args:
        params: Forwarded unchanged to ``suggest_batch``, ``estimate_memory``
            and ``estimate_tps_with_calibration`` (presumably the model's
            parameter count -- confirm against those helpers).
        dataset: Total token count per epoch when ``dataset_mode`` is
            ``"Total tokens"``; otherwise passed to ``total_tokens``
            together with ``epochs`` and ``seq_len``.
        epochs: Number of passes over the dataset.
        seq_len: Sequence length forwarded to the estimators.
        precision: Forwarded to the memory and throughput estimators.
        efficiency: Forwarded to ``estimate_tps_with_calibration``.
        correction: Forwarded to ``estimate_tps_with_calibration``.
        budget: Upper bound on cost; combinations whose estimated cost
            exceeds it are discarded.
        pricing_mode: Forwarded to ``estimate_cost``.
        training_mode: Forwarded to the memory and throughput estimators.
        gradient_checkpointing: Forwarded to the memory estimators.
        selected_gpus: GPU names to consider; falsy means all of ``GPUS``.
        selected_clouds: Cloud names to consider; falsy means all of
            ``CLOUDS``.
        max_gpus: Largest GPU count to consider.
        calibration: Forwarded to ``estimate_tps_with_calibration``.
        dataset_mode: ``"Total tokens"`` selects the direct token count;
            any other value uses ``total_tokens``.

    Returns:
        A dict with keys ``gpu``, ``cloud``, ``gpus``, ``batch``,
        ``memory_gb``, ``time`` and ``cost`` (the last three rounded to two
        decimals), or ``None`` when no combination is feasible. ``time`` is
        in hours assuming the throughput estimate is tokens per second.
    """
    if dataset_mode == "Total tokens":
        tokens = dataset * epochs
    else:
        tokens = total_tokens(dataset, epochs, seq_len)

    candidate_gpus = selected_gpus or GPUS
    candidate_clouds = selected_clouds or CLOUDS
    gpu_count_options = [g for g in (1, 2, 4, 8) if g <= max_gpus]

    best = None
    # Unrounded metrics of the current best. Comparing candidates against the
    # rounded values stored in ``best`` would wrongly reject a candidate that
    # is faster by less than the rounding step.
    best_time = None
    best_cost = None

    for gpu in candidate_gpus:
        gpu_mem = GPU_SPECS[gpu]["memory"]
        batch = suggest_batch(
            params,
            seq_len,
            gpu_mem,
            precision,
            training_mode,
            gradient_checkpointing,
        )
        if batch < 1:
            # No batch size was suggested for this GPU; skip it entirely.
            continue

        mem_gb = estimate_memory(
            params,
            seq_len,
            batch,
            precision,
            training_mode,
            gradient_checkpointing,
        )

        # None of these arguments depend on the cloud or the GPU count, so
        # the single-GPU throughput is computed once per GPU model.
        # NOTE(review): assumes the estimator is a pure function -- confirm.
        base_tps = estimate_tps_with_calibration(
            params,
            gpu,
            efficiency,
            correction,
            precision,
            seq_len,
            training_mode,
            calibration,
        )

        for cloud in candidate_clouds:
            for g in gpu_count_options:
                tps = scale_tps(base_tps, g)
                if tps <= 0:
                    continue

                time_h = tokens / tps / 3600
                cost = estimate_cost(gpu, time_h, g, cloud, pricing_mode)
                # A None cost means estimate_cost produced no price for
                # this combination.
                if cost is None or cost > budget:
                    continue

                is_better = (
                    best is None
                    or time_h < best_time
                    or (time_h == best_time and cost < best_cost)
                )
                if not is_better:
                    continue

                best_time = time_h
                best_cost = cost
                best = {
                    "gpu": gpu,
                    "cloud": cloud,
                    "gpus": g,
                    "batch": batch,
                    "memory_gb": round(mem_gb, 2),
                    "time": round(time_h, 2),
                    "cost": round(cost, 2),
                }

    return best