Spaces:
Sleeping
Sleeping
File size: 2,639 Bytes
from estimator import GPU_SPECS, estimate_tps_with_calibration, total_tokens
from scaling import scale_tps
from cost import estimate_cost
from memory_estimator import estimate_memory, suggest_batch
# Cloud providers searched when the caller does not restrict the set.
CLOUDS = ["AWS", "Azure", "GCP"]

# GPU model names searched by default: every key of GPU_SPECS, in its
# iteration order.
GPUS = list(GPU_SPECS)
def find_best_config(
    params,
    dataset,
    epochs,
    seq_len,
    precision,
    efficiency,
    correction,
    budget,
    pricing_mode,
    training_mode="QLoRA",
    gradient_checkpointing=True,
    selected_gpus=None,
    selected_clouds=None,
    max_gpus=8,
    calibration=None,
    dataset_mode="Examples / sequences",
):
    """Return the fastest GPU/cloud/GPU-count configuration within budget.

    Every combination of candidate GPU, cloud and GPU count (1, 2, 4, 8,
    capped by ``max_gpus``) is evaluated. A combination is skipped when the
    suggested batch size is below 1, the scaled throughput is not positive,
    or the estimated cost is ``None`` or exceeds ``budget``. Among the
    remaining ones the lowest estimated time wins; equal times are broken by
    the lower cost.

    Args:
        params: Model size, forwarded unchanged to the estimator helpers.
        dataset: Token count when ``dataset_mode == "Total tokens"``;
            otherwise forwarded to ``total_tokens`` together with ``epochs``
            and ``seq_len``.
        epochs: Number of passes over the dataset.
        seq_len: Sequence length forwarded to the estimators.
        precision: Numeric precision forwarded to the estimators.
        efficiency: Forwarded to ``estimate_tps_with_calibration``.
        correction: Forwarded to ``estimate_tps_with_calibration``.
        budget: Maximum accepted cost, in whatever unit ``estimate_cost``
            returns.
        pricing_mode: Forwarded to ``estimate_cost``.
        training_mode: Forwarded to the memory and throughput estimators.
        gradient_checkpointing: Forwarded to the memory estimators.
        selected_gpus: GPU names to consider; a falsy value means all of
            ``GPUS``.
        selected_clouds: Cloud names to consider; a falsy value means all of
            ``CLOUDS``.
        max_gpus: Largest GPU count to consider.
        calibration: Forwarded to ``estimate_tps_with_calibration``.
        dataset_mode: ``"Total tokens"`` treats ``dataset`` as a token count;
            any other value treats it as a number of examples/sequences.

    Returns:
        A dict with keys ``gpu``, ``cloud``, ``gpus``, ``batch``,
        ``memory_gb``, ``time`` and ``cost`` (the last three rounded to two
        decimals), or ``None`` when no combination qualifies. ``time`` is
        ``tokens / tps / 3600``, i.e. hours assuming throughput is in tokens
        per second (TODO confirm against ``scale_tps``).
    """
    if dataset_mode == "Total tokens":
        tokens = dataset * epochs
    else:
        tokens = total_tokens(dataset, epochs, seq_len)

    candidate_gpus = selected_gpus or GPUS
    candidate_clouds = selected_clouds or CLOUDS
    gpu_count_options = [g for g in (1, 2, 4, 8) if g <= max_gpus]

    best = None
    # Unrounded (time_h, cost) of the current best. Comparing against the
    # rounded values stored in ``best`` would reject candidates that are
    # strictly better by less than the rounding step.
    best_key = None

    for gpu in candidate_gpus:
        gpu_mem = GPU_SPECS[gpu]["memory"]
        batch = suggest_batch(
            params,
            seq_len,
            gpu_mem,
            precision,
            training_mode,
            gradient_checkpointing,
        )
        if batch < 1:
            # No usable batch size was suggested for this GPU.
            continue

        mem_gb = estimate_memory(
            params,
            seq_len,
            batch,
            precision,
            training_mode,
            gradient_checkpointing,
        )

        # Single-GPU throughput does not depend on cloud or GPU count, so it
        # is computed once per GPU instead of once per combination.
        base_tps = estimate_tps_with_calibration(
            params,
            gpu,
            efficiency,
            correction,
            precision,
            seq_len,
            training_mode,
            calibration,
        )

        for cloud in candidate_clouds:
            for g in gpu_count_options:
                tps = scale_tps(base_tps, g)
                if tps <= 0:
                    continue

                time_h = tokens / tps / 3600
                cost = estimate_cost(gpu, time_h, g, cloud, pricing_mode)
                if cost is None or cost > budget:
                    continue

                # Lexicographic order: faster first, then cheaper on a tie.
                key = (time_h, cost)
                if best_key is not None and not key < best_key:
                    continue

                best_key = key
                best = {
                    "gpu": gpu,
                    "cloud": cloud,
                    "gpus": g,
                    "batch": batch,
                    "memory_gb": round(mem_gb, 2),
                    "time": round(time_h, 2),
                    "cost": round(cost, 2),
                }

    return best