File size: 2,639 Bytes
e268c11
af7d5c7
 
e268c11
af7d5c7
 
e268c11
af7d5c7
 
 
 
 
 
 
 
 
 
 
e268c11
 
 
 
 
 
 
 
af7d5c7
e268c11
 
 
 
af7d5c7
e268c11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
af7d5c7
e268c11
 
 
 
 
 
 
 
af7d5c7
e268c11
 
 
 
 
 
 
 
 
 
 
 
af7d5c7
 
e268c11
 
 
af7d5c7
 
 
e268c11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
af7d5c7
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from estimator import GPU_SPECS, estimate_tps_with_calibration, total_tokens
from scaling import scale_tps
from cost import estimate_cost
from memory_estimator import estimate_memory, suggest_batch

# Cloud providers searched when the caller does not restrict the choice.
CLOUDS = [
    "AWS",
    "Azure",
    "GCP",
]
# Every GPU model known to the estimator (the keys of GPU_SPECS, in order).
GPUS = list(GPU_SPECS)


def find_best_config(
    params,
    dataset,
    epochs,
    seq_len,
    precision,
    efficiency,
    correction,
    budget,
    pricing_mode,
    training_mode="QLoRA",
    gradient_checkpointing=True,
    selected_gpus=None,
    selected_clouds=None,
    max_gpus=8,
    calibration=None,
    dataset_mode="Examples / sequences",
):
    """Return the fastest GPU/cloud/GPU-count combination within ``budget``.

    Every combination of candidate GPU, cloud and GPU count (1, 2, 4 or 8,
    capped by ``max_gpus``) is evaluated. The winner is the one with the
    shortest estimated training time; exact time ties are broken by the
    lower cost, and exact (time, cost) ties keep the first one encountered.

    Args:
        params: Model size, forwarded to the memory and throughput estimators.
        dataset: Dataset size. Interpreted as a token count when
            ``dataset_mode == "Total tokens"``; otherwise forwarded to
            ``total_tokens`` together with ``epochs`` and ``seq_len``.
        epochs: Number of passes over the dataset.
        seq_len: Sequence length, forwarded to the estimators.
        precision: Numeric precision, forwarded to the estimators.
        efficiency: Forwarded to ``estimate_tps_with_calibration``.
        correction: Forwarded to ``estimate_tps_with_calibration``.
        budget: Upper bound on cost; configurations whose estimated cost
            exceeds it are discarded.
        pricing_mode: Forwarded to ``estimate_cost``.
        training_mode: Forwarded to the memory and throughput estimators.
        gradient_checkpointing: Forwarded to the memory estimators.
        selected_gpus: GPU names to consider; falls back to ``GPUS`` when
            ``None`` or empty.
        selected_clouds: Cloud names to consider; falls back to ``CLOUDS``
            when ``None`` or empty.
        max_gpus: Largest GPU count that may be proposed.
        calibration: Forwarded to ``estimate_tps_with_calibration``.
        dataset_mode: ``"Total tokens"`` or any other value (treated as
            examples/sequences).

    Returns:
        A dict with keys ``gpu``, ``cloud``, ``gpus``, ``batch``,
        ``memory_gb``, ``time`` and ``cost`` (the last three rounded to two
        decimals), or ``None`` when no configuration fits the budget.
    """
    if dataset_mode == "Total tokens":
        tokens = dataset * epochs
    else:
        tokens = total_tokens(dataset, epochs, seq_len)

    candidate_gpus = selected_gpus or GPUS
    candidate_clouds = selected_clouds or CLOUDS
    gpu_count_options = [g for g in (1, 2, 4, 8) if g <= max_gpus]
    if not gpu_count_options:
        # No permitted GPU count means there is nothing to evaluate.
        return None

    best = None
    # Unrounded (time, cost) of the current winner. Comparing against the
    # rounded values stored in ``best`` would make the cost tie-break
    # unreachable and could reject marginally faster candidates.
    best_key = None

    for gpu in candidate_gpus:
        gpu_mem = GPU_SPECS[gpu]["memory"]
        batch = suggest_batch(
            params,
            seq_len,
            gpu_mem,
            precision,
            training_mode,
            gradient_checkpointing,
        )

        if batch < 1:
            continue

        mem_gb = estimate_memory(
            params,
            seq_len,
            batch,
            precision,
            training_mode,
            gradient_checkpointing,
        )

        # Throughput depends only on the GPU and the GPU count, not on the
        # cloud, so compute it once per GPU instead of once per cloud.
        base_tps = estimate_tps_with_calibration(
            params,
            gpu,
            efficiency,
            correction,
            precision,
            seq_len,
            training_mode,
            calibration,
        )
        timings = []
        for g in gpu_count_options:
            tps = scale_tps(base_tps, g)
            if tps > 0:
                timings.append((g, tokens / tps / 3600))

        # Keep the cloud -> GPU-count iteration order so that exact ties
        # still resolve to the first configuration encountered.
        for cloud in candidate_clouds:
            for g, time_h in timings:
                cost = estimate_cost(gpu, time_h, g, cloud, pricing_mode)

                if cost is None or cost > budget:
                    continue

                # Lexicographic order: shorter time first, then lower cost.
                key = (time_h, cost)
                if best_key is None or key < best_key:
                    best_key = key
                    best = {
                        "gpu": gpu,
                        "cloud": cloud,
                        "gpus": g,
                        "batch": batch,
                        "memory_gb": round(mem_gb, 2),
                        "time": round(time_h, 2),
                        "cost": round(cost, 2),
                    }

    return best