Spaces:
Sleeping
Sleeping
add adamw-8bit
Browse files- app.py +1 -4
- vram_helpers.py +1 -1
app.py
CHANGED
|
@@ -10,15 +10,12 @@ from vram_helpers import ModelConfig, TrainingConfig, filter_params_for_dataclas
|
|
| 10 |
|
| 11 |
ZERO_STAGES = [0, 1, 2, 3]
|
| 12 |
BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64]
|
| 13 |
-
OPTIMIZERS = ["adam", "adamw", "sgd"]
|
| 14 |
HUGGINGFACE_URL_CONFIG = "https://huggingface.co/{}/resolve/main/config.json"
|
| 15 |
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
def parse_args():
|
| 20 |
parser = argparse.ArgumentParser(description="Parser for VRAM estimator")
|
| 21 |
-
|
| 22 |
parser.add_argument("--repo_id", type=str, default=None, help="HuggingFace repo id to automatically determine model settings")
|
| 23 |
parser.add_argument("--model_size", type=float, default=7, help="Model size (in billion parameters)")
|
| 24 |
parser.add_argument("--hidden_size", type=int, default=4096, help="Hidden size")
|
|
|
|
| 10 |
|
| 11 |
ZERO_STAGES = [0, 1, 2, 3]
|
| 12 |
BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64]
|
| 13 |
+
OPTIMIZERS = ["adam", "adamw", "adamw_8bit", "sgd"]
|
| 14 |
HUGGINGFACE_URL_CONFIG = "https://huggingface.co/{}/resolve/main/config.json"
|
| 15 |
|
| 16 |
|
|
|
|
|
|
|
| 17 |
def parse_args():
|
| 18 |
parser = argparse.ArgumentParser(description="Parser for VRAM estimator")
|
|
|
|
| 19 |
parser.add_argument("--repo_id", type=str, default=None, help="HuggingFace repo id to automatically determine model settings")
|
| 20 |
parser.add_argument("--model_size", type=float, default=7, help="Model size (in billion parameters)")
|
| 21 |
parser.add_argument("--hidden_size", type=int, default=4096, help="Hidden size")
|
vram_helpers.py
CHANGED
|
@@ -91,7 +91,7 @@ def optimizer_memory(parameters, optimizer= "adamw", precision = "fp32"):
|
|
| 91 |
optimizer_choices = {"adam": 3, # Adam: stores precision copies of the optimizer parameters, momentum, and variance -> 4 + 4 + 4 = 12 bytes per model parameter
|
| 92 |
"adamw": 3, # AdamW: same as Adam
|
| 93 |
"sgd": 2, # For SGD: optimizer parameters and gradients -> 4 + 4 = 8 bytes per model parameter
|
| 94 |
-
"
|
| 95 |
}
|
| 96 |
return optimizer_choices[optimizer] * parameters * PRECISION_TO_BYTES[precision]
|
| 97 |
|
|
|
|
| 91 |
optimizer_choices = {"adam": 3, # Adam: stores precision copies of the optimizer parameters, momentum, and variance -> 4 + 4 + 4 = 12 bytes per model parameter
|
| 92 |
"adamw": 3, # AdamW: same as Adam
|
| 93 |
"sgd": 2, # For SGD: optimizer parameters and gradients -> 4 + 4 = 8 bytes per model parameter
|
| 94 |
+
"adamw_8bit": 1.5, # AdamW 8-bit: same three terms as Adam, each at half the size -> 2 + 2 + 2 = 6 bytes per model parameter
|
| 95 |
}
|
| 96 |
return optimizer_choices[optimizer] * parameters * PRECISION_TO_BYTES[precision]
|
| 97 |
|