update about and params
Browse files
- src/display/about.py +4 -0
- src/submission/check_validity.py +8 -8
- src/submission/submit.py +3 -3
- tests/test_submit.py +3 -3
src/display/about.py
CHANGED
|
@@ -33,6 +33,8 @@ LLM_BENCHMARKS_TEXT = f"""
|
|
| 33 |
## ABOUT
|
| 34 |
Quantization is a key technique for making LLMs more accessible and practical for a wide range of applications, especially where computational resources are a limiting factor. However, there has been no dedicated tool to track and compare quantized LLMs across different quantization algorithms, making it difficult to filter out genuine progress by the open-source community and identify the current state of the art.
|
| 35 |
|
|
|
|
|
|
|
| 36 |
### Introducing V1.0
|
| 37 |
V1.0 marks the launch of our comprehensive evaluation system for quantized language models.
|
| 38 |
This version introduces a standardized platform for submitting and benchmarking models, providing clear insights into their performance across various tasks.
|
|
@@ -79,6 +81,8 @@ results = simple_evaluate(
|
|
| 79 |
To get more information about quantization, see:
|
| 80 |
- auto-round: [intel/auto-round](https://github.com/intel/auto-round)
|
| 81 |
- neural-compressor: [intel/neural-compressor](https://github.com/intel/neural-compressor/tree/master)
|
|
|
|
|
|
|
| 82 |
"""
|
| 83 |
|
| 84 |
FAQ_TEXT = """
|
|
|
|
| 33 |
## ABOUT
|
| 34 |
Quantization is a key technique for making LLMs more accessible and practical for a wide range of applications, especially where computational resources are a limiting factor. However, there has been no dedicated tool to track and compare quantized LLMs across different quantization algorithms, making it difficult to filter out genuine progress by the open-source community and identify the current state of the art.
|
| 35 |
|
| 36 |
+
> ℹ️ **Scope:** This leaderboard mainly supports **text-generation models**.
|
| 37 |
+
|
| 38 |
### Introducing V1.0
|
| 39 |
V1.0 marks the launch of our comprehensive evaluation system for quantized language models.
|
| 40 |
This version introduces a standardized platform for submitting and benchmarking models, providing clear insights into their performance across various tasks.
|
|
|
|
| 81 |
To get more information about quantization, see:
|
| 82 |
- auto-round: [intel/auto-round](https://github.com/intel/auto-round)
|
| 83 |
- neural-compressor: [intel/neural-compressor](https://github.com/intel/neural-compressor/tree/master)
|
| 84 |
+
|
| 85 |
+
> 💡 **Found this quantization useful?** If AutoRound helped you ship a smaller, faster model, please consider giving it a ⭐ on GitHub → [**Star intel/auto-round**](https://github.com/intel/auto-round). It helps the project grow and brings more low-bit techniques to the community.
|
| 86 |
"""
|
| 87 |
|
| 88 |
FAQ_TEXT = """
|
src/submission/check_validity.py
CHANGED
|
@@ -603,24 +603,24 @@ PRECISION_TO_BITS: dict[str, int] = {
|
|
| 603 |
def estimate_weight_memory_gb(
|
| 604 |
params_b: float,
|
| 605 |
bits: int,
|
| 606 |
-
overhead_factor: float =
|
| 607 |
) -> float:
|
| 608 |
"""Estimate GPU VRAM (GB) required to load a model with *bits*-bit weights.
|
| 609 |
|
| 610 |
The formula covers only weight storage; *overhead_factor* accounts for KV
|
| 611 |
-
cache, activations, and framework buffers (default
|
| 612 |
|
| 613 |
Examples
|
| 614 |
--------
|
| 615 |
-
- 7 B W4A16: 7 × 0.5 ×
|
| 616 |
-
- 13 B W4A16: 13 × 0.5 ×
|
| 617 |
-
- 70 B W4A16: 70 × 0.5 ×
|
| 618 |
|
| 619 |
Args:
|
| 620 |
params_b: Parameter count in billions.
|
| 621 |
bits: Weight quantization bits (e.g. 4 for W4A16, 8 for W8A16,
|
| 622 |
16 for FP16/BF16, 32 for FP32).
|
| 623 |
-
overhead_factor: Multiplier for non-weight memory. Default
|
| 624 |
|
| 625 |
Returns:
|
| 626 |
Estimated GPU memory in GB (rounded to 2 decimal places).
|
|
@@ -816,7 +816,7 @@ def get_num_layers(model_config) -> int | None:
|
|
| 816 |
def estimate_quantization_memory_gb(
|
| 817 |
model_weight_gb: float,
|
| 818 |
num_layers: int,
|
| 819 |
-
overhead_factor: float = 1.
|
| 820 |
) -> float:
|
| 821 |
"""Estimate GPU VRAM needed to **quantize** a model (layerwise / blockwise).
|
| 822 |
|
|
@@ -827,7 +827,7 @@ def estimate_quantization_memory_gb(
|
|
| 827 |
+ model_weight_gb / num_layers # activations / Hessian for that layer
|
| 828 |
) × overhead_factor
|
| 829 |
|
| 830 |
-
The ``overhead_factor`` (default 1.
|
| 831 |
data, and temporary tensors.
|
| 832 |
|
| 833 |
Args:
|
|
|
|
| 603 |
def estimate_weight_memory_gb(
|
| 604 |
params_b: float,
|
| 605 |
bits: int,
|
| 606 |
+
overhead_factor: float = 2.2,
|
| 607 |
) -> float:
|
| 608 |
"""Estimate GPU VRAM (GB) required to load a model with *bits*-bit weights.
|
| 609 |
|
| 610 |
The formula covers only weight storage; *overhead_factor* accounts for KV
|
| 611 |
+
cache, activations, and framework buffers (default 2.2 = 120 % overhead).
|
| 612 |
|
| 613 |
Examples
|
| 614 |
--------
|
| 615 |
+
- 7 B W4A16: 7 × 0.5 × 2.2 ≈ 7.7 GB
|
| 616 |
+
- 13 B W4A16: 13 × 0.5 × 2.2 ≈ 14.3 GB
|
| 617 |
+
- 70 B W4A16: 70 × 0.5 × 2.2 ≈ 77.0 GB
|
| 618 |
|
| 619 |
Args:
|
| 620 |
params_b: Parameter count in billions.
|
| 621 |
bits: Weight quantization bits (e.g. 4 for W4A16, 8 for W8A16,
|
| 622 |
16 for FP16/BF16, 32 for FP32).
|
| 623 |
+
overhead_factor: Multiplier for non-weight memory. Default 2.2.
|
| 624 |
|
| 625 |
Returns:
|
| 626 |
Estimated GPU memory in GB (rounded to 2 decimal places).
|
|
|
|
| 816 |
def estimate_quantization_memory_gb(
|
| 817 |
model_weight_gb: float,
|
| 818 |
num_layers: int,
|
| 819 |
+
overhead_factor: float = 1.5,
|
| 820 |
) -> float:
|
| 821 |
"""Estimate GPU VRAM needed to **quantize** a model (layerwise / blockwise).
|
| 822 |
|
|
|
|
| 827 |
+ model_weight_gb / num_layers # activations / Hessian for that layer
|
| 828 |
) × overhead_factor
|
| 829 |
|
| 830 |
+
The ``overhead_factor`` (default 1.5) covers framework buffers, calibration
|
| 831 |
data, and temporary tensors.
|
| 832 |
|
| 833 |
Args:
|
src/submission/submit.py
CHANGED
|
@@ -572,7 +572,7 @@ def add_new_eval(
|
|
| 572 |
# ── Step 3: Estimate VRAM ────────────────────────────────────────
|
| 573 |
bits_for_precision = PRECISION_TO_BITS.get(precision, 4)
|
| 574 |
estimated_memory_gb = estimate_weight_memory_gb(
|
| 575 |
-
params_b=model_params, bits=bits_for_precision, overhead_factor=
|
| 576 |
)
|
| 577 |
|
| 578 |
|
|
@@ -837,7 +837,7 @@ def add_new_quant(
|
|
| 837 |
quant_memory_gb = estimate_quantization_memory_gb(
|
| 838 |
model_weight_gb=model_weight_gb,
|
| 839 |
num_layers=num_layers,
|
| 840 |
-
overhead_factor=1.
|
| 841 |
)
|
| 842 |
|
| 843 |
quant_gpu_type, quant_gpu_nums = select_gpu_with_override(quant_memory_gb, hardware_override, gpu_count_override)
|
|
@@ -846,7 +846,7 @@ def add_new_quant(
|
|
| 846 |
# ── Step 4: Estimate VRAM for post-quantization evaluation ───────
|
| 847 |
output_bits = scheme.bits
|
| 848 |
eval_memory_gb = estimate_weight_memory_gb(
|
| 849 |
-
params_b=model_params, bits=output_bits, overhead_factor=
|
| 850 |
)
|
| 851 |
|
| 852 |
eval_gpu_type, eval_gpu_nums = select_gpu_with_override(eval_memory_gb, hardware_override, gpu_count_override)
|
|
|
|
| 572 |
# ── Step 3: Estimate VRAM ────────────────────────────────────────
|
| 573 |
bits_for_precision = PRECISION_TO_BITS.get(precision, 4)
|
| 574 |
estimated_memory_gb = estimate_weight_memory_gb(
|
| 575 |
+
params_b=model_params, bits=bits_for_precision, overhead_factor=2.2,
|
| 576 |
)
|
| 577 |
|
| 578 |
|
|
|
|
| 837 |
quant_memory_gb = estimate_quantization_memory_gb(
|
| 838 |
model_weight_gb=model_weight_gb,
|
| 839 |
num_layers=num_layers,
|
| 840 |
+
overhead_factor=1.5,
|
| 841 |
)
|
| 842 |
|
| 843 |
quant_gpu_type, quant_gpu_nums = select_gpu_with_override(quant_memory_gb, hardware_override, gpu_count_override)
|
|
|
|
| 846 |
# ── Step 4: Estimate VRAM for post-quantization evaluation ───────
|
| 847 |
output_bits = scheme.bits
|
| 848 |
eval_memory_gb = estimate_weight_memory_gb(
|
| 849 |
+
params_b=model_params, bits=output_bits, overhead_factor=2.2,
|
| 850 |
)
|
| 851 |
|
| 852 |
eval_gpu_type, eval_gpu_nums = select_gpu_with_override(eval_memory_gb, hardware_override, gpu_count_override)
|
tests/test_submit.py
CHANGED
|
@@ -440,7 +440,7 @@ def test_auto_eval():
|
|
| 440 |
print(f"\n--- Step 3: VRAM estimation ---")
|
| 441 |
if params_b:
|
| 442 |
bits = PRECISION_TO_BITS.get(precision, 4)
|
| 443 |
-
est_mem = estimate_weight_memory_gb(params_b, bits=bits, overhead_factor=
|
| 444 |
print(f" bits: {bits}")
|
| 445 |
print(f" estimated_vram: {est_mem} GB")
|
| 446 |
else:
|
|
@@ -537,7 +537,7 @@ def test_auto_quant():
|
|
| 537 |
print(f"\n--- Step 4: Quantization VRAM ---")
|
| 538 |
quant_mem = None
|
| 539 |
if size_gb and num_layers:
|
| 540 |
-
quant_mem = estimate_quantization_memory_gb(size_gb, num_layers, overhead_factor=1.
|
| 541 |
print(f" model_weight_gb: {size_gb}")
|
| 542 |
print(f" num_layers: {num_layers}")
|
| 543 |
print(f" quant_vram: {quant_mem} GB")
|
|
@@ -549,7 +549,7 @@ def test_auto_quant():
|
|
| 549 |
scheme = SUPPORTED_QUANT_SCHEMES.get(quant_scheme)
|
| 550 |
eval_mem = None
|
| 551 |
if params_b and scheme:
|
| 552 |
-
eval_mem = estimate_weight_memory_gb(params_b, bits=scheme.bits, overhead_factor=
|
| 553 |
print(f" params_b: {params_b}")
|
| 554 |
print(f" output_bits: {scheme.bits}")
|
| 555 |
print(f" eval_vram: {eval_mem} GB")
|
|
|
|
| 440 |
print(f"\n--- Step 3: VRAM estimation ---")
|
| 441 |
if params_b:
|
| 442 |
bits = PRECISION_TO_BITS.get(precision, 4)
|
| 443 |
+
est_mem = estimate_weight_memory_gb(params_b, bits=bits, overhead_factor=2.2)
|
| 444 |
print(f" bits: {bits}")
|
| 445 |
print(f" estimated_vram: {est_mem} GB")
|
| 446 |
else:
|
|
|
|
| 537 |
print(f"\n--- Step 4: Quantization VRAM ---")
|
| 538 |
quant_mem = None
|
| 539 |
if size_gb and num_layers:
|
| 540 |
+
quant_mem = estimate_quantization_memory_gb(size_gb, num_layers, overhead_factor=1.5)
|
| 541 |
print(f" model_weight_gb: {size_gb}")
|
| 542 |
print(f" num_layers: {num_layers}")
|
| 543 |
print(f" quant_vram: {quant_mem} GB")
|
|
|
|
| 549 |
scheme = SUPPORTED_QUANT_SCHEMES.get(quant_scheme)
|
| 550 |
eval_mem = None
|
| 551 |
if params_b and scheme:
|
| 552 |
+
eval_mem = estimate_weight_memory_gb(params_b, bits=scheme.bits, overhead_factor=2.2)
|
| 553 |
print(f" params_b: {params_b}")
|
| 554 |
print(f" output_bits: {scheme.bits}")
|
| 555 |
print(f" eval_vram: {eval_mem} GB")
|