Spaces:

Intel
/

low_bit_open_llm_leaderboard

Running

App Files Community

wenjiao committed 9 days ago

Commit

f5ea11f

1 Parent(s): b15b21e

update about and params

Browse files

Files changed (4) hide show

src/display/about.py +4 -0
src/submission/check_validity.py +8 -8
src/submission/submit.py +3 -3
tests/test_submit.py +3 -3

src/display/about.py CHANGED Viewed

@@ -33,6 +33,8 @@ LLM_BENCHMARKS_TEXT = f"""
 ## ABOUT
 Quantization is a key technique for making LLMs more accessible and practical for a wide range of applications, especially where computational resources are a limiting factor. However, there has been no dedicated tool to track and compare quantized LLMs across different quantization algorithms, making it difficult to filter out genuine progress by the open-source community and identify the current state of the art.
 ### Introducing V1.0
 V1.0 marks the launch of our comprehensive evaluation system for quantized language models.
 This version introduces a standardized platform for submitting and benchmarking models, providing clear insights into their performance across various tasks.
@@ -79,6 +81,8 @@ results = simple_evaluate(
 To get more information about quantization, see:
 - auto-round: [intel/auto-round](https://github.com/intel/auto-round)
 - neural-compressor: [intel/neural-compressor](https://github.com/intel/neural-compressor/tree/master)
 """
 FAQ_TEXT = """

 ## ABOUT
 Quantization is a key technique for making LLMs more accessible and practical for a wide range of applications, especially where computational resources are a limiting factor. However, there has been no dedicated tool to track and compare quantized LLMs across different quantization algorithms, making it difficult to filter out genuine progress by the open-source community and identify the current state of the art.
+> ℹ️ **Scope:** This leaderboard mainly supports **text-generation models**.
 ### Introducing V1.0
 V1.0 marks the launch of our comprehensive evaluation system for quantized language models.
 This version introduces a standardized platform for submitting and benchmarking models, providing clear insights into their performance across various tasks.
 To get more information about quantization, see:
 - auto-round: [intel/auto-round](https://github.com/intel/auto-round)
 - neural-compressor: [intel/neural-compressor](https://github.com/intel/neural-compressor/tree/master)
+> 💡 **Found this quantization useful?** If AutoRound helped you ship a smaller, faster model, please consider giving it a ⭐ on GitHub — [**Star intel/auto-round**](https://github.com/intel/auto-round). It helps the project grow and brings more low-bit techniques to the community.
 """
 FAQ_TEXT = """

src/submission/check_validity.py CHANGED Viewed

@@ -603,24 +603,24 @@ PRECISION_TO_BITS: dict[str, int] = {
 def estimate_weight_memory_gb(
     params_b: float,
     bits: int,
-    overhead_factor: float = 1.2,
 ) -> float:
     """Estimate GPU VRAM (GB) required to load a model with *bits*-bit weights.
     The formula covers only weight storage; *overhead_factor* accounts for KV
-    cache, activations, and framework buffers (default 1.2 = 20 % overhead).
     Examples
     --------
-    - 7 B  W4A16:  7  × 0.5 × 1.2 ≈  4.2 GB
-    - 13 B W4A16:  13 × 0.5 × 1.2 ≈  7.8 GB
-    - 70 B W4A16:  70 × 0.5 × 1.2 ≈ 42.0 GB
     Args:
         params_b:        Parameter count in billions.
         bits:            Weight quantization bits (e.g. 4 for W4A16, 8 for W8A16,
                          16 for FP16/BF16, 32 for FP32).
-        overhead_factor: Multiplier for non-weight memory.  Default 1.2.
     Returns:
         Estimated GPU memory in GB (rounded to 2 decimal places).
@@ -816,7 +816,7 @@ def get_num_layers(model_config) -> int | None:
 def estimate_quantization_memory_gb(
     model_weight_gb: float,
     num_layers: int,
-    overhead_factor: float = 1.3,
 ) -> float:
     """Estimate GPU VRAM needed to **quantize** a model (layerwise / blockwise).
@@ -827,7 +827,7 @@ def estimate_quantization_memory_gb(
                       + model_weight_gb / num_layers         # activations / Hessian for that layer
                      ) × overhead_factor
-    The ``overhead_factor`` (default 1.3) covers framework buffers, calibration
     data, and temporary tensors.
     Args:

 def estimate_weight_memory_gb(
     params_b: float,
     bits: int,
+    overhead_factor: float = 2.2,
 ) -> float:
     """Estimate GPU VRAM (GB) required to load a model with *bits*-bit weights.
     The formula covers only weight storage; *overhead_factor* accounts for KV
+    cache, activations, and framework buffers (default 2.2 = 120 % overhead).
     Examples
     --------
+    - 7 B  W4A16:  7  × 0.5 × 2.2 ≈  7.7 GB
+    - 13 B W4A16:  13 × 0.5 × 2.2 ≈ 14.3 GB
+    - 70 B W4A16:  70 × 0.5 × 2.2 ≈ 77.0 GB
     Args:
         params_b:        Parameter count in billions.
         bits:            Weight quantization bits (e.g. 4 for W4A16, 8 for W8A16,
                          16 for FP16/BF16, 32 for FP32).
+        overhead_factor: Multiplier for non-weight memory.  Default 2.2.
     Returns:
         Estimated GPU memory in GB (rounded to 2 decimal places).
 def estimate_quantization_memory_gb(
     model_weight_gb: float,
     num_layers: int,
+    overhead_factor: float = 1.5,
 ) -> float:
     """Estimate GPU VRAM needed to **quantize** a model (layerwise / blockwise).
                       + model_weight_gb / num_layers         # activations / Hessian for that layer
                      ) × overhead_factor
+    The ``overhead_factor`` (default 1.5) covers framework buffers, calibration
     data, and temporary tensors.
     Args:

src/submission/submit.py CHANGED Viewed

@@ -572,7 +572,7 @@ def add_new_eval(
     # ── Step 3: Estimate VRAM ────────────────────────────────────────
     bits_for_precision = PRECISION_TO_BITS.get(precision, 4)
     estimated_memory_gb = estimate_weight_memory_gb(
-        params_b=model_params, bits=bits_for_precision, overhead_factor=1.2,
     )
@@ -837,7 +837,7 @@ def add_new_quant(
     quant_memory_gb = estimate_quantization_memory_gb(
         model_weight_gb=model_weight_gb,
         num_layers=num_layers,
-        overhead_factor=1.3,
     )
     quant_gpu_type, quant_gpu_nums = select_gpu_with_override(quant_memory_gb, hardware_override, gpu_count_override)
@@ -846,7 +846,7 @@ def add_new_quant(
     # ── Step 4: Estimate VRAM for post-quantization evaluation ───────
     output_bits = scheme.bits
     eval_memory_gb = estimate_weight_memory_gb(
-        params_b=model_params, bits=output_bits, overhead_factor=1.2,
     )
     eval_gpu_type, eval_gpu_nums = select_gpu_with_override(eval_memory_gb, hardware_override, gpu_count_override)

     # ── Step 3: Estimate VRAM ────────────────────────────────────────
     bits_for_precision = PRECISION_TO_BITS.get(precision, 4)
     estimated_memory_gb = estimate_weight_memory_gb(
+        params_b=model_params, bits=bits_for_precision, overhead_factor=2.2,
     )
     quant_memory_gb = estimate_quantization_memory_gb(
         model_weight_gb=model_weight_gb,
         num_layers=num_layers,
+        overhead_factor=1.5,
     )
     quant_gpu_type, quant_gpu_nums = select_gpu_with_override(quant_memory_gb, hardware_override, gpu_count_override)
     # ── Step 4: Estimate VRAM for post-quantization evaluation ───────
     output_bits = scheme.bits
     eval_memory_gb = estimate_weight_memory_gb(
+        params_b=model_params, bits=output_bits, overhead_factor=2.2,
     )
     eval_gpu_type, eval_gpu_nums = select_gpu_with_override(eval_memory_gb, hardware_override, gpu_count_override)

tests/test_submit.py CHANGED Viewed

@@ -440,7 +440,7 @@ def test_auto_eval():
     print(f"\n--- Step 3: VRAM estimation ---")
     if params_b:
         bits = PRECISION_TO_BITS.get(precision, 4)
-        est_mem = estimate_weight_memory_gb(params_b, bits=bits, overhead_factor=1.2)
         print(f"  bits:              {bits}")
         print(f"  estimated_vram:    {est_mem} GB")
     else:
@@ -537,7 +537,7 @@ def test_auto_quant():
     print(f"\n--- Step 4: Quantization VRAM ---")
     quant_mem = None
     if size_gb and num_layers:
-        quant_mem = estimate_quantization_memory_gb(size_gb, num_layers, overhead_factor=1.3)
         print(f"  model_weight_gb: {size_gb}")
         print(f"  num_layers:      {num_layers}")
         print(f"  quant_vram:      {quant_mem} GB")
@@ -549,7 +549,7 @@ def test_auto_quant():
     scheme = SUPPORTED_QUANT_SCHEMES.get(quant_scheme)
     eval_mem = None
     if params_b and scheme:
-        eval_mem = estimate_weight_memory_gb(params_b, bits=scheme.bits, overhead_factor=1.2)
         print(f"  params_b:     {params_b}")
         print(f"  output_bits:  {scheme.bits}")
         print(f"  eval_vram:    {eval_mem} GB")

     print(f"\n--- Step 3: VRAM estimation ---")
     if params_b:
         bits = PRECISION_TO_BITS.get(precision, 4)
+        est_mem = estimate_weight_memory_gb(params_b, bits=bits, overhead_factor=2.2)
         print(f"  bits:              {bits}")
         print(f"  estimated_vram:    {est_mem} GB")
     else:
     print(f"\n--- Step 4: Quantization VRAM ---")
     quant_mem = None
     if size_gb and num_layers:
+        quant_mem = estimate_quantization_memory_gb(size_gb, num_layers, overhead_factor=1.5)
         print(f"  model_weight_gb: {size_gb}")
         print(f"  num_layers:      {num_layers}")
         print(f"  quant_vram:      {quant_mem} GB")
     scheme = SUPPORTED_QUANT_SCHEMES.get(quant_scheme)
     eval_mem = None
     if params_b and scheme:
+        eval_mem = estimate_weight_memory_gb(params_b, bits=scheme.bits, overhead_factor=2.2)
         print(f"  params_b:     {params_b}")
         print(f"  output_bits:  {scheme.bits}")
         print(f"  eval_vram:    {eval_mem} GB")