wenjiao committed on
Commit
f5ea11f
·
1 Parent(s): b15b21e

update about and params

Browse files
src/display/about.py CHANGED
@@ -33,6 +33,8 @@ LLM_BENCHMARKS_TEXT = f"""
33
  ## ABOUT
34
  Quantization is a key technique for making LLMs more accessible and practical for a wide range of applications, especially where computational resources are a limiting factor. However, there has been no dedicated tool to track and compare quantized LLMs across different quantization algorithms, making it difficult to filter out genuine progress by the open-source community and identify the current state of the art.
35
 
 
 
36
  ### Introducing V1.0
37
  V1.0 marks the launch of our comprehensive evaluation system for quantized language models.
38
  This version introduces a standardized platform for submitting and benchmarking models, providing clear insights into their performance across various tasks.
@@ -79,6 +81,8 @@ results = simple_evaluate(
79
  To get more information about quantization, see:
80
  - auto-round: [intel/auto-round](https://github.com/intel/auto-round)
81
  - neural-compressor: [intel/neural-compressor](https://github.com/intel/neural-compressor/tree/master)
 
 
82
  """
83
 
84
  FAQ_TEXT = """
 
33
  ## ABOUT
34
  Quantization is a key technique for making LLMs more accessible and practical for a wide range of applications, especially where computational resources are a limiting factor. However, there has been no dedicated tool to track and compare quantized LLMs across different quantization algorithms, making it difficult to filter out genuine progress by the open-source community and identify the current state of the art.
35
 
36
+ > ℹ️ **Scope:** This leaderboard mainly supports **text-generation models**.
37
+
38
  ### Introducing V1.0
39
  V1.0 marks the launch of our comprehensive evaluation system for quantized language models.
40
  This version introduces a standardized platform for submitting and benchmarking models, providing clear insights into their performance across various tasks.
 
81
  To get more information about quantization, see:
82
  - auto-round: [intel/auto-round](https://github.com/intel/auto-round)
83
  - neural-compressor: [intel/neural-compressor](https://github.com/intel/neural-compressor/tree/master)
84
+
85
+ > 💡 **Found this quantization useful?** If AutoRound helped you ship a smaller, faster model, please consider giving it a ⭐ on GitHub — [**Star intel/auto-round**](https://github.com/intel/auto-round). It helps the project grow and brings more low-bit techniques to the community.
86
  """
87
 
88
  FAQ_TEXT = """
src/submission/check_validity.py CHANGED
@@ -603,24 +603,24 @@ PRECISION_TO_BITS: dict[str, int] = {
603
  def estimate_weight_memory_gb(
604
  params_b: float,
605
  bits: int,
606
- overhead_factor: float = 1.2,
607
  ) -> float:
608
  """Estimate GPU VRAM (GB) required to load a model with *bits*-bit weights.
609
 
610
  The formula covers only weight storage; *overhead_factor* accounts for KV
611
- cache, activations, and framework buffers (default 1.2 = 20 % overhead).
612
 
613
  Examples
614
  --------
615
- - 7 B W4A16: 7 × 0.5 × 1.2 ≈ 4.2 GB
616
- - 13 B W4A16: 13 × 0.5 × 1.2 ≈ 7.8 GB
617
- - 70 B W4A16: 70 × 0.5 × 1.2 ≈ 42.0 GB
618
 
619
  Args:
620
  params_b: Parameter count in billions.
621
  bits: Weight quantization bits (e.g. 4 for W4A16, 8 for W8A16,
622
  16 for FP16/BF16, 32 for FP32).
623
- overhead_factor: Multiplier for non-weight memory. Default 1.2.
624
 
625
  Returns:
626
  Estimated GPU memory in GB (rounded to 2 decimal places).
@@ -816,7 +816,7 @@ def get_num_layers(model_config) -> int | None:
816
  def estimate_quantization_memory_gb(
817
  model_weight_gb: float,
818
  num_layers: int,
819
- overhead_factor: float = 1.3,
820
  ) -> float:
821
  """Estimate GPU VRAM needed to **quantize** a model (layerwise / blockwise).
822
 
@@ -827,7 +827,7 @@ def estimate_quantization_memory_gb(
827
  + model_weight_gb / num_layers # activations / Hessian for that layer
828
  ) × overhead_factor
829
 
830
- The ``overhead_factor`` (default 1.3) covers framework buffers, calibration
831
  data, and temporary tensors.
832
 
833
  Args:
 
603
  def estimate_weight_memory_gb(
604
  params_b: float,
605
  bits: int,
606
+ overhead_factor: float = 2.2,
607
  ) -> float:
608
  """Estimate GPU VRAM (GB) required to load a model with *bits*-bit weights.
609
 
610
  The formula covers only weight storage; *overhead_factor* accounts for KV
611
+ cache, activations, and framework buffers (default 2.2 = 120 % overhead).
612
 
613
  Examples
614
  --------
615
+ - 7 B W4A16: 7 × 0.5 × 2.2 ≈ 7.7 GB
616
+ - 13 B W4A16: 13 × 0.5 × 2.2 ≈ 14.3 GB
617
+ - 70 B W4A16: 70 × 0.5 × 2.2 ≈ 77.0 GB
618
 
619
  Args:
620
  params_b: Parameter count in billions.
621
  bits: Weight quantization bits (e.g. 4 for W4A16, 8 for W8A16,
622
  16 for FP16/BF16, 32 for FP32).
623
+ overhead_factor: Multiplier for non-weight memory. Default 2.2.
624
 
625
  Returns:
626
  Estimated GPU memory in GB (rounded to 2 decimal places).
 
816
  def estimate_quantization_memory_gb(
817
  model_weight_gb: float,
818
  num_layers: int,
819
+ overhead_factor: float = 1.5,
820
  ) -> float:
821
  """Estimate GPU VRAM needed to **quantize** a model (layerwise / blockwise).
822
 
 
827
  + model_weight_gb / num_layers # activations / Hessian for that layer
828
  ) × overhead_factor
829
 
830
+ The ``overhead_factor`` (default 1.5) covers framework buffers, calibration
831
  data, and temporary tensors.
832
 
833
  Args:
src/submission/submit.py CHANGED
@@ -572,7 +572,7 @@ def add_new_eval(
572
  # ── Step 3: Estimate VRAM ────────────────────────────────────────
573
  bits_for_precision = PRECISION_TO_BITS.get(precision, 4)
574
  estimated_memory_gb = estimate_weight_memory_gb(
575
- params_b=model_params, bits=bits_for_precision, overhead_factor=1.2,
576
  )
577
 
578
 
@@ -837,7 +837,7 @@ def add_new_quant(
837
  quant_memory_gb = estimate_quantization_memory_gb(
838
  model_weight_gb=model_weight_gb,
839
  num_layers=num_layers,
840
- overhead_factor=1.3,
841
  )
842
 
843
  quant_gpu_type, quant_gpu_nums = select_gpu_with_override(quant_memory_gb, hardware_override, gpu_count_override)
@@ -846,7 +846,7 @@ def add_new_quant(
846
  # ── Step 4: Estimate VRAM for post-quantization evaluation ───────
847
  output_bits = scheme.bits
848
  eval_memory_gb = estimate_weight_memory_gb(
849
- params_b=model_params, bits=output_bits, overhead_factor=1.2,
850
  )
851
 
852
  eval_gpu_type, eval_gpu_nums = select_gpu_with_override(eval_memory_gb, hardware_override, gpu_count_override)
 
572
  # ── Step 3: Estimate VRAM ────────────────────────────────────────
573
  bits_for_precision = PRECISION_TO_BITS.get(precision, 4)
574
  estimated_memory_gb = estimate_weight_memory_gb(
575
+ params_b=model_params, bits=bits_for_precision, overhead_factor=2.2,
576
  )
577
 
578
 
 
837
  quant_memory_gb = estimate_quantization_memory_gb(
838
  model_weight_gb=model_weight_gb,
839
  num_layers=num_layers,
840
+ overhead_factor=1.5,
841
  )
842
 
843
  quant_gpu_type, quant_gpu_nums = select_gpu_with_override(quant_memory_gb, hardware_override, gpu_count_override)
 
846
  # ── Step 4: Estimate VRAM for post-quantization evaluation ───────
847
  output_bits = scheme.bits
848
  eval_memory_gb = estimate_weight_memory_gb(
849
+ params_b=model_params, bits=output_bits, overhead_factor=2.2,
850
  )
851
 
852
  eval_gpu_type, eval_gpu_nums = select_gpu_with_override(eval_memory_gb, hardware_override, gpu_count_override)
tests/test_submit.py CHANGED
@@ -440,7 +440,7 @@ def test_auto_eval():
440
  print(f"\n--- Step 3: VRAM estimation ---")
441
  if params_b:
442
  bits = PRECISION_TO_BITS.get(precision, 4)
443
- est_mem = estimate_weight_memory_gb(params_b, bits=bits, overhead_factor=1.2)
444
  print(f" bits: {bits}")
445
  print(f" estimated_vram: {est_mem} GB")
446
  else:
@@ -537,7 +537,7 @@ def test_auto_quant():
537
  print(f"\n--- Step 4: Quantization VRAM ---")
538
  quant_mem = None
539
  if size_gb and num_layers:
540
- quant_mem = estimate_quantization_memory_gb(size_gb, num_layers, overhead_factor=1.3)
541
  print(f" model_weight_gb: {size_gb}")
542
  print(f" num_layers: {num_layers}")
543
  print(f" quant_vram: {quant_mem} GB")
@@ -549,7 +549,7 @@ def test_auto_quant():
549
  scheme = SUPPORTED_QUANT_SCHEMES.get(quant_scheme)
550
  eval_mem = None
551
  if params_b and scheme:
552
- eval_mem = estimate_weight_memory_gb(params_b, bits=scheme.bits, overhead_factor=1.2)
553
  print(f" params_b: {params_b}")
554
  print(f" output_bits: {scheme.bits}")
555
  print(f" eval_vram: {eval_mem} GB")
 
440
  print(f"\n--- Step 3: VRAM estimation ---")
441
  if params_b:
442
  bits = PRECISION_TO_BITS.get(precision, 4)
443
+ est_mem = estimate_weight_memory_gb(params_b, bits=bits, overhead_factor=2.2)
444
  print(f" bits: {bits}")
445
  print(f" estimated_vram: {est_mem} GB")
446
  else:
 
537
  print(f"\n--- Step 4: Quantization VRAM ---")
538
  quant_mem = None
539
  if size_gb and num_layers:
540
+ quant_mem = estimate_quantization_memory_gb(size_gb, num_layers, overhead_factor=1.5)
541
  print(f" model_weight_gb: {size_gb}")
542
  print(f" num_layers: {num_layers}")
543
  print(f" quant_vram: {quant_mem} GB")
 
549
  scheme = SUPPORTED_QUANT_SCHEMES.get(quant_scheme)
550
  eval_mem = None
551
  if params_b and scheme:
552
+ eval_mem = estimate_weight_memory_gb(params_b, bits=scheme.bits, overhead_factor=2.2)
553
  print(f" params_b: {params_b}")
554
  print(f" output_bits: {scheme.bits}")
555
  print(f" eval_vram: {eval_mem} GB")