fix init Evaluation queue count + add quant description + update overhead_factor params
Browse files- app.py +10 -0
- src/app_helpers/assets.py +12 -0
- src/display/about.py +1 -0
- src/display/css_html_js.py +17 -0
- src/submission/check_validity.py +6 -6
- src/submission/submit.py +2 -2
- tests/test_submit.py +2 -2
app.py
CHANGED
|
@@ -1012,6 +1012,16 @@ with demo:
|
|
| 1012 |
_quant_submit_evt.then(fn=refresh_queue_tables, inputs=_REFRESH_INPUTS, outputs=_REFRESH_OUTPUTS)
|
| 1013 |
_eval_submit_evt.then(fn=refresh_queue_tables, inputs=_REFRESH_INPUTS, outputs=_REFRESH_OUTPUTS)
|
| 1014 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1015 |
# Checkbox filters — instantly re-filter the queue tables
|
| 1016 |
my_submissions_quant_cb.change(
|
| 1017 |
fn=refresh_queue_tables,
|
|
|
|
| 1012 |
_quant_submit_evt.then(fn=refresh_queue_tables, inputs=_REFRESH_INPUTS, outputs=_REFRESH_OUTPUTS)
|
| 1013 |
_eval_submit_evt.then(fn=refresh_queue_tables, inputs=_REFRESH_INPUTS, outputs=_REFRESH_OUTPUTS)
|
| 1014 |
|
| 1015 |
+
# Refresh queue tables on every page load so the accordion counts reflect
|
| 1016 |
+
# the current state of disk — not the (possibly stale) values captured at
|
| 1017 |
+
# module-load time. Without this, the labels show whatever count was true
|
| 1018 |
+
# when the server started, until the 60 s timer fires for the first time.
|
| 1019 |
+
demo.load(
|
| 1020 |
+
fn=refresh_queue_tables,
|
| 1021 |
+
inputs=_REFRESH_INPUTS,
|
| 1022 |
+
outputs=_REFRESH_OUTPUTS,
|
| 1023 |
+
)
|
| 1024 |
+
|
| 1025 |
# Checkbox filters — instantly re-filter the queue tables
|
| 1026 |
my_submissions_quant_cb.change(
|
| 1027 |
fn=refresh_queue_tables,
|
src/app_helpers/assets.py
CHANGED
|
@@ -431,10 +431,12 @@ SIDEBAR_HEAD = r"""
|
|
| 431 |
quant: {
|
| 432 |
title: 'Quantization',
|
| 433 |
subtitle: 'Submit a source model and quantization scheme to run managed low-bit quantization jobs on the hosted pipeline',
|
|
|
|
| 434 |
badges: [
|
| 435 |
{ icon: 'π', text: 'Open Source Model' },
|
| 436 |
{ icon: '⚡', text: 'INT4 (W4A16) / MXFP4 / NVFP4' },
|
| 437 |
{ icon: 'π', text: 'Open Source Benchmark' },
|
|
|
|
| 438 |
{ icon: 'π€', text: 'Agentic AI for AutoQuant' }
|
| 439 |
]
|
| 440 |
},
|
|
@@ -494,10 +496,20 @@ SIDEBAR_HEAD = r"""
|
|
| 494 |
function syncHero(tid) {
|
| 495 |
var heroTitle = document.getElementById('hero-title');
|
| 496 |
var heroSubtitle = document.getElementById('hero-subtitle');
|
|
|
|
| 497 |
var heroBadges = document.getElementById('hero-badges');
|
| 498 |
var copy = HERO_COPY[tid] || HERO_COPY['pipeline-results'];
|
| 499 |
if (heroTitle) heroTitle.textContent = copy.title;
|
| 500 |
if (heroSubtitle) heroSubtitle.textContent = copy.subtitle;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 501 |
if (heroBadges && copy.badges !== undefined) {
|
| 502 |
heroBadges.innerHTML = copy.badges.map(function(b) {
|
| 503 |
return '<span class="hero-badge">' + b.icon + ' ' + b.text + '</span>';
|
|
|
|
| 431 |
quant: {
|
| 432 |
title: 'Quantization',
|
| 433 |
subtitle: 'Submit a source model and quantization scheme to run managed low-bit quantization jobs on the hosted pipeline',
|
| 434 |
+
note: '💡 <strong>Powered by AutoRound.</strong> Find it useful? <a href="https://github.com/intel/auto-round" target="_blank" rel="noopener">Star intel/auto-round ⭐</a> on GitHub.',
|
| 435 |
badges: [
|
| 436 |
{ icon: 'π', text: 'Open Source Model' },
|
| 437 |
{ icon: '⚡', text: 'INT4 (W4A16) / MXFP4 / NVFP4' },
|
| 438 |
{ icon: 'π', text: 'Open Source Benchmark' },
|
| 439 |
+
{ icon: '⭐', text: 'AutoRound' },
|
| 440 |
{ icon: 'π€', text: 'Agentic AI for AutoQuant' }
|
| 441 |
]
|
| 442 |
},
|
|
|
|
| 496 |
function syncHero(tid) {
|
| 497 |
var heroTitle = document.getElementById('hero-title');
|
| 498 |
var heroSubtitle = document.getElementById('hero-subtitle');
|
| 499 |
+
var heroNote = document.getElementById('hero-note');
|
| 500 |
var heroBadges = document.getElementById('hero-badges');
|
| 501 |
var copy = HERO_COPY[tid] || HERO_COPY['pipeline-results'];
|
| 502 |
if (heroTitle) heroTitle.textContent = copy.title;
|
| 503 |
if (heroSubtitle) heroSubtitle.textContent = copy.subtitle;
|
| 504 |
+
if (heroNote) {
|
| 505 |
+
if (copy.note) {
|
| 506 |
+
heroNote.innerHTML = copy.note;
|
| 507 |
+
heroNote.style.display = '';
|
| 508 |
+
} else {
|
| 509 |
+
heroNote.innerHTML = '';
|
| 510 |
+
heroNote.style.display = 'none';
|
| 511 |
+
}
|
| 512 |
+
}
|
| 513 |
if (heroBadges && copy.badges !== undefined) {
|
| 514 |
heroBadges.innerHTML = copy.badges.map(function(b) {
|
| 515 |
return '<span class="hero-badge">' + b.icon + ' ' + b.text + '</span>';
|
src/display/about.py
CHANGED
|
@@ -8,6 +8,7 @@ TITLE = """
|
|
| 8 |
<div class="hero-text" id="hero-copy">
|
| 9 |
<h1 class="hero-title" id="hero-title">Low-bit LLM Leaderboard</h1>
|
| 10 |
<p class="hero-subtitle" id="hero-subtitle">Track, compare and benchmark quantized language models across 3 standard evaluation tasks</p>
|
|
|
|
| 11 |
<div class="hero-badges" id="hero-badges">
|
| 12 |
<span class="hero-badge">π Open Source Model</span>
|
| 13 |
<span class="hero-badge">⚡ INT4 (W4A16) / MXFP4 / NVFP4</span>
|
|
|
|
| 8 |
<div class="hero-text" id="hero-copy">
|
| 9 |
<h1 class="hero-title" id="hero-title">Low-bit LLM Leaderboard</h1>
|
| 10 |
<p class="hero-subtitle" id="hero-subtitle">Track, compare and benchmark quantized language models across 3 standard evaluation tasks</p>
|
| 11 |
+
<p class="hero-note" id="hero-note" style="display:none;"></p>
|
| 12 |
<div class="hero-badges" id="hero-badges">
|
| 13 |
<span class="hero-badge">π Open Source Model</span>
|
| 14 |
<span class="hero-badge">⚡ INT4 (W4A16) / MXFP4 / NVFP4</span>
|
src/display/css_html_js.py
CHANGED
|
@@ -567,6 +567,23 @@ body {
|
|
| 567 |
line-height: 1.5;
|
| 568 |
}
|
| 569 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 570 |
.hero-badges {
|
| 571 |
display: flex;
|
| 572 |
flex-wrap: wrap;
|
|
|
|
| 567 |
line-height: 1.5;
|
| 568 |
}
|
| 569 |
|
| 570 |
+
.hero-note {
|
| 571 |
+
color: rgba(255, 255, 255, 0.92) !important;
|
| 572 |
+
font-size: 0.92rem !important;
|
| 573 |
+
margin: 0 0 14px 0 !important;
|
| 574 |
+
padding: 8px 12px;
|
| 575 |
+
line-height: 1.5;
|
| 576 |
+
background: rgba(255, 255, 255, 0.10);
|
| 577 |
+
border-left: 3px solid #fcd34d;
|
| 578 |
+
border-radius: 6px;
|
| 579 |
+
}
|
| 580 |
+
|
| 581 |
+
.hero-note a {
|
| 582 |
+
color: #fde68a !important;
|
| 583 |
+
text-decoration: underline;
|
| 584 |
+
font-weight: 600;
|
| 585 |
+
}
|
| 586 |
+
|
| 587 |
.hero-badges {
|
| 588 |
display: flex;
|
| 589 |
flex-wrap: wrap;
|
src/submission/check_validity.py
CHANGED
|
@@ -603,24 +603,24 @@ PRECISION_TO_BITS: dict[str, int] = {
|
|
| 603 |
def estimate_weight_memory_gb(
|
| 604 |
params_b: float,
|
| 605 |
bits: int,
|
| 606 |
-
overhead_factor: float =
|
| 607 |
) -> float:
|
| 608 |
"""Estimate GPU VRAM (GB) required to load a model with *bits*-bit weights.
|
| 609 |
|
| 610 |
The formula covers only weight storage; *overhead_factor* accounts for KV
|
| 611 |
-
cache, activations, and framework buffers (default
|
| 612 |
|
| 613 |
Examples
|
| 614 |
--------
|
| 615 |
-
- 7 B W4A16: 7 × 0.5 ×
|
| 616 |
-
- 13 B W4A16: 13 × 0.5 ×
|
| 617 |
-
- 70 B W4A16: 70 × 0.5 ×
|
| 618 |
|
| 619 |
Args:
|
| 620 |
params_b: Parameter count in billions.
|
| 621 |
bits: Weight quantization bits (e.g. 4 for W4A16, 8 for W8A16,
|
| 622 |
16 for FP16/BF16, 32 for FP32).
|
| 623 |
-
overhead_factor: Multiplier for non-weight memory. Default
|
| 624 |
|
| 625 |
Returns:
|
| 626 |
Estimated GPU memory in GB (rounded to 2 decimal places).
|
|
|
|
| 603 |
def estimate_weight_memory_gb(
|
| 604 |
params_b: float,
|
| 605 |
bits: int,
|
| 606 |
+
overhead_factor: float = 4.4,
|
| 607 |
) -> float:
|
| 608 |
"""Estimate GPU VRAM (GB) required to load a model with *bits*-bit weights.
|
| 609 |
|
| 610 |
The formula covers only weight storage; *overhead_factor* accounts for KV
|
| 611 |
+
cache, activations, and framework buffers (default 4.4 = 340 % overhead).
|
| 612 |
|
| 613 |
Examples
|
| 614 |
--------
|
| 615 |
+
- 7 B W4A16: 7 × 0.5 × 4.4 ≈ 15.4 GB
|
| 616 |
+
- 13 B W4A16: 13 × 0.5 × 4.4 ≈ 28.6 GB
|
| 617 |
+
- 70 B W4A16: 70 × 0.5 × 4.4 ≈ 154.0 GB
|
| 618 |
|
| 619 |
Args:
|
| 620 |
params_b: Parameter count in billions.
|
| 621 |
bits: Weight quantization bits (e.g. 4 for W4A16, 8 for W8A16,
|
| 622 |
16 for FP16/BF16, 32 for FP32).
|
| 623 |
+
overhead_factor: Multiplier for non-weight memory. Default 4.4.
|
| 624 |
|
| 625 |
Returns:
|
| 626 |
Estimated GPU memory in GB (rounded to 2 decimal places).
|
src/submission/submit.py
CHANGED
|
@@ -572,7 +572,7 @@ def add_new_eval(
|
|
| 572 |
# ── Step 3: Estimate VRAM ────────────────────────────────────────
|
| 573 |
bits_for_precision = PRECISION_TO_BITS.get(precision, 4)
|
| 574 |
estimated_memory_gb = estimate_weight_memory_gb(
|
| 575 |
-
params_b=model_params, bits=bits_for_precision, overhead_factor=
|
| 576 |
)
|
| 577 |
|
| 578 |
|
|
@@ -846,7 +846,7 @@ def add_new_quant(
|
|
| 846 |
# ── Step 4: Estimate VRAM for post-quantization evaluation ───────
|
| 847 |
output_bits = scheme.bits
|
| 848 |
eval_memory_gb = estimate_weight_memory_gb(
|
| 849 |
-
params_b=model_params, bits=output_bits, overhead_factor=
|
| 850 |
)
|
| 851 |
|
| 852 |
eval_gpu_type, eval_gpu_nums = select_gpu_with_override(eval_memory_gb, hardware_override, gpu_count_override)
|
|
|
|
| 572 |
# ── Step 3: Estimate VRAM ────────────────────────────────────────
|
| 573 |
bits_for_precision = PRECISION_TO_BITS.get(precision, 4)
|
| 574 |
estimated_memory_gb = estimate_weight_memory_gb(
|
| 575 |
+
params_b=model_params, bits=bits_for_precision, overhead_factor=4.4,
|
| 576 |
)
|
| 577 |
|
| 578 |
|
|
|
|
| 846 |
# ── Step 4: Estimate VRAM for post-quantization evaluation ───────
|
| 847 |
output_bits = scheme.bits
|
| 848 |
eval_memory_gb = estimate_weight_memory_gb(
|
| 849 |
+
params_b=model_params, bits=output_bits, overhead_factor=4.4,
|
| 850 |
)
|
| 851 |
|
| 852 |
eval_gpu_type, eval_gpu_nums = select_gpu_with_override(eval_memory_gb, hardware_override, gpu_count_override)
|
tests/test_submit.py
CHANGED
|
@@ -440,7 +440,7 @@ def test_auto_eval():
|
|
| 440 |
print(f"\n--- Step 3: VRAM estimation ---")
|
| 441 |
if params_b:
|
| 442 |
bits = PRECISION_TO_BITS.get(precision, 4)
|
| 443 |
-
est_mem = estimate_weight_memory_gb(params_b, bits=bits, overhead_factor=
|
| 444 |
print(f" bits: {bits}")
|
| 445 |
print(f" estimated_vram: {est_mem} GB")
|
| 446 |
else:
|
|
@@ -549,7 +549,7 @@ def test_auto_quant():
|
|
| 549 |
scheme = SUPPORTED_QUANT_SCHEMES.get(quant_scheme)
|
| 550 |
eval_mem = None
|
| 551 |
if params_b and scheme:
|
| 552 |
-
eval_mem = estimate_weight_memory_gb(params_b, bits=scheme.bits, overhead_factor=
|
| 553 |
print(f" params_b: {params_b}")
|
| 554 |
print(f" output_bits: {scheme.bits}")
|
| 555 |
print(f" eval_vram: {eval_mem} GB")
|
|
|
|
| 440 |
print(f"\n--- Step 3: VRAM estimation ---")
|
| 441 |
if params_b:
|
| 442 |
bits = PRECISION_TO_BITS.get(precision, 4)
|
| 443 |
+
est_mem = estimate_weight_memory_gb(params_b, bits=bits, overhead_factor=4.4)
|
| 444 |
print(f" bits: {bits}")
|
| 445 |
print(f" estimated_vram: {est_mem} GB")
|
| 446 |
else:
|
|
|
|
| 549 |
scheme = SUPPORTED_QUANT_SCHEMES.get(quant_scheme)
|
| 550 |
eval_mem = None
|
| 551 |
if params_b and scheme:
|
| 552 |
+
eval_mem = estimate_weight_memory_gb(params_b, bits=scheme.bits, overhead_factor=4.4)
|
| 553 |
print(f" params_b: {params_b}")
|
| 554 |
print(f" output_bits: {scheme.bits}")
|
| 555 |
print(f" eval_vram: {eval_mem} GB")
|