Spaces:

Intel
/

low_bit_open_llm_leaderboard

Running

App Files Files Community

wenjiao committed 6 days ago

Commit

4802f1a

1 Parent(s): f5ea11f

fix init Evaluation queue count + add quant description + update overhead_factor params

Browse files

Files changed (7) hide show

app.py +10 -0
src/app_helpers/assets.py +12 -0
src/display/about.py +1 -0
src/display/css_html_js.py +17 -0
src/submission/check_validity.py +6 -6
src/submission/submit.py +2 -2
tests/test_submit.py +2 -2

app.py CHANGED Viewed

@@ -1012,6 +1012,16 @@ with demo:
     _quant_submit_evt.then(fn=refresh_queue_tables, inputs=_REFRESH_INPUTS, outputs=_REFRESH_OUTPUTS)
     _eval_submit_evt.then(fn=refresh_queue_tables, inputs=_REFRESH_INPUTS, outputs=_REFRESH_OUTPUTS)
     # Checkbox filters — instantly re-filter the queue tables
     my_submissions_quant_cb.change(
         fn=refresh_queue_tables,

     _quant_submit_evt.then(fn=refresh_queue_tables, inputs=_REFRESH_INPUTS, outputs=_REFRESH_OUTPUTS)
     _eval_submit_evt.then(fn=refresh_queue_tables, inputs=_REFRESH_INPUTS, outputs=_REFRESH_OUTPUTS)
+    # Refresh queue tables on every page load so the accordion counts reflect
+    # the current state of disk — not the (possibly stale) values captured at
+    # module-load time. Without this, the labels show whatever count was true
+    # when the server started, until the 60 s timer fires for the first time.
+    demo.load(
+        fn=refresh_queue_tables,
+        inputs=_REFRESH_INPUTS,
+        outputs=_REFRESH_OUTPUTS,
+    )
     # Checkbox filters — instantly re-filter the queue tables
     my_submissions_quant_cb.change(
         fn=refresh_queue_tables,

src/app_helpers/assets.py CHANGED Viewed

@@ -431,10 +431,12 @@ SIDEBAR_HEAD = r"""
         quant: {
             title: 'Quantization',
             subtitle: 'Submit a source model and quantization scheme to run managed low-bit quantization jobs on the hosted pipeline',
             badges: [
                 { icon: '🏆', text: 'Open Source Model' },
                 { icon: '⚡', text: 'INT4 (W4A16) / MXFP4 / NVFP4' },
                 { icon: '📊', text: 'Open Source Benchmark' },
                 { icon: '🤖', text: 'Agentic AI for AutoQuant' }
             ]
         },
@@ -494,10 +496,20 @@ SIDEBAR_HEAD = r"""
     function syncHero(tid) {
         var heroTitle = document.getElementById('hero-title');
         var heroSubtitle = document.getElementById('hero-subtitle');
         var heroBadges = document.getElementById('hero-badges');
         var copy = HERO_COPY[tid] || HERO_COPY['pipeline-results'];
         if (heroTitle) heroTitle.textContent = copy.title;
         if (heroSubtitle) heroSubtitle.textContent = copy.subtitle;
         if (heroBadges && copy.badges !== undefined) {
             heroBadges.innerHTML = copy.badges.map(function(b) {
                 return '<span class="hero-badge">' + b.icon + ' ' + b.text + '</span>';

         quant: {
             title: 'Quantization',
             subtitle: 'Submit a source model and quantization scheme to run managed low-bit quantization jobs on the hosted pipeline',
+            note: '💡 <strong>Powered by AutoRound.</strong> Find it useful? <a href="https://github.com/intel/auto-round" target="_blank" rel="noopener">Star intel/auto-round ⭐</a> on GitHub.',
             badges: [
                 { icon: '🏆', text: 'Open Source Model' },
                 { icon: '⚡', text: 'INT4 (W4A16) / MXFP4 / NVFP4' },
                 { icon: '📊', text: 'Open Source Benchmark' },
+                { icon: '⭐', text: 'AutoRound' },
                 { icon: '🤖', text: 'Agentic AI for AutoQuant' }
             ]
         },
     function syncHero(tid) {
         var heroTitle = document.getElementById('hero-title');
         var heroSubtitle = document.getElementById('hero-subtitle');
+        var heroNote = document.getElementById('hero-note');
         var heroBadges = document.getElementById('hero-badges');
         var copy = HERO_COPY[tid] || HERO_COPY['pipeline-results'];
         if (heroTitle) heroTitle.textContent = copy.title;
         if (heroSubtitle) heroSubtitle.textContent = copy.subtitle;
+        if (heroNote) {
+            if (copy.note) {
+                heroNote.innerHTML = copy.note;
+                heroNote.style.display = '';
+            } else {
+                heroNote.innerHTML = '';
+                heroNote.style.display = 'none';
+            }
+        }
         if (heroBadges && copy.badges !== undefined) {
             heroBadges.innerHTML = copy.badges.map(function(b) {
                 return '<span class="hero-badge">' + b.icon + ' ' + b.text + '</span>';

src/display/about.py CHANGED Viewed

@@ -8,6 +8,7 @@ TITLE = """
             <div class="hero-text" id="hero-copy">
                   <h1 class="hero-title" id="hero-title">Low-bit LLM Leaderboard</h1>
                   <p class="hero-subtitle" id="hero-subtitle">Track, compare and benchmark quantized language models across 3 standard evaluation tasks</p>
                   <div class="hero-badges" id="hero-badges">
                         <span class="hero-badge">🏆 Open Source Model</span>
                         <span class="hero-badge">⚡ INT4 (W4A16) / MXFP4 / NVFP4</span>

             <div class="hero-text" id="hero-copy">
                   <h1 class="hero-title" id="hero-title">Low-bit LLM Leaderboard</h1>
                   <p class="hero-subtitle" id="hero-subtitle">Track, compare and benchmark quantized language models across 3 standard evaluation tasks</p>
+                  <p class="hero-note" id="hero-note" style="display:none;"></p>
                   <div class="hero-badges" id="hero-badges">
                         <span class="hero-badge">🏆 Open Source Model</span>
                         <span class="hero-badge">⚡ INT4 (W4A16) / MXFP4 / NVFP4</span>

src/display/css_html_js.py CHANGED Viewed

@@ -567,6 +567,23 @@ body {
     line-height: 1.5;
 }
 .hero-badges {
     display: flex;
     flex-wrap: wrap;

     line-height: 1.5;
 }
+.hero-note {
+    color: rgba(255, 255, 255, 0.92) !important;
+    font-size: 0.92rem !important;
+    margin: 0 0 14px 0 !important;
+    padding: 8px 12px;
+    line-height: 1.5;
+    background: rgba(255, 255, 255, 0.10);
+    border-left: 3px solid #fcd34d;
+    border-radius: 6px;
+}
+.hero-note a {
+    color: #fde68a !important;
+    text-decoration: underline;
+    font-weight: 600;
+}
 .hero-badges {
     display: flex;
     flex-wrap: wrap;

src/submission/check_validity.py CHANGED Viewed

@@ -603,24 +603,24 @@ PRECISION_TO_BITS: dict[str, int] = {
 def estimate_weight_memory_gb(
     params_b: float,
     bits: int,
-    overhead_factor: float = 2.2,
 ) -> float:
     """Estimate GPU VRAM (GB) required to load a model with *bits*-bit weights.
     The formula covers only weight storage; *overhead_factor* accounts for KV
-    cache, activations, and framework buffers (default 2.2 = 120 % overhead).
     Examples
     --------
-    - 7 B  W4A16:  7  × 0.5 × 2.2 ≈  7.7 GB
-    - 13 B W4A16:  13 × 0.5 × 2.2 ≈ 14.3 GB
-    - 70 B W4A16:  70 × 0.5 × 2.2 ≈ 77.0 GB
     Args:
         params_b:        Parameter count in billions.
         bits:            Weight quantization bits (e.g. 4 for W4A16, 8 for W8A16,
                          16 for FP16/BF16, 32 for FP32).
-        overhead_factor: Multiplier for non-weight memory.  Default 2.2.
     Returns:
         Estimated GPU memory in GB (rounded to 2 decimal places).

 def estimate_weight_memory_gb(
     params_b: float,
     bits: int,
+    overhead_factor: float = 4.4,
 ) -> float:
     """Estimate GPU VRAM (GB) required to load a model with *bits*-bit weights.
     The formula covers only weight storage; *overhead_factor* accounts for KV
+    cache, activations, and framework buffers (default 4.4 = 340 % overhead).
     Examples
     --------
+    - 7 B  W4A16:  7  × 0.5 × 4.4 ≈ 15.4 GB
+    - 13 B W4A16:  13 × 0.5 × 4.4 ≈ 28.6 GB
+    - 70 B W4A16:  70 × 0.5 × 4.4 ≈ 154.0 GB
     Args:
         params_b:        Parameter count in billions.
         bits:            Weight quantization bits (e.g. 4 for W4A16, 8 for W8A16,
                          16 for FP16/BF16, 32 for FP32).
+        overhead_factor: Multiplier for non-weight memory.  Default 4.4.
     Returns:
         Estimated GPU memory in GB (rounded to 2 decimal places).

src/submission/submit.py CHANGED Viewed

@@ -572,7 +572,7 @@ def add_new_eval(
     # ── Step 3: Estimate VRAM ────────────────────────────────────────
     bits_for_precision = PRECISION_TO_BITS.get(precision, 4)
     estimated_memory_gb = estimate_weight_memory_gb(
-        params_b=model_params, bits=bits_for_precision, overhead_factor=2.2,
     )
@@ -846,7 +846,7 @@ def add_new_quant(
     # ── Step 4: Estimate VRAM for post-quantization evaluation ───────
     output_bits = scheme.bits
     eval_memory_gb = estimate_weight_memory_gb(
-        params_b=model_params, bits=output_bits, overhead_factor=2.2,
     )
     eval_gpu_type, eval_gpu_nums = select_gpu_with_override(eval_memory_gb, hardware_override, gpu_count_override)

     # ── Step 3: Estimate VRAM ────────────────────────────────────────
     bits_for_precision = PRECISION_TO_BITS.get(precision, 4)
     estimated_memory_gb = estimate_weight_memory_gb(
+        params_b=model_params, bits=bits_for_precision, overhead_factor=4.4,
     )
     # ── Step 4: Estimate VRAM for post-quantization evaluation ───────
     output_bits = scheme.bits
     eval_memory_gb = estimate_weight_memory_gb(
+        params_b=model_params, bits=output_bits, overhead_factor=4.4,
     )
     eval_gpu_type, eval_gpu_nums = select_gpu_with_override(eval_memory_gb, hardware_override, gpu_count_override)

tests/test_submit.py CHANGED Viewed

@@ -440,7 +440,7 @@ def test_auto_eval():
     print(f"\n--- Step 3: VRAM estimation ---")
     if params_b:
         bits = PRECISION_TO_BITS.get(precision, 4)
-        est_mem = estimate_weight_memory_gb(params_b, bits=bits, overhead_factor=2.2)
         print(f"  bits:              {bits}")
         print(f"  estimated_vram:    {est_mem} GB")
     else:
@@ -549,7 +549,7 @@ def test_auto_quant():
     scheme = SUPPORTED_QUANT_SCHEMES.get(quant_scheme)
     eval_mem = None
     if params_b and scheme:
-        eval_mem = estimate_weight_memory_gb(params_b, bits=scheme.bits, overhead_factor=2.2)
         print(f"  params_b:     {params_b}")
         print(f"  output_bits:  {scheme.bits}")
         print(f"  eval_vram:    {eval_mem} GB")

     print(f"\n--- Step 3: VRAM estimation ---")
     if params_b:
         bits = PRECISION_TO_BITS.get(precision, 4)
+        est_mem = estimate_weight_memory_gb(params_b, bits=bits, overhead_factor=4.4)
         print(f"  bits:              {bits}")
         print(f"  estimated_vram:    {est_mem} GB")
     else:
     scheme = SUPPORTED_QUANT_SCHEMES.get(quant_scheme)
     eval_mem = None
     if params_b and scheme:
+        eval_mem = estimate_weight_memory_gb(params_b, bits=scheme.bits, overhead_factor=4.4)
         print(f"  params_b:     {params_b}")
         print(f"  output_bits:  {scheme.bits}")
         print(f"  eval_vram:    {eval_mem} GB")