Spaces:
Running
Running
fix latency
Browse files
app.py
CHANGED
|
@@ -101,6 +101,40 @@ def evaluate_cpu_speed(model, dummy_input, warmup_rounds=5, test_rounds=25):
|
|
| 101 |
|
| 102 |
return mean_latency, std_latency, throughput
|
| 103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
# %% ../nbs/00_benchmark.ipynb 13
|
| 105 |
@torch.inference_mode()
|
| 106 |
def get_model_macs(model, inputs) -> int:
|
|
@@ -157,7 +191,10 @@ def benchmark(model, dummy_input):
|
|
| 157 |
|
| 158 |
# CPU Speed
|
| 159 |
print('cpu speed')
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
# Model MACs and parameters with fallbacks
|
| 163 |
print('macs')
|
|
@@ -357,9 +394,8 @@ def benchmark_interface(model_name, compression_level, metrics):
|
|
| 357 |
|
| 358 |
# Benchmark before (convert to readable units for plotting)
|
| 359 |
if metrics == 'Latency':
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
*_, before_results = evaluate_cpu_speed(model, dummy_input)
|
| 363 |
elif metrics == 'Size':
|
| 364 |
before_results = get_model_size(model) / 1e6 # MB
|
| 365 |
elif metrics == 'MACs':
|
|
@@ -398,9 +434,8 @@ def benchmark_interface(model_name, compression_level, metrics):
|
|
| 398 |
|
| 399 |
|
| 400 |
if metrics == 'Latency':
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
*_, after_results = evaluate_cpu_speed(q_model, dummy_input)
|
| 404 |
elif metrics == 'Size':
|
| 405 |
after_results = get_model_size(q_model) / 1e6 # MB
|
| 406 |
elif metrics == 'MACs':
|
|
|
|
| 101 |
|
| 102 |
return mean_latency, std_latency, throughput
|
| 103 |
|
| 104 |
# Benchmark helpers: numpy for latency statistics, time for wall-clock timing.
# NOTE(review): `copy` is imported but unused in the visible code (the
# benchmark deliberately avoids deepcopy) — confirm before removing.
import numpy as np, copy, time

# Feature-detect torch.utils.benchmark.Timer. `_HAS_TBENCH` is not read in
# the visible code — presumably consumed elsewhere in the file; verify.
try:
    from torch.utils.benchmark import Timer
    _HAS_TBENCH = True
except Exception:
    _HAS_TBENCH = False
|
| 111 |
@torch.inference_mode()
def evaluate_cpu_speed_raw(model, dummy_input, warmup_rounds=10, test_rounds=31):
    """Measure single-input CPU inference latency for *model*.

    Times individual forward passes with a wall clock and reports robust
    statistics (median-based) in milliseconds, plus throughput derived
    from the median latency, as a dict with keys ``p50_ms``, ``p90_ms``,
    ``mean_ms``, ``std_ms`` and ``throughput_ips``.
    """
    # Reuse the SAME instance (no deepcopy) so any benign prepacked
    # weights or caches on the model are preserved.
    net = model.eval().to("cpu")
    inp = dummy_input.to("cpu")

    # Brief warmup: enough to populate caches, short enough to avoid
    # thermal throttling skewing the measurement.
    for _ in range(warmup_rounds):
        net(inp)

    # Collect one wall-clock latency sample (ms) per forward pass.
    samples = []
    for _ in range(test_rounds):
        start = time.perf_counter()
        net(inp)
        samples.append((time.perf_counter() - start) * 1e3)

    lat = np.asarray(samples, dtype=float)
    median_ms = float(np.median(lat))
    return {
        "p50_ms": median_ms,
        "p90_ms": float(np.percentile(lat, 90)),
        "mean_ms": float(lat.mean()),
        "std_ms": float(lat.std()),
        # Throughput in inferences/second, from the median latency.
        "throughput_ips": float(1000.0 / median_ms),
    }
|
| 137 |
+
|
| 138 |
# %% ../nbs/00_benchmark.ipynb 13
|
| 139 |
@torch.inference_mode()
|
| 140 |
def get_model_macs(model, inputs) -> int:
|
|
|
|
| 191 |
|
| 192 |
# CPU Speed
|
| 193 |
print('cpu speed')
|
| 194 |
+
base_stats = evaluate_cpu_speed_raw(model, dummy_input)
|
| 195 |
+
cpu_latency = base_stats["p50_ms"]
|
| 196 |
+
cpu_std_latency = base_stats["std_ms"]
|
| 197 |
+
cpu_throughput = base_stats["throughput_ips"]
|
| 198 |
|
| 199 |
# Model MACs and parameters with fallbacks
|
| 200 |
print('macs')
|
|
|
|
| 394 |
|
| 395 |
# Benchmark before (convert to readable units for plotting)
|
| 396 |
if metrics == 'Latency':
|
| 397 |
+
base_stats = evaluate_cpu_speed_raw(model, dummy_input)
|
| 398 |
+
before_results = base_stats["p50_ms"]
|
|
|
|
| 399 |
elif metrics == 'Size':
|
| 400 |
before_results = get_model_size(model) / 1e6 # MB
|
| 401 |
elif metrics == 'MACs':
|
|
|
|
| 434 |
|
| 435 |
|
| 436 |
if metrics == 'Latency':
|
| 437 |
+
base_stats = evaluate_cpu_speed_raw(q_model, dummy_input)
|
| 438 |
+
after_results = base_stats["p50_ms"]
|
|
|
|
| 439 |
elif metrics == 'Size':
|
| 440 |
after_results = get_model_size(q_model) / 1e6 # MB
|
| 441 |
elif metrics == 'MACs':
|