import gradio as gr

# %% ../nbs/00_benchmark.ipynb 5
import torch
import time
from codecarbon import OfflineEmissionsTracker
import numpy as np
import os
from thop import profile, clever_format
from thop.vision.basic_hooks import count_convNd, count_linear

# Map quantized modules to existing conv/linear counters
import torch.ao.nn.quantized as nnq
import torch.ao.nn.intrinsic.quantized as nniq

from tqdm.notebook import tqdm
from torchprofile import profile_macs
from fasterai.sparse.all import *
from fasterai.prune.all import *
from torch.ao.quantization import get_default_qconfig_mapping
from torch.ao.quantization.quantize_fx import convert_fx, prepare_fx
import matplotlib.pyplot as plt
import seaborn as sns
import io
import copy

# Simple in-memory caches to avoid recomputation across UI interactions
_MODEL_CACHE = {}
_COMPRESSED_CACHE = {}

# %% ../nbs/00_benchmark.ipynb 7
def get_model_size(model, temp_path="temp_model.pth"):
    """Return model disk size in bytes.

    - If model is a path string, returns file size.
    - If model is an nn.Module, saves state_dict to temp and measures size.
    - If model is a ScriptModule, saves via torch.jit.save and measures size.
    """
    if isinstance(model, str) and os.path.exists(model):
        return os.path.getsize(model)

    try:
        torch.save(model.state_dict(), temp_path)
    except Exception:
        # Fallback for ScriptModules or objects without state_dict
        try:
            torch.jit.save(model, temp_path)
        except Exception:
            torch.save(model, temp_path)

    model_size = os.path.getsize(temp_path)
    os.remove(temp_path)
    
    return model_size
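
# Hedged usage sketch (not called by the app): get_model_size accepts an
# nn.Module, a TorchScript module, or a file path, and reports bytes on disk.
def _example_model_size():
    import torchvision.models as models
    m = models.resnet18(weights=None)
    print(f"eager:    {get_model_size(m) / 1e6:.2f} MB")
    print(f"scripted: {get_model_size(torch.jit.script(m)) / 1e6:.2f} MB")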

# %% ../nbs/00_benchmark.ipynb 8
def get_num_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


# Warm up a model on CPU to stabilize kernel selection and prepack weights
@torch.inference_mode()
def warmup_model(model, num_warmup: int = 10, input_shape=(1, 3, 224, 224)):
    try:
        model.eval()
        device = torch.device("cpu")
        model.to(device)
        dummy_input = torch.randn(*input_shape, device=device)
        for _ in range(num_warmup):
            _ = model(dummy_input)
    except Exception:
        pass
    return model

# %% ../nbs/00_benchmark.ipynb 11
@torch.inference_mode()
def evaluate_cpu_speed(model, dummy_input, warmup_rounds=5, test_rounds=25):
    device = torch.device("cpu")
    model.eval()
    model.to(device)
    dummy_input = dummy_input.to(device)
    
    # Warm up CPU
    for _ in range(warmup_rounds):
        _ = model(dummy_input)
    
    # Measure Latency
    latencies = []
    for _ in range(test_rounds):
        start_time = time.perf_counter()
        _ = model(dummy_input)
        end_time = time.perf_counter()
        latencies.append(end_time - start_time)
    
    latencies = np.array(latencies) * 1000  # Convert to milliseconds
    mean_latency = np.mean(latencies)
    std_latency = np.std(latencies)

    # Measure Throughput
    throughput = dummy_input.size(0) * 1000 / mean_latency  # Inferences per second

    return mean_latency, std_latency, throughput


@torch.inference_mode()
def evaluate_cpu_speed_raw(model, dummy_input, warmup_rounds=10, test_rounds=31):
    # Use the SAME instance (no deepcopy) to keep any benign prepack/caches.
    m = model.eval().to("cpu")
    x = dummy_input.to("cpu")

    # Brief warmup: enough for caches, not long enough to throttle
    for _ in range(warmup_rounds):
        _ = m(x)

    # Time individual forwards; take robust stats (median)
    lat_ms = []
    for _ in range(test_rounds):
        t0 = time.perf_counter(); _ = m(x); t1 = time.perf_counter()
        lat_ms.append((t1 - t0) * 1e3)

    lat_ms = np.asarray(lat_ms, dtype=float)
    p50 = float(np.median(lat_ms))
    return {
        "p50_ms": p50,
        "p90_ms": float(np.percentile(lat_ms, 90)),
        "mean_ms": float(lat_ms.mean()),
        "std_ms": float(lat_ms.std()),
        "throughput_ips": float(1000.0 / p50),
    }
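
# Hedged usage sketch: the raw benchmark reports the median (p50) latency,
# which is more robust to OS scheduling jitter than the mean.
def _example_cpu_speed():
    import torchvision.models as models
    stats = evaluate_cpu_speed_raw(models.resnet18(weights=None),
                                   torch.randn(1, 3, 224, 224))
    print(f"p50 {stats['p50_ms']:.2f} ms, {stats['throughput_ips']:.1f} inf/s")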

# %% ../nbs/00_benchmark.ipynb 13
@torch.inference_mode()
def get_model_macs(model, inputs) -> int:
    args = (inputs,) if not isinstance(inputs, (tuple, list)) else tuple(inputs)
    try:
        return profile_macs(model, args)
    except Exception:
        try:
            custom_ops = {
                nnq.Conv2d: count_convNd,
                nniq.ConvReLU2d: count_convNd,
                nnq.Linear: count_linear,
                nniq.LinearReLU: count_linear,
            }
            macs_val, _ = profile(model, inputs=args, custom_ops=custom_ops)
            return macs_val
        except Exception:
            return 0
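
# Hedged sketch: MACs for a single 224x224 image, reported in GMACs. The
# custom_ops fallback above lets thop count quantized conv/linear layers too.
def _example_macs():
    import torchvision.models as models
    macs = get_model_macs(models.resnet18(weights=None),
                          torch.randn(1, 3, 224, 224))
    print(f"{macs / 1e9:.3f} GMAC")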


# %% ../nbs/00_benchmark.ipynb 16
@torch.inference_mode()
def evaluate_emissions(model, dummy_input, warmup_rounds=5, test_rounds=20):
    device = torch.device("cpu")
    model.eval()
    model.to(device)
    dummy_input = dummy_input.to(device)

    # Warm up CPU
    for _ in range(warmup_rounds):
        _ = model(dummy_input)
    
    # Track emissions while running test_rounds inferences
    tracker = OfflineEmissionsTracker(country_iso_code="USA")
    tracker.start()
    for _ in range(test_rounds):
        _ = model(dummy_input)
    tracker.stop()
    total_emissions = tracker.final_emissions
    total_energy_consumed = tracker.final_emissions_data.energy_consumed
    
    # Calculate average emissions and energy consumption per inference
    average_emissions_per_inference = total_emissions / test_rounds
    average_energy_per_inference = total_energy_consumed / test_rounds
    
    return average_emissions_per_inference, average_energy_per_inference
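
# Hedged sketch: per-inference footprint. codecarbon reports kg CO2e and kWh,
# so multiplying by 1e3 gives g CO2e and Wh, matching the printout below.
def _example_emissions():
    import torchvision.models as models
    kg, kwh = evaluate_emissions(models.resnet18(weights=None),
                                 torch.randn(1, 3, 224, 224))
    print(f"{kg * 1e3:.6f} gCO2e, {kwh * 1e3:.6f} Wh per inference")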

# %% ../nbs/00_benchmark.ipynb 18
@torch.inference_mode()
def benchmark(model, dummy_input):
    # Model Size
    print('disk size')
    disk_size = get_model_size(model)

    # CPU Speed
    print('cpu speed')
    base_stats = evaluate_cpu_speed_raw(model, dummy_input)
    cpu_latency = base_stats["p50_ms"]
    cpu_std_latency = base_stats["std_ms"]
    cpu_throughput = base_stats["throughput_ips"]

    # Model MACs and parameters with fallbacks
    print('macs')
    macs_str = "0.000G"
    params_str = "0.000M"
    try:
        macs_val, params_val = profile(model, inputs=(dummy_input, ))
        macs_str, params_str = clever_format([macs_val, params_val], "%.3f")
    except Exception:
        try:
            macs_val = profile_macs(model, (dummy_input,))
            macs_str = f"{macs_val / 1e9:.3f}G"
        except Exception:
            macs_str = "0.000G"
        try:
            params_val = get_num_parameters(model)
            params_str = f"{params_val / 1e6:.3f}M"  # parameters in millions
        except Exception:
            params_str = "0.000M"

    print('emissions')
    # Emissions
    avg_emissions, avg_energy = evaluate_emissions(model, dummy_input)

    # Print results
    try:
        print(f"Model Size: {disk_size / 1e6:.2f} MB (disk), {params_str} parameters")
    except Exception:
        pass
    print(f"CPU Latency: {cpu_latency:.3f} ms (Β± {cpu_std_latency:.3f} ms)")
    print(f"CPU Throughput: {cpu_throughput:.2f} inferences/sec")
    print(f"Model MACs: {macs_str}")
    print(f"Average Carbon Emissions per Inference: {avg_emissions*1e3:.6f} gCO2e")
    print(f"Average Energy Consumption per Inference: {avg_energy*1e3:.6f} Wh")

    return {
        'disk_size': disk_size,
        'num_parameters': params_str,
        'cpu_latency': cpu_latency,
        'cpu_throughput': cpu_throughput,
        'macs': macs_str,
        'avg_emissions': avg_emissions,
        'avg_energy': avg_energy,
    }
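
# Hedged usage sketch (not called by the app): end-to-end benchmark of a
# torchvision model; prints disk size, latency, MACs and emissions estimates.
def _example_benchmark():
    import torchvision.models as models
    return benchmark(models.resnet18(weights=None),
                     torch.randn(1, 3, 224, 224))
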
def parse_metric_value(value_str):
    """Convert string values with units (M, G) to float"""
    if isinstance(value_str, (int, float)):
        return float(value_str)
    
    value_str = str(value_str)
    if 'G' in value_str:
        return float(value_str.replace('G', '')) * 1000  # Convert G to M
    elif 'M' in value_str:
        return float(value_str.replace('M', ''))  # Keep in M
    elif 'K' in value_str:
        return float(value_str.replace('K', '')) / 1000  # Convert K to M
    else:
        return float(value_str)
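
# Worked examples (hedged, not executed): parse_metric_value normalizes
# thop clever_format strings to a common unit of millions.
def _example_parse_metric_value():
    assert parse_metric_value("11.690M") == 11.69  # already in millions
    assert parse_metric_value("2.000G") == 2000.0  # giga -> millions
    assert parse_metric_value("500.000K") == 0.5   # kilo -> millions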

# Compression and visualization utilities (merged from Compressor)
class Quant:
    def __init__(self, backend="x86"):
        self.qconfig = get_default_qconfig_mapping(backend)
    
    def quantize(self, model):
        example_inputs = (torch.randn(1, 3, 224, 224),)
        model_prepared = prepare_fx(model.eval(), self.qconfig, example_inputs)
        return convert_fx(model_prepared)
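
# Hedged sketch (illustration only): prepare_fx inserts observers, so running
# a few calibration batches before convert_fx yields better quantization
# scales than converting immediately, as Quant.quantize above does.
def _example_quantize_with_calibration():
    import torchvision.models as models
    q = Quant()
    prepared = prepare_fx(models.resnet18(weights=None).eval(), q.qconfig,
                          (torch.randn(1, 3, 224, 224),))
    for _ in range(4):  # a handful of random calibration passes (arbitrary)
        prepared(torch.randn(1, 3, 224, 224))
    return convert_fx(prepared)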

def prune_model(input_model, sparsity, context, criteria):
    # Accept either a path or an nn.Module
    if isinstance(input_model, str):
        model = torch.load(input_model, weights_only=False, map_location='cpu')
    else:
        model = input_model
    model = model.eval()
    model = model.to('cpu')
    crit = eval(criteria)  # resolve a fasterai criteria by name, e.g. "large_final"
    sp = Sparsifier(model, 'filter', context, criteria=crit)
    sp.sparsify_model(sparsity)
    sp._clean_buffers()
    pr = Pruner(model, sparsity, context, criteria=crit)
    pr.prune_model()
    return pr.model

def quantize_model(model):
    qu = Quant()
    qu_model = qu.quantize(model)
    return qu_model

def optimize_model(model, sparsity, context, criteria):
    model = prune_model(model, sparsity, context, criteria)
    model = quantize_model(model)
    return model
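
# Hedged usage sketch: the full prune -> quantize pipeline on a torchvision
# model. "large_final" matches the criteria string used by the UI below, and
# sparsity is a percentage of filters to remove.
def _example_optimize():
    import torchvision.models as models
    return optimize_model(models.resnet18(weights=None),
                          sparsity=50, context="local", criteria="large_final")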


def create_size_comparison_plot(before_results, after_results, metrics):
    sns.set_style("darkgrid")
    # Taller figure so the title and bar labels fit without clipping
    fig = plt.figure(figsize=(12, 7), dpi=150)
    fig.patch.set_alpha(0.0)
    ax = plt.gca()
    ax.patch.set_alpha(0.0)
    bars = plt.bar(['Original', 'Compressed'], 
                   [before_results, after_results],
                   color=['#FF6B00', '#FF9F1C'],
                   alpha=0.8,
                   width=0.6)
    # Dynamic units per metric
    unit_label_map = {
        'Latency': 'Latency (ms)',
        'Size': 'Size (MB)',
        'MACs': 'MACs (GMAC)',
        'Energy': 'Energy (mWh)',
        'Emissions': 'Emissions (mgCO2e)'
    }
    def format_value(val, metric):
        try:
            fval = float(val)
        except Exception:
            fval = 0.0
        if metric == 'Latency':
            return f"{fval:.2f} ms"
        if metric == 'Size':
            return f"{fval:.2f} MB"
        if metric == 'MACs':
            return f"{fval:.3f} GMAC"
        if metric == 'Energy':
            return f"{fval:.3f} mWh"
        if metric == 'Emissions':
            return f"{fval:.3f} mgCO2e"
        return f"{fval:.3f}"
    # Annotate bars with values + units
    for bar in bars:
        height = bar.get_height()
        offset = (height * 0.02) if height else 0.05
        plt.text(bar.get_x() + bar.get_width()/2., height + offset,
                 format_value(height, metrics),
                 ha='center', va='bottom',
                 fontsize=15,
                 fontweight='bold',
                 color='white')
    compression_ratio = ((before_results - after_results) / before_results) * 100 if before_results else 0
    plt.title(f'Model Compression: {compression_ratio:.1f}%', 
              fontsize=18, 
              fontweight='bold', 
              pad=20,
              color='white')
    plt.xlabel('Model Version', fontsize=15, fontweight='bold', labelpad=10, color='white')
    plt.ylabel(unit_label_map.get(metrics, metrics), fontsize=15, fontweight='bold', labelpad=10, color='white')
    ax.grid(alpha=0.2, color='gray')
    sns.despine()
    # Use scientific notation for small Energy/Emissions values
    if metrics in ('Energy', 'Emissions'):
        ax.ticklabel_format(style='sci', axis='y', scilimits=(-2, 3))
    try:
        max_value = max(float(before_results), float(after_results))
    except Exception:
        max_value = float(before_results or after_results or 1)
    plt.ylim(0, max_value * 1.3)  # headroom above the tallest bar for its label
    plt.yticks(np.linspace(0, max_value * 1.3, 10))
    ax.tick_params(colors='white')
    ax.tick_params(axis='x', colors='white', labelsize=16)
    ax.tick_params(axis='y', colors='white', labelsize=15)
    for tick_label in ax.get_xticklabels():
        tick_label.set_fontweight('bold')
    for spine in ax.spines.values():
        spine.set_color('white')
    ax.xaxis.label.set_color('white')
    ax.yaxis.label.set_color('white')
    if metrics not in ('Energy', 'Emissions'):
        ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{x:.1f}'))
    plt.tight_layout(pad=3.5)  # generous padding so labels are not clipped
    return fig

def benchmark_interface(model_name, compression_level, metrics):
    import torchvision.models as models
    
    # Cache base models by name
    if model_name not in _MODEL_CACHE:
        model_mapping = {
            'ResNet18': models.resnet18(weights=None),
            'ResNet50': models.resnet50(weights=None),
            'MobileNetV2': models.mobilenet_v2(weights=None),
            'EfficientNet-B0': models.efficientnet_b0(weights=None),
            'VGG16': models.vgg16(weights=None),
        }
        base_model = model_mapping[model_name]
        warmup_model(base_model)
        _MODEL_CACHE[model_name] = base_model
    model = _MODEL_CACHE[model_name]
    dummy_input = torch.randn(1, 3, 224, 224)
    
    # Benchmark before (convert to readable units for plotting)
    if metrics == 'Latency':
        base_stats = evaluate_cpu_speed_raw(model, dummy_input)
        before_results = base_stats["p50_ms"]
    elif metrics == 'Size':
        before_results = get_model_size(model) / 1e6  # MB
    elif metrics == 'MACs':
        before_results = get_model_macs(model, dummy_input) / 1e9  # GMAC
    elif metrics == 'Energy':
        _, energy_kwh = evaluate_emissions(model, dummy_input)
        before_results = energy_kwh * 1e6  # mWh
    elif metrics == 'Emissions':
        emissions_kg, _ = evaluate_emissions(model, dummy_input)
        before_results = emissions_kg * 1e6  # mgCO2e
    else:
        raise ValueError(f"Invalid metric: {metrics}")
    
    # Build or reuse compressed model for the selected compression level
    cache_key = (model_name, compression_level)
    if cache_key not in _COMPRESSED_CACHE:
        sparsity = compression_values[compression_level]
        model_for_pruning = copy.deepcopy(model)
        comp_model = prune_model(model_for_pruning, sparsity, "local", "large_final")
        _COMPRESSED_CACHE[cache_key] = comp_model
    else:
        comp_model = _COMPRESSED_CACHE[cache_key]

    # Compute pre-quantization MACs if requested (more robust for tracing)
    if metrics == 'MACs':
        after_results = get_model_macs(comp_model, dummy_input) / 1e9  # GMAC

    # Quantize lazily and cache the quantized variant too
    q_cache_key = (model_name, compression_level, 'quant')
    if q_cache_key not in _COMPRESSED_CACHE:
        q_model = quantize_model(comp_model)
        q_model.eval()
        _COMPRESSED_CACHE[q_cache_key] = q_model
    else:
        q_model = _COMPRESSED_CACHE[q_cache_key]

    if metrics == 'Latency':
        base_stats = evaluate_cpu_speed_raw(q_model, dummy_input)
        after_results = base_stats["p50_ms"]
    elif metrics == 'Size':
        after_results = get_model_size(q_model) / 1e6  # MB
    elif metrics == 'MACs':
        # already computed above (pre-quantization for better compatibility)
        pass
    elif metrics == 'Energy':
        _, energy_kwh_after = evaluate_emissions(q_model, dummy_input)
        after_results = energy_kwh_after * 1e6  # mWh
    elif metrics == 'Emissions':
        emissions_kg_after, _ = evaluate_emissions(q_model, dummy_input)
        after_results = emissions_kg_after * 1e6  # mgCO2e
    else:
        raise ValueError(f"Invalid metric: {metrics}")

    # Build the before/after comparison plot
    size_plot = create_size_comparison_plot(before_results, after_results, metrics)
    return size_plot


available_models = [
    'ResNet18', 
    'ResNet50', 
    'MobileNetV2', 
    'EfficientNet-B0', 
    'VGG16'
]

compression_values = {
    'Mild 🐌': 25,
    'Balanced 🐒': 50, 
    'Aggressive πŸ‡': 75,
    'Extreme 🐎': 90
}


metrics = [
    'Latency',
    'Size',
    'MACs',
    'Energy',
    'Emissions',
]

iface = gr.Interface(
    fn=benchmark_interface,
    inputs=[            
        gr.Dropdown(choices=available_models, label="Select Model", value='ResNet18'),
        gr.Radio(choices=list(compression_values.keys()), label="Compression Level", value='Balanced 🐒'),
        gr.Radio(choices=metrics, label="Comparison Metric", value='Latency'),
    ],
    outputs=[
        gr.Plot(label="Comparison")
    ],
)

iface.launch()