Spaces:
Paused
Paused
| """Roofline model analysis for detection and segmentation pipelines. | |
| Computes theoretical maximum throughput, identifies bottlenecks, and | |
| provides actionable recommendations based on hardware specs and | |
| profiling measurements. | |
| """ | |
| import logging | |
| from dataclasses import dataclass, field | |
| from typing import Dict, List, Optional | |
| logger = logging.getLogger(__name__) | |
| # Approximate GFLOPs per forward pass at reference resolution (640x480 for YOLO, 800x800 for DETR) | |
| # These are rough estimates; actual FLOPs depend on input resolution and model variant. | |
| _MODEL_FLOPS: Dict[str, float] = { | |
| # Detection models (GFLOPs per frame) | |
| "yolo11": 78.9, # YOLO11m ~79 GFLOPs at 640px | |
| "detr_resnet50": 86.0, # DETR-R50 ~86 GFLOPs at 800px | |
| "grounding_dino": 172.0, # Grounding DINO-B ~172 GFLOPs | |
| "drone_yolo": 78.9, # Same arch as YOLO11m-class model | |
| # Segmentation models (GFLOPs per keyframe) | |
| "GSAM2-S": 48.0, # SAM2 small encoder | |
| "GSAM2-B": 96.0, # SAM2 base encoder | |
| "GSAM2-L": 200.0, # SAM2 large encoder | |
| # YSAM2 uses the same SAM2 backbone; detector differences are reflected in timing. | |
| "YSAM2-S": 48.0, | |
| "YSAM2-B": 96.0, | |
| "YSAM2-L": 200.0, | |
| "gsam2_tiny": 24.0, # SAM2 tiny encoder | |
| } | |
| # Approximate bytes moved per forward pass (weights + activations + I/O) | |
| _MODEL_BYTES: Dict[str, float] = { | |
| # In MB — approximate weight size + activation memory | |
| "yolo11": 52.0, | |
| "detr_resnet50": 166.0, | |
| "grounding_dino": 340.0, | |
| "drone_yolo": 52.0, | |
| "GSAM2-S": 92.0, | |
| "GSAM2-B": 180.0, | |
| "GSAM2-L": 400.0, | |
| "YSAM2-S": 92.0, | |
| "YSAM2-B": 180.0, | |
| "YSAM2-L": 400.0, | |
| "gsam2_tiny": 46.0, | |
| } | |
@dataclass
class BottleneckBreakdown:
    """Per-phase bottleneck identification.

    One record per pipeline phase. ``compute_roofline`` constructs these with
    keyword arguments (``BottleneckBreakdown(phase=..., time_ms=..., ...)``),
    which requires the ``@dataclass`` decorator — without it the annotated
    class attributes produce no ``__init__`` and construction raises TypeError.
    """

    # Phase name: "decode", "preprocess", "transfer", "gpu_kernel", "postprocess"
    phase: str = ""
    # Mean measured time for this phase, in milliseconds.
    time_ms: float = 0.0
    # Fraction of total pipeline time (0.0-1.0).
    fraction: float = 0.0
    # True for the single slowest phase, set after all phases are collected.
    is_bottleneck: bool = False
@dataclass
class RooflineResult:
    """Complete roofline analysis output.

    Populated field-by-field by ``compute_roofline``. The ``@dataclass``
    decorator is required: without it, ``field(default_factory=list)`` leaves a
    shared ``dataclasses.Field`` object on the class (so ``phase_breakdown``
    and ``recommendations`` would not be per-instance lists and ``.append``
    would fail).
    """

    # --- Hardware ceilings ---
    peak_fp32_tflops: float = 0.0
    peak_fp16_tflops: float = 0.0
    peak_memory_bandwidth_gbps: float = 0.0
    ridge_point_flop_per_byte: float = 0.0  # = peak_tflops / peak_bw

    # --- Workload characteristics ---
    model_name: str = ""
    model_gflops: float = 0.0
    model_bytes_mb: float = 0.0
    operational_intensity: float = 0.0  # FLOPs / bytes_moved

    # --- Achieved performance ---
    achieved_tflops: float = 0.0
    achieved_bandwidth_gbps: float = 0.0

    # --- Bottleneck analysis ---
    # One of the values assigned in compute_roofline: "decode-bound",
    # "preprocess-bound", "transfer-bound", "memory-bound", "compute-bound",
    # "postprocess-bound", or "unknown". (The previous comment listed
    # un-suffixed names that the code never assigns.)
    primary_bottleneck: str = ""
    bottleneck_explanation: str = ""
    # String forward reference keeps this class importable in isolation;
    # BottleneckBreakdown is defined earlier in this module.
    phase_breakdown: List["BottleneckBreakdown"] = field(default_factory=list)

    # --- Throughput ---
    theoretical_max_fps: float = 0.0
    observed_fps: float = 0.0
    utilization_pct: float = 0.0

    # --- GPU memory ---
    gpu_peak_memory_mb: float = 0.0
    gpu_vram_total_mb: float = 0.0
    memory_utilization_pct: float = 0.0

    # --- Recommendations ---
    recommendations: List[str] = field(default_factory=list)

    # GSAM2-specific metrics (populated for segmentation mode only)
    gsam2_metrics: Optional[Dict] = None
def compute_roofline(hardware, profiling) -> RooflineResult:
    """Compute roofline analysis from hardware info and profiling results.

    Args:
        hardware: HardwareInfo dataclass from hardware_info.py
        profiling: ProfilingResult dataclass from profiler.py

    Returns:
        RooflineResult with theoretical ceilings, achieved performance,
        bottleneck identification, and recommendations.
    """
    result = RooflineResult()
    result.model_name = profiling.detector_name

    _fill_hardware_ceilings(result, hardware)
    _fill_workload(result, profiling)
    _fill_achieved_performance(result, profiling)
    max_phase_name, max_phase_ms = _fill_phase_breakdown(result, profiling)
    _classify_bottleneck(result, max_phase_name, max_phase_ms)

    # --- Throughput ---
    # Theoretical max FPS = 1000 / max(phase_times)
    if max_phase_ms > 0:
        result.theoretical_max_fps = round(1000 / max_phase_ms, 2)
    result.observed_fps = round(profiling.avg_fps, 2)
    if result.theoretical_max_fps > 0:
        result.utilization_pct = round(result.observed_fps / result.theoretical_max_fps * 100, 1)

    # --- GPU memory ---
    result.gpu_peak_memory_mb = profiling.gpu_peak_memory_mb
    if result.gpu_vram_total_mb > 0:
        result.memory_utilization_pct = round(
            result.gpu_peak_memory_mb / result.gpu_vram_total_mb * 100, 1
        )

    # --- GSAM2 metrics (present on the profiling object only in segmentation mode) ---
    gsam2_metrics = getattr(profiling, "_gsam2_metrics", None)
    if gsam2_metrics:
        result.gsam2_metrics = gsam2_metrics

    result.recommendations = _build_recommendations(result, profiling)
    return result


def _fill_hardware_ceilings(result: RooflineResult, hardware) -> None:
    """Populate peak compute/bandwidth ceilings and the ridge point (first GPU only)."""
    if hardware.gpus:
        gpu = hardware.gpus[0]
        # `or 0.0` guards against None from hardware probes that failed.
        result.peak_fp32_tflops = gpu.fp32_tflops or 0.0
        result.peak_fp16_tflops = gpu.fp16_tflops or 0.0
        result.peak_memory_bandwidth_gbps = gpu.memory_bandwidth_gbps or 0.0
        if gpu.vram_total_gb:
            result.gpu_vram_total_mb = gpu.vram_total_gb * 1024
    else:
        logger.warning("No GPU info available; roofline will have zero ceilings")

    # Ridge point: operational intensity where the compute and memory roofs intersect.
    if result.peak_memory_bandwidth_gbps > 0:
        # peak_tflops / peak_bw (TB/s) = FLOPs/byte
        peak_tbps = result.peak_memory_bandwidth_gbps / 1000  # GB/s -> TB/s
        if peak_tbps > 0:
            result.ridge_point_flop_per_byte = result.peak_fp32_tflops / peak_tbps


def _fill_workload(result: RooflineResult, profiling) -> None:
    """Populate model FLOP/byte estimates and operational intensity from lookup tables."""
    model_key = profiling.detector_name
    # Unknown model names fall back to 0.0, which disables intensity-based analysis.
    result.model_gflops = _MODEL_FLOPS.get(model_key, 0.0)
    result.model_bytes_mb = _MODEL_BYTES.get(model_key, 0.0)
    if result.model_bytes_mb > 0:
        # Operational intensity = FLOPs / bytes_moved
        bytes_moved = result.model_bytes_mb * 1e6  # MB -> bytes
        flops = result.model_gflops * 1e9  # GFLOPs -> FLOPs
        result.operational_intensity = flops / bytes_moved if bytes_moved > 0 else 0


def _fill_achieved_performance(result: RooflineResult, profiling) -> None:
    """Derive achieved TFLOPS and bandwidth from the measured GPU kernel time."""
    gpu_kernel_ms = profiling.gpu_kernel_stats.mean_ms if profiling.gpu_kernel_stats.count > 0 else 0
    if gpu_kernel_ms > 0 and result.model_gflops > 0:
        # Achieved TFLOPS = GFLOPs / kernel_time_s / 1000
        kernel_time_s = gpu_kernel_ms / 1000
        result.achieved_tflops = round(result.model_gflops / kernel_time_s / 1000, 4)
    if gpu_kernel_ms > 0 and result.model_bytes_mb > 0:
        # Achieved GB/s = MB / kernel_time_s / 1000
        kernel_time_s = gpu_kernel_ms / 1000
        result.achieved_bandwidth_gbps = round(result.model_bytes_mb / kernel_time_s / 1000, 2)


def _fill_phase_breakdown(result: RooflineResult, profiling):
    """Build the per-phase breakdown, mark the slowest phase.

    Returns:
        Tuple of (slowest phase name, slowest phase mean ms); ("", 0) when
        every phase time is zero.
    """
    phases = [
        ("decode", profiling.decode_stats.mean_ms),
        ("preprocess", profiling.preprocess_stats.mean_ms),
    ]
    # Only include transfer if we have valid measurements (negative mean is
    # presumably a sentinel for "not measured" — TODO confirm against profiler.py).
    if profiling.transfer_stats.count > 0 and profiling.transfer_stats.mean_ms >= 0:
        phases.append(("transfer", profiling.transfer_stats.mean_ms))
    phases.extend([
        ("gpu_kernel", profiling.gpu_kernel_stats.mean_ms),
        ("postprocess", profiling.postprocess_stats.mean_ms),
    ])

    total_phase_ms = sum(ms for _, ms in phases)
    max_phase_name = ""
    max_phase_ms = 0
    for name, ms in phases:
        bb = BottleneckBreakdown(
            phase=name,
            time_ms=round(ms, 3),
            fraction=round(ms / total_phase_ms, 4) if total_phase_ms > 0 else 0,
        )
        if ms > max_phase_ms:
            max_phase_ms = ms
            max_phase_name = name
        result.phase_breakdown.append(bb)

    # Mark the single slowest phase as the bottleneck.
    for bb in result.phase_breakdown:
        if bb.phase == max_phase_name:
            bb.is_bottleneck = True
    return max_phase_name, max_phase_ms


def _classify_bottleneck(result: RooflineResult, max_phase_name: str, max_phase_ms: float) -> None:
    """Set primary_bottleneck and its human-readable explanation from the slowest phase."""
    if max_phase_name == "decode":
        result.primary_bottleneck = "decode-bound"
        result.bottleneck_explanation = (
            f"Video decoding ({max_phase_ms:.1f}ms) is the slowest phase. "
            "GPU is waiting for frames. Consider hardware-accelerated decoding (NVDEC) "
            "or reducing input resolution."
        )
    elif max_phase_name == "transfer":
        result.primary_bottleneck = "transfer-bound"
        result.bottleneck_explanation = (
            f"CPU->GPU data transfer ({max_phase_ms:.1f}ms) is the slowest phase. "
            "Consider using pinned memory, reducing input tensor size, or "
            "overlapping transfer with computation."
        )
    elif max_phase_name == "gpu_kernel":
        # Sub-classify via the roofline model: memory-bound vs compute-bound,
        # depending on which side of the ridge point the workload sits.
        if result.operational_intensity > 0 and result.ridge_point_flop_per_byte > 0:
            if result.operational_intensity < result.ridge_point_flop_per_byte:
                result.primary_bottleneck = "memory-bound"
                result.bottleneck_explanation = (
                    f"GPU kernel ({max_phase_ms:.1f}ms) is memory-bandwidth limited. "
                    f"Operational intensity ({result.operational_intensity:.1f} FLOP/byte) "
                    f"is below the ridge point ({result.ridge_point_flop_per_byte:.1f} FLOP/byte). "
                    "Consider model quantization (FP16/INT8), reducing batch size, "
                    "or using a more compute-dense model."
                )
            else:
                result.primary_bottleneck = "compute-bound"
                # The utilization f-string is only evaluated when the peak is
                # nonzero (conditional expression), so the division is safe.
                result.bottleneck_explanation = (
                    f"GPU kernel ({max_phase_ms:.1f}ms) is compute-limited. "
                    f"Achieved {result.achieved_tflops:.2f} TFLOPS out of "
                    f"{result.peak_fp32_tflops:.2f} TFLOPS peak "
                    f"({result.achieved_tflops / result.peak_fp32_tflops * 100:.1f}% utilization). "
                    "Consider FP16 inference, TensorRT optimization, or a smaller model."
                    if result.peak_fp32_tflops > 0
                    else "Consider a faster GPU or a smaller model."
                )
        else:
            # No intensity/ridge data available — fall back to a generic label.
            result.primary_bottleneck = "compute-bound"
            result.bottleneck_explanation = (
                f"GPU kernel ({max_phase_ms:.1f}ms) dominates pipeline time."
            )
    elif max_phase_name == "preprocess":
        result.primary_bottleneck = "preprocess-bound"
        result.bottleneck_explanation = (
            f"CPU preprocessing ({max_phase_ms:.1f}ms) is the slowest phase. "
            "Consider GPU-accelerated preprocessing or reducing input resolution."
        )
    elif max_phase_name == "postprocess":
        result.primary_bottleneck = "postprocess-bound"
        result.bottleneck_explanation = (
            f"CPU post-processing/NMS ({max_phase_ms:.1f}ms) is the slowest phase. "
            "Consider batched NMS on GPU or raising the confidence threshold."
        )
    else:
        result.primary_bottleneck = "unknown"
        result.bottleneck_explanation = "Unable to determine primary bottleneck."


def _build_recommendations(result: RooflineResult, profiling) -> List[str]:
    """Collect actionable recommendations from the bottleneck class and utilization."""
    recs = []
    # Bottleneck-specific recommendations
    if result.primary_bottleneck == "decode-bound":
        recs.append("Use NVIDIA NVDEC for hardware-accelerated video decoding")
        recs.append("Reduce input video resolution before processing")
    elif result.primary_bottleneck == "transfer-bound":
        recs.append("Use torch.cuda pinned memory for faster CPU->GPU transfers")
        recs.append("Pre-allocate GPU tensors and reuse across frames")
    elif result.primary_bottleneck == "memory-bound":
        recs.append("Enable FP16 (half-precision) inference to reduce memory bandwidth pressure")
        recs.append("Consider INT8 quantization via TensorRT for further speedup")
    elif result.primary_bottleneck == "compute-bound":
        recs.append("Enable FP16 inference (2x theoretical throughput on Volta+ GPUs)")
        recs.append("Consider TensorRT or torch.compile() for kernel fusion")
        if result.peak_fp32_tflops > 0 and result.achieved_tflops / result.peak_fp32_tflops < 0.3:
            recs.append("Low GPU utilization — consider increasing batch size or using a multi-stream pipeline")

    # General recommendations
    if result.memory_utilization_pct > 80:
        recs.append(f"GPU memory utilization is high ({result.memory_utilization_pct:.0f}%); "
                    "reduce batch size or use gradient checkpointing to avoid OOM")
    elif result.memory_utilization_pct > 0 and result.memory_utilization_pct < 30:
        recs.append(f"GPU memory utilization is low ({result.memory_utilization_pct:.0f}%); "
                    "consider processing multiple streams or increasing batch size")
    if profiling.mode == "detection" and profiling.avg_fps < profiling.video_fps:
        recs.append(
            f"Processing speed ({profiling.avg_fps:.1f} FPS) is below video frame rate "
            f"({profiling.video_fps:.1f} FPS); consider frame skipping or a faster model"
        )
    return recs