Spaces:

MogensR
/

VideoBackgroundReplacer2

Paused

App Files Community

MogensR committed on Sep 13, 2025

Commit

2cd2385

1 Parent(s): 6a7c643

environmental fixes

Browse files

Files changed (1) hide show

perf_tuning.py +66 -105

perf_tuning.py CHANGED Viewed

@@ -1,119 +1,80 @@
 # perf_tuning.py
 """
-Perf tuning bootstrap for BackgroundFX Pro
-- Enforce CUDA usage (fail fast if missing when REQUIRE_CUDA=1)
-- Turn on TF32 + cuDNN benchmark + high-precision matmul
-- Allow using most of the GPU memory (CUDA_MEMORY_FRACTION)
-- Set sane OpenCV threading
-- Default SAM2/MATANY devices to 'cuda' when available
-Import this module BEFORE loading any models:
-    import perf_tuning  # must be near the top of app.py and pipeline.py
 """
-from __future__ import annotations
 import os
 import logging
-# OpenCV tuning (thread count); imported early, safe if cv2 present later
-try:
-    import cv2  # type: ignore
-except Exception:  # pragma: no cover
-    cv2 = None
-logger = logging.getLogger("backgroundfx_pro")
-if not logger.handlers:
-    _h = logging.StreamHandler()
-    _h.setFormatter(logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s"))
-    logger.addHandler(_h)
-logger.setLevel(logging.INFO)
-def _env_bool(key: str, default: bool = False) -> bool:
-    v = os.environ.get(key)
-    if v is None:
-        return default
-    return str(v).strip().lower() in {"1", "true", "yes", "on"}
-def configure() -> None:
-    # --- OpenCV threads (CPU-side decode/encode/compositing) ---
     try:
-        if cv2 is not None:
-            n_threads = int(os.environ.get("OPENCV_NUM_THREADS", str(min(8, (os.cpu_count() or 4)))))
-            cv2.setNumThreads(max(1, n_threads))
-    except Exception as e:
-        logger.warning(f"OpenCV threading setup skipped: {e}")
-    # --- PyTorch / CUDA ---
     try:
-        import torch  # type: ignore
-        require_cuda = _env_bool("REQUIRE_CUDA", True)  # default: require GPU since you’re paying for it
-        device_index = int(os.environ.get("FORCE_CUDA_DEVICE", "0"))
-        has_cuda = torch.cuda.is_available()
-        if require_cuda and not has_cuda:
-            raise RuntimeError(
-                "CUDA GPU not available but REQUIRE_CUDA=1. "
-                "Check Space hardware settings (GPU) and that the NVIDIA runtime is active."
-            )
-        if has_cuda:
-            try:
-                torch.cuda.set_device(device_index)
-            except Exception as e:
-                logger.warning(f"Could not set CUDA device {device_index}: {e}")
-            # Inference-only flags
-            try:
-                torch.set_grad_enabled(False)
-            except Exception:
-                pass
-            # cuDNN / matmul performance + TF32 (fast on Ampere/T4/A100)
-            try:
-                if hasattr(torch.backends, "cudnn"):
-                    torch.backends.cudnn.benchmark = True
-                    torch.backends.cudnn.allow_tf32 = True
-                if hasattr(torch.backends, "cuda") and hasattr(torch.backends.cuda, "matmul"):
-                    torch.backends.cuda.matmul.allow_tf32 = True
-                # PyTorch 2.x matmul precision hint
-                try:
-                    torch.set_float32_matmul_precision("high")
-                except Exception:
-                    pass
-            except Exception as e:
-                logger.warning(f"Matmul/cudnn tuning skipped: {e}")
-            # Allow using most of VRAM (won’t exceed card capacity, but avoids PyTorch self-throttling)
-            try:
-                frac = float(os.environ.get("CUDA_MEMORY_FRACTION", "0.98"))
-                torch.cuda.set_per_process_memory_fraction(min(max(frac, 0.1), 1.0), device=device_index)
-            except Exception:
-                pass
-            # Log device summary
-            try:
-                props = torch.cuda.get_device_properties(device_index)
-                logger.info(
-                    f"Using CUDA device {device_index}: {props.name} | "
-                    f"VRAM {props.total_memory/1e9:.2f} GB | TF32:ON | cuDNN benchmark:ON"
-                )
-            except Exception:
-                logger.info("Using CUDA (device summary not available).")
-        else:
-            logger.info("Running on CPU (REQUIRE_CUDA=0).")
-        # Default the model device envs so your pipeline picks CUDA
-        os.environ.setdefault("SAM2_DEVICE", "cuda" if has_cuda else "cpu")
-        os.environ.setdefault("MATANY_DEVICE", "cuda" if has_cuda else "cpu")
-    except Exception as e:
-        # If torch import fails entirely, we cannot enforce CUDA.
-        raise RuntimeError(f"PyTorch/CUDA setup failed: {e}") from e
-# Run configuration at import time
-configure()

 # perf_tuning.py
+#!/usr/bin/env python3
 """
+Forces CUDA use (or fails fast), configures cuDNN/TF32, and logs a clear GPU banner.
+Loaded automatically because pipeline.py does: `import perf_tuning` (best-effort).
 """
 import os
 import logging
+log = logging.getLogger("backgroundfx_pro")
+if not log.handlers:
+    h = logging.StreamHandler()
+    h.setFormatter(logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s"))
+    log.addHandler(h)
+log.setLevel(logging.INFO)
+try:
+    import torch
+except Exception as e:
+    raise RuntimeError(f"PyTorch not importable: {e}")
+require_cuda = os.environ.get("REQUIRE_CUDA", "0").strip() == "1"
+force_idx_env = os.environ.get("FORCE_CUDA_DEVICE", "").strip()
+mem_frac = float(os.environ.get("CUDA_MEMORY_FRACTION", "0.98"))
+if not torch.cuda.is_available():
+    if require_cuda:
+        raise RuntimeError("CUDA is NOT available, but REQUIRE_CUDA=1. "
+                           "Make sure the Space is on GPU and the container runs with --gpus all.")
+    else:
+        log.warning("CUDA not available; running on CPU. Set REQUIRE_CUDA=1 to fail fast.")
+else:
+    # Choose device
+    try:
+        idx = int(force_idx_env) if force_idx_env != "" else 0
+    except Exception:
+        idx = 0
+    if idx >= torch.cuda.device_count() or idx < 0:
+        idx = 0
+    torch.cuda.set_device(idx)
+    # Perf knobs
     try:
+        torch.backends.cuda.matmul.allow_tf32 = True
+    except Exception:
+        pass
     try:
+        torch.backends.cudnn.allow_tf32 = True
+        torch.backends.cudnn.benchmark = True
+    except Exception:
+        pass
+    # Reserve VRAM fraction (best effort)
+    try:
+        torch.cuda.set_per_process_memory_fraction(mem_frac, idx)
+    except Exception:
+        pass
+    # Log a clear banner
+    try:
+        name = torch.cuda.get_device_name(idx)
+        cap = torch.cuda.get_device_capability(idx)
+        total_gb = torch.cuda.get_device_properties(idx).total_memory / (1024**3)
+        free_gb = torch.cuda.mem_get_info()[0] / (1024**3)
+        log.info(f"Using CUDA device {idx}: {name} | cc {cap[0]}.{cap[1]} | "
+                 f"VRAM {total_gb:.2f} GB (free ~{free_gb:.2f} GB) | TF32:ON | cuDNN benchmark:ON")
+    except Exception:
+        log.info("Using CUDA; device info unavailable (but cuda.is_available()==True).")
+# Optional: limit OpenCV threads if provided
+threads = os.environ.get("OPENCV_NUM_THREADS")
+if threads:
+    try:
+        import cv2
+        cv2.setNumThreads(int(threads))
+        log.info(f"OpenCV threads set to: {threads}")
+    except Exception:
+        pass