MogensR committed on
Commit
2cd2385
·
1 Parent(s): 6a7c643

environmental fixes

Browse files
Files changed (1) hide show
  1. perf_tuning.py +66 -105
perf_tuning.py CHANGED
@@ -1,119 +1,80 @@
1
  # perf_tuning.py
 
2
  """
3
- Perf tuning bootstrap for BackgroundFX Pro
4
- - Enforce CUDA usage (fail fast if missing when REQUIRE_CUDA=1)
5
- - Turn on TF32 + cuDNN benchmark + high-precision matmul
6
- - Allow using most of the GPU memory (CUDA_MEMORY_FRACTION)
7
- - Set sane OpenCV threading
8
- - Default SAM2/MATANY devices to 'cuda' when available
9
-
10
- Import this module BEFORE loading any models:
11
- import perf_tuning # must be near the top of app.py and pipeline.py
12
  """
13
 
14
- from __future__ import annotations
15
-
16
  import os
17
  import logging
18
 
19
# OpenCV is imported early (thread-count tuning happens in configure());
# it is optional here — absence is tolerated until actually needed.
try:
    import cv2  # type: ignore
except Exception:  # pragma: no cover
    cv2 = None

# Module logger: attach a console handler only on first import so repeated
# imports do not duplicate output.
logger = logging.getLogger("backgroundfx_pro")
if not logger.handlers:
    _handler = logging.StreamHandler()
    _handler.setFormatter(logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s"))
    logger.addHandler(_handler)
    logger.setLevel(logging.INFO)
31
-
32
 
33
- def _env_bool(key: str, default: bool = False) -> bool:
34
- v = os.environ.get(key)
35
- if v is None:
36
- return default
37
- return str(v).strip().lower() in {"1", "true", "yes", "on"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
 
39
 
40
def configure() -> None:
    """Apply process-wide performance settings for OpenCV and PyTorch/CUDA.

    Reads env vars OPENCV_NUM_THREADS, REQUIRE_CUDA, FORCE_CUDA_DEVICE and
    CUDA_MEMORY_FRACTION; also seeds SAM2_DEVICE / MATANY_DEVICE defaults.

    Raises:
        RuntimeError: if torch cannot be imported, or if CUDA is unavailable
            while REQUIRE_CUDA is truthy (the inner RuntimeError is caught by
            the outer handler and re-raised wrapped).
    """
    # --- OpenCV threads (CPU-side decode/encode/compositing) ---
    try:
        if cv2 is not None:
            # Default: up to 8 threads, bounded by the visible CPU count.
            n_threads = int(os.environ.get("OPENCV_NUM_THREADS", str(min(8, (os.cpu_count() or 4)))))
            cv2.setNumThreads(max(1, n_threads))
    except Exception as e:
        logger.warning(f"OpenCV threading setup skipped: {e}")

    # --- PyTorch / CUDA ---
    try:
        import torch  # type: ignore

        require_cuda = _env_bool("REQUIRE_CUDA", True)  # default: require GPU since you’re paying for it
        device_index = int(os.environ.get("FORCE_CUDA_DEVICE", "0"))

        has_cuda = torch.cuda.is_available()
        if require_cuda and not has_cuda:
            # Fail fast; caught below and re-raised as a wrapped RuntimeError.
            raise RuntimeError(
                "CUDA GPU not available but REQUIRE_CUDA=1. "
                "Check Space hardware settings (GPU) and that the NVIDIA runtime is active."
            )

        if has_cuda:
            try:
                torch.cuda.set_device(device_index)
            except Exception as e:
                logger.warning(f"Could not set CUDA device {device_index}: {e}")

            # Inference-only flags — autograd is disabled globally.
            try:
                torch.set_grad_enabled(False)
            except Exception:
                pass

            # cuDNN / matmul performance + TF32 (fast on Ampere/T4/A100)
            try:
                if hasattr(torch.backends, "cudnn"):
                    torch.backends.cudnn.benchmark = True
                    torch.backends.cudnn.allow_tf32 = True
                if hasattr(torch.backends, "cuda") and hasattr(torch.backends.cuda, "matmul"):
                    torch.backends.cuda.matmul.allow_tf32 = True
                # PyTorch 2.x matmul precision hint
                try:
                    torch.set_float32_matmul_precision("high")
                except Exception:
                    pass
            except Exception as e:
                logger.warning(f"Matmul/cudnn tuning skipped: {e}")

            # Allow using most of VRAM (won’t exceed card capacity, but
            # avoids PyTorch self-throttling); fraction clamped to [0.1, 1.0].
            try:
                frac = float(os.environ.get("CUDA_MEMORY_FRACTION", "0.98"))
                torch.cuda.set_per_process_memory_fraction(min(max(frac, 0.1), 1.0), device=device_index)
            except Exception:
                pass

            # Log device summary (best effort — props query can fail).
            try:
                props = torch.cuda.get_device_properties(device_index)
                logger.info(
                    f"Using CUDA device {device_index}: {props.name} | "
                    f"VRAM {props.total_memory/1e9:.2f} GB | TF32:ON | cuDNN benchmark:ON"
                )
            except Exception:
                logger.info("Using CUDA (device summary not available).")
        else:
            logger.info("Running on CPU (REQUIRE_CUDA=0).")

        # Default the model device envs so your pipeline picks CUDA
        # (setdefault: explicit user-provided values are respected).
        os.environ.setdefault("SAM2_DEVICE", "cuda" if has_cuda else "cpu")
        os.environ.setdefault("MATANY_DEVICE", "cuda" if has_cuda else "cpu")

    except Exception as e:
        # If torch import fails entirely, we cannot enforce CUDA.
        raise RuntimeError(f"PyTorch/CUDA setup failed: {e}") from e
116
 
 
 
 
 
 
117
 
118
# Run configuration at import time so that models loaded afterwards see the
# tuned CUDA/OpenCV settings (raises if REQUIRE_CUDA is set and no GPU exists).
configure()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python3
# perf_tuning.py
3
  """
4
+ Forces CUDA use (or fails fast), configures cuDNN/TF32, and logs a clear GPU banner.
5
+ Loaded automatically because pipeline.py does: `import perf_tuning` (best-effort).
 
 
 
 
 
 
 
6
  """
7
 
 
 
8
  import os
9
  import logging
10
 
11
# Shared module logger; a console handler is attached only on first import
# so repeated imports do not duplicate log lines.
log = logging.getLogger("backgroundfx_pro")
if not log.handlers:
    _stream = logging.StreamHandler()
    _stream.setFormatter(logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s"))
    log.addHandler(_stream)
    log.setLevel(logging.INFO)
 
 
 
 
 
 
 
17
 
18
# --- PyTorch import (hard requirement: this module is pointless without it) ---
try:
    import torch
except Exception as e:
    # Chain the original error so the real import failure stays visible.
    raise RuntimeError(f"PyTorch not importable: {e}") from e

# Environment knobs (all optional):
#   REQUIRE_CUDA=1          -> raise instead of silently falling back to CPU
#   FORCE_CUDA_DEVICE=<idx> -> pick a specific GPU (defaults to 0)
#   CUDA_MEMORY_FRACTION    -> per-process VRAM cap, clamped to [0.1, 1.0]
require_cuda = os.environ.get("REQUIRE_CUDA", "0").strip() == "1"
force_idx_env = os.environ.get("FORCE_CUDA_DEVICE", "").strip()
try:
    mem_frac = float(os.environ.get("CUDA_MEMORY_FRACTION", "0.98"))
except ValueError:
    # A malformed value must not abort the whole import — fall back to default.
    log.warning("Invalid CUDA_MEMORY_FRACTION; falling back to 0.98")
    mem_frac = 0.98
# Clamp: out-of-range fractions make set_per_process_memory_fraction fail.
mem_frac = min(max(mem_frac, 0.1), 1.0)

if not torch.cuda.is_available():
    if require_cuda:
        raise RuntimeError("CUDA is NOT available, but REQUIRE_CUDA=1. "
                           "Make sure the Space is on GPU and the container runs with --gpus all.")
    else:
        log.warning("CUDA not available; running on CPU. Set REQUIRE_CUDA=1 to fail fast.")
else:
    # Choose device: honor FORCE_CUDA_DEVICE, fall back to 0 on any bad value.
    try:
        idx = int(force_idx_env) if force_idx_env != "" else 0
    except Exception:
        idx = 0
    if idx >= torch.cuda.device_count() or idx < 0:
        idx = 0

    torch.cuda.set_device(idx)

    # Perf knobs: TF32 + cuDNN autotune (best effort — older torch builds
    # may lack some of these attributes).
    try:
        torch.backends.cuda.matmul.allow_tf32 = True
    except Exception:
        pass
    try:
        torch.backends.cudnn.allow_tf32 = True
        torch.backends.cudnn.benchmark = True
    except Exception:
        pass

    # Reserve VRAM fraction (best effort)
    try:
        torch.cuda.set_per_process_memory_fraction(mem_frac, idx)
    except Exception:
        pass

    # Log a clear banner
    try:
        name = torch.cuda.get_device_name(idx)
        cap = torch.cuda.get_device_capability(idx)
        total_gb = torch.cuda.get_device_properties(idx).total_memory / (1024**3)
        free_gb = torch.cuda.mem_get_info()[0] / (1024**3)
        log.info(f"Using CUDA device {idx}: {name} | cc {cap[0]}.{cap[1]} | "
                 f"VRAM {total_gb:.2f} GB (free ~{free_gb:.2f} GB) | TF32:ON | cuDNN benchmark:ON")
    except Exception:
        log.info("Using CUDA; device info unavailable (but cuda.is_available()==True).")

# Optional: limit OpenCV threads if provided.
# Deliberately at top level so it also applies on CPU-only runs.
threads = os.environ.get("OPENCV_NUM_THREADS")
if threads:
    try:
        import cv2
        cv2.setNumThreads(int(threads))
        log.info(f"OpenCV threads set to: {threads}")
    except Exception:
        pass