v5 cold-start fix: eager CUDA warmup + concurrency=1 + drop dead timeout_seconds

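The ONNX Runtime (ORT) CUDA execution provider binds lazily on the first sess.run. As a result, the validator's first /predict request absorbed the cold-bind cost (30-300s in a TEE VM), and the scheduler reaped the instance before it could activate. Miner.__init__ now runs a no-op inference on a zero-filled input sized by self.input_size (previously a fixed 64x64 array), so on_startup blocks until the GPU is hot. This commit also lowers concurrency from 4 to 1 (1 is the default; our miner.py is not thread-safe), removes timeout_seconds: 900 (not a Chute() kwarg, so it was silently dropped), and replaces the NodeSelector exclude list with include: pro_6000.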

Files changed (2) hide show

chute_config.yml +3 -8
miner.py +9 -5

chute_config.yml CHANGED Viewed

@@ -9,17 +9,12 @@ NodeSelector:
   gpu_count: 1
   min_vram_gb_per_gpu: 16
   max_hourly_price_per_gpu: 2
-  exclude:
-    - "5090"
-    - b200
-    - h200
-    - h20
-    - mi300x
 Chute:
   tee: true
-  timeout_seconds: 900
   shutdown_after_seconds: 86400
-  concurrency: 4
   max_instances: 5
   scaling_threshold: 0.5

   gpu_count: 1
   min_vram_gb_per_gpu: 16
   max_hourly_price_per_gpu: 2
+  include:
+    - pro_6000
 Chute:
   tee: true
   shutdown_after_seconds: 86400
+  concurrency: 1
   max_instances: 5
   scaling_threshold: 0.5

miner.py CHANGED Viewed

@@ -62,13 +62,17 @@ class Miner:
         active = self.sess.get_providers()[0]
         print(f"✅ ONNX beverage model loaded (provider={active})")
-        # Warm CUDA kernels / ORT graph so the very first /predict isn't slow.
-        warm = np.zeros((64, 64, 3), dtype=np.uint8)
         try:
-            self._infer(warm)
-            print("✅ ONNX warmup pass done")
         except Exception as e:
-            print(f"⚠️ ONNX warmup pass failed: {e}")
     def __repr__(self) -> str:
         return f"BeverageONNX(in={self.input_size}, cls={self.num_classes})"

         active = self.sess.get_providers()[0]
         print(f"✅ ONNX beverage model loaded (provider={active})")
+        # Eager CUDA EP allocation: ORT lazily binds CUDA on first sess.run,
+        # so without this the validator's first /predict eats the cold-bind
+        # cost (30-300s in TEE-VM) and the scheduler reaps the instance
+        # before activation. Run a no-op inference here so on_startup only
+        # returns once GPU kernels/buffers are hot.
         try:
+            _dummy = np.zeros((self.input_size, self.input_size, 3), dtype=np.uint8)
+            _ = self._infer(_dummy)
+            print(f"✅ ONNX warmup pass completed (provider={active})")
         except Exception as e:
+            print(f"⚠️ ONNX warmup pass failed (not fatal): {e}")
     def __repr__(self) -> str:
         return f"BeverageONNX(in={self.input_size}, cls={self.num_classes})"