TTA optimization: cache point fusion, vary only priority sampling

The previous TTA commit (8e33a89) timed out because it called
build_compact_scene 3x per sample (the expensive multi-view label
voting step). Refactor:

- compute_scene(sample, cfg, rng): runs build_compact_scene plus the group/class
and center/scale computation. Called ONCE per sample.
- sample_from_scene(scene): does priority sampling + result-dict assembly.
Cheap, called K=3 times per sample.
- fuse_and_sample is preserved as a backward-compat wrapper.

Why this still gives TTA variation: _priority_sample uses the *global*
numpy random state (np.random.shuffle), not an explicit rng arg. Each
consecutive call advances the global state and produces a different
4096-point subset of the same fused scene. The model sees different
inputs across passes despite the scene being identical. The global state
is reseeded with TTA_BASE_SEED once per sample, before the K passes, so
the subsets remain reproducible from run to run.

Cost: ~10% extra wall time vs. a single pass (three cheap priority-sampling
passes and three cheap model forwards instead of one of each), compared
with ~200% extra in the previous commit. This should fit comfortably
within the 2h budget.

Files changed (1) hide show

script.py +59 -30

script.py CHANGED Viewed

@@ -60,11 +60,11 @@ MERGE_THRESH = 0.4
 SNAP_RADIUS = 0.5
-def fuse_and_sample(sample, cfg, rng):
-    """Run point fusion + priority sampling on a raw dataset sample.
-    Returns a dict with xyz_norm, class_id, source, mask, center, scale, etc.
-    ready for model inference. Returns None if fusion fails.
     """
     try:
         scene = build_compact_scene(sample, cfg, rng)
@@ -74,21 +74,43 @@ def fuse_and_sample(sample, cfg, rng):
     xyz = scene["xyz"]
     source = scene["source"]
     if len(xyz) < 10:
         return None
-    # Compute group_id and class_id (same as cache_scenes.py)
     behind_id = scene.get("behind_gest_id", np.full(len(xyz), -1, dtype=np.int16))
     group_id, class_id = _compute_group_and_class(
         scene["visible_src"], scene["visible_id"], behind_id, source)
-    # Normalize
     center, scale = _compute_smart_center_scale(xyz, source)
-    # Priority sample
-    indices, mask = _priority_sample(source, group_id, SEQ_LEN, COLMAP_QUOTA, DEPTH_QUOTA)
     xyz_norm = (xyz[indices] - center) / scale
     result = {
@@ -99,23 +121,26 @@ def fuse_and_sample(sample, cfg, rng):
         "center": center.astype(np.float32),
         "scale": np.float32(scale),
     }
-    # Optional fields
-    if "behind_gest_id" in scene:
         behind = np.clip(scene["behind_gest_id"][indices].astype(np.int16), 0, None)
         result["behind"] = behind.astype(np.int64)
-    if "n_views_voted" in scene:
         result["n_views_voted"] = scene["n_views_voted"][indices].astype(np.float32)
-    if "vote_frac" in scene:
         result["vote_frac"] = scene["vote_frac"][indices].astype(np.float32)
-    # Visible src/id for snap post-processing
     result["visible_src"] = scene["visible_src"][indices].astype(np.int64)
     result["visible_id"] = scene["visible_id"][indices].astype(np.int64)
     return result
 def load_model(checkpoint_path, device):
     """Load model from checkpoint."""
     ckpt = torch.load(checkpoint_path, map_location=device, weights_only=False)
@@ -455,19 +480,23 @@ if __name__ == "__main__":
             order_id = sample["order_id"]
             try:
-                # ---- TTA: run the learned pipeline K times, union outputs
                 tta_outputs = []
-                for k in range(TTA_PASSES):
-                    rng_k = np.random.RandomState(TTA_BASE_SEED + k * 1000)
-                    fused_k = fuse_and_sample(sample, cfg, rng_k)
-                    if fused_k is None:
-                        continue
-                    try:
-                        pv_k, pe_k = predict_sample(fused_k, model, device)
-                        if isinstance(pv_k, np.ndarray) and len(pv_k) >= 2 and len(pe_k) >= 1:
-                            tta_outputs.append((pv_k, pe_k))
-                    except Exception as tta_e:
-                        print(f"  TTA pass {k} failed for {order_id}: {tta_e}")
                 if torch.cuda.is_available():
                     torch.cuda.empty_cache()

 SNAP_RADIUS = 0.5
+def compute_scene(sample, cfg, rng):
+    """Expensive: multi-view label voting + smart normalization. Call once per sample.
+    Returns a dict with the full pre-priority-sampling fused scene, ready to
+    feed into ``sample_from_scene`` repeatedly for TTA. Returns None on failure.
     """
     try:
         scene = build_compact_scene(sample, cfg, rng)
     xyz = scene["xyz"]
     source = scene["source"]
     if len(xyz) < 10:
         return None
     behind_id = scene.get("behind_gest_id", np.full(len(xyz), -1, dtype=np.int16))
     group_id, class_id = _compute_group_and_class(
         scene["visible_src"], scene["visible_id"], behind_id, source)
     center, scale = _compute_smart_center_scale(xyz, source)
+    return {
+        "xyz": xyz,
+        "source": source,
+        "group_id": group_id,
+        "class_id": class_id,
+        "center": center,
+        "scale": scale,
+        "behind_gest_id": scene.get("behind_gest_id"),
+        "n_views_voted": scene.get("n_views_voted"),
+        "vote_frac": scene.get("vote_frac"),
+        "visible_src": scene["visible_src"],
+        "visible_id": scene["visible_id"],
+    }
+def sample_from_scene(scene):
+    """Cheap: priority-sample 4096 points from a fused scene.
+    Uses the global numpy random state (advanced internally by ``_priority_sample``),
+    so consecutive calls yield different 4096-subsets — perfect for TTA.
+    """
+    xyz = scene["xyz"]
+    source = scene["source"]
+    group_id = scene["group_id"]
+    class_id = scene["class_id"]
+    center = scene["center"]
+    scale = scene["scale"]
+    indices, mask = _priority_sample(source, group_id, SEQ_LEN, COLMAP_QUOTA, DEPTH_QUOTA)
     xyz_norm = (xyz[indices] - center) / scale
     result = {
         "center": center.astype(np.float32),
         "scale": np.float32(scale),
     }
+    if scene.get("behind_gest_id") is not None:
         behind = np.clip(scene["behind_gest_id"][indices].astype(np.int16), 0, None)
         result["behind"] = behind.astype(np.int64)
+    if scene.get("n_views_voted") is not None:
         result["n_views_voted"] = scene["n_views_voted"][indices].astype(np.float32)
+    if scene.get("vote_frac") is not None:
         result["vote_frac"] = scene["vote_frac"][indices].astype(np.float32)
     result["visible_src"] = scene["visible_src"][indices].astype(np.int64)
     result["visible_id"] = scene["visible_id"][indices].astype(np.int64)
     return result
+def fuse_and_sample(sample, cfg, rng):
+    """Backward-compatible wrapper: call compute_scene once, then sample_from_scene once (None if no scene)."""
+    scene = compute_scene(sample, cfg, rng)
+    if scene is None:
+        return None
+    return sample_from_scene(scene)
 def load_model(checkpoint_path, device):
     """Load model from checkpoint."""
     ckpt = torch.load(checkpoint_path, map_location=device, weights_only=False)
             order_id = sample["order_id"]
             try:
+                # ---- Build the fused scene ONCE (the expensive multi-view
+                # label voting); then run priority sampling + model K times
+                # for TTA. _priority_sample uses the global numpy RNG which
+                # advances on each call, giving genuine variation cheaply.
+                scene_rng = np.random.RandomState(TTA_BASE_SEED)
+                scene = compute_scene(sample, cfg, scene_rng)
                 tta_outputs = []
+                if scene is not None:
+                    np.random.seed(TTA_BASE_SEED)  # reset global RNG for reproducibility
+                    for k in range(TTA_PASSES):
+                        try:
+                            fused_k = sample_from_scene(scene)
+                            pv_k, pe_k = predict_sample(fused_k, model, device)
+                            if isinstance(pv_k, np.ndarray) and len(pv_k) >= 2 and len(pe_k) >= 1:
+                                tta_outputs.append((pv_k, pe_k))
+                        except Exception as tta_e:
+                            print(f"  TTA pass {k} failed for {order_id}: {tta_e}")
                 if torch.cuda.is_available():
                     torch.cuda.empty_cache()