zirobtc committed on
Commit 7ee73c7 · verified · 1 Parent(s): f1f9bbf

Upload folder using huggingface_hub
configs/callbacks/vis/vis_unity_val.yaml CHANGED
@@ -3,6 +3,9 @@ vis_unity_val:
   enabled: false
   every_n_epochs: 1
   num_batches: 1
+  # Which val batches to render: "first" or "random".
+  batch_select: "first"
+  batch_select_seed: 123
   num_frames: 30
   render_incam: true
   render_global: true
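Reviewer note: the selection rule this config drives can be checked standalone. A minimal sketch in plain numpy (the function name `select_batches` and the 10-batch loader are illustrative, not part of the repo); it mirrors the seeding scheme the callback uses below:

```python
import numpy as np

def select_batches(n_batches, k, mode="first", seed=123, epoch=0, dl_idx=0):
    # Mirror of the callback's rule: first k batches, or a seeded random k-subset.
    k = min(k, n_batches)
    if mode == "random":
        # Mixing seed, epoch, and dataloader index means each epoch renders
        # different batches while reruns stay reproducible.
        rng = np.random.default_rng(seed + epoch * 1000 + dl_idx)
        return set(rng.choice(np.arange(n_batches), size=k, replace=False).tolist())
    return set(range(k))

assert select_batches(10, 2, "random", epoch=3) == select_batches(10, 2, "random", epoch=3)
assert select_batches(10, 2, "first") == {0, 1}
```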
genmo/callbacks/vis/vis_unity_val.py CHANGED
@@ -31,6 +31,8 @@ class VisUnityVal(pl.Callback):
         num_frames: int = 30,
         render_incam: bool = True,
         render_global: bool = True,
+        batch_select: str = "first",
+        batch_select_seed: int = 123,
         use_gt_betas_for_pred: bool = True,
         global_root_relative: bool = False,
         postprocess_global: bool = True,
@@ -46,6 +48,8 @@ class VisUnityVal(pl.Callback):
         self.num_frames = num_frames
         self.render_incam = render_incam
         self.render_global = render_global
+        self.batch_select = str(batch_select or "first").strip().lower()
+        self.batch_select_seed = int(batch_select_seed)
         self.use_gt_betas_for_pred = use_gt_betas_for_pred
         self.global_root_relative = global_root_relative
         self.postprocess_global = postprocess_global
@@ -58,6 +62,41 @@ class VisUnityVal(pl.Callback):
         self._smplx2smpl = None
         self._faces = None
         self._J_regressor = None
+        self._selected_batch_idxs_by_loader = {}
+        self._seen_batch_count_by_loader = {}
+
+    def on_validation_epoch_start(self, trainer, pl_module):
+        self._selected_batch_idxs_by_loader = {}
+        self._seen_batch_count_by_loader = {}
+        if not self.enabled:
+            return
+        if trainer.global_rank != 0:
+            return
+        if self.every_n_epochs is not None and (trainer.current_epoch % self.every_n_epochs) != 0:
+            return
+
+        # Try to deterministically select which batches to render for each val dataloader.
+        try:
+            num_val_batches = getattr(trainer, "num_val_batches", None)
+            if num_val_batches is None:
+                return
+            if isinstance(num_val_batches, int):
+                num_val_batches = [num_val_batches]
+            for dl_idx, n in enumerate(list(num_val_batches)):
+                n = int(n)
+                if n <= 0:
+                    self._selected_batch_idxs_by_loader[dl_idx] = set()
+                    continue
+                k = min(int(self.num_batches), n)
+                if self.batch_select == "random":
+                    rng = np.random.default_rng(int(self.batch_select_seed) + int(trainer.current_epoch) * 1000 + int(dl_idx))
+                    chosen = rng.choice(np.arange(n, dtype=np.int64), size=k, replace=False)
+                    self._selected_batch_idxs_by_loader[dl_idx] = set(int(x) for x in chosen.tolist())
+                else:
+                    self._selected_batch_idxs_by_loader[dl_idx] = set(range(k))
+        except Exception:
+            # Fallback: keep legacy behavior (first N batches).
+            self._selected_batch_idxs_by_loader = {}

     def _lazy_init_models(self, device: torch.device):
         if self._smplx is None:
@@ -118,8 +157,19 @@ class VisUnityVal(pl.Callback):
             return
         if self.every_n_epochs is not None and (trainer.current_epoch % self.every_n_epochs) != 0:
             return
-        if batch_idx >= self.num_batches:
-            return
+        dl_i = int(dataloader_idx)
+        local_idx = int(self._seen_batch_count_by_loader.get(dl_i, 0))
+        self._seen_batch_count_by_loader[dl_i] = local_idx + 1
+
+        selected = self._selected_batch_idxs_by_loader.get(dl_i, None)
+        if selected is None:
+            # Fallback: legacy behavior.
+            if batch_idx >= self.num_batches:
+                return
+        else:
+            # Use loader-local index (CombinedLoader may provide a global `batch_idx`).
+            if local_idx not in selected:
+                return

         if outputs is None or "pred_smpl_params_incam" not in outputs:
             Log.warning("[VisUnityVal] Missing `pred_smpl_params_incam` in outputs; skipping.")
@@ -128,33 +178,31 @@ class VisUnityVal(pl.Callback):
         meta_render = None
         if "meta_render" in batch and isinstance(batch["meta_render"], list) and batch["meta_render"]:
             meta_render = batch["meta_render"][0]
-        img_paths = meta_render.get("img_paths") if isinstance(meta_render, dict) else None
-        if not img_paths:
-            Log.warning("[VisUnityVal] Missing `meta_render.img_paths`; skipping incam rendering.")
-            return
+        # NOTE: Do not depend on image/video I/O for validation visualization; render on black.

         vid = batch["meta"][0].get("vid", f"b{batch_idx:03d}")
         vid = self._safe_vid(str(vid))

         # Pick frames to render (within the already-sliced/padded window).
-        L = int(batch["K_fullimg"].shape[1]) if "K_fullimg" in batch else len(img_paths)
+        L = int(batch["K_fullimg"].shape[1]) if "K_fullimg" in batch else 0
         if L <= 0:
             return
         num_frames = min(self.num_frames, L)
         frame_idxs = np.linspace(0, L - 1, num_frames).round().astype(int)

-        # Read one frame to get size.
-        first = cv2.imread(img_paths[int(frame_idxs[0])])
-        if first is None:
-            Log.warning(f"[VisUnityVal] Failed to read image: {img_paths[int(frame_idxs[0])]}")
-            return
-        height, width = first.shape[:2]
-
         device = pl_module.device
         self._lazy_init_models(device)

-        # Renderer uses a single K; in practice K is constant for Unity sequences.
+        # Render on black; infer output size from principal point (usually near W/2, H/2).
         K = batch["K_fullimg"][0, 0].to(device)
+        try:
+            cx = float(K[0, 2].detach().cpu().item())
+            cy = float(K[1, 2].detach().cpu().item())
+            width = max(64, int(round(cx * 2.0)))
+            height = max(64, int(round(cy * 2.0)))
+        except Exception:
+            width, height = 1280, 720
+
         renderer_incam = Renderer(width, height, device=device, faces=self._faces, K=K)
         # Make the overlay look "flat colored" (no Phong shading).
         try:
@@ -190,10 +238,7 @@ class VisUnityVal(pl.Callback):
                 renderer_incam.set_intrinsic(K_fi)
             except Exception:
                 pass
-            frame_bgr = cv2.imread(img_paths[int(fi)])
-            if frame_bgr is None:
-                continue
-            frame = frame_bgr[..., ::-1]  # RGB
+            frame = np.zeros((height, width, 3), dtype=np.uint8)  # RGB black
             img = renderer_incam.render_mesh(gt_verts_incam[i], frame, colors=self.gt_color)
             img = renderer_incam.render_mesh(pred_verts_incam[i], img, colors=self.pred_color)
             writer.write_frame(img.astype(np.uint8))
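The width/height fallback above assumes a roughly centered principal point, so doubling (cx, cy) recovers the frame size. A quick self-contained check (the intrinsics here are made up; real values come from `batch["K_fullimg"]`):

```python
import torch

K = torch.tensor([[1000.0, 0.0, 640.0],
                  [0.0, 1000.0, 360.0],
                  [0.0, 0.0, 1.0]])  # fx, fy, cx=640, cy=360

# For a centered principal point, cx ~ W/2 and cy ~ H/2, so doubling recovers the size.
width = max(64, int(round(float(K[0, 2]) * 2.0)))
height = max(64, int(round(float(K[1, 2]) * 2.0)))
assert (width, height) == (1280, 720)
```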
genmo/utils/vis/renderer.py CHANGED
@@ -291,8 +291,13 @@ class Renderer:
             verts_features = colors.to(device=vertices.device, dtype=vertices.dtype)
             colors = [0.8, 0.8, 0.8]
         else:
-            if colors[0] > 1:
-                colors = [c / 255.0 for c in colors]
+            # Accept either [0..1] floats or [0..255] uint8-like colors.
+            # Don't key off `colors[0]` because valid RGB like green [0,255,0] would fail.
+            try:
+                if max(colors) > 1:
+                    colors = [c / 255.0 for c in colors]
+            except Exception:
+                pass
             verts_features = (
                 torch.tensor(colors)
                 .reshape(1, 1, 3)
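To see the failure mode the new comment describes: keying off the first channel misclassifies any 0-255 color whose red component is 0 or 1, while keying off the max does not. A standalone sketch (the helper names are illustrative):

```python
def normalize_old(colors):
    # Old check: only looks at the red channel.
    return [c / 255.0 for c in colors] if colors[0] > 1 else list(colors)

def normalize_new(colors):
    # New check: any channel above 1 implies a 0-255 color.
    return [c / 255.0 for c in colors] if max(colors) > 1 else list(colors)

green = [0, 255, 0]
assert normalize_old(green) == [0, 255, 0]      # left unscaled: renders wrong
assert normalize_new(green) == [0.0, 1.0, 0.0]  # correctly rescaled
```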
scripts/train.py CHANGED
@@ -1,5 +1,21 @@
-import builtins
 import os
+import sys
+
+# Ensure repo root is importable when running as `python scripts/train.py`.
+# Without this, `genmo.*` may resolve from site-packages while `third_party.*`
+# (a namespace package in this repo) fails to import, which Hydra reports as
+# "Error locating target ...".
+_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+if _REPO_ROOT not in sys.path:
+    sys.path.insert(0, _REPO_ROOT)
+
+# GVHMR uses absolute imports like `import hmr4d...` internally, so its repo root
+# must also be importable.
+_GVHMR_ROOT = os.path.join(_REPO_ROOT, "third_party", "GVHMR")
+if os.path.isdir(_GVHMR_ROOT) and _GVHMR_ROOT not in sys.path:
+    sys.path.insert(0, _GVHMR_ROOT)
+
+import builtins
 from datetime import datetime

 import hydra
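The bootstrap added here is a general pattern: prepend the repo root (and any vendored repo that uses absolute imports) to `sys.path` before the first package import. A minimal sketch of the same idea factored into a helper (`ensure_importable` is illustrative, not a function in this repo):

```python
import os
import sys

def ensure_importable(path: str) -> None:
    # Prepend `path` once so `python scripts/train.py` resolves packages that
    # live relative to the repo root instead of site-packages.
    path = os.path.abspath(path)
    if os.path.isdir(path) and path not in sys.path:
        sys.path.insert(0, path)

repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
ensure_importable(repo_root)
ensure_importable(os.path.join(repo_root, "third_party", "GVHMR"))
```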
third_party/GVHMR/hmr4d/utils/vis/renderer.py CHANGED
@@ -223,8 +223,13 @@ class Renderer:
             verts_features = colors.to(device=vertices.device, dtype=vertices.dtype)
             colors = [0.8, 0.8, 0.8]
         else:
-            if colors[0] > 1:
-                colors = [c / 255.0 for c in colors]
+            # Accept either [0..1] floats or [0..255] uint8-like colors.
+            # Don't key off `colors[0]` because valid RGB like green [0,255,0] would fail.
+            try:
+                if max(colors) > 1:
+                    colors = [c / 255.0 for c in colors]
+            except Exception:
+                pass
             verts_features = torch.tensor(colors).reshape(1, 1, 3).to(device=vertices.device, dtype=vertices.dtype)
             verts_features = verts_features.repeat(1, vertices.shape[1], 1)
             textures = TexturesVertex(verts_features=verts_features)
third_party/GVHMR/process_data.sh CHANGED
@@ -1 +1 @@
-python tools/demo/process_dataset.py --input /mnt/c/Temp/SyntheticDataset --output ./processed_data --vitpose --workers 3 --debug
+python tools/demo/process_dataset.py --input /mnt/c/Temp/SyntheticDataset --output ./processed_dataset --genmo --debug
third_party/GVHMR/tools/demo/process_dataset.py CHANGED
@@ -1,879 +1,1760 @@
 import sys
 import os
 import json
 import argparse
 import numpy as np
 import zlib
 from glob import glob
 from tqdm import tqdm
 import cv2
 import torch
 from scipy.spatial.transform import Rotation as R
 import time
 import shutil
 from pathlib import Path

 # --- SETUP PATHS FOR IMPORTS ---
-REPO_ROOT = Path(__file__).resolve().parents[2]  # Adjust as needed based on where this script lives
+REPO_ROOT = Path(__file__).resolve().parents[2]
 if str(REPO_ROOT) not in sys.path:
     sys.path.insert(0, str(REPO_ROOT))
-
-# Try to import Extractor. If this fails, the script will error out early.
-try:
-    gvhmr_root = REPO_ROOT / "third_party" / "GVHMR"
-    if gvhmr_root.exists() and str(gvhmr_root) not in sys.path:
-        sys.path.insert(0, str(gvhmr_root))
-    from hmr4d.utils.preproc.vitfeat_extractor import Extractor
-    from hmr4d.utils.pylogger import Log
-except ImportError:
-    # Fallback/Mock for standalone testing if repo structure differs,
-    # but based on your prompt, this path should exist.
-    print("WARNING: Could not import Extractor. Feature extraction will fail.")
-    Extractor = None
-
-# Force single thread for libraries
+
+from hmr4d.utils.preproc.vitfeat_extractor import Extractor
+from hmr4d.utils.pylogger import Log
+
+# Force single thread
 os.environ["OMP_NUM_THREADS"] = "1"
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 cv2.setNumThreads(0)
 torch.set_num_threads(1)

 FPS = 30.0
-DEBUG_NUM_FRAMES = 60
+DEBUG_NUM_FRAMES = 5
 IMAGENET_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
 IMAGENET_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32)

-# --- HELPER FUNCTIONS ---
+# --- HELPER FUNCTIONS (No Changes) ---

 def _process_image_memory(img_bgr, bbox_xywh, img_size=256):
-    """
-    Adapted from process_single_image to work on in-memory numpy arrays.
-    """
-    if img_bgr is None:
-        return np.zeros((3, img_size, img_size), dtype=np.float32)
+    if img_bgr is None: return np.zeros((3, img_size, img_size), dtype=np.float32)
     x, y, w, h = bbox_xywh
     cx, cy = x + w/2, y + h/2
-    # Genmo/HMR usually uses max(w,h) * 1.2 or similar, ensuring we match the prompt's scale logic if implicit
-    # Assuming the bbox provided in json is already the correct crop region or close to it.
-    # The user provided snippet used 'scale'. We approximate scale from bbox if not provided.
-    # Standard HMR extraction uses a specific scale factor.
-    # Here we assume the input bbox is the "tight" bbox and we need to square it.
     scale = max(w, h) * 1.2
     H, W = img_bgr.shape[:2]
     max_side = float(max(H, W, 1))
-    if scale <= 1.0 or scale > max_side * 20.0:
-        # Fallback for bad scales
-        scale = max_side * 0.5
+    if scale <= 1.0 or scale > max_side * 20.0: scale = max_side * 0.5
     half = scale / 2.0
     x0, y0 = int(cx - half), int(cy - half)
     x1, y1 = int(cx + half), int(cy + half)
     pad_l, pad_t = max(0, -x0), max(0, -y0)
     pad_r, pad_b = max(0, x1 - W), max(0, y1 - H)
-    if max(pad_l, pad_t, pad_r, pad_b) > int(max_side * 4.0):
-        # Sanity check fail, return black
-        return np.zeros((3, img_size, img_size), dtype=np.float32)
+    if max(pad_l, pad_t, pad_r, pad_b) > int(max_side * 4.0): return np.zeros((3, img_size, img_size), dtype=np.float32)
     if pad_l or pad_t or pad_r or pad_b:
         img_bgr = cv2.copyMakeBorder(img_bgr, pad_t, pad_b, pad_l, pad_r, cv2.BORDER_CONSTANT, value=(0,0,0))
         x0 += pad_l; y0 += pad_t; x1 += pad_l; y1 += pad_t
     crop = img_bgr[y0:y1, x0:x1]
-
-    if crop.size == 0:
-        return np.zeros((3, img_size, img_size), dtype=np.float32)
-
+    if crop.size == 0: return np.zeros((3, img_size, img_size), dtype=np.float32)
     if crop.shape[0] != img_size or crop.shape[1] != img_size:
         crop = cv2.resize(crop, (img_size, img_size), interpolation=cv2.INTER_LINEAR)
-    # Normalize
-    crop = crop[:, :, ::-1].astype(np.float32) / 255.0  # BGR to RGB
+    crop = crop[:, :, ::-1].astype(np.float32) / 255.0
     crop = (crop - IMAGENET_MEAN) / IMAGENET_STD
-    return crop.transpose(2, 0, 1)  # HWC -> CHW
+    return crop.transpose(2, 0, 1)

 def _alpha_blend_bgra_onto_bgr(dst_bgr, src_bgra, x, y):
     if dst_bgr is None or src_bgra is None: return dst_bgr
     H, W = dst_bgr.shape[:2]
     h, w = src_bgra.shape[:2]
     if w <= 0 or h <= 0: return dst_bgr
     x0, y0 = max(int(x), 0), max(int(y), 0)
     x1, y1 = min(int(x + w), W), min(int(y + h), H)
     if x1 <= x0 or y1 <= y0: return dst_bgr
     roi = dst_bgr[y0:y1, x0:x1]
     src_crop = src_bgra[(y0 - int(y)):(y0 - int(y)) + (y1 - y0), (x0 - int(x)):(x0 - int(x)) + (x1 - x0)]
     if src_crop.shape[2] == 3:
         roi[:] = src_crop
         return dst_bgr

     alpha = src_crop[:, :, 3].astype(np.uint16)
     inv_alpha = 255 - alpha
     b_src, g_src, r_src = src_crop[:, :, 0], src_crop[:, :, 1], src_crop[:, :, 2]
     b_dst, g_dst, r_dst = roi[:, :, 0], roi[:, :, 1], roi[:, :, 2]

     roi[:, :, 0] = ((b_src * alpha + b_dst * inv_alpha) >> 8).astype(np.uint8)
     roi[:, :, 1] = ((g_src * alpha + g_dst * inv_alpha) >> 8).astype(np.uint8)
     roi[:, :, 2] = ((r_src * alpha + r_dst * inv_alpha) >> 8).astype(np.uint8)
     return dst_bgr

 def _find_ui_dir():
     cand = os.path.join(os.getcwd(), "UI")
     if os.path.isdir(cand): return cand
-    script_dir = os.path.dirname(os.path.abspath(__file__))
-    cand2 = os.path.abspath(os.path.join(script_dir, "..", "..", "UI"))
-    if os.path.isdir(cand2): return cand2
-    return None
+    return None  # Simplified for brevity

 def _find_font_path(ui_dir, filename="Inter_18pt-Bold.ttf"):
     if not ui_dir: return None
     p = os.path.join(ui_dir, filename)
     return p if os.path.isfile(p) else None

 def _load_ui_images(ui_dir):
     if not ui_dir or (not os.path.isdir(ui_dir)): return []
     imgs = []
     for name in sorted(os.listdir(ui_dir)):
         p = os.path.join(ui_dir, name)
         if not os.path.isfile(p): continue
         if name.lower().endswith(('.png', '.jpg', '.jpeg', '.webp')):
             im = cv2.imread(p, cv2.IMREAD_UNCHANGED)
             if im is not None:
                 if im.ndim == 2: im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)
                 imgs.append(im)
     return imgs

 class SimpleUIOverlay:
     def __init__(self, width, height, seed=0, ui_dir=None, max_images=4, show_prob=0.6, min_hold_frames=20, max_hold_frames=120):
         self.W, self.H = int(width), int(height)
         self.rng = np.random.default_rng(int(seed))
         self.max_images = max(0, int(max_images))
         self.show_prob = float(show_prob)
         self.min_hold_frames, self.max_hold_frames = max(1, int(min_hold_frames)), max(1, int(max_hold_frames))
         self.ui_dir = ui_dir if ui_dir else _find_ui_dir()
         self.assets = _load_ui_images(self.ui_dir)
         self._ttl, self._active = 0, []

     def _pick_new_state(self):
         self._ttl = int(self.rng.integers(self.min_hold_frames, self.max_hold_frames + 1))
         self._active = []
         if (not self.assets) or (self.max_images <= 0): return
         if float(self.rng.random()) > self.show_prob: return
         k = min(int(self.rng.integers(1, self.max_images + 1)), len(self.assets))
         idxs = self.rng.choice(len(self.assets), size=k, replace=False)
         for idx in idxs:
             im = self.assets[int(idx)]
             h, w = im.shape[:2]
             if w > 0 and h > 0:
                 x = int(self.rng.integers(-w // 4, max(1, self.W - (3 * w // 4))))
                 y = int(self.rng.integers(-h // 4, max(1, self.H - (3 * h // 4))))
                 self._active.append((im, x, y))

     def draw(self, img_bgr):
         if img_bgr is None: return img_bgr
         if self._ttl <= 0: self._pick_new_state()
         self._ttl -= 1
-        for im, x, y in self._active:
-            _alpha_blend_bgra_onto_bgr(img_bgr, im, x, y)
         return img_bgr

 class SimpleChatOverlay:
     def __init__(self, width, height, seed=0, num_lines=7, region_w=420, region_h=180, margin=18, every_n_frames=15, corner=None, font_path=None):
         from collections import deque
         self.W, self.H = int(width), int(height)
         self.rng = np.random.default_rng(int(seed))
         self.num_lines, self.margin, self.every_n_frames = int(num_lines), int(margin), max(1, int(every_n_frames))
         self.region_w, self.region_h = int(region_w), int(region_h)
         self.font_path = font_path
         self._pil_fonts = {}
         self.corner = str(corner) if corner else str(self.rng.choice(["tl", "tr", "bl", "br"]))
         self.messages = deque(maxlen=self.num_lines)
         for _ in range(self.num_lines): self.messages.append(self._random_message())
         self._cached_overlay, self._dirty = None, True

     def _random_message(self):
         user = str(self.rng.choice(["nightbot", "viewer", "catjam", "shadow", "speedrunner", "chattycathy", "kappaking"]))
         if self.rng.random() < 0.5: user += str(self.rng.integers(10, 999))
         text = str(self.rng.choice(["pog", "lol", "gg", "nice", "W", "L", "no shot", "crazy", "clip it", "cooking", "unlucky"]))
         color = tuple(int(x) for x in self.rng.choice([(255, 120, 0), (0, 180, 255), (255, 0, 180), (0, 255, 120)]))
         return {"user": user, "text": text, "color": color}

     def _get_pil_font(self, size_px):
         if not self.font_path: return None
         if size_px in self._pil_fonts: return self._pil_fonts[size_px]
         try:
             from PIL import ImageFont
-            font = ImageFont.truetype(self.font_path, size=max(1, size_px))
-            self._pil_fonts[size_px] = font
-            return font
         except: return None

     def maybe_append(self, frame_idx):
         if int(frame_idx) % self.every_n_frames == 0:
             self.messages.append(self._random_message())
             self._dirty = True

     def _render_cache(self):
         rw = min(self.region_w, max(40, self.W - 2 * self.margin))
         rh = min(self.region_h, max(40, self.H - 2 * self.margin))
         pil_font = self._get_pil_font(int(round(np.clip(20.0 * (self.H / 720.0), 14.0, 30.0))))
         if pil_font is None:
-            self._cached_overlay = None
-            return
         try:
             from PIL import Image, ImageDraw
             pil = Image.new("RGBA", (rw, rh), (0, 0, 0, 0))
             draw = ImageDraw.Draw(pil)
-            line_h = _clamp_int(int(round(float(getattr(pil_font, "size", 18)) * 1.25)), 14, 34)
             lines = list(self.messages)[-min(self.num_lines, max(1, rh // line_h)):]
             local_y = rh - line_h if self.corner in ("bl", "br") else 0
             for msg in lines:
                 user = f"{msg['user']}: "
-                draw.text((0, local_y), user, font=pil_font, fill=tuple(msg['color'][::-1]))
                 tw = draw.textlength(user, font=pil_font)
                 draw.text((tw, local_y), msg['text'], font=pil_font, fill=(240, 240, 240))
                 local_y += (-line_h if self.corner in ("bl", "br") else line_h)
             self._cached_overlay = cv2.cvtColor(np.asarray(pil), cv2.COLOR_RGBA2BGRA)
         except: self._cached_overlay = None

     def draw(self, img_bgr):
         if img_bgr is None: return img_bgr
-        if self._dirty:
-            self._render_cache()
-            self._dirty = False
         if self._cached_overlay is not None:
             rw = min(self.region_w, max(40, self.W - 2 * self.margin))
             rh = min(self.region_h, max(40, self.H - 2 * self.margin))
             if self.corner == "tl": x, y = self.margin, self.margin
             elif self.corner == "tr": x, y = self.W - self.margin - rw, self.margin
             elif self.corner == "bl": x, y = self.margin, self.H - self.margin - rh
             else: x, y = self.W - self.margin - rw, self.H - self.margin - rh
             _alpha_blend_bgra_onto_bgr(img_bgr, self._cached_overlay, x, y)
         return img_bgr

-def k4_to_K3(k4):
-    return np.array([[k4[0], 0, k4[2]], [0, k4[1], k4[3]], [0, 0, 1]], dtype=np.float32)

 def bbox_xywh_to_bbx_xys(bbox_xywh, base_enlarge=1.0):
     x, y, w, h = [float(v) for v in bbox_xywh]
     return np.array([x + 0.5 * w, y + 0.5 * h, max(w, h) * float(base_enlarge)], dtype=np.float32)

 def clamp_bbox_xywh_to_image(bbox_xywh, W, H, min_size=1.0):
     x, y, w, h = [float(v) for v in bbox_xywh]
     W, H = float(W), float(H)
     if W <= 0 or H <= 0: return [0.0, 0.0, 0.0, 0.0]
     x2, y2 = x + w, y + h
     x1c = float(np.clip(x, 0.0, max(0.0, W - 1.0)))
     y1c = float(np.clip(y, 0.0, max(0.0, H - 1.0)))
     x2c = float(np.clip(x2, 0.0, W))
     y2c = float(np.clip(y2, 0.0, H))
     if x2c <= x1c: x2c = min(W, x1c + float(min_size))
     if y2c <= y1c: y2c = min(H, y1c + float(min_size))
     wc = max(0.0, x2c - x1c)
     hc = max(0.0, y2c - y1c)
     return [x1c, y1c, wc, hc]

 def draw_bbox_xywh_and_center(img_bgr, bbox_xywh, color=(255, 255, 0)):
     x, y, w, h = [float(v) for v in bbox_xywh]
     cv2.rectangle(img_bgr, (int(x), int(y)), (int(x+w), int(y+h)), color, 2)
     cv2.circle(img_bgr, (int(x+w/2), int(y+h/2)), 4, (0, 0, 255), -1)

 def vis_label_and_color(v: int):
     if v == 2: return "VIS", (0, 255, 0)
     if v == 1: return "OCC", (0, 165, 255)
     return "OFF", (160, 160, 160)

 def draw_vis_text_and_points(img_bgr, kpts2d_xy, vis17):
     for k in range(17):
         v = int(vis17[k])
         label, color = vis_label_and_color(v)
         x, y = int(round(kpts2d_xy[k, 0])), int(round(kpts2d_xy[k, 1]))
         if v > 0: cv2.circle(img_bgr, (x, y), 4, color, -1)
         cv2.putText(img_bgr, f"{k}:{label}", (x + 6, y - 6), cv2.FONT_HERSHEY_SIMPLEX, 0.45, color, 1, cv2.LINE_AA)

 def build_T_wc(pos_world, quat_world_xyzw):
     T = np.eye(4, dtype=np.float64)
     T[:3, :3] = R.from_quat(np.asarray(quat_world_xyzw, dtype=np.float64)).as_matrix()
     T[:3, 3] = np.asarray(pos_world, dtype=np.float64)
     return T

 def compute_velocity(mats, fps=30.0):
     N = len(mats)
     if N < 2: return np.zeros((N, 3), dtype=np.float32), np.zeros((N, 3), dtype=np.float32)
     R_curr = mats[:, :3, :3]
     R_diff = np.matmul(R_curr[1:], np.transpose(R_curr[:-1], (0, 2, 1)))
     rv = R.from_matrix(R_diff).as_rotvec()
     angvel = np.zeros((N, 3), dtype=np.float32)
     angvel[1:] = rv
     t_curr = mats[:, :3, 3]
     tvel = np.zeros((N, 3), dtype=np.float32)
     tvel[1:] = t_curr[1:] - t_curr[:-1]
     return angvel.astype(np.float32), tvel.astype(np.float32)

 def _compute_vitpose_selected_indices(num_frames, fps, bucket_seconds, frames_per_bucket, sampling="uniform", seed=123):
     if num_frames <= 0: return []
     rng = np.random.default_rng(int(seed))
     selected = []
     bucket_len = max(1, int(round(float(bucket_seconds) * float(fps))))
     b_start = 0
     while b_start < num_frames:
         b_end = min(num_frames, b_start + bucket_len)
         k = min(int(frames_per_bucket), b_end - b_start)
         if k > 0:
-            if sampling == "random":
-                idxs = np.sort(rng.choice(np.arange(b_start, b_end), size=k, replace=False)).tolist()
-            elif sampling == "linspace":
-                idxs = sorted(list(set(np.linspace(b_start, b_end - 1, k, dtype=int).tolist())))
-            else:  # uniform
             if k == 1: idxs = [b_start + (b_end - b_start) // 2]
-            else:
-                step = (b_end - b_start) // k
-                idxs = [min(b_start + i * step, b_end - 1) for i in range(k)]
             selected.extend(idxs)
         b_start = b_end
     return sorted(list(set(selected)))

-# Initialize model variable
 _SMPLX_MODEL = None
 _SMPLX_DEVICE = None

 def _get_smplx_model(device):
     global _SMPLX_MODEL, _SMPLX_DEVICE
-    if _SMPLX_MODEL is not None and _SMPLX_DEVICE == device:
-        return _SMPLX_MODEL
     from hmr4d.utils.smplx_utils import make_smplx
     _SMPLX_MODEL = make_smplx("supermotion").to(device).eval()
     _SMPLX_DEVICE = device
     return _SMPLX_MODEL

-# SMPL Renderer
 class SmplIncamRenderer:
     def __init__(self, width, height, K4, device="cuda", smplx2smpl_path="hmr4d/utils/body_model/smplx2smpl_sparse.pt"):
         from hmr4d.utils.smplx_utils import make_smplx
         from hmr4d.utils.vis.renderer import Renderer
         self.torch = torch
         self.device = device
         self.smplx = make_smplx("supermotion").to(device).eval()
-        self.smplx2smpl = None
-        self.faces = None
         try:
             self.smplx2smpl = torch.load(smplx2smpl_path).to(device)
             self.faces = make_smplx("smpl").faces
         except: self.faces = self.smplx.faces
         self.K_torch = torch.from_numpy(k4_to_K3(K4)).to(device)
         self.renderer = Renderer(width, height, device=device, faces=self.faces, K=self.K_torch)

     @torch.no_grad()
     def render(self, img_rgb_uint8, global_orient_aa, body_pose_aa, betas_10, transl_xyz, fl, pp):
         K3_torch = torch.from_numpy(np.array([[fl[0], 0, pp[0]], [0, fl[1], pp[1]], [0, 0, 1]], dtype=np.float32)).to(self.device)
         self.renderer.set_intrinsic(K3_torch)
-        params = {
-            "global_orient": torch.from_numpy(global_orient_aa[None]).float().to(self.device),
-            "body_pose": torch.from_numpy(body_pose_aa[None]).float().to(self.device),
-            "betas": torch.from_numpy(betas_10[None]).float().to(self.device),
-            "transl": torch.from_numpy(transl_xyz[None]).float().to(self.device),
-        }
-        out = self.smplx(**params)
-        verts = out.vertices[0]
         if self.smplx2smpl is not None and verts.dim() == 2: verts = torch.matmul(self.smplx2smpl, verts)
         img_out = self.renderer.render_mesh(verts, img_rgb_uint8, [0.8, 0.8, 0.8])
         return img_out
-
     @torch.no_grad()
     def get_verts(self, global_orient_aa, body_pose_aa, betas_10, transl_xyz):
-        params = {
-            "global_orient": torch.from_numpy(global_orient_aa[None]).float().to(self.device),
-            "body_pose": torch.from_numpy(body_pose_aa[None]).float().to(self.device),
-            "betas": torch.from_numpy(betas_10[None]).float().to(self.device),
-            "transl": torch.from_numpy(transl_xyz[None]).float().to(self.device),
-        }
-        out = self.smplx(**params)
-        verts = out.vertices[0]
-        if self.smplx2smpl is not None and verts.dim() == 2:
-            verts = torch.matmul(self.smplx2smpl, verts)
         return verts

 def _as_betas10(betas_any) -> np.ndarray:
     betas = np.asarray(betas_any, dtype=np.float32).reshape(-1)
-    betas10 = np.zeros(10, dtype=np.float32)
-    n = min(10, betas.size)
     if n > 0: betas10[:n] = betas[:n]
     return betas10

 def load_betas10_from_npz(npz_path, key="betas", index=None):
-    with np.load(npz_path, allow_pickle=True) as data:
-        arr = data[key]
     if arr.ndim == 0: arr = np.asarray(arr).reshape(1)
     if arr.ndim == 1: betas = arr
-    elif arr.ndim == 2:
-        row_idx = 0 if index is None else int(index)
-        betas = arr[row_idx]
     else: raise ValueError(f"Bad betas shape: {arr.shape}")
     return _as_betas10(betas)

-def _default_shape_npz_path() -> str:
-    return os.path.join(os.path.dirname(__file__), "shape.npz")

 def parse_smpl_inputs_from_row(row, override_betas10=None, keep_unity_scale=False, transl_source="pelvis", transl_y_offset_m=0.0):
     C = np.diag([1.0, -1.0, 1.0]).astype(np.float64)
     cam_rot_w_quat = np.array(row["cam_rot_world"], dtype=np.float64)
     R_cam_w = R.from_quat(cam_rot_w_quat).as_matrix()
     pel_rot_w_quat = np.array(row["pelvis_rot_world"], dtype=np.float64)
     R_pel_w = R.from_quat(pel_rot_w_quat).as_matrix()
     R_rel_unity = R_cam_w.T @ R_pel_w
     R_cv = C @ R_rel_unity @ C
-    R_final = R_cv @ R.from_euler('z', 180, degrees=True).as_matrix()
     global_orient_aa = R.from_matrix(R_final).as_rotvec().astype(np.float32)

     smpl_scale = float(row.get("smpl_root_world_scale", 1.0))
     pelvis_cam_unity = np.asarray(row["smpl_incam_transl"], dtype=np.float64).reshape(3)
     root_cam_unity = np.asarray(row.get("smpl_root_incam_transl", [0.0, 0.0, 0.0]), dtype=np.float64).reshape(3)
     pelvis_cam_unity = pelvis_cam_unity + np.array([0.0, float(transl_y_offset_m), 0.0], dtype=np.float64)

     if str(transl_source).strip().lower() == "root": target_cam_unity = root_cam_unity
     else:
         if bool(keep_unity_scale): target_cam_unity = pelvis_cam_unity
         else:
             if abs(smpl_scale) > 1e-8: target_cam_unity = root_cam_unity + (pelvis_cam_unity - root_cam_unity) / smpl_scale
             else: target_cam_unity = pelvis_cam_unity
     target_cam_cv = (C @ target_cam_unity).astype(np.float64)

     pose = np.asarray(row["smplx_pose"], dtype=np.float32)
     body_pose = pose[3:66].astype(np.float32)
     betas10 = _as_betas10(override_betas10)
     return {
         "global_orient": global_orient_aa, "body_pose": body_pose, "betas": betas10,
         "target_cam_cv": target_cam_cv, "cam_rot_w_quat": cam_rot_w_quat,
         "cam_pos_world": np.asarray(row["cam_pos_world"], dtype=np.float64).reshape(3),
         "pelvis_pos_world": np.asarray(row["pelvis_pos_world"], dtype=np.float64).reshape(3),
         "smpl_scale": smpl_scale, "root_cam_unity": root_cam_unity
     }

 def batch_smpl_forward(betas, global_orient, body_pose, device):
     model = _get_smplx_model(device)
     N = len(betas)
-    chunk_size = 4096
-    pelvis_list = []
     with torch.no_grad():
         for i in range(0, N, chunk_size):
             b_betas = torch.from_numpy(betas[i:i+chunk_size]).float().to(device)
             b_go = torch.from_numpy(global_orient[i:i+chunk_size]).float().to(device)
             b_bp = torch.from_numpy(body_pose[i:i+chunk_size]).float().to(device)
             b_tr = torch.zeros((len(b_betas), 3), dtype=torch.float32, device=device)
             out = model(betas=b_betas, global_orient=b_go, body_pose=b_bp, transl=b_tr)
             pelvis_list.append(out.joints[:, 0, :].detach().cpu().numpy())
     return np.concatenate(pelvis_list, axis=0)

 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--input", required=True)
     parser.add_argument("--output", required=True)
     parser.add_argument("--debug", action="store_true")
     parser.add_argument("--vitpose", action="store_true")
-    parser.add_argument("--genmo", action="store_true")
     parser.add_argument("--dpvo", action="store_true")
     parser.add_argument("--smplx", action="store_true")
     parser.add_argument("--debug_no_coco", action="store_true")
     parser.add_argument("--shape_npz", default=_default_shape_npz_path())
     parser.add_argument("--vitpose_use_all_frames", action="store_true")
     parser.add_argument("--vitpose_bucket_seconds", type=float, default=12.0)
     parser.add_argument("--vitpose_frames_per_bucket", type=int, default=36)
     parser.add_argument("--vitpose_sampling", type=str, default="random")
     parser.add_argument("--vitpose_seed", type=int, default=123)
     parser.add_argument("--ui_dir", type=str, default=None)
     parser.add_argument("--ui_show_prob", type=float, default=0.25)
     parser.add_argument("--ui_max_images", type=int, default=3)
     parser.add_argument("--ui_hold_min_s", type=float, default=0.7)
     parser.add_argument("--ui_hold_max_s", type=float, default=5.0)
     parser.add_argument("--ui_seed", type=int, default=None)
     parser.add_argument("--keep_unity_scale", action="store_true")
     parser.add_argument("--transl_source", type=str, default="pelvis")
     parser.add_argument("--transl_y_offset_m", type=float, default=-0.020)
     parser.add_argument("--world_y_offset_m", type=float, default=1.3415)
     parser.add_argument("--vit_batch_size", type=int, default=512, help="Batch size for in-memory ViT extraction")
     args = parser.parse_args()

     if not (args.vitpose or args.genmo or args.dpvo or args.smplx):
         args.vitpose = args.genmo = args.dpvo = args.smplx = True

     device = "cuda" if torch.cuda.is_available() else "cpu"
     print(f"Running STREAMING processing on {device.upper()}...")
-
-    # --- INIT ViT MODEL ONCE ---
     vit_model = None
     if args.genmo and Extractor is not None:
         print("Initializing ViT Extractor (HMR2)...")
         extractor_wrapper = Extractor(tqdm_leave=False)
         vit_model = extractor_wrapper.extractor
         vit_model.eval()
         vit_model.to(device)

     override_betas10 = load_betas10_from_npz(args.shape_npz, key="betas")
     temp_ann_dir = os.path.join(args.output, "vitpose", "temp_annotations")
     os.makedirs(temp_ann_dir, exist_ok=True)
     jsonl_files = sorted(glob(os.path.join(args.input, "sequence_*.jsonl")))
     global_J_reg = None
     j_reg_path = "third_party/GVHMR/inputs/checkpoints/body_models/smpl_neutral_J_regressor.pt"
     if os.path.exists(j_reg_path) and device == "cuda":
         global_J_reg = torch.load(j_reg_path, map_location=device)

     for jsonl_idx, jsonl_path in enumerate(jsonl_files):
         seq_name = os.path.splitext(os.path.basename(jsonl_path))[0].replace("sequence_", "")
         print(f"[{jsonl_idx+1}/{len(jsonl_files)}] Processing {seq_name}...")
         prof = {"smpl_batch": 0.0, "video_read": 0.0, "overlay": 0.0, "vit_process": 0.0,
                 "sparse_write": 0.0, "loop_total": 0.0, "save_files": 0.0, "debug_rend": 0.0, "prep": 0.0}

         t_start_seq = time.perf_counter()
         jsonl_dir = os.path.dirname(os.path.abspath(jsonl_path))
         video_path = os.path.join(jsonl_dir, f"video_{seq_name}.mp4")
         if not os.path.exists(video_path): video_path = os.path.join(jsonl_dir, "video.mp4")

-        # SPARSE WRITING FOLDER
         out_img_folder = os.path.join(args.output, "images", seq_name)
         os.makedirs(out_img_folder, exist_ok=True)
-        # Clean existing only if needed, usually we overwrite
-        # for p in glob(os.path.join(out_img_folder, "img_*.jpg")): try: os.remove(p) except: pass

         with open(jsonl_path, "r") as f: lines = f.readlines()
         lines = lines[1:] if len(lines) > 0 else []
         num_frames = len(lines)
         if num_frames <= 0: continue

         genmo_out = os.path.join(args.output, "genmo_features", f"{seq_name}.pt")
         smplx_out = os.path.join(args.output, "smplx_incam", f"{seq_name}_smplx.npz")
         smplx_global_out = os.path.join(args.output, "smplx_global", f"{seq_name}_global.npz")
         dpvo_dir = os.path.join(args.output, "dpvo", seq_name)
         for p in [genmo_out, smplx_out, smplx_global_out, dpvo_dir]:
             if p: os.makedirs(os.path.dirname(p), exist_ok=True)

         selected_set = set()
         if args.vitpose:
             if args.vitpose_use_all_frames: selected_indices = list(range(num_frames))
             else:
                 selected_indices = _compute_vitpose_selected_indices(
                     num_frames, FPS, args.vitpose_bucket_seconds,
                     args.vitpose_frames_per_bucket, args.vitpose_sampling, args.vitpose_seed
                 )
             selected_set = set(selected_indices)

         cap = cv2.VideoCapture(video_path)
         if not cap.isOpened(): continue
         W = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
         H = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
         resolved_ui_dir = args.ui_dir if args.ui_dir else _find_ui_dir()
         chat_font_path = _find_font_path(resolved_ui_dir)
         seq_seed = int(zlib.crc32(seq_name.encode("utf-8")) & 0xFFFFFFFF)
         chat_aug = SimpleChatOverlay(W, H, seed=seq_seed, num_lines=7, font_path=chat_font_path)
-        ui_aug = SimpleUIOverlay(W, H, seed=((seq_seed ^ 0xA5A5A5A5) & 0xFFFFFFFF), ui_dir=resolved_ui_dir,
                                  max_images=args.ui_max_images, show_prob=args.ui_show_prob)

         # --- BATCH SMPL (GPU) ---
         t0_smpl = time.perf_counter()
         smpl_precalc_data = []
         debug_global_verts_cpu = []
         parsed_rows = []
         for line in lines:
             row = json.loads(line)
             parsed_rows.append(parse_smpl_inputs_from_row(row, override_betas10, args.keep_unity_scale, args.transl_source, args.transl_y_offset_m))
         all_betas = np.stack([d['betas'] for d in parsed_rows])
         all_go = np.stack([d['global_orient'] for d in parsed_rows])
         all_bp = np.stack([d['body_pose'] for d in parsed_rows])
         all_pelvis0 = batch_smpl_forward(all_betas, all_go, all_bp, device=device)
         C = np.diag([1.0, -1.0, 1.0]).astype(np.float64)
-        world_fix_R, world_fix_R4, world_fix_R4_inv = None, None, None
         all_go_w, all_pelvis_pos_w_cv = [], []
         for i, d in enumerate(parsed_rows):
             R_cam_w_unity = R.from_quat(d['cam_rot_w_quat']).as_matrix()
-            R_cam_w_cv = C @ R_cam_w_unity @ C
             R_pelvis_c_cv = R.from_rotvec(d['global_orient'].astype(np.float64)).as_matrix()
-            R_pelvis_w_cv_raw = R_cam_w_cv @ R_pelvis_c_cv
-            if i == 0:
-                go_w0 = R.from_matrix(R_pelvis_w_cv_raw).as_rotvec().astype(np.float32)
-                model_gpu = _get_smplx_model(device)
-                with torch.no_grad():
-                    out_fix = model_gpu(betas=torch.from_numpy(d['betas'][None]).float().to(device),
-                                        global_orient=torch.from_numpy(go_w0[None]).float().to(device),
-                                        body_pose=torch.from_numpy(d['body_pose'][None]).float().to(device))
-                joints = out_fix.joints[0].detach().cpu().numpy().astype(np.float64)
-                pelvis_y = float(joints[0, 1])
-                head_y = float(joints[15, 1]) if joints.shape[0] > 15 else pelvis_y
-                if head_y < pelvis_y: world_fix_R = R.from_euler("x", 180, degrees=True).as_matrix().astype(np.float64)
-                else: world_fix_R = np.eye(3, dtype=np.float64)
-                world_fix_R4 = np.eye(4, dtype=np.float64)
-                world_fix_R4[:3, :3] = world_fix_R
-                world_fix_R4_inv = np.eye(4, dtype=np.float64); world_fix_R4_inv[:3, :3] = world_fix_R.T
-
-            R_pelvis_w_cv = world_fix_R @ R_pelvis_w_cv_raw
             all_go_w.append(R.from_matrix(R_pelvis_w_cv).as_rotvec().astype(np.float32))

             pelvis_pos_w_unity = d['pelvis_pos_world']
             root_pos_w_unity = (R_cam_w_unity @ d['root_cam_unity'] + d['cam_pos_world']).reshape(3)
             smpl_scale = d['smpl_scale']
             transl_source_local = str(args.transl_source).strip().lower()
             if transl_source_local == "root": target_pos_w_unity = root_pos_w_unity
             else:
                 if bool(args.keep_unity_scale): target_pos_w_unity = pelvis_pos_w_unity
                 else:
                     if abs(smpl_scale) > 1e-8: target_pos_w_unity = root_pos_w_unity + (pelvis_pos_w_unity - root_pos_w_unity) / smpl_scale
                     else: target_pos_w_unity = pelvis_pos_w_unity
-            pelvis_pos_w_cv = (C @ target_pos_w_unity).astype(np.float64)
-            pelvis_pos_w_cv = (world_fix_R @ pelvis_pos_w_cv.reshape(3, 1)).reshape(3)
-            all_pelvis_pos_w_cv.append(pelvis_pos_w_cv)

-        all_go_w = np.stack(all_go_w)
-        all_pelvis0_w = batch_smpl_forward(all_betas, all_go_w, all_bp, device=device)

-        for i in range(num_frames):
-            d = parsed_rows[i]
-            transl_c = (d['target_cam_cv'] - all_pelvis0[i]).astype(np.float32)
-            if str(args.transl_source) == "root": transl_w = all_pelvis_pos_w_cv[i].astype(np.float32)
             else: transl_w = (all_pelvis_pos_w_cv[i] - all_pelvis0_w[i]).astype(np.float32)
             smpl_precalc_data.append({
                 "go_c": d['global_orient'], "bp": d['body_pose'], "beta": d['betas'], "tr_c": transl_c,
-                "go_w": all_go_w[i], "tr_w": transl_w, "world_fix_R4": world_fix_R4, "world_fix_R4_inv": world_fix_R4_inv
             })
         prof["smpl_batch"] = time.perf_counter() - t0_smpl
         t0_gap = time.perf_counter()
         smpl_renderer = None
         vid_incam, vid_global = None, None
         debug_end_frame = min(num_frames, DEBUG_NUM_FRAMES)
         if args.debug:
             os.makedirs(os.path.join(args.output, "debug_renders"), exist_ok=True)
             if debug_end_frame > 0:
                 try:
-                    # REUSE parsed_rows!
                     K4_init = np.asarray(json.loads(lines[0])["cam_intrinsics"], dtype=np.float32)
                     smpl_renderer = SmplIncamRenderer(W, H, K4_init, device=device)
                     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
                     vid_incam = cv2.VideoWriter(os.path.join(args.output, "debug_renders", f"{seq_name}_incam.mp4"), fourcc, FPS, (W, H))
-                    vid_global = cv2.VideoWriter(os.path.join(args.output, "debug_renders", f"{seq_name}_global.mp4"), fourcc, FPS, (512, 512))
                 except: pass

         # --- MAIN LOOP ---
-        coco_subset = []
-        img_paths = []
-        cam_intrinsics = []
         cam_T_wc_cv_all, cam_T_w2c_cv_all = [], []
         dpvo_poses, dpvo_intrinsics = [], []
-        bboxes, bbx_xys_all, kp2d_all, K_fullimg_all = [], [], [], []
         global_orient_c_all, transl_c_all, body_pose_all, betas_all = [], [], [], []
         global_orient_w_all, transl_w_all = [], []
-        C4 = np.diag([1.0, -1.0, 1.0, 1.0]).astype(np.float64)

-        # ViT Batching Lists
-        vit_img_batch = []
-        all_vit_features = []

         ret, _ = cap.read()  # skip 0
         prof["prep"] = time.perf_counter() - t0_gap

         t_start_loop = time.perf_counter()
         for idx in tqdm(range(num_frames), desc="Frames", leave=False):
             t0_read = time.perf_counter()
             ret, img_bgr = cap.read()
             prof["video_read"] += (time.perf_counter() - t0_read)
             if not ret: break
             img_filename = f"img_{idx:05d}.jpg"
             img_abs_path = os.path.join(out_img_folder, img_filename)
-            # --- OVERLAY ---
             t0_ov = time.perf_counter()
             chat_aug.maybe_append(idx)
             chat_aug.draw(img_bgr)
             ui_aug.draw(img_bgr)
             prof["overlay"] += (time.perf_counter() - t0_ov)

-            # --- METADATA ---
-            row = json.loads(lines[idx])
             K4 = np.asarray(row["cam_intrinsics"], dtype=np.float32)
             kpts_raw = np.asarray(row["kpts_2d"], dtype=np.float32).reshape(-1, 2)[:17]
             vis_raw = np.asarray(row["kpts_vis"], dtype=np.int32)[:17]
-            if vis_raw.shape[0] >= 5: vis_raw[3] = 1; vis_raw[4] = 1
             bbox = clamp_bbox_xywh_to_image(row["bbox"], W, H)
             sd = smpl_precalc_data[idx]
             global_orient_c_all.append(sd['go_c'])
             transl_c_all.append(sd['tr_c'])
             global_orient_w_all.append(sd['go_w'])
             transl_w_all.append(sd['tr_w'])
             body_pose_all.append(sd['bp'])
             betas_all.append(sd['beta'])
             bboxes.append(np.asarray(bbox, dtype=np.float32))
             bbx_xys_all.append(bbox_xywh_to_bbx_xys(bbox))
             kp2d_all.append(np.concatenate([kpts_raw, (vis_raw > 0).astype(np.float32)[:, None]], axis=1))
             K_fullimg_all.append(k4_to_K3(K4))

             img_rel = os.path.join("images", seq_name, img_filename).replace("\\", "/")
             img_paths.append(img_rel)
-            p_w, q_w = np.asarray(row["cam_pos_world"], dtype=np.float32), np.asarray(row["cam_rot_world"], dtype=np.float32)
             cam_T_wc = build_T_wc(p_w, q_w)
-            T_cw = np.linalg.inv(cam_T_wc)
-
-            cam_T_wc_cv = (C4 @ cam_T_wc @ C4).astype(np.float32)
-            cam_T_w2c_cv = (C4 @ T_cw @ C4).astype(np.float32)
-            if sd['world_fix_R4'] is not None:
-                cam_T_wc_cv = (sd['world_fix_R4'] @ cam_T_wc_cv.astype(np.float64)).astype(np.float32)
-                cam_T_w2c_cv = (cam_T_w2c_cv.astype(np.float64) @ sd['world_fix_R4_inv']).astype(np.float32)
             cam_T_wc_cv_all.append(cam_T_wc_cv)
             cam_T_w2c_cv_all.append(cam_T_w2c_cv)
             dpvo_poses.append(f"{p_w[0]} {p_w[1]} {p_w[2]} {q_w[0]} {q_w[1]} {q_w[2]} {q_w[3]}")
             dpvo_intrinsics.append(K4.astype(np.float32))

-            # --- BRANCH A: Genmo (ViT Extraction) ---
             if args.genmo and vit_model is not None:
                 t0_vit = time.perf_counter()
-                # Process image in RAM (Crop/Resize/Norm)
-                img_tensor = _process_image_memory(img_bgr, bbox, img_size=256)  # Returns CHW numpy
                 vit_img_batch.append(img_tensor)

                 if len(vit_img_batch) >= args.vit_batch_size:
                     batch_np = np.stack(vit_img_batch)
                     batch_t = torch.from_numpy(batch_np).to(device, non_blocking=True)
                     with torch.inference_mode():
                         with torch.amp.autocast("cuda"):
                             feats = vit_model({"img": batch_t})
                     all_vit_features.append(feats.detach().cpu())
                     vit_img_batch = []
                 prof["vit_process"] += (time.perf_counter() - t0_vit)

-            # --- BRANCH B: VitPose (Sparse Write) ---
             if args.vitpose and (idx in selected_set):
                 t0_wr = time.perf_counter()
-                # Use faster write if possible, 90 quality
                 cv2.imwrite(img_abs_path, img_bgr, [int(cv2.IMWRITE_JPEG_QUALITY), 90])
                 kpts_coco = []
                 for k in range(17): kpts_coco.extend([float(kpts_raw[k, 0]), float(kpts_raw[k, 1]), int(vis_raw[k])])
                 coco_subset.append(({"file_name": img_rel, "width": W, "height": H},
                                     {"category_id": 1, "bbox": bbox, "area": float(bbox[2]*bbox[3]), "iscrowd": 0, "keypoints": kpts_coco, "num_keypoints": int(np.sum(vis_raw > 0))}))
                 prof["sparse_write"] += (time.perf_counter() - t0_wr)

-            # --- BRANCH C: Debug Video ---
             if args.debug and idx < debug_end_frame and smpl_renderer:
                 t0_dbg = time.perf_counter()
                 dbg = img_bgr.copy()
                 try: draw_bbox_xywh_and_center(dbg, bbox)
                 except: pass
                 try:
                     rgb = smpl_renderer.render(dbg[:, :, ::-1].copy(), sd['go_c'], sd['bp'], sd['beta'], sd['tr_c'], K4[:2], K4[2:])
                     dbg = rgb[:, :, ::-1].copy()
                 except: pass
                 if not args.debug_no_coco:
                     draw_vis_text_and_points(dbg, kpts_raw, vis_raw)
                 if vid_incam: vid_incam.write(dbg)
                 if vid_global:
                     verts_w = smpl_renderer.get_verts(sd['go_w'], sd['bp'], sd['beta'], sd['tr_w']).float()
                     debug_global_verts_cpu.append(verts_w.detach().cpu())
                 prof["debug_rend"] += (time.perf_counter() - t0_dbg)

-        # Flush remaining ViT batch
         if args.genmo and len(vit_img_batch) > 0 and vit_model is not None:
             t0_vit = time.perf_counter()
             batch_np = np.stack(vit_img_batch)
             batch_t = torch.from_numpy(batch_np).to(device, non_blocking=True)
             with torch.inference_mode():
                 with torch.amp.autocast("cuda"):
                     feats = vit_model({"img": batch_t})
             all_vit_features.append(feats.detach().cpu())
             prof["vit_process"] += (time.perf_counter() - t0_vit)

         prof["loop_total"] = time.perf_counter() - t_start_loop
         cap.release()
         if vid_incam: vid_incam.release()
-        # --- GLOBAL RENDER POST-LOOP ---
         t0_dbg = time.perf_counter()
         if vid_global and len(debug_global_verts_cpu) > 0:
             try:
-                from hmr4d.utils.vis.renderer import Renderer, get_global_cameras_static, get_ground_params_from_points
                 from hmr4d.utils.geo.hmr_cam import create_camera_sensor
-                _, _, K_global = create_camera_sensor(512, 512, 24)
-                global_renderer = Renderer(512, 512, device=device, faces=smpl_renderer.faces, K=K_global.to(device), bin_size=0)
                 verts_seq = torch.stack(debug_global_verts_cpu, dim=0)
                 off = verts_seq[0].mean(0); off[1] = verts_seq[0, :, 1].min()
                 verts_seq = verts_seq - off
-                g_R, g_T, g_L = get_global_cameras_static(verts_seq, beta=2.0, cam_height_degree=20, target_center_height=1.0, device=device)
                 if global_J_reg is not None and verts_seq.shape[1] == global_J_reg.shape[-1]:
-                    roots = torch.einsum("jv,fvk->fjk", global_J_reg.cpu(), verts_seq)[:, 0]
-                else: roots = verts_seq.mean(1)
                 sc, cx, cz = get_ground_params_from_points(roots, verts_seq)
                 global_renderer.set_ground(sc * 1.5, cx, cz)
                 col = torch.tensor([[0.0, 1.0, 0.0]], device=device)
                 for i in range(len(verts_seq)):
                     cam = global_renderer.create_camera(g_R[i], g_T[i])
                     img = global_renderer.render_with_ground(verts_seq[i].to(device)[None], col, cam, g_L)
-                    vid_global.write(img[:, :, ::-1].copy())
             except: pass
             vid_global.release()
         prof["debug_rend"] += (time.perf_counter() - t0_dbg)
         t0_save = time.perf_counter()
         if args.genmo:
             trans_w = np.stack(transl_w_all).astype(np.float32)
             world_off = trans_w[0].copy(); world_off[1] -= float(args.world_y_offset_m)
             trans_w_centered = trans_w - world_off[None]
             mats_w2c = np.stack(cam_T_w2c_cv_all).astype(np.float32)
             mats_wc = np.stack(cam_T_wc_cv_all).astype(np.float32)
             T_wp_w = np.eye(4, dtype=np.float32); T_wp_w[:3, 3] = world_off
             T_w_wp = np.eye(4, dtype=np.float32); T_w_wp[:3, 3] = -world_off
             mats_w2c_c = np.matmul(mats_w2c, T_wp_w[None])
             mats_wc_c = np.matmul(T_w_wp[None], mats_wc)
             cam_av, cam_tv = compute_velocity(mats_wc_c, fps=FPS)
-            # CONCAT FEATURES
             f_imgseq = torch.cat(all_vit_features, dim=0).float() if all_vit_features else torch.empty(0)

             g_dict = {
                 "smpl_params_c": {"global_orient": torch.from_numpy(np.stack(global_orient_c_all)), "body_pose": torch.from_numpy(np.stack(body_pose_all)), "transl": torch.from_numpy(np.stack(transl_c_all)), "betas": torch.from_numpy(np.stack(betas_all))},
                 "smpl_params_w": {"global_orient": torch.from_numpy(np.stack(global_orient_w_all)), "body_pose": torch.from_numpy(np.stack(body_pose_all)), "transl": torch.from_numpy(trans_w_centered), "betas": torch.from_numpy(np.stack(betas_all))},
                 "T_w2c": torch.from_numpy(mats_w2c_c), "K_fullimg": torch.from_numpy(np.stack(K_fullimg_all)),
                 "kp2d": torch.from_numpy(np.stack(kp2d_all)), "bbx_xys": torch.from_numpy(np.stack(bbx_xys_all)),
                 "cam_angvel": torch.from_numpy(cam_av), "cam_tvel": torch.from_numpy(cam_tv),
                 "imgname": img_paths, "valid_mask": torch.ones(len(img_paths), dtype=torch.float32),
                 "world_offset": torch.from_numpy(world_off.astype(np.float32)),
-                "f_imgseq": f_imgseq  # <--- ADDED FEATURES HERE
             }
             torch.save(g_dict, genmo_out)

         if args.smplx:
             poses66 = np.concatenate([np.stack(global_orient_w_all), np.stack(body_pose_all)], axis=1)
             poses165 = np.pad(poses66, ((0,0),(0,99)), mode="constant").astype(np.float32)
             trans_w = np.stack(transl_w_all).astype(np.float32)
             world_off = trans_w[0].copy(); world_off[1] -= float(args.world_y_offset_m)
             trans_w = trans_w - world_off[None]
             np.savez(smplx_global_out, mocap_framerate=int(FPS), gender="neutral", betas=betas_all[0], trans=trans_w, poses=poses165, world_offset=world_off)

         if args.vitpose and coco_subset:
             with open(os.path.join(temp_ann_dir, f"{seq_name}.json"), "w") as f: json.dump(coco_subset, f)
         prof["save_files"] = time.perf_counter() - t0_save
         total_t = time.perf_counter() - t_start_seq
         print(f"  > Done in {total_t:.2f}s | FPS: {num_frames/total_t:.1f}")
         print(f"    [Breakdown] BatchPrep: {prof['smpl_batch']:.2f}s | Init/Gap: {prof['prep']:.2f}s | Read: {prof['video_read']:.2f}s")
         print(f"    Overlay: {prof['overlay']:.2f}s | SparseWrite: {prof['sparse_write']:.2f}s | ViT: {prof['vit_process']:.2f}s")
         print(f"    DbgRend: {prof['debug_rend']:.2f}s | SaveFiles: {prof['save_files']:.2f}s")

     print("All sequences processed.")

 if __name__ == "__main__":
-    main()
 
 
1
  import sys
2
+
3
  import os
4
+
5
  import json
6
+
7
  import argparse
8
+
9
  import numpy as np
10
+
11
  import zlib
12
+
13
  from glob import glob
14
+
15
  from tqdm import tqdm
16
+
17
  import cv2
18
+
19
  import torch
20
+
21
  from scipy.spatial.transform import Rotation as R
22
+
23
  import time
24
+
25
  import shutil
26
+
27
  from pathlib import Path
28
 
29
+
30
+
31
  # --- SETUP PATHS FOR IMPORTS ---
32
+
33
+ REPO_ROOT = Path(__file__).resolve().parents[2]
34
+
35
  if str(REPO_ROOT) not in sys.path:
36
+
37
  sys.path.insert(0, str(REPO_ROOT))
38
+
39
+
40
+
41
+ from hmr4d.utils.preproc.vitfeat_extractor import Extractor
42
+
43
+ from hmr4d.utils.pylogger import Log
44
+
45
+
46
+
47
+ # Force single thread
48
+
 
 
 
 
49
  os.environ["OMP_NUM_THREADS"] = "1"
50
+
51
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
52
+
53
  cv2.setNumThreads(0)
54
+
55
  torch.set_num_threads(1)
56
 
57
+
58
+
59
  FPS = 30.0
60
+
61
+ DEBUG_NUM_FRAMES = 5
62
+
63
  IMAGENET_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
64
+
65
  IMAGENET_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32)
66
 
67
+
68
+
69
+ # --- HELPER FUNCTIONS (No Changes) ---
70
 
71
  def _process_image_memory(img_bgr, bbox_xywh, img_size=256):
72
+
73
+ if img_bgr is None: return np.zeros((3, img_size, img_size), dtype=np.float32)
74
 
75
  x, y, w, h = bbox_xywh
76
+
77
  cx, cy = x + w/2, y + h/2
78
+
79
+ scale = max(w, h) * 1.2
80
+
81
  H, W = img_bgr.shape[:2]
82
+
83
  max_side = float(max(H, W, 1))
84
 
85
+ if scale <= 1.0 or scale > max_side * 20.0: scale = max_side * 0.5
86
+
 
 
87
  half = scale / 2.0
88
+
89
  x0, y0 = int(cx - half), int(cy - half)
90
+
91
  x1, y1 = int(cx + half), int(cy + half)
92
 
93
  pad_l, pad_t = max(0, -x0), max(0, -y0)
94
+
95
  pad_r, pad_b = max(0, x1 - W), max(0, y1 - H)
96
 
97
+ if max(pad_l, pad_t, pad_r, pad_b) > int(max_side * 4.0): return np.zeros((3, img_size, img_size), dtype=np.float32)
 
 
98
 
99
  if pad_l or pad_t or pad_r or pad_b:
100
+
101
  img_bgr = cv2.copyMakeBorder(img_bgr, pad_t, pad_b, pad_l, pad_r, cv2.BORDER_CONSTANT, value=(0,0,0))
102
+
103
  x0 += pad_l; y0 += pad_t; x1 += pad_l; y1 += pad_t
104
 
105
  crop = img_bgr[y0:y1, x0:x1]
106
+
107
+ if crop.size == 0: return np.zeros((3, img_size, img_size), dtype=np.float32)
108
+
 
109
  if crop.shape[0] != img_size or crop.shape[1] != img_size:
110
+
111
  crop = cv2.resize(crop, (img_size, img_size), interpolation=cv2.INTER_LINEAR)
112
 
113
+ crop = crop[:, :, ::-1].astype(np.float32) / 255.0
114
+
115
  crop = (crop - IMAGENET_MEAN) / IMAGENET_STD
116
+
117
+ return crop.transpose(2, 0, 1)
118
+
119
+
120
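
A minimal usage sketch for _process_image_memory, with purely illustrative values (the frame and box below are hypothetical; the helper itself is defined above):

    import numpy as np

    frame = np.zeros((720, 1280, 3), dtype=np.uint8)   # stand-in BGR frame
    bbox_xywh = (500.0, 120.0, 180.0, 420.0)           # x, y, w, h in pixels
    chw = _process_image_memory(frame, bbox_xywh, img_size=256)
    assert chw.shape == (3, 256, 256)                  # CHW float32, ImageNet-normalized
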
 
121
  def _alpha_blend_bgra_onto_bgr(dst_bgr, src_bgra, x, y):
122
+
123
  if dst_bgr is None or src_bgra is None: return dst_bgr
124
+
125
  H, W = dst_bgr.shape[:2]
126
+
127
  h, w = src_bgra.shape[:2]
128
+
129
  if w <= 0 or h <= 0: return dst_bgr
130
+
131
  x0, y0 = max(int(x), 0), max(int(y), 0)
132
+
133
  x1, y1 = min(int(x + w), W), min(int(y + h), H)
134
+
135
  if x1 <= x0 or y1 <= y0: return dst_bgr
136
+
137
  roi = dst_bgr[y0:y1, x0:x1]
138
+
139
  src_crop = src_bgra[(y0 - int(y)):(y0 - int(y)) + (y1 - y0), (x0 - int(x)):(x0 - int(x)) + (x1 - x0)]
140
+
141
  if src_crop.shape[2] == 3:
142
+
143
  roi[:] = src_crop
144
+
145
  return dst_bgr
146
 
147
  alpha = src_crop[:, :, 3].astype(np.uint16)
148
+
149
  inv_alpha = 255 - alpha
150
+
151
  b_src, g_src, r_src = src_crop[:, :, 0], src_crop[:, :, 1], src_crop[:, :, 2]
152
+
153
  b_dst, g_dst, r_dst = roi[:, :, 0], roi[:, :, 1], roi[:, :, 2]
154
 
155
  roi[:, :, 0] = ((b_src * alpha + b_dst * inv_alpha) >> 8).astype(np.uint8)
156
+
157
  roi[:, :, 1] = ((g_src * alpha + g_dst * inv_alpha) >> 8).astype(np.uint8)
158
+
159
  roi[:, :, 2] = ((r_src * alpha + r_dst * inv_alpha) >> 8).astype(np.uint8)
160
+
161
  return dst_bgr
162
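
The `>> 8` above trades exactness for speed: it divides by 256 instead of 255, darkening blends by at most one intensity level. A quick check with plain integers:

    src, dst, alpha = 200, 50, 255                      # illustrative 8-bit values
    exact = (src * alpha + dst * (255 - alpha)) // 255  # 200
    fast = (src * alpha + dst * (255 - alpha)) >> 8     # 199
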
 
163
+
164
+
165
  def _find_ui_dir():
166
+
167
  cand = os.path.join(os.getcwd(), "UI")
168
+
169
  if os.path.isdir(cand): return cand
170
+
171
+ return None # Simplified for brevity
172
+
173
+
174
 
175
  def _find_font_path(ui_dir, filename="Inter_18pt-Bold.ttf"):
176
+
177
  if not ui_dir: return None
178
+
179
  p = os.path.join(ui_dir, filename)
180
+
181
  return p if os.path.isfile(p) else None
182
 
183
+
184
+
185
  def _load_ui_images(ui_dir):
186
+
187
  if not ui_dir or (not os.path.isdir(ui_dir)): return []
188
+
189
  imgs = []
190
+
191
  for name in sorted(os.listdir(ui_dir)):
192
+
193
  p = os.path.join(ui_dir, name)
194
+
195
  if not os.path.isfile(p): continue
196
+
197
  if name.lower().endswith(('.png', '.jpg', '.jpeg', '.webp')):
198
+
199
  im = cv2.imread(p, cv2.IMREAD_UNCHANGED)
200
+
201
  if im is not None:
202
+
203
  if im.ndim == 2: im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)
204
+
205
  imgs.append(im)
206
+
207
  return imgs
208
 
209
+
210
+
211
  class SimpleUIOverlay:
212
+
213
  def __init__(self, width, height, seed=0, ui_dir=None, max_images=4, show_prob=0.6, min_hold_frames=20, max_hold_frames=120):
214
+
215
  self.W, self.H = int(width), int(height)
216
+
217
  self.rng = np.random.default_rng(int(seed))
218
+
219
  self.max_images = max(0, int(max_images))
220
+
221
  self.show_prob = float(show_prob)
222
+
223
  self.min_hold_frames, self.max_hold_frames = max(1, int(min_hold_frames)), max(1, int(max_hold_frames))
224
+
225
  self.ui_dir = ui_dir if ui_dir else _find_ui_dir()
226
+
227
  self.assets = _load_ui_images(self.ui_dir)
228
+
229
  self._ttl, self._active = 0, []
230
 
231
  def _pick_new_state(self):
232
+
233
  self._ttl = int(self.rng.integers(self.min_hold_frames, self.max_hold_frames + 1))
234
+
235
  self._active = []
236
+
237
  if (not self.assets) or (self.max_images <= 0): return
238
+
239
  if float(self.rng.random()) > self.show_prob: return
240
+
241
  k = min(int(self.rng.integers(1, self.max_images + 1)), len(self.assets))
242
+
243
  idxs = self.rng.choice(len(self.assets), size=k, replace=False)
244
+
245
  for idx in idxs:
246
+
247
  im = self.assets[int(idx)]
248
+
249
  h, w = im.shape[:2]
250
+
251
  if w > 0 and h > 0:
252
+
253
  x = int(self.rng.integers(-w // 4, max(1, self.W - (3 * w // 4))))
254
+
255
  y = int(self.rng.integers(-h // 4, max(1, self.H - (3 * h // 4))))
256
+
257
  self._active.append((im, x, y))
258
 
259
  def draw(self, img_bgr):
260
+
261
  if img_bgr is None: return img_bgr
262
+
263
  if self._ttl <= 0: self._pick_new_state()
264
+
265
  self._ttl -= 1
266
+
267
+ for im, x, y in self._active: _alpha_blend_bgra_onto_bgr(img_bgr, im, x, y)
268
+
269
  return img_bgr
270
 
271
+
272
+
273
  class SimpleChatOverlay:
274
+
275
  def __init__(self, width, height, seed=0, num_lines=7, region_w=420, region_h=180, margin=18, every_n_frames=15, corner=None, font_path=None):
276
+
277
  from collections import deque
278
+
279
  self.W, self.H = int(width), int(height)
280
+
281
  self.rng = np.random.default_rng(int(seed))
282
+
283
  self.num_lines, self.margin, self.every_n_frames = int(num_lines), int(margin), max(1, int(every_n_frames))
284
+
285
  self.region_w, self.region_h = int(region_w), int(region_h)
286
+
287
  self.font_path = font_path
288
+
289
  self._pil_fonts = {}
290
+
291
  self.corner = str(corner) if corner else str(self.rng.choice(["tl", "tr", "bl", "br"]))
292
+
293
  self.messages = deque(maxlen=self.num_lines)
294
+
295
  for _ in range(self.num_lines): self.messages.append(self._random_message())
296
+
297
  self._cached_overlay, self._dirty = None, True
298
 
299
  def _random_message(self):
300
+
301
  user = str(self.rng.choice(["nightbot", "viewer", "catjam", "shadow", "speedrunner", "chattycathy", "kappaking"]))
302
+
303
  if self.rng.random() < 0.5: user += str(self.rng.integers(10, 999))
304
+
305
  text = str(self.rng.choice(["pog", "lol", "gg", "nice", "W", "L", "no shot", "crazy", "clip it", "cooking", "unlucky"]))
306
+
307
  color = tuple(int(x) for x in self.rng.choice([(255, 120, 0), (0, 180, 255), (255, 0, 180), (0, 255, 120)]))
308
+
309
  return {"user": user, "text": text, "color": color}
310
 
311
  def _get_pil_font(self, size_px):
312
+
313
  if not self.font_path: return None
314
+
315
  if size_px in self._pil_fonts: return self._pil_fonts[size_px]
316
+
317
  try:
318
+
319
  from PIL import ImageFont
320
+
321
+ font = ImageFont.truetype(self.font_path, size=max(1, size_px)); self._pil_fonts[size_px] = font; return font  # store in the cache so the size lookup above can actually hit
322
+
323
  except: return None
324
 
325
  def maybe_append(self, frame_idx):
326
+
327
  if int(frame_idx) % self.every_n_frames == 0:
328
+
329
  self.messages.append(self._random_message())
330
+
331
  self._dirty = True
332
 
333
  def _render_cache(self):
334
+
335
  rw = min(self.region_w, max(40, self.W - 2 * self.margin))
336
+
337
  rh = min(self.region_h, max(40, self.H - 2 * self.margin))
338
+
339
  pil_font = self._get_pil_font(int(round(np.clip(20.0 * (self.H / 720.0), 14.0, 30.0))))
340
+
341
  if pil_font is None:
342
+
343
+ self._cached_overlay = None; return
344
+
345
  try:
346
+
347
  from PIL import Image, ImageDraw
348
+
349
  pil = Image.new("RGBA", (rw, rh), (0, 0, 0, 0))
350
+
351
  draw = ImageDraw.Draw(pil)
352
+
353
+ line_h = max(14, int(round(float(getattr(pil_font, "size", 18)) * 1.25)))
354
+
355
  lines = list(self.messages)[-min(self.num_lines, max(1, rh // line_h)):]
356
+
357
  local_y = rh - line_h if self.corner in ("bl", "br") else 0
358
+
359
  for msg in lines:
360
+
361
  user = f"{msg['user']}: "
362
+
363
+ draw.text((0, local_y), user, font=pil_font, fill=tuple(msg['color'][::-1]))
364
+
365
  tw = draw.textlength(user, font=pil_font)
366
+
367
  draw.text((tw, local_y), msg['text'], font=pil_font, fill=(240, 240, 240))
368
+
369
  local_y += (-line_h if self.corner in ("bl", "br") else line_h)
370
+
371
  self._cached_overlay = cv2.cvtColor(np.asarray(pil), cv2.COLOR_RGBA2BGRA)
372
+
373
  except: self._cached_overlay = None
374
 
375
  def draw(self, img_bgr):
376
+
377
  if img_bgr is None: return img_bgr
378
+
379
+ if self._dirty: self._render_cache(); self._dirty = False
380
+
381
  if self._cached_overlay is not None:
382
+
383
  rw = min(self.region_w, max(40, self.W - 2 * self.margin))
384
+
385
  rh = min(self.region_h, max(40, self.H - 2 * self.margin))
386
+
387
  if self.corner == "tl": x, y = self.margin, self.margin
388
+
389
  elif self.corner == "tr": x, y = self.W - self.margin - rw, self.margin
390
+
391
  elif self.corner == "bl": x, y = self.margin, self.H - self.margin - rh
392
+
393
  else: x, y = self.W - self.margin - rw, self.H - self.margin - rh
394
+
395
  _alpha_blend_bgra_onto_bgr(img_bgr, self._cached_overlay, x, y)
396
+
397
  return img_bgr
398
 
399
+
400
+
401
+ def k4_to_K3(k4): return np.array([[k4[0], 0, k4[2]], [0, k4[1], k4[3]], [0, 0, 1]], dtype=np.float32)
402
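
k4_to_K3 assumes cam_intrinsics is ordered [fx, fy, cx, cy]; for example:

    K = k4_to_K3([600.0, 600.0, 320.0, 180.0])
    # -> [[600.,   0., 320.],
    #     [  0., 600., 180.],
    #     [  0.,   0.,   1.]]
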
 
403
  def bbox_xywh_to_bbx_xys(bbox_xywh, base_enlarge=1.0):
404
+
405
  x, y, w, h = [float(v) for v in bbox_xywh]
406
+
407
  return np.array([x + 0.5 * w, y + 0.5 * h, max(w, h) * float(base_enlarge)], dtype=np.float32)
408
 
409
  def clamp_bbox_xywh_to_image(bbox_xywh, W, H, min_size=1.0):
410
+
411
  x, y, w, h = [float(v) for v in bbox_xywh]
412
+
413
  W, H = float(W), float(H)
414
+
415
  if W <= 0 or H <= 0: return [0.0, 0.0, 0.0, 0.0]
416
+
417
  x2, y2 = x + w, y + h
418
+
419
  x1c = float(np.clip(x, 0.0, max(0.0, W - 1.0)))
420
+
421
  y1c = float(np.clip(y, 0.0, max(0.0, H - 1.0)))
422
+
423
  x2c = float(np.clip(x2, 0.0, W))
424
+
425
  y2c = float(np.clip(y2, 0.0, H))
426
+
427
  if x2c <= x1c: x2c = min(W, x1c + float(min_size))
428
+
429
  if y2c <= y1c: y2c = min(H, y1c + float(min_size))
430
+
431
  wc = max(0.0, x2c - x1c)
432
+
433
  hc = max(0.0, y2c - y1c)
434
+
435
  return [x1c, y1c, wc, hc]
436
 
437
  def draw_bbox_xywh_and_center(img_bgr, bbox_xywh, color=(255, 255, 0)):
438
+
439
  x, y, w, h = [float(v) for v in bbox_xywh]
440
+
441
  cv2.rectangle(img_bgr, (int(x), int(y)), (int(x+w), int(y+h)), color, 2)
442
+
443
  cv2.circle(img_bgr, (int(x+w/2), int(y+h/2)), 4, (0, 0, 255), -1)
444
 
445
  def vis_label_and_color(v: int):
446
+
447
  if v == 2: return "VIS", (0, 255, 0)
448
+
449
  if v == 1: return "OCC", (0, 165, 255)
450
+
451
  return "OFF", (160, 160, 160)
452
 
453
  def draw_vis_text_and_points(img_bgr, kpts2d_xy, vis17):
454
+
455
  for k in range(17):
456
+
457
  v = int(vis17[k])
458
+
459
  label, color = vis_label_and_color(v)
460
+
461
  x, y = int(round(kpts2d_xy[k, 0])), int(round(kpts2d_xy[k, 1]))
462
+
463
  if v > 0: cv2.circle(img_bgr, (x, y), 4, color, -1)
464
+
465
  cv2.putText(img_bgr, f"{k}:{label}", (x + 6, y - 6), cv2.FONT_HERSHEY_SIMPLEX, 0.45, color, 1, cv2.LINE_AA)
466
 
467
  def build_T_wc(pos_world, quat_world_xyzw):
468
+
469
  T = np.eye(4, dtype=np.float64)
470
+
471
  T[:3, :3] = R.from_quat(np.asarray(quat_world_xyzw, dtype=np.float64)).as_matrix()
472
+
473
  T[:3, 3] = np.asarray(pos_world, dtype=np.float64)
474
+
475
  return T
476
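
A small sanity sketch for build_T_wc (values illustrative); scipy quaternions are xyzw, matching the quat_world_xyzw argument name:

    import numpy as np
    from scipy.spatial.transform import Rotation as R

    q_xyzw = R.from_euler("y", 90, degrees=True).as_quat()
    T = build_T_wc([1.0, 2.0, 3.0], q_xyzw)
    assert np.allclose(T[:3, 3], [1.0, 2.0, 3.0])            # translation in last column
    assert np.allclose(T[:3, :3] @ T[:3, :3].T, np.eye(3))   # rotation block is orthonormal
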
 
477
  def compute_velocity(mats, fps=30.0):
478
+
479
  N = len(mats)
480
+
481
  if N < 2: return np.zeros((N, 3), dtype=np.float32), np.zeros((N, 3), dtype=np.float32)
482
+
483
  R_curr = mats[:, :3, :3]
484
+
485
  R_diff = np.matmul(R_curr[1:], np.transpose(R_curr[:-1], (0, 2, 1)))
486
+
487
  rv = R.from_matrix(R_diff).as_rotvec()
488
+
489
  angvel = np.zeros((N, 3), dtype=np.float32)
490
+
491
  angvel[1:] = rv
492
+
493
  t_curr = mats[:, :3, 3]
494
+
495
  tvel = np.zeros((N, 3), dtype=np.float32)
496
+
497
  tvel[1:] = t_curr[1:] - t_curr[:-1]
498
+
499
  return angvel.astype(np.float32), tvel.astype(np.float32)
500
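
Worth noting: `fps` is accepted but never used in the body, so both outputs are per-frame deltas rather than per-second rates. An illustrative check with a camera yawing one degree per frame:

    import numpy as np
    from scipy.spatial.transform import Rotation as R

    N = 5
    mats = np.tile(np.eye(4), (N, 1, 1))
    for i in range(N):
        mats[i, :3, :3] = R.from_euler("y", i * 1.0, degrees=True).as_matrix()
    angvel, tvel = compute_velocity(mats, fps=30.0)
    print(np.degrees(np.linalg.norm(angvel[1:], axis=1)))    # ~[1. 1. 1. 1.]
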
 
501
+
502
+
503
  def _compute_vitpose_selected_indices(num_frames, fps, bucket_seconds, frames_per_bucket, sampling="uniform", seed=123):
504
+
505
  if num_frames <= 0: return []
506
+
507
  rng = np.random.default_rng(int(seed))
508
+
509
  selected = []
510
+
511
  bucket_len = max(1, int(round(float(bucket_seconds) * float(fps))))
512
+
513
  b_start = 0
514
+
515
  while b_start < num_frames:
516
+
517
  b_end = min(num_frames, b_start + bucket_len)
518
+
519
  k = min(int(frames_per_bucket), b_end - b_start)
520
+
521
  if k > 0:
522
+
523
+ if sampling == "random": idxs = np.sort(rng.choice(np.arange(b_start, b_end), size=k, replace=False)).tolist()
524
+
525
+ elif sampling == "linspace": idxs = sorted(list(set(np.linspace(b_start, b_end - 1, k, dtype=int).tolist())))
526
+
527
+ else:
528
+
529
  if k == 1: idxs = [b_start + (b_end - b_start) // 2]
530
+
531
+ else: step = (b_end - b_start) // k; idxs = [min(b_start + i * step, b_end - 1) for i in range(k)]
532
+
533
  selected.extend(idxs)
534
+
535
  b_start = b_end
536
+
537
  return sorted(list(set(selected)))
538
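
An illustrative call: 90 frames at 30 fps with 1-second buckets and 2 frames per bucket; the evenly strided "uniform" branch yields:

    idxs = _compute_vitpose_selected_indices(
        num_frames=90, fps=30.0, bucket_seconds=1.0,
        frames_per_bucket=2, sampling="uniform", seed=123,
    )
    # idxs == [0, 15, 30, 45, 60, 75]
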
 
539
+
540
+
541
  _SMPLX_MODEL = None
542
+
543
  _SMPLX_DEVICE = None
544
 
545
  def _get_smplx_model(device):
546
+
547
  global _SMPLX_MODEL, _SMPLX_DEVICE
548
+
549
+ if _SMPLX_MODEL is not None and _SMPLX_DEVICE == device: return _SMPLX_MODEL
550
+
551
  from hmr4d.utils.smplx_utils import make_smplx
552
+
553
  _SMPLX_MODEL = make_smplx("supermotion").to(device).eval()
554
+
555
  _SMPLX_DEVICE = device
556
+
557
  return _SMPLX_MODEL
558
 
559
+
560
+
561
  class SmplIncamRenderer:
562
+
563
  def __init__(self, width, height, K4, device="cuda", smplx2smpl_path="hmr4d/utils/body_model/smplx2smpl_sparse.pt"):
564
+
565
  from hmr4d.utils.smplx_utils import make_smplx
566
+
567
  from hmr4d.utils.vis.renderer import Renderer
568
+
569
  self.torch = torch
570
+
571
  self.device = device
572
+
573
  self.smplx = make_smplx("supermotion").to(device).eval()
574
+
575
+ self.smplx2smpl = None; self.faces = None
576
+
577
  try:
578
+
579
  self.smplx2smpl = torch.load(smplx2smpl_path).to(device)
580
+
581
  self.faces = make_smplx("smpl").faces
582
+
583
  except: self.faces = self.smplx.faces
584
+
585
  self.K_torch = torch.from_numpy(k4_to_K3(K4)).to(device)
586
+
587
  self.renderer = Renderer(width, height, device=device, faces=self.faces, K=self.K_torch)
588
 
589
  @torch.no_grad()
590
+
591
  def render(self, img_rgb_uint8, global_orient_aa, body_pose_aa, betas_10, transl_xyz, fl, pp):
592
+
593
  K3_torch = torch.from_numpy(np.array([[fl[0], 0, pp[0]], [0, fl[1], pp[1]], [0, 0, 1]], dtype=np.float32)).to(self.device)
594
+
595
  self.renderer.set_intrinsic(K3_torch)
596
+
597
+ params = { "global_orient": torch.from_numpy(global_orient_aa[None]).float().to(self.device), "body_pose": torch.from_numpy(body_pose_aa[None]).float().to(self.device), "betas": torch.from_numpy(betas_10[None]).float().to(self.device), "transl": torch.from_numpy(transl_xyz[None]).float().to(self.device), }
598
+
599
+ out = self.smplx(**params); verts = out.vertices[0]
600
+
601
  if self.smplx2smpl is not None and verts.dim() == 2: verts = torch.matmul(self.smplx2smpl, verts)
602
+
603
  img_out = self.renderer.render_mesh(verts, img_rgb_uint8, [0.8, 0.8, 0.8])
604
+
605
  return img_out
606
+
607
  @torch.no_grad()
608
+
609
  def get_verts(self, global_orient_aa, body_pose_aa, betas_10, transl_xyz):
610
+
611
+ params = { "global_orient": torch.from_numpy(global_orient_aa[None]).float().to(self.device), "body_pose": torch.from_numpy(body_pose_aa[None]).float().to(self.device), "betas": torch.from_numpy(betas_10[None]).float().to(self.device), "transl": torch.from_numpy(transl_xyz[None]).float().to(self.device), }
612
+
613
+ out = self.smplx(**params); verts = out.vertices[0]
614
+
615
+ if self.smplx2smpl is not None and verts.dim() == 2: verts = torch.matmul(self.smplx2smpl, verts)
616
+
617
  return verts
618
 
619
+
620
+
621
  def _as_betas10(betas_any) -> np.ndarray:
622
+
623
  betas = np.asarray(betas_any, dtype=np.float32).reshape(-1)
624
+
625
+ betas10 = np.zeros(10, dtype=np.float32); n = min(10, betas.size)
626
+
627
  if n > 0: betas10[:n] = betas[:n]
628
+
629
  return betas10
630
 
631
  def load_betas10_from_npz(npz_path, key="betas", index=None):
632
+
633
+ with np.load(npz_path, allow_pickle=True) as data: arr = data[key]
634
+
635
  if arr.ndim == 0: arr = np.asarray(arr).reshape(1)
636
+
637
  if arr.ndim == 1: betas = arr
638
+
639
+ elif arr.ndim == 2: row_idx = 0 if index is None else int(index); betas = arr[row_idx]
640
+
641
  else: raise ValueError(f"Bad betas shape: {arr.shape}")
642
+
643
  return _as_betas10(betas)
644
 
645
+ def _default_shape_npz_path() -> str: return os.path.join(os.path.dirname(__file__), "shape.npz")
646
+
647
+
648
 
649
  def parse_smpl_inputs_from_row(row, override_betas10=None, keep_unity_scale=False, transl_source="pelvis", transl_y_offset_m=0.0):
650
+
651
  C = np.diag([1.0, -1.0, 1.0]).astype(np.float64)
652
+
653
  cam_rot_w_quat = np.array(row["cam_rot_world"], dtype=np.float64)
654
+
655
  R_cam_w = R.from_quat(cam_rot_w_quat).as_matrix()
656
+
657
  pel_rot_w_quat = np.array(row["pelvis_rot_world"], dtype=np.float64)
658
+
659
  R_pel_w = R.from_quat(pel_rot_w_quat).as_matrix()
660
+
661
+
662
+
663
+ # Relative Rotation (Body to Camera)
664
+
665
  R_rel_unity = R_cam_w.T @ R_pel_w
666
+
667
  R_cv = C @ R_rel_unity @ C
668
+
669
+ R_final = R_cv @ R.from_euler("z", 180, degrees=True).as_matrix()
670
+
671
  global_orient_aa = R.from_matrix(R_final).as_rotvec().astype(np.float32)
672
 
673
+
674
+
675
  smpl_scale = float(row.get("smpl_root_world_scale", 1.0))
676
+
677
  pelvis_cam_unity = np.asarray(row["smpl_incam_transl"], dtype=np.float64).reshape(3)
678
+
679
  root_cam_unity = np.asarray(row.get("smpl_root_incam_transl", [0.0, 0.0, 0.0]), dtype=np.float64).reshape(3)
680
+
681
  pelvis_cam_unity = pelvis_cam_unity + np.array([0.0, float(transl_y_offset_m), 0.0], dtype=np.float64)
682
 
683
+
684
+
685
  if str(transl_source).strip().lower() == "root": target_cam_unity = root_cam_unity
686
+
687
  else:
688
+
689
  if bool(keep_unity_scale): target_cam_unity = pelvis_cam_unity
690
+
691
  else:
692
+
693
  if abs(smpl_scale) > 1e-8: target_cam_unity = root_cam_unity + (pelvis_cam_unity - root_cam_unity) / smpl_scale
694
+
695
  else: target_cam_unity = pelvis_cam_unity
696
+
697
  target_cam_cv = (C @ target_cam_unity).astype(np.float64)
698
 
699
+
700
+
701
  pose = np.asarray(row["smplx_pose"], dtype=np.float32)
702
+
703
  body_pose = pose[3:66].astype(np.float32)
704
+
705
  betas10 = _as_betas10(override_betas10)
706
+
707
+
708
+
709
  return {
710
+
711
  "global_orient": global_orient_aa, "body_pose": body_pose, "betas": betas10,
712
+
713
  "target_cam_cv": target_cam_cv, "cam_rot_w_quat": cam_rot_w_quat,
714
+
715
  "cam_pos_world": np.asarray(row["cam_pos_world"], dtype=np.float64).reshape(3),
716
+
717
  "pelvis_pos_world": np.asarray(row["pelvis_pos_world"], dtype=np.float64).reshape(3),
718
+
719
  "smpl_scale": smpl_scale, "root_cam_unity": root_cam_unity
720
+
721
  }
722
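
A sketch of the change of basis used above, as I read it (not a spec): C = diag(1, -1, 1) flips the Y axis to carry Unity's left-handed, Y-up frame into an OpenCV-style frame, so rotations convert by conjugation while points convert by a single multiplication:

    import numpy as np
    from scipy.spatial.transform import Rotation as R

    C = np.diag([1.0, -1.0, 1.0])
    R_unity = R.from_euler("y", 30, degrees=True).as_matrix()
    p_unity = np.array([0.5, 1.7, 2.0])
    R_cv, p_cv = C @ R_unity @ C, C @ p_unity
    # Rotate-then-convert equals convert-then-rotate, since C @ C = I:
    assert np.allclose(C @ (R_unity @ p_unity), R_cv @ p_cv)
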
 
723
+
724
+
725
  def batch_smpl_forward(betas, global_orient, body_pose, device):
726
+
727
  model = _get_smplx_model(device)
728
+
729
  N = len(betas)
730
+
731
+ chunk_size = 4096; pelvis_list = []
732
+
733
  with torch.no_grad():
734
+
735
  for i in range(0, N, chunk_size):
736
+
737
  b_betas = torch.from_numpy(betas[i:i+chunk_size]).float().to(device)
738
+
739
  b_go = torch.from_numpy(global_orient[i:i+chunk_size]).float().to(device)
740
+
741
  b_bp = torch.from_numpy(body_pose[i:i+chunk_size]).float().to(device)
742
+
743
  b_tr = torch.zeros((len(b_betas), 3), dtype=torch.float32, device=device)
744
+
745
  out = model(betas=b_betas, global_orient=b_go, body_pose=b_bp, transl=b_tr)
746
+
747
  pelvis_list.append(out.joints[:, 0, :].detach().cpu().numpy())
748
+
749
  return np.concatenate(pelvis_list, axis=0)
750
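
These zero-transl pelvis positions are consumed later as offsets: the SMPL(-X) pelvis lands at pelvis0(betas, pose) + transl, so placing it at a target point means setting transl = p_target - pelvis0. A hedged paraphrase of the precalc below, with illustrative numbers:

    import numpy as np

    p_target = np.array([0.10, 0.90, 3.20], dtype=np.float32)   # desired pelvis position
    pelvis0 = np.array([0.00, -0.35, 0.01], dtype=np.float32)   # pelvis at transl = 0
    transl = p_target - pelvis0
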
 
751
+
752
+
753
  def main():
754
+
755
  parser = argparse.ArgumentParser()
756
+
757
  parser.add_argument("--input", required=True)
758
+
759
  parser.add_argument("--output", required=True)
760
+
761
  parser.add_argument("--debug", action="store_true")
762
+
763
  parser.add_argument("--vitpose", action="store_true")
764
+
765
+ parser.add_argument("--genmo", action="store_true")
766
+
767
  parser.add_argument("--dpvo", action="store_true")
768
+
769
  parser.add_argument("--smplx", action="store_true")
770
+
771
  parser.add_argument("--debug_no_coco", action="store_true")
772
+
773
  parser.add_argument("--shape_npz", default=_default_shape_npz_path())
774
+
775
  parser.add_argument("--vitpose_use_all_frames", action="store_true")
776
+
777
  parser.add_argument("--vitpose_bucket_seconds", type=float, default=12.0)
778
+
779
  parser.add_argument("--vitpose_frames_per_bucket", type=int, default=36)
780
+
781
  parser.add_argument("--vitpose_sampling", type=str, default="random")
782
+
783
  parser.add_argument("--vitpose_seed", type=int, default=123)
784
+
785
  parser.add_argument("--ui_dir", type=str, default=None)
786
+
787
  parser.add_argument("--ui_show_prob", type=float, default=0.25)
788
+
789
  parser.add_argument("--ui_max_images", type=int, default=3)
790
+
791
  parser.add_argument("--ui_hold_min_s", type=float, default=0.7)
792
+
793
  parser.add_argument("--ui_hold_max_s", type=float, default=5.0)
794
+
795
  parser.add_argument("--ui_seed", type=int, default=None)
796
+
797
  parser.add_argument("--keep_unity_scale", action="store_true")
798
+
799
  parser.add_argument("--transl_source", type=str, default="pelvis")
800
+
801
  parser.add_argument("--transl_y_offset_m", type=float, default=-0.020)
802
+
803
  parser.add_argument("--world_y_offset_m", type=float, default=1.3415)
804
+
805
  parser.add_argument("--vit_batch_size", type=int, default=512, help="Batch size for in-memory ViT extraction")
806
+
807
  args = parser.parse_args()
808
 
809
+
810
+
811
  if not (args.vitpose or args.genmo or args.dpvo or args.smplx):
812
+
813
  args.vitpose = args.genmo = args.dpvo = args.smplx = True
814
 
815
+
816
+
817
  device = "cuda" if torch.cuda.is_available() else "cpu"
818
+
819
  print(f"Running STREAMING processing on {device.upper()}...")
820
+
821
+
822
+
823
  vit_model = None
824
+
825
  if args.genmo and Extractor is not None:
826
+
827
  print("Initializing ViT Extractor (HMR2)...")
828
+
829
  extractor_wrapper = Extractor(tqdm_leave=False)
830
+
831
  vit_model = extractor_wrapper.extractor
832
+
833
  vit_model.eval()
834
+
835
  vit_model.to(device)
836
 
837
+
838
+
839
  override_betas10 = load_betas10_from_npz(args.shape_npz, key="betas")
840
+
841
  temp_ann_dir = os.path.join(args.output, "vitpose", "temp_annotations")
842
+
843
  os.makedirs(temp_ann_dir, exist_ok=True)
844
+
845
  jsonl_files = sorted(glob(os.path.join(args.input, "sequence_*.jsonl")))
846
+
847
+
848
+
849
  global_J_reg = None
850
+
851
  j_reg_path = "third_party/GVHMR/inputs/checkpoints/body_models/smpl_neutral_J_regressor.pt"
852
+
853
  if os.path.exists(j_reg_path) and device == "cuda":
854
+
855
  global_J_reg = torch.load(j_reg_path, map_location=device)
856
 
857
+
858
+
859
  for jsonl_idx, jsonl_path in enumerate(jsonl_files):
860
+
861
  seq_name = os.path.splitext(os.path.basename(jsonl_path))[0].replace("sequence_", "")
862
+
863
  print(f"[{jsonl_idx+1}/{len(jsonl_files)}] Processing {seq_name}...")
864
+
865
+
866
+
867
  prof = {"smpl_batch": 0.0, "video_read": 0.0, "overlay": 0.0, "vit_process": 0.0,
868
+
869
  "sparse_write": 0.0, "loop_total": 0.0, "save_files": 0.0, "debug_rend": 0.0, "prep": 0.0}
870
 
871
+
872
+
873
  t_start_seq = time.perf_counter()
874
+
875
  jsonl_dir = os.path.dirname(os.path.abspath(jsonl_path))
876
+
877
  video_path = os.path.join(jsonl_dir, f"video_{seq_name}.mp4")
878
+
879
  if not os.path.exists(video_path): video_path = os.path.join(jsonl_dir, "video.mp4")
880
 
881
+
882
+
883
  out_img_folder = os.path.join(args.output, "images", seq_name)
884
+
885
  os.makedirs(out_img_folder, exist_ok=True)
886
+
887
+
888
 
889
  with open(jsonl_path, "r") as f: lines = f.readlines()
890
+
891
  lines = lines[1:] if len(lines) > 0 else []
892
+
893
  num_frames = len(lines)
894
+
895
  if num_frames <= 0: continue
896
 
897
+
898
+
899
  genmo_out = os.path.join(args.output, "genmo_features", f"{seq_name}.pt")
900
+
901
  smplx_out = os.path.join(args.output, "smplx_incam", f"{seq_name}_smplx.npz")
902
+
903
  smplx_global_out = os.path.join(args.output, "smplx_global", f"{seq_name}_global.npz")
904
+
905
  dpvo_dir = os.path.join(args.output, "dpvo", seq_name)
906
+
907
  for p in [genmo_out, smplx_out, smplx_global_out, dpvo_dir]:
908
+
909
  if p: os.makedirs(os.path.dirname(p), exist_ok=True)
910
 
911
+
912
+
913
  selected_set = set()
914
+
915
  if args.vitpose:
916
+
917
  if args.vitpose_use_all_frames: selected_indices = list(range(num_frames))
918
+
919
  else:
920
+
921
  selected_indices = _compute_vitpose_selected_indices(
922
+
923
  num_frames, FPS, args.vitpose_bucket_seconds,
924
+
925
  args.vitpose_frames_per_bucket, args.vitpose_sampling, args.vitpose_seed
926
+
927
  )
928
+
929
  selected_set = set(selected_indices)
930
 
931
+
932
+
933
  cap = cv2.VideoCapture(video_path)
934
+
935
  if not cap.isOpened(): continue
936
+
937
  W = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
938
+
939
  H = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
940
+
941
+
942
+
943
  resolved_ui_dir = args.ui_dir if args.ui_dir else _find_ui_dir()
944
+
945
  chat_font_path = _find_font_path(resolved_ui_dir)
946
+
947
  seq_seed = int(zlib.crc32(seq_name.encode("utf-8")) & 0xFFFFFFFF)
948
+
949
  chat_aug = SimpleChatOverlay(W, H, seed=seq_seed, num_lines=7, font_path=chat_font_path)
950
+
951
+ ui_aug = SimpleUIOverlay(W, H, seed=((seq_seed ^ 0xA5A5A5A5) & 0xFFFFFFFF), ui_dir=resolved_ui_dir,
952
+
953
  max_images=args.ui_max_images, show_prob=args.ui_show_prob)
954
 
955
+
956
+
957
  # --- BATCH SMPL (GPU) ---
958
+
959
  t0_smpl = time.perf_counter()
960
+
961
+
962
+
963
  smpl_precalc_data = []
964
+
965
  debug_global_verts_cpu = []
966
+
967
  parsed_rows = []
968
+
969
+
970
+
971
  for line in lines:
972
+
973
  row = json.loads(line)
974
+
975
  parsed_rows.append(parse_smpl_inputs_from_row(row, override_betas10, args.keep_unity_scale, args.transl_source, args.transl_y_offset_m))
976
+
977
+
978
+
979
  all_betas = np.stack([d['betas'] for d in parsed_rows])
980
+
981
  all_go = np.stack([d['global_orient'] for d in parsed_rows])
982
+
983
  all_bp = np.stack([d['body_pose'] for d in parsed_rows])
984
+
985
+
986
+
987
+ # We need an initial batch forward to get local pelvis offsets
988
+
989
  all_pelvis0 = batch_smpl_forward(all_betas, all_go, all_bp, device=device)
990
+
991
+
992
+
993
  C = np.diag([1.0, -1.0, 1.0]).astype(np.float64)
994
+
995
+ C4 = np.diag([1.0, -1.0, 1.0, 1.0]).astype(np.float64)
996
+
997
  all_go_w, all_pelvis_pos_w_cv = [], []
998
+
999
+
1000
+
1001
+ # --- FIX: DEFINE THE FIX ROTATION (Z-180) FOR WORLD ---
1002
+
1003
+ fix_rot = R.from_euler("z", 180, degrees=True).as_matrix()
1004
+
1005
+ fix_mat = np.eye(4, dtype=np.float64)
1006
+
1007
+ fix_mat[:3, :3] = fix_rot
1008
+
1009
+ # SMPLX-only adjustment: rotate the SMPLX global orientation in world by 180deg around Y.
1010
+ # (Do NOT touch camera/world transforms; this only changes the SMPL parameters.)
1011
+ # smplx_global_y180 = R.from_euler("y", 180, degrees=True).as_matrix()
1012
+
1013
+
1014
+
1015
  for i, d in enumerate(parsed_rows):
1016
+
1017
+ # Unity Rotation (Raw)
1018
+
1019
  R_cam_w_unity = R.from_quat(d['cam_rot_w_quat']).as_matrix()
1020
+
1021
+
1022
+
1023
+ # --- APPLY FIX HERE: Pre-multiply Camera Rot by Fix (Z-180) ---
1024
+
1025
+ # This ensures the SMPL global orientation is calculated relative to the FIXED Camera
1026
+
1027
+ R_cam_w_cv = fix_rot @ (C @ R_cam_w_unity @ C)
1028
+
1029
+
1030
+
1031
  R_pelvis_c_cv = R.from_rotvec(d['global_orient'].astype(np.float64)).as_matrix()
1032
+
1033
+ R_pelvis_w_cv = R_cam_w_cv @ R_pelvis_c_cv
1034
+
1035
  all_go_w.append(R.from_matrix(R_pelvis_w_cv).as_rotvec().astype(np.float32))
1036
+
1037
+
1038
+
1039
+ # Position Logic
1040
 
1041
  pelvis_pos_w_unity = d['pelvis_pos_world']
1042
+
1043
  root_pos_w_unity = (R_cam_w_unity @ d['root_cam_unity'] + d['cam_pos_world']).reshape(3)
1044
+
1045
  smpl_scale = d['smpl_scale']
1046
+
1047
  transl_source_local = str(args.transl_source).strip().lower()
1048
+
1049
  if transl_source_local == "root": target_pos_w_unity = root_pos_w_unity
1050
+
1051
  else:
1052
+
1053
  if bool(args.keep_unity_scale): target_pos_w_unity = pelvis_pos_w_unity
1054
+
1055
  else:
1056
+
1057
  if abs(smpl_scale) > 1e-8: target_pos_w_unity = root_pos_w_unity + (pelvis_pos_w_unity - root_pos_w_unity) / smpl_scale
1058
+
1059
  else: target_pos_w_unity = pelvis_pos_w_unity
1060
 
1061
+
1062
+
1063
+ # --- APPLY FIX HERE: Pre-multiply Position by Fix (Z-180) ---
1064
+
1065
+ pos_cv_raw = (C @ target_pos_w_unity).astype(np.float64)
1066
+
1067
+ pelvis_pos_w_cv = fix_rot @ pos_cv_raw.reshape(3, 1)
1068
+
1069
+ all_pelvis_pos_w_cv.append(pelvis_pos_w_cv.reshape(3))
1070
+
1071
+
1072
+
1073
+ all_go_w = np.stack(all_go_w)
1074
+
1075
+ # Apply the SMPLX-only world-space rotation to global_orient.
1076
+ # all_go_w = (
1077
+ # R.from_matrix(smplx_global_y180 @ R.from_rotvec(all_go_w.astype(np.float64)).as_matrix())
1078
+ # .as_rotvec()
1079
+ # .astype(np.float32)
1080
+ # )
1081
+
1082
+ # Compute World-Space Pelvis offsets (dependent on global orient)
1083
+
1084
+ all_pelvis0_w = batch_smpl_forward(all_betas, all_go_w, all_bp, device=device)
1085
+
1086
+
1087
+
1088
+ for i in range(num_frames):
1089
+
1090
+ d = parsed_rows[i]
1091
+
1092
+ # Incam Transl
1093
+
1094
+ transl_c = (d['target_cam_cv'] - all_pelvis0[i]).astype(np.float32)
1095
+
1096
+
1097
+
1098
+ # World Transl
1099
+
1100
+ if str(args.transl_source) == "root": transl_w = all_pelvis_pos_w_cv[i].astype(np.float32)
1101
1102
  else: transl_w = (all_pelvis_pos_w_cv[i] - all_pelvis0_w[i]).astype(np.float32)
1103
+
1104
+
1105
+
1106
  smpl_precalc_data.append({
1107
+
1108
  "go_c": d['global_orient'], "bp": d['body_pose'], "beta": d['betas'], "tr_c": transl_c,
1109
+
1110
+ "go_w": all_go_w[i], "tr_w": transl_w,
1111
+
1112
+ "cam_rot_w_quat": d["cam_rot_w_quat"], "cam_pos_world": d["cam_pos_world"]
1113
+
1114
  })
1115
+
1116
+
1117
+
1118
  prof["smpl_batch"] = time.perf_counter() - t0_smpl
1119
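
Why the Z-180 fix is applied to both the camera pose and the world-space targets: pre-multiplying T_wc and every world point by the same rigid transform leaves camera-space coordinates unchanged, so the incam parameters above are unaffected. A small sketch of that invariance:

    import numpy as np
    from scipy.spatial.transform import Rotation as R

    G = np.eye(4)
    G[:3, :3] = R.from_euler("z", 180, degrees=True).as_matrix()
    T_wc = np.eye(4); T_wc[:3, 3] = [1.0, 2.0, 3.0]       # illustrative camera pose
    p_w = np.array([0.5, 0.2, 4.0, 1.0])                  # homogeneous world point
    p_c_before = np.linalg.inv(T_wc) @ p_w
    p_c_after = np.linalg.inv(G @ T_wc) @ (G @ p_w)       # inv(G@T) @ (G@p) = inv(T) @ p
    assert np.allclose(p_c_before, p_c_after)
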
+
1120
+
1121
+
1122
+ # --- Debug Verification ---
1123
+
1124
+ if args.debug:
1125
+
1126
+ try:
1127
+
1128
+ row0 = json.loads(lines[0])
1129
+
1130
+ cam_pos0 = np.asarray(row0["cam_pos_world"], dtype=np.float64).reshape(3)
1131
+
1132
+ cam_q0 = np.asarray(row0["cam_rot_world"], dtype=np.float64).reshape(4)
1133
+
1134
+ pelvis_pos0 = np.asarray(row0["pelvis_pos_world"], dtype=np.float64).reshape(3)
1135
+
1136
+ pelvis_cam_meta0 = np.asarray(row0.get("smpl_incam_transl", [0.0, 0.0, 0.0]), dtype=np.float64).reshape(3)
1137
+
1138
+
1139
+
1140
+ R_cam_w0 = R.from_quat(cam_q0).as_matrix()
1141
+
1142
+ pelvis_cam_est0 = (R_cam_w0.T @ (pelvis_pos0 - cam_pos0).reshape(3, 1)).reshape(3)
1143
+
1144
+ diff0 = pelvis_cam_est0 - pelvis_cam_meta0
1145
+
1146
+
1147
+
1148
+ Log.info(f"[Debug] {seq_name} pelvis_cam_unity check: diff={diff0.round(4)}")
1149
+
1150
+ except Exception as e:
1151
+
1152
+ Log.warning(f"[Debug] {seq_name} pelvis_cam_unity check failed: {e}")
1153
+
1154
+
1155
+
1156
  t0_gap = time.perf_counter()
1157
+
1158
  smpl_renderer = None
1159
+
1160
  vid_incam, vid_global = None, None
1161
+
1162
  debug_end_frame = min(num_frames, DEBUG_NUM_FRAMES)
1163
+
1164
  if args.debug:
1165
+
1166
  os.makedirs(os.path.join(args.output, "debug_renders"), exist_ok=True)
1167
+
1168
  if debug_end_frame > 0:
1169
+
1170
  try:
1171
+
1172
  K4_init = np.asarray(json.loads(lines[0])["cam_intrinsics"], dtype=np.float32)
1173
+
1174
  smpl_renderer = SmplIncamRenderer(W, H, K4_init, device=device)
1175
+
1176
  fourcc = cv2.VideoWriter_fourcc(*'mp4v')
1177
+
1178
  vid_incam = cv2.VideoWriter(os.path.join(args.output, "debug_renders", f"{seq_name}_incam.mp4"), fourcc, FPS, (W, H))
1179
+
1180
+ dbg_gw, dbg_gh = 960, 540
1181
+
1182
+ vid_global = cv2.VideoWriter(os.path.join(args.output, "debug_renders", f"{seq_name}_global.mp4"), fourcc, FPS, (dbg_gw, dbg_gh))
1183
+
1184
  except: pass
1185
 
1186
+
1187
+
1188
  # --- MAIN LOOP ---
1189
+
1190
+ coco_subset, img_paths, K_fullimg_all = [], [], []
1191
+
1192
  cam_T_wc_cv_all, cam_T_w2c_cv_all = [], []
1193
+
1194
  dpvo_poses, dpvo_intrinsics = [], []
1195
+
1196
+ bboxes, bbx_xys_all, kp2d_all = [], [], []
1197
+
1198
  global_orient_c_all, transl_c_all, body_pose_all, betas_all = [], [], [], []
1199
+
1200
  global_orient_w_all, transl_w_all = [], []
 
1201
 
1202
+ vit_img_batch, all_vit_features = [], []
1203
+
1204
+
1205
 
1206
  ret, _ = cap.read() # skip 0
1207
+
1208
  prof["prep"] = time.perf_counter() - t0_gap
1209
 
1210
+
1211
+
1212
  t_start_loop = time.perf_counter()
1213
+
1214
  for idx in tqdm(range(num_frames), desc="Frames", leave=False):
1215
+
1216
  t0_read = time.perf_counter()
1217
+
1218
  ret, img_bgr = cap.read()
1219
+
1220
  prof["video_read"] += (time.perf_counter() - t0_read)
1221
+
1222
  if not ret: break
1223
+
1224
+
1225
+
1226
  img_filename = f"img_{idx:05d}.jpg"
1227
+
1228
  img_abs_path = os.path.join(out_img_folder, img_filename)
1229
+
1230
+
1231
+
1232
  t0_ov = time.perf_counter()
1233
+
1234
  chat_aug.maybe_append(idx)
1235
+
1236
  chat_aug.draw(img_bgr)
1237
+
1238
  ui_aug.draw(img_bgr)
1239
+
1240
  prof["overlay"] += (time.perf_counter() - t0_ov)
1241
 
1242
+
1243
+
1244
+ row = json.loads(lines[idx])
1245
+
1246
  K4 = np.asarray(row["cam_intrinsics"], dtype=np.float32)
1247
+
1248
  kpts_raw = np.asarray(row["kpts_2d"], dtype=np.float32).reshape(-1, 2)[:17]
1249
+
1250
  vis_raw = np.asarray(row["kpts_vis"], dtype=np.int32)[:17]
1251
+
1252
+ if vis_raw.shape[0] >= 5: vis_raw[3] = 1; vis_raw[4] = 1
1253
+
1254
  bbox = clamp_bbox_xywh_to_image(row["bbox"], W, H)
1255
+
1256
+
1257
+
1258
  sd = smpl_precalc_data[idx]
1259
+
1260
  global_orient_c_all.append(sd['go_c'])
1261
+
1262
  transl_c_all.append(sd['tr_c'])
1263
+
1264
  global_orient_w_all.append(sd['go_w'])
1265
+
1266
  transl_w_all.append(sd['tr_w'])
1267
+
1268
  body_pose_all.append(sd['bp'])
1269
+
1270
  betas_all.append(sd['beta'])
1271
+
1272
+
1273
+
1274
  bboxes.append(np.asarray(bbox, dtype=np.float32))
1275
+
1276
  bbx_xys_all.append(bbox_xywh_to_bbx_xys(bbox))
1277
+
1278
  kp2d_all.append(np.concatenate([kpts_raw, (vis_raw > 0).astype(np.float32)[:, None]], axis=1))
1279
+
1280
  K_fullimg_all.append(k4_to_K3(K4))
1281
 
1282
+
1283
+
1284
  img_rel = os.path.join("images", seq_name, img_filename).replace("\\", "/")
1285
+
1286
  img_paths.append(img_rel)
1287
+
1288
+
1289
+
1290
+ # Use raw Unity values
1291
+
1292
+ p_w = np.asarray(sd["cam_pos_world"], dtype=np.float32)
1293
+
1294
+ q_w = np.asarray(sd["cam_rot_w_quat"], dtype=np.float32)
1295
+
1296
+
1297
+
1298
+ # 1. Build the Standard Unity-to-CV Matrix (C @ M @ C)
1299
+
1300
  cam_T_wc = build_T_wc(p_w, q_w)
1301
+
1302
+ cam_T_wc_cv_raw = (C4 @ cam_T_wc @ C4)
1303
+
1304
+
1305
+
1306
+ # 2. APPLY THE FIX (Z-180) to the Camera, matching precalc loop
1307
+
1308
+ cam_T_wc_cv = (fix_mat @ cam_T_wc_cv_raw).astype(np.float32)
1309
+
1310
+
1311
+
1312
+ # 3. Invert for W2C
1313
+
1314
+ cam_T_w2c_cv = np.linalg.inv(cam_T_wc_cv)
1315
+
1316
+
1317
+
1318
  cam_T_wc_cv_all.append(cam_T_wc_cv)
1319
+
1320
  cam_T_w2c_cv_all.append(cam_T_w2c_cv)
1321
+
1322
  dpvo_poses.append(f"{p_w[0]} {p_w[1]} {p_w[2]} {q_w[0]} {q_w[1]} {q_w[2]} {q_w[3]}")
1323
+
1324
  dpvo_intrinsics.append(K4.astype(np.float32))
1325
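
For the rigid cam_T_wc_cv above, np.linalg.inv agrees with the closed form [R | t]^-1 = [R^T | -R^T t]; a sketch of the equivalence:

    import numpy as np

    def invert_rigid(T):
        Ti = np.eye(4, dtype=T.dtype)
        Ti[:3, :3] = T[:3, :3].T
        Ti[:3, 3] = -T[:3, :3].T @ T[:3, 3]
        return Ti

    Rz = np.array([[0, -1, 0], [1, 0, 0], [0, 0, 1]], dtype=np.float32)  # 90 deg about z
    T = np.eye(4, dtype=np.float32); T[:3, :3] = Rz; T[:3, 3] = [1, 2, 3]
    assert np.allclose(invert_rigid(T), np.linalg.inv(T), atol=1e-6)
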
 
1326
+
1327
+
1328
  if args.genmo and vit_model is not None:
1329
+
1330
  t0_vit = time.perf_counter()
1331
+
1332
+ img_tensor = _process_image_memory(img_bgr, bbox, img_size=256)
1333
+
1334
  vit_img_batch.append(img_tensor)
1335
 
1336
  if len(vit_img_batch) >= args.vit_batch_size:
1337
+
1338
  batch_np = np.stack(vit_img_batch)
1339
+
1340
  batch_t = torch.from_numpy(batch_np).to(device, non_blocking=True)
1341
+
1342
  with torch.inference_mode():
1343
+
1344
  with torch.amp.autocast("cuda"):
1345
+
1346
  feats = vit_model({"img": batch_t})
1347
+
1348
  all_vit_features.append(feats.detach().cpu())
1349
+
1350
  vit_img_batch = []
1351
+
1352
  prof["vit_process"] += (time.perf_counter() - t0_vit)
1353
 
1354
+
1355
+
1356
  if args.vitpose and (idx in selected_set):
1357
+
1358
  t0_wr = time.perf_counter()
1359
+
1360
  cv2.imwrite(img_abs_path, img_bgr, [int(cv2.IMWRITE_JPEG_QUALITY), 90])
1361
+
1362
  kpts_coco = []
1363
+
1364
  for k in range(17): kpts_coco.extend([float(kpts_raw[k, 0]), float(kpts_raw[k, 1]), int(vis_raw[k])])
1365
+
1366
  coco_subset.append(({"file_name": img_rel, "width": W, "height": H},
1367
+
1368
  {"category_id": 1, "bbox": bbox, "area": float(bbox[2]*bbox[3]), "iscrowd": 0, "keypoints": kpts_coco, "num_keypoints": int(np.sum(vis_raw > 0))}))
1369
+
1370
  prof["sparse_write"] += (time.perf_counter() - t0_wr)
1371
 
1372
+
1373
+
1374
  if args.debug and idx < debug_end_frame and smpl_renderer:
1375
+
1376
  t0_dbg = time.perf_counter()
1377
+
1378
  dbg = img_bgr.copy()
1379
+
1380
  try: draw_bbox_xywh_and_center(dbg, bbox)
1381
+
1382
  except: pass
1383
+
1384
  try:
1385
+
1386
  rgb = smpl_renderer.render(dbg[:, :, ::-1].copy(), sd['go_c'], sd['bp'], sd['beta'], sd['tr_c'], K4[:2], K4[2:])
1387
+
1388
  dbg = rgb[:, :, ::-1].copy()
1389
+
1390
  except: pass
1391
+
1392
  if not args.debug_no_coco:
1393
+
1394
  draw_vis_text_and_points(dbg, kpts_raw, vis_raw)
1395
+
1396
  if vid_incam: vid_incam.write(dbg)
1397
+
1398
+
1399
+
1400
  if vid_global:
1401
+
1402
  verts_w = smpl_renderer.get_verts(sd['go_w'], sd['bp'], sd['beta'], sd['tr_w']).float()
1403
+
1404
  debug_global_verts_cpu.append(verts_w.detach().cpu())
1405
+
1406
  prof["debug_rend"] += (time.perf_counter() - t0_dbg)
1407
 
1408
+
1409
+
1410
  if args.genmo and len(vit_img_batch) > 0 and vit_model is not None:
1411
+
1412
  t0_vit = time.perf_counter()
1413
+
1414
  batch_np = np.stack(vit_img_batch)
1415
+
1416
  batch_t = torch.from_numpy(batch_np).to(device, non_blocking=True)
1417
+
1418
  with torch.inference_mode():
1419
+
1420
  with torch.amp.autocast("cuda"):
1421
+
1422
  feats = vit_model({"img": batch_t})
1423
+
1424
  all_vit_features.append(feats.detach().cpu())
1425
+
1426
  prof["vit_process"] += (time.perf_counter() - t0_vit)
1427
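
The block above flushes whatever is left in vit_img_batch so that f_imgseq ends up with exactly one feature row per frame. The generic accumulate-and-flush pattern it follows, sketched with a hypothetical encode function:

    def batched(items, batch_size, encode):
        out, buf = [], []
        for it in items:
            buf.append(it)
            if len(buf) >= batch_size:
                out.append(encode(buf))
                buf = []
        if buf:                        # final partial batch
            out.append(encode(buf))
        return out
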
 
1428
+
1429
+
1430
  prof["loop_total"] = time.perf_counter() - t_start_loop
1431
+
1432
  cap.release()
1433
+
1434
  if vid_incam: vid_incam.release()
1435
+
1436
+
1437
+
1438
  t0_dbg = time.perf_counter()
1439
+
1440
  if vid_global and len(debug_global_verts_cpu) > 0:
1441
+
1442
  try:
1443
+
1444
+ from hmr4d.utils.vis.renderer import (
1445
+
1446
+ Renderer,
1447
+
1448
+ get_global_cameras_static,
1449
+
1450
+ get_ground_params_from_points,
1451
+
1452
+ perspective_projection,
1453
+
1454
+ )
1455
+
1456
  from hmr4d.utils.geo.hmr_cam import create_camera_sensor
1457
+
1458
+
1459
+
1460
+ dbg_gw, dbg_gh = 960, 540
1461
+
1462
+ _, _, K_global = create_camera_sensor(dbg_gw, dbg_gh, 24)
1463
+
1464
+ global_renderer = Renderer(dbg_gw, dbg_gh, device=device, faces=smpl_renderer.faces, K=K_global.to(device), bin_size=0)
1465
+
1466
  verts_seq = torch.stack(debug_global_verts_cpu, dim=0)
1467
+
1468
  off = verts_seq[0].mean(0); off[1] = verts_seq[0, :, 1].min()
1469
+
1470
  verts_seq = verts_seq - off
1471
+
1472
+
1473
+
1474
+ # Convert CV-cam to GPU tensor for visualizer
1475
+
1476
+ cam_centers = None
1477
+
1478
+ try:
1479
+
1480
+ F = int(verts_seq.shape[0])
1481
+
1482
+ if len(cam_T_wc_cv_all) >= F:
1483
+
1484
+ cam_wc = np.stack(cam_T_wc_cv_all[:F], axis=0).astype(np.float32)
1485
+
1486
+ cam_centers = torch.from_numpy(cam_wc[:, :3, 3]).to(device=device)
1487
+
1488
+ cam_centers = cam_centers - off.to(device=device)[None]
1489
+
1490
+ except Exception:
1491
+
1492
+ cam_centers = None
1493
+
1494
+
1495
+
1496
+ g_R, g_T, g_L = get_global_cameras_static(
1497
+
1498
+ verts_seq, beta=2.0, cam_height_degree=20, target_center_height=1.0, device=device
1499
+
1500
+ )
1501
+
1502
+
1503
+
1504
  if global_J_reg is not None and verts_seq.shape[1] == global_J_reg.shape[-1]:
1505
+
1506
+ joints_seq = torch.einsum("jv,fvk->fjk", global_J_reg.cpu(), verts_seq)
1507
+
1508
+ roots = joints_seq[:, 0]
1509
+
1510
+ else:
1511
+
1512
+ roots = verts_seq.mean(1)
1513
+
1514
  sc, cx, cz = get_ground_params_from_points(roots, verts_seq)
1515
+
1516
  global_renderer.set_ground(sc * 1.5, cx, cz)
1517
+
1518
  col = torch.tensor([[0.0, 1.0, 0.0]], device=device)
1519
+
1520
+ trail = []
1521
+
1522
+
1523
+
1524
+ def _project_xy(points_w: torch.Tensor):
1525
+
1526
+ P = points_w.view(1, -1, 3)
1527
+
1528
+ x2d = perspective_projection(P, global_renderer.K, global_renderer.R, global_renderer.T.reshape(1, 3, 1))[0]
1529
+
1530
+ return x2d
1531
+
1532
+
1533
+
1534
+ def _draw_polyline(img_bgr, pts_xy, color, closed=False, thickness=1):
1535
+
1536
+ pts = np.asarray(pts_xy, dtype=np.int32).reshape(-1, 1, 2)
1537
+
1538
+ if len(pts) < 2: return
1539
+
1540
+ cv2.polylines(img_bgr, [pts], bool(closed), color, int(thickness), cv2.LINE_AA)
1541
+
1542
+
1543
+
1544
+ def _draw_camera_box_axes(img_bgr, C_w, right, up, fwd, scale=0.25):
1545
+
1546
+ C_w = C_w.reshape(3)
1547
+
1548
+ right = right.reshape(3)
1549
+
1550
+ up = up.reshape(3)
1551
+
1552
+ fwd = fwd.reshape(3)
1553
+
1554
+ L = float(scale)
1555
+
1556
+
1557
+
1558
+ # Draw Axis instead of just box (RGB = XYZ)
1559
+
1560
+ # X (Right) - Red
1561
+
1562
+ p_x = C_w + L * right
1563
+
1564
+ xy_x = _project_xy(torch.stack([C_w, p_x])).detach().cpu().numpy()
1565
+
1566
+ _draw_polyline(img_bgr, xy_x, (0, 0, 255), thickness=2)
1567
+
1568
+
1569
+
1570
+ # Y (Up/Down) - Green
1571
+
1572
+ p_y = C_w + L * up
1573
+
1574
+ xy_y = _project_xy(torch.stack([C_w, p_y])).detach().cpu().numpy()
1575
+
1576
+ _draw_polyline(img_bgr, xy_y, (0, 255, 0), thickness=2)
1577
+
1578
+
1579
+
1580
+ # Z (Fwd) - Blue
1581
+
1582
+ p_z = C_w + L * fwd
1583
+
1584
+ xy_z = _project_xy(torch.stack([C_w, p_z])).detach().cpu().numpy()
1585
+
1586
+ _draw_polyline(img_bgr, xy_z, (255, 0, 0), thickness=2)
1587
+
1588
+
1589
+
1590
  for i in range(len(verts_seq)):
1591
+
1592
  cam = global_renderer.create_camera(g_R[i], g_T[i])
1593
+
1594
  img = global_renderer.render_with_ground(verts_seq[i].to(device)[None], col, cam, g_L)
1595
+
1596
+ img_bgr = img[:, :, ::-1].copy()
1597
+
1598
+
1599
+
1600
+ if cam_centers is not None and i < cam_centers.shape[0]:
1601
+
1602
+ try:
1603
+
1604
+ # Blue ray: camera center -> SMPL root
1605
+
1606
+ if i < roots.shape[0]:
1607
+
1608
+ pts_line = torch.stack([cam_centers[i], roots[i].to(device=device)], dim=0)
1609
+
1610
+ xy_line = _project_xy(pts_line).detach().cpu().numpy()
1611
+
1612
+ _draw_polyline(img_bgr, xy_line, (255, 200, 50), closed=False, thickness=1)
1613
+
1614
+
1615
+
1616
+ P = cam_centers[i].view(1, 3)
1617
+
1618
+ x2d = _project_xy(P)[0]
1619
+
1620
+ x, y = int(round(float(x2d[0].item()))), int(round(float(x2d[1].item())))
1621
+
1622
+ if 0 <= x < img_bgr.shape[1] and 0 <= y < img_bgr.shape[0]:
1623
+
1624
+ trail.append((x, y))
1625
+
1626
+ cv2.circle(img_bgr, (x, y), 3, (0, 0, 255), -1)
1627
+
1628
+ if len(trail) >= 2:
1629
+
1630
+ cv2.polylines(img_bgr, [np.array(trail, dtype=np.int32)], False, (0, 0, 255), 1)
1631
+
1632
+
1633
+
1634
+ if len(cam_T_wc_cv_all) > i:
1635
+
1636
+ R_c2w = torch.from_numpy(np.asarray(cam_T_wc_cv_all[i], dtype=np.float32)[:3, :3]).to(device=device)
1637
+
1638
+ C_w = cam_centers[i]
1639
+
1640
+ right = R_c2w[:, 0]
1641
+
1642
+ up = R_c2w[:, 1]
1643
+
1644
+ fwd = R_c2w[:, 2]
1645
+
1646
+ _draw_camera_box_axes(img_bgr, C_w, right, up, fwd, scale=0.35)
1647
+
1648
+ except Exception: pass
1649
+
1650
+
1651
+
1652
+ vid_global.write(img_bgr)
1653
+
1654
  except: pass
1655
+
1656
  vid_global.release()
1657
+
1658
  prof["debug_rend"] += (time.perf_counter() - t0_dbg)
1659
+
1660
+
1661
+
1662
  t0_save = time.perf_counter()
1663
+
1664
  if args.genmo:
1665
+
1666
  trans_w = np.stack(transl_w_all).astype(np.float32)
1667
+
1668
  world_off = trans_w[0].copy(); world_off[1] -= float(args.world_y_offset_m)
1669
+
1670
  trans_w_centered = trans_w - world_off[None]
1671
+
1672
  mats_w2c = np.stack(cam_T_w2c_cv_all).astype(np.float32)
1673
+
1674
  mats_wc = np.stack(cam_T_wc_cv_all).astype(np.float32)
1675
+
1676
  T_wp_w = np.eye(4, dtype=np.float32); T_wp_w[:3, 3] = world_off
1677
+
1678
  T_w_wp = np.eye(4, dtype=np.float32); T_w_wp[:3, 3] = -world_off
1679
+
1680
  mats_w2c_c = np.matmul(mats_w2c, T_wp_w[None])
1681
+
1682
  mats_wc_c = np.matmul(T_w_wp[None], mats_wc)
1683
+
1684
  cam_av, cam_tv = compute_velocity(mats_wc_c, fps=FPS)
1685
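
The centering above re-expresses everything in a world frame translated to world_off: points shift by -world_off (T_w_wp) while the world-to-camera matrices absorb the opposite shift (T_wp_w), so (T_w2c @ T_wp_w) applied to a shifted point equals T_w2c applied to the original. A quick consistency check:

    import numpy as np

    world_off = np.array([0.3, 1.1, -2.0], dtype=np.float32)   # illustrative offset
    T_wp_w = np.eye(4, dtype=np.float32); T_wp_w[:3, 3] = world_off
    T_w_wp = np.eye(4, dtype=np.float32); T_w_wp[:3, 3] = -world_off
    assert np.allclose(T_wp_w @ T_w_wp, np.eye(4))             # the shifts cancel
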
+
1686
+
1687
+
1688
  f_imgseq = torch.cat(all_vit_features, dim=0).float() if all_vit_features else torch.empty(0)
1689
 
1690
+
1691
+
1692
  g_dict = {
1693
+
1694
  "smpl_params_c": {"global_orient": torch.from_numpy(np.stack(global_orient_c_all)), "body_pose": torch.from_numpy(np.stack(body_pose_all)), "transl": torch.from_numpy(np.stack(transl_c_all)), "betas": torch.from_numpy(np.stack(betas_all))},
1695
+
1696
  "smpl_params_w": {"global_orient": torch.from_numpy(np.stack(global_orient_w_all)), "body_pose": torch.from_numpy(np.stack(body_pose_all)), "transl": torch.from_numpy(trans_w_centered), "betas": torch.from_numpy(np.stack(betas_all))},
1697
+
1698
  "T_w2c": torch.from_numpy(mats_w2c_c), "K_fullimg": torch.from_numpy(np.stack(K_fullimg_all)),
1699
+
1700
  "kp2d": torch.from_numpy(np.stack(kp2d_all)), "bbx_xys": torch.from_numpy(np.stack(bbx_xys_all)),
1701
+
1702
  "cam_angvel": torch.from_numpy(cam_av), "cam_tvel": torch.from_numpy(cam_tv),
1703
+
1704
  "imgname": img_paths, "valid_mask": torch.ones(len(img_paths), dtype=torch.float32),
1705
+
1706
  "world_offset": torch.from_numpy(world_off.astype(np.float32)),
1707
+
1708
+ "f_imgseq": f_imgseq
1709
+
1710
  }
1711
+
1712
  torch.save(g_dict, genmo_out)
1713
 
1714
+
1715
+
1716
  if args.smplx:
1717
+
1718
  poses66 = np.concatenate([np.stack(global_orient_w_all), np.stack(body_pose_all)], axis=1)
1719
+
1720
  poses165 = np.pad(poses66, ((0,0),(0,99)), mode="constant").astype(np.float32)
1721
+
1722
  trans_w = np.stack(transl_w_all).astype(np.float32)
1723
+
1724
  world_off = trans_w[0].copy(); world_off[1] -= float(args.world_y_offset_m)
1725
+
1726
  trans_w = trans_w - world_off[None]
1727
+
1728
  np.savez(smplx_global_out, mocap_framerate=int(FPS), gender="neutral", betas=betas_all[0], trans=trans_w, poses=poses165, world_offset=world_off)
1729
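
Layout of the 165-D pose written above, as I understand the AMASS/SMPL-X convention: 3 values of global_orient plus 63 for the 21 body joints are filled, and the remaining 99 (jaw, eyes, hands) are zero-padded:

    import numpy as np

    poses66 = np.zeros((10, 66), dtype=np.float32)   # illustrative batch of 10 frames
    poses165 = np.pad(poses66, ((0, 0), (0, 99)), mode="constant")
    assert poses165.shape == (10, 165)               # 3 + 63 + 99 = 165
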
 
1730
+
1731
+
1732
  if args.vitpose and coco_subset:
1733
+
1734
  with open(os.path.join(temp_ann_dir, f"{seq_name}.json"), "w") as f: json.dump(coco_subset, f)
1735
+
1736
+
1737
+
1738
  prof["save_files"] = time.perf_counter() - t0_save
1739
+
1740
  total_t = time.perf_counter() - t_start_seq
1741
+
1742
+
1743
+
1744
  print(f" > Done in {total_t:.2f}s | FPS: {num_frames/total_t:.1f}")
1745
+
1746
  print(f" [Breakdown] BatchPrep: {prof['smpl_batch']:.2f}s | Init/Gap: {prof['prep']:.2f}s | Read: {prof['video_read']:.2f}s")
1747
+
1748
  print(f" Overlay: {prof['overlay']:.2f}s | SparseWrite: {prof['sparse_write']:.2f}s | ViT: {prof['vit_process']:.2f}s")
1749
+
1750
  print(f" DbgRend: {prof['debug_rend']:.2f}s | SaveFiles: {prof['save_files']:.2f}s")
1751
 
1752
+
1753
+
1754
  print("All sequences processed.")
1755
 
1756
+
1757
+
1758
  if __name__ == "__main__":
1759
+
1760
+ main()
train.log CHANGED
@@ -3197,3 +3197,635 @@ full_key: dataset_opts.train.unity
3197
  [12/30 22:16:05][INFO] [UnityDataset] Found 5 sequences.
3198
  [12/30 22:16:05][INFO] [Val Dataset][7/7]: name=unity_val, size=5, genmo.datasets.unity_dataset.UnityDataset
3199
  [12/30 22:16:05][INFO]
3200
+ [12/30 22:26:06][INFO] [Exp Name]: finetune_
3201
+ [12/30 22:26:06][INFO] [GPU x Batch] = 1 x 1
3202
+ [12/30 22:26:06][INFO] [UnityDataset] Found 5 sequences.
3203
+ [12/30 22:26:06][INFO] [Train Dataset][9/9]: name=unity, size=5, genmo.datasets.unity_dataset.UnityDataset
3204
+ [12/30 22:26:06][INFO] [Train Dataset][All]: ConcatDataset size=5
3205
+ [12/30 22:26:06][INFO]
3206
+ [12/30 22:26:06][INFO] [UnityDataset] Found 5 sequences.
3207
+ [12/30 22:26:06][INFO] [Val Dataset][7/7]: name=unity_val, size=5, genmo.datasets.unity_dataset.UnityDataset
3208
+ [12/30 22:26:06][INFO]
3209
+ [12/30 22:26:11][INFO] [PL-Trainer] Loading ckpt: ./s050000.ckpt
3210
+ [12/30 22:26:42][INFO] [Simple Ckpt Saver]: Save to `outputs/unity/finetune_/version_3/checkpoints'
3211
+ [12/30 22:26:54][INFO] Start Fitting...
3212
+ [12/30 22:26:56][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.
3213
+
3214
+ [12/30 22:26:56][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
3215
+
3216
+ [12/30 22:26:56][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
3217
+
3218
+ [12/30 22:26:56][INFO] πŸš€[FIT][Epoch 0] Data: unity Experiment: finetune_
3219
+ [12/30 22:27:28][INFO] [Exp Name]: finetune_
3220
+ [12/30 22:27:28][INFO] [GPU x Batch] = 1 x 1
3221
+ [12/30 22:27:28][INFO] [UnityDataset] Found 5 sequences.
3222
+ [12/30 22:27:28][INFO] [Train Dataset][9/9]: name=unity, size=5, genmo.datasets.unity_dataset.UnityDataset
3223
+ [12/30 22:27:28][INFO] [Train Dataset][All]: ConcatDataset size=5
3224
+ [12/30 22:27:28][INFO]
3225
+ [12/30 22:27:28][INFO] [UnityDataset] Found 5 sequences.
3226
+ [12/30 22:27:28][INFO] [Val Dataset][7/7]: name=unity_val, size=5, genmo.datasets.unity_dataset.UnityDataset
3227
+ [12/30 22:27:28][INFO]
3228
+ [12/30 22:27:37][INFO] [PL-Trainer] Loading ckpt: ./s050000.ckpt
3229
+ [12/30 22:27:56][INFO] [Simple Ckpt Saver]: Save to `outputs/unity/finetune_/version_4/checkpoints'
3230
+ [12/30 22:28:08][INFO] Start Fitting...
3231
+ [12/30 22:28:11][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.
3232
+
3233
+ [12/30 22:28:11][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
3234
+
3235
+ [12/30 22:28:11][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
3236
+
3237
+ [12/30 22:28:11][INFO] πŸš€[FIT][Epoch 0] Data: unity Experiment: finetune_
3238
+ [12/30 22:29:56][INFO] [Exp Name]: finetune_
3239
+ [12/30 22:29:56][INFO] [GPU x Batch] = 1 x 1
3240
+ [12/30 22:29:56][INFO] [UnityDataset] Found 2 sequences.
3241
+ [12/30 22:29:56][INFO] [Train Dataset][9/9]: name=unity, size=2, genmo.datasets.unity_dataset.UnityDataset
3242
+ [12/30 22:29:56][INFO] [Train Dataset][All]: ConcatDataset size=2
3243
+ [12/30 22:29:56][INFO]
3244
+ [12/30 22:29:56][INFO] [UnityDataset] Found 2 sequences.
3245
+ [12/30 22:29:56][INFO] [Val Dataset][7/7]: name=unity_val, size=2, genmo.datasets.unity_dataset.UnityDataset
3246
+ [12/30 22:29:56][INFO]
3247
+ [12/30 22:30:02][INFO] [PL-Trainer] Loading ckpt: ./s050000.ckpt
3248
+ [12/30 22:30:17][INFO] [Simple Ckpt Saver]: Save to `outputs/unity/finetune_/version_5/checkpoints'
3249
+ [12/30 22:30:30][INFO] Start Fitting...
3250
+ [12/30 22:30:31][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.
3251
+
3252
+ [12/30 22:30:31][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
3253
+
3254
+ [12/30 22:30:31][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
3255
+
3256
+ [12/30 22:30:31][INFO] πŸš€[FIT][Epoch 0] Data: unity Experiment: finetune_
3257
+ [12/30 22:56:38][INFO] [Exp Name]: finetune_
3258
+ [12/30 22:56:38][INFO] [GPU x Batch] = 1 x 1
3259
+ [12/30 22:56:38][INFO] [UnityDataset] Found 6 sequences.
3260
+ [12/30 22:56:38][INFO] [Train Dataset][9/9]: name=unity, size=6, genmo.datasets.unity_dataset.UnityDataset
3261
+ [12/30 22:56:38][INFO] [Train Dataset][All]: ConcatDataset size=6
3262
+ [12/30 22:56:38][INFO]
3263
+ [12/30 22:56:38][INFO] [UnityDataset] Found 6 sequences.
3264
+ [12/30 22:56:38][INFO] [Val Dataset][7/7]: name=unity_val, size=6, genmo.datasets.unity_dataset.UnityDataset
3265
+ [12/30 22:56:38][INFO]
3266
+ [12/30 22:56:44][INFO] [PL-Trainer] Loading ckpt: ./s050000.ckpt
+ [12/30 22:57:07][INFO] [Simple Ckpt Saver]: Save to `outputs/unity/finetune_/version_6/checkpoints'
+ [12/30 22:57:27][INFO] Start Fitting...
+ [12/30 22:57:30][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.
+
+ [12/30 22:57:30][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/30 22:57:31][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/30 22:57:31][INFO] πŸš€[FIT][Epoch 0] Data: unity Experiment: finetune_
+ [12/30 22:57:34][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return F.conv1d(input, weight, bias, self.stride,
+
+ [12/30 22:57:36][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
+
+ [12/30 22:57:40][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:143: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+ warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
+
+ [12/30 22:57:47][WARNING] [VisUnityVal] Failed to read image: third_party/GVHMR/processed_dataset/images/0_biboo_birthday_speech/img_00699.jpg
+ [12/30 22:58:14][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pa_mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 22:58:14][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 22:58:14][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pve', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 22:58:14][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/accel', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 22:58:14][INFO] βœ…[FIT][Epoch 0] finished! 00:46β†’03:06 | loss_epoch=28
+ [12/30 23:01:18][INFO] [Exp Name]: finetune_
+ [12/30 23:01:18][INFO] [GPU x Batch] = 1 x 1
+ [12/30 23:01:18][INFO] [UnityDataset] Found 6 sequences.
+ [12/30 23:01:18][INFO] [Train Dataset][9/9]: name=unity, size=6, genmo.datasets.unity_dataset.UnityDataset
+ [12/30 23:01:18][INFO] [Train Dataset][All]: ConcatDataset size=6
+ [12/30 23:01:18][INFO]
+ [12/30 23:01:18][INFO] [UnityDataset] Found 6 sequences.
+ [12/30 23:01:18][INFO] [Val Dataset][7/7]: name=unity_val, size=6, genmo.datasets.unity_dataset.UnityDataset
+ [12/30 23:01:18][INFO]
+ [12/30 23:01:26][INFO] [PL-Trainer] Loading ckpt: ./s050000.ckpt
+ [12/30 23:01:45][INFO] [Simple Ckpt Saver]: Save to `outputs/unity/finetune_/version_0/checkpoints'
+ [12/30 23:01:57][INFO] Start Fitting...
+ [12/30 23:01:59][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.
+
+ [12/30 23:01:59][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/30 23:01:59][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/30 23:01:59][INFO] πŸš€[FIT][Epoch 0] Data: unity Experiment: finetune_
+ [12/30 23:02:01][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return F.conv1d(input, weight, bias, self.stride,
+
+ [12/30 23:02:03][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
+
+ [12/30 23:02:07][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:143: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+ warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
+
+ [12/30 23:02:13][WARNING] [VisUnityVal] Failed to read image: third_party/GVHMR/processed_dataset/images/0_biboo_birthday_speech/img_00699.jpg
+ [12/30 23:02:41][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pa_mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 23:02:41][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 23:02:41][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pve', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 23:02:41][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/accel', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 23:02:41][INFO] βœ…[FIT][Epoch 0] finished! 00:43β†’02:53 | loss_epoch=28
+ [12/30 23:09:40][INFO] [Exp Name]: finetune_
+ [12/30 23:09:40][INFO] [GPU x Batch] = 1 x 1
+ [12/30 23:09:41][INFO] [UnityDataset] Found 6 sequences.
+ [12/30 23:09:41][INFO] [Train Dataset][9/9]: name=unity, size=6, genmo.datasets.unity_dataset.UnityDataset
+ [12/30 23:09:41][INFO] [Train Dataset][All]: ConcatDataset size=6
+ [12/30 23:09:41][INFO]
+ [12/30 23:09:41][INFO] [UnityDataset] Found 6 sequences.
+ [12/30 23:09:41][INFO] [Val Dataset][7/7]: name=unity_val, size=6, genmo.datasets.unity_dataset.UnityDataset
+ [12/30 23:09:41][INFO]
+ [12/30 23:09:49][INFO] [PL-Trainer] Loading ckpt: ./s050000.ckpt
+ [12/30 23:10:08][INFO] [Simple Ckpt Saver]: Save to `outputs/unity/finetune_/version_1/checkpoints'
+ [12/30 23:10:17][INFO] Start Fitting...
+ [12/30 23:10:18][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.
+
+ [12/30 23:10:18][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/30 23:10:18][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/30 23:10:18][INFO] πŸš€[FIT][Epoch 0] Data: unity Experiment: finetune_
+ [12/30 23:10:19][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return F.conv1d(input, weight, bias, self.stride,
+
+ [12/30 23:10:20][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
+
+ [12/30 23:10:24][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:143: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+ warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
+
+ [12/30 23:10:44][INFO] [Exp Name]: finetune_
+ [12/30 23:10:44][INFO] [GPU x Batch] = 1 x 1
+ [12/30 23:10:44][INFO] [UnityDataset] Found 6 sequences.
+ [12/30 23:10:44][INFO] [Train Dataset][9/9]: name=unity, size=6, genmo.datasets.unity_dataset.UnityDataset
+ [12/30 23:10:44][INFO] [Train Dataset][All]: ConcatDataset size=6
+ [12/30 23:10:44][INFO]
+ [12/30 23:10:44][INFO] [UnityDataset] Found 6 sequences.
+ [12/30 23:10:44][INFO] [Val Dataset][7/7]: name=unity_val, size=6, genmo.datasets.unity_dataset.UnityDataset
+ [12/30 23:10:44][INFO]
+ [12/30 23:10:52][INFO] [PL-Trainer] Loading ckpt: ./s050000.ckpt
+ [12/30 23:11:04][INFO] [Simple Ckpt Saver]: Save to `outputs/unity/finetune_/version_2/checkpoints'
+ [12/30 23:11:11][INFO] Start Fitting...
+ [12/30 23:11:13][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.
+
+ [12/30 23:11:13][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/30 23:11:13][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/30 23:11:13][INFO] πŸš€[FIT][Epoch 0] Data: unity Experiment: finetune_
+ [12/30 23:11:14][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return F.conv1d(input, weight, bias, self.stride,
+
+ [12/30 23:11:15][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
+
+ [12/30 23:11:19][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:143: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+ warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
+
+ [12/30 23:11:27][WARNING] [VisUnityVal] Failed to read image: third_party/GVHMR/processed_dataset/images/0_biboo_birthday_speech/img_00699.jpg
+ [12/30 23:11:53][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pa_mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 23:11:53][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 23:11:53][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pve', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 23:11:53][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/accel', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 23:11:53][INFO] βœ…[FIT][Epoch 0] finished! 00:40β†’02:43 | loss_epoch=28
+ [12/30 23:29:33][INFO] [Exp Name]: finetune_
+ [12/30 23:29:33][INFO] [GPU x Batch] = 1 x 1
+ [12/30 23:29:33][INFO] [UnityDataset] Found 2 sequences.
+ [12/30 23:29:33][INFO] [Train Dataset][9/9]: name=unity, size=2, genmo.datasets.unity_dataset.UnityDataset
+ [12/30 23:29:33][INFO] [Train Dataset][All]: ConcatDataset size=2
+ [12/30 23:29:33][INFO]
+ [12/30 23:29:33][INFO] [UnityDataset] Found 2 sequences.
+ [12/30 23:29:33][INFO] [Val Dataset][7/7]: name=unity_val, size=2, genmo.datasets.unity_dataset.UnityDataset
+ [12/30 23:29:33][INFO]
+ [12/30 23:29:39][INFO] [PL-Trainer] Loading ckpt: ./s050000.ckpt
+ [12/30 23:30:02][INFO] [Simple Ckpt Saver]: Save to `outputs/unity/finetune_/version_3/checkpoints'
+ [12/30 23:30:13][INFO] Start Fitting...
+ [12/30 23:30:14][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.
+
+ [12/30 23:30:14][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/30 23:30:14][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/30 23:30:14][INFO] πŸš€[FIT][Epoch 0] Data: unity Experiment: finetune_
+ [12/30 23:30:15][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return F.conv1d(input, weight, bias, self.stride,
+
+ [12/30 23:30:17][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
+
+ [12/30 23:30:18][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:143: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+ warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
+
+ [12/30 23:30:30][INFO] [VisUnityVal] e000_0_biboo_birthday_speech root_y0: gt=+0.9930 pred=+0.9643 delta(pred-gt)=-0.0287
+ [12/30 23:30:30][INFO] [VisUnityVal] e000_0_biboo_birthday_speech global_orient0_aa(gt)=[ 0.03876931 -0.17480041 0.02509396] global_orient0_aa(pred)=[-0.1090048 -1.7763788 -0.15125035]
+ [12/30 23:30:30][INFO] [VisUnityVal] e000_0_biboo_birthday_speech global_orient0_yxz_deg gt=(-9.99,+2.34,+1.24) pred=(-101.99,-9.31,-0.53) pred_vs_gt=(-91.73,-11.16,-3.80)
+ [12/30 23:30:30][INFO] [VisUnityVal] e000_0_biboo_birthday_speech yaw0_deg(pred_vs_gt)=+94.23
+ [12/30 23:46:28][INFO] [Exp Name]: finetune_
+ [12/30 23:46:28][INFO] [GPU x Batch] = 1 x 1
+ [12/30 23:46:28][INFO] [UnityDataset] Found 2 sequences.
+ [12/30 23:46:28][INFO] [Train Dataset][9/9]: name=unity, size=2, genmo.datasets.unity_dataset.UnityDataset
+ [12/30 23:46:28][INFO] [Train Dataset][All]: ConcatDataset size=2
+ [12/30 23:46:28][INFO]
+ [12/30 23:46:28][INFO] [UnityDataset] Found 2 sequences.
+ [12/30 23:46:28][INFO] [Val Dataset][7/7]: name=unity_val, size=2, genmo.datasets.unity_dataset.UnityDataset
+ [12/30 23:46:28][INFO]
+ [12/30 23:46:34][INFO] [PL-Trainer] Loading ckpt: ./s050000.ckpt
+ [12/30 23:46:54][INFO] [Simple Ckpt Saver]: Save to `outputs/unity/finetune_/version_4/checkpoints'
+ [12/30 23:47:05][INFO] Start Fitting...
+ [12/30 23:47:07][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.
+
+ [12/30 23:47:07][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/30 23:47:07][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/30 23:47:07][INFO] πŸš€[FIT][Epoch 0] Data: unity Experiment: finetune_
+ [12/30 23:47:09][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return F.conv1d(input, weight, bias, self.stride,
+
+ [12/30 23:47:11][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
+
+ [12/30 23:47:12][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:143: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+ warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
+
+ [12/30 23:47:26][INFO] [VisUnityVal] e000_0_biboo_birthday_speech root_y0: gt=+0.9930 pred=+0.9643 delta(pred-gt)=-0.0287
+ [12/30 23:47:26][INFO] [VisUnityVal] e000_0_biboo_birthday_speech global_orient0_aa(gt)=[ 0.03876931 -0.17480041 0.02509396] global_orient0_aa(pred)=[-0.1090048 -1.7763788 -0.15125035]
+ [12/30 23:47:26][INFO] [VisUnityVal] e000_0_biboo_birthday_speech global_orient0_yxz_deg gt=(-9.99,+2.34,+1.24) pred=(-101.99,-9.31,-0.53) pred_vs_gt=(-91.73,-11.16,-3.80)
+ [12/30 23:47:26][INFO] [VisUnityVal] e000_0_biboo_birthday_speech yaw0_deg(pred_vs_gt)=+94.23
+ [12/30 23:51:42][INFO] [Exp Name]: finetune_
+ [12/30 23:51:42][INFO] [GPU x Batch] = 1 x 1
+ [12/30 23:51:42][INFO] [UnityDataset] Found 2 sequences.
+ [12/30 23:51:42][INFO] [Train Dataset][9/9]: name=unity, size=2, genmo.datasets.unity_dataset.UnityDataset
+ [12/30 23:51:42][INFO] [Train Dataset][All]: ConcatDataset size=2
+ [12/30 23:51:42][INFO]
+ [12/30 23:51:42][INFO] [UnityDataset] Found 2 sequences.
+ [12/30 23:51:42][INFO] [Val Dataset][7/7]: name=unity_val, size=2, genmo.datasets.unity_dataset.UnityDataset
+ [12/30 23:51:42][INFO]
+ [12/30 23:51:48][INFO] [PL-Trainer] Loading ckpt: ./s050000.ckpt
+ [12/30 23:52:04][INFO] [Simple Ckpt Saver]: Save to `outputs/unity/finetune_/version_5/checkpoints'
+ [12/30 23:52:15][INFO] Start Fitting...
+ [12/30 23:52:16][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.
+
+ [12/30 23:52:16][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/30 23:52:16][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/30 23:52:16][INFO] πŸš€[FIT][Epoch 0] Data: unity Experiment: finetune_
+ [12/30 23:52:18][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return F.conv1d(input, weight, bias, self.stride,
+
+ [12/30 23:52:20][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
+
+ [12/30 23:52:21][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:143: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+ warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
+
+ [12/30 23:52:35][INFO] [VisUnityVal] e000_0_biboo_birthday_speech root_y0: gt=+0.9930 pred=+0.9643 delta(pred-gt)=-0.0287
+ [12/30 23:52:35][INFO] [VisUnityVal] e000_0_biboo_birthday_speech global_orient0_aa(gt)=[ 0.03876931 -0.17480041 0.02509396] global_orient0_aa(pred)=[-0.1090048 -1.7763788 -0.15125035]
+ [12/30 23:52:35][INFO] [VisUnityVal] e000_0_biboo_birthday_speech global_orient0_yxz_deg gt=(-9.99,+2.34,+1.24) pred=(-101.99,-9.31,-0.53) pred_vs_gt=(-91.73,-11.16,-3.80)
+ [12/30 23:52:35][INFO] [VisUnityVal] e000_0_biboo_birthday_speech yaw0_deg(pred_vs_gt)=+94.23
+ [12/30 23:53:24][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pa_mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 23:53:24][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 23:53:24][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pve', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 23:53:24][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/accel', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 23:53:24][INFO] βœ…[FIT][Epoch 0] finished! 01:08β†’04:34 | loss_epoch=24.5
+ [12/30 23:55:59][INFO] [Exp Name]: finetune_
+ [12/30 23:55:59][INFO] [GPU x Batch] = 1 x 1
+ [12/30 23:55:59][INFO] [UnityDataset] Found 2 sequences.
+ [12/30 23:55:59][INFO] [Train Dataset][9/9]: name=unity, size=2, genmo.datasets.unity_dataset.UnityDataset
+ [12/30 23:55:59][INFO] [Train Dataset][All]: ConcatDataset size=2
+ [12/30 23:55:59][INFO]
+ [12/30 23:55:59][INFO] [UnityDataset] Found 2 sequences.
+ [12/30 23:55:59][INFO] [Val Dataset][7/7]: name=unity_val, size=2, genmo.datasets.unity_dataset.UnityDataset
+ [12/30 23:55:59][INFO]
+ [12/30 23:56:06][INFO] [PL-Trainer] Loading ckpt: ./s050000.ckpt
+ [12/30 23:56:23][INFO] [Simple Ckpt Saver]: Save to `outputs/unity/finetune_/version_6/checkpoints'
+ [12/30 23:56:35][INFO] Start Fitting...
+ [12/30 23:56:37][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.
+
+ [12/30 23:56:37][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/30 23:56:37][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/30 23:56:37][INFO] πŸš€[FIT][Epoch 0] Data: unity Experiment: finetune_
+ [12/30 23:56:39][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return F.conv1d(input, weight, bias, self.stride,
+
+ [12/30 23:56:41][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
+
+ [12/30 23:56:42][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:143: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+ warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
+
+ [12/30 23:56:54][INFO] [VisUnityVal] e000_0_biboo_birthday_speech root_y0: gt=+0.9930 pred=+0.9643 delta(pred-gt)=-0.0287
+ [12/30 23:56:54][INFO] [VisUnityVal] e000_0_biboo_birthday_speech global_orient0_aa(gt)=[ 0.03876931 -0.17480041 0.02509396] global_orient0_aa(pred)=[-0.1090048 -1.7763788 -0.15125035]
+ [12/30 23:56:54][INFO] [VisUnityVal] e000_0_biboo_birthday_speech global_orient0_yxz_deg gt=(-9.99,+2.34,+1.24) pred=(-101.99,-9.31,-0.53) pred_vs_gt=(-91.73,-11.16,-3.80)
+ [12/30 23:56:54][INFO] [VisUnityVal] e000_0_biboo_birthday_speech yaw0_deg(pred_vs_gt)=+94.23
+ [12/30 23:57:45][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pa_mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 23:57:45][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 23:57:45][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pve', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 23:57:45][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/accel', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/30 23:57:45][INFO] βœ…[FIT][Epoch 0] finished! 01:09β†’04:38 | loss_epoch=24.5
+ [12/30 23:58:35][INFO] [Exp Name]: finetune_
+ [12/30 23:58:35][INFO] [GPU x Batch] = 1 x 1
+ [12/30 23:58:35][INFO] [UnityDataset] Found 1 sequences.
+ [12/30 23:58:35][INFO] [Train Dataset][9/9]: name=unity, size=1, genmo.datasets.unity_dataset.UnityDataset
+ [12/30 23:58:35][INFO] [Train Dataset][All]: ConcatDataset size=1
+ [12/30 23:58:35][INFO]
+ [12/30 23:58:35][INFO] [UnityDataset] Found 1 sequences.
+ [12/30 23:58:35][INFO] [Val Dataset][7/7]: name=unity_val, size=1, genmo.datasets.unity_dataset.UnityDataset
+ [12/30 23:58:35][INFO]
+ [12/30 23:58:44][INFO] [PL-Trainer] Loading ckpt: ./s050000.ckpt
+ [12/30 23:59:06][INFO] [Simple Ckpt Saver]: Save to `outputs/unity/finetune_/version_7/checkpoints'
+ [12/30 23:59:18][INFO] Start Fitting...
+ [12/30 23:59:20][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.
+
+ [12/30 23:59:20][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/30 23:59:20][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/30 23:59:20][INFO] πŸš€[FIT][Epoch 0] Data: unity Experiment: finetune_
+ [12/30 23:59:22][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return F.conv1d(input, weight, bias, self.stride,
+
+ [12/30 23:59:24][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
+
+ [12/30 23:59:24][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:143: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+ warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
+
+ [12/30 23:59:36][INFO] [VisUnityVal] e000_100_biboo_birthday_speech_explosion_1 root_y0: gt=+0.9944 pred=+0.9685 delta(pred-gt)=-0.0259
+ [12/30 23:59:36][INFO] [VisUnityVal] e000_100_biboo_birthday_speech_explosion_1 global_orient0_aa(gt)=[0.02056097 0.18737577 0.01068786] global_orient0_aa(pred)=[ 0.0337113 -2.8594027 -0.01747983]
+ [12/30 23:59:36][INFO] [VisUnityVal] e000_100_biboo_birthday_speech_explosion_1 global_orient0_yxz_deg gt=(+10.74,+1.11,+0.72) pred=(-163.84,-0.50,-1.42) pred_vs_gt=(-174.54,-1.98,-1.80)
+ [12/30 23:59:36][INFO] [VisUnityVal] e000_100_biboo_birthday_speech_explosion_1 yaw0_deg(pred_vs_gt)=+174.27
+ [12/31 02:50:01][INFO] [Exp Name]: finetune_
+ [12/31 02:50:01][INFO] [GPU x Batch] = 1 x 1
+ [12/31 02:50:01][INFO] [UnityDataset] Found 1 sequences.
+ [12/31 02:50:01][INFO] [Train Dataset][9/9]: name=unity, size=1, genmo.datasets.unity_dataset.UnityDataset
+ [12/31 02:50:01][INFO] [Train Dataset][All]: ConcatDataset size=1
+ [12/31 02:50:01][INFO]
+ [12/31 02:50:01][INFO] [UnityDataset] Found 1 sequences.
+ [12/31 02:50:01][INFO] [Val Dataset][7/7]: name=unity_val, size=1, genmo.datasets.unity_dataset.UnityDataset
+ [12/31 02:50:01][INFO]
+ [12/31 02:50:07][INFO] [PL-Trainer] Loading ckpt: ./s050000.ckpt
+ [12/31 02:50:28][INFO] [Simple Ckpt Saver]: Save to `outputs/unity/finetune_/version_8/checkpoints'
+ [12/31 02:50:41][INFO] Start Fitting...
+ [12/31 02:50:42][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.
+
+ [12/31 02:50:42][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/31 02:50:42][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/31 02:50:42][INFO] πŸš€[FIT][Epoch 0] Data: unity Experiment: finetune_
+ [12/31 02:50:43][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return F.conv1d(input, weight, bias, self.stride,
+
+ [12/31 02:50:45][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
+
+ [12/31 02:50:45][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:143: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+ warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
+
+ [12/31 02:50:55][INFO] [VisUnityVal] e000_0_biboo_birthday_speech root_y0: gt=+0.9875 pred=+0.9726 delta(pred-gt)=-0.0149
+ [12/31 02:50:55][INFO] [VisUnityVal] e000_0_biboo_birthday_speech global_orient0_aa(gt)=[ 0.01689476 -0.20703591 0.01797612] global_orient0_aa(pred)=[-0.0321125 -2.8486555 -0.07525362]
+ [12/31 02:50:55][INFO] [VisUnityVal] e000_0_biboo_birthday_speech global_orient0_yxz_deg gt=(-11.85,+1.07,+0.92) pred=(-163.30,-3.15,+0.83) pred_vs_gt=(-151.41,-4.11,-0.96)
+ [12/31 02:50:55][INFO] [VisUnityVal] e000_0_biboo_birthday_speech yaw0_deg(pred_vs_gt)=+151.99
+ [12/31 02:51:41][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pa_mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 02:51:41][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 02:51:41][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pve', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 02:51:41][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/accel', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 02:51:41][INFO] βœ…[FIT][Epoch 0] finished! 01:00β†’04:01 | loss_epoch=12.6
+ [12/31 03:10:22][INFO] [Exp Name]: finetune_
+ [12/31 03:10:22][INFO] [GPU x Batch] = 1 x 1
+ [12/31 03:10:22][INFO] [UnityDataset] Found 1 sequences.
+ [12/31 03:10:22][INFO] [Train Dataset][9/9]: name=unity, size=1, genmo.datasets.unity_dataset.UnityDataset
+ [12/31 03:10:22][INFO] [Train Dataset][All]: ConcatDataset size=1
+ [12/31 03:10:22][INFO]
+ [12/31 03:10:22][INFO] [UnityDataset] Found 1 sequences.
+ [12/31 03:10:22][INFO] [Val Dataset][7/7]: name=unity_val, size=1, genmo.datasets.unity_dataset.UnityDataset
+ [12/31 03:10:22][INFO]
+ [12/31 03:10:28][INFO] [PL-Trainer] Loading ckpt: ./s050000.ckpt
+ [12/31 03:10:51][INFO] [Simple Ckpt Saver]: Save to `outputs/unity/finetune_/version_9/checkpoints'
+ [12/31 03:11:01][INFO] Start Fitting...
+ [12/31 03:11:03][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.
+
+ [12/31 03:11:03][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/31 03:11:03][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/31 03:11:03][INFO] πŸš€[FIT][Epoch 0] Data: unity Experiment: finetune_
+ [12/31 03:11:04][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return F.conv1d(input, weight, bias, self.stride,
+
+ [12/31 03:11:05][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
+
+ [12/31 03:11:05][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:143: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+ warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
+
+ [12/31 03:11:16][INFO] [VisUnityVal] e000_0_biboo_birthday_speech root_y0: gt=+0.9875 pred=+0.9726 delta(pred-gt)=-0.0149
+ [12/31 03:11:16][INFO] [VisUnityVal] e000_0_biboo_birthday_speech global_orient0_aa(gt)=[ 0.02646996 2.9343371 -0.02487765] global_orient0_aa(pred)=[-0.03202499 -2.848779 -0.0755955 ]
+ [12/31 03:11:16][INFO] [VisUnityVal] e000_0_biboo_birthday_speech global_orient0_yxz_deg gt=(+168.15,+1.07,+0.92) pred=(-163.31,-3.16,+0.82) pred_vs_gt=(+28.58,+4.12,+0.97)
+ [12/31 03:11:16][INFO] [VisUnityVal] e000_0_biboo_birthday_speech yaw0_deg(pred_vs_gt)=-28.01
+ [12/31 03:12:02][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pa_mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 03:12:02][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 03:12:02][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pve', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 03:12:02][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/accel', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 03:12:02][INFO] βœ…[FIT][Epoch 0] finished! 01:00β†’04:01 | loss_epoch=14.2
+ [12/31 03:16:57][INFO] [Exp Name]: finetune_
+ [12/31 03:16:57][INFO] [GPU x Batch] = 1 x 1
+ [12/31 03:16:57][INFO] [UnityDataset] Found 1 sequences.
+ [12/31 03:16:57][INFO] [Train Dataset][9/9]: name=unity, size=1, genmo.datasets.unity_dataset.UnityDataset
+ [12/31 03:16:57][INFO] [Train Dataset][All]: ConcatDataset size=1
+ [12/31 03:16:57][INFO]
+ [12/31 03:16:57][INFO] [UnityDataset] Found 1 sequences.
+ [12/31 03:16:57][INFO] [Val Dataset][7/7]: name=unity_val, size=1, genmo.datasets.unity_dataset.UnityDataset
+ [12/31 03:16:57][INFO]
+ [12/31 03:17:04][INFO] [PL-Trainer] Loading ckpt: ./s050000.ckpt
+ [12/31 03:17:24][INFO] [Simple Ckpt Saver]: Save to `outputs/unity/finetune_/version_10/checkpoints'
+ [12/31 03:17:36][INFO] Start Fitting...
+ [12/31 03:17:38][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.
+
+ [12/31 03:17:38][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/31 03:17:38][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/31 03:17:38][INFO] πŸš€[FIT][Epoch 0] Data: unity Experiment: finetune_
+ [12/31 03:17:40][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return F.conv1d(input, weight, bias, self.stride,
+
+ [12/31 03:17:41][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
+
+ [12/31 03:17:41][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:143: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+ warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
+
+ [12/31 03:17:52][INFO] [VisUnityVal] e000_100_biboo_birthday_speech_explosion_1 root_y0: gt=+0.9944 pred=+0.9686 delta(pred-gt)=-0.0258
+ [12/31 03:17:52][INFO] [VisUnityVal] e000_100_biboo_birthday_speech_explosion_1 global_orient0_aa(gt)=[-0.01583315 -2.9540217 0.03045931] global_orient0_aa(pred)=[ 0.03382589 -2.8592563 -0.01758517]
+ [12/31 03:17:52][INFO] [VisUnityVal] e000_100_biboo_birthday_speech_explosion_1 global_orient0_yxz_deg gt=(-169.26,+1.11,+0.72) pred=(-163.83,-0.50,-1.43) pred_vs_gt=(+5.47,+1.99,+1.81)
+ [12/31 03:17:52][INFO] [VisUnityVal] e000_100_biboo_birthday_speech_explosion_1 yaw0_deg(pred_vs_gt)=-5.75
+ [12/31 03:18:36][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pa_mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 03:18:36][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 03:18:36][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pve', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 03:18:36][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/accel', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
3698
+
3699
+ [12/31 03:18:36][INFO] βœ…[FIT][Epoch 0] finished! 00:58β†’03:55 | loss_epoch=23
3700
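The four `sync_dist` recommendations are harmless on this 1-GPU run, but they point at a real multi-GPU caveat: epoch-level metrics logged without `sync_dist=True` are not reduced across ranks. The fix is the one Lightning suggests, applied wherever the Unity validation metrics are logged (a sketch; the actual call sites live in the model code):

# Inside the LightningModule, where pa_mpjpe/mpjpe/pve/accel are computed:
for name, value in {"pa_mpjpe": pa_mpjpe, "mpjpe": mpjpe,
                    "pve": pve, "accel": accel}.items():
    self.log(f"val_metric_Unity/{name}", value, on_epoch=True, sync_dist=True)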
+ [12/31 06:06:15][INFO] [Exp Name]: finetune_
+ [12/31 06:06:15][INFO] [GPU x Batch] = 1 x 1
+ [12/31 06:06:15][INFO] [UnityDataset] Found 3 sequences.
+ [12/31 06:06:15][INFO] [Train Dataset][9/9]: name=unity, size=3, genmo.datasets.unity_dataset.UnityDataset
+ [12/31 06:06:15][INFO] [Train Dataset][All]: ConcatDataset size=3
+ [12/31 06:06:15][INFO]
+ [12/31 06:06:15][INFO] [UnityDataset] Found 3 sequences.
+ [12/31 06:06:15][INFO] [Val Dataset][7/7]: name=unity_val, size=3, genmo.datasets.unity_dataset.UnityDataset
+ [12/31 06:06:15][INFO]
+ [12/31 06:06:21][INFO] [PL-Trainer] Loading ckpt: ./s050000.ckpt
+ [12/31 06:06:49][INFO] [Simple Ckpt Saver]: Save to `outputs/unity/finetune_/version_11/checkpoints'
+ [12/31 06:07:02][INFO] Start Fitting...
+ [12/31 06:07:04][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.
+
+ [12/31 06:07:04][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/31 06:07:04][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
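The dataloader warnings recur on every run because the loaders are created with few or zero workers. With only 1-3 short sequences the impact is negligible, but the suggested remedy is the standard `DataLoader` argument (a generic sketch; the real loaders are built from the Hydra config):

from torch.utils.data import DataLoader

loader = DataLoader(
    dataset,                  # e.g. the UnityDataset / ConcatDataset above
    batch_size=1,
    num_workers=11,           # the value the warning suggests for this machine
    persistent_workers=True,  # keep workers alive across epochs
    pin_memory=True,
)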
+ [12/31 06:07:04][INFO] πŸš€[FIT][Epoch 0] Data: unity Experiment: finetune_
+ [12/31 06:07:07][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return F.conv1d(input, weight, bias, self.stride,
+
+ [12/31 06:07:09][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
+
+ [12/31 06:07:11][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:143: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+ warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
+
+ [12/31 06:07:22][INFO] [VisUnityVal] e000_0_biboo_birthday_speech root_y0: gt=+0.9820 pred=+0.9698 delta(pred-gt)=-0.0123
+ [12/31 06:07:22][INFO] [VisUnityVal] e000_0_biboo_birthday_speech global_orient0_aa(gt)=[ 0.03590106 -0.17807975 0.02012725] global_orient0_aa(pred)=[-0.08420898 -2.6493108 -0.07150012]
+ [12/31 06:07:22][INFO] [VisUnityVal] e000_0_biboo_birthday_speech global_orient0_yxz_deg gt=(-10.19,+2.15,+0.96) pred=(-151.99,-3.76,+2.70) pred_vs_gt=(-141.82,-6.13,+0.67)
+ [12/31 06:07:22][INFO] [VisUnityVal] e000_0_biboo_birthday_speech yaw0_deg(pred_vs_gt)=+144.65
+ [12/31 06:08:17][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pa_mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 06:08:17][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 06:08:17][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pve', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 06:08:17][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/accel', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 06:08:17][INFO] βœ…[FIT][Epoch 0] finished! 01:14β†’04:57 | loss_epoch=41.9
+ [12/31 06:13:34][INFO] [Exp Name]: finetune_
+ [12/31 06:13:34][INFO] [GPU x Batch] = 1 x 1
+ [12/31 06:13:34][INFO] [UnityDataset] Found 3 sequences.
+ [12/31 06:13:34][INFO] [Train Dataset][9/9]: name=unity, size=3, genmo.datasets.unity_dataset.UnityDataset
+ [12/31 06:13:34][INFO] [Train Dataset][All]: ConcatDataset size=3
+ [12/31 06:13:34][INFO]
+ [12/31 06:13:34][INFO] [UnityDataset] Found 3 sequences.
+ [12/31 06:13:34][INFO] [Val Dataset][7/7]: name=unity_val, size=3, genmo.datasets.unity_dataset.UnityDataset
+ [12/31 06:13:34][INFO]
+ [12/31 06:13:43][INFO] [Exp Name]: finetune_
+ [12/31 06:13:43][INFO] [GPU x Batch] = 1 x 1
+ [12/31 06:13:43][INFO] [UnityDataset] Found 1 sequences.
+ [12/31 06:13:43][INFO] [Train Dataset][9/9]: name=unity, size=1, genmo.datasets.unity_dataset.UnityDataset
+ [12/31 06:13:43][INFO] [Train Dataset][All]: ConcatDataset size=1
+ [12/31 06:13:43][INFO]
+ [12/31 06:13:43][INFO] [UnityDataset] Found 1 sequences.
+ [12/31 06:13:43][INFO] [Val Dataset][7/7]: name=unity_val, size=1, genmo.datasets.unity_dataset.UnityDataset
+ [12/31 06:13:43][INFO]
+ [12/31 06:13:48][INFO] [PL-Trainer] Loading ckpt: ./s050000.ckpt
+ [12/31 06:14:11][INFO] [Simple Ckpt Saver]: Save to `outputs/unity/finetune_/version_12/checkpoints'
+ [12/31 06:14:22][INFO] Start Fitting...
+ [12/31 06:14:26][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.
+
+ [12/31 06:14:26][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/31 06:14:26][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/31 06:14:26][INFO] πŸš€[FIT][Epoch 0] Data: unity Experiment: finetune_
+ [12/31 06:14:28][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return F.conv1d(input, weight, bias, self.stride,
+
+ [12/31 06:14:30][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
+
+ [12/31 06:14:30][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:143: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+ warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
+
+ [12/31 06:14:41][INFO] [VisUnityVal] e000_0_biboo_birthday_speech root_y0: gt=+0.9875 pred=+0.9726 delta(pred-gt)=-0.0149
+ [12/31 06:14:41][INFO] [VisUnityVal] e000_0_biboo_birthday_speech global_orient0_aa(gt)=[ 0.01689476 -0.20703594 0.01797612] global_orient0_aa(pred)=[-0.03202499 -2.848779 -0.0755955 ]
+ [12/31 06:14:41][INFO] [VisUnityVal] e000_0_biboo_birthday_speech global_orient0_yxz_deg gt=(-11.85,+1.07,+0.92) pred=(-163.31,-3.16,+0.82) pred_vs_gt=(-151.42,-4.12,-0.97)
+ [12/31 06:14:41][INFO] [VisUnityVal] e000_0_biboo_birthday_speech yaw0_deg(pred_vs_gt)=+151.99
+ [12/31 06:15:23][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pa_mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 06:15:23][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 06:15:23][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pve', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 06:15:23][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/accel', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 06:15:23][INFO] βœ…[FIT][Epoch 0] finished! 01:00β†’04:00 | loss_epoch=14.3
+ [12/31 06:19:20][INFO] [Exp Name]: finetune_
+ [12/31 06:19:20][INFO] [GPU x Batch] = 1 x 1
+ [12/31 06:19:20][INFO] [UnityDataset] Found 1 sequences.
+ [12/31 06:19:20][INFO] [Train Dataset][9/9]: name=unity, size=1, genmo.datasets.unity_dataset.UnityDataset
+ [12/31 06:19:20][INFO] [Train Dataset][All]: ConcatDataset size=1
+ [12/31 06:19:20][INFO]
+ [12/31 06:19:20][INFO] [UnityDataset] Found 1 sequences.
+ [12/31 06:19:20][INFO] [Val Dataset][7/7]: name=unity_val, size=1, genmo.datasets.unity_dataset.UnityDataset
+ [12/31 06:19:20][INFO]
+ [12/31 06:19:26][INFO] [PL-Trainer] Loading ckpt: ./s050000.ckpt
+ [12/31 06:19:48][INFO] [Simple Ckpt Saver]: Save to `outputs/unity/finetune_/version_13/checkpoints'
+ [12/31 06:19:59][INFO] Start Fitting...
+ [12/31 06:20:01][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.
+
+ [12/31 06:20:01][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/31 06:20:01][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
+
+ [12/31 06:20:01][INFO] πŸš€[FIT][Epoch 0] Data: unity Experiment: finetune_
+ [12/31 06:20:04][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return F.conv1d(input, weight, bias, self.stride,
+
+ [12/31 06:20:06][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
+
+ [12/31 06:20:06][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:143: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+ warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
+
+ [12/31 06:20:16][INFO] [VisUnityVal] e000_0_biboo_birthday_speech root_y0: gt=+0.9875 pred=+0.9726 delta(pred-gt)=-0.0149
+ [12/31 06:20:16][INFO] [VisUnityVal] e000_0_biboo_birthday_speech global_orient0_aa(gt)=[ 0.02646996 2.9343371 -0.02487765] global_orient0_aa(pred)=[-0.03202499 -2.848779 -0.0755955 ]
+ [12/31 06:20:16][INFO] [VisUnityVal] e000_0_biboo_birthday_speech global_orient0_yxz_deg gt=(+168.15,+1.07,+0.92) pred=(-163.31,-3.16,+0.82) pred_vs_gt=(+28.58,+4.12,+0.97)
+ [12/31 06:20:16][INFO] [VisUnityVal] e000_0_biboo_birthday_speech yaw0_deg(pred_vs_gt)=-28.01
+ [12/31 06:20:59][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pa_mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 06:20:59][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/mpjpe', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 06:20:59][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/pve', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 06:20:59][WARNING] /root/miniconda3/envs/gvhmr/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('val_metric_Unity/accel', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
+
+ [12/31 06:20:59][INFO] βœ…[FIT][Epoch 0] finished! 00:59β†’03:56 | loss_epoch=14.2
train.sh CHANGED
@@ -1 +1,9 @@
- python scripts/train.py --config-name finetune_unity ckpt_path=./s050000.ckpt
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ # Make sure local repo modules (incl. `third_party/*`) are importable.
+ export PYTHONPATH="$(pwd)${PYTHONPATH:+:$PYTHONPATH}"
+ # GVHMR uses absolute imports like `import hmr4d...` internally.
+ export PYTHONPATH="$(pwd)/third_party/GVHMR${PYTHONPATH:+:$PYTHONPATH}"
+
+ python scripts/train.py --config-name finetune_unity ckpt_path=./s050000.ckpt
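A quick way to confirm the two PYTHONPATH exports took effect is to try the imports the comment mentions before launching training (a hypothetical check, not part of the commit):

# Run from the repo root with the same exports in the environment:
import genmo, hmr4d  # hmr4d should resolve from third_party/GVHMR
print("genmo ->", genmo.__file__)
print("hmr4d ->", hmr4d.__file__)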