Spaces:
Running on Zero
Running on Zero
File size: 7,764 Bytes
e340a84 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 | import argparse
import os
import sys
import yaml
def default_config_path() -> str:
    """Return the default location of ``longstream_infer.yaml``.

    The result is ``configs/longstream_infer.yaml`` joined onto the path
    obtained by applying ``os.path.dirname`` three times to this module's
    ``__file__``.
    """
    base_dir = __file__
    # Strip the file name, then climb two more directory levels.
    for _ in range(3):
        base_dir = os.path.dirname(base_dir)
    return os.path.join(base_dir, "configs", "longstream_infer.yaml")
def add_runtime_arguments(parser):
    """Register the runtime command-line options on ``parser``.

    ``--config`` defaults to ``default_config_path()``; every other value
    option defaults to ``None``. The boolean switches are ``store_true``
    flags. ``--keyframes-per-batch`` is stored under
    ``keyframes_per_batch_legacy`` and is hidden from the help output.

    Options are registered in a fixed order, which is also the order in
    which they appear in ``--help``.

    Returns:
        The same ``parser`` object that was passed in.
    """
    add = parser.add_argument

    # --- config file and data selection -------------------------------
    add(
        "--config",
        help="Path to longstream config yaml.",
        default=default_config_path(),
    )
    add(
        "--dataset",
        help="Optional dataset hint. Generic format works without it.",
        default=None,
    )
    add("--img-path", default=None)
    add(
        "--seq-list",
        help="Comma-separated sequence names. Default: auto-detect all sequences.",
        default=None,
    )
    add("--format", help="generalizable", default=None)

    # --- plain string options without help text -----------------------
    plain_options = (
        "--data-roots-file",
        "--camera",
        "--output-root",
        "--device",
        "--checkpoint",
        "--hf-repo",
        "--hf-file",
    )
    for flag in plain_options:
        add(flag, default=None)

    # --- inference behaviour ------------------------------------------
    add("--mode", help="batch_refresh | streaming_refresh", default=None)
    add("--streaming-mode", help="causal | window", default=None)
    add("--window-size", type=int, default=None)
    add("--keyframe-stride", type=int, default=None)
    add(
        "--refresh",
        type=int,
        help="Number of keyframes per refresh span, inclusive of both ends and including the segment start keyframe.",
        default=None,
    )
    # Hidden option kept under a separate destination name.
    add(
        "--keyframes-per-batch",
        type=int,
        dest="keyframes_per_batch_legacy",
        help=argparse.SUPPRESS,
        default=None,
    )

    # --- numeric limits and thresholds --------------------------------
    typed_options = (
        ("--max-frames", int),
        ("--depth-rel-delta-threshold", float),
        ("--point-f1-threshold", float),
        ("--eval-max-points", int),
        ("--eval-voxel-size", float),
        ("--max-full-pointcloud-points", int),
        ("--max-frame-pointcloud-points", int),
    )
    for flag, value_type in typed_options:
        add(flag, type=value_type, default=None)

    # --- boolean switches ---------------------------------------------
    switches = (
        "--save-frame-points",
        "--no-save-frame-points",
        "--no-align-scale",
        "--mask-sky",
        "--no-mask-sky",
    )
    for flag in switches:
        add(flag, action="store_true")

    return parser
def parse_runtime_args(parser):
    """Parse ``sys.argv[1:]`` with ``parser``, skipping blank arguments.

    Arguments that are empty or consist only of whitespace are dropped
    before parsing; every other argument is passed through unchanged.

    Returns:
        Whatever ``parser.parse_args`` returns for the filtered list.
    """
    cli_tokens = sys.argv[1:]
    # ``str.strip`` yields an empty (falsy) string for blank tokens.
    meaningful_tokens = list(filter(str.strip, cli_tokens))
    return parser.parse_args(meaningful_tokens)
def _ensure_section(mapping, key):
    """Return ``mapping[key]``, storing ``{}`` first if it is missing or ``None``.

    A YAML key written without a value (for example ``data:``) loads as
    ``None``. ``dict.setdefault`` would keep that ``None``, and the next item
    assignment on the section would fail.
    """
    section = mapping.get(key)
    if section is None:
        section = {}
        mapping[key] = section
    return section


def _set_if_given(cfg, section, key, value):
    """Store ``value`` at ``cfg[section][key]`` unless ``value`` is ``None``."""
    if value is not None:
        _ensure_section(cfg, section)[key] = value


def load_config_with_overrides(args):
    """Load the YAML config at ``args.config`` and apply command-line overrides.

    An option left at ``None`` (or a switch left ``False``) does not touch
    the loaded config. Sections are created on demand; a section that is
    present in the file but empty is treated as an empty mapping.

    Args:
        args: Parsed namespace exposing the attributes registered by
            ``add_runtime_arguments`` (``config``, ``device``, ``dataset``,
            ``refresh``, ``keyframes_per_batch_legacy``, ...).

    Returns:
        The config dict. It always contains the ``model``, ``inference`` and
        ``data`` sections, and ``data["format"]`` is always
        ``"generalizable"``.

    Raises:
        OSError: If the config file cannot be opened.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # locale encoding.
    with open(args.config, "r", encoding="utf-8") as f:
        cfg = yaml.safe_load(f) or {}

    model_cfg = _ensure_section(cfg, "model")

    # ``device`` is a top-level key, not part of any section.
    if args.device is not None:
        cfg["device"] = args.device

    _set_if_given(cfg, "output", "root", args.output_root)

    # --- data ----------------------------------------------------------
    _set_if_given(cfg, "data", "dataset", args.dataset)
    _set_if_given(cfg, "data", "img_path", args.img_path)
    if args.seq_list is not None:
        # Comma-separated names; blank entries are discarded.
        seqs = [s.strip() for s in args.seq_list.split(",") if s.strip()]
        _ensure_section(cfg, "data")["seq_list"] = seqs
    _set_if_given(cfg, "data", "format", args.format)
    _set_if_given(cfg, "data", "data_roots_file", args.data_roots_file)
    _set_if_given(cfg, "data", "camera", args.camera)
    _set_if_given(cfg, "data", "max_frames", args.max_frames)

    # --- model ---------------------------------------------------------
    if args.checkpoint is not None:
        model_cfg["checkpoint"] = args.checkpoint
    if args.hf_repo is not None or args.hf_file is not None:
        hf_cfg = _ensure_section(model_cfg, "hf")
        if args.hf_repo is not None:
            hf_cfg["repo_id"] = args.hf_repo
        if args.hf_file is not None:
            hf_cfg["filename"] = args.hf_file
        # Make sure the "checkpoint" key exists (as None) when unset.
        model_cfg.setdefault("checkpoint", None)

    # --- inference -----------------------------------------------------
    _set_if_given(cfg, "inference", "mode", args.mode)
    _set_if_given(cfg, "inference", "streaming_mode", args.streaming_mode)
    if args.window_size is not None:
        # The window size is mirrored into the model's longstream_cfg.
        _ensure_section(cfg, "inference")["window_size"] = args.window_size
        longstream_cfg = _ensure_section(model_cfg, "longstream_cfg")
        longstream_cfg["window_size"] = args.window_size
    if args.keyframe_stride is not None:
        # The stride is mirrored into the model's rel_pose_head_cfg.
        _ensure_section(cfg, "inference")["keyframe_stride"] = args.keyframe_stride
        longstream_cfg = _ensure_section(model_cfg, "longstream_cfg")
        rel_pose_cfg = _ensure_section(longstream_cfg, "rel_pose_head_cfg")
        rel_pose_cfg["keyframe_stride"] = args.keyframe_stride

    # ``--refresh`` wins; otherwise the legacy option is converted as
    # refresh = keyframes_per_batch + 1.
    refresh = args.refresh
    if refresh is None and args.keyframes_per_batch_legacy is not None:
        refresh = args.keyframes_per_batch_legacy + 1
    _set_if_given(cfg, "inference", "refresh", refresh)

    # --- evaluation thresholds -------------------------------------------
    _set_if_given(
        cfg, "evaluation", "depth_rel_delta_threshold", args.depth_rel_delta_threshold
    )
    _set_if_given(cfg, "evaluation", "point_f1_threshold", args.point_f1_threshold)
    _set_if_given(cfg, "evaluation", "point_eval_max_points", args.eval_max_points)
    _set_if_given(cfg, "evaluation", "point_eval_voxel_size", args.eval_voxel_size)

    # --- output limits ---------------------------------------------------
    _set_if_given(
        cfg, "output", "max_full_pointcloud_points", args.max_full_pointcloud_points
    )
    _set_if_given(
        cfg, "output", "max_frame_pointcloud_points", args.max_frame_pointcloud_points
    )

    # --- boolean switches ------------------------------------------------
    # When both the positive and the "--no-" switch are given, the "--no-"
    # switch is applied last and therefore wins.
    if args.save_frame_points:
        _ensure_section(cfg, "output")["save_frame_points"] = True
    if args.no_save_frame_points:
        _ensure_section(cfg, "output")["save_frame_points"] = False
    if args.no_align_scale:
        _ensure_section(cfg, "evaluation")["align_scale"] = False
    if args.mask_sky:
        _ensure_section(cfg, "output")["mask_sky"] = True
    if args.no_mask_sky:
        _ensure_section(cfg, "output")["mask_sky"] = False

    # A config file that only carries "keyframes_per_batch" gets the same
    # +1 conversion as the legacy command-line option.
    infer_cfg = _ensure_section(cfg, "inference")
    if "refresh" not in infer_cfg and "keyframes_per_batch" in infer_cfg:
        infer_cfg["refresh"] = int(infer_cfg["keyframes_per_batch"]) + 1

    # NOTE(review): the format is forced unconditionally, so both ``--format``
    # and any "format" value in the config file are overwritten here.
    _ensure_section(cfg, "data")["format"] = "generalizable"
    return cfg
|