hcy511 committed on
Commit
ce6a2bf
·
verified ·
1 Parent(s): 6220fe9

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-0.3+x-action_queries+layerwise_decay--image_aug--VLA-Adapter--90----45000_chkpt/dataset_statistics.json +133 -0
  2. output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-0.3+x-action_queries+layerwise_decay--image_aug--VLA-Adapter--90----45000_chkpt/merges.txt +0 -0
  3. output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-0.3+x-action_queries+layerwise_decay--image_aug--VLA-Adapter--90----45000_chkpt/processing_prismatic.py +257 -0
  4. output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-0.3+x-action_queries+layerwise_decay--image_aug--VLA-Adapter--90----45000_chkpt/special_tokens_map.json +31 -0
  5. output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-0.3+x-action_queries+layerwise_decay--image_aug--VLA-Adapter--90----45000_chkpt/vocab.json +0 -0
  6. output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-1.0+x-action_queries+layerwise_decay_cosine--image_aug--VLA-Adapter--90----5000_chkpt/added_tokens.json +24 -0
  7. output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-1.0+x-action_queries+layerwise_decay_cosine--image_aug--VLA-Adapter--90----5000_chkpt/dataset_statistics.json +133 -0
  8. output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-1.0+x-action_queries+layerwise_decay_cosine--image_aug--VLA-Adapter--90----5000_chkpt/generation_config.json +7 -0
  9. output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-1.0+x-action_queries+layerwise_decay_cosine--image_aug--VLA-Adapter--90----5000_chkpt/preprocessor_config.json +114 -0
  10. output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-1.0+x-action_queries+layerwise_decay_cosine--image_aug--VLA-Adapter--90----5000_chkpt/processor_config.json +6 -0
  11. output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-1.0+x-action_queries+layerwise_decay_cosine--image_aug--VLA-Adapter--90----5000_chkpt/tokenizer_config.json +211 -0
  12. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----10000_chkpt/action_head--10000_checkpoint.pt +3 -0
  13. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----10000_chkpt/model.safetensors +3 -0
  14. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----10000_chkpt/proprio_projector--10000_checkpoint.pt +3 -0
  15. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----10000_chkpt/train_state--10000_checkpoint.pt +3 -0
  16. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----20000_chkpt/action_head--20000_checkpoint.pt +3 -0
  17. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----20000_chkpt/model.safetensors +3 -0
  18. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----20000_chkpt/proprio_projector--20000_checkpoint.pt +3 -0
  19. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----20000_chkpt/train_state--20000_checkpoint.pt +3 -0
  20. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----30000_chkpt/action_head--30000_checkpoint.pt +3 -0
  21. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----30000_chkpt/model.safetensors +3 -0
  22. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----30000_chkpt/proprio_projector--30000_checkpoint.pt +3 -0
  23. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----30000_chkpt/train_state--30000_checkpoint.pt +3 -0
  24. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----40000_chkpt/action_head--40000_checkpoint.pt +3 -0
  25. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----40000_chkpt/model.safetensors +3 -0
  26. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----40000_chkpt/proprio_projector--40000_checkpoint.pt +3 -0
  27. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----40000_chkpt/train_state--40000_checkpoint.pt +3 -0
  28. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----5000_chkpt/action_head--5000_checkpoint.pt +3 -0
  29. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----5000_chkpt/model.safetensors +3 -0
  30. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----5000_chkpt/proprio_projector--5000_checkpoint.pt +3 -0
  31. outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----5000_chkpt/train_state--5000_checkpoint.pt +3 -0
  32. outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----10000_chkpt/action_head--10000_checkpoint.pt +3 -0
  33. outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----10000_chkpt/model.safetensors +3 -0
  34. outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----10000_chkpt/proprio_projector--10000_checkpoint.pt +3 -0
  35. outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----10000_chkpt/train_state--10000_checkpoint.pt +3 -0
  36. outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----15000_chkpt/action_head--15000_checkpoint.pt +3 -0
  37. outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----15000_chkpt/model.safetensors +3 -0
  38. outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----15000_chkpt/proprio_projector--15000_checkpoint.pt +3 -0
  39. outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----15000_chkpt/train_state--15000_checkpoint.pt +3 -0
  40. outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----20000_chkpt/action_head--20000_checkpoint.pt +3 -0
  41. outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----20000_chkpt/model.safetensors +3 -0
  42. outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----20000_chkpt/proprio_projector--20000_checkpoint.pt +3 -0
  43. outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----20000_chkpt/train_state--20000_checkpoint.pt +3 -0
  44. outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----25000_chkpt/action_head--25000_checkpoint.pt +3 -0
  45. outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----25000_chkpt/model.safetensors +3 -0
  46. outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----25000_chkpt/proprio_projector--25000_checkpoint.pt +3 -0
  47. outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----25000_chkpt/train_state--25000_checkpoint.pt +3 -0
  48. outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----30000_chkpt/action_head--30000_checkpoint.pt +3 -0
  49. outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----30000_chkpt/model.safetensors +3 -0
  50. outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----30000_chkpt/proprio_projector--30000_checkpoint.pt +3 -0
output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-0.3+x-action_queries+layerwise_decay--image_aug--VLA-Adapter--90----45000_chkpt/dataset_statistics.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "libero_90_no_noops": {
3
+ "action": {
4
+ "mean": [
5
+ 0.04552208632230759,
6
+ 0.037328869104385376,
7
+ -0.09673234075307846,
8
+ 0.0050192056223750114,
9
+ 0.002271906239911914,
10
+ -0.006229790858924389,
11
+ 0.5282046794891357
12
+ ],
13
+ "std": [
14
+ 0.2984381318092346,
15
+ 0.36122551560401917,
16
+ 0.4067350924015045,
17
+ 0.048389386385679245,
18
+ 0.05818882957100868,
19
+ 0.08691500872373581,
20
+ 0.4985457956790924
21
+ ],
22
+ "max": [
23
+ 0.9375,
24
+ 0.9375,
25
+ 0.9375,
26
+ 0.375,
27
+ 0.375,
28
+ 0.375,
29
+ 1.0
30
+ ],
31
+ "min": [
32
+ -0.9375,
33
+ -0.9375,
34
+ -0.9375,
35
+ -0.3257142901420593,
36
+ -0.375,
37
+ -0.375,
38
+ 0.0
39
+ ],
40
+ "q01": [
41
+ -0.6294642686843872,
42
+ -0.8705357313156128,
43
+ -0.8946428298950195,
44
+ -0.12321428209543228,
45
+ -0.1574999988079071,
46
+ -0.2775000035762787,
47
+ 0.0
48
+ ],
49
+ "q99": [
50
+ 0.8517857193946838,
51
+ 0.8464285731315613,
52
+ 0.9375,
53
+ 0.1875,
54
+ 0.1778571456670761,
55
+ 0.3471428453922272,
56
+ 1.0
57
+ ],
58
+ "mask": [
59
+ true,
60
+ true,
61
+ true,
62
+ true,
63
+ true,
64
+ true,
65
+ false
66
+ ]
67
+ },
68
+ "proprio": {
69
+ "mean": [
70
+ -0.08226079493761063,
71
+ 0.010916395112872124,
72
+ 0.9453150629997253,
73
+ 2.974484920501709,
74
+ -0.11405275762081146,
75
+ -0.0996461734175682,
76
+ 0.02663537487387657,
77
+ -0.027010969817638397
78
+ ],
79
+ "std": [
80
+ 0.1132412925362587,
81
+ 0.14199519157409668,
82
+ 0.23618268966674805,
83
+ 0.43265748023986816,
84
+ 0.9902353286743164,
85
+ 0.32450467348098755,
86
+ 0.0145635474473238,
87
+ 0.014437161386013031
88
+ ],
89
+ "max": [
90
+ 0.20274034142494202,
91
+ 0.4884968400001526,
92
+ 1.3584461212158203,
93
+ 4.8432722091674805,
94
+ 3.966320753097534,
95
+ 2.4007365703582764,
96
+ 0.04637677222490311,
97
+ 0.0017036759527400136
98
+ ],
99
+ "min": [
100
+ -0.48259806632995605,
101
+ -0.3968846797943115,
102
+ 0.4455491006374359,
103
+ -0.7501075863838196,
104
+ -4.363162040710449,
105
+ -3.2127554416656494,
106
+ -0.002592125441879034,
107
+ -0.04256961867213249
108
+ ],
109
+ "q01": [
110
+ -0.4019535529613495,
111
+ -0.2819894528388977,
112
+ 0.458499813079834,
113
+ 1.229066481590271,
114
+ -2.779330949783325,
115
+ -1.3500228834152221,
116
+ 0.0016688233194872737,
117
+ -0.04004087835550308
118
+ ],
119
+ "q99": [
120
+ 0.12681280374526968,
121
+ 0.3188697147369384,
122
+ 1.2563055849075317,
123
+ 3.8263492584228516,
124
+ 2.3427903938293455,
125
+ 0.6062234616279595,
126
+ 0.04022635221481323,
127
+ -0.0016752025950700054
128
+ ]
129
+ },
130
+ "num_transitions": 573965,
131
+ "num_trajectories": 3954
132
+ }
133
+ }
output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-0.3+x-action_queries+layerwise_decay--image_aug--VLA-Adapter--90----45000_chkpt/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-0.3+x-action_queries+layerwise_decay--image_aug--VLA-Adapter--90----45000_chkpt/processing_prismatic.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ processing_prismatic.py
3
+
4
+ HuggingFace-style preprocessor definitions for Prismatic VLMs, inheriting from `ProcessorMixin`. Default configuration
5
+ specifies `siglip-224px+7b`.
6
+ """
7
+
8
+ from typing import Any, ClassVar, List, Optional, Tuple, Union
9
+
10
+ import timm.data
11
+ import torch
12
+ import torchvision.transforms.functional as TVF
13
+ from PIL import Image
14
+ from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor
15
+ from transformers import PreTrainedTokenizerBase
16
+ from transformers.image_processing_utils import BatchFeature, ImageProcessingMixin
17
+ from transformers.processing_utils import ProcessorMixin
18
+ from transformers.tokenization_utils import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
19
+ from transformers.utils import TensorType
20
+
21
+
22
+ # === Image Processing ===
23
def letterbox_pad_transform(image: Image.Image, padding_fill_value: Tuple[int, int, int]) -> Image.Image:
    """
    Letterbox a PIL image by padding its shorter side with an equal border on both opposing edges.

    @param image: PIL image to pad; its `size` is read as (width, height).
    @param padding_fill_value: Fill value forwarded to torchvision for the added border pixels.

    @return: The padded image as returned by `torchvision.transforms.functional.pad`.
    """
    width, height = image.size
    longest_side = max(image.size)

    # Border added to *each* edge of a given axis; the fractional half is truncated by `int(...)`
    pad_x = int((longest_side - width) / 2)
    pad_y = int((longest_side - height) / 2)

    # Same border on opposing edges: (pad_x, pad_y) is repeated for the second pair of edges
    border = (pad_x, pad_y, pad_x, pad_y)

    return TVF.pad(image, border, fill=padding_fill_value, padding_mode="constant")
30
+
31
+
32
class PrismaticImageProcessor(ImageProcessingMixin):
    """
    Image processor that re-expresses a TIMM evaluation transform (Resize -> CenterCrop -> ToTensor -> Normalize) as
    plain parameter dictionaries for `torchvision.transforms.functional`, one set per entry in `input_sizes`.
    """

    model_input_names: ClassVar[List[str]] = ["pixel_values"]

    def __init__(
        self,
        use_fused_vision_backbone: bool = False,
        image_resize_strategy: str = "letterbox",
        input_sizes: Optional[List[Tuple[int, int, int]]] = None,
        interpolations: Optional[List[str]] = None,
        means: Optional[List[Tuple[float, float, float]]] = None,
        stds: Optional[List[Tuple[float, float, float]]] = None,
        **kwargs: str,
    ) -> None:
        """
        Initialize a PrismaticImageProcessor as a wrapper around a torchvision transform; this transform will be
        created by TIMM, and edited to follow our custom `image_resize_strategy` logic.

        @param use_fused_vision_backbone: Boolean indicating single or fused (dual) vision backbone
        @param image_resize_strategy: Prismatic image resize strategy in < resize-naive | resize-crop | letterbox >
        @param input_sizes: [TIMM :: `data_cfg`] One input image size per backbone, each a 3-tuple whose last two
                            entries are the spatial size (default: [(3, 224, 224)])
        @param interpolations: [TIMM :: `data_cfg`] One interpolation string per backbone (default: "bicubic" for
                               every entry of `input_sizes`)
        @param means: [TIMM :: `data_cfg`] One normalization mean (float 3-tuple) per backbone
        @param stds: [TIMM :: `data_cfg`] One normalization std (float 3-tuple) per backbone
        @param kwargs: Remaining keyword arguments, forwarded untouched to `ImageProcessingMixin.__init__`

        @raise ValueError: If TIMM returns a transform with an unexpected structure/size, or if
                           `image_resize_strategy` is not one of the supported values.
        """
        self.use_fused_vision_backbone = use_fused_vision_backbone
        self.image_resize_strategy = image_resize_strategy

        # Handle `None` default values
        input_sizes = [(3, 224, 224)] if input_sizes is None else input_sizes
        # `interpolations` is indexed per input size below, so it needs a concrete default just like the others;
        # leaving it as `None` made `self.interpolations[idx]` raise a TypeError for callers relying on defaults.
        interpolations = ["bicubic"] * len(input_sizes) if interpolations is None else interpolations
        means = [(0.5, 0.5, 0.5)] if means is None else means
        stds = [(0.5, 0.5, 0.5)] if stds is None else stds

        # TIMM `data_cfg` Parameters
        self.input_sizes, self.interpolations, self.means, self.stds = input_sizes, interpolations, means, stds

        # Grab torchvision transforms via TIMM =>> need to parse for specific "functional" transform values!
        self.tvf_resize_params, self.tvf_crop_params, self.tvf_normalize_params = [], [], []
        self.tvf_do_letterbox, self.tvf_letterbox_fill = False, None

        for idx in range(len(input_sizes)):
            transform = timm.data.create_transform(
                input_size=self.input_sizes[idx],
                interpolation=self.interpolations[idx],
                mean=self.means[idx],
                std=self.stds[idx],
                crop_pct=1.0,  # Set to 1.0 to ignore cropping (initial Resize sets `input_size`)
                crop_mode="center",  # Default crop mode -- no-op when `crop_pct == 1.0`
                is_training=False,  # No image augmentations when loading the transform!
            )

            # [Validation] Ensure appropriate transform structure, expected sizes
            if not (
                isinstance(transform, Compose)
                and (len(transform.transforms) == 4)
                and isinstance(transform.transforms[0], Resize)
                and isinstance(transform.transforms[1], CenterCrop)
                and isinstance(transform.transforms[2], ToTensor)
                and isinstance(transform.transforms[3], Normalize)
                and (transform.transforms[0].size == self.input_sizes[idx][-1])
                and (transform.transforms[1].size == self.input_sizes[idx][-2:])
            ):
                raise ValueError(f"Unexpected TIMM image transformation structure/sizes: `{transform}`")

            # HF Image Processors *must* be JSON-serializable; as such, cannot have torchvision. as an attribute.
            #   => Instead, we're going to parse the transform and call "torchvision.transforms.functional" (`tvf`)
            resize_t, crop_t, norm_t = transform.transforms[0], transform.transforms[1], transform.transforms[3]
            self.tvf_resize_params.append(
                {
                    "size": resize_t.size,
                    "interpolation": TVF.pil_modes_mapping[resize_t.interpolation],
                    "max_size": None,
                    "antialias": True,
                }
            )
            self.tvf_crop_params.append({"output_size": crop_t.size})
            self.tvf_normalize_params.append(
                {
                    "mean": norm_t.mean.float().numpy().tolist(),
                    "std": norm_t.std.float().numpy().tolist(),
                    "inplace": False,
                }
            )

            # Handle Prismatic `image_resize_strategy` (letterbox flags keep their pre-loop defaults unless the
            # "letterbox" branch below overrides them)
            if self.image_resize_strategy == "resize-naive":
                # Force an explicit (size, size) pair instead of the single int produced by TIMM's Resize
                self.tvf_resize_params[idx]["size"] = (resize_t.size, resize_t.size)
            elif self.image_resize_strategy == "letterbox":
                # Fill color is the normalization mean scaled to the 0-255 range
                self.tvf_do_letterbox, self.tvf_letterbox_fill = True, tuple([int(x * 255) for x in self.means[idx]])
            elif self.image_resize_strategy == "resize-crop":
                pass
            else:
                raise ValueError(f"Image resize strategy `{self.image_resize_strategy}` is not supported!")

        # Dispatch **kwargs to super()
        super().__init__(**kwargs)

    def apply_transform(self, img: Image.Image) -> torch.Tensor:
        """
        Apply `functional` variant of TIMM's Transform = Compose([Resize -> CenterCrop -> ToTensor -> Normalize])

        @param img: PIL image to transform (letterbox-padded first when `self.tvf_do_letterbox` is set).

        @return: Tensor produced by `torch.vstack` over the per-`input_sizes` transformed images.
        """
        if self.tvf_do_letterbox:
            img = letterbox_pad_transform(img, self.tvf_letterbox_fill)

        # [Contract] Fused Backbones expect "channel-stacked" inputs; we'll unpack on the model side!
        imgs_t = []
        for idx in range(len(self.input_sizes)):
            img_idx = TVF.resize(img, **self.tvf_resize_params[idx])
            img_idx = TVF.center_crop(img_idx, **self.tvf_crop_params[idx])
            img_idx_t = TVF.to_tensor(img_idx)
            img_idx_t = TVF.normalize(img_idx_t, **self.tvf_normalize_params[idx])
            imgs_t.append(img_idx_t)

        # [Contract] `imgs_t` is a list of Tensors of shape [3, input_size, input_size]; stack along dim = 0
        img_t = torch.vstack(imgs_t)

        return img_t

    def preprocess(
        self,
        images: Union[Image.Image, List[Image.Image]],
        return_tensors: Optional[Union[str, TensorType]] = None,
        **_: str,
    ) -> BatchFeature:
        """
        Preprocess an image (or batch of images); note that unlike the `transformers :: BaseImageProcessor` we
        explicitly only handle PIL.Image.Image instances for simplicity.

        @param images: A (batch of) PIL.Image.Image instance(s) to preprocess.
        @param return_tensors: BatchFeature default Tensor format (e.g., "pt" for torch); if None, returns np.ndarray

        @return: Instance of `transformers :: BatchFeature` with a single key "pixel_values"
        """
        if not isinstance(images, list):
            images = [images]

        # Apply `self.apply_transform` to each image (converted to RGB first); stack into one "batched" Tensor
        pixel_values = torch.stack([self.apply_transform(img.convert("RGB")) for img in images])

        # Return BatchFeature =>> note that for compatibility, constructor expects Dict[str, np.ndarray], so we convert
        return BatchFeature(data={"pixel_values": pixel_values.float().numpy()}, tensor_type=return_tensors)

    def __call__(self, images: Union[Image.Image, List[Image.Image]], **kwargs) -> BatchFeature:
        """Alias for `preprocess`; all keyword arguments are forwarded unchanged."""
        return self.preprocess(images, **kwargs)
174
+
175
+
176
+ # === PrismaticProcessor =>> Wraps both ImageProcessor and Tokenizer ===
177
+ # =>> https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava/processing_llava.py
178
class PrismaticProcessor(ProcessorMixin):
    """Joint processor that routes images to the image processor and text to the tokenizer."""

    attributes: ClassVar[List[str]] = ["image_processor", "tokenizer"]
    image_processor_class: str = "AutoImageProcessor"
    tokenizer_class: str = "AutoTokenizer"

    def __init__(
        self,
        image_processor: Optional[ImageProcessingMixin] = None,
        tokenizer: Optional[PreTrainedTokenizerBase] = None,
    ) -> None:
        """Hand both components to `ProcessorMixin`, in the order declared by `attributes`."""
        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        images: Union[Image.Image, List[Image.Image]],
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Optional[Union[bool, str, TruncationStrategy]] = None,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
    ) -> BatchFeature:
        """
        Encode a (batch of) text/image pair(s): images go through `self.image_processor`, text goes through
        `self.tokenizer`, and the two results are merged into a single BatchFeature.

        @param text: The (batch) of text to encode; must be a string or list of strings.
        @param images: A (batch of) PIL.Image.Image instance(s) to preprocess.
        @param padding: Sequence padding strategy (if multiple specified) in < True = "longest" | "max_length" | False >
        @param truncation: Truncation strategy for the output sequences; requires `max_length` to be specified
        @param max_length: Maximum length (in tokens) to truncate
        @param return_tensors: Type of return tensors (usually "pt" or TensorType.PYTORCH)

        @return: BatchFeature holding every tokenizer output plus a `pixel_values` entry.

        @raise ValueError: If the leading dimensions of `pixel_values` and `input_ids` differ.
        """
        # Images first, then text -- both receive the same `return_tensors` setting
        image_features = self.image_processor(images, return_tensors=return_tensors)
        pixel_values = image_features["pixel_values"]
        encoded_text = self.tokenizer(
            text,
            return_tensors=return_tensors,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
        )

        # [Validate] Need same number of images and text inputs!
        num_images = pixel_values.shape[0]
        num_texts = encoded_text.input_ids.shape[0]
        if num_images != num_texts:
            raise ValueError("Batch is malformed; expected same number of images and text inputs!")

        # Tokenizer outputs come first; `pixel_values` is added on top of them
        merged = {**encoded_text}
        merged["pixel_values"] = pixel_values

        return BatchFeature(data=merged)

    # === Tokenizer Dispatch Utilities =>> check `PreTrainedTokenizerBase` for documentation ===
    def batch_decode(
        self,
        sequences: Union[List[int], List[List[int]], torch.Tensor, Any],  # `Any` = np.ndarray | tf.Tensor
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        **kwargs: str,
    ) -> List[str]:
        """Forward to `self.tokenizer.batch_decode` with all arguments passed by keyword."""
        decoded = self.tokenizer.batch_decode(
            sequences=sequences,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
        return decoded

    def decode(
        self,
        token_ids: Union[int, List[int], torch.Tensor, Any],  # `Any` = np.ndarray | tf.Tensor
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        **kwargs: str,
    ) -> str:
        """Forward to `self.tokenizer.decode` with all arguments passed by keyword."""
        decoded = self.tokenizer.decode(
            token_ids=token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
        return decoded

    @property
    def model_input_names(self) -> List[str]:
        """Tokenizer input names followed by image-processor input names, duplicates dropped in first-seen order."""
        from_tokenizer = self.tokenizer.model_input_names
        from_image_processor = self.image_processor.model_input_names

        # `dict.fromkeys` keeps insertion order, so it doubles as an order-preserving de-duplication
        unique_names = dict.fromkeys(from_tokenizer + from_image_processor)
        return list(unique_names)
output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-0.3+x-action_queries+layerwise_decay--image_aug--VLA-Adapter--90----45000_chkpt/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-0.3+x-action_queries+layerwise_decay--image_aug--VLA-Adapter--90----45000_chkpt/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-1.0+x-action_queries+layerwise_decay_cosine--image_aug--VLA-Adapter--90----5000_chkpt/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-1.0+x-action_queries+layerwise_decay_cosine--image_aug--VLA-Adapter--90----5000_chkpt/dataset_statistics.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "libero_90_no_noops": {
3
+ "action": {
4
+ "mean": [
5
+ 0.04552208632230759,
6
+ 0.037328869104385376,
7
+ -0.09673234075307846,
8
+ 0.0050192056223750114,
9
+ 0.002271906239911914,
10
+ -0.006229790858924389,
11
+ 0.5282046794891357
12
+ ],
13
+ "std": [
14
+ 0.2984381318092346,
15
+ 0.36122551560401917,
16
+ 0.4067350924015045,
17
+ 0.048389386385679245,
18
+ 0.05818882957100868,
19
+ 0.08691500872373581,
20
+ 0.4985457956790924
21
+ ],
22
+ "max": [
23
+ 0.9375,
24
+ 0.9375,
25
+ 0.9375,
26
+ 0.375,
27
+ 0.375,
28
+ 0.375,
29
+ 1.0
30
+ ],
31
+ "min": [
32
+ -0.9375,
33
+ -0.9375,
34
+ -0.9375,
35
+ -0.3257142901420593,
36
+ -0.375,
37
+ -0.375,
38
+ 0.0
39
+ ],
40
+ "q01": [
41
+ -0.6294642686843872,
42
+ -0.8705357313156128,
43
+ -0.8946428298950195,
44
+ -0.12321428209543228,
45
+ -0.1574999988079071,
46
+ -0.2775000035762787,
47
+ 0.0
48
+ ],
49
+ "q99": [
50
+ 0.8517857193946838,
51
+ 0.8464285731315613,
52
+ 0.9375,
53
+ 0.1875,
54
+ 0.1778571456670761,
55
+ 0.3471428453922272,
56
+ 1.0
57
+ ],
58
+ "mask": [
59
+ true,
60
+ true,
61
+ true,
62
+ true,
63
+ true,
64
+ true,
65
+ false
66
+ ]
67
+ },
68
+ "proprio": {
69
+ "mean": [
70
+ -0.08226079493761063,
71
+ 0.010916395112872124,
72
+ 0.9453150629997253,
73
+ 2.974484920501709,
74
+ -0.11405275762081146,
75
+ -0.0996461734175682,
76
+ 0.02663537487387657,
77
+ -0.027010969817638397
78
+ ],
79
+ "std": [
80
+ 0.1132412925362587,
81
+ 0.14199519157409668,
82
+ 0.23618268966674805,
83
+ 0.43265748023986816,
84
+ 0.9902353286743164,
85
+ 0.32450467348098755,
86
+ 0.0145635474473238,
87
+ 0.014437161386013031
88
+ ],
89
+ "max": [
90
+ 0.20274034142494202,
91
+ 0.4884968400001526,
92
+ 1.3584461212158203,
93
+ 4.8432722091674805,
94
+ 3.966320753097534,
95
+ 2.4007365703582764,
96
+ 0.04637677222490311,
97
+ 0.0017036759527400136
98
+ ],
99
+ "min": [
100
+ -0.48259806632995605,
101
+ -0.3968846797943115,
102
+ 0.4455491006374359,
103
+ -0.7501075863838196,
104
+ -4.363162040710449,
105
+ -3.2127554416656494,
106
+ -0.002592125441879034,
107
+ -0.04256961867213249
108
+ ],
109
+ "q01": [
110
+ -0.4019535529613495,
111
+ -0.2819894528388977,
112
+ 0.458499813079834,
113
+ 1.229066481590271,
114
+ -2.779330949783325,
115
+ -1.3500228834152221,
116
+ 0.0016688233194872737,
117
+ -0.04004087835550308
118
+ ],
119
+ "q99": [
120
+ 0.12681280374526968,
121
+ 0.3188697147369384,
122
+ 1.2563055849075317,
123
+ 3.8263492584228516,
124
+ 2.3427903938293455,
125
+ 0.6062234616279595,
126
+ 0.04022635221481323,
127
+ -0.0016752025950700054
128
+ ]
129
+ },
130
+ "num_transitions": 573965,
131
+ "num_trajectories": 3954
132
+ }
133
+ }
output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-1.0+x-action_queries+layerwise_decay_cosine--image_aug--VLA-Adapter--90----5000_chkpt/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151643,
4
+ "eos_token_id": 151643,
5
+ "pad_token_id": 32000,
6
+ "transformers_version": "4.40.1"
7
+ }
output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-1.0+x-action_queries+layerwise_decay_cosine--image_aug--VLA-Adapter--90----5000_chkpt/preprocessor_config.json ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "processing_prismatic.PrismaticImageProcessor",
4
+ "AutoProcessor": "processing_prismatic.PrismaticProcessor"
5
+ },
6
+ "image_processor_type": "PrismaticImageProcessor",
7
+ "image_resize_strategy": "resize-naive",
8
+ "input_sizes": [
9
+ [
10
+ 3,
11
+ 224,
12
+ 224
13
+ ],
14
+ [
15
+ 3,
16
+ 224,
17
+ 224
18
+ ]
19
+ ],
20
+ "interpolations": [
21
+ "bicubic",
22
+ "bicubic"
23
+ ],
24
+ "means": [
25
+ [
26
+ 0.485,
27
+ 0.456,
28
+ 0.406
29
+ ],
30
+ [
31
+ 0.5,
32
+ 0.5,
33
+ 0.5
34
+ ]
35
+ ],
36
+ "processor_class": "PrismaticProcessor",
37
+ "stds": [
38
+ [
39
+ 0.229,
40
+ 0.224,
41
+ 0.225
42
+ ],
43
+ [
44
+ 0.5,
45
+ 0.5,
46
+ 0.5
47
+ ]
48
+ ],
49
+ "tvf_crop_params": [
50
+ {
51
+ "output_size": [
52
+ 224,
53
+ 224
54
+ ]
55
+ },
56
+ {
57
+ "output_size": [
58
+ 224,
59
+ 224
60
+ ]
61
+ }
62
+ ],
63
+ "tvf_do_letterbox": false,
64
+ "tvf_letterbox_fill": null,
65
+ "tvf_normalize_params": [
66
+ {
67
+ "inplace": false,
68
+ "mean": [
69
+ 0.484375,
70
+ 0.455078125,
71
+ 0.40625
72
+ ],
73
+ "std": [
74
+ 0.228515625,
75
+ 0.2236328125,
76
+ 0.224609375
77
+ ]
78
+ },
79
+ {
80
+ "inplace": false,
81
+ "mean": [
82
+ 0.5,
83
+ 0.5,
84
+ 0.5
85
+ ],
86
+ "std": [
87
+ 0.5,
88
+ 0.5,
89
+ 0.5
90
+ ]
91
+ }
92
+ ],
93
+ "tvf_resize_params": [
94
+ {
95
+ "antialias": true,
96
+ "interpolation": 3,
97
+ "max_size": null,
98
+ "size": [
99
+ 224,
100
+ 224
101
+ ]
102
+ },
103
+ {
104
+ "antialias": true,
105
+ "interpolation": 3,
106
+ "max_size": null,
107
+ "size": [
108
+ 224,
109
+ 224
110
+ ]
111
+ }
112
+ ],
113
+ "use_fused_vision_backbone": true
114
+ }
output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-1.0+x-action_queries+layerwise_decay_cosine--image_aug--VLA-Adapter--90----5000_chkpt/processor_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_prismatic.PrismaticProcessor"
4
+ },
5
+ "processor_class": "PrismaticProcessor"
6
+ }
output_flash6/correct/configs+libero_90_no_noops+b8+lr-0.0001+SPD+wd-1.0+x-action_queries+layerwise_decay_cosine--image_aug--VLA-Adapter--90----5000_chkpt/tokenizer_config.json ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "auto_map": {
198
+ "AutoProcessor": "processing_prismatic.PrismaticProcessor"
199
+ },
200
+ "bos_token": null,
201
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content 
}}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
202
+ "clean_up_tokenization_spaces": false,
203
+ "eos_token": "<|endoftext|>",
204
+ "errors": "replace",
205
+ "model_max_length": 131072,
206
+ "pad_token": "<|endoftext|>",
207
+ "processor_class": "PrismaticProcessor",
208
+ "split_special_tokens": false,
209
+ "tokenizer_class": "Qwen2Tokenizer",
210
+ "unk_token": null
211
+ }
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----10000_chkpt/action_head--10000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44839fc56b1564feb265e4ec0a6a466c3414e4bdf8771684dd3846d8bc87d480
3
+ size 14511454
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----10000_chkpt/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cde03aab154a90686b2cd9f4e135ecb993c7a8390d30662557442f6815c6961
3
+ size 2505117808
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----10000_chkpt/proprio_projector--10000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dddfed5dffd388aec3f61ac3910c902f48c639a0020cb96c70915e09713aca29
3
+ size 1624304
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----10000_chkpt/train_state--10000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbb9306abf43fa7ba9d159715ebde8afddba0727bb2710bf8771fa55fcd08c8d
3
+ size 4870783328
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----20000_chkpt/action_head--20000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6942abe3b69ea8ea4944afa08a0fc6e8431dc92c6c1d68815fd82a237bf21f73
3
+ size 14511454
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----20000_chkpt/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1639fdf688477b44cb6fd12a76366a49a1b1df571a3a7ce8eaf39a91c150975d
3
+ size 2505117808
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----20000_chkpt/proprio_projector--20000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0655e0e74408b9af7547a91879d4ac202582e94080345f533482050155819d6e
3
+ size 1624304
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----20000_chkpt/train_state--20000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:617738403225edc640c58c5722e66f7bea03ebe2fcb96588ade9907616c9d9a6
3
+ size 4870783328
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----30000_chkpt/action_head--30000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cde4bbdf5fb560cc3732233986b9507b83d8a5dbbe192d5bec8a077871a735c9
3
+ size 14511454
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----30000_chkpt/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a7a13ab254ffe0b866d788f5c66954c76fa96a45d6268b09f79d3975bd8d902
3
+ size 2505117808
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----30000_chkpt/proprio_projector--30000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3abc1e32a2088b2721495bc8c98b1e09a4c8ef7893a233d3c70f7423ba272d9
3
+ size 1624304
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----30000_chkpt/train_state--30000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e018d8807581cabf7dab7d713565d3c7e351edc08d8e074ba75806ee4aeae03b
3
+ size 4870783328
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----40000_chkpt/action_head--40000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0f05b459335a3ce5d26e9ee01bd9e1f7b42ed680e5f8a2b9184cd37309b11ff
3
+ size 14511454
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----40000_chkpt/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:003314ce76a2983a2ca232bc9d45129a7d3bd46167b11c67140380f6712d7fd2
3
+ size 2505117808
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----40000_chkpt/proprio_projector--40000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f84978c75b87e1ae64f3d01ed38d1212d2b42471f0427f1cd660838d6ab2f52
3
+ size 1624304
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----40000_chkpt/train_state--40000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f652445d8aa175e8f50c4206e8b081f0903f963168971f8b8855c1fa31c2b91
3
+ size 4870783328
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----5000_chkpt/action_head--5000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39ab9693e548c2431dd8fdd03b2a68bd90e11f0e56bf3d3d334d650297894fc3
3
+ size 14511434
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----5000_chkpt/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77e47de078648be1b6d61fed32842f43692f5a4045948b0f1ecfbc506d70ccbf
3
+ size 2505117808
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----5000_chkpt/proprio_projector--5000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0889c95c17334852afc48097a1db82bc4599dd38061da8fed3f5a59ece0d58a8
3
+ size 1624296
outputs/bridge/configs+bridge+b8+lr-0.0001+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----5000_chkpt/train_state--5000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e45b735b4ffefe3abd8c0d120ba9a0f4ce7c41522771bafd76062d9bfa79f82
3
+ size 4870780450
outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----10000_chkpt/action_head--10000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3fd51ce561673c8f7519934cb7775cde67c4756504faf8ec9d35ba16889122f
3
+ size 14511454
outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----10000_chkpt/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:427383170a8c586c319d313946f780c53460efe2a505fba5e0fce4d8902d0ee1
3
+ size 2505117808
outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----10000_chkpt/proprio_projector--10000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efbf281a8c82d084b9663d2c286cc68abb0b117fdaf7f2e843f3192d3f63aefa
3
+ size 1624304
outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----10000_chkpt/train_state--10000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4045d6fed1debd89b7cff2ed9c8f18174f439501c2d489f233f41243f18a1473
3
+ size 4870783328
outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----15000_chkpt/action_head--15000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19639e11797cc9fd9f364d93aea56878b902a8d3b0bdf3327d3f607191a2d41a
3
+ size 14511454
outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----15000_chkpt/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:304601881cb23455e210e28ec30015f6860d7b6c32068d99bf40ac02192d3838
3
+ size 2505117808
outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----15000_chkpt/proprio_projector--15000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2c941c7a3d05793058d6fdc8abcb5c7ec628c7ed603e4b21731e5f4b307fd5c
3
+ size 1624304
outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----15000_chkpt/train_state--15000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:985d00e1c42fbd6595d21cbb74222fb99d9017fcda8105e3815aa94f65123aa0
3
+ size 4870783328
outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----20000_chkpt/action_head--20000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1bd574c83b8167b7cd421b940e704e0ecd7b8b064f7fe11e2ce408b9a20b566
3
+ size 14511454
outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----20000_chkpt/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca6cf59cb2b7445bfc051fbf9f420bb8bfa2ba949a29f2ed57720ebef659b70b
3
+ size 2505117808
outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----20000_chkpt/proprio_projector--20000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e41160bd84c449cbbcce6c93b37043aa85eb36726878c90bea119a4f2adbd9b
3
+ size 1624304
outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----20000_chkpt/train_state--20000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:223799560d87d14a064237fd0e01dbc9c249fcb5d97b371d42447eef01054b94
3
+ size 4870783328
outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----25000_chkpt/action_head--25000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:387b85ee298b6717acb032810854c6d6463f5642b7cd7ee07fc1f30a6b0abc6b
3
+ size 14511454
outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----25000_chkpt/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d4b7d1dbe26b4688f64fdf1f1d5e03f39b0d72dc8dc371ad9106cb055e7cdd4
3
+ size 2505117808
outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----25000_chkpt/proprio_projector--25000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d0fa23b8defbd6d2dc6bcfe7d0adb4088a9e55c2ddfab2d48cc849a134a7f06
3
+ size 1624304
outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----25000_chkpt/train_state--25000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cfac2d9a0f5f9bee0df66a5e302bf4293209aabafe99ff8392ebe9ff10b4ba0
3
+ size 4870783328
outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----30000_chkpt/action_head--30000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08f2d796551fc7ab5305ec14174fffb8c8c2884525d5cd480c24daa509af28ea
3
+ size 14511454
outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----30000_chkpt/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0089e557ff820bd9e923a61838a2146d0bea7833135f236eb2350450330bab3d
3
+ size 2505117808
outputs/bridge/configs+bridge+b8+lr-1e-05+AdamW+wd-0+x-action_queries--image_aug--VLA-OFT--BRIDGE----30000_chkpt/proprio_projector--30000_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fe41568328daed77c6e88379016952a75d378e626fb8ab54ccf8329858025cf
3
+ size 1624304