devin-lai committed on
Commit
3dce169
·
verified ·
1 Parent(s): 420d9b6

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ test.png filter=lfs diff=lfs merge=lfs -text
37
+ LocateAnything-assets/tokenizer.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ .DS_Store
2
+ .cache/
3
+ __pycache__/
4
+ *.py[cod]
5
+ .venv/
6
+ venv/
7
+
8
+ *.coreml.annotated.png
9
+ *.coreml.detections.json
LocateAnything-assets/runtime_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "token_ids": {
3
+ "image_token_index": 151665,
4
+ "box_start_token_id": 151668,
5
+ "box_end_token_id": 151669,
6
+ "coord_start_token_id": 151677,
7
+ "coord_end_token_id": 152677,
8
+ "ref_start_token_id": 151672,
9
+ "ref_end_token_id": 151673,
10
+ "none_token_id": 4064,
11
+ "null_token_id": 152678,
12
+ "im_end_token_id": 151645,
13
+ "switch_token_id": 152679,
14
+ "default_mask_token_id": 151676
15
+ },
16
+ "image_token": "<IMG_CONTEXT>",
17
+ "template_prefix": "<|im_start|>system\nYou are a helpful assistant.\n<|im_end|>\n<|im_start|>user\n<image 1><img>",
18
+ "template_mid": "</img>",
19
+ "template_suffix": "<|im_end|>\n<|im_start|>assistant\n",
20
+ "n_img": 1369,
21
+ "grid_h": 74,
22
+ "grid_w": 74,
23
+ "canvas": 1036,
24
+ "patch_size": 14,
25
+ "in_token_limit": 25600,
26
+ "merge_kernel_size": [
27
+ 2,
28
+ 2
29
+ ],
30
+ "block_size": 6,
31
+ "model_max_length": 16384,
32
+ "q_max": 1625,
33
+ "kv_max": 3689,
34
+ "vocab_size": 152681
35
+ }
LocateAnything-assets/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f56587c1c7b4d69be60b018606b1bf2f90e46ca9d51fb2689090997281df7e40
3
+ size 11606727
LocateAnything-decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d66a462f54742d9d5ef27b99002d8f30de0e49ace9c3e6c48dbf2911ef7c0042
3
+ size 826933
LocateAnything-decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1cacb1e198cc3283d62705f7bd09fe30c1ada744c5b2aa237270e7807e09684
3
+ size 6177177298
LocateAnything-decoder.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "4C82C103-68AD-44D0-8798-23E5CC391315": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "659959C3-FA9F-41E4-9C1D-151B3BDE84B3": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "659959C3-FA9F-41E4-9C1D-151B3BDE84B3"
18
+ }
LocateAnything-embed.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:183d28233a90cc8f53e5ab77404f3f74e9a896444147abdeea98565999d264a3
3
+ size 1873
LocateAnything-embed.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bfa333b76145ec94504bf3fb8a7f147a28ace43817cd12790fa1d8a7df79615
3
+ size 625381504
LocateAnything-embed.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "308B20E9-8131-4632-8AA7-97B7F0315FDB": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "73F1D2AF-2A2F-41F4-A186-F7DD46AA56DD": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "73F1D2AF-2A2F-41F4-A186-F7DD46AA56DD"
18
+ }
LocateAnything-vision.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:346dc831dbfe9182b15b25bee7d1c96bcf122415df923f3498442112bb52e41c
3
+ size 599797
LocateAnything-vision.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00c65abd6206ad578579b64561c9aeec0c7c6837275f54d0891246f8b7201dff
3
+ size 865012288
LocateAnything-vision.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "81DE057D-F4C2-41BC-AAE9-1F3A267CE3A0": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Specification",
7
+ "name": "model.mlmodel",
8
+ "path": "com.apple.CoreML/model.mlmodel"
9
+ },
10
+ "8870B83E-8C9A-4195-90E8-FAD1EAADED26": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Weights",
13
+ "name": "weights",
14
+ "path": "com.apple.CoreML/weights"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "81DE057D-F4C2-41BC-AAE9-1F3A267CE3A0"
18
+ }
README.md CHANGED
@@ -2,4 +2,49 @@
2
  license: other
3
  license_name: nvidia-license
4
  license_link: https://huggingface.co/nvidia/LocateAnything-3B
 
 
 
 
 
 
 
5
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  license: other
3
  license_name: nvidia-license
4
  license_link: https://huggingface.co/nvidia/LocateAnything-3B
5
+ pipeline_tag: object-detection
6
+ tags:
7
+ - coreml
8
+ - vision
9
+ - object-detection
10
+ - image-localization
11
+ - apple-silicon
12
  ---
13
+
14
+ # LocateAnything-3B CoreML
15
+
16
+ CoreML packages and a lightweight Python runner for image localization on Apple hardware.
17
+
18
+ ## Contents
19
+
20
+ - `LocateAnything-vision.mlpackage` - image encoder package
21
+ - `LocateAnything-embed.mlpackage` - token embedding package
22
+ - `LocateAnything-decoder.mlpackage` - decoder package
23
+ - `LocateAnything-assets/` - tokenizer and runtime configuration
24
+ - `run_locateanything_image_coreml.py` - still-image runner
25
+ - `test.png` - sample input
26
+
27
+ ## Setup
28
+
29
+ ```bash
30
+ pip install -r requirements.txt
31
+ ```
32
+
33
+ ## Example
34
+
35
+ ```bash
36
+ python run_locateanything_image_coreml.py \
37
+ --input test.png \
38
+ --categories "person,car"
39
+ ```
40
+
41
+ By default, the script writes the following files next to the script (override with `--out-image` / `--out-json`):
42
+
43
+ - `test.coreml.annotated.png`
44
+ - `test.coreml.detections.json`
45
+
46
+ ## Notes
47
+
48
+ The packages are configured for the image grid stored in the vision package metadata. Use the bundled assets directory with these packages so token ids and runtime limits stay aligned.
49
+
50
+ The license follows the upstream NVIDIA LocateAnything-3B terms linked in the metadata above.
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ coremltools>=8.0
2
+ numpy>=1.24
3
+ opencv-python>=4.8
4
+ Pillow>=10.0
5
+ tokenizers>=0.15
run_locateanything_image_coreml.py ADDED
@@ -0,0 +1,505 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Run LocateAnything-3B CoreML packages on a still image.
3
+
4
+ The script loads the vision, embedding, and decoder packages, then writes an
5
+ annotated image plus JSON detections for the requested categories.
6
+
7
+ Dependencies: coremltools, numpy, tokenizers, Pillow, opencv-python.
8
+ """
9
+ import argparse
10
+ import json
11
+ import math
12
+ import os
13
+ import re
14
+ import time
15
+ import zlib
16
+
17
+ import cv2
18
+ import numpy as np
19
+ from PIL import Image
20
+
21
+ HERE = os.path.dirname(os.path.abspath(__file__))
22
+ NEG_MASK = -30000.0
23
+
24
+
25
def preprocess_image(path, cfg):
    """Load an image and cut it into normalized patches for the vision package.

    Args:
        path: Path of the image file to open with Pillow.
        cfg: Runtime configuration mapping. Reads ``patch_size``,
            ``in_token_limit``, ``merge_kernel_size`` and, optionally,
            ``canvas`` (a square side length the image is resized to).

    Returns:
        A tuple ``(patches, (gh, gw), (orig_w, orig_h))`` where ``patches`` is
        a contiguous float32 array of shape ``(gh * gw, 3, patch, patch)`` with
        values scaled from ``[0, 255]`` to ``[-1, 1]``, ``(gh, gw)`` is the
        patch grid, and ``(orig_w, orig_h)`` is the size of the upright source
        image (after EXIF orientation has been applied).

    Raises:
        SystemExit: If either grid dimension reaches 512 patches.
    """
    # Local import: the file only pulls in ``PIL.Image`` at module level.
    from PIL import ImageOps

    patch = cfg["patch_size"]
    # Close the file handle deterministically, and honour the EXIF orientation
    # tag so the reported size matches what ``cv2.imread`` (which applies the
    # tag by default) later loads for drawing.
    with Image.open(path) as src:
        image = ImageOps.exif_transpose(src).convert("RGB")
    orig_w, orig_h = image.size

    canvas = cfg.get("canvas")
    if canvas:
        image = image.resize((canvas, canvas), Image.Resampling.BICUBIC)

    w, h = image.size
    n_patches = (w // patch) * (h // patch)
    if n_patches > cfg["in_token_limit"]:
        # Shrink both sides by the same factor so the patch count fits.
        scale = math.sqrt(cfg["in_token_limit"] / n_patches)
        w, h = int(w * scale), int(h * scale)
        image = image.resize((w, h), Image.Resampling.BICUBIC)

    # Round each side up to a whole number of merge windows.
    pad_h = cfg["merge_kernel_size"][0] * patch
    pad_w = cfg["merge_kernel_size"][1] * patch
    target_w = math.ceil(w / pad_w) * pad_w
    target_h = math.ceil(h / pad_h) * pad_h
    if (target_w, target_h) != (w, h):
        image = image.resize((target_w, target_h), Image.Resampling.BICUBIC)
    w, h = image.size
    if w // patch >= 512 or h // patch >= 512:
        raise SystemExit("Image exceeds the position-embedding limit (grid >= 512)")

    # HWC uint8 -> CHW float in [-1, 1].
    x = np.asarray(image, dtype=np.float32).transpose(2, 0, 1) / 255.0
    x = (x - 0.5) / 0.5
    gh, gw = h // patch, w // patch
    # (3, gh, patch, gw, patch) -> (gh, gw, 3, patch, patch) -> one row per patch.
    x = x.reshape(3, gh, patch, gw, patch).transpose(1, 3, 0, 2, 4).reshape(-1, 3, patch, patch)
    return np.ascontiguousarray(x), (gh, gw), (orig_w, orig_h)
55
+
56
+
57
def build_prompt_ids(tokenizer, cfg, categories):
    """Build the prompt token ids and locate the image-token block.

    Args:
        tokenizer: Object whose ``encode(text)`` result exposes an ``ids`` list.
        cfg: Runtime configuration mapping. Reads ``template_prefix``,
            ``image_token``, ``n_img``, ``template_mid``, ``template_suffix``
            and ``token_ids["image_token_index"]``.
        categories: Category names; joined with ``"</c>"`` inside the prompt.

    Returns:
        ``(ids, img_start, n_img)``: the encoded ids, the index of the first
        image token, and the number of image tokens found.

    Raises:
        ValueError: If the number of image tokens differs from ``cfg["n_img"]``
            or the image tokens do not form one contiguous run.
    """
    image_id = cfg["token_ids"]["image_token_index"]
    expected = cfg["n_img"]
    prompt = ("Locate all the instances that matches the following description: "
              + "</c>".join(categories) + ".")
    text = (cfg["template_prefix"]
            + cfg["image_token"] * expected
            + cfg["template_mid"] + prompt + cfg["template_suffix"])
    ids = tokenizer.encode(text).ids

    # Explicit checks instead of ``assert`` so they survive ``python -O``.
    n_img = ids.count(image_id)
    if n_img != expected:
        raise ValueError(f"image token count {n_img} != expected {expected}")
    img_start = ids.index(image_id)
    # The caller overwrites a single slice starting at ``img_start``, so the
    # image tokens must be one unbroken run.
    if any(t != image_id for t in ids[img_start:img_start + n_img]):
        raise ValueError("image tokens are not contiguous in the encoded prompt")
    return ids, img_start, n_img
69
+
70
+
71
def build_mtp_mask(cur, q_len, kv_max, block_size=6):
    """Build a causal additive mask whose last ``block_size`` rows see each other.

    Args:
        cur: Absolute position of the first query row.
        q_len: Number of query rows.
        kv_max: Width of the mask (number of key columns).
        block_size: Size of the trailing window that attends bidirectionally.

    Returns:
        float32 array of shape ``(1, 1, q_len, kv_max)`` holding ``0.0`` where
        attention is allowed and ``NEG_MASK`` elsewhere.
    """
    kv_len = cur + q_len
    key_pos = np.arange(kv_max)[None, :]
    query_pos = (cur + np.arange(q_len))[:, None]

    # Start from a plain causal layout: a key is visible when it is not ahead
    # of the query's absolute position.
    mask = np.where(key_pos <= query_pos, 0.0, NEG_MASK).astype(np.float32)
    mask[:, kv_len:] = NEG_MASK

    # Open up the trailing window in both directions ...
    window_begin = kv_len - block_size
    mask[-block_size:, window_begin:kv_len] = 0.0
    # ... and hide the single column directly in front of it from those rows.
    mask[-block_size:, window_begin - 1] = NEG_MASK
    return mask[None, None]
82
+
83
+
84
def build_ar_mask(cur, q_len, kv_max):
    """Build a plain causal additive mask over a ``kv_max``-wide key axis.

    Args:
        cur: Absolute position of the first query row.
        q_len: Number of query rows.
        kv_max: Width of the mask (number of key columns).

    Returns:
        float32 array of shape ``(1, 1, q_len, kv_max)`` holding ``0.0`` where
        the key position is at or before the query position and ``NEG_MASK``
        elsewhere.
    """
    kv_len = cur + q_len
    key_pos = np.arange(kv_max)[None, :]
    query_pos = (cur + np.arange(q_len))[:, None]
    mask = np.where(key_pos <= query_pos, 0.0, NEG_MASK).astype(np.float32)
    # Columns past the current sequence end stay masked.
    mask[:, kv_len:] = NEG_MASK
    return mask[None, None]
93
+
94
+
95
def _softmax(x, axis=-1):
    """Return the softmax of ``x`` along ``axis``.

    The per-slice maximum is subtracted before exponentiating so large logits
    do not overflow.
    """
    peak = x.max(axis=axis, keepdims=True)
    exps = np.exp(x - peak)
    total = exps.sum(axis=axis, keepdims=True)
    return exps / total
99
+
100
+
101
def _apply_repetition_penalty(logits, generated_ids, penalty):
    """Dampen the logits of token ids that already appear in ``generated_ids``.

    Positive logits of seen tokens are divided by ``penalty`` and the others
    are multiplied by it. Ids outside ``[0, vocab)`` are ignored. With
    ``penalty == 1.0`` the input array is returned unchanged (not copied);
    otherwise a modified copy is returned.
    """
    if penalty == 1.0:
        return logits

    vocab = logits.shape[-1]
    token_ids = np.unique(generated_ids)
    in_vocab = (token_ids >= 0) & (token_ids < vocab)
    token_ids = token_ids[in_vocab]

    penalized = logits.copy()
    picked = penalized[..., token_ids]
    penalized[..., token_ids] = np.where(picked > 0, picked / penalty, picked * penalty)
    return penalized
110
+
111
+
112
def _apply_top_p(logits, top_p):
    """Apply nucleus (top-p) filtering independently to every row.

    Tokens are ranked by logit; once the cumulative probability has exceeded
    ``top_p``, every later-ranked token is replaced by the most negative value
    of the logits' dtype. The top-ranked token is always kept.
    """
    ranking = np.argsort(-logits, axis=-1)
    ranked = np.take_along_axis(logits, ranking, axis=-1)

    # Softmax over the ranked logits, then the running probability mass.
    exps = np.exp(ranked - ranked.max(axis=-1, keepdims=True))
    mass = np.cumsum(exps / exps.sum(axis=-1, keepdims=True), axis=-1)

    # Shift right by one so the token that crosses the threshold survives.
    over = mass > top_p
    drop_ranked = np.zeros_like(over)
    drop_ranked[..., 1:] = over[..., :-1]

    # Scatter the decision back to vocabulary order.
    drop = np.zeros_like(over)
    np.put_along_axis(drop, ranking, drop_ranked, axis=-1)
    return np.where(drop, np.finfo(logits.dtype).min, logits)
123
+
124
+
125
def _process_logits(logits, generated_ids, *, temperature, top_p, repetition_penalty):
    """Run the logit post-processing chain and return ``(logits, probs)``.

    Steps, in order: repetition penalty, temperature scaling (only when
    ``temperature > 0``), top-p filtering (only when ``top_p`` is set and
    below 1), then a softmax over the adjusted logits.
    """
    adjusted = _apply_repetition_penalty(logits, generated_ids, repetition_penalty)
    if temperature > 0:
        adjusted = adjusted / temperature
    use_nucleus = top_p is not None and top_p < 1
    if use_nucleus:
        adjusted = _apply_top_p(adjusted, top_p)
    probs = _softmax(adjusted)
    return adjusted, probs
132
+
133
+
134
def _sample_rows(probs, temperature, rng):
    """Pick one token index per row of ``probs`` and return them as int64.

    With ``temperature > 0`` each row is sampled by inverse-CDF lookup using
    one uniform draw from ``rng``; otherwise the row-wise argmax is taken.
    """
    if not temperature > 0:
        return probs.argmax(axis=-1).astype(np.int64)

    cdf = np.cumsum(probs, axis=-1)
    draws = rng.random((probs.shape[0], 1)).astype(cdf.dtype)
    # Count the CDF entries below the draw; clamp in case rounding leaves the
    # last CDF value slightly under the draw.
    last_index = probs.shape[-1] - 1
    picks = np.minimum((cdf < draws).sum(axis=-1), last_index)
    return picks.astype(np.int64)
142
+
143
+
144
def _topk(arr, k):
    """Return ``(values, indices)`` of the ``k`` largest entries on the last axis.

    Both outputs are ordered from largest to smallest value.
    """
    # Partition first so only k candidates per row need a full sort.
    candidates = np.argpartition(-arr, k - 1, axis=-1)[..., :k]
    cand_vals = np.take_along_axis(arr, candidates, axis=-1)
    rank = np.argsort(-cand_vals, axis=-1)
    top_vals = np.take_along_axis(cand_vals, rank, axis=-1)
    top_idx = np.take_along_axis(candidates, rank, axis=-1)
    return top_vals, top_idx
150
+
151
+
152
def is_valid_box_frame(probs, tk, start_thresh=0.7, end_thresh=0.2):
    """Classify a six-row probability window as a box frame.

    Args:
        probs: 2-D array indexed as ``probs[row, token_id]``; rows 0-5 are read.
        tk: Mapping of special-token names to ids.
        start_thresh: Minimum probability of the box-start token in row 0.
        end_thresh: Minimum summed probability of the closing tokens in row 5.

    Returns:
        ``"empty_box"``, ``"legal_box"`` or ``"illegal_box"``.
    """
    if not probs[0, tk["box_start_token_id"]] >= start_thresh:
        return "illegal_box"

    end_id = tk["box_end_token_id"]
    null_id = tk["null_token_id"]

    # "<box> none </box> null null": the explicit no-object frame.
    looks_empty = (probs[1, tk["none_token_id"]] > 0.2 and
                   probs[2, end_id] > 0.2 and
                   probs[3, null_id] > 0.1 and
                   probs[4, null_id] > 0.1)
    if looks_empty:
        return "empty_box"

    closers = [end_id, null_id, tk["im_end_token_id"]]
    if probs[5, closers].sum() >= end_thresh:
        return "legal_box"
    return "illegal_box"
163
+
164
+
165
def decode_bbox_avg(probs, tk, keep_k=4, generation_mode="hybrid"):
    """Decode a six-token box from a six-row probability window.

    Args:
        probs: 2-D array indexed as ``probs[row, token_id]``.
        tk: Mapping of special-token names to ids.
        keep_k: Number of top candidates inspected per coordinate row.
        generation_mode: In ``"hybrid"`` mode, uncertain coordinates are
            replaced by ``0`` (see below); other modes keep the best candidate.

    Returns:
        An int64 array of six token ids, or ``None`` when the window is not a
        legal box frame or some coordinate row has no coordinate token among
        its top ``keep_k`` candidates.
    """
    start_id = tk["box_start_token_id"]
    end_id = tk["box_end_token_id"]

    frame = is_valid_box_frame(probs, tk)
    if frame == "empty_box":
        none_id, null_id = tk["none_token_id"], tk["null_token_id"]
        return np.array([start_id, none_id, end_id, null_id, null_id, null_id],
                        dtype=np.int64)
    if frame == "illegal_box":
        return None

    # Rows 1..4 carry the four coordinates; each gets its top-k candidates.
    cand_probs, cand_ids = _topk(probs[1:5], keep_k)  # [4, k]
    in_range = ((cand_ids >= tk["coord_start_token_id"])
                & (cand_ids <= tk["coord_end_token_id"]))
    if not in_range.any(axis=-1).all():
        return None

    # Highest-ranked candidate per row that is a coordinate token.
    pick = in_range.argmax(axis=-1)[:, None]
    best_probs = np.take_along_axis(cand_probs, pick, -1)[:, 0]
    best_ids = np.take_along_axis(cand_ids, pick, -1)[:, 0]

    coords = best_ids
    if generation_mode == "hybrid":
        n_valid = in_range.sum(axis=-1)
        highest = np.where(in_range, cand_ids, -999999).max(axis=-1)
        lowest = np.where(in_range, cand_ids, 999999).min(axis=-1)
        # Low confidence plus widely spread alternatives: emit 0 for that
        # coordinate instead of the best guess.
        suspicious = (best_probs < 0.9) & (n_valid > 1) & ((highest - lowest) > 60)
        coords = np.where(suspicious, 0, best_ids)

    return np.concatenate([[start_id], coords, [end_id]]).astype(np.int64)
189
+
190
+
191
def decode_ref(probs, tk, keep_k=5, start_thresh=0.6):
    """Decode a reference-label window that starts with the ref-start token.

    Args:
        probs: 2-D array indexed as ``probs[row, token_id]``.
        tk: Mapping of special-token names to ids.
        keep_k: Number of top candidates inspected per following row.
        start_thresh: Minimum probability of the ref-start token in row 0.

    Returns:
        An int64 array holding the ref-start id followed by, for every
        remaining row, its best-ranked candidate that is not a coordinate
        token; ``None`` if row 0 is below the threshold or some row has only
        coordinate tokens among its top ``keep_k``.
    """
    ref_start = tk["ref_start_token_id"]
    if probs[0, ref_start] < start_thresh:
        return None

    _, cand_ids = _topk(probs[1:], keep_k)
    non_coord = ((cand_ids < tk["coord_start_token_id"])
                 | (cand_ids > tk["coord_end_token_id"]))
    if not non_coord.any(axis=-1).all():
        return None

    pick = non_coord.argmax(axis=-1)[:, None]
    chosen = np.take_along_axis(cand_ids, pick, -1)[:, 0]
    return np.concatenate([[ref_start], chosen]).astype(np.int64)
202
+
203
+
204
def sample_tokens_mtp(logits6, generated_ids, tk, rng, *, temperature, top_p,
                      repetition_penalty, generation_mode):
    """Sample a multi-token window and try to decode it as a structure.

    Returns:
        ``(x0, box)`` where ``x0`` holds one sampled token id per row and
        ``box`` is the result of ``decode_bbox_avg``, else of ``decode_ref``,
        else a single-element zero array when neither decoder accepts the
        window.
    """
    _, probs = _process_logits(logits6, generated_ids,
                               temperature=temperature, top_p=top_p,
                               repetition_penalty=repetition_penalty)
    x0 = _sample_rows(probs, temperature, rng)

    structured = decode_bbox_avg(probs, tk, keep_k=4, generation_mode=generation_mode)
    if structured is None:
        structured = decode_ref(probs, tk)
    if structured is None:
        # Marker meaning "no structured decode".
        structured = np.zeros(1, dtype=np.int64)
    return x0, structured
216
+
217
+
218
def sample_token_ar(logits1, generated_ids, tk, rng, *, temperature, top_p,
                    repetition_penalty):
    """Post-process ``logits1`` and sample one token id per row.

    ``tk`` is accepted for signature parity with ``sample_tokens_mtp`` and is
    not read here.
    """
    processed = _process_logits(logits1, generated_ids,
                                temperature=temperature, top_p=top_p,
                                repetition_penalty=repetition_penalty)
    probs = processed[1]
    return _sample_rows(probs, temperature, rng)
223
+
224
+
225
def handle_pattern(x0, tk, generation_mode="hybrid"):
    """Normalize a window of sampled token ids into an output pattern.

    Args:
        x0: Iterable of token ids (converted to ``int``).
        tk: Mapping of special-token names to ids.
        generation_mode: ``"fast"`` accepts malformed boxes as-is; any other
            value reports them as ``"error_box"``.

    Returns:
        A dict with keys ``type`` (``"im_end"``, ``"empty_box"``,
        ``"coord_box"``, ``"point_box"``, ``"error_box"`` or
        ``"ref_object"``), ``tokens`` (ids to append), ``is_terminal`` and
        ``need_switch_to_ar``.
    """
    tokens = [int(t) for t in x0]
    box_start = tk["box_start_token_id"]
    box_end = tk["box_end_token_id"]
    null_id = tk["null_token_id"]
    im_end = tk["im_end_token_id"]

    def pattern(kind, toks, terminal=False, switch=False):
        return {"type": kind, "tokens": toks, "is_terminal": terminal,
                "need_switch_to_ar": switch}

    head = tokens[0]
    if head == null_id or head == im_end:
        return pattern("im_end", [im_end], terminal=True)

    none_id = tk["none_token_id"]
    if tokens[:2] == [box_start, none_id]:
        return pattern("empty_box", [box_start, none_id, box_end])

    if head == box_start:
        lo, hi = tk["coord_start_token_id"], tk["coord_end_token_id"]
        # Count the leading run of coordinate tokens after the box start.
        n_coords = 0
        for tok in tokens[1:5]:
            if not lo <= tok <= hi:
                break
            n_coords += 1
        after_coords = n_coords + 1

        if after_coords == 5 and tokens[5] == box_end:
            return pattern("coord_box", tokens)
        if after_coords == 3 and tokens[3] == box_end:
            return pattern("point_box", tokens[:4])
        if generation_mode == "fast":
            return pattern("coord_box", tokens)
        # Keep only the well-formed prefix and ask the caller to fall back.
        return pattern("error_box", tokens[:after_coords], switch=True)

    # Free-text reference: cut at the first null token ...
    if null_id in tokens:
        tokens = tokens[:tokens.index(null_id)]
    # ... and collapse a doubled ref-end at the tail.
    if len(tokens) >= 2 and tokens[-1] == tokens[-2] == tk["ref_end_token_id"]:
        tokens = tokens[:-1]
    return pattern("ref_object", tokens)
261
+
262
+
263
class CoreMLDecoder:
    """Thin wrapper that pairs a decoder model with the state it creates."""

    def __init__(self, mlmodel, kv_max):
        """Store the model and ``kv_max`` and create one state via ``make_state()``."""
        self.mlmodel = mlmodel
        self.kv_max = kv_max
        self.state = mlmodel.make_state()

    def forward(self, embeds, position_ids, mask, write_begin, out_rows):
        """Run one decoder step and return the logits without the batch axis.

        Args:
            embeds: Array whose first axis is the number of query rows; cast
                to float16 and given a leading batch axis.
            position_ids: Position index per row; sent as int32 with a batch axis.
            mask: Additive attention mask; cast to float16.
            write_begin: First row index written; rows
                ``write_begin .. write_begin + len(embeds) - 1`` are sent as
                ``write_rows``.
            out_rows: Row indices sent as ``out_rows`` (int32).

        Returns:
            ``out["logits"][0]`` as a NumPy array.
        """
        n_rows = embeds.shape[0]
        write_end = write_begin + n_rows
        feed = {
            "inputs_embeds": embeds.astype(np.float16)[None],
            "position_ids": np.asarray(position_ids, dtype=np.int32)[None],
            "mask": mask.astype(np.float16),
            "write_rows": np.arange(write_begin, write_end, dtype=np.int32),
            "out_rows": np.asarray(out_rows, dtype=np.int32),
        }
        result = self.mlmodel.predict(feed, self.state)
        return np.asarray(result["logits"])[0]
281
+
282
+
283
def generate(decoder, embed_fn, ids, visual_features, img_start, cfg, rng, *,
             generation_mode="hybrid", max_new_tokens=2048, temperature=0.7,
             top_p=0.9, repetition_penalty=1.1, n_future_tokens=6, verbose=False):
    """Generate token ids after the prompt and return only the new ids.

    Args:
        decoder: Object with ``forward(embeds, pos, mask, write_begin, out_rows)``
            returning logits rows.
        embed_fn: Callable mapping an int32 id array to an embedding array.
        ids: Prompt token ids.
        visual_features: Rows written over the prompt embeddings starting at
            ``img_start`` on the first step.
        img_start: Index of the first image token in ``ids``.
        cfg: Runtime configuration; reads ``token_ids``, ``kv_max`` and
            ``model_max_length``.
        rng: NumPy random generator used for sampling.
        generation_mode: ``"fast"`` and ``"hybrid"`` start with multi-token
            steps; any other value decodes one token per step. ``"hybrid"``
            drops to single-token steps after an ``"error_box"`` and returns to
            multi-token steps after a box-end token.
        max_new_tokens: Upper bound on generated tokens.
        temperature, top_p, repetition_penalty: Sampling controls.
        n_future_tokens: Number of positions predicted per multi-token step.
        verbose: When true, print a timing summary.

    Returns:
        The list of generated token ids (the prompt is not included).
    """
    tk = cfg["token_ids"]
    kv_max = cfg["kv_max"]
    mask_id = tk["default_mask_token_id"]
    generated = list(ids)
    seq_len = len(ids)
    total_len = min(cfg["model_max_length"], seq_len + max_new_tokens)
    use_mtp = generation_mode in ("fast", "hybrid")
    cur = 0
    iter_round = 0
    switch_to_ar = 0
    t0 = time.time()
    prefill_time = None

    while len(generated) < total_len:
        L = len(generated)
        # The attention mask is only kv_max columns wide, so a step whose
        # rows would extend past it cannot be represented; stop instead of
        # feeding the decoder out-of-range rows.
        q_len = (L - cur) + (n_future_tokens if use_mtp else 0)
        if cur + q_len > kv_max:
            break

        iter_round += 1
        if use_mtp:
            # Unprocessed tokens, then the last token repeated once, then
            # mask placeholders for the remaining future positions.
            rows = generated[cur:] + [generated[-1]] + [mask_id] * (n_future_tokens - 1)
            pos = list(range(cur, L)) + [L - 1 + i for i in range(n_future_tokens)]
            mask = build_mtp_mask(cur, len(rows), kv_max, n_future_tokens)
            out_rows = list(range(len(rows) - n_future_tokens, len(rows)))
        else:
            rows = generated[cur:]
            pos = list(range(cur, L))
            mask = build_ar_mask(cur, len(rows), kv_max)
            out_rows = [len(rows) - 1] * n_future_tokens

        embeds = embed_fn(np.asarray(rows, dtype=np.int32))
        if iter_round == 1:
            # Only the first step contains the prompt, hence the image block.
            embeds[img_start:img_start + visual_features.shape[0]] = visual_features

        logits = decoder.forward(embeds, pos, mask, cur, out_rows)
        cur = L

        gen_arr = np.asarray(generated)
        if use_mtp:
            x0, box = sample_tokens_mtp(logits, gen_arr, tk, rng,
                                        temperature=temperature, top_p=top_p,
                                        repetition_penalty=repetition_penalty,
                                        generation_mode=generation_mode)
            # An all-zero ``box`` means no structured decode was possible.
            new_tokens = x0 if (box == 0).all() else box
            pattern = handle_pattern(new_tokens, tk, generation_mode)
        else:
            x0 = sample_token_ar(logits[:1], gen_arr, tk, rng, temperature=temperature,
                                 top_p=top_p, repetition_penalty=repetition_penalty)
            tok = int(x0[0])
            if generation_mode == "hybrid":
                if tok == tk["box_end_token_id"]:
                    out_type = "box_end_ar"
                elif (tk["coord_start_token_id"] <= tok <= tk["coord_end_token_id"]
                      or tok == tk["none_token_id"]):
                    out_type = "coord_ar"
                else:
                    out_type = "im_end"
            else:
                out_type = "im_end" if tok == tk["im_end_token_id"] else "continue_ar"
            pattern = {"type": out_type, "tokens": [tok]}

        generated.extend(int(t) for t in pattern["tokens"])

        # Record the first-step time before any early exit so the summary
        # below always has a number to print.
        if prefill_time is None:
            prefill_time = time.time() - t0

        if pattern["type"] == "im_end":
            break
        if generation_mode == "hybrid":
            if pattern["type"] == "error_box":
                use_mtp = False
                switch_to_ar += 1
            elif pattern["type"] == "box_end_ar":
                use_mtp = True

    if verbose:
        n_new = len(generated) - seq_len
        dt = time.time() - t0
        # Guard the cases where no step ran or the clock did not advance.
        tps = n_new / dt if dt > 0 else 0.0
        prefill = 0.0 if prefill_time is None else prefill_time
        print(f"\nStatistic Info, num_tokens={n_new}; generate_time(s)={dt:.4f}; "
              f"tps={tps:.4f}; forward_step={iter_round}; "
              f"prefill_time={prefill:.4f}; switch_to_ar={switch_to_ar}\n")
    return generated[seq_len:]
365
+
366
+
367
# Matches either a "<ref>label</ref>" span or a "<box><n><n>...</box>" span.
_DET_RE = re.compile(r"<ref>(.*?)</ref>|<box>((?:<\d+>)+)</box>", re.S)
# Pulls the individual integers out of a box body.
_COORD_RE = re.compile(r"<(\d+)>")


def parse_detections(answer, width, height):
    """Parse ``<ref>``/``<box>`` markup into detection dicts in pixel units.

    Coordinates in the text are divided by 1000 and scaled by ``width`` (x) or
    ``height`` (y). A box with four numbers becomes a ``"box"`` detection, one
    with two numbers a ``"point"``; other lengths are skipped. Each detection
    carries the most recent ``<ref>`` text seen before it (``None`` if none).

    Returns:
        A list of dicts with ``label``, ``type`` and either
        ``x1``/``y1``/``x2``/``y2`` or ``x``/``y``.
    """
    detections = []
    current_label = None
    for match in _DET_RE.finditer(answer):
        ref_text, box_text = match.group(1), match.group(2)
        if ref_text is not None:
            current_label = ref_text.strip()
            continue

        values = [int(v) for v in _COORD_RE.findall(box_text)]
        if len(values) == 4:
            left, top, right, bottom = values
            detections.append({
                "label": current_label, "type": "box",
                "x1": left / 1000 * width, "y1": top / 1000 * height,
                "x2": right / 1000 * width, "y2": bottom / 1000 * height,
            })
        elif len(values) == 2:
            px, py = values
            detections.append({
                "label": current_label, "type": "point",
                "x": px / 1000 * width, "y": py / 1000 * height,
            })
    return detections
389
+
390
+
391
def _color_for(label):
    """Return a stable 3-tuple colour derived from the CRC32 of ``label``.

    A falsy label is treated as ``"obj"``. Every channel lies in ``[50, 229]``.
    """
    digest = zlib.crc32((label or "obj").encode())
    channels = []
    # Successive base-180 digits of the checksum drive the three channels.
    for divisor in (1, 180, 32400):
        channels.append(int(50 + (digest // divisor) % 180))
    return tuple(channels)
394
+
395
+
396
def draw_detections(frame_bgr, dets):
    """Draw every detection onto ``frame_bgr`` in place and return the frame.

    ``"box"`` detections are drawn as a rectangle with the label just above
    the top-left corner; any other type is drawn as a filled circle with the
    label to its right. Labels are drawn only when present and non-empty.
    """
    for det in dets:
        label = det.get("label")
        color = _color_for(label)
        if det["type"] == "box":
            top_left = (int(det["x1"]), int(det["y1"]))
            bottom_right = (int(det["x2"]), int(det["y2"]))
            cv2.rectangle(frame_bgr, top_left, bottom_right, color, 2)
            # Keep the text on-canvas when the box touches the top edge.
            anchor = (top_left[0], max(0, top_left[1] - 6))
        else:
            center = (int(det["x"]), int(det["y"]))
            cv2.circle(frame_bgr, center, 6, color, -1)
            anchor = (center[0] + 8, center[1])
        if label:
            cv2.putText(frame_bgr, label, anchor,
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2, cv2.LINE_AA)
    return frame_bgr
412
+
413
+
414
def _load_mlmodel(path, compute_units):
    """Load a CoreML package with the compute units named by ``compute_units``.

    ``compute_units`` must be ``"cpu_and_gpu"``, ``"cpu_only"`` or ``"all"``;
    any other value raises ``KeyError``. ``coremltools`` is imported here, on
    first use, rather than at module import time.
    """
    import coremltools as ct

    attr_by_flag = {
        "cpu_and_gpu": "CPU_AND_GPU",
        "cpu_only": "CPU_ONLY",
        "all": "ALL",
    }
    unit = getattr(ct.ComputeUnit, attr_by_flag[compute_units])
    return ct.models.MLModel(path, compute_units=unit)
420
+
421
+
422
def main():
    """Command-line entry point: detect the requested categories in one image.

    Loads the three CoreML packages plus the tokenizer and runtime config,
    encodes the image, generates the answer, and writes an annotated image and
    a JSON file of detections. Output paths default to files next to this
    script, named after the input's stem.

    Raises:
        SystemExit: If the image's patch grid does not match the vision
            package metadata, the input cannot be read by OpenCV, or the
            annotated image cannot be written.
    """
    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--input", default=os.path.join(HERE, "test.png"))
    ap.add_argument("--vision-mlpackage", default=os.path.join(HERE, "LocateAnything-vision.mlpackage"))
    ap.add_argument("--embed-mlpackage", default=os.path.join(HERE, "LocateAnything-embed.mlpackage"))
    ap.add_argument("--decoder-mlpackage", default=os.path.join(HERE, "LocateAnything-decoder.mlpackage"))
    ap.add_argument("--assets", default=os.path.join(HERE, "LocateAnything-assets"))
    ap.add_argument("--categories", default="person,car")
    ap.add_argument("--out-image", default=None)
    ap.add_argument("--out-json", default=None)
    ap.add_argument("--compute-units", default="cpu_and_gpu", choices=["cpu_and_gpu", "cpu_only", "all"])
    ap.add_argument("--generation-mode", default="hybrid", choices=["fast", "slow", "hybrid"])
    ap.add_argument("--max-new-tokens", type=int, default=2048)
    ap.add_argument("--temperature", type=float, default=0.7, help="reference 0.7; pass 0 for greedy")
    ap.add_argument("--top-p", type=float, default=0.9)
    ap.add_argument("--repetition-penalty", type=float, default=1.1)
    ap.add_argument("--seed", type=int, default=0)
    args = ap.parse_args()
    t_run = time.time()

    from tokenizers import Tokenizer
    # Read the config through a context manager so the handle is closed.
    with open(os.path.join(args.assets, "runtime_config.json"), encoding="utf-8") as f:
        cfg = json.load(f)
    tokenizer = Tokenizer.from_file(os.path.join(args.assets, "tokenizer.json"))
    categories = [c.strip() for c in args.categories.split(",") if c.strip()]
    rng = np.random.default_rng(args.seed)

    stem = os.path.splitext(os.path.basename(args.input))[0]
    out_image = args.out_image or os.path.join(HERE, f"{stem}.coreml.annotated.png")
    out_json = args.out_json or os.path.join(HERE, f"{stem}.coreml.detections.json")

    print(f"[info] categories={categories} compute_units={args.compute_units}")
    t0 = time.time()
    vision = _load_mlmodel(args.vision_mlpackage, args.compute_units)
    embed = _load_mlmodel(args.embed_mlpackage, args.compute_units)
    decoder_ml = _load_mlmodel(args.decoder_mlpackage, args.compute_units)
    print(f"[info] CoreML models loaded in {time.time() - t0:.1f}s")

    # The vision package records the patch grid it expects in its metadata.
    meta = vision.user_defined_metadata
    pkg_grid = (int(meta["grid_h"]), int(meta["grid_w"]))

    t0 = time.time()
    pixel_values, grid, (width, height) = preprocess_image(args.input, cfg)
    if grid != pkg_grid:
        raise SystemExit(f"Image patch grid {grid} != vision package grid {pkg_grid}. "
                         "Use a vision package with a matching input grid.")
    print(f"[info] image {width}x{height} -> grid {grid[0]}x{grid[1]} "
          f"({pixel_values.shape[0]} patches) in {time.time() - t0:.1f}s")

    t0 = time.time()
    features = np.asarray(vision.predict({"pixel_values": pixel_values})["features"],
                          dtype=np.float16)
    print(f"[vision] features {features.shape} in {time.time() - t0:.1f}s")

    ids, img_start, n_img = build_prompt_ids(tokenizer, cfg, categories)
    print(f"[info] prompt: {len(ids)} tokens (image block {n_img} @ {img_start})")

    def embed_fn(row_ids):
        return np.asarray(embed.predict({"input_ids": row_ids[None]})["embeds"],
                          dtype=np.float16)[0]

    decoder = CoreMLDecoder(decoder_ml, cfg["kv_max"])
    out_ids = generate(decoder, embed_fn, ids, features, img_start, cfg, rng,
                       generation_mode=args.generation_mode,
                       max_new_tokens=args.max_new_tokens, temperature=args.temperature,
                       top_p=args.top_p, repetition_penalty=args.repetition_penalty,
                       verbose=True)
    answer = tokenizer.decode(out_ids, skip_special_tokens=False)

    dets = parse_detections(answer, width, height)
    frame = cv2.imread(args.input, cv2.IMREAD_COLOR)
    # ``cv2.imread`` signals failure by returning None rather than raising.
    if frame is None:
        raise SystemExit(f"Could not read {args.input} with OpenCV")
    draw_detections(frame, dets)
    if not cv2.imwrite(out_image, frame):
        raise SystemExit(f"Could not write {out_image}")
    with open(out_json, "w") as f:
        json.dump({"image": args.input, "backend": "coreml-pure", "categories": categories,
                   "generation_mode": args.generation_mode,
                   "frames": [{"frame": 0, "num_dets": len(dets),
                               "detections": dets, "raw": answer}]}, f, indent=2)
    print(f"[done] {len(dets)} detections -> {out_image} and {out_json}")
    print(f"[time] total runtime: {time.time() - t_run:.1f}s (single full run)")
502
+
503
+
504
+ if __name__ == "__main__":
505
+ main()
test.png ADDED

Git LFS Details

  • SHA256: cffa4da93deb8aa0ea3e271869a799b348181eae2a83b9e6e12eca92f7c9d304
  • Pointer size: 132 Bytes
  • Size of remote file: 2.77 MB