| """Reference inference for the Stage 0 baseline. |
| |
| Loads Argus (EUPE-ViT-B backbone), reads the classifier config, and scores one |
| or more images. Prints the raw score and the binary decision. |
| |
| Usage: python infer.py image1.jpg [image2.jpg ...] |
| """ |
| import json, sys, os |
| import torch |
| import torch.nn.functional as F |
| from PIL import Image |
| import numpy as np |
| from transformers import AutoModel |
|
|
|
|
def load_classifier(path='classifier.json'):
    """Read the classifier configuration from a JSON file.

    Args:
        path: Location of the JSON config file.

    Returns:
        The decoded JSON document (a dict for the config this script uses).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # locale encoding, which differs between systems.
    with open(path, encoding='utf-8') as f:
        return json.load(f)
|
|
|
|
def load_argus(repo_or_path='phanerozoic/argus'):
    """Instantiate the Argus model through ``AutoModel.from_pretrained``.

    Args:
        repo_or_path: Identifier or local path handed to ``from_pretrained``.

    Returns:
        The model object returned by ``AutoModel.from_pretrained``.
    """
    # NOTE: trust_remote_code=True lets the checkpoint supply its own Python
    # modelling code, which is executed locally; only load trusted sources.
    model = AutoModel.from_pretrained(repo_or_path, trust_remote_code=True)
    return model
|
|
|
|
def preprocess(image_path, resolution=768, device='cuda'):
    """Load an image file and convert it into a normalized input tensor.

    The image is converted to RGB, resized to a ``resolution`` x ``resolution``
    square with bilinear resampling (aspect ratio is not preserved), scaled to
    the [0, 1] range and standardized per channel.

    Args:
        image_path: Path of the image to load (anything ``Image.open`` accepts).
        resolution: Side length, in pixels, of the square model input.
        device: Torch device on which the returned tensor is placed.

    Returns:
        A float32 tensor of shape ``(1, 3, resolution, resolution)`` on
        ``device``.
    """
    # Image.open is lazy and keeps the underlying file open; the context
    # manager releases the handle as soon as the pixels have been read, so
    # scoring many images in one run does not accumulate open files.
    with Image.open(image_path) as src:
        img = src.convert('RGB').resize((resolution, resolution), Image.BILINEAR)
    # Copy so the array is writable before handing it to torch.from_numpy.
    arr = np.asarray(img, dtype=np.uint8).copy()
    # HWC uint8 -> 1xCxHxW float in [0, 1].
    x = torch.from_numpy(arr).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
    # Per-channel statistics (the values commonly used for ImageNet-pretrained
    # backbones), created directly on the target device.
    mean = torch.tensor([0.485, 0.456, 0.406], device=device).view(1, 3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225], device=device).view(1, 3, 1, 1)
    return (x - mean) / std
|
|
|
|
@torch.inference_mode()
def score(model, x, classifier):
    """Compute the raw classifier score for one preprocessed image.

    The backbone's patch tokens are layer-normalized (no affine parameters),
    max-pooled over the patch axis, and the score is the sum of the pooled
    ``pos_dims`` channels minus the sum of the pooled ``neg_dims`` channels.

    Args:
        model: Object exposing ``backbone.forward_features(x)`` that returns a
            mapping containing ``'x_norm_patchtokens'``.
        x: Input tensor for a single image (batch size 1 is assumed: the
            leading axis is squeezed away before pooling over axis 0).
        classifier: Mapping with ``'feature_dim'``, ``'pos_dims'`` and
            ``'neg_dims'`` entries.

    Returns:
        The score as a Python float.
    """
    # Autocast on whatever device the input lives on. Hard-coding 'cuda'
    # makes torch warn and disable autocast when the script runs on CPU.
    with torch.autocast(x.device.type, dtype=torch.bfloat16):
        out = model.backbone.forward_features(x)
    # Post-processing is done in float32 regardless of the autocast dtype.
    patches = out['x_norm_patchtokens'].float().squeeze(0)
    feature_dim = classifier['feature_dim']
    normed = F.layer_norm(patches, [feature_dim])
    # Max over patches: one value per feature channel.
    pooled = normed.max(dim=0).values
    pos = pooled[classifier['pos_dims']].sum()
    neg = pooled[classifier['neg_dims']].sum()
    return float((pos - neg).item())
|
|
|
|
def main():
    """Command-line entry point: score every image path given in ``sys.argv``.

    Loads ``classifier.json`` from the directory containing this script, loads
    the model onto CUDA when available (otherwise CPU), and prints one line per
    image with the raw score, the threshold and the thresholded decision.

    Exits with status 1 and a usage message on stderr when no image path is
    supplied.
    """
    image_paths = sys.argv[1:]
    if not image_paths:
        # sys.exit with a string writes it to stderr and exits with status 1,
        # keeping the usage error out of the result stream on stdout.
        sys.exit('usage: python infer.py <image1> [image2 ...]')
    # Resolve the config relative to the script, not the working directory.
    here = os.path.dirname(os.path.abspath(__file__))
    classifier = load_classifier(os.path.join(here, 'classifier.json'))
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = load_argus().to(device).eval()
    thr = classifier['threshold']
    resolution = classifier['input_resolution']
    for image_path in image_paths:
        x = preprocess(image_path, resolution, device)
        s = score(model, x, classifier)
        print(f'{image_path} score={s:+.3f} threshold={thr:+.3f} person={s > thr}')
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|
|