"""Reference inference for the Stage 0 baseline. Loads Argus (EUPE-ViT-B backbone), reads the classifier config, and scores one or more images. Prints the raw score and the binary decision. Usage: python infer.py image1.jpg [image2.jpg ...] """ import json, sys, os import torch import torch.nn.functional as F from PIL import Image import numpy as np from transformers import AutoModel def load_classifier(path='classifier.json'): with open(path) as f: return json.load(f) def load_argus(repo_or_path='phanerozoic/argus'): return AutoModel.from_pretrained(repo_or_path, trust_remote_code=True) def preprocess(image_path, resolution=768, device='cuda'): img = Image.open(image_path).convert('RGB').resize((resolution, resolution), Image.BILINEAR) arr = np.asarray(img, dtype=np.uint8).copy() x = torch.from_numpy(arr).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0 mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(device) std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(device) return (x - mean) / std @torch.inference_mode() def score(model, x, classifier): with torch.autocast('cuda', dtype=torch.bfloat16): out = model.backbone.forward_features(x) patches = out['x_norm_patchtokens'].float().squeeze(0) D = classifier['feature_dim'] ln = F.layer_norm(patches, [D]) pooled = ln.max(dim=0).values pos = pooled[classifier['pos_dims']].sum() neg = pooled[classifier['neg_dims']].sum() return float((pos - neg).item()) def main(): if len(sys.argv) < 2: print('usage: python infer.py [image2 ...]') sys.exit(1) here = os.path.dirname(os.path.abspath(__file__)) classifier = load_classifier(os.path.join(here, 'classifier.json')) device = 'cuda' if torch.cuda.is_available() else 'cpu' model = load_argus().to(device).eval() thr = classifier['threshold'] for image_path in sys.argv[1:]: x = preprocess(image_path, classifier['input_resolution'], device) s = score(model, x, classifier) print(f'{image_path} score={s:+.3f} threshold={thr:+.3f} person={s > thr}') if __name__ == '__main__': main()