# InternVideo ECVA tuned head

- Base backbone: `revliter/internvideo_next_large_p14_res224_f16`
- Clip length: `16` frames
- Frame size: `224x224`
- Head hidden dims: `[512]`
- Repo: `happy8825/internvideo_tuned`

## Quick start (single video)

```bash
pip install decord transformers huggingface_hub
python inference_example.py --repo_id happy8825/internvideo_tuned --video /path/to/video.mp4 --device cuda
```

The script downloads this repo, loads the InternVideo backbone + tuned head, and prints `normal` or `abnormal`.

## Minimal Python snippet

```python
import json
import os

import numpy as np
import torch
from decord import VideoReader
from huggingface_hub import snapshot_download
from torch import nn
from transformers import VideoMAEImageProcessor, AutoModel

# Maps the class index predicted by the head to its human-readable label.
ID2LABEL = {0: "normal", 1: "abnormal"}


class ClassificationHead(nn.Module):
    """MLP classifier applied on top of pooled backbone features.

    Every entry of ``hidden_dims`` adds a ``Linear -> GELU -> Dropout``
    stage; a final ``Linear`` projects to ``num_labels`` logits. With an
    empty ``hidden_dims`` the head is a single linear layer.

    Args:
        in_dim: Width of the input feature vector.
        hidden_dims: Iterable of hidden layer widths (may be empty).
        num_labels: Number of output logits.
        dropout: Dropout probability used after each hidden activation.
    """

    def __init__(self, in_dim, hidden_dims, num_labels=2, dropout=0.1):
        super().__init__()
        dims = [in_dim, *hidden_dims]
        layers = []
        # Walk consecutive (input, output) width pairs of the hidden stack.
        for d_in, d_out in zip(dims, dims[1:]):
            layers += [nn.Linear(d_in, d_out), nn.GELU(), nn.Dropout(dropout)]
        layers.append(nn.Linear(dims[-1], num_labels))
        # The attribute name and layer order define the state_dict keys
        # (``net.<index>.weight`` ...), so they must not change if saved
        # head weights are to keep loading.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits produced by the MLP for ``x``."""
        return self.net(x)


def pool_tokens(feats, expected=None):
    """Reduce a 3-D feature tensor to 2-D by averaging over one axis.

    Tensors that are not 3-D are returned unchanged. For a tensor of shape
    ``(B, d1, d2)``:

    * if ``expected`` is truthy and equals ``d1``, average over axis 2;
    * otherwise, if ``expected`` is truthy and equals ``d2``, average over
      axis 1;
    * otherwise keep the shorter of the two axes (ties keep ``d1``) and
      average over the other one.

    Presumably one axis indexes tokens and the other feature channels, with
    ``expected`` being the channel width; confirm against the backbone.

    Args:
        feats: Tensor returned by the backbone.
        expected: Optional length of the axis that must be preserved.

    Returns:
        ``feats`` itself when it is not 3-D, else a ``(B, kept)`` tensor.
    """
    if feats.dim() != 3:
        return feats
    _, d1, d2 = feats.shape
    if expected:
        if d1 == expected:
            return feats.mean(dim=2)
        if d2 == expected:
            return feats.mean(dim=1)
    # No usable hint: keep the shorter axis and average out the longer one.
    return feats.mean(dim=2 if d1 <= d2 else 1)


repo = "happy8825/internvideo_tuned"
device = "cuda"  # single place to change the target device
local = snapshot_download(repo)

# Training-time settings shipped with the head; the fallbacks below are used
# when a key is missing from the config.
with open(os.path.join(local, "train_config.json"), encoding="utf-8") as f:
    cfg = json.load(f)
base = cfg.get("base_model", "revliter/internvideo_next_large_p14_res224_f16")
clip_len = int(cfg.get("clip_len", 16))
hidden = cfg.get("hidden", [512])

processor = VideoMAEImageProcessor.from_pretrained(base)
# NOTE: trust_remote_code=True runs Python code shipped in the backbone repo.
backbone = AutoModel.from_pretrained(base, trust_remote_code=True).eval().to(device)

# Resolve the feature width once so the head's input size and the pooling
# hint always agree, including when the config does not record it.
feat_dim = cfg.get("feature_dim") or cfg.get("hidden_size") or backbone.config.hidden_size
head = ClassificationHead(in_dim=feat_dim, hidden_dims=hidden)
# NOTE(review): torch.load unpickles the file; on PyTorch versions where
# weights_only defaults to False this can execute arbitrary code from the
# downloaded checkpoint. Consider weights_only=True if the file holds only
# tensors and plain containers.
state = torch.load(os.path.join(local, "best_head.pt"), map_location="cpu")
head.load_state_dict(state["head"])
head.eval().to(device)

# Sample clip_len frame indices evenly spaced across the whole video.
vr = VideoReader("/path/to/video.mp4")
idxs = np.linspace(0, len(vr) - 1, num=clip_len, dtype=int)
frames = [vr[i].asnumpy() for i in idxs]
# Swap axes 1 and 2 of the processor output; presumably this turns
# (batch, frames, channels, H, W) into the channels-first clip layout the
# backbone expects (confirm against the backbone's model card).
px = processor(frames, return_tensors="pt")["pixel_values"].permute(0, 2, 1, 3, 4).to(device)

with torch.no_grad(), torch.amp.autocast(torch.device(device).type, dtype=torch.bfloat16):
    feats = backbone.extract_features(pixel_values=px)
    pooled = pool_tokens(feats, expected=feat_dim)
    pred = int(head(pooled.float()).argmax(dim=-1).item())
print(ID2LABEL.get(pred, pred))
```

## Files

- `best_head.pt`: classifier head weights
- `train_config.json`: training config (contains base model, clip_len, frame_size, hidden dims, etc.)
- `inference_example.py`: minimal inference helper