Improve model card, add link to paper

#1
by nielsr HF Staff - opened
Files changed (1)
  1. README.md +73 -61
README.md CHANGED
@@ -1,64 +1,76 @@
  Example to use pretrained checkpoints.
 
- from huggingface_hub import hf_hub_download
- import torch
- import clip
- import torch.nn.functional as F
- import numpy as np
- import cv2
- import torchvision.transforms as transforms
-
-
- def generate_event_image(frames, threshold=10):
-     frames = np.array(frames)
-     num_frames, height, width, _ = frames.shape
-     event_images = []
-
-     for i in range(1, num_frames):
-         diff = cv2.absdiff(frames[i], frames[i-1])
-         gray_diff = cv2.cvtColor(diff, cv2.COLOR_RGB2GRAY)
-         _, event_image = cv2.threshold(gray_diff, threshold, 255, cv2.THRESH_BINARY)
-         event_images.append(event_image)
-
-     return torch.tensor(event_images).sum(dim=0)
-
-
- ckpt_path = hf_hub_download(
-     repo_id="Eavn/event-clip",
-     filename="vitb.pt",  # or vitl.pt for pretraining checkpoints
-     repo_type="model"
- )
-
- model, preprocess = clip.load("ViT-B/32")
- # model, preprocess = clip.load("ViT-L/14")
-
- state_dict = torch.load(ckpt_path)["checkpoint"]
- new_state_dict = {}
- for key in state_dict.keys():
-     if 'encoder_k' in key:
-         new_state_dict[key.replace('encoder_k.', '')] = state_dict[key]
- model.load_state_dict(new_state_dict)
-
- transform = transforms.Compose([
-     transforms.Resize((224, 224)),
- ])
-
- stack_size = 16
- threshold = 10
- clamp = 10
- text = 'Put the Text Here'
- text = clip.tokenize([text]).cuda()
-
- images = (np.random.rand(32, 224, 224, 3) * 255).astype(np.uint8)
- event = generate_event_image(
-     images[:stack_size],
-     threshold=threshold
- )
- if clamp > 0:
-     event = torch.clamp(event, min=0, max=clamp)
- event = event / event.max()
- event = torch.stack([event, event, event])
- event = transform(event)
- event = event.cuda().unsqueeze(0)
-
- logits_per_event, _ = model(event, text)
+ ---
+ pipeline_tag: zero-shot-image-classification
+ ---
+
+ This repository contains the models presented in [Uncertainty-Weighted Image-Event Multimodal Fusion for Video Anomaly Detection](https://huggingface.co/papers/2505.02393).
+
+ Code: https://github.com/EavnJeong/IEF-VAD
+
+ ## Usage
+
  Example to use pretrained checkpoints.
 
+ ```python
+ from huggingface_hub import hf_hub_download
+ import torch
+ import clip
+ import torch.nn.functional as F
+ import numpy as np
+ import cv2
+ import torchvision.transforms as transforms
+
+
+ def generate_event_image(frames, threshold=10):
+     frames = np.array(frames)
+     num_frames, height, width, _ = frames.shape
+     event_images = []
+
+     for i in range(1, num_frames):
+         diff = cv2.absdiff(frames[i], frames[i-1])
+         gray_diff = cv2.cvtColor(diff, cv2.COLOR_RGB2GRAY)
+         _, event_image = cv2.threshold(gray_diff, threshold, 255, cv2.THRESH_BINARY)
+         event_images.append(event_image)
+
+     return torch.tensor(event_images).sum(dim=0)
+
+
+ ckpt_path = hf_hub_download(
+     repo_id="Eavn/event-clip",
+     filename="vitb.pt",  # or vitl.pt for pretraining checkpoints
+     repo_type="model"
+ )
+
+ model, preprocess = clip.load("ViT-B/32")
+ # model, preprocess = clip.load("ViT-L/14")
+
+ state_dict = torch.load(ckpt_path)["checkpoint"]
+ new_state_dict = {}
+ for key in state_dict.keys():
+     if 'encoder_k' in key:
+         new_state_dict[key.replace('encoder_k.', '')] = state_dict[key]
+ model.load_state_dict(new_state_dict)
+
+ transform = transforms.Compose([
+     transforms.Resize((224, 224)),
+ ])
+
+ stack_size = 16
+ threshold = 10
+ clamp = 10
+ text = 'Put the Text Here'
+ text = clip.tokenize([text]).cuda()
+
+ images = (np.random.rand(32, 224, 224, 3) * 255).astype(np.uint8)
+ event = generate_event_image(
+     images[:stack_size],
+     threshold=threshold
+ )
+ if clamp > 0:
+     event = torch.clamp(event, min=0, max=clamp)
+ event = event / event.max()
+ event = torch.stack([event, event, event])
+ event = transform(event)
+ event = event.cuda().unsqueeze(0)
+
+ logits_per_event, _ = model(event, text)
+ ```
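
A note on consuming the output: `model(event, text)` is the standard CLIP forward pass, so `logits_per_event` has shape `(num_events, num_prompts)` and a softmax over the prompt axis turns the logits into zero-shot class scores. The sketch below is a minimal illustration, not part of this diff; the prompt strings are hypothetical placeholders, and it assumes `model` and `event` are built as in the example above with `clip` being OpenAI's CLIP package (`pip install git+https://github.com/openai/CLIP.git`).

```python
import torch
import clip

# Hypothetical prompts for zero-shot scoring; replace with task-specific text.
prompts = ["a normal video scene", "an anomalous video scene"]
text = clip.tokenize(prompts).cuda()

with torch.no_grad():
    # Standard CLIP forward: logits_per_event is (num_events, num_prompts).
    logits_per_event, logits_per_text = model(event, text)
    # Softmax over the prompt axis yields zero-shot probabilities.
    probs = logits_per_event.softmax(dim=-1)

# Scores per prompt for the single event image.
print(dict(zip(prompts, probs[0].tolist())))
```

With a single free-text prompt, as in the example above, the raw logit itself can serve as a similarity score, since CLIP logits are scaled cosine similarities between the event and text embeddings.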