Eavn committed on
Commit
033ef02
·
verified ·
1 Parent(s): e1d0ea0

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +62 -62
README.md CHANGED
@@ -1,64 +1,64 @@
1
  Example of how to use the pretrained checkpoints.
2
 
3
- from huggingface_hub import hf_hub_download
4
- import torch
5
- import clip
6
- import torch.nn.functional as F
7
- import numpy as np
8
- import cv2
9
- import torchvision.transforms as transforms
10
-
11
-
12
- def generate_event_image(frames, threshold=10):
13
- frames = np.array(frames)
14
- num_frames, height, width, _ = frames.shape
15
- event_images = []
16
-
17
- for i in range(1, num_frames):
18
- diff = cv2.absdiff(frames[i], frames[i-1])
19
- gray_diff = cv2.cvtColor(diff, cv2.COLOR_RGB2GRAY)
20
- _, event_image = cv2.threshold(gray_diff, threshold, 255, cv2.THRESH_BINARY)
21
- event_images.append(event_image)
22
-
23
- return torch.tensor(event_images).sum(dim=0)
24
-
25
-
26
- ckpt_path = hf_hub_download(
27
- repo_id="Eavn/event-clip",
28
- filename="vitb.pt", # or vitl.pt for pretraining checkpoints
29
- repo_type="model"
30
- )
31
-
32
- model, preprocess = clip.load("ViT-B/32")
33
- # model, preprocess = clip.load("ViT-L/14")
34
-
35
- state_dict = torch.load(ckpt_path)["checkpoint"]
36
- new_state_dict = {}
37
- for key in state_dict.keys():
38
- if 'encoder_k' in key:
39
- new_state_dict[key.replace('encoder_k.', '')] = state_dict[key]
40
- model.load_state_dict(new_state_dict)
41
-
42
- transform = transforms.Compose([
43
- transforms.Resize((224, 224)),
44
- ])
45
-
46
- stack_size = 16
47
- threshold = 10
48
- clamp = 10
49
- text = 'Put the Text Here'
50
- text = clip.tokenize([text]).cuda()
51
-
52
- images = (np.random.rand(32, 224, 224, 3) * 255).astype(np.uint8)
53
- event = generate_event_image(
54
- images[:stack_size],
55
- threshold=threshold
56
- )
57
- if clamp > 0:
58
- event = torch.clamp(event, min=0, max=clamp)
59
- event = event / event.max()
60
- event = torch.stack([event, event, event])
61
- event = transform(event)
62
- event = event.cuda().unsqueeze(0)
63
-
64
- logits_per_event, _ = model(event, text)
 
1
  Example of how to use the pretrained checkpoints.
2
 
3
+ from huggingface_hub import hf_hub_download
4
+ import torch
5
+ import clip
6
+ import torch.nn.functional as F
7
+ import numpy as np
8
+ import cv2
9
+ import torchvision.transforms as transforms
10
+
11
+
12
+ def generate_event_image(frames, threshold=10):
13
+ frames = np.array(frames)
14
+ num_frames, height, width, _ = frames.shape
15
+ event_images = []
16
+
17
+ for i in range(1, num_frames):
18
+ diff = cv2.absdiff(frames[i], frames[i-1])
19
+ gray_diff = cv2.cvtColor(diff, cv2.COLOR_RGB2GRAY)
20
+ _, event_image = cv2.threshold(gray_diff, threshold, 255, cv2.THRESH_BINARY)
21
+ event_images.append(event_image)
22
+
23
+ return torch.tensor(event_images).sum(dim=0)
24
+
25
+
26
+ ckpt_path = hf_hub_download(
27
+ repo_id="Eavn/event-clip",
28
+ filename="vitb.pt", # or vitl.pt for pretraining checkpoints
29
+ repo_type="model"
30
+ )
31
+
32
+ model, preprocess = clip.load("ViT-B/32")
33
+ # model, preprocess = clip.load("ViT-L/14")
34
+
35
+ state_dict = torch.load(ckpt_path)["checkpoint"]
36
+ new_state_dict = {}
37
+ for key in state_dict.keys():
38
+ if 'encoder_k' in key:
39
+ new_state_dict[key.replace('encoder_k.', '')] = state_dict[key]
40
+ model.load_state_dict(new_state_dict)
41
+
42
+ transform = transforms.Compose([
43
+ transforms.Resize((224, 224)),
44
+ ])
45
+
46
+ stack_size = 16
47
+ threshold = 10
48
+ clamp = 10
49
+ text = 'Put the Text Here'
50
+ text = clip.tokenize([text]).cuda()
51
+
52
+ images = (np.random.rand(32, 224, 224, 3) * 255).astype(np.uint8)
53
+ event = generate_event_image(
54
+ images[:stack_size],
55
+ threshold=threshold
56
+ )
57
+ if clamp > 0:
58
+ event = torch.clamp(event, min=0, max=clamp)
59
+ event = event / event.max()
60
+ event = torch.stack([event, event, event])
61
+ event = transform(event)
62
+ event = event.cuda().unsqueeze(0)
63
+
64
+ logits_per_event, _ = model(event, text)