Improve model card, add link to paper

#1
by nielsr HF Staff - opened
Files changed (1)
  1. README.md +73 -61
README.md CHANGED
@@ -1,64 +1,76 @@
  Example to use pretrained checkpoints.
 
- from huggingface_hub import hf_hub_download
- import torch
- import clip
- import torch.nn.functional as F
- import numpy as np
- import cv2
- import torchvision.transforms as transforms
-
-
- def generate_event_image(frames, threshold=10):
-     frames = np.array(frames)
-     num_frames, height, width, _ = frames.shape
-     event_images = []
-
-     for i in range(1, num_frames):
-         diff = cv2.absdiff(frames[i], frames[i-1])
-         gray_diff = cv2.cvtColor(diff, cv2.COLOR_RGB2GRAY)
-         _, event_image = cv2.threshold(gray_diff, threshold, 255, cv2.THRESH_BINARY)
-         event_images.append(event_image)
-
-     return torch.tensor(event_images).sum(dim=0)
-
-
- ckpt_path = hf_hub_download(
-     repo_id="Eavn/event-clip",
-     filename="vitb.pt",  # or vitl.pt for pretraining checkpoints
-     repo_type="model"
- )
-
- model, preprocess = clip.load("ViT-B/32")
- # model, preprocess = clip.load("ViT-L/14")
-
- state_dict = torch.load(ckpt_path)["checkpoint"]
- new_state_dict = {}
- for key in state_dict.keys():
-     if 'encoder_k' in key:
-         new_state_dict[key.replace('encoder_k.', '')] = state_dict[key]
- model.load_state_dict(new_state_dict)
-
- transform = transforms.Compose([
-     transforms.Resize((224, 224)),
- ])
-
- stack_size = 16
- threshold = 10
- clamp = 10
- text = 'Put the Text Here'
- text = clip.tokenize([text]).cuda()
-
- images = (np.random.rand(32, 224, 224, 3) * 255).astype(np.uint8)
- event = generate_event_image(
-     images[:stack_size],
-     threshold=threshold
- )
- if clamp > 0:
-     event = torch.clamp(event, min=0, max=clamp)
- event = event / event.max()
- event = torch.stack([event, event, event])
- event = transform(event)
- event = event.cuda().unsqueeze(0)
-
- logits_per_event, _ = model(event, text)
+ ---
+ pipeline_tag: zero-shot-image-classification
+ ---
+
+ This repository contains the models presented in [Uncertainty-Weighted Image-Event Multimodal Fusion for Video Anomaly Detection](https://huggingface.co/papers/2505.02393).
+
+ Code: https://github.com/EavnJeong/IEF-VAD
+
+ ## Usage
+
  Example to use pretrained checkpoints.
 
+ ```python
+ from huggingface_hub import hf_hub_download
+ import torch
+ import clip
+ import torch.nn.functional as F
+ import numpy as np
+ import cv2
+ import torchvision.transforms as transforms
+
+
+ def generate_event_image(frames, threshold=10):
+     frames = np.array(frames)
+     num_frames, height, width, _ = frames.shape
+     event_images = []
+
+     for i in range(1, num_frames):
+         diff = cv2.absdiff(frames[i], frames[i-1])
+         gray_diff = cv2.cvtColor(diff, cv2.COLOR_RGB2GRAY)
+         _, event_image = cv2.threshold(gray_diff, threshold, 255, cv2.THRESH_BINARY)
+         event_images.append(event_image)
+
+     return torch.tensor(event_images).sum(dim=0)
+
+
+ ckpt_path = hf_hub_download(
+     repo_id="Eavn/event-clip",
+     filename="vitb.pt",  # or vitl.pt for pretraining checkpoints
+     repo_type="model"
+ )
+
+ model, preprocess = clip.load("ViT-B/32")
+ # model, preprocess = clip.load("ViT-L/14")
+
+ state_dict = torch.load(ckpt_path)["checkpoint"]
+ new_state_dict = {}
+ for key in state_dict.keys():
+     if 'encoder_k' in key:
+         new_state_dict[key.replace('encoder_k.', '')] = state_dict[key]
+ model.load_state_dict(new_state_dict)
+
+ transform = transforms.Compose([
+     transforms.Resize((224, 224)),
+ ])
+
+ stack_size = 16
+ threshold = 10
+ clamp = 10
+ text = 'Put the Text Here'
+ text = clip.tokenize([text]).cuda()
+
+ images = (np.random.rand(32, 224, 224, 3) * 255).astype(np.uint8)
+ event = generate_event_image(
+     images[:stack_size],
+     threshold=threshold
+ )
+ if clamp > 0:
+     event = torch.clamp(event, min=0, max=clamp)
+ event = event / event.max()
+ event = torch.stack([event, event, event])
+ event = transform(event)
+ event = event.cuda().unsqueeze(0)
+
+ logits_per_event, _ = model(event, text)
+ ```
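
A note on consuming the output: `model(event, text)` is the standard CLIP forward pass, so `logits_per_event` has shape `(num_events, num_prompts)` and a softmax over the prompt axis turns the logits into zero-shot class scores. The sketch below is a minimal illustration, not part of this diff; the prompt strings are hypothetical placeholders, and it assumes `model` and `event` are built as in the example above with `clip` being OpenAI's CLIP package (`pip install git+https://github.com/openai/CLIP.git`).

```python
import torch
import clip

# Hypothetical prompts for zero-shot scoring; replace with task-specific text.
prompts = ["a normal video scene", "an anomalous video scene"]
text = clip.tokenize(prompts).cuda()

with torch.no_grad():
    # Standard CLIP forward: logits_per_event is (num_events, num_prompts).
    logits_per_event, logits_per_text = model(event, text)
    # Softmax over the prompt axis yields zero-shot probabilities.
    probs = logits_per_event.softmax(dim=-1)

# Scores per prompt for the single event image.
print(dict(zip(prompts, probs[0].tolist())))
```

With a single free-text prompt, as in the example above, the raw logit itself can serve as a similarity score, since CLIP logits are scaled cosine similarities between the event and text embeddings.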