KevinX-Penn28 commited on
Commit
3ca196f
·
verified ·
1 Parent(s): 179030a

Upload VINE model - config

Browse files
Files changed (2) hide show
  1. config.json +30 -0
  2. vine_config.py +91 -0
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "_device": "cuda",
4
+ "alpha": 0.5,
5
+ "auto_map": {
6
+ "AutoConfig": "vine_config.VineConfig"
7
+ },
8
+ "bbox_min_dim": 5,
9
+ "box_threshold": 0.35,
10
+ "debug_visualizations": false,
11
+ "hidden_dim": 768,
12
+ "interested_object_pairs": [],
13
+ "max_video_length": 100,
14
+ "model_name": "openai/clip-vit-base-patch32",
15
+ "model_type": "vine",
16
+ "multi_class": false,
17
+ "num_top_pairs": 18,
18
+ "output_logit": false,
19
+ "pretrained_vine_path": null,
20
+ "return_flattened_segments": false,
21
+ "return_valid_pairs": false,
22
+ "segmentation_method": "grounding_dino_sam2",
23
+ "target_fps": 1,
24
+ "text_threshold": 0.25,
25
+ "topk_cate": 3,
26
+ "transformers_version": "4.46.2",
27
+ "visualization_dir": null,
28
+ "visualize": false,
29
+ "white_alpha": 0.8
30
+ }
vine_config.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import PretrainedConfig
3
+ from typing import List, Optional, Dict, Any, Tuple
4
+
5
+
class VineConfig(PretrainedConfig):
    """
    Configuration class for VINE (Video Understanding with Natural Language) model.

    VINE is a video understanding model that processes categorical (object class names),
    unary keywords (actions on one object), and binary keywords (relations between two objects),
    and returns probability distributions over all of them when passed a video.

    Args:
        model_name (str): The CLIP model name to use as backbone. Default: "openai/clip-vit-base-patch32"
        hidden_dim (int): Hidden dimension size. Default: 768
        pretrained_vine_path (str, optional): Path to pretrained VINE weights. Default: None
        num_top_pairs (int): Number of top object pairs to consider. Default: 18
        segmentation_method (str): Segmentation method to use ("sam2" or "grounding_dino_sam2"). Default: "grounding_dino_sam2"
        box_threshold (float): Box threshold for Grounding DINO. Default: 0.35
        text_threshold (float): Text threshold for Grounding DINO. Default: 0.25
        target_fps (int): Target FPS for video processing. Default: 1
        alpha (float): Alpha value for object extraction. Default: 0.5
        white_alpha (float): White alpha value for background blending. Default: 0.8
        topk_cate (int): Top-k categories to return. Default: 3
        multi_class (bool): Whether to use multi-class classification. Default: False
        output_logit (bool): Whether to output logits instead of probabilities. Default: False
        max_video_length (int): Maximum number of frames to process. Default: 100
        bbox_min_dim (int): Minimum bounding box dimension. Default: 5
        visualize (bool): Whether to visualize results. Default: False
        visualization_dir (str, optional): Directory to save visualizations. Default: None
        debug_visualizations (bool): Whether to save debug visualizations. Default: False
        return_flattened_segments (bool): Whether to return flattened segments. Default: False
        return_valid_pairs (bool): Whether to return valid object pairs. Default: False
        interested_object_pairs (List[Tuple[int, int]], optional): List of interested object pairs
        device (str | int, optional): Device spec; an int is treated as a CUDA device index.
            Default: None (auto-selects "cuda" if available, else "cpu")
    """

    model_type = "vine"

    def __init__(
        self,
        model_name: str = "openai/clip-vit-base-patch32",
        hidden_dim: int = 768,
        pretrained_vine_path: Optional[str] = None,
        num_top_pairs: int = 18,
        segmentation_method: str = "grounding_dino_sam2",
        box_threshold: float = 0.35,
        text_threshold: float = 0.25,
        target_fps: int = 1,
        alpha: float = 0.5,
        white_alpha: float = 0.8,
        topk_cate: int = 3,
        multi_class: bool = False,
        output_logit: bool = False,
        max_video_length: int = 100,
        bbox_min_dim: int = 5,
        visualize: bool = False,
        visualization_dir: Optional[str] = None,
        return_flattened_segments: bool = False,
        return_valid_pairs: bool = False,
        interested_object_pairs: Optional[List[Tuple[int, int]]] = None,
        debug_visualizations: bool = False,
        device: Optional[str | int] = None,
        **kwargs
    ):
        self.model_name = model_name
        self.pretrained_vine_path = pretrained_vine_path
        self.hidden_dim = hidden_dim
        self.num_top_pairs = num_top_pairs
        self.segmentation_method = segmentation_method
        self.box_threshold = box_threshold
        self.text_threshold = text_threshold
        self.target_fps = target_fps
        self.alpha = alpha
        self.white_alpha = white_alpha
        self.topk_cate = topk_cate
        self.multi_class = multi_class
        self.output_logit = output_logit
        self.max_video_length = max_video_length
        self.bbox_min_dim = bbox_min_dim
        self.visualize = visualize
        self.visualization_dir = visualization_dir
        self.return_flattened_segments = return_flattened_segments
        self.return_valid_pairs = return_valid_pairs
        # Avoid sharing a mutable default across instances.
        self.interested_object_pairs = interested_object_pairs or []
        self.debug_visualizations = debug_visualizations
        # BUGFIX: original used `device is int` (identity test against the type
        # object), which is False for every actual integer — so an int device
        # index (including 0, which is also falsy) was silently ignored and the
        # generic "cuda"/"cpu" fallback was used instead.
        if isinstance(device, int):
            self._device = f"cuda:{device}" if torch.cuda.is_available() else "cpu"
        else:
            self._device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        super().__init__(**kwargs)