3v324v23 committed
Commit 8268fab · Parent: be16ca7

update readme

Files changed (1)
  1. README.md (+9, -15)
README.md CHANGED
````diff
@@ -30,17 +30,17 @@ TODO
 ---
 ### 1. Inference w/o. Efficiency Optimization
 ```python
-from transformers import AutoTokenizer, AutoModel, AutoConfig, BitsAndBytesConfig
+from transformers import AutoTokenizer, AutoModel, AutoConfig, BitsAndBytesConfig, AutoModelForCausalLM
 import torch
 
 # load model
 model_path = '/root/Models/Video-XL-2'
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
-model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map=device, quantization_config=None, attn_implementation="sdpa", torch_dtype=torch.bfloat16)
+model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device_map=device, quantization_config=None, attn_implementation="sdpa", torch_dtype=torch.float16, low_cpu_mem_usage=True)
 
 gen_kwargs = {
-"do_sample": True,
+"do_sample": False,
 "temperature": 0.01,
 "top_p": 0.001,
 "num_beams": 1,
@@ -81,26 +81,20 @@ To enable this mode, you need to set `enable_chunk_prefill` to `True` and config
 **Tip: Currently, chunk-based prefill only supports the 'sdpa' attention implementation.*
 
 ```python
-from transformers import AutoTokenizer, AutoModel, AutoConfig, BitsAndBytesConfig
+from transformers import AutoTokenizer, AutoModel, AutoConfig, BitsAndBytesConfig, AutoModelForCausalLM
 import torch
 import pdb
 import argparse
 
 torch.cuda.reset_peak_memory_stats()
 # load model
-model_path = '/share/minghao/Models/Video-XL-2'
+model_path = '/share/minghao/Models2/Video-XL-2'
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
-model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map=device, quantization_config=None, attn_implementation="sdpa", torch_dtype=torch.bfloat16)
+model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device_map=device, quantization_config=None, attn_implementation="sdpa", torch_dtype=torch.float16, low_cpu_mem_usage=True) # sdpa
 
 gen_kwargs = {"do_sample": False, "temperature": 0.01, "top_p": 0.001, "num_beams": 1, "use_cache": True, "max_new_tokens": 128}
 
-
-"""
-Set params
-With Chunk-based Prefill enabled, Video-XL-2 can process 1,300 frames on a 24GB GPU (using approximately 23.72GB). When combined with bi-level KVS decoding, this capacity increases to 1,800 frames.
-If you have ample resources, you can disable offload and increase chunk_size_for_vision_tower and chunk_size to achieve faster processing.
-"""
 model.config.enable_chunk_prefill = True
 prefill_config = {
 'chunk_prefill_mode': 'streaming',
@@ -112,12 +106,12 @@ prefill_config = {
 model.config.prefill_config = prefill_config
 
 # input data
-video_path = "/share/LXRlxr0_0/code/videoxl2/lmm-eval/~/.cache/huggingface/videomme/ZBKUqc_ICpg.mp4"
+video_path = "/asset/demo.mp4"
 question1 = "How many people in the video? (A)3 people (B)6 people. Please only respone the letter"
 
 # params
 max_num_frames = 1300
-sample_fps = None # extract frame at 1fps
+sample_fps = None # uniform sampling
 max_sample_fps = None
 
 with torch.inference_mode():
@@ -125,7 +119,7 @@ with torch.inference_mode():
 
 
 peak_memory_allocated = torch.cuda.max_memory_allocated()
-print(f"Memory Peak: {peak_memory_allocated / (1024**3):.2f} GB") # 23.72GB
+print(f"Memory Peak: {peak_memory_allocated / (1024**3):.2f} GB") # convert to GB
 print(response)
 ```
 
````
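As a side note on the "Memory Peak" line that both versions of the chunk-prefill snippet print: the measurement is the standard `torch.cuda` peak-memory pattern, reset before the measured region and read back afterwards. Below is a minimal, self-contained sketch of just that pattern; `run_video_inference` is a hypothetical placeholder for the Video-XL-2 call that the diff hunks elide, not part of the model's API.

```python
import torch

def run_video_inference() -> str:
    # Hypothetical placeholder for the elided model call that produces `response`.
    return "A"

# Clear the peak-memory counter before the measured region (the snippet does this
# right after its imports with torch.cuda.reset_peak_memory_stats()).
torch.cuda.reset_peak_memory_stats()

with torch.inference_mode():
    response = run_video_inference()

# max_memory_allocated() returns the high-water mark of GPU memory allocated by
# tensors since the last reset, in bytes; dividing by 1024**3 converts it to GiB.
peak_memory_allocated = torch.cuda.max_memory_allocated()
print(f"Memory Peak: {peak_memory_allocated / (1024**3):.2f} GB")
print(response)
```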