update readme
README.md CHANGED
@@ -30,17 +30,17 @@ TODO
 ---
 ### 1. Inference w/o. Efficiency Optimization
 ```python
-from transformers import AutoTokenizer, AutoModel, AutoConfig, BitsAndBytesConfig
+from transformers import AutoTokenizer, AutoModel, AutoConfig, BitsAndBytesConfig, AutoModelForCausalLM
 import torch
 
 # load model
 model_path = '/root/Models/Video-XL-2'
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
-model =
+model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device_map=device, quantization_config=None, attn_implementation="sdpa", torch_dtype=torch.float16, low_cpu_mem_usage=True)
 
 gen_kwargs = {
-    "do_sample":
+    "do_sample": False,
     "temperature": 0.01,
     "top_p": 0.001,
     "num_beams": 1,
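The snippet above imports `BitsAndBytesConfig` yet passes `quantization_config=None`. For tighter memory budgets, a minimal sketch of a quantized load is shown below; it assumes bitsandbytes is installed and that the model's `trust_remote_code` implementation tolerates 4-bit weights, neither of which this commit confirms.

```python
# Hedged sketch, not from this commit: swap quantization_config=None for a
# 4-bit BitsAndBytesConfig to reduce weight memory (assumes bitsandbytes is
# installed and Video-XL-2's custom code supports quantized weights).
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # store weights as 4-bit NF4
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # compute in fp16, matching torch_dtype above
)
model = AutoModelForCausalLM.from_pretrained(
    model_path,                            # same path as in the snippet above
    trust_remote_code=True,
    device_map=device,
    quantization_config=bnb_config,        # instead of None
    attn_implementation="sdpa",
    low_cpu_mem_usage=True,
)
```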
@@ -81,26 +81,20 @@ To enable this mode, you need to set `enable_chunk_prefill` to `True` and config
 **Tip: Currently, chunk-based prefill only supports the 'sdpa' attention implementation.**
 
 ```python
-from transformers import AutoTokenizer, AutoModel, AutoConfig, BitsAndBytesConfig
+from transformers import AutoTokenizer, AutoModel, AutoConfig, BitsAndBytesConfig, AutoModelForCausalLM
 import torch
 import pdb
 import argparse
 
 torch.cuda.reset_peak_memory_stats()
 # load model
-model_path = '/share/minghao/
+model_path = '/share/minghao/Models2/Video-XL-2'
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
-model =
+model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device_map=device, quantization_config=None, attn_implementation="sdpa", torch_dtype=torch.float16, low_cpu_mem_usage=True)  # sdpa
 
 gen_kwargs = {"do_sample": False, "temperature": 0.01, "top_p": 0.001, "num_beams": 1, "use_cache": True, "max_new_tokens": 128}
 
-
-"""
-Set params
-With Chunk-based Prefill enabled, Video-XL-2 can process 1,300 frames on a 24GB GPU (using approximately 23.72GB). When combined with bi-level KVS decoding, this capacity increases to 1,800 frames.
-If you have ample resources, you can disable offload and increase chunk_size_for_vision_tower and chunk_size to achieve faster processing.
-"""
 model.config.enable_chunk_prefill = True
 prefill_config = {
     'chunk_prefill_mode': 'streaming',
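The hunk shows only the first key of `prefill_config`. Purely for orientation, a hypothetical fuller config is sketched below using the knobs named in the removed docstring (`offload`, `chunk_size`, `chunk_size_for_vision_tower`); every key and value except `'chunk_prefill_mode': 'streaming'` is an assumption, not taken from the repo.

```python
# Hypothetical prefill_config: only 'chunk_prefill_mode' is confirmed by the
# diff; the other keys are guesses based on knobs the removed docstring names.
prefill_config = {
    'chunk_prefill_mode': 'streaming',  # confirmed by the diff
    'chunk_size': 4,                    # assumed: frames per LLM prefill chunk
    'chunk_size_for_vision_tower': 16,  # assumed: frames per vision-tower chunk
    'offload': True,                    # assumed: offload finished chunks to CPU to save VRAM
}
model.config.prefill_config = prefill_config
```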
@@ -112,12 +106,12 @@ prefill_config = {
 model.config.prefill_config = prefill_config
 
 # input data
-video_path = "/
+video_path = "/asset/demo.mp4"
 question1 = "How many people are in the video? (A)3 people (B)6 people. Please only respond with the letter"
 
 # params
 max_num_frames = 1300
-sample_fps = None #
+sample_fps = None  # uniform sampling
 max_sample_fps = None
 
 with torch.inference_mode():
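With `sample_fps = None`, the comment says frames are drawn by uniform sampling up to `max_num_frames`. As an illustration of what uniform sampling typically means (this helper is not part of the repo), evenly spaced frame indices can be computed like this:

```python
# Illustrative only: evenly spaced frame indices, capped at max_num_frames.
def uniform_frame_indices(total_frames: int, max_num_frames: int) -> list[int]:
    n = min(total_frames, max_num_frames)
    return [int(i * total_frames / n) for i in range(n)]

# e.g. a 10-minute 30 fps clip (18,000 frames) capped at 1,300 frames
print(uniform_frame_indices(18_000, 1_300)[:5])  # [0, 13, 27, 41, 55]
```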
@@ -125,7 +119,7 @@ with torch.inference_mode():
 
 
 peak_memory_allocated = torch.cuda.max_memory_allocated()
-print(f"Memory Peak: {peak_memory_allocated / (1024**3):.2f} GB") #
+print(f"Memory Peak: {peak_memory_allocated / (1024**3):.2f} GB")  # convert to GB
 print(response)
 ```
 
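The 23.72 GB figure quoted in the removed docstring comes from the standard `torch.cuda` peak-memory pattern already used in the snippet; isolated, it looks like this:

```python
# Measure peak GPU memory around a workload: reset the counter first,
# then read the peak afterwards and convert bytes to GB.
import torch

torch.cuda.reset_peak_memory_stats()
# ... run prefill + generation here ...
peak = torch.cuda.max_memory_allocated()  # bytes, peak since the reset
print(f"Memory Peak: {peak / (1024**3):.2f} GB")
```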