In [7]:
from safetensors import safe_open

def inspect_safetensors(file_path, key_filter="model.layers.0", preview_count=5):
    """Print a summary of selected tensors stored in a safetensors file.

    For every tensor whose name contains ``key_filter`` the function prints
    the name, shape, dtype, total element count and the first
    ``preview_count`` values of the flattened tensor.

    Args:
        file_path: Path of the ``.safetensors`` file to open.
        key_filter: Substring that a tensor name must contain to be reported.
            Defaults to ``"model.layers.0"``, which was previously hard-coded.
            Pass ``None`` or ``""`` to report every tensor in the file.
        preview_count: How many leading elements of the flattened tensor to
            print. Must be non-negative. Defaults to 5.

    Returns:
        None. Everything is written to stdout.

    Raises:
        ValueError: If ``preview_count`` is negative (a negative slice bound
            would silently print "all but the last N" elements instead).
    """
    if preview_count < 0:
        raise ValueError(f"preview_count must be >= 0, got {preview_count}")

    print(f"Inspecting file: {file_path}")
    with safe_open(file_path, framework="pt", device="cpu") as f:
        print("Available keys (tensor names):")
        for key in f.keys():
            # Substring match (not prefix match); a falsy filter keeps all keys.
            if key_filter and key_filter not in key:
                continue

            # Only tensors that pass the filter are actually read from the file.
            tensor = f.get_tensor(key)
            print(f"\nKey: {key}")
            print(f"  Shape: {tensor.shape}")
            print(f"  Dtype: {tensor.dtype}")
            print(f"  Size: {tensor.numel()} elements")
            # Optional: show the first few elements.
            print(f"  First few elements: {tensor.flatten()[:preview_count].tolist()}")

# Example path: replace it with the path to your own .safetensors file.
file_path = "model-00001-of-00004.safetensors"
inspect_safetensors(file_path=file_path)

Inspecting file: model-00001-of-00004.safetensors
Available keys (tensor names):

Key: model.layers.0.input_layernorm.weight
  Shape: torch.Size([3584])
  Dtype: torch.bfloat16
  Size: 3584 elements
  First few elements: [0.275390625, 0.3046875, 0.26171875, 0.291015625, 0.29296875]

Key: model.layers.0.mlp.down_proj.weight
  Shape: torch.Size([3584, 18944])
  Dtype: torch.bfloat16
  Size: 67895296 elements
  First few elements: [-0.005096435546875, 0.01385498046875, 0.0096435546875, -0.00848388671875, -0.002593994140625]

Key: model.layers.0.mlp.gate_proj.weight
  Shape: torch.Size([18944, 3584])
  Dtype: torch.bfloat16
  Size: 67895296 elements
  First few elements: [0.00286865234375, -0.0201416015625, -0.0216064453125, 0.006622314453125, -0.015625]

Key: model.layers.0.mlp.up_proj.weight
  Shape: torch.Size([18944, 3584])
  Dtype: torch.bfloat16
  Size: 67895296 elements
  First few elements: [0.007537841796875, -0.0111083984375, -0.0024261474609375, -0.006927490234375, -0.02587890625]

In [2]:
from transformers import AutoModel, AutoTokenizer
import torch
from modeling_videochat_flash import VideoChatFlashQwenForCausalLM

# ---------------------------------------------------------------------------
# Model setup
# ---------------------------------------------------------------------------
# Tokenizer and model are both loaded from the current working directory.
model_path = './'

# NOTE(review): the recorded run of this cell ends with
#   "data did not match any variant of untagged enum ModelWrapper ..."
# This presumably comes from parsing tokenizer.json with an installed
# `tokenizers` release older than the one that wrote the file.
# TODO confirm, and upgrade tokenizers/transformers if so.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Load the checkpoint, cast it to bfloat16, then move it to the CUDA device.
model = (
    VideoChatFlashQwenForCausalLM.from_pretrained(model_path)
    .to(torch.bfloat16)
    .cuda()
)
# Fetched from the model's vision tower; not used again in this cell.
image_processor = model.get_vision_tower().image_processor

# Whether to use the global compression. The extra compression settings are
# written to the config only when the switch is on.
mm_llm_compress = False
model.config.mm_llm_compress = bool(mm_llm_compress)
if mm_llm_compress:
    model.config.llm_compress_type = "uniform0_attention"
    model.config.llm_compress_layer_list = [4, 18]
    model.config.llm_image_token_ratio_list = [1, 0.75, 0.25]

# ---------------------------------------------------------------------------
# Evaluation setup
# ---------------------------------------------------------------------------
max_num_frames = 512

# NOTE(review): do_sample is False, so temperature/top_p are presumably
# ignored (greedy decoding) - confirm how model.chat consumes this dict.
generation_config = {
    "do_sample": False,
    "temperature": 0.0,
    "max_new_tokens": 1024,
    "top_p": 0.1,
    "num_beams": 1,
}

video_path = "test.mp4"

# ---------------------------------------------------------------------------
# Single-turn conversation
# ---------------------------------------------------------------------------
question1 = "Describe this video in detail."
output1, chat_history = model.chat(
    video_path=video_path,
    tokenizer=tokenizer,
    user_prompt=question1,
    return_history=True,
    max_num_frames=max_num_frames,
    generation_config=generation_config,
)

print(output1)

# ---------------------------------------------------------------------------
# Multi-turn conversation (disabled; uncomment to continue from chat_history)
# ---------------------------------------------------------------------------
# question2 = "How many people appear in the video?"
# output2, chat_history = model.chat(video_path=video_path, tokenizer=tokenizer, user_prompt=question2, chat_history=chat_history, return_history=True, max_num_frames=max_num_frames, generation_config=generation_config)

# print(output2)


Exception: data did not match any variant of untagged enum ModelWrapper at line 757455 column 3