File size: 4,548 Bytes

e3bb923

"""
InternVideo3-8B-Instruct Inference Demo

Supports:
- Text-only conversation
- Video understanding
- Image understanding (Example 3 below is commented out; uncomment it to run)
"""

import torch
import time
from transformers import AutoModelForCausalLM, AutoProcessor
from qwen_vl_utils import process_vision_info


# ============ Model Loading ============
# Absolute filesystem path of the InternVideo3-8B-Instruct checkpoint.
# Replace it with the location of your own copy of the model files.
model_path = "/mnt/shared-storage-user/yanziang/HF_toupload/InternVideo3-8B-Instruct"

# Keyword arguments for loading the language model:
#   - dtype: load the weights as bfloat16
#   - attn_implementation: request the "sdpa" attention implementation
#   - device_map: place the whole model on CUDA device 0
#   - trust_remote_code: allow the custom code shipped with the checkpoint to run
model_load_kwargs = {
    "dtype": torch.bfloat16,
    "attn_implementation": "sdpa",
    "device_map": "cuda:0",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(model_path, **model_load_kwargs)

# The processor is loaded from the same checkpoint so that the chat template
# and input preprocessing come from the same place as the model weights.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
# processor = AutoProcessor.from_pretrained("/mnt/shared-storage-user/sfteval/sfteval_models/Qwen3-VL-8B-Instruct/",trust_remote_code=True)
# ============ Example 1: Text-only ============
print("=" * 50)
print("Example 1: Text-only")
print("=" * 50)

# Single-turn conversation: one user message with a single text part and no
# image or video parts.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Please introduce yourself."},
        ],
    }
]

# Render the conversation into a prompt string (tokenize=False) with the
# generation prompt appended; enable_thinking is forwarded to the chat template.
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True,
)

# Turn the rendered prompt into model inputs. No visual inputs accompany it,
# so images and videos are explicitly None.
inputs = processor(text=text, images=None, videos=None, do_resize=False, return_tensors="pt")
inputs = inputs.to(model.device)

# Use the monotonic, high-resolution perf_counter() for the elapsed-time
# measurement: unlike time.time(), it is not affected by system clock
# adjustments while generation is running.
start_time = time.perf_counter()
gen_output = model.generate(
    **inputs,
    max_new_tokens=1024,
    use_cache=True,
)

# Drop the first len(in_ids) tokens of every output row so that only the
# tokens following the prompt are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, gen_output)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text[0])
# The reported duration covers generation, decoding and printing the answer.
print(f"Time taken: {time.perf_counter() - start_time:.2f}s\n")


# ============ Example 2: Video Understanding ============
print("=" * 50)
print("Example 2: Video Understanding")
print("=" * 50)

video_path = "/mnt/shared-storage-user/yanziang/space_woaudio.mp4"  # Replace with your video path

# Frame sampling rate requested for the video; it is passed both inside the
# video message part and to apply_chat_template below.
fps = 1
# Both bounds are set to the same value (128 * 32 * 32).
# NOTE(review): presumably a per-frame pixel budget expressed in 32x32 units —
# confirm against the video processor's documentation.
min_pixels = 128 * 32 * 32
max_pixels = 128 * 32 * 32

# Single-turn conversation: one video part followed by the text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": video_path,
                "fps": fps,
            },
            {"type": "text", "text": "Please describe this video in detail."},
        ],
    }
]

# Override the video processor's size limits in place. This mutates the shared
# processor object, so the new limits stay in effect for any later call.
# NOTE(review): the 512 and 32 multipliers are not explained in this file —
# confirm the intended meaning of longest_edge / shortest_edge for this model.
processor.video_processor.size = {"longest_edge": max_pixels * 512, "shortest_edge": min_pixels * 32}

# Render the chat template and build the tensor inputs in one call:
# tokenize=True with return_dict=True returns the model inputs directly.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    fps=fps,
    return_tensors="pt",
)
inputs = inputs.to(model.device)

# Use the monotonic, high-resolution perf_counter() for the elapsed-time
# measurement: unlike time.time(), it is not affected by system clock
# adjustments while generation is running.
start_time = time.perf_counter()
gen_output = model.generate(
    **inputs,
    max_new_tokens=1024,
    use_cache=True,
)

# Drop the first len(in_ids) tokens of every output row so that only the
# tokens following the prompt are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, gen_output)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text[0])
# The reported duration covers generation, decoding and printing the answer.
print(f"Time taken: {time.perf_counter() - start_time:.2f}s\n")


# # ============ Example 3: Image Understanding ============
# print("=" * 50)
# print("Example 3: Image Understanding")
# print("=" * 50)

# image_path = "/mnt/shared-storage-user/yanziang/demo.jpeg"  # Replace with your image path

# messages = [
#     {
#         "role": "user",
#         "content": [
#             {
#                 "type": "image",
#                 "image": image_path,
#             },
#             {"type": "text", "text": "Please describe this image in detail."},
#         ],
#     }
# ]

# text = processor.apply_chat_template(
#     messages,
#     tokenize=False,
#     add_generation_prompt=True,
#     enable_thinking=True,
# )

# images, videos, video_kwargs = process_vision_info(
#     messages, image_patch_size=16, return_video_kwargs=True, return_video_metadata=True
# )

# inputs = processor(
#     text=text,
#     images=images,
#     videos=None,
#     do_resize=False,
#     return_tensors="pt",
# )
# inputs = inputs.to(model.device)

# start_time = time.time()
# gen_output = model.generate(
#     **inputs,
#     max_new_tokens=1024,
#     use_cache=True,
# )

# generated_ids_trimmed = [
#     out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, gen_output)
# ]
# output_text = processor.batch_decode(
#     generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
# )
# print(output_text[0])
# print(f"Time taken: {time.time() - start_time:.2f}s\n")