""" InternVideo3-8B-Instruct Inference Demo Supports: - Text-only conversation - Video understanding - Image understanding """ import torch import time from transformers import AutoModelForCausalLM, AutoProcessor from qwen_vl_utils import process_vision_info # ============ Model Loading ============ model_path = "/mnt/shared-storage-user/yanziang/HF_toupload/InternVideo3-8B-Instruct" # current directory model = AutoModelForCausalLM.from_pretrained( model_path, dtype=torch.bfloat16, attn_implementation="sdpa", device_map="cuda:0", trust_remote_code=True, ) processor = AutoProcessor.from_pretrained( model_path, trust_remote_code=True, ) # processor = AutoProcessor.from_pretrained("/mnt/shared-storage-user/sfteval/sfteval_models/Qwen3-VL-8B-Instruct/",trust_remote_code=True) # ============ Example 1: Text-only ============ print("=" * 50) print("Example 1: Text-only") print("=" * 50) messages = [ { "role": "user", "content": [ {"type": "text", "text": "Please introduce yourself."}, ], } ] text = processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True, enable_thinking=True, ) inputs = processor(text=text, images=None, videos=None, do_resize=False, return_tensors="pt") inputs = inputs.to(model.device) start_time = time.time() gen_output = model.generate( **inputs, max_new_tokens=1024, use_cache=True, ) generated_ids_trimmed = [ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, gen_output) ] output_text = processor.batch_decode( generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False ) print(output_text[0]) print(f"Time taken: {time.time() - start_time:.2f}s\n") # ============ Example 2: Video Understanding ============ print("=" * 50) print("Example 2: Video Understanding") print("=" * 50) video_path = "/mnt/shared-storage-user/yanziang/space_woaudio.mp4" # Replace with your video path fps = 1 min_pixels = 128 * 32 * 32 max_pixels = 128 * 32 * 32 messages = [ { "role": "user", "content": [ { "type": "video", "video": video_path, "fps": fps, }, {"type": "text", "text": "Please describe this video in detail."}, ], } ] processor.video_processor.size = {"longest_edge": max_pixels * 512, "shortest_edge": min_pixels * 32} inputs = processor.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_dict=True, fps=fps, return_tensors="pt", ) inputs = inputs.to(model.device) start_time = time.time() gen_output = model.generate( **inputs, max_new_tokens=1024, use_cache=True, ) generated_ids_trimmed = [ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, gen_output) ] output_text = processor.batch_decode( generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False ) print(output_text[0]) print(f"Time taken: {time.time() - start_time:.2f}s\n") # # ============ Example 3: Image Understanding ============ # print("=" * 50) # print("Example 3: Image Understanding") # print("=" * 50) # image_path = "/mnt/shared-storage-user/yanziang/demo.jpeg" # Replace with your image path # messages = [ # { # "role": "user", # "content": [ # { # "type": "image", # "image": image_path, # }, # {"type": "text", "text": "Please describe this image in detail."}, # ], # } # ] # text = processor.apply_chat_template( # messages, # tokenize=False, # add_generation_prompt=True, # enable_thinking=True, # ) # images, videos, video_kwargs = process_vision_info( # messages, image_patch_size=16, return_video_kwargs=True, return_video_metadata=True # ) # inputs = processor( # text=text, # images=images, # videos=None, # do_resize=False, # return_tensors="pt", # ) # inputs = inputs.to(model.device) # start_time = time.time() # gen_output = model.generate( # **inputs, # max_new_tokens=1024, # use_cache=True, # ) # generated_ids_trimmed = [ # out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, gen_output) # ] # output_text = processor.batch_decode( # generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False # ) # print(output_text[0]) # print(f"Time taken: {time.time() - start_time:.2f}s\n")