""" Sample inference script for Phi4-Siglip. Usage: cd phi4mm python sample_inference.py """ from PIL import Image import torch from transformers import AutoModelForCausalLM, AutoProcessor model_path = "." # change to your model path if not running in the same directory as the model # get first argument as an image path if not throw an error explaining how to use the script with an image import sys with_image_mode = False if len(sys.argv) > 1: with_image_mode = True image_path = sys.argv[1] print(f"Image path provided: {image_path}") else: print("No image path provided. Running in text-only mode. To run with an image, provide the image path as an argument:\npython sample_inference.py /path/to/image.jpg") # Load model and processor print("Loading model...") processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( model_path, trust_remote_code=True, dtype=torch.bfloat16, device_map="cuda", ).eval() # Import helpers for image processing from processing_phi4_visionr import DEFAULT_IMAGE_TOKEN print(f"Model loaded on {model.device}") #################################################### text-only #################################################### print("\n" + "="*60) print("TEST: Text-only generation") print("="*60) messages = [{"role": "user", "content": "What is the answer for 1+1? Explain it."}] prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) print(f">>> Prompt\n{prompt}") inputs = processor(prompt, images=None, return_tensors="pt").to("cuda:0") generate_ids = model.generate( **inputs, max_new_tokens=4096, eos_token_id=processor.tokenizer.eos_token_id, do_sample=False, ) generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:] response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] print(f'>>> Response\n{response}') #################################################### single image #################################################### if not with_image_mode: print("\n" + "="*60) print("No image provided, skipping multimodal test.") print("="*60) exit(0) print("\n" + "="*60) print("TEST: Single image understanding") print("="*60) messages = [{"role": "user", "content": DEFAULT_IMAGE_TOKEN + "\nDescribe this image in detail."}] prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) if with_image_mode: print(f">>> Loading image from {image_path}") image = Image.open(image_path).convert("RGB") print(f"Image size: {image.size}") else: image = None print(f">>> Prompt\n{prompt}") # Process text and image together using the processor inputs = processor(text=prompt, images=[image] if image is not None else None, return_tensors="pt").to("cuda:0") with torch.inference_mode(): generate_ids = model.generate( **inputs, max_new_tokens=4096, eos_token_id=processor.tokenizer.eos_token_id, do_sample=False, ) generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:] response = processor.tokenizer.decode(generate_ids[0], skip_special_tokens=True) print(f'>>> Response\n{response}') print("\n" + "="*60) print("All tests completed!") print("="*60)