| ### Single-sample prediction example | |
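| Below is a minimal example that runs inference on a single datapoint with this model from the Hub. It loads the processor from the base model (`microsoft/Phi-4-multimodal-instruct`) and the weights from the fine-tuned checkpoint (`kalkiai3000/we-math-phi4`): | |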
| ```python | |
import re
import warnings

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM
# Single-sample inference script: builds a prompt from a question and an image
# caption, runs greedy generation with the fine-tuned checkpoint, and prints a
# short extracted answer.

# Inputs
CAPTION = "A honeycomb-like grid pattern made of connected hexagons."
QUESTION = (
    "As shown in the figure, which of the following shapes is the basic unit of a honeycomb? "
    "A. Parallelogram; B. Regular hexagon; C. Square; D. Regular pentagon"
)
IMAGE_PATH = "/data-mount-large/scripts/test.jpeg"  # replace with your local image path

# Base processor + fine-tuned model
BASE_PROCESSOR_ID = "microsoft/Phi-4-multimodal-instruct"
FINETUNED_MODEL_ID = "kalkiai3000/we-math-phi4"

# Images whose longest side exceeds this are resized to a square of this size.
MAX_IMAGE_SIDE = 1024

# Substrings that mark a question as multiple choice.
# NOTE(review): this heuristic is loose -- any question containing ";" counts as
# multiple choice. It is kept as-is because the checkpoint may have been
# evaluated with exactly this rule; confirm before tightening.
MCQ_MARKERS = ("A:", "B:", "C:", "A.", "B.", "C.", ";")
MCQ_INSTRUCTION = "Answer with the option's letter from the given choices directly."
FREE_FORM_INSTRUCTION = "Answer succinctly with the final value/word only."
MCQ_MAX_NEW_TOKENS = 4
FREE_FORM_MAX_NEW_TOKENS = 64

_UPPER_OPTION_RE = re.compile(r"\b([ABCD])\b")
_ANY_CASE_OPTION_RE = re.compile(r"\b([ABCD])\b", flags=re.IGNORECASE)
# "-" and "/" are included so negative numbers and fractions survive intact.
_ANSWER_TOKEN_RE = re.compile(r"[A-Za-z0-9./\-]+")


def is_multiple_choice(question: str) -> bool:
    """Return True when *question* contains any of the option markers."""
    return any(marker in question for marker in MCQ_MARKERS)


def build_prompt(question: str, caption: str, multiple_choice: bool) -> str:
    """Return the chat-formatted prompt for one question/caption pair.

    The trailing instruction asks for an option letter when *multiple_choice*
    is true and for a short final value/word otherwise.
    """
    instruction = MCQ_INSTRUCTION if multiple_choice else FREE_FORM_INSTRUCTION
    return (
        f"<|user|><|image_1|>Please solve this math problem: {question}\n"
        f"Image description: {caption}\n{instruction}<|end|><|assistant|>"
    )


def load_image(path: str, max_side: int = MAX_IMAGE_SIDE) -> "Image.Image":
    """Open *path* as RGB and shrink it when its longest side exceeds *max_side*.

    NOTE(review): oversized images are resized to a square, which changes the
    aspect ratio of non-square figures. Kept because the checkpoint may have
    been trained with this preprocessing -- confirm before changing.
    """
    # The context manager closes the file; convert() returns a loaded image
    # that stays usable afterwards.
    with Image.open(path) as handle:
        image = handle.convert("RGB")
    if max(image.size) > max_side:
        # Newer Pillow exposes filters under Image.Resampling; older releases
        # only have Image.LANCZOS. Resolve once instead of catching errors
        # around resize(), which would also hide genuine failures.
        resample = getattr(Image, "Resampling", Image).LANCZOS
        image = image.resize((max_side, max_side), resample)
    return image


def build_attention_mask(input_ids: "torch.Tensor", pad_token_id: "int | None") -> "torch.Tensor":
    """Return a 0/1 long mask that is 0 wherever *input_ids* equals the pad id.

    When the tokenizer defines no pad token (*pad_token_id* is None) nothing
    can be padding, so every position is attended to.
    """
    if pad_token_id is None:
        return torch.ones_like(input_ids, dtype=torch.long)
    return (input_ids != pad_token_id).long()


def extract_answer(out_text: str, multiple_choice: bool) -> str:
    """Reduce the decoded continuation to a short final answer.

    Multiple choice: the first standalone option letter A-D, upper-cased.
    An upper-case letter is preferred so that the article "a" in the text is
    not mistaken for option A; a lower-case letter is used only when no
    upper-case one exists. If no letter is found, the first non-blank
    character is returned.

    Free form: the last token made of letters, digits, ".", "/" or "-", with
    any sentence-final period removed. If no such token exists, the stripped
    text is returned unchanged.
    """
    text = out_text.strip()
    if multiple_choice:
        match = _UPPER_OPTION_RE.search(text) or _ANY_CASE_OPTION_RE.search(text)
        return match.group(1).upper() if match else text[:1]
    tokens = [token.rstrip(".") for token in _ANSWER_TOKEN_RE.findall(text)]
    # Drop fragments that are pure punctuation, e.g. a dangling "-" or "...".
    tokens = [token for token in tokens if any(ch.isalnum() for ch in token)]
    return tokens[-1] if tokens else text


def main() -> None:
    """Load the processor and model, answer QUESTION about IMAGE_PATH, print the result."""
    multiple_choice = is_multiple_choice(QUESTION)
    max_new = MCQ_MAX_NEW_TOKENS if multiple_choice else FREE_FORM_MAX_NEW_TOKENS
    prompt = build_prompt(QUESTION, CAPTION, multiple_choice)

    # Load base processor + finetuned model
    processor = AutoProcessor.from_pretrained(BASE_PROCESSOR_ID, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        FINETUNED_MODEL_ID,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        device_map="auto",
        attn_implementation="eager",
    )

    # Both tweaks are best-effort: a failure is reported but must not stop the
    # example from running.
    try:
        model.config.use_cache = False
    except Exception as exc:
        warnings.warn(f"Could not disable use_cache on the model config: {exc}")
    try:
        model.gradient_checkpointing_disable()
    except Exception as exc:
        warnings.warn(f"Could not disable gradient checkpointing: {exc}")

    # Prepare image and inputs
    image = load_image(IMAGE_PATH)
    proc = processor(prompt, images=[image], return_tensors="pt")
    device = next(model.parameters()).device
    input_ids = proc.input_ids.to(device)
    inputs = {
        "input_ids": input_ids,
        "attention_mask": build_attention_mask(input_ids, processor.tokenizer.pad_token_id),
        "input_image_embeds": proc.input_image_embeds.to(device),
        "image_attention_mask": proc.image_attention_mask.to(device),
        "image_sizes": proc.image_sizes.to(device),
        # NOTE(review): presumably selects the image input path in the model's
        # remote code -- confirm against the model implementation.
        "input_mode": torch.tensor([1], dtype=torch.long, device=device),
    }

    with torch.no_grad():
        # Greedy decoding: no temperature is passed because sampling is off,
        # so a temperature value would have no effect on the output.
        gen = model.generate(
            **inputs,
            max_new_tokens=max_new,
            do_sample=False,
            eos_token_id=processor.tokenizer.eos_token_id,
            num_logits_to_keep=1,
            use_cache=False,
        )

    # Decode continuation only
    in_len = input_ids.shape[1]
    out_text = processor.batch_decode(
        gen[:, in_len:], skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    # Optional: extract final answer (letter for MCQ; final token for word problems)
    print(extract_answer(out_text, multiple_choice))


if __name__ == "__main__":
    main()
| ``` | |