kalki-sambhal commited on
Commit
66f41c5
·
verified ·
1 Parent(s): 8eae3ec

Initial upload of We-Math Phi-4 (multimodal) with model card

Browse files
Files changed (1) hide show
  1. README.md +92 -0
README.md CHANGED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ ### Single-sample prediction example
4
+
5
+ Below is a minimal example that runs inference on a single datapoint with this model from the Hub. It uses the base processor together with the fine-tuned model:
6
+
7
+ ```python
8
+ import re
9
+ import torch
10
+ from PIL import Image
11
+ from transformers import AutoProcessor, AutoModelForCausalLM
12
+
13
+ # Inputs
14
+ caption = "A honeycomb-like grid pattern made of connected hexagons."
15
+ question = (
16
+ "As shown in the figure, which of the following shapes is the basic unit of a honeycomb? "
17
+ "A. Parallelogram; B. Regular hexagon; C. Square; D. Regular pentagon"
18
+ )
19
+ image_path = "/data-mount-large/scripts/test.jpeg" # replace with your local image path
20
+
21
+ # Load base processor + finetuned model
22
+ processor = AutoProcessor.from_pretrained("microsoft/Phi-4-multimodal-instruct", trust_remote_code=True)
23
+ model = AutoModelForCausalLM.from_pretrained(
24
+ "kalkiai3000/we-math-phi4",
25
+ trust_remote_code=True,
26
+ torch_dtype=torch.float16,
27
+ device_map="auto",
28
+ attn_implementation="eager",
29
+ )
30
+ try:
31
+ model.config.use_cache = False
32
+ except Exception:
33
+ pass
34
+ try:
35
+ model.gradient_checkpointing_disable()
36
+ except Exception:
37
+ pass
38
+
39
+ # Build prompt (MCQ-aware instruction)
40
+ if any(x in question for x in ["A:", "B:", "C:", "A.", "B.", "C.", ";"]):
41
+ instruction = "Answer with the option's letter from the given choices directly."
42
+ max_new = 4
43
+ else:
44
+ instruction = "Answer succinctly with the final value/word only."
45
+ max_new = 64
46
+ prompt = (
47
+ f"<|user|><|image_1|>Please solve this math problem: {question}\n"
48
+ f"Image description: {caption}\n{instruction}<|end|><|assistant|>"
49
+ )
50
+
51
+ # Prepare image and inputs
52
+ image = Image.open(image_path).convert("RGB")
53
+ if max(image.size) > 1024:
54
+ try:
55
+ image = image.resize((1024, 1024), Image.Resampling.LANCZOS)
56
+ except Exception:
57
+ image = image.resize((1024, 1024))
58
+
59
+ proc = processor(prompt, images=[image], return_tensors="pt")
60
+ device = next(model.parameters()).device
61
+ inputs = {
62
+ "input_ids": proc.input_ids.to(device),
63
+ "attention_mask": (proc.input_ids != processor.tokenizer.pad_token_id).long().to(device),
64
+ "input_image_embeds": proc.input_image_embeds.to(device),
65
+ "image_attention_mask": proc.image_attention_mask.to(device),
66
+ "image_sizes": proc.image_sizes.to(device),
67
+ "input_mode": torch.tensor([1], dtype=torch.long, device=device),
68
+ }
69
+
70
+ with torch.no_grad():
71
+ gen = model.generate(
72
+ **inputs,
73
+ max_new_tokens=max_new,
74
+ do_sample=False,
75
+ temperature=0.0,
76
+ eos_token_id=processor.tokenizer.eos_token_id,
77
+ num_logits_to_keep=1,
78
+ use_cache=False,
79
+ )
80
+
81
+ # Decode continuation only
82
+ in_len = inputs["input_ids"].shape[1]
83
+ out_text = processor.batch_decode(gen[:, in_len:], skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
84
+
85
+ # Optional: extract final answer (letter for MCQ; final token for word problems)
86
+ if "Answer with the option's letter" in instruction:
87
+ m = re.search(r"\b([ABCD])\b", out_text, flags=re.IGNORECASE)
88
+ print((m.group(1).upper() if m else out_text[:1]).strip())
89
+ else:
90
+ tokens = re.findall(r"[A-Za-z0-9\.]+", out_text.strip())
91
+ print((tokens[-1] if tokens else out_text).strip())
92
+ ```