kalki-sambhal commited on
Commit
66f41c5
·
verified ·
1 Parent(s): 8eae3ec

Initial upload of We-Math Phi-4 (multimodal) with model card

Browse files
Files changed (1) hide show
  1. README.md +92 -0
README.md CHANGED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ ### Single-sample prediction example
4
+
5
+ Below is a minimal example that runs inference on a single datapoint with this model from the Hub. It uses the base processor together with the fine-tuned model:
6
+
7
+ ```python
8
+ import re
9
+ import torch
10
+ from PIL import Image
11
+ from transformers import AutoProcessor, AutoModelForCausalLM
12
+
13
+ # Inputs
14
+ caption = "A honeycomb-like grid pattern made of connected hexagons."
15
+ question = (
16
+ "As shown in the figure, which of the following shapes is the basic unit of a honeycomb? "
17
+ "A. Parallelogram; B. Regular hexagon; C. Square; D. Regular pentagon"
18
+ )
19
+ image_path = "/data-mount-large/scripts/test.jpeg" # replace with your local image path
20
+
21
+ # Load base processor + finetuned model
22
+ processor = AutoProcessor.from_pretrained("microsoft/Phi-4-multimodal-instruct", trust_remote_code=True)
23
+ model = AutoModelForCausalLM.from_pretrained(
24
+ "kalkiai3000/we-math-phi4",
25
+ trust_remote_code=True,
26
+ torch_dtype=torch.float16,
27
+ device_map="auto",
28
+ attn_implementation="eager",
29
+ )
30
+ try:
31
+ model.config.use_cache = False
32
+ except Exception:
33
+ pass
34
+ try:
35
+ model.gradient_checkpointing_disable()
36
+ except Exception:
37
+ pass
38
+
39
+ # Build prompt (MCQ-aware instruction)
40
+ if any(x in question for x in ["A:", "B:", "C:", "A.", "B.", "C.", ";"]):
41
+ instruction = "Answer with the option's letter from the given choices directly."
42
+ max_new = 4
43
+ else:
44
+ instruction = "Answer succinctly with the final value/word only."
45
+ max_new = 64
46
+ prompt = (
47
+ f"<|user|><|image_1|>Please solve this math problem: {question}\n"
48
+ f"Image description: {caption}\n{instruction}<|end|><|assistant|>"
49
+ )
50
+
51
+ # Prepare image and inputs
52
+ image = Image.open(image_path).convert("RGB")
53
+ if max(image.size) > 1024:
54
+ try:
55
+ image = image.resize((1024, 1024), Image.Resampling.LANCZOS)
56
+ except Exception:
57
+ image = image.resize((1024, 1024))
58
+
59
+ proc = processor(prompt, images=[image], return_tensors="pt")
60
+ device = next(model.parameters()).device
61
+ inputs = {
62
+ "input_ids": proc.input_ids.to(device),
63
+ "attention_mask": (proc.input_ids != processor.tokenizer.pad_token_id).long().to(device),
64
+ "input_image_embeds": proc.input_image_embeds.to(device),
65
+ "image_attention_mask": proc.image_attention_mask.to(device),
66
+ "image_sizes": proc.image_sizes.to(device),
67
+ "input_mode": torch.tensor([1], dtype=torch.long, device=device),
68
+ }
69
+
70
+ with torch.no_grad():
71
+ gen = model.generate(
72
+ **inputs,
73
+ max_new_tokens=max_new,
74
+ do_sample=False,
75
+ temperature=0.0,
76
+ eos_token_id=processor.tokenizer.eos_token_id,
77
+ num_logits_to_keep=1,
78
+ use_cache=False,
79
+ )
80
+
81
+ # Decode continuation only
82
+ in_len = inputs["input_ids"].shape[1]
83
+ out_text = processor.batch_decode(gen[:, in_len:], skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
84
+
85
+ # Optional: extract final answer (letter for MCQ; final token for word problems)
86
+ if "Answer with the option's letter" in instruction:
87
+ m = re.search(r"\b([ABCD])\b", out_text, flags=re.IGNORECASE)
88
+ print((m.group(1).upper() if m else out_text[:1]).strip())
89
+ else:
90
+ tokens = re.findall(r"[A-Za-z0-9\.]+", out_text.strip())
91
+ print((tokens[-1] if tokens else out_text).strip())
92
+ ```