---
license: apache-2.0
---
# Diagram Formalization Enhanced Multi-Modal Geometry Problem Solver
## Model Structure
<p align="center">
<img src="sample/DFE-GPS.png" alt="Alt text" width="80%" height="auto">
</p>
- **Diagram Encoder**: [siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384)
- **Lightweight LLM**: [Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct)
- **LLM**: [Yi-1.5-9B-Chat](https://huggingface.co/01-ai/Yi-1.5-9B-Chat)
## Quick Start
Before running the script, install the following necessary dependencies.
```shell
pip install torch transformers==4.40.0 accelerate pillow sentencepiece
```
You can solve geometry problems using the following script. First, formalize the geometric diagram with the [Diagram Formalizer](https://huggingface.co/NaughtyDog97/DiagramFormalizer), and then use the multi-modal reasoning model to solve the problem:
```python
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings
import numpy as np
import re
def tokenizer_image_token(prompt, tokenizer, image_token_index, return_tensors=None):
    """Tokenize ``prompt``, putting ``image_token_index`` where '<image>' appears.

    The prompt is split on the literal '<image>' marker, every piece is
    tokenized on its own, and the pieces are stitched back together with a
    single ``image_token_index`` between consecutive pieces.

    Args:
        prompt: Text that may contain '<image>' markers.
        tokenizer: Callable whose result exposes ``input_ids``; it must also
            provide a ``bos_token_id`` attribute.
        image_token_index: Id inserted in place of each '<image>' marker.
        return_tensors: ``None`` for a plain list, ``'pt'`` for a
            ``torch.long`` tensor.

    Returns:
        The combined token ids, as a list or a 1-D tensor.

    Raises:
        ValueError: If ``return_tensors`` is neither ``None`` nor ``'pt'``.
    """
    pieces = [tokenizer(part).input_ids for part in prompt.split('<image>')]

    # When the first piece opens with BOS, keep that BOS exactly once and drop
    # the leading token of every piece while stitching.
    first = pieces[0] if len(pieces) > 0 else []
    starts_with_bos = len(first) > 0 and first[0] == tokenizer.bos_token_id
    skip = 1 if starts_with_bos else 0
    token_ids = [first[0]] if starts_with_bos else []

    for position, piece in enumerate(pieces):
        if position > 0:
            # One image placeholder between each pair of text pieces.
            token_ids.append(image_token_index)
        token_ids.extend(piece[skip:])

    if return_tensors is None:
        return token_ids
    if return_tensors != 'pt':
        raise ValueError(f'Unsupported tensor type: {return_tensors}')
    return torch.tensor(token_ids, dtype=torch.long)
def parse_cdl(input_string):
    """Extract the CDL sections from a model response.

    Recognised sections are ``construction_cdl``, ``image_cdl``, ``text_cdl``
    and ``goal_cdl``. A section starts at a header line such as
    ``The construction_cdl is:`` or ``image_cdl:`` and runs until the next
    ``*_cdl`` header, a ``Solution is:`` line, or the end of the string.

    For ``construction_cdl`` and ``image_cdl`` the calibrated variant (header
    containing ``calibrate ``) is preferred; the plain variant is only used
    when no calibrated header exists.

    Args:
        input_string: Raw text produced by the model.

    Returns:
        A dict mapping each section name that was found to its stripped body.
        Sections that are absent from ``input_string`` are omitted.
    """
    optional_calibrate = "(?:calibrate )?"
    required_calibrate = "(?:calibrate )"
    patterns = {
        'construction_cdl': r'(?:The )?(?:calibrate )?construction_cdl(?: is)?:\n(.*?)(?=\n(?:The )?(?:calibrate )?\w+_cdl is:|\n(?:The )?(?:calibrate )?\w+_cdl:|\nSolution is:|\Z)',
        'image_cdl': r'(?:The )?(?:calibrate )?image_cdl(?: is)?:\n(.*?)(?=\n(?:The )?(?:calibrate )?\w+_cdl is:|\n(?:The )?(?:calibrate )?\w+_cdl:|\nSolution is:|\Z)',
        'text_cdl': r'(?:The )?text_cdl(?: is)?:\n(.*?)(?=\n(?:The )?\w+_cdl is:|\n(?:The )?\w+_cdl:|\nSolution is:|\Z)',
        'goal_cdl': r'(?:The )?goal_cdl(?: is)?:\n(.*?)(?=\n(?:The )?\w+_cdl is:|\n(?:The )?\w+_cdl:|\nSolution is:|\Z)'
    }
    results = {}
    for key, pattern in patterns.items():
        match = None
        if optional_calibrate in pattern:
            # Require "calibrate " in the header only (count=1 touches just the
            # first occurrence). The terminator lookahead must keep accepting
            # both calibrated and plain headers, otherwise a calibrated section
            # followed by a plain header would swallow that header and its body.
            strict_pattern = pattern.replace(optional_calibrate, required_calibrate, 1)
            match = re.search(strict_pattern, input_string, re.DOTALL)
        if match is None:
            # No calibrated section (or the section has no calibrated form):
            # fall back to the pattern as written.
            match = re.search(pattern, input_string, re.DOTALL)
        if match:
            results[key] = match.group(1).strip()
    return results
# set device: use the GPU when one is available, otherwise fall back to CPU
# (assign 'cuda' or 'cpu' explicitly here to override the detection)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.set_default_device(device)
# float16 on GPU, float32 on CPU -- derived once so both models always agree
# with the selected device
torch_dtype = torch.float16 if device == 'cuda' else torch.float32

# create models
# NOTE: trust_remote_code=True executes Python code shipped in the model
# repository; only use it with repositories you trust.
formalization_model = AutoModelForCausalLM.from_pretrained(
    'NaughtyDog97/DiagramFormalizer',
    torch_dtype=torch_dtype,
    device_map='auto',
    trust_remote_code=True)
formalization_tokenizer = AutoTokenizer.from_pretrained(
    'NaughtyDog97/DiagramFormalizer',
    use_fast=True,
    padding_side="right",
    trust_remote_code=True)

reason_model = AutoModelForCausalLM.from_pretrained(
    'NaughtyDog97/DFE-GPS-9B',
    torch_dtype=torch_dtype,
    device_map='auto',
    trust_remote_code=True)
reason_tokenizer = AutoTokenizer.from_pretrained(
    'NaughtyDog97/DFE-GPS-9B',
    use_fast=False,
    trust_remote_code=True)
img_path = 'sample/4927.png'
# Open inside a context manager so the file handle is released as soon as the
# RGB copy has been created.
with Image.open(img_path) as img:
    image = img.convert('RGB')

# formalization
prompt = 'Based on the image, first describe what you see in the figure, then predict the construction_cdl and image_cdl and calibrate it.'
text = f'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<image>\n{prompt}<|im_end|>\n<|im_start|>assistant\n'
# -200 is the id substituted for the '<image>' marker; unsqueeze(0) adds the
# batch dimension. Move to `device` (not a hard-coded .cuda()) so the script
# follows the device chosen above.
input_ids = tokenizer_image_token(text, formalization_tokenizer, -200, return_tensors='pt').unsqueeze(0).to(device)

# generate
image_tensor = formalization_model.process_images([image], formalization_model.config).to(dtype=formalization_model.dtype, device=device)
with torch.inference_mode():
    # Greedy decoding: sampling is off and the sampling knobs are cleared.
    output_ids = formalization_model.generate(
        input_ids,
        images=image_tensor,
        do_sample=False,
        temperature=None,
        top_p=None,
        top_k=None,
        num_beams=1,
        max_new_tokens=3500,
        eos_token_id=formalization_tokenizer.eos_token_id,
        repetition_penalty=None,
        use_cache=True
    )[0]

# Drop the first input_ids.shape[1] positions before decoding (presumably the
# echoed prompt -- confirm against the model's generate implementation).
response = formalization_tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
print(f'Formalization result is\n{response}')

cdl_info = parse_cdl(response)
# parse_cdl omits sections it cannot find; fail with a clear message instead
# of a bare KeyError when the model output lacks one of them.
missing = [key for key in ('construction_cdl', 'image_cdl') if key not in cdl_info]
if missing:
    raise ValueError(f'Formalization output has no section for: {", ".join(missing)}')
predict_consCDL = cdl_info['construction_cdl']
predict_imgCDL = cdl_info['image_cdl']
# reasoning
qs = 'As shown in the diagram, AE/AB=1/4, M is the midpoint of segment AC, BE is parallel to CP, EA is parallel to CP. Find the ratio of the length of line BC to the length of line CD.'
# The predicted CDL from the formalization step is embedded in the prompt so
# the reasoning model can calibrate it before solving.
prompt = f'Using the provided geometric image and the possibly erroneous construction_cdl and image_cdl, first calibrate the construction_cdl and image_cdl, then give a detailed step-by-step solution to the question.\nThe initial construction_cdl is:\n{predict_consCDL}\nThe initial image_cdl is:\n{predict_imgCDL}\nThe question is:\n{qs}'
text = f'<|im_start|>user\n<image>\n{prompt}<|im_end|>\n<|im_start|>assistant\n'
# -200 is the id substituted for the '<image>' marker; unsqueeze(0) adds the
# batch dimension. Move to `device` (not a hard-coded .cuda()) so the script
# follows the configured device.
input_ids = tokenizer_image_token(text, reason_tokenizer, -200, return_tensors='pt').unsqueeze(0).to(device)

# generate
image_tensor = reason_model.process_images([image], reason_model.config).to(dtype=reason_model.dtype, device=device)
with torch.inference_mode():
    # Greedy decoding: sampling is off and the sampling knobs are cleared.
    output_ids = reason_model.generate(
        input_ids,
        images=image_tensor,
        do_sample=False,
        temperature=None,
        top_p=None,
        top_k=None,
        num_beams=1,
        max_new_tokens=3500,
        eos_token_id=reason_tokenizer.eos_token_id,
        repetition_penalty=None,
        use_cache=True
    )[0]

# Drop the first input_ids.shape[1] positions before decoding (presumably the
# echoed prompt -- confirm against the model's generate implementation).
response = reason_tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
print(f'Reasoning steps is\n{response}')
```
## Performance of DFE-GPS on formalgeo7k test set
| Model | Choice Acc | OpenEnd Acc | Process Evaluation Score |
|-------|------------|-------------|--------------------------|
| DFE-GPS-9B | 77.05 | 68.67 | 76.00 |
| DFE-GPS-34B | **82.38** | **75.33** | **79.07** |