NaughtyDog97
/

DFE-GPS-9B

@@ -1,111 +1,25 @@
 ---
 license: apache-2.0
 ---
-# 基于FormalGeo7K的推理模型
-## 快速开始
-在运行脚本之前，首先安装如下必要的依赖。
-```shell
-pip install torch transformers==4.40.0 accelerate pillow sentencepiece
-```
-```python
-import torch
-import transformers
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from PIL import Image
-import warnings
-import numpy as np
-# set device
-device = 'cuda'  # or cpu
-torch.set_default_device(device)
-# create model
-model = AutoModelForCausalLM.from_pretrained(
-    'NaughtyDog97/FormalEnhencedGPS-9B',
-    torch_dtype=torch.float16, # float32 for cpu
-    device_map='auto',
-    trust_remote_code=True)
-tokenizer = AutoTokenizer.from_pretrained(
-    'NaughtyDog97/FormalEnhencedGPS-9B',
-    use_fast=False,
-    trust_remote_code=True)
-# text prompt
-img_path = 'sample/4927.png'
-qs = 'As shown in the diagram, AE/AB=1/4, M is the midpoint of segment AC, BE is parallel to CP, EA is parallel to CP. Find the ratio of the length of line BC to the length of line CD.'
-prompt = f'Using the provided geometric image and question, first predict the construction_cdl and image_cdl. Then, give a detailed step-by-step solution.\nThe question is:\n{qs}'
-text = f'<|im_start|>user\n<image>\n{prompt}<|im_end|>\n<|im_start|>assistant\n'
-def tokenizer_image_token(prompt, tokenizer, image_token_index, return_tensors=None):
-    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
-    def insert_separator(X, sep):
-        return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
-    input_ids = []
-    offset = 0
-    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
-        offset = 1
-        input_ids.append(prompt_chunks[0][0])
-    for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
-        input_ids.extend(x[offset:])
-    if return_tensors is not None:
-        if return_tensors == 'pt':
-            return torch.tensor(input_ids, dtype=torch.long)
-        raise ValueError(f'Unsupported tensor type: {return_tensors}')
-    return input_ids
-input_ids = tokenizer_image_token(text, tokenizer, -200, return_tensors='pt').unsqueeze(0).cuda()
-# image, sample images can be found in images folder
-image = Image.open(img_path).convert('RGB')
-image_tensor = model.process_images([image], model.config).to(dtype=model.dtype, device=device)
-# generate
-with torch.inference_mode():
-    output_ids = model.generate(
-        input_ids,
-        images=image_tensor,
-        do_sample=False,
-        temperature=None,
-        top_p=None,
-        top_k=None,
-        num_beams=1,
-        max_new_tokens=3500,
-        eos_token_id=tokenizer.eos_token_id,
-        repetition_penalty=None,
-        use_cache=True
-    )[0]
-respones = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
-print(respones)
 ```
-我们的模型支持的求解方式有如下三种:
-```python
-# Q => Predicted CDL + CoT Answer
-prompt = f'Using the provided geometric image and question, first predict the construction_cdl and image_cdl. Then, give a detailed step-by-step solution.\nThe question is:\n{qs}'
-# Q + Predicted CDL => CoT Answer
-prompt = f'Using the provided geometric image, construction_cdl, image_cdl, and question, give a detailed step-by-step solution. Note that there may be minor errors in the construction_cdl and image_cdl.\nThe construction_cdl is:\n{predict_consCDL}\nThe image_cdl is:\n{predict_imgCDL}\nThe question is:\n{qs}'
-# Q + Predicted CDL => Calibrated CDL + CoT Answer
-prompt = f'Using the provided geometric image and the possibly erroneous construction_cdl and image_cdl, first calibrate the construction_cdl and image_cdl, then give a detailed step-by-step solution to the question.\nThe initial construction_cdl is:\n{predict_consCDL}\nThe initial image_cdl is:\n{predict_imgCDL}\nThe question is:\n{qs}'
-```
-## 结合Formalization模型的推理
 ```python
 import torch
 import transformers
@@ -137,7 +51,6 @@ def tokenizer_image_token(prompt, tokenizer, image_token_index, return_tensors=N
     return input_ids
 def parse_cdl(input_string):
-    # 使用正则表达式查找各个部分
     patterns = {
         'construction_cdl': r'(?:The )?(?:calibrate )?construction_cdl(?: is)?:\n(.*?)(?=\n(?:The )?(?:calibrate )?\w+_cdl is:|\n(?:The )?(?:calibrate )?\w+_cdl:|\nSolution is:|\Z)',
         'image_cdl': r'(?:The )?(?:calibrate )?image_cdl(?: is)?:\n(.*?)(?=\n(?:The )?(?:calibrate )?\w+_cdl is:|\n(?:The )?(?:calibrate )?\w+_cdl:|\nSolution is:|\Z)',
@@ -146,8 +59,6 @@ def parse_cdl(input_string):
     }
     results = {}
-    # 优先匹配包含"calibrate"的版本
     for key, pattern in patterns.items():
         pattern = pattern.replace("(?:calibrate )?", "(?:calibrate )")
         match = re.search(pattern, input_string, re.DOTALL)
@@ -169,25 +80,25 @@ torch.set_default_device(device)
 # create model
 formalization_model = AutoModelForCausalLM.from_pretrained(
-    'NaughtyDog97/GeoFormalizer',
     torch_dtype=torch.float16, # float32 for cpu
     device_map='auto',
     trust_remote_code=True)
 formalization_tokenizer = AutoTokenizer.from_pretrained(
-    'NaughtyDog97/GeoFormalizer',
     use_fast=True,
     padding_side="right",
     trust_remote_code=True)
 reason_model = AutoModelForCausalLM.from_pretrained(
-    'NaughtyDog97/FormalEnhencedGPS-9B',
     torch_dtype=torch.float16, # float32 for cpu
     device_map='auto',
     trust_remote_code=True)
 reason_tokenizer = AutoTokenizer.from_pretrained(
-    'NaughtyDog97/FormalEnhencedGPS-9B',
     use_fast=False,
     trust_remote_code=True)
@@ -257,13 +168,13 @@ with torch.inference_mode():
 respones = reason_tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
 print(f'Reasoning steps is\n{respones}')
 ```
-## Performance
-|     |   Q => Predicted CDL + CoT Answer   |   Q + Predicted CDL => CoT Answer    |   Q + Predicted CDL => Calibrated CDL + CoT Answer   |
-|-----|-------------------------------------|--------------------------------------|------------------------------------------------------|
-|  siglip-0.4B-yi1.5-9B  |        63.92/73.30        |         63.59/74.27            |           65.05/75.24        |

 ---
 license: apache-2.0
 ---
+# Diagram Formalization Enhanced Multi-Modal Geometry Problem Solver
+## Model Structure
+<img src="sample/DFE-GPS.png" alt="DFE-GPS model structure" width="30%" height="auto">
+- **Diagram Encoder**: [siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384)
+- **Lightweight LLM**: [Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct)
+- **LLM**: [Yi-1.5-9B-Chat](https://huggingface.co/01-ai/Yi-1.5-9B-Chat)
+## Quick Start
+Before running the script, install the following necessary dependencies.
+```shell
+pip install torch transformers==4.40.0 accelerate pillow sentencepiece
 ```
+You can solve geometric problems using the following script. First, formalize the geometric images with the Diagram Formalizer, and then use the multi-modal reasoning model for problem-solving:
 ```python
 import torch
 import transformers
     return input_ids
 def parse_cdl(input_string):
     patterns = {
         'construction_cdl': r'(?:The )?(?:calibrate )?construction_cdl(?: is)?:\n(.*?)(?=\n(?:The )?(?:calibrate )?\w+_cdl is:|\n(?:The )?(?:calibrate )?\w+_cdl:|\nSolution is:|\Z)',
         'image_cdl': r'(?:The )?(?:calibrate )?image_cdl(?: is)?:\n(.*?)(?=\n(?:The )?(?:calibrate )?\w+_cdl is:|\n(?:The )?(?:calibrate )?\w+_cdl:|\nSolution is:|\Z)',
     }
     results = {}
     for key, pattern in patterns.items():
         pattern = pattern.replace("(?:calibrate )?", "(?:calibrate )")
         match = re.search(pattern, input_string, re.DOTALL)
 # create model
 formalization_model = AutoModelForCausalLM.from_pretrained(
+    'NaughtyDog97/DiagramFormalizer',
     torch_dtype=torch.float16, # float32 for cpu
     device_map='auto',
     trust_remote_code=True)
 formalization_tokenizer = AutoTokenizer.from_pretrained(
+    'NaughtyDog97/DiagramFormalizer',
     use_fast=True,
     padding_side="right",
     trust_remote_code=True)
 reason_model = AutoModelForCausalLM.from_pretrained(
+    'NaughtyDog97/DFE-GPS-9B',
     torch_dtype=torch.float16, # float32 for cpu
     device_map='auto',
     trust_remote_code=True)
 reason_tokenizer = AutoTokenizer.from_pretrained(
+    'NaughtyDog97/DFE-GPS-9B',
     use_fast=False,
     trust_remote_code=True)
 respones = reason_tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
 print(f'Reasoning steps is\n{respones}')
 ```
+## Performance of DFE-GPS on the FormalGeo7K test set
+| Model | Choice Acc | OpenEnd Acc | Process Evaluation Score |
+|-------|------------|-------------|--------------------------|
+| DFE-GPS-9B | 77.05 | 68.67 | 76.00 |
+| DFE-GPS-34B | **82.38** | **75.33** | **79.07** |

sample/DFE-GPS.png ADDED Viewed