fix typo

19d8287 verified over 1 year ago

6.83 kB

	---
	license: apache-2.0
	---
	# Diagram Formalization Enhanced Multi-Modal Geometry Problem Solver
	## Model Structure

	<p align="center">
	<img src="sample/DFE-GPS.png" alt="Alt text" width="80%" height="auto">
	</p>

	- Diagram Encoder: [siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384)
	- Lightweight LLM: [Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct)
	- LLM: [Yi-1.5-9B-Chat](https://huggingface.co/01-ai/Yi-1.5-9B-Chat)

	## Quick Start
	Before running the script, install the following necessary dependencies.

	```shell
	pip install torch transformers==4.40.0 accelerate pillow sentencepiece
	```

	You can solve geometric problems using the following script. First, formalize the geometric images with the [Diagram Formalizer](https://huggingface.co/NaughtyDog97/DiagramFormalizer), and then use the multi-modal reasing model for problem-solving:

	```python
	import torch
	import transformers
	from transformers import AutoModelForCausalLM, AutoTokenizer
	from PIL import Image
	import warnings
	import numpy as np
	import re

	def tokenizer_image_token(prompt, tokenizer, image_token_index, return_tensors=None):
	prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]

	def insert_separator(X, sep):
	return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]

	input_ids = []
	offset = 0
	if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
	offset = 1
	input_ids.append(prompt_chunks[0][0])

	for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
	input_ids.extend(x[offset:])

	if return_tensors is not None:
	if return_tensors == 'pt':
	return torch.tensor(input_ids, dtype=torch.long)
	raise ValueError(f'Unsupported tensor type: {return_tensors}')
	return input_ids

	def parse_cdl(input_string):
	patterns = {
	'construction_cdl': r'(?:The )?(?:calibrate )?construction_cdl(?: is)?:\n(.*?)(?=\n(?:The )?(?:calibrate )?\w+_cdl is:\|\n(?:The )?(?:calibrate )?\w+_cdl:\|\nSolution is:\|\Z)',
	'image_cdl': r'(?:The )?(?:calibrate )?image_cdl(?: is)?:\n(.*?)(?=\n(?:The )?(?:calibrate )?\w+_cdl is:\|\n(?:The )?(?:calibrate )?\w+_cdl:\|\nSolution is:\|\Z)',
	'text_cdl': r'(?:The )?text_cdl(?: is)?:\n(.*?)(?=\n(?:The )?\w+_cdl is:\|\n(?:The )?\w+_cdl:\|\nSolution is:\|\Z)',
	'goal_cdl': r'(?:The )?goal_cdl(?: is)?:\n(.*?)(?=\n(?:The )?\w+_cdl is:\|\n(?:The )?\w+_cdl:\|\nSolution is:\|\Z)'
	}

	results = {}
	for key, pattern in patterns.items():
	pattern = pattern.replace("(?:calibrate )?", "(?:calibrate )")
	match = re.search(pattern, input_string, re.DOTALL)
	if match:
	results[key] = match.group(1).strip()
	else:
	pattern = pattern.replace("(?:calibrate )", "(?:calibrate )?")
	match = re.search(pattern, input_string, re.DOTALL)
	if match:
	results[key] = match.group(1).strip()

	return results


	# set device
	device = 'cuda' # or cpu
	torch.set_default_device(device)

	# create model
	formalization_model = AutoModelForCausalLM.from_pretrained(
	'NaughtyDog97/DiagramFormalizer',
	torch_dtype=torch.float16, # float32 for cpu
	device_map='auto',
	trust_remote_code=True)

	formalization_tokenizer = AutoTokenizer.from_pretrained(
	'NaughtyDog97/DiagramFormalizer',
	use_fast=True,
	padding_side="right",
	trust_remote_code=True)


	reason_model = AutoModelForCausalLM.from_pretrained(
	'NaughtyDog97/DFE-GPS-9B',
	torch_dtype=torch.float16, # float32 for cpu
	device_map='auto',
	trust_remote_code=True)
	reason_tokenizer = AutoTokenizer.from_pretrained(
	'NaughtyDog97/DFE-GPS-9B',
	use_fast=False,
	trust_remote_code=True)



	img_path = 'sample/4927.png'
	image = Image.open(img_path).convert('RGB')


	# formalization
	prompt = 'Based on the image, first describe what you see in the figure, then predict the construction_cdl and image_cdl and calibrate it.'
	text = f'<\|im_start\|>system\nYou are a helpful assistant.<\|im_end\|>\n<\|im_start\|>user\n<image>\n{prompt}<\|im_end\|>\n<\|im_start\|>assistant\n'
	input_ids = tokenizer_image_token(text, formalization_tokenizer, -200, return_tensors='pt').unsqueeze(0).cuda()

	# generate
	image_tensor = formalization_model.process_images([image], formalization_model.config).to(dtype=formalization_model.dtype, device=device)
	with torch.inference_mode():
	output_ids = formalization_model.generate(
	input_ids,
	images=image_tensor,
	do_sample=False,
	temperature=None,
	top_p=None,
	top_k=None,
	num_beams=1,
	max_new_tokens=3500,
	eos_token_id=formalization_tokenizer.eos_token_id,
	repetition_penalty=None,
	use_cache=True
	)[0]


	respones = formalization_tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
	print(f'Formalization result is\n{respones}')
	cdl_info = parse_cdl(respones)
	predict_consCDL = cdl_info['construction_cdl']
	predict_imgCDL = cdl_info['image_cdl']



	# reasoning

	qs = 'As shown in the diagram, AE/AB=1/4, M is the midpoint of segment AC, BE is parallel to CP, EA is parallel to CP. Find the ratio of the length of line BC to the length of line CD.'
	prompt = f'Using the provided geometric image and the possibly erroneous construction_cdl and image_cdl, first calibrate the construction_cdl and image_cdl, then give a detailed step-by-step solution to the question.\nThe initial construction_cdl is:\n{predict_consCDL}\nThe initial image_cdl is:\n{predict_imgCDL}\nThe question is:\n{qs}'
	text = f'<\|im_start\|>user\n<image>\n{prompt}<\|im_end\|>\n<\|im_start\|>assistant\n'
	input_ids = tokenizer_image_token(text, reason_tokenizer, -200, return_tensors='pt').unsqueeze(0).cuda()



	# generate
	image_tensor = reason_model.process_images([image], reason_model.config).to(dtype=reason_model.dtype, device=device)
	with torch.inference_mode():
	output_ids = reason_model.generate(
	input_ids,
	images=image_tensor,
	do_sample=False,
	temperature=None,
	top_p=None,
	top_k=None,
	num_beams=1,
	max_new_tokens=3500,
	eos_token_id=reason_tokenizer.eos_token_id,
	repetition_penalty=None,
	use_cache=True
	)[0]

	respones = reason_tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
	print(f'Reasoning steps is\n{respones}')

	```



	## Performance of DFE-GPS on formalgeo7k test set

	\| Model \| Choice Acc \| OpenEnd ACC \| Process Evaluation Score \|
	\|-------\|------------\|-------------\|--------------------------\|
	\| DFE-GPS-9B \| 77.05 \| 68.67 \| 76.00 \|
	\| DFE-GPS-34B \| 82.38 \| 75.33 \| 79.07 \|