"""Lighteval task configs for math benchmarks scored with math_verify.

Each task renders a "Question: ... / Step-by-Step Answer:" prompt and is
scored with an extractive-match metric built on math_verify's `math_metric`.
"""

import logging
from textwrap import dedent
from typing import Callable, Optional

import numpy as np
from lighteval.metrics.dynamic_metrics import SampleLevelMetric
from lighteval.metrics.utils.metric_utils import MetricCategory, MetricUseCase
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc

from math_verify.few_shots import GSM8K_FEW_SHOTS, MATH_HARD_FEW_SHOTS
from math_verify.metric import math_metric
from math_verify.parser import ExprExtractionConfig, LatexExtractionConfig

logger = logging.getLogger(__name__)


def as_lighteval_metric(
    metric: Callable[
        [list[str], list[str]], tuple[float, Optional[tuple[list[str], list[str]]]]
    ],
) -> SampleLevelMetric:
    """Wrap a math_verify metric callable into a lighteval ``SampleLevelMetric``.

    The callable receives ``(golds, predictions)`` and returns the score plus
    the extracted gold/prediction strings, if any.
    """

    def sample_level_fn(
        formatted_doc: Doc, golds: list[str], predictions: list[str]
    ) -> float:
        result, extracted_predictions = metric(golds, predictions)
        # Stash the extracted answers on the doc so they show up in the
        # evaluation details.
        if extracted_predictions is not None:
            if not formatted_doc.specific:
                formatted_doc.specific = {}
            formatted_doc.specific["extracted_predictions"] = extracted_predictions
        return result

    return SampleLevelMetric(
        metric_name="extractive_match",
        sample_level_fn=sample_level_fn,
        category=MetricCategory.GENERATIVE,
        use_case=MetricUseCase.ACCURACY,
        corpus_level_fn=np.mean,
        higher_is_better=True,
    )
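

# A minimal sketch of using the wrapper by hand; the extraction configs here
# are illustrative, not tied to any task below:
#
#   demo_metric = as_lighteval_metric(
#       math_metric(
#           gold_extraction_target=(ExprExtractionConfig(),),
#           pred_extraction_target=(LatexExtractionConfig(), ExprExtractionConfig()),
#       )
#   )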


def math_hard_prompt_function(x: dict, task_name: str) -> Doc:
    if x.get("__few_shots"):
        # Few-shot rows come from the curated examples; indices past the end
        # reuse the last one.
        index = x["__index"]
        few_shot_doc = (
            MATH_HARD_FEW_SHOTS[index]
            if index < len(MATH_HARD_FEW_SHOTS)
            else MATH_HARD_FEW_SHOTS[-1]
        )
        answer = few_shot_doc["answer"]
        question = few_shot_doc["question"]
    else:
        answer = str(x["solution"])
        question = x["problem"]
    query = dedent(
        f"""\
        Question: {question}
        Step-by-Step Answer:\
        """
    ).strip()
    choices = [answer]
    return Doc(query=query, choices=choices, gold_index=0)


def math_prompt_function(x: dict, task_name: str) -> Doc:
    if x.get("__few_shots"):
        index = x["__index"]
        few_shot_doc = (
            MATH_HARD_FEW_SHOTS[index]
            if index < len(MATH_HARD_FEW_SHOTS)
            else MATH_HARD_FEW_SHOTS[-1]
        )
        answer = few_shot_doc["answer"]
        question = few_shot_doc["question"]
    else:
        answer = str(x["answer"])
        question = x["problem"]
    query = dedent(
        f"""\
        Question: {question}
        Step-by-Step Answer:\
        """
    ).strip()
    choices = [answer]
    return Doc(query=query, choices=choices, gold_index=0)


def math_aime24_prompt_function(x: dict, task_name: str) -> Doc:
    if x.get("__few_shots"):
        index = x["__index"]
        few_shot_doc = (
            MATH_HARD_FEW_SHOTS[index]
            if index < len(MATH_HARD_FEW_SHOTS)
            else MATH_HARD_FEW_SHOTS[-1]
        )
        answer = few_shot_doc["answer"]
        question = few_shot_doc["question"]
    else:
        answer = str(x["reference_solution"])
        question = x["problem"]
    query = dedent(
        f"""\
        Question: {question}
        Step-by-Step Answer:\
        """
    ).strip()
    choices = [f" {answer}"]
    return Doc(query=query, choices=choices, gold_index=0)


def math_amc23_prompt_function(x: dict, task_name: str) -> Doc:
    if x.get("__few_shots"):
        index = x["__index"]
        few_shot_doc = (
            MATH_HARD_FEW_SHOTS[index]
            if index < len(MATH_HARD_FEW_SHOTS)
            else MATH_HARD_FEW_SHOTS[-1]
        )
        answer = few_shot_doc["answer"]
        question = few_shot_doc["question"]
    else:
        answer = str(x["answer"])
        question = x["question"]
    query = dedent(
        f"""\
        Question: {question}
        Step-by-Step Answer:\
        """
    ).strip()
    choices = [f" {answer}"]
    return Doc(query=query, choices=choices, gold_index=0)


def gsm8k_prompt_function(x: dict, task_name: str) -> Doc:
    if x.get("__few_shots"):
        index = x["__index"]
        few_shot_doc = (
            GSM8K_FEW_SHOTS[index]
            if index < len(GSM8K_FEW_SHOTS)
            else GSM8K_FEW_SHOTS[-1]
        )
        answer = few_shot_doc["answer"]
        question = few_shot_doc["question"]
    else:
        # GSM8K golds end with "#### <final answer>"; keep only the final answer.
        answer = x["answer"].split("####")[-1].strip()
        question = x["question"]
    query = dedent(
        f"""\
        Question: {question}
        Step-by-Step Answer:\
        """
    ).strip()
    choices = [f" {answer}"]
    return Doc(query=query, choices=choices, gold_index=0)
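

# All five prompt functions above render the same two-line template:
#
#   Question: <problem text>
#   Step-by-Step Answer:
#
# with the gold answer stored as the single entry of `choices` and
# `gold_index=0`, which is how the generative metric below locates the gold.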


math_hard_lighteval = [
    LightevalTaskConfig(
        name=f"math_hard:{subset}",
        suite=["lighteval", "math"],
        prompt_function=math_hard_prompt_function,
        hf_repo="lighteval/MATH-Hard",
        hf_subset=subset,
        evaluation_splits=["test"],
        few_shots_split="train",
        generation_size=1024,
        metric=[
            as_lighteval_metric(
                math_metric(
                    # boxed_match_priority=0 makes \boxed{...} the
                    # highest-priority pattern when extracting the gold.
                    gold_extraction_target=(
                        LatexExtractionConfig(boxed_match_priority=0),
                    ),
                    pred_extraction_target=(
                        LatexExtractionConfig(),
                        ExprExtractionConfig(),
                    ),
                )
            ),
        ],
        stop_sequence=["\nQuestion:", "\nProblem:", "\nquestion:", "\nproblem:"],
        trust_dataset=True,
        version=0,
    )
    for subset in [
        "algebra",
        "counting_and_probability",
        "geometry",
        "intermediate_algebra",
        "number_theory",
        "prealgebra",
        "precalculus",
    ]
]

math_500_lighteval = [
    LightevalTaskConfig(
        name="math_500",
        suite=["lighteval", "math"],
        prompt_function=math_prompt_function,
        hf_repo="HuggingFaceH4/MATH-500",
        hf_subset="default",
        evaluation_splits=["test"],
        few_shots_split="test",
        generation_size=1024,
        metric=[
            as_lighteval_metric(
                math_metric(
                    gold_extraction_target=(
                        LatexExtractionConfig(boxed_match_priority=0),
                    ),
                    pred_extraction_target=(
                        LatexExtractionConfig(),
                        ExprExtractionConfig(),
                    ),
                )
            ),
        ],
        stop_sequence=["\nQuestion:", "\nProblem:", "\nquestion:", "\nproblem:"],
        trust_dataset=True,
        version=0,
    )
]

aime24_lighteval = [
    LightevalTaskConfig(
        name="aime24",
        suite=["lighteval", "math"],
        prompt_function=math_aime24_prompt_function,
        hf_repo="zwhe99/aime24",
        hf_subset="default",
        evaluation_splits=["test"],
        few_shots_split="test",
        generation_size=1024,
        metric=[
            as_lighteval_metric(
                math_metric(
                    gold_extraction_target=(LatexExtractionConfig(),),
                    pred_extraction_target=(
                        LatexExtractionConfig(),
                        ExprExtractionConfig(),
                    ),
                )
            ),
        ],
        stop_sequence=["\nQuestion:", "\nProblem:", "\nquestion:", "\nproblem:"],
        trust_dataset=True,
        version=0,
    )
]

amc23_lighteval = [
    LightevalTaskConfig(
        name="amc23",
        suite=["lighteval", "math"],
        prompt_function=math_amc23_prompt_function,
        hf_repo="zwhe99/amc23",
        hf_subset="default",
        hf_filter=lambda x: len(x["question"].strip()) > 0,
        evaluation_splits=["test"],
        few_shots_split="test",
        generation_size=1024,
        metric=[
            as_lighteval_metric(
                math_metric(
                    gold_extraction_target=(ExprExtractionConfig(),),
                    pred_extraction_target=(
                        LatexExtractionConfig(),
                        ExprExtractionConfig(),
                    ),
                )
            ),
        ],
        stop_sequence=["\nQuestion:", "\nProblem:", "\nquestion:", "\nproblem:"],
        trust_dataset=True,
        version=0,
    )
]

gsm8k_lighteval = [
    LightevalTaskConfig(
        name="gsm8k",
        suite=["lighteval", "math"],
        prompt_function=gsm8k_prompt_function,
        hf_repo="openai/gsm8k",
        hf_subset="main",
        hf_filter=lambda x: len(x["question"].strip()) > 0,
        evaluation_splits=["test"],
        few_shots_split="test",
        generation_size=1024,
        stop_sequence=["\nQuestion:", "\nProblem:", "\nquestion:", "\nproblem:"],
        metric=[
            as_lighteval_metric(
                math_metric(
                    gold_extraction_target=(ExprExtractionConfig(),),
                    pred_extraction_target=(
                        LatexExtractionConfig(),
                        ExprExtractionConfig(),
                    ),
                    # "first_match": fall back to the first extracted match;
                    # see math_verify.metric.math_metric for the semantics.
                    fallback_mode="first_match",
                )
            ),
        ],
    )
]

TASKS_TABLE = [
    *gsm8k_lighteval,
    *math_hard_lighteval,
    *math_500_lighteval,
    *aime24_lighteval,
    *amc23_lighteval,
]
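
# Usage sketch (hedged): lighteval discovers `TASKS_TABLE` in files passed
# via `--custom-tasks`. Exact CLI flags vary between lighteval versions, so
# check `lighteval --help` for yours; roughly:
#
#   lighteval accelerate \
#       "pretrained=<model_id>" \
#       "lighteval|gsm8k|4|1" \
#       --custom-tasks path/to/this_file.py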