Student0809
/

interactSpeech

Model card Files Files and versions

interactSpeech / GRPO /Reward.py

Student0809's picture

Add files using upload-large-folder tool

3438cdb verified 5 months ago

history blame contribute delete

3.26 kB

	import os
	import re
	import math
	import json
	from datetime import datetime
	from swift.plugin import ORM,orms
	from typing import Dict, List, Union


	class MultiModalAccuracyORM(ORM):
	    """Reward that compares a predicted overall score with the ground truth.

	    The prediction is read from an ``<overall score>N</overall score>`` tag in
	    each completion; the ground truth is read from the first element of each
	    ``solution`` entry. Both must be integers within
	    ``[_MIN_SCORE, _MAX_SCORE]`` to be considered valid.
	    """

	    # Compiled once at class creation instead of on every loop iteration.
	    _SCORE_RE = re.compile(r"<overall score>(\d+)</overall score>")
	    # Inclusive bounds of a valid score.
	    _MIN_SCORE = 1
	    _MAX_SCORE = 2

	    @classmethod
	    def _to_valid_score(cls, raw) -> Union[int, None]:
	        """Return ``raw`` as an in-range int, or None if unparsable or out of range."""
	        try:
	            score = int(raw)
	        except (TypeError, ValueError):
	            # Unparsable values are expected here and simply earn no reward;
	            # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
	            return None
	        if cls._MIN_SCORE <= score <= cls._MAX_SCORE:
	            return score
	        return None

	    def __call__(self, completions, solution, **kwargs) -> List[float]:
	        """Score each completion against its ground truth.

	        Args:
	            completions (list[str]): Generated outputs.
	            solution (list): Ground truths; the first element (``entry[0]``)
	                of each entry is converted to ``int``.
	            **kwargs: Accepted for interface compatibility; not used.

	        Returns:
	            list[float]: One reward per (completion, solution) pair:
	            5.0 for an exact match, 1.0 when both scores are valid and differ
	            by at most 1, and 0.0 otherwise (including when either score is
	            missing, unparsable or out of range).
	        """
	        rewards = []
	        for content, gt_score_orig in zip(completions, solution):
	            # Predicted score: first <overall score> tag in the completion.
	            score_match = self._SCORE_RE.search(content)
	            pred_score = (
	                self._to_valid_score(score_match.group(1)) if score_match else None
	            )

	            # Ground-truth score: first element of the solution entry.
	            try:
	                gt_raw = gt_score_orig[0]
	            except (TypeError, IndexError, KeyError):
	                gt_raw = None
	            gt_score = self._to_valid_score(gt_raw)

	            # Tiered reward logic.
	            if pred_score is None or gt_score is None:
	                reward = 0.0
	            elif pred_score == gt_score:
	                reward = 5.0
	            elif abs(pred_score - gt_score) <= 1:
	                # With the current 1..2 range every valid mismatch lands here.
	                reward = 1.0
	            else:
	                reward = 0.0

	            rewards.append(reward)
	        return rewards
	class MultiModalFormatAccuracyORM(ORM):
	    """Reward that checks whether a completion follows the expected tag format.

	    A completion is well-formed when it contains all three of:
	    a ``<response think>...</response think>`` section,
	    a ``<fluency think>...</fluency think>`` section, and
	    an ``<overall score>N</overall score>`` tag surrounded by whitespace.
	    """

	    # Compiled once at class creation; the pattern strings and the DOTALL flag
	    # are exactly those of the original per-call searches.
	    _RESPONSE_RE = re.compile(r"<response think>.*?</response think>", re.DOTALL)
	    _FLUENCY_RE = re.compile(r"<fluency think>.*?</fluency think>", re.DOTALL)
	    # NOTE(review): this pattern requires one whitespace character on BOTH
	    # sides of the score tag, so a completion that starts or ends exactly at
	    # the tag gets no format reward. Kept as-is; confirm this is intended.
	    _SCORE_RE = re.compile(
	        r"[\s]<overall score>(\d+)</overall score>[\s]", re.DOTALL
	    )

	    def __call__(self, completions, **kwargs) -> List[float]:
	        """Return a format reward for every completion.

	        Args:
	            completions (list[str]): Generated outputs.
	            **kwargs: Accepted for interface compatibility; not used.

	        Returns:
	            list[float]: 5.0 when all three required sections are present,
	            otherwise 0.0 (a float, so the list is homogeneous).
	        """
	        required = (self._RESPONSE_RE, self._FLUENCY_RE, self._SCORE_RE)
	        return [
	            5.0 if all(pattern.search(content) for pattern in required) else 0.0
	            for content in completions
	        ]
	# Register both reward classes in the `orms` registry imported from
	# swift.plugin, under the string keys below.
	# NOTE(review): presumably these keys are how the training configuration
	# selects the reward functions by name -- confirm against the swift launcher.
	orms['external_r1v_format_acc'] = MultiModalFormatAccuracyORM
	orms['external_r1v_acc'] = MultiModalAccuracyORM