Spaces:

ljm565-org
/

H-AdminSim_Arena

Running

App Files Files Community

H-AdminSim_Arena / utils /postprocess.py

ljm565

feat: Updated rating criteria

3703b6a 5 months ago

raw

history blame contribute delete

4 kB

	import os, re

	from utils.client import GPTClient
	from utils.filesys_utils import json_load, get_files



	def make_dialog_dict(dir_path: str = './dialogs') -> dict:
	    """
	    Collect the post-processed dialogs of every model found under a directory.

	    Every entry of ``dir_path`` is used as a model name. The files returned by
	    ``get_files(<entry path>, ext='.json')`` are loaded with ``json_load``, each
	    value of the loaded mapping is passed through ``dialog_postprocessing``, and
	    only non-empty results are kept.

	    Args:
	        dir_path (str, optional): Directory whose entries are named after models. Defaults to './dialogs'.

	    Returns:
	        dict: Maps each model name to the list of its cleaned dialog strings.
	            Entries with no JSON files, or with no usable dialog, are omitted.
	    """
	    collected = {}

	    for model_name in os.listdir(dir_path):
	        json_paths = get_files(os.path.join(dir_path, model_name), ext='.json')
	        if not json_paths:
	            continue

	        cleaned = []
	        for json_path in json_paths:
	            for raw_dialog in json_load(json_path).values():
	                processed = dialog_postprocessing(raw_dialog)
	                # An empty string marks a dialog that could not be post-processed.
	                if processed:
	                    cleaned.append(processed)

	        if cleaned:
	            collected[model_name] = cleaned

	    return collected



	def dialog_postprocessing(dialog: str) -> str:
	    """
	    Convert a raw dialog transcript into an HTML-formatted string.

	    Everything from the first ``Answer:`` marker onward is dropped and replaced
	    by a closing sentence naming the department parsed from the
	    ``Answer: <number>. <department>`` line. ``Staff:`` and ``Patient:`` prefixes
	    are wrapped in coloured ``<span>`` tags and newlines become ``<br>``.

	    Args:
	        dialog (str): The dialog string to be processed.

	    Returns:
	        str: The cleaned dialog string, or an empty string when ``dialog`` is not
	            a string, contains no parsable ``Answer:`` line, or names a department
	            that does not start with one of the known candidates.
	    """
	    department_candidates = ["gastroenterology", "cardiology", "pulmonary", "endocrinology/metabolism", "nephrology", "hematology/oncology", "allergy", "infectious diseases", "rheumatology"]
	    answer_pattern = re.compile(r'Answer:\s\d+\.\s(.+)')
	    split_pattern = re.compile(r'\bAnswer:')

	    # Explicit guards replace the former bare ``except`` and ``assert``: the
	    # assert vanished under ``python -O`` and the bare except also swallowed
	    # KeyboardInterrupt/SystemExit. The failure result is still ''.
	    if not isinstance(dialog, str):
	        return ''

	    answer_match = answer_pattern.search(dialog)
	    if answer_match is None:
	        return ''

	    # Normalise free-form answers such as "Cardiology (heart)" to the
	    # canonical lower-case candidate they start with.
	    answered = answer_match.group(1).lower()
	    department = next(
	        (candidate for candidate in department_candidates if answered.startswith(candidate)),
	        None,
	    )
	    if department is None:
	        return ''

	    before_answer = split_pattern.split(dialog)[0].strip()
	    before_answer += f' I will introduce you to a physician who work in the {department}.'

	    before_answer = before_answer.replace("Staff:", "<span style='color:rgb(0,102,204); font-weight:bold'>Staff</span>      :")
	    before_answer = before_answer.replace("Patient:", "<span style='color:rgb(204,0,102); font-weight:bold'>Patient</span>:")
	    before_answer = before_answer.replace("\n", "<br>")

	    return before_answer



	def dialog_translate(dialog: str) -> str:
	    """
	    Translate a dialogue between a hospital staff member and a patient from English to Korean.

	    The dialogue is split on ``<br>``, the speaker span is stripped from each
	    line, the bare utterances are sent to the model as a Python list, and the
	    translated utterances are re-joined with the speaker span each line
	    originally carried.

	    Args:
	        dialog (str): A string containing the dialogue in HTML format.
	            Staff lines are marked with a blue "Staff" span and
	            patient lines with a pink "Patient" span, separated by '<br>' tags.

	    Raises:
	        TypeError: If the translation request fails, the reply is not a literal
	            list of strings, or the reply does not contain exactly one
	            translation per input line.

	    Returns:
	        str: Translated dialogue lines in Korean, separated by '<br>' tags.
	    """
	    # Imported locally so the function stays self-contained.
	    import ast

	    user_prompt = "아래 문장 리스트를은 원무과 직원과 환자의 대화야. 순서대로 한국어로 번역해주고 string list로 반환해줘. 다른 결과, 미사여구 붙이지말고 딱 string list만 반환 해줘야해.\n\n{lines}"
	    staff_flag = "<span style='color:rgb(0,102,204); font-weight:bold'>Staff</span>      :"
	    patient_flag = "<span style='color:rgb(204,0,102); font-weight:bold'>Patient</span>:"

	    # Remember which speaker span each line carried so it can be restored
	    # afterwards. Relying on index parity mislabels every line as soon as the
	    # dialogue does not strictly alternate starting with Staff.
	    speakers = []
	    lines = []
	    for line in dialog.split('<br>'):
	        if staff_flag in line:
	            flag = staff_flag
	        elif patient_flag in line:
	            flag = patient_flag
	        else:
	            # Line without a speaker span: keep its text instead of
	            # discarding it, and emit it without a span later.
	            flag = ''
	        text = ''.join(line.split(flag)[1:]) if flag else line
	        speakers.append(flag)
	        lines.append(text.strip())

	    user_prompt = user_prompt.format(lines=lines)
	    client = GPTClient('gpt-5-nano')

	    try:
	        response = client(user_prompt, **{'reasoning_effort': 'minimal'})
	        # SECURITY: the reply is external, model-generated text. It was
	        # previously passed to eval(), which would execute arbitrary code;
	        # literal_eval only accepts Python literals such as a list of strings.
	        translated = ast.literal_eval(response)
	        if not isinstance(translated, (list, tuple)) or not all(isinstance(item, str) for item in translated):
	            raise ValueError('translation reply is not a list of strings')
	        if len(translated) != len(lines):
	            raise ValueError(f'expected {len(lines)} translated lines, got {len(translated)}')
	        return '<br>'.join(
	            flag + ' ' + item if flag else item
	            for flag, item in zip(speakers, translated)
	        )
	    except Exception as err:
	        # Same exception type as before, but with the cause chained and
	        # without intercepting KeyboardInterrupt/SystemExit.
	        raise TypeError('Failed to translate or format the dialog') from err