Spaces:

BrainDrive
/

FinanceEval

Running

FinanceEval / core /preprocess.py

Create preprocess.py

da9c977 verified 7 months ago

994 Bytes

	import re, json
	from typing import List

	# Heuristic normalization and role extraction. If roles are unknown, all text is kept.

	# Matches a leading speaker tag such as "User:" or "Advisor -" (case-insensitive).
	# Group 1 captures the speaker alias. The alternation bars must be unescaped:
	# an escaped bar matches a literal pipe character and no real tag would match.
	SPEAKER_RE = re.compile(r"^(user|client|customer|advisor|agent|assistant|model)\s*[:\-]", re.I)


	def normalize_conversation(text: str) -> List[str]:
	    """Split *text* into non-blank lines with surrounding whitespace removed.

	    Args:
	        text: Raw conversation text, possibly spanning several lines.

	    Returns:
	        The stripped lines in their original order; lines that are empty
	        or whitespace-only are dropped.
	    """
	    # Strip each line once, then filter on the stripped value.
	    stripped = (raw.strip() for raw in text.splitlines())
	    return [line for line in stripped if line]


	def extract_model_utterances(lines: List[str], prefer_llm_provider: str = None) -> str:
	    """Join the lines attributed to the model side of a conversation.

	    Each line is checked for a leading speaker tag using ``SPEAKER_RE``:

	    * tagged with a model-side alias: kept, with the tag removed;
	    * tagged with any other recognised speaker: dropped;
	    * no recognised tag: kept unchanged, so content is not lost when
	      roles are unknown.

	    Args:
	        lines: Conversation lines to filter.
	        prefer_llm_provider: Accepted for interface compatibility; this
	            function does not use it.

	    Returns:
	        The kept lines joined with newlines, with outer whitespace stripped.
	    """
	    model_aliases = ("advisor", "agent", "assistant", "model")
	    kept = []

	    for line in lines:
	        tag = SPEAKER_RE.match(line)
	        if tag is None:
	            # Untagged: keep it rather than risk dropping content to evaluate.
	            kept.append(line)
	        elif tag.group(1).lower() in model_aliases:
	            # The pattern is anchored at the start of the line, so slicing
	            # past the match removes exactly the speaker tag.
	            kept.append(line[tag.end():].strip())
	        # Any other tagged speaker (the user side) is skipped.

	    return "\n".join(kept).strip()