Spaces:

evaleval
/

entity-registry

Sleeping

Upload folder using huggingface_hub

31626e0 verified 13 days ago

1.34 kB

	import re


	def normalize(value: str) -> str:
	    """Return a canonical lookup key for *value*.

	    The string is lowercased and trimmed, then three regex rewrites run in
	    a fixed order:

	    1. A dot sitting between two digits becomes a space, so version
	       numbers written ``4.5`` and ``4-5`` both end up as ``4 5``.
	    2. Every character that is not a word character, whitespace, ``-``,
	       ``/`` or ``+`` is deleted. ``+`` survives on purpose: names such as
	       ``MBPP+`` / ``HumanEval+`` must stay distinct from their plain
	       counterparts, otherwise both would share one normalized key.
	    3. Any run of whitespace, ``_``, ``-`` or ``/`` is replaced by a
	       single space, so ``tau-bench-2/airline`` and ``tau-bench-2_airline``
	       produce the same key regardless of which separator a config used.

	    Leading/trailing spaces left over from step 3 are trimmed before
	    returning.

	    NOTE(review): the original design note says false merges between
	    distinct canonical IDs are avoided because the fuzzy matcher's
	    suffix-stripping is the only stem rewrite applied -- that code is not
	    visible from here; confirm against the fuzzy-matching module.

	    >>> normalize("GPT-4.5")
	    'gpt 4 5'
	    >>> normalize("tau-bench-2/airline")
	    'tau bench 2 airline'
	    >>> normalize("MBPP+")
	    'mbpp+'
	    """
	    # Order matters: digit-dots must become spaces before the punctuation
	    # pass would otherwise delete them, and separators are collapsed last.
	    rewrites = (
	        (r"(?<=\d)\.(?=\d)", " "),  # "4.5" -> "4 5"
	        (r"[^\w\s\-/+]", ""),  # drop punctuation, keep "+"
	        (r"[\s_\-/]+", " "),  # collapse separator runs
	    )
	    text = value.lower().strip()
	    for pattern, replacement in rewrites:
	        text = re.sub(pattern, replacement, text)
	    return text.strip()