Spaces:

taraky
/

Medical_Document_Retrieval

Running

Upload folder using huggingface_hub

b7f3196 verified 3 days ago

416 Bytes

	import re

	# Token separator: one or more consecutive non-word characters (punctuation,
	# whitespace, symbols). Letters, digits and "_" are word characters, so they
	# stay inside tokens.
	_WS = re.compile(r"\W+", flags=re.UNICODE)


	def tokenize(s: str) -> list[str]:
	    """Split text into lowercase word tokens.

	    The input is lowercased and then split on runs of non-word characters
	    (the module-level ``_WS`` pattern). Empty pieces, which ``split``
	    produces when the text starts or ends with a separator, are dropped.

	    Used by BM25 to build the tokenized corpus and query.

	    Args:
	        s: Text to tokenize. ``None`` and ``""`` are accepted.

	    Returns:
	        The tokens in order of appearance; ``[]`` for ``None``/empty input
	        or input made up only of separators.
	    """
	    # Falsy input (None or "") has nothing to split; this also avoids
	    # calling .lower() on None.
	    if not s:
	        return []
	    return [t for t in _WS.split(s.lower()) if t]