Spaces:

hmqid03
/

vietqa-api

Running

quanho114

Deploy VietQA API

ebb8326 14 days ago

2.13 kB

	"""Common utility functions used across the project."""

	import re
	import unicodedata


	def normalize_text(text: str) -> str:
	"""Normalize text: clean whitespace, unicode, and formatting.

	Applies:
	- Unicode NFKC normalization
	- Zero-width character removal
	- Whitespace normalization
	- Multiple newline compression

	Args:
	text: Raw text to normalize

	Returns:
	Cleaned and normalized text
	"""
	if not text:
	return ""

	# Unicode NFKC normalization (composing characters)
	text = unicodedata.normalize("NFKC", text)

	# Remove zero-width characters
	text = re.sub(r"[\u200b\u200c\u200d\ufeff]", "", text)

	# Normalize whitespace & newlines
	text = re.sub(r"[ \t]+", " ", text)
	text = re.sub(r"\n{3,}", "\n\n", text)
	text = re.sub(r"http[s]?://\S+", "", text)
	# Strip leading/trailing whitespace from each line
	lines = [line.strip() for line in text.splitlines()]
	text = "\n".join(lines)

	return text.strip()


	def remove_diacritics(text: str) -> str:
	"""Remove Vietnamese diacritics from text.

	Example: 'văn hóa' -> 'van hoa'

	Args:
	text: Text with Vietnamese diacritics

	Returns:
	Text with diacritics removed, lowercased
	"""
	nfkd = unicodedata.normalize("NFKD", text)
	return "".join(c for c in nfkd if not unicodedata.combining(c)).lower()


	def _extract_qid_number(qid: str) -> tuple[str, int]:
	"""Extract prefix and numeric part from qid for natural sorting.

	Args:
	qid: Question ID like "test_0001" or "val_123"

	Returns:
	Tuple of (prefix, number) for sorting
	"""
	match = re.match(r"^([a-zA-Z_]+)(\d+)$", qid)
	if match:
	return (match.group(1), int(match.group(2)))
	return (qid, 0)


	def sort_qids(qids: list[str]) -> list[str]:
	"""Sort question IDs naturally (test_0001 < test_0002 < test_0010).

	Args:
	qids: List of question IDs

	Returns:
	Sorted list of question IDs
	"""
	return sorted(qids, key=_extract_qid_number)