Spaces:

MCP-1st-Birthday
/

HR-Assistant

Running

update from github stable code (#3)

3370983 verified 14 days ago

1.68 kB

	"""Text processing utilities for document parsing."""

	import re
	from typing import List


	# Regex patterns for contact detection.
	#
	# EMAIL_RE: case-insensitive; a local part of letters, digits and "._%+-",
	# an "@", a dotted domain, and a final label of at least two letters.
	EMAIL_RE = re.compile(r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b")
	# PHONE_RE: must begin at the start of the text or right after whitespace and
	# end right before whitespace or the end of the text. An optional "+" country
	# code (1-3 digits) is captured in group 1; it is followed by at least seven
	# digits, each of which may be trailed by one separator (space, parentheses,
	# slash, dot or hyphen).
	PHONE_RE = re.compile(r"(?:(?<=\s)|^)(\+\d{1,3}[\s()./-]?)?(?:\d[\s()/.-]?){6,}\d(?=\s|$)")
	# URL_RE: case-insensitive; starts with "http://", "https://" or "www." and
	# continues over characters that are not whitespace, angle brackets or quotes,
	# with at least one "." inside that run.
	URL_RE = re.compile(r"(?i)\b(?:https?://|www\.)[^\s<>'\"]+\.[^\s<>'\"]+")

	# Bullet characters to normalize.
	# NOTE(review): the trailing "" entry can never equal a one-character line
	# prefix; it may be a glyph that was lost in transit -- confirm upstream.
	_BULLET_CHARS = {"•", "·", "-", "–", "—", "▪", "◦", "‣", "●", "○", ""}

	# Markers that also begin ordinary text ("-5 degrees", "**bold**", "---"),
	# so they only count as a bullet when whitespace follows them.
	_SPACED_ONLY_MARKERS = {"-", "–", "—", "*"}

	# Extra bullet glyphs (hyphen bullet, bullet operator, black square).
	_EXTRA_BULLET_GLYPHS = {"\u2043", "\u2219", "\u25A0"}


	def _marker_class(markers) -> str:
	    """Return a regex character class matching any single non-empty marker."""
	    return "[" + "".join(re.escape(m) for m in sorted(markers) if m) + "]"


	_ANY_MARKER = _BULLET_CHARS | _SPACED_ONLY_MARKERS | _EXTRA_BULLET_GLYPHS

	# A bullet prefix is either a run of markers followed by whitespace, or a run
	# of unambiguous glyphs directly attached to the text ("•item").
	_BULLET_PREFIX_RE = re.compile(
	    "(?:{any_marker}+\\s+|{glyph}+)".format(
	        any_marker=_marker_class(_ANY_MARKER),
	        glyph=_marker_class(_ANY_MARKER - _SPACED_ONLY_MARKERS),
	    )
	)

	# "1. " / "2) " style numbering, which is left untouched.
	_NUMBERED_ITEM_RE = re.compile(r"\d+[.)]\s+")


	def normalize_bullets(text: str) -> str:
	    """Coerce common bullet characters to '- ' while keeping numbering.

	    Each line is inspected after removing its leading whitespace:

	    * blank lines and numbered items ("1. x", "2) x") are kept verbatim;
	    * a line that starts with a bullet prefix becomes "- <content>", with the
	      original indentation and surrounding whitespace of the content dropped;
	    * any other line is kept verbatim, including lines that merely start with
	      a dash or asterisk that is not followed by whitespace.

	    Args:
	        text: The text to normalize; may be empty.

	    Returns:
	        The lines joined with "\\n". Because the input is split with
	        ``str.splitlines``, a trailing newline is not preserved.
	    """
	    normalized: List[str] = []

	    for line in text.splitlines():
	        stripped = line.lstrip()

	        # Keep blank lines and numbered lists as-is.
	        if not stripped or _NUMBERED_ITEM_RE.match(stripped):
	            normalized.append(line)
	            continue

	        prefix = _BULLET_PREFIX_RE.match(stripped)
	        if prefix is None:
	            normalized.append(line)
	            continue

	        # Only rewrite when a marker was really removed, so the output never
	        # carries both the new "- " and the old marker.
	        content = stripped[prefix.end():].strip()
	        normalized.append(f"- {content}")

	    return "\n".join(normalized)


	def tag_contacts(text: str) -> str:
	    """Mark contact details in *text* with bracketed tags.

	    Three substitution passes run in a fixed order, each on the output of the
	    previous one: EMAIL_RE matches become ``[EMAIL]...[/EMAIL]``, then
	    PHONE_RE matches become ``[PHONE]...[/PHONE]``, then URL_RE matches
	    become ``[URL]...[/URL]``. The matched text itself is kept unchanged
	    between the tags.
	    """
	    with_emails = EMAIL_RE.sub(lambda hit: f"[EMAIL]{hit.group(0)}[/EMAIL]", text)
	    with_phones = PHONE_RE.sub(lambda hit: f"[PHONE]{hit.group(0)}[/PHONE]", with_emails)
	    return URL_RE.sub(lambda hit: f"[URL]{hit.group(0)}[/URL]", with_phones)