owenkaplinsky's picture
update from github stable code (#3)
3370983 verified
raw
history blame
1.68 kB
"""Text processing utilities for document parsing."""
import re
from typing import List
# Regex patterns for contact detection
EMAIL_RE = re.compile(r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b")
PHONE_RE = re.compile(r"(?:(?<=\s)|^)(\+\d{1,3}[\s()./-]?)?(?:\d[\s()/.-]?){6,}\d(?=\s|$)")
URL_RE = re.compile(r"(?i)\b(?:https?://|www\.)[^\s<>'\"]+\.[^\s<>'\"]+")
# Bullet characters to normalize
_BULLET_CHARS = {"β€’", "Β·", "-", "–", "β€”", "β–ͺ", "β—¦", "β€£", "●", "β—‹", ""}
def normalize_bullets(text: str) -> str:
"""Coerce common bullet characters to '- ' while keeping numbering."""
lines = text.splitlines()
normalized: List[str] = []
for line in lines:
stripped = line.lstrip()
if not stripped:
normalized.append(line)
continue
# Keep numbered lists as-is
if re.match(r"^\d+[\.)]\s+", stripped):
normalized.append(line)
continue
first = stripped[0]
if first in _BULLET_CHARS or stripped.startswith(("- ", "* ")):
content = re.sub(
r"^([\-\*\u2022\u2023\u2043\u2219\u25E6\u25AA\u25CB\u25CF\u25A0]+\s+)", "", stripped
)
normalized.append(f"- {content.strip()}")
else:
normalized.append(line)
return "\n".join(normalized)
def tag_contacts(text: str) -> str:
"""Wrap detected email/phone/URL values with simple tags."""
tagged = EMAIL_RE.sub(lambda m: f"[EMAIL]{m.group(0)}[/EMAIL]", text)
tagged = PHONE_RE.sub(lambda m: f"[PHONE]{m.group(0)}[/PHONE]", tagged)
tagged = URL_RE.sub(lambda m: f"[URL]{m.group(0)}[/URL]", tagged)
return tagged