|
|
"""Text processing utilities for document parsing.""" |
|
|
|
|
|
import re |
|
|
from typing import List |
|
|
|
|
|
|
|
|
|
|
|
EMAIL_RE = re.compile(r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b") |
|
|
PHONE_RE = re.compile(r"(?:(?<=\s)|^)(\+\d{1,3}[\s()./-]?)?(?:\d[\s()/.-]?){6,}\d(?=\s|$)") |
|
|
URL_RE = re.compile(r"(?i)\b(?:https?://|www\.)[^\s<>'\"]+\.[^\s<>'\"]+") |
|
|
|
|
|
|
|
|
_BULLET_CHARS = {"β’", "Β·", "-", "β", "β", "βͺ", "β¦", "β£", "β", "β", ""} |
|
|
|
|
|
|
|
|
def normalize_bullets(text: str) -> str: |
|
|
"""Coerce common bullet characters to '- ' while keeping numbering.""" |
|
|
lines = text.splitlines() |
|
|
normalized: List[str] = [] |
|
|
|
|
|
for line in lines: |
|
|
stripped = line.lstrip() |
|
|
if not stripped: |
|
|
normalized.append(line) |
|
|
continue |
|
|
|
|
|
|
|
|
if re.match(r"^\d+[\.)]\s+", stripped): |
|
|
normalized.append(line) |
|
|
continue |
|
|
|
|
|
first = stripped[0] |
|
|
if first in _BULLET_CHARS or stripped.startswith(("- ", "* ")): |
|
|
content = re.sub( |
|
|
r"^([\-\*\u2022\u2023\u2043\u2219\u25E6\u25AA\u25CB\u25CF\u25A0]+\s+)", "", stripped |
|
|
) |
|
|
normalized.append(f"- {content.strip()}") |
|
|
else: |
|
|
normalized.append(line) |
|
|
|
|
|
return "\n".join(normalized) |
|
|
|
|
|
|
|
|
def tag_contacts(text: str) -> str: |
|
|
"""Wrap detected email/phone/URL values with simple tags.""" |
|
|
tagged = EMAIL_RE.sub(lambda m: f"[EMAIL]{m.group(0)}[/EMAIL]", text) |
|
|
tagged = PHONE_RE.sub(lambda m: f"[PHONE]{m.group(0)}[/PHONE]", tagged) |
|
|
tagged = URL_RE.sub(lambda m: f"[URL]{m.group(0)}[/URL]", tagged) |
|
|
return tagged |
|
|
|
|
|
|