from __future__ import annotations
import html
import re
import unicodedata
from dataclasses import dataclass
from typing import Any, Optional
import pandas as pd
from ftfy import fix_text
@dataclass(slots=True)
class Config:
verbose: bool = True
unicode_form: str = "NFC"
config = Config()
# C0 control characters plus DEL (U+007F). Tab (U+0009), line feed (U+000A)
# and carriage return (U+000D) are deliberately left out of the class.
CONTROL_RE = re.compile(
    r"[\u0000-\u0008\u000b\u000c\u000e-\u001f\u007f]"
)

# One or more whitespace characters other than CR/LF, i.e. a run of "inline"
# whitespace (intended to be collapsed into a single regular space).
INLINE_SPACE_RE = re.compile(
    r"[^\S\r\n]+"
)

# A newline together with any spaces/tabs directly before and after it.
SPACES_AROUND_NEWLINE_RE = re.compile(
    r"[ \t]*\n[ \t]*"
)

# A run of three or more consecutive newlines, used to cap paragraph spacing.
# NOTE(review): the replacement string lives at the call site, which is not
# visible here -- confirm the maximum number of newlines actually kept.
THREE_PLUS_NEWLINES_RE = re.compile(
    r"\n{3,}"
)
# Translation table (for ``str.translate``) that folds typographic look-alikes
# into their plain ASCII counterparts.
QUOTE_DASH_TRANSLATION = str.maketrans(
    dict(
        (
            ("\u2018", "'"),  # left single quotation mark
            ("\u2019", "'"),  # right single quotation mark
            ("\u201c", '"'),  # left double quotation mark
            ("\u201d", '"'),  # right double quotation mark
            ("\u2013", "-"),  # en dash
            ("\u2014", "-"),  # em dash
            ("\u2212", "-"),  # minus sign
            ("\u00a0", " "),  # no-break space
        )
    )
)
# ======= DEALING WITH MATH MODE ============
# match inline/display LaTeX math spans
MATH_SPAN_RE = re.compile(
r"(?
# Matches a string that consists solely of one backslash command made of ASCII
# letters (e.g. "\alpha"), allowing surrounding whitespace; group 1 captures
# the command itself.
SIMPLE_SYMBOL_RE = re.compile(
    r"^\s*(\\[A-Za-z]+)\s*$"
)
def normalize_math_spans(value: str, math_placeholder: str = "