| import re |
| import subprocess |
| from pathlib import Path |
| from typing import Tuple |
| import sys |
|
|
|
|
# Names (operators, Greek letters, symbols, accents) after which a missing
# separator is repaired, e.g. "\sinx" -> "\sin x" and "sinx" -> "sin x".
IMPLICIT_MULTIPLICATION_TARGETS = [
    "arccos",
    "arcsin",
    "arctan",
    "sinh",
    "cosh",
    "tanh",
    "coth",
    "sech",
    "csch",
    "lim",
    "log",
    "ln",
    "exp",
    "sin",
    "cos",
    "tan",
    "cot",
    "sec",
    "csc",
    "ArcCos",
    "ArcSin",
    "ArcTan",
    "Sinh",
    "Cosh",
    "Tanh",
    "Coth",
    "Sech",
    "Csch",
    "Lim",
    "Log",
    "Ln",
    "Exp",
    "Sin",
    "Cos",
    "Tan",
    "Cot",
    "Sec",
    "Csc",
    "det",
    "dim",
    "min",
    "max",
    "sup",
    "inf",
    "deg",
    "gcd",
    "lcm",
    "ker",
    "im",
    "Pr",
    "E",
    "Var",
    "Cov",
    "rank",
    "Tr",
    "span",
    "proj",
    "grad",
    "div",
    "curl",
    "Res",
    "pi",
    "alpha",
    "beta",
    "gamma",
    "delta",
    "epsilon",
    "zeta",
    "eta",
    "theta",
    "iota",
    "kappa",
    "lambda",
    "mu",
    "nu",
    "xi",
    "omicron",
    "rho",
    "sigma",
    "tau",
    "upsilon",
    "phi",
    "chi",
    "psi",
    "omega",
    "Pi",
    "Alpha",
    "Beta",
    "Gamma",
    "Delta",
    "Epsilon",
    "Zeta",
    "Eta",
    "Theta",
    "Iota",
    "Kappa",
    "Lambda",
    "Mu",
    "Nu",
    "Xi",
    "Omicron",
    "Rho",
    "Sigma",
    "Tau",
    "Upsilon",
    "Phi",
    "Chi",
    "Psi",
    "Omega",
    "varepsilon",
    "vartheta",
    "varpi",
    "varrho",
    "varsigma",
    "varphi",
    "partial",
    "nabla",
    "int",
    "oint",
    "sum",
    "prod",
    "wedge",
    "vee",
    "cap",
    "cup",
    "therefore",
    "because",
    "Rightarrow",
    "rightarrow",
    "Leftarrow",
    "leftarrow",
    "Leftrightarrow",
    "leftrightarrow",
    "in",
    "ni",
    "subset",
    "supset",
    "subseteq",
    "supseteq",
    "equiv",
    "sim",
    "simeq",
    "approx",
    "propto",
    "cdot",
    "times",
    "otimes",
    "oplus",
    "quad",
    "qquad",
    "ldots",
    "cdots",
    "ddots",
    "forall",
    "exists",
    "neg",
    "infty",
    "hbar",
    "emptyset",
    "angle",
    "geqslant",
    "hat",
    "bar",
    "tilde",
    "vec",
    "dot",
    "ddot",
    "sqrt",
]
# Longest names first, so the regex alternation tries "sinh" before "sin".
# The list is sorted in place; the sort is stable, so equal-length names keep
# their declaration order.
IMPLICIT_MULTIPLICATION_TARGETS.sort(key=len, reverse=True)
TARGETS_PATTERN = "|".join(IMPLICIT_MULTIPLICATION_TARGETS)

# Guards used by the two "concatenated text" patterns below.  Without them the
# regex engine backtracks from a full target to a shorter target that happens
# to be its prefix, so "\int_0" was rewritten to "\in t_0", "\infty" to
# "\inf ty", "\sinh" to "\sin h", and so on.  The guards reject a position
# when the word starting there is itself a complete target.
_WHOLE_TARGET_CMD_GUARD = r"(?!(?:" + TARGETS_PATTERN + r")(?![a-zA-Z]))"
_WHOLE_TARGET_WORD_GUARD = r"(?!(?:" + TARGETS_PATTERN + r")\b)"


# Alignment-style environments; group 2 is the environment body.
PATTERN_ALIGN_ENV = re.compile(
    r"\\begin{(split|align|alignedat|alignat|eqnarray)\*?}(.+?)\\end{\1\*?}", re.S
)
# smallmatrix environment; group 2 is the environment body.
PATTERN_SMALLMATRIX = re.compile(r"\\begin{(smallmatrix)\*?}(.+?)\\end{\1\*?}", re.S)
# A backslash followed by exactly one letter or digit that is not the start of
# a longer alphabetic command name; group 1 is that character.
PATTERN_INVALID_SINGLE_CHAR_CMD = re.compile(r"\\([a-zA-Z0-9])(?![a-zA-Z])")
# A target command immediately followed by another command ("\sin\alpha").
# Group 1 is the target name, group 2 the backslash plus first letter of the
# following command.
PATTERN_LATEX_CMD_CONCAT_CMD = re.compile(
    r"\\(" + TARGETS_PATTERN + r")" + r"(\\[a-zA-Z])"
)
# A target command immediately followed by a letter ("\sinx").  Group 1 is the
# target name, group 2 the first following letter.  Commands whose complete
# name is a target ("\int", "\infty", "\sinh") are left untouched.
# NOTE(review): commands that are not targets but start with one (for example
# "\limits" or "\dots") are still split here; confirm whether an exclusion
# list is wanted.
PATTERN_LATEX_CMD_CONCAT_TEXT = re.compile(
    r"\\" + _WHOLE_TARGET_CMD_GUARD + r"(" + TARGETS_PATTERN + r")([a-zA-Z])"
)
# A bare (non-command) word that starts with a target name ("sinx").  Group 1
# is the target, group 2 the remainder of the word.  The lookbehind keeps this
# pattern away from backslash commands, which the two patterns above handle,
# and the guard leaves words that are themselves targets ("int", "sinh") alone.
PATTERN_NON_CMD_IMPLICIT_MULT = re.compile(
    r"(?<!\\)\b"
    + _WHOLE_TARGET_WORD_GUARD
    + r"("
    + TARGETS_PATTERN
    + r")([a-zA-Z][a-zA-Z0-9]*)\b"
)
|
|
# Operator names recognised inside "\operatorname {...}".
# The "|"-joined alternation is joined a second time, character by character,
# with an optional-whitespace token.  Every name therefore also matches when
# its letters are separated by single whitespace characters ("s i n"), and the
# "|" separators end up wrapped in the same optional whitespace.  tokenize()
# removes the spaces again when it rewrites the match.
# A raw string is required here: "\s" is not a valid string escape and a
# non-raw literal triggers a SyntaxWarning on current Python versions.
OPERATORS = r"\s?".join(
    "|".join(
        [
            "arccos",
            "arcsin",
            "arctan",
            "arg",
            "cos",
            "cosh",
            "cot",
            "coth",
            "csc",
            "deg",
            "det",
            "dim",
            "exp",
            "gcd",
            "hom",
            "inf",
            "injlim",
            "ker",
            "lg",
            "lim",
            "liminf",
            "limsup",
            "ln",
            "log",
            "max",
            "min",
            "Pr",
            "projlim",
            "sec",
            "sin",
            "sinh",
            "sup",
            "tan",
            "tanh",
        ]
    )
)
# Group 1 is the (possibly letter-spaced) operator name.
PATTERN_OPERATOR_NAME = re.compile(r"\\operatorname {(%s)}" % OPERATORS)
|
|
|
|
|
|
def tokenize(latex_code: str) -> Tuple[bool, str]:
    """Normalize a LaTeX formula through the bundled Node.js preprocessor.

    The input is first cleaned up in Python (newlines become spaces,
    alignment-style environments are rewritten to ``aligned``,
    ``smallmatrix`` to ``matrix``, stray single-character commands lose
    their backslash, and a space is inserted after a known operator/symbol
    name that runs straight into the following text).  The result is piped
    on stdin to ``tokenize_latex/preprocess_formula.js`` (located next to
    this file) with the ``normalize`` argument, and the script's stdout is
    post-processed.

    Args:
        latex_code: The LaTeX source of one formula.

    Returns:
        ``(True, normalized)`` on success, where ``normalized`` is the
        stripped script output with ``\\operatorname {...}`` wrappers around
        known operators collapsed to plain commands.  ``(True, "")`` for
        empty input.  ``(False, latex_code)`` (the untouched input) when
        ``node`` cannot be started or exits with a non-zero status; the
        error is reported on stderr.
    """
    if not latex_code:
        return True, ""

    formula_script = (
        Path(__file__).parent / "tokenize_latex" / "preprocess_formula.js"
    )

    # Order matters: each substitution works on the output of the previous one.
    prepre = latex_code.replace("\n", " ")
    for pattern, replacement in (
        (PATTERN_ALIGN_ENV, r"\\begin{aligned}\2\\end{aligned}"),
        (PATTERN_SMALLMATRIX, r"\\begin{matrix}\2\\end{matrix}"),
        (PATTERN_INVALID_SINGLE_CHAR_CMD, r"\1"),
        (PATTERN_LATEX_CMD_CONCAT_CMD, r"\\\1 \2"),
        (PATTERN_LATEX_CMD_CONCAT_TEXT, r"\\\1 \2"),
        (PATTERN_NON_CMD_IMPLICIT_MULT, r"\1 \2"),
    ):
        prepre = pattern.sub(replacement, prepre)

    try:
        proc = subprocess.run(
            ["node", str(formula_script), "normalize"],
            input=prepre,
            capture_output=True,
            text=True,
            check=True,
            encoding="utf-8",
        )
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        print(f"Error executing Node.js script (formula): {e}", file=sys.stderr)
        # Only CalledProcessError carries the child's captured stderr.
        if hasattr(e, "stderr"):
            print(f"Node.js stderr: {e.stderr}", file=sys.stderr)
        return False, latex_code

    # Collapse "\operatorname {s i n}" to "\sin" in a single pass: the callback
    # rebuilds each command from its own match instead of pre-collecting the
    # names and popping them off a list during a second scan.
    post = PATTERN_OPERATOR_NAME.sub(
        lambda match: "\\" + match.group(1).replace(" ", ""),
        proc.stdout,
    ).replace(r"\\ \end{array}", r"\end{array}")
    return True, post.strip()
|
|