| import re |
|
|
| |
| |
# Matches a colored opening display-math bracket at the very start of a
# string: optional whitespace, ``\color{#RRGGBB}`` (exactly six hex digits),
# then ``{\[}`` (whitespace allowed around the ``\[``), plus any whitespace
# that follows it.
PATTERN_STRIP_START_BRACKET = re.compile(r"^\s*\\color\{#[\da-fA-F]{6}\}\{\s*\\\[\s*\}\s*")

# Mirror image of the pattern above for the closing bracket: the same
# ``\color{#RRGGBB}{\]}`` construct, anchored at the very end of a string,
# together with the whitespace on either side of it.
PATTERN_STRIP_END_BRACKET = re.compile(r"\s*\\color\{#[\da-fA-F]{6}\}\{\s*\\\]\s*\}\s*$")
|
|
|
|
# Translation table used by full_to_half_width():
#   * U+3000 (ideographic space) maps to the ASCII space U+0020;
#   * the full-width forms U+FF01..U+FF5E map to U+0021..U+007E, which is a
#     constant offset of 0xFEE0 (65248).
_FULL_TO_HALF_TABLE = {0x3000: 0x20}
_FULL_TO_HALF_TABLE.update(
    (code_point, code_point - 0xFEE0) for code_point in range(0xFF01, 0xFF5F)
)


def full_to_half_width(s: str) -> str:
    """Convert full-width characters in *s* to their half-width equivalents.

    The ideographic space (U+3000) becomes an ASCII space, and every
    character in the range U+FF01..U+FF5E is shifted down by 0xFEE0 to the
    corresponding ASCII character. All other characters are left unchanged.

    Args:
        s: The text to normalize.

    Returns:
        A new string with the full-width characters replaced.
    """
    # A single translate() pass avoids building the result with repeated
    # string concatenation, which can degrade to quadratic time.
    return s.translate(_FULL_TO_HALF_TABLE)
|
|
|
|
def clean_latex_delimiters(latex_string: str) -> str:
    """Strip one pair of enclosing ``$$`` or ``$`` math delimiters.

    The input is first stripped of surrounding whitespace. If the result is
    wrapped in ``$$ ... $$`` the double delimiters are removed; otherwise, if
    it is wrapped in ``$ ... $`` the single delimiters are removed. The
    content left inside is stripped of surrounding whitespace again.

    A delimiter pair is only removed when the opening and closing delimiters
    are distinct characters of the string, so a lone ``"$"`` is returned
    unchanged instead of being treated as both the opener and the closer.

    Args:
        latex_string: The LaTeX text to clean.

    Returns:
        The text without its enclosing dollar delimiters, or the
        whitespace-stripped input when it is not enclosed in a pair.
    """
    s = latex_string.strip()
    # Try the longer delimiter first so "$$x$$" is not read as "$ $x$ $".
    for delimiter in ("$$", "$"):
        width = len(delimiter)
        # The length check guarantees the opening and closing delimiters do
        # not overlap (e.g. "$" or "$$$" must not match themselves twice).
        if (
            len(s) >= 2 * width
            and s.startswith(delimiter)
            and s.endswith(delimiter)
        ):
            return s[width:-width].strip()
    return s
|
|
|
|
def clean(s: str) -> str:
    """Preprocess a string: normalize width, rewrite brackets, drop delimiters.

    Steps, in order:
      1. Convert full-width characters to half-width via
         ``full_to_half_width``.
      2. Replace the markers ``{/[}`` and ``{/]}`` with `` \\[`` and `` \\]``
         (each replacement starts with a space).
      3. Remove enclosing dollar delimiters via ``clean_latex_delimiters``.

    Args:
        s: The raw text to preprocess.

    Returns:
        The preprocessed text.
    """
    normalized = full_to_half_width(s)
    # Applied in this order, exactly like the original chained replaces.
    bracket_rewrites = (
        ("{/[}", r" \["),
        ("{/]}", r" \]"),
    )
    for marker, replacement in bracket_rewrites:
        normalized = normalized.replace(marker, replacement)
    return clean_latex_delimiters(normalized)
|
|