Spaces:
Running
Running
| import re | |
| from ftfy import fix_text | |
def contains_math(text):
    """Return True when *text* begins or ends with a ``$`` character.

    Only the first and last characters are inspected; a ``$`` that appears
    solely in the middle of the string does not count.
    """
    # Slicing (rather than indexing) keeps the empty string safe.
    first_char = text[:1]
    last_char = text[-1:]
    return first_char == "$" or last_char == "$"
def fix_math(text):
    """Run *text* through the full clean-up pipeline and return the result.

    The steps are applied in this order, each receiving the previous output:

    1. ``fix_text`` (imported from ftfy) -- general text repair
       (presumably encoding/Unicode glitches; see the ftfy docs).
    2. ``remove_labels`` -- drop ``\\label``, ``\\ref`` and ``\\pageref``.
    3. ``replace_katex_invalid`` -- rewrite constructs KaTeX cannot render.
    4. ``fix_fences`` -- balance the surrounding ``$`` / ``$$`` fences.
    """
    pipeline = (fix_text, remove_labels, replace_katex_invalid, fix_fences)
    for stage in pipeline:
        text = stage(text)
    return text
def remove_labels(text):
    """Delete ``\\label{...}``, ``\\ref{...}`` and ``\\pageref{...}`` from *text*.

    Each command is removed together with its brace argument; the argument
    may be empty but may not itself contain a ``}``.  Nothing is inserted in
    place of a removed command, so surrounding whitespace is left as it was.
    """
    # Applied one after another, in this order, so that the result of an
    # earlier removal is what the next pattern sees.
    command_patterns = (
        r'\\label\{[^}]*\}',
        r'\\ref\{[^}]*\}',
        r'\\pageref\{[^}]*\}',
    )
    for command_pattern in command_patterns:
        text = re.sub(command_pattern, '', text)
    return text
def replace_katex_invalid(string):
    """Rewrite LaTeX constructs that KaTeX cannot render.

    The following rewrites are applied in order:

    * ``\\tag{...}`` is removed entirely.
    * ``\\big{...}``, ``\\bigg{...}``, ``\\Big{...}`` and ``\\Bigg{...}`` are
      replaced by their brace contents.
    * ``\\quad\\mbox{...}`` and then any remaining ``\\mbox{...}`` are
      replaced by their brace contents.
    * Finally, single ``$`` characters inside ``$$ ... $$`` blocks are
      stripped via ``remove_inner_dollars``.

    The brace matches are non-greedy and do not span newlines.
    """
    # (pattern, replacement) pairs; order matters because the
    # ``\quad\mbox`` form must be handled before the bare ``\mbox`` form.
    rewrites = (
        (r'\\tag\{.*?\}', ''),
        (r'\\(?:Bigg?|bigg?)\{(.*?)\}', r'\1'),
        (r'\\quad\\mbox\{(.*?)\}', r'\1'),
        (r'\\mbox\{(.*?)\}', r'\1'),
    )
    for pattern, replacement in rewrites:
        string = re.sub(pattern, replacement, string)
    return remove_inner_dollars(string)
def remove_inner_dollars(text):
    """Strip stray ``$`` characters from inside ``$$ ... $$`` blocks.

    Every non-greedy ``$$ ... $$`` span (which may cross newlines) is
    rebuilt with the same ``$$`` fences but with all ``$`` characters
    removed from its interior.  Text outside such spans is left untouched.
    """
    return re.sub(
        r'\$\$(.*?)\$\$',
        # Keep the outer fences, drop every dollar sign between them.
        lambda block: '$$' + block.group(1).replace('$', '') + '$$',
        text,
        flags=re.DOTALL,
    )
def extract_latex_with_positions(text):
    """Locate every ``$$ ... $$`` or ``$ ... $`` span in *text*.

    Returns a list of ``(matched_text, start, end)`` tuples in order of
    appearance, where ``start``/``end`` are the match offsets into *text*.
    Double-dollar spans are tried before single-dollar ones, matching is
    non-greedy, and a span may cross newlines.
    """
    fence_re = re.compile(r'(\$\$.*?\$\$|\$.*?\$)', re.DOTALL)
    return [
        (found.group(), found.start(), found.end())
        for found in fence_re.finditer(text)
    ]
def slice_latex(text):
    """Split *text* into an ordered list of plain-text and LaTeX chunks.

    Each chunk is a dict ``{"text": <substring>, "type": "text" | "latex"}``.
    LaTeX chunks are the spans reported by ``extract_latex_with_positions``;
    the stretches between, before and after them become text chunks.  Empty
    text stretches are not emitted.
    """
    pieces = []
    cursor = 0  # offset just past the last LaTeX span consumed so far
    for latex, begin, finish in extract_latex_with_positions(text):
        # Emit whatever plain text sits between the previous span and this one.
        if begin > cursor:
            pieces.append({"text": text[cursor:begin], "type": "text"})
        pieces.append({"text": latex, "type": "latex"})
        cursor = finish
    # Trailing plain text after the final LaTeX span (or the whole input
    # when no span was found).
    if cursor < len(text):
        pieces.append({"text": text[cursor:], "type": "text"})
    return pieces
def is_latex(text):
    """Return True when *text* contains anything that looks like LaTeX.

    The check succeeds if any of the following occurs anywhere in *text*:
    a ``\\begin{...}`` / ``\\end{...}`` environment marker, a ``$ ... $`` or
    ``$$ ... $$`` span (possibly across newlines), or a backslash followed
    by at least one further character.
    """
    markers = (
        r'\\(?:begin|end)\{[a-zA-Z]*\}',
        r'\$.*?\$',
        r'\$\$.*?\$\$',
        r'\\[a-zA-Z]+',
        r'\\[^a-zA-Z]',
    )
    return re.search('|'.join(markers), text, re.DOTALL) is not None
def fix_fences(text):
    """Balance the ``$`` fences at the two ends of *text*.

    When exactly one end carries a fence (``$`` or ``$$``), the missing
    dollars are added so the result is wrapped in ``$$ ... $$``.  Text that
    is already fenced on both ends (``$...$`` or ``$$...$$``) and text with
    no fence at either end is returned unchanged.
    """
    opens_display = text.startswith("$$")
    closes_display = text.endswith("$$")

    # "$$..." without a closing "$$": top up the tail to two dollars.
    if opens_display and not closes_display:
        return text + ("$" if text.endswith("$") else "$$")

    # "...$$" without an opening "$$": top up the head to two dollars.
    if closes_display and not opens_display:
        return ("$" if text.startswith("$") else "$$") + text

    opens_inline = text.startswith("$")
    closes_inline = text.endswith("$")

    # A lone "$" on one side only: promote to a full "$$ ... $$" block.
    if opens_inline and not closes_inline:
        return "$" + text + "$$"
    if closes_inline and not opens_inline:
        return "$$" + text + "$"

    return text
def strip_fences(text):
    """Return *text* with every leading and trailing ``$`` removed.

    Dollar signs in the interior of the string are kept.  A string made up
    only of ``$`` characters (or an empty string) yields ``""``.
    """
    # str.strip does this in one pass; peeling one character per loop
    # iteration re-copied the string each time.
    return text.strip("$")