Spaces:
Runtime error
Runtime error
| import os | |
| import sys | |
| import json | |
| import re | |
| from PIL import Image | |
| from dots_ocr.utils.image_utils import PILimage_to_base64 | |
| def has_latex_markdown(text: str) -> bool: | |
| """ | |
| Checks if a string contains LaTeX markdown patterns. | |
| Args: | |
| text (str): The string to check. | |
| Returns: | |
| bool: True if LaTeX markdown is found, otherwise False. | |
| """ | |
| if not isinstance(text, str): | |
| return False | |
| # Define regular expression patterns for LaTeX markdown | |
| latex_patterns = [ | |
| r'\$\$.*?\$\$', # Block-level math formula $$...$$ | |
| r'\$[^$\n]+?\$', # Inline math formula $...$ | |
| r'\\begin\{.*?\}.*?\\end\{.*?\}', # LaTeX environment \begin{...}...\end{...} | |
| r'\\[a-zA-Z]+\{.*?\}', # LaTeX command \command{...} | |
| r'\\[a-zA-Z]+', # Simple LaTeX command \command | |
| r'\\\[.*?\\\]', # Display math formula \[...\] | |
| r'\\\(.*?\\\)', # Inline math formula \(...\) | |
| ] | |
| # Check if any of the patterns match | |
| for pattern in latex_patterns: | |
| if re.search(pattern, text, re.DOTALL): | |
| return True | |
| return False | |
| def clean_latex_preamble(latex_text: str) -> str: | |
| """ | |
| Removes LaTeX preamble commands like document class and package imports. | |
| Args: | |
| latex_text (str): The original LaTeX text. | |
| Returns: | |
| str: The cleaned LaTeX text without preamble commands. | |
| """ | |
| # Define patterns to be removed | |
| patterns = [ | |
| r'\\documentclass\{[^}]+\}', # \documentclass{...} | |
| r'\\usepackage\{[^}]+\}', # \usepackage{...} | |
| r'\\usepackage\[[^\]]*\]\{[^}]+\}', # \usepackage[options]{...} | |
| r'\\begin\{document\}', # \begin{document} | |
| r'\\end\{document\}', # \end{document} | |
| ] | |
| # Apply each pattern to clean the text | |
| cleaned_text = latex_text | |
| for pattern in patterns: | |
| cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE) | |
| return cleaned_text | |
| def get_formula_in_markdown(text: str) -> str: | |
| """ | |
| Formats a string containing a formula into a standard Markdown block. | |
| Args: | |
| text (str): The input string, potentially containing a formula. | |
| Returns: | |
| str: The formatted string, ready for Markdown rendering. | |
| """ | |
| # Remove leading/trailing whitespace | |
| text = text.strip() | |
| # Check if it's already enclosed in $$ | |
| if text.startswith('$$') and text.endswith('$$'): | |
| text_new = text[2:-2].strip() | |
| if not '$' in text_new: | |
| return f"$$\n{text_new}\n$$" | |
| else: | |
| return text | |
| # Handle \[...\] format, convert to $$...$$ | |
| if text.startswith('\\[') and text.endswith('\\]'): | |
| inner_content = text[2:-2].strip() | |
| return f"$$\n{inner_content}\n$$" | |
| # Check if it's enclosed in \[ \] | |
| if len(re.findall(r'.*\\\[.*\\\].*', text)) > 0: | |
| return text | |
| # Handle inline formulas ($...$) | |
| pattern = r'\$([^$]+)\$' | |
| matches = re.findall(pattern, text) | |
| if len(matches) > 0: | |
| # It's an inline formula, return it as is | |
| return text | |
| # If no LaTeX markdown syntax is present, return directly | |
| if not has_latex_markdown(text): | |
| return text | |
| # Handle unnecessary LaTeX formatting like \usepackage | |
| if 'usepackage' in text: | |
| text = clean_latex_preamble(text) | |
| if text[0] == '`' and text[-1] == '`': | |
| text = text[1:-1] | |
| # Enclose the final text in a $$ block with newlines | |
| text = f"$$\n{text}\n$$" | |
| return text | |
| def clean_text(text: str) -> str: | |
| """ | |
| Cleans text by removing extra whitespace. | |
| Args: | |
| text: The original text. | |
| Returns: | |
| str: The cleaned text. | |
| """ | |
| if not text: | |
| return "" | |
| # Remove leading and trailing whitespace | |
| text = text.strip() | |
| # Replace multiple consecutive whitespace characters with a single space | |
| if text[:2] == '`$' and text[-2:] == '$`': | |
| text = text[1:-1] | |
| return text | |
| def layoutjson2md(image: Image.Image, cells: list, text_key: str = 'text', no_page_hf: bool = False) -> str: | |
| """ | |
| Converts a layout JSON format to Markdown. | |
| In the layout JSON, formulas are LaTeX, tables are HTML, and text is Markdown. | |
| Args: | |
| image: A PIL Image object. | |
| cells: A list of dictionaries, each representing a layout cell. | |
| text_key: The key for the text field in the cell dictionary. | |
| no_page_header_footer: If True, skips page headers and footers. | |
| Returns: | |
| str: The text in Markdown format. | |
| """ | |
| text_items = [] | |
| for i, cell in enumerate(cells): | |
| x1, y1, x2, y2 = [int(coord) for coord in cell['bbox']] | |
| text = cell.get(text_key, "") | |
| if no_page_hf and cell['category'] in ['Page-header', 'Page-footer']: | |
| continue | |
| if cell['category'] == 'Picture': | |
| image_crop = image.crop((x1, y1, x2, y2)) | |
| image_base64 = PILimage_to_base64(image_crop) | |
| text_items.append(f"") | |
| elif cell['category'] == 'Formula': | |
| text_items.append(get_formula_in_markdown(text)) | |
| else: | |
| text = clean_text(text) | |
| text_items.append(f"{text}") | |
| markdown_text = '\n\n'.join(text_items) | |
| return markdown_text | |
| def fix_streamlit_formulas(md: str) -> str: | |
| """ | |
| Fixes the format of formulas in Markdown to ensure they display correctly in Streamlit. | |
| It adds a newline after the opening $$ and before the closing $$ if they don't already exist. | |
| Args: | |
| md_text (str): The Markdown text to fix. | |
| Returns: | |
| str: The fixed Markdown text. | |
| """ | |
| # This inner function will be used by re.sub to perform the replacement | |
| def replace_formula(match): | |
| content = match.group(1) | |
| # If the content already has surrounding newlines, don't add more. | |
| if content.startswith('\n'): | |
| content = content[1:] | |
| if content.endswith('\n'): | |
| content = content[:-1] | |
| return f'$$\n{content}\n$$' | |
| # Use regex to find all $$....$$ patterns and replace them using the helper function. | |
| return re.sub(r'\$\$(.*?)\$\$', replace_formula, md, flags=re.DOTALL) | |