Spaces:
Runtime error
Runtime error
Approach #2 -- converting LaTeX output from GOT-OCR to Markdown
Browse files
- app.py +2 -1
- requirements.txt +0 -1
- setup.sh +1 -1
- src/core/converter.py +36 -0
- src/core/latex_to_markdown_converter.py +67 -0
- src/parsers/gemini_flash_parser.py +2 -1
- src/parsers/got_ocr_parser.py +2 -2
app.py
CHANGED
|
@@ -77,9 +77,10 @@ gemini_api_key = os.getenv("GOOGLE_API_KEY")
|
|
| 77 |
|
| 78 |
# Check if API key is available and print a message if not
|
| 79 |
if not gemini_api_key:
|
| 80 |
-
print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser may not work.")
|
| 81 |
else:
|
| 82 |
print(f"Found Gemini API key: {gemini_api_key[:5]}...{gemini_api_key[-5:] if len(gemini_api_key) > 10 else ''}")
|
|
|
|
| 83 |
|
| 84 |
# Add the current directory to the Python path
|
| 85 |
sys.path.append(current_dir)
|
|
|
|
| 77 |
|
| 78 |
# Check if API key is available and print a message if not
|
| 79 |
if not gemini_api_key:
|
| 80 |
+
print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser and LaTeX to Markdown conversion may not work.")
|
| 81 |
else:
|
| 82 |
print(f"Found Gemini API key: {gemini_api_key[:5]}...{gemini_api_key[-5:] if len(gemini_api_key) > 10 else ''}")
|
| 83 |
+
print("Gemini API will be used for LaTeX to Markdown conversion when using GOT-OCR with Formatted Text mode")
|
| 84 |
|
| 85 |
# Add the current directory to the Python path
|
| 86 |
sys.path.append(current_dir)
|
requirements.txt
CHANGED
|
@@ -13,7 +13,6 @@ opencv-python # Match exact dependency from GOT-OCR
|
|
| 13 |
# Utility dependencies
|
| 14 |
python-dotenv>=1.0.0
|
| 15 |
pydantic==2.7.1
|
| 16 |
-
latex2markdown>=0.1.0 # For LaTeX to Markdown conversion
|
| 17 |
|
| 18 |
# Gemini API client
|
| 19 |
google-genai>=0.1.0
|
|
|
|
| 13 |
# Utility dependencies
|
| 14 |
python-dotenv>=1.0.0
|
| 15 |
pydantic==2.7.1
|
|
|
|
| 16 |
|
| 17 |
# Gemini API client
|
| 18 |
google-genai>=0.1.0
|
setup.sh
CHANGED
|
@@ -29,7 +29,7 @@ echo "NumPy installed successfully"
|
|
| 29 |
echo "Installing Python dependencies..."
|
| 30 |
pip install -q -U pillow opencv-python
|
| 31 |
pip install -q -U google-genai
|
| 32 |
-
pip install -q -U latex2markdown
|
| 33 |
echo "Python dependencies installed successfully"
|
| 34 |
|
| 35 |
# Install GOT-OCR transformers dependencies
|
|
|
|
| 29 |
echo "Installing Python dependencies..."
|
| 30 |
pip install -q -U pillow opencv-python
|
| 31 |
pip install -q -U google-genai
|
| 32 |
+
# pip install -q -U latex2markdown - removed, now using Gemini API for LaTeX conversion
|
| 33 |
echo "Python dependencies installed successfully"
|
| 34 |
|
| 35 |
# Install GOT-OCR transformers dependencies
|
src/core/converter.py
CHANGED
|
@@ -10,6 +10,14 @@ from src.core.parser_factory import ParserFactory
|
|
| 10 |
# Import all parsers to ensure they're registered
|
| 11 |
from src import parsers
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
# Reference to the cancellation flag from ui.py
|
| 14 |
# This will be set by the UI when the cancel button is clicked
|
| 15 |
conversion_cancelled = None # Will be a threading.Event object
|
|
@@ -133,6 +141,34 @@ def convert_file(file_path, parser_name, ocr_method_name, output_format):
|
|
| 133 |
safe_delete_file(temp_input)
|
| 134 |
return "Conversion cancelled.", None
|
| 135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
except Exception as e:
|
| 137 |
safe_delete_file(temp_input)
|
| 138 |
return f"Error: {e}", None
|
|
|
|
| 10 |
# Import all parsers to ensure they're registered
|
| 11 |
from src import parsers
|
| 12 |
|
| 13 |
+
# Import the LaTeX to Markdown converter
|
| 14 |
+
try:
|
| 15 |
+
from src.core.latex_to_markdown_converter import convert_latex_to_markdown
|
| 16 |
+
HAS_GEMINI_CONVERTER = True
|
| 17 |
+
except ImportError:
|
| 18 |
+
HAS_GEMINI_CONVERTER = False
|
| 19 |
+
logging.warning("LaTeX to Markdown converter not available. Raw LaTeX will be returned for formatted text.")
|
| 20 |
+
|
| 21 |
# Reference to the cancellation flag from ui.py
|
| 22 |
# This will be set by the UI when the cancel button is clicked
|
| 23 |
conversion_cancelled = None # Will be a threading.Event object
|
|
|
|
| 141 |
safe_delete_file(temp_input)
|
| 142 |
return "Conversion cancelled.", None
|
| 143 |
|
| 144 |
+
# Process LaTeX content for GOT-OCR formatted text
|
| 145 |
+
if parser_name == "GOT-OCR (jpg,png only)" and ocr_method_name == "Formatted Text" and HAS_GEMINI_CONVERTER:
|
| 146 |
+
logging.info("Converting LaTeX output to Markdown using Gemini API")
|
| 147 |
+
start_convert = time.time()
|
| 148 |
+
|
| 149 |
+
# Check for cancellation before conversion
|
| 150 |
+
if check_cancellation():
|
| 151 |
+
logging.info("Cancellation detected before LaTeX conversion")
|
| 152 |
+
safe_delete_file(temp_input)
|
| 153 |
+
return "Conversion cancelled.", None
|
| 154 |
+
|
| 155 |
+
try:
|
| 156 |
+
markdown_content = convert_latex_to_markdown(content)
|
| 157 |
+
if markdown_content:
|
| 158 |
+
content = markdown_content
|
| 159 |
+
logging.info(f"LaTeX conversion completed in {time.time() - start_convert:.2f} seconds")
|
| 160 |
+
else:
|
| 161 |
+
logging.warning("LaTeX to Markdown conversion failed, using raw LaTeX output")
|
| 162 |
+
except Exception as e:
|
| 163 |
+
logging.error(f"Error converting LaTeX to Markdown: {str(e)}")
|
| 164 |
+
# Continue with the original content on error
|
| 165 |
+
|
| 166 |
+
# Check for cancellation after conversion
|
| 167 |
+
if check_cancellation():
|
| 168 |
+
logging.info("Cancellation detected after LaTeX conversion")
|
| 169 |
+
safe_delete_file(temp_input)
|
| 170 |
+
return "Conversion cancelled.", None
|
| 171 |
+
|
| 172 |
except Exception as e:
|
| 173 |
safe_delete_file(temp_input)
|
| 174 |
return f"Error: {e}", None
|
src/core/latex_to_markdown_converter.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
from typing import Optional
|
| 4 |
+
from google import genai
|
| 5 |
+
|
| 6 |
+
# Configure logging
|
| 7 |
+
logger = logging.getLogger(__name__)
|
| 8 |
+
logger.setLevel(logging.DEBUG)
|
| 9 |
+
|
| 10 |
+
# Load API key from environment variable
|
| 11 |
+
api_key = os.getenv("GOOGLE_API_KEY")
|
| 12 |
+
|
| 13 |
+
# Check if API key is available
|
| 14 |
+
if not api_key:
|
| 15 |
+
logger.warning("GOOGLE_API_KEY environment variable not found. LaTeX to Markdown conversion may not work.")
|
| 16 |
+
|
| 17 |
+
def convert_latex_to_markdown(latex_content: str) -> Optional[str]:
    """
    Convert LaTeX content to Markdown using the Gemini API.

    Args:
        latex_content: The LaTeX content to convert.

    Returns:
        The converted Markdown text, or None when the conversion could not
        be performed: the API key is missing, the input is blank, the
        request raised, or the model returned no text.
    """
    if not api_key:
        logger.error("GOOGLE_API_KEY environment variable not set")
        return None

    # Blank input has nothing to convert; avoid a pointless network call.
    if not latex_content or not latex_content.strip():
        logger.warning("Empty LaTeX content received; skipping Markdown conversion")
        return None

    # Instruction sent ahead of the LaTeX payload.
    prompt = """
    Convert this LaTeX content to clean, well-formatted Markdown.
    Preserve all tables, lists, and formatting.
    For tables, use standard Markdown table syntax.
    For mathematical expressions, use $ for inline and $$ for display math.
    Keep the structure and hierarchy of the content. Return only the markdown content, no other text.
    """

    try:
        # Create a client
        client = genai.Client(api_key=api_key)

        # Generate the response
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[
                prompt,
                latex_content,
            ],
            config={
                "temperature": 0.1,
                "top_p": 0.95,
                "top_k": 40,
                "max_output_tokens": 8192,
            },
        )

        # Extract the markdown text from the response
        markdown_text = response.text
    except Exception as e:
        # logger.exception keeps the traceback, which str(e) alone discards.
        logger.exception("Error converting LaTeX to Markdown: %s", e)
        return None

    # The response may carry no text; do not report that as a successful
    # conversion.
    if not markdown_text or not markdown_text.strip():
        logger.warning("Gemini returned an empty response for LaTeX to Markdown conversion")
        return None

    logger.info("Successfully converted LaTeX to Markdown")
    return markdown_text
|
src/parsers/gemini_flash_parser.py
CHANGED
|
@@ -79,6 +79,7 @@ class GeminiFlashParser(DocumentParser):
|
|
| 79 |
Convert this document to markdown format.
|
| 80 |
Preserve the structure, headings, lists, tables, and formatting as much as possible.
|
| 81 |
For images, include a brief description in markdown image syntax.
|
|
|
|
| 82 |
"""
|
| 83 |
|
| 84 |
# Generate the response
|
|
@@ -92,7 +93,7 @@ class GeminiFlashParser(DocumentParser):
|
|
| 92 |
)
|
| 93 |
],
|
| 94 |
config={
|
| 95 |
-
"temperature": 0.
|
| 96 |
"top_p": 0.95,
|
| 97 |
"top_k": 40,
|
| 98 |
"max_output_tokens": 8192,
|
|
|
|
| 79 |
Convert this document to markdown format.
|
| 80 |
Preserve the structure, headings, lists, tables, and formatting as much as possible.
|
| 81 |
For images, include a brief description in markdown image syntax.
|
| 82 |
+
Return only the markdown content, no other text.
|
| 83 |
"""
|
| 84 |
|
| 85 |
# Generate the response
|
|
|
|
| 93 |
)
|
| 94 |
],
|
| 95 |
config={
|
| 96 |
+
"temperature": 0.1,
|
| 97 |
"top_p": 0.95,
|
| 98 |
"top_k": 40,
|
| 99 |
"max_output_tokens": 8192,
|
src/parsers/got_ocr_parser.py
CHANGED
|
@@ -17,8 +17,8 @@ import copy
|
|
| 17 |
from src.parsers.parser_interface import DocumentParser
|
| 18 |
from src.parsers.parser_registry import ParserRegistry
|
| 19 |
|
| 20 |
-
# Import latex2markdown for conversion
|
| 21 |
-
import latex2markdown
|
| 22 |
|
| 23 |
# Configure logging
|
| 24 |
logger = logging.getLogger(__name__)
|
|
|
|
| 17 |
from src.parsers.parser_interface import DocumentParser
|
| 18 |
from src.parsers.parser_registry import ParserRegistry
|
| 19 |
|
| 20 |
+
# Import latex2markdown for conversion - No longer needed, using Gemini API
|
| 21 |
+
# import latex2markdown
|
| 22 |
|
| 23 |
# Configure logging
|
| 24 |
logger = logging.getLogger(__name__)
|