Spaces:
Sleeping
Sleeping
| import marker | |
| import os | |
| import sys | |
| import gc | |
| import torch | |
| from marker.config.parser import ConfigParser | |
| from marker.models import create_model_dict | |
# Global variable to hold the pre-loaded converter
# Set exactly once by initialize_converter() at application startup;
# convert_pdf() raises RuntimeError if it is still None.
_converter = None
def _free_cuda_memory():
    """Release cached GPU memory, if CUDA is available.

    Runs the Python garbage collector *before* emptying the CUDA cache so
    that tensors freed by the collector are actually returned to the driver.
    No-op on CPU-only hosts.
    """
    if torch.cuda.is_available():
        gc.collect()
        torch.cuda.empty_cache()


def _print_cuda_memory(stage):
    """Print allocated/reserved CUDA memory for the given stage label."""
    if torch.cuda.is_available():
        print(f"CUDA memory {stage}: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated, {torch.cuda.memory_reserved()/1024**2:.2f} MB reserved")


def _configure_font_path():
    """Override marker's font path from the MARKER_FONT_PATH env var, if set.

    Best-effort: failures are logged and swallowed so a bad font setting
    never blocks model initialization.
    """
    font_path = os.environ.get('MARKER_FONT_PATH')
    if not font_path:
        return
    try:
        # Import marker settings and override font path
        from marker import settings
        os.makedirs(font_path, exist_ok=True)
        # NOTE(review): assumes marker will find (or download) the font file
        # at this exact path -- confirm against marker's settings module.
        custom_font_path = os.path.join(font_path, 'NotoSans-Regular.ttf')
        settings.FONT_PATH = custom_font_path
        print(f"Using custom font path: {custom_font_path}")
    except ImportError:
        print("Could not import marker settings, using default font path")
    except Exception as e:
        print(f"Error setting custom font path: {e}", file=sys.stderr)


def initialize_converter():
    """Initializes the marker converter models and stores it globally.

    Idempotent: once models are loaded, subsequent calls are no-ops.
    On failure, _converter is reset to None, GPU memory is cleaned up
    on a best-effort basis, and the original exception is re-raised.
    """
    global _converter
    if _converter is not None:
        print("Marker models already initialized.")
        return
    print("Initializing marker models...")
    try:
        # Clear any existing CUDA cache before loading models
        _free_cuda_memory()
        _print_cuda_memory("before initialization")
        # Set custom font path from environment variable if available
        _configure_font_path()
        # Create configuration, explicitly setting output format and batch multiplier
        config_parser = ConfigParser({
            'output_format': 'markdown',
            'batch_multiplier': 4,  # Increased from default 2
            # Add any device-specific configuration here
            'device': 'cuda' if torch.cuda.is_available() else 'cpu',
        })
        # Load models with explicit device mapping
        models = create_model_dict()
        # Get converter class and create converter
        converter_cls = config_parser.get_converter_cls()
        _converter = converter_cls(
            config=config_parser.generate_config_dict(),
            artifact_dict=models,
            processor_list=config_parser.get_processors(),
            renderer=config_parser.get_renderer(),
            llm_service=config_parser.get_llm_service(),
        )
        # Force another garbage collection after model load
        _free_cuda_memory()
        _print_cuda_memory("after initialization")
        print("Marker models initialized successfully with batch_multiplier=4.")
    except Exception as e:
        print(f"Failed to initialize marker models: {e}", file=sys.stderr)
        _converter = None  # Ensure it's None if init fails
        # Attempt to clean up GPU memory in case of initialization failure
        _free_cuda_memory()
        raise
def convert_pdf(pdf_input_path, output_md_path=None):
    """
    Convert PDF file to Markdown using the pre-loaded marker converter.

    Args:
        pdf_input_path (str): Path to the input PDF file
        output_md_path (str, optional): Path where to save the output Markdown file. If None, markdown is only returned.

    Returns:
        str: The markdown text

    Raises:
        FileNotFoundError: If pdf_input_path does not exist.
        RuntimeError: If initialize_converter() was never called (or failed).
        Exception: Re-raises any error from the underlying converter.
    """
    # Check if the input PDF exists
    if not os.path.exists(pdf_input_path):
        raise FileNotFoundError(f"Input PDF file not found at '{pdf_input_path}'")
    # Check if converter is initialized
    if _converter is None:
        raise RuntimeError("Marker converter has not been initialized. Call initialize_converter() during application startup.")
    print(f"Starting conversion of '{pdf_input_path}' using pre-loaded models...")
    try:
        # Free up any temporary memory before conversion
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        # Convert the PDF to markdown using the pre-loaded converter
        result = _converter(pdf_input_path)
        # Access the markdown content directly from the result object
        markdown_text = result.markdown
        # If output path is provided, save the markdown
        if output_md_path:
            output_dir = os.path.dirname(output_md_path)
            if output_dir:
                # exist_ok=True makes a pre-check redundant and avoids a
                # TOCTOU race between the exists() test and the mkdir
                os.makedirs(output_dir, exist_ok=True)
            with open(output_md_path, "w", encoding="utf-8") as f:
                f.write(markdown_text)
            print(f"Successfully saved markdown to '{output_md_path}'")
        return markdown_text
    except Exception as e:
        print(f"An error occurred during conversion: {e}", file=sys.stderr)
        print(f"Error details: {str(type(e))}", file=sys.stderr)
        raise
    finally:
        # Clean up temporary GPU memory whether conversion succeeded or failed
        if torch.cuda.is_available():
            torch.cuda.empty_cache()