Spaces:
Sleeping
Sleeping
| import marker | |
| import os | |
| import sys | |
| import gc | |
| import torch | |
| from marker.config.parser import ConfigParser | |
| from marker.models import create_model_dict | |
# Global variable to hold the pre-loaded converter
# Set exactly once by initialize_converter() at application startup;
# convert_pdf() raises RuntimeError if it is still None.
_converter = None
def _free_cuda_memory():
    """Release cached GPU memory, if CUDA is available.

    Runs the Python garbage collector *before* emptying the CUDA cache so
    that tensors freed by the collector are actually returned to the driver.
    No-op on CPU-only hosts.
    """
    if torch.cuda.is_available():
        gc.collect()
        torch.cuda.empty_cache()


def _print_cuda_memory(stage):
    """Print allocated/reserved CUDA memory for the given stage label."""
    if torch.cuda.is_available():
        print(f"CUDA memory {stage}: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated, {torch.cuda.memory_reserved()/1024**2:.2f} MB reserved")


def _configure_font_path():
    """Override marker's font path from the MARKER_FONT_PATH env var, if set.

    Best-effort: failures are logged and swallowed so a bad font setting
    never blocks model initialization.
    """
    font_path = os.environ.get('MARKER_FONT_PATH')
    if not font_path:
        return
    try:
        # Import marker settings and override font path
        from marker import settings
        os.makedirs(font_path, exist_ok=True)
        # NOTE(review): assumes marker will find (or download) the font file
        # at this exact path -- confirm against marker's settings module.
        custom_font_path = os.path.join(font_path, 'NotoSans-Regular.ttf')
        settings.FONT_PATH = custom_font_path
        print(f"Using custom font path: {custom_font_path}")
    except ImportError:
        print("Could not import marker settings, using default font path")
    except Exception as e:
        print(f"Error setting custom font path: {e}", file=sys.stderr)


def initialize_converter():
    """Initializes the marker converter models and stores it globally.

    Idempotent: once models are loaded, subsequent calls are no-ops.
    On failure, _converter is reset to None, GPU memory is cleaned up
    on a best-effort basis, and the original exception is re-raised.
    """
    global _converter
    if _converter is not None:
        print("Marker models already initialized.")
        return
    print("Initializing marker models...")
    try:
        # Clear any existing CUDA cache before loading models
        _free_cuda_memory()
        _print_cuda_memory("before initialization")
        # Set custom font path from environment variable if available
        _configure_font_path()
        # Create configuration, explicitly setting output format and batch multiplier
        config_parser = ConfigParser({
            'output_format': 'markdown',
            'batch_multiplier': 4,  # Increased from default 2
            # Add any device-specific configuration here
            'device': 'cuda' if torch.cuda.is_available() else 'cpu',
        })
        # Load models with explicit device mapping
        models = create_model_dict()
        # Get converter class and create converter
        converter_cls = config_parser.get_converter_cls()
        _converter = converter_cls(
            config=config_parser.generate_config_dict(),
            artifact_dict=models,
            processor_list=config_parser.get_processors(),
            renderer=config_parser.get_renderer(),
            llm_service=config_parser.get_llm_service(),
        )
        # Force another garbage collection after model load
        _free_cuda_memory()
        _print_cuda_memory("after initialization")
        print("Marker models initialized successfully with batch_multiplier=4.")
    except Exception as e:
        print(f"Failed to initialize marker models: {e}", file=sys.stderr)
        _converter = None  # Ensure it's None if init fails
        # Attempt to clean up GPU memory in case of initialization failure
        _free_cuda_memory()
        raise
def convert_pdf(pdf_input_path, output_md_path=None):
    """
    Convert PDF file to Markdown using the pre-loaded marker converter.

    Args:
        pdf_input_path (str): Path to the input PDF file
        output_md_path (str, optional): Path where to save the output Markdown file. If None, markdown is only returned.

    Returns:
        str: The markdown text

    Raises:
        FileNotFoundError: If pdf_input_path does not exist.
        RuntimeError: If initialize_converter() was never called (or failed).
        Exception: Re-raises any error from the underlying converter.
    """
    # Check if the input PDF exists
    if not os.path.exists(pdf_input_path):
        raise FileNotFoundError(f"Input PDF file not found at '{pdf_input_path}'")
    # Check if converter is initialized
    if _converter is None:
        raise RuntimeError("Marker converter has not been initialized. Call initialize_converter() during application startup.")
    print(f"Starting conversion of '{pdf_input_path}' using pre-loaded models...")
    try:
        # Free up any temporary memory before conversion
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        # Convert the PDF to markdown using the pre-loaded converter
        result = _converter(pdf_input_path)
        # Access the markdown content directly from the result object
        markdown_text = result.markdown
        # If output path is provided, save the markdown
        if output_md_path:
            output_dir = os.path.dirname(output_md_path)
            if output_dir:
                # exist_ok=True makes a pre-check redundant and avoids a
                # TOCTOU race between the exists() test and the mkdir
                os.makedirs(output_dir, exist_ok=True)
            with open(output_md_path, "w", encoding="utf-8") as f:
                f.write(markdown_text)
            print(f"Successfully saved markdown to '{output_md_path}'")
        return markdown_text
    except Exception as e:
        print(f"An error occurred during conversion: {e}", file=sys.stderr)
        print(f"Error details: {str(type(e))}", file=sys.stderr)
        raise
    finally:
        # Clean up temporary GPU memory whether conversion succeeded or failed
        if torch.cuda.is_available():
            torch.cuda.empty_cache()