Spaces:

Ansemin101
/

Markit_v2

Runtime error

App Files Files Community

Markit_v2 / src /parsers /markitdown_parser.py

AnseMin

Integrate Gemini API for enhanced image processing in MarkItDown

033e4ba 6 months ago

raw

history blame contribute delete

8.38 kB

	import logging
	import os
	import threading
	import time
	from pathlib import Path
	from typing import Dict, List, Optional, Any, Union, Set
	import io

	# Import the parser interface and registry
	from src.parsers.parser_interface import DocumentParser
	from src.parsers.parser_registry import ParserRegistry
	from src.core.exceptions import DocumentProcessingError, ParserError

	# Module-level logger. It is created before the optional imports below so
	# that a missing package is reported through this logger; the module-level
	# logging.warning() helper would implicitly call logging.basicConfig() and
	# configure the root logger of whatever application imports this module.
	logger = logging.getLogger(__name__)
	logger.setLevel(logging.DEBUG)

	# Check for MarkItDown availability. HAS_MARKITDOWN records whether the
	# optional `markitdown` package could be imported.
	try:
	    from markitdown import MarkItDown
	except ImportError:
	    HAS_MARKITDOWN = False
	    logger.warning("MarkItDown package not installed. Please install with 'pip install markitdown[all]'")
	else:
	    HAS_MARKITDOWN = True

	# Import our Gemini wrapper for LLM support. HAS_GEMINI_WRAPPER records
	# whether the project-local wrapper module could be imported.
	try:
	    from src.core.gemini_client_wrapper import create_gemini_client_for_markitdown
	except ImportError:
	    HAS_GEMINI_WRAPPER = False
	else:
	    HAS_GEMINI_WRAPPER = True

	class MarkItDownParser(DocumentParser):
	    """Parser implementation using MarkItDown to convert files to Markdown.

	    Image files are converted with a Gemini-backed MarkItDown instance when
	    the Gemini wrapper was importable and returns a client. Every other file,
	    and every image for which the Gemini attempt fails, is converted with the
	    plain instance created in ``__init__``.
	    """

	    # Extensions routed through the Gemini-assisted image path in ``parse``.
	    _IMAGE_EXTENSIONS = frozenset({".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"})
	    # Model name passed to MarkItDown together with the Gemini client.
	    _LLM_MODEL = "gemini-2.5-flash"
	    # Seconds between cancellation checks while the worker thread is running.
	    _CANCEL_POLL_INTERVAL = 0.1

	    def __init__(self):
	        """Create the parser and, when possible, a plain MarkItDown instance.

	        ``markdown_instance`` stays ``None`` when the package is missing or
	        its constructor raises; ``parse`` then raises ``ParserError``.
	        """
	        super().__init__()  # Initialize the base class (including _cancellation_flag)
	        self.markdown_instance = None
	        if not HAS_MARKITDOWN:
	            return
	        try:
	            # Initialize MarkItDown without LLM client for better performance;
	            # an LLM client is only attached for image files when needed.
	            self.markdown_instance = MarkItDown()
	            logger.info("MarkItDown initialized successfully")
	        except Exception as e:
	            # Best effort: log and leave the instance unset instead of raising.
	            logger.error("Error initializing MarkItDown: %s", e)
	            self.markdown_instance = None

	    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
	        """Parse a document and return its content as Markdown.

	        The conversion runs in a daemon thread while this method polls
	        ``self._check_cancellation()``. On cancellation the worker is not
	        interrupted; this method waits briefly for it and then raises.

	        Args:
	            file_path: Path to the document.
	            ocr_method: OCR method to use (not used in this parser).
	            **kwargs: Additional options; not read by this method.

	        Returns:
	            str: Markdown representation of the document.

	        Raises:
	            ParserError: If MarkItDown is not installed or failed to initialize.
	            DocumentProcessingError: If the conversion is cancelled, fails, or
	                yields no content.
	        """
	        # Validate file first (validate_file is not defined in this class;
	        # presumably inherited from DocumentParser).
	        self.validate_file(file_path)

	        # Check if MarkItDown is available
	        if not HAS_MARKITDOWN or self.markdown_instance is None:
	            raise ParserError("MarkItDown is not available. Please install with 'pip install markitdown[all]'")

	        # Check for cancellation before starting
	        if self._check_cancellation():
	            raise DocumentProcessingError("Conversion cancelled")

	        file_path_str = str(file_path)
	        file_ext = Path(file_path).suffix.lower()

	        # Filled in by the worker thread; read only after the thread has ended.
	        outcome = {"result": None, "error": None}

	        def conversion_worker():
	            try:
	                outcome["result"] = self._convert(file_path_str, file_ext)
	            except Exception as e:
	                # Hand the failure to the calling thread, which re-raises it.
	                outcome["error"] = e

	        try:
	            # Run conversion in a separate thread to support cancellation
	            worker = threading.Thread(target=conversion_worker, daemon=True)
	            worker.start()

	            # Wait for completion or cancellation. join(timeout) returns as
	            # soon as the worker ends, so a finished conversion is not delayed
	            # by a fixed sleep.
	            while worker.is_alive():
	                if self._check_cancellation():
	                    logger.info("MarkItDown conversion cancelled by user")
	                    # Give thread a moment to finish cleanly
	                    worker.join(timeout=0.1)
	                    raise DocumentProcessingError("Conversion cancelled")
	                worker.join(timeout=self._CANCEL_POLL_INTERVAL)

	            # Compare against None so the check does not depend on the
	            # truthiness of the stored exception object.
	            if outcome["error"] is not None:
	                raise outcome["error"]

	            result = outcome["result"]
	            if result is None:
	                raise DocumentProcessingError("MarkItDown conversion returned no result")

	            return self._extract_markdown(result)

	        except DocumentProcessingError:
	            # Cancellation and empty-result errors already have the right type.
	            raise
	        except Exception as e:
	            logger.error("Error converting file with MarkItDown: %s", e)
	            # Chain the original exception so its traceback is preserved.
	            raise DocumentProcessingError(f"MarkItDown conversion failed: {str(e)}") from e

	    def _convert(self, file_path_str: str, file_ext: str) -> Any:
	        """Run the MarkItDown conversion, preferring Gemini for image files.

	        Only the Gemini attempt is guarded by the fallback handler. The
	        standard conversion runs at most once and its errors propagate.
	        """
	        if file_ext not in self._IMAGE_EXTENSIONS:
	            # For non-image files, use standard conversion
	            return self.markdown_instance.convert(file_path_str)

	        if not HAS_GEMINI_WRAPPER:
	            logger.info("Gemini wrapper not available, using standard conversion for image")
	            return self.markdown_instance.convert(file_path_str)

	        try:
	            # Create Gemini-enabled instance for image processing
	            gemini_client = create_gemini_client_for_markitdown()
	            if gemini_client:
	                llm_instance = MarkItDown(llm_client=gemini_client, llm_model=self._LLM_MODEL)
	                return llm_instance.convert(file_path_str)
	            logger.info("Gemini client not available, using standard conversion for image")
	        except Exception as llm_error:
	            logger.warning("Gemini image processing failed, falling back to basic conversion: %s", llm_error)

	        return self.markdown_instance.convert(file_path_str)

	    @staticmethod
	    def _extract_markdown(result: Any) -> str:
	        """Return the first non-empty text found on a conversion result.

	        Tries ``text_content``, then ``markdown``, then ``content``, and
	        finally ``str(result)``.

	        Raises:
	            DocumentProcessingError: If none of them yields non-blank text.
	        """
	        # Use the correct attribute - MarkItDown returns .text_content
	        for attr in ("text_content", "markdown", "content"):
	            value = getattr(result, attr, None)
	            if value:
	                return value

	        # Fallback - convert result to string
	        content = str(result)
	        if content and content.strip():
	            return content
	        raise DocumentProcessingError("MarkItDown conversion returned empty content")

	    @classmethod
	    def get_name(cls) -> str:
	        """Return the display name of this parser."""
	        return "MarkItDown"

	    @classmethod
	    def get_supported_file_types(cls) -> Set[str]:
	        """Return a set of supported file extensions."""
	        # NOTE(review): parse() also treats .gif, .bmp and .webp as images, but
	        # they are not listed here - confirm which of the two is intended.
	        return {".pdf", ".docx", ".xlsx", ".pptx", ".html", ".txt", ".md", ".json", ".xml", ".csv", ".jpg", ".jpeg", ".png"}

	    @classmethod
	    def is_available(cls) -> bool:
	        """Check if this parser is available (the markitdown import succeeded)."""
	        return HAS_MARKITDOWN

	    @classmethod
	    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
	        """Return the single "standard" method entry offered by this parser."""
	        return [
	            {
	                "id": "standard",
	                "name": "Standard Conversion",
	                "default_params": {},
	            }
	        ]

	    @classmethod
	    def get_description(cls) -> str:
	        """Return a one-line description of this parser."""
	        return "MarkItDown parser for converting various file formats to Markdown. Uses Gemini Flash 2.5 for advanced image analysis."


	# Register the parser with the registry, but only when the markitdown
	# package could be imported; otherwise just log why it was skipped.
	if not HAS_MARKITDOWN:
	    logger.warning("Could not register MarkItDown parser: Package not installed")
	else:
	    ParserRegistry.register(MarkItDownParser)
	    logger.info("MarkItDown parser registered successfully")