Spaces:
Runtime error
Runtime error
| from abc import ABC, abstractmethod | |
| from pathlib import Path | |
| from typing import Dict, List, Optional, Any, Union, Set | |
| import threading | |
| from src.core.exceptions import ParserError, UnsupportedFileTypeError | |
| class DocumentParser(ABC): | |
| """Base interface for all document parsers in the system.""" | |
| def __init__(self): | |
| """Initialize the parser.""" | |
| self._cancellation_flag: Optional[threading.Event] = None | |
| def set_cancellation_flag(self, flag: Optional[threading.Event]) -> None: | |
| """Set the cancellation flag for this parser.""" | |
| self._cancellation_flag = flag | |
| def _check_cancellation(self) -> bool: | |
| """Check if cancellation has been requested.""" | |
| return self._cancellation_flag is not None and self._cancellation_flag.is_set() | |
| def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str: | |
| """ | |
| Parse a document and return its content. | |
| Args: | |
| file_path: Path to the document | |
| ocr_method: OCR method to use (if applicable) | |
| **kwargs: Additional parser-specific options | |
| Returns: | |
| str: The parsed content | |
| Raises: | |
| ParserError: For general parsing errors | |
| UnsupportedFileTypeError: For unsupported file types | |
| """ | |
| pass | |
| def get_name(cls) -> str: | |
| """Return the display name of this parser""" | |
| pass | |
| def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]: | |
| """ | |
| Return a list of supported OCR methods. | |
| Returns: | |
| List of dictionaries with keys: | |
| - id: Unique identifier for the OCR method | |
| - name: Display name for the OCR method | |
| - default_params: Default parameters for this OCR method | |
| """ | |
| pass | |
| def get_description(cls) -> str: | |
| """Return a description of this parser""" | |
| return f"{cls.get_name()} document parser" | |
| def get_supported_file_types(cls) -> Set[str]: | |
| """Return a set of supported file extensions (including the dot).""" | |
| return {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp"} | |
| def is_available(cls) -> bool: | |
| """Check if this parser is available with current configuration.""" | |
| return True | |
| def validate_file(self, file_path: Union[str, Path]) -> None: | |
| """ | |
| Validate that the file can be processed by this parser. | |
| Args: | |
| file_path: Path to the file to validate | |
| Raises: | |
| UnsupportedFileTypeError: If file type is not supported | |
| ParserError: For other validation errors | |
| """ | |
| path = Path(file_path) | |
| if not path.exists(): | |
| raise ParserError(f"File not found: {file_path}") | |
| if path.suffix.lower() not in self.get_supported_file_types(): | |
| raise UnsupportedFileTypeError( | |
| f"File type '{path.suffix}' not supported by {self.get_name()}" | |
| ) | |
| def get_metadata(self) -> Dict[str, Any]: | |
| """Return metadata about this parser instance.""" | |
| return { | |
| "name": self.get_name(), | |
| "description": self.get_description(), | |
| "supported_file_types": list(self.get_supported_file_types()), | |
| "supported_ocr_methods": self.get_supported_ocr_methods(), | |
| "available": self.is_available() | |
| } |