File size: 4,241 Bytes
01d5a5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from pathlib import Path
from typing import Optional, Set, List
from ..core.file_type import FileType
from ..core.exceptions import UnsupportedFileType
from ..document import Document, ProcessStatus
from ..core.exceptions import FileProcessingError
import logging

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


class BaseFileProcessor:
    """
    Base class for file processors.

    Subclasses declare the file types they handle in ``SUPPORTED_TYPES`` and
    implement ``_process_file``. All methods are classmethods, so processors
    are used without being instantiated.
    """

    # File types this processor supports; empty in the base class, so the
    # base processor rejects every file until a subclass overrides it.
    SUPPORTED_TYPES: Set[FileType] = set()

    @classmethod
    def process(
        cls, file_path: str, expected_type: Optional[FileType] = None
    ) -> Document:
        """
        Main entry point for processing a single file.

        :param file_path: file path
        :param expected_type: expected file type; when given, suffix-based
            detection is skipped
        :return: Document object returned by ``_process_file``
        :raises UnsupportedFileType: if the type cannot be detected from the
            suffix or is not in ``SUPPORTED_TYPES``
        """
        path = Path(file_path)
        file_type = cls._detect_type(path, expected_type)

        if file_type not in cls.SUPPORTED_TYPES:
            raise UnsupportedFileType(f"{cls.__name__} doesn't support {file_type}")

        return cls._process_file(path, cls._create_document(path, file_type))

    @classmethod
    def _detect_type(
        cls, path: Path, expected_type: Optional[FileType] = None
    ) -> FileType:
        """
        Detect the file type.

        :param path: path of the file
        :param expected_type: caller-supplied type; returned unchanged when
            it is not None
        :return: the detected (or supplied) file type
        :raises UnsupportedFileType: if the lower-cased suffix is not a key of
            ``FileType.get_mime_mapping()``
        """
        # Explicit None check: a member of a mixed-in enum (e.g. str-based)
        # could be falsy and must still count as "provided".
        if expected_type is not None:
            return expected_type

        suffix = path.suffix.lower()
        mime_mapping = FileType.get_mime_mapping()

        if suffix not in mime_mapping:
            raise UnsupportedFileType(f"Unsupported file type: {suffix}")

        return mime_mapping[suffix]

    @classmethod
    def _create_document(cls, path: Path, file_type: FileType) -> Document:
        """
        Create the base document object for a file.

        :param path: path of the file; its name and on-disk size are recorded
        :param file_type: file type whose ``value`` is stored as the MIME type
        :return: Document object
        :raises OSError: propagated from ``path.stat()`` if the file cannot
            be accessed
        """
        return Document(
            name=path.name, mime_type=file_type.value, document_size=path.stat().st_size
        )

    @classmethod
    def _process_file(cls, file_path: Path, doc: Document) -> Document:
        """
        Specific processing logic implemented by subclasses.

        :param file_path: path of the file to process
        :param doc: base document created by ``_create_document``
        :return: the processed Document
        :raises NotImplementedError: always, in the base class
        """
        raise NotImplementedError

    @classmethod
    def process_directory(
        cls,
        directory_path: str,
        expected_type: Optional[FileType] = None,
        recursive: bool = False,
    ) -> List[Document]:
        """
        Process all files in the directory.

        Files whose type is unsupported do not abort the run: each one is
        recorded as a Document with ``ProcessStatus.FAILED`` and an error
        message. Any other exception raised while processing a file
        propagates to the caller.

        :param directory_path: directory path
        :param expected_type: expected file type
        :param recursive: whether to process subdirectories
        :return: list of Document objects (successful and failed)
        :raises FileProcessingError: if ``directory_path`` is not a directory
        """
        path = Path(directory_path)
        logger.info("Processing directory: %s", path)

        if not path.is_dir():
            logger.error("%s is not a directory", directory_path)
            raise FileProcessingError(f"{directory_path} is not a directory")

        documents: List[Document] = []
        pattern = "**/*" if recursive else "*"

        # Glob once and reuse the result, so the logged listing and the
        # processed entries are guaranteed to be the same set.
        files = list(path.glob(pattern))
        logger.info("Found files: %s", files)

        for file_path in files:
            # Skip directories and other non-regular entries.
            if not file_path.is_file():
                continue

            try:
                logger.info("Processing file: %s", file_path)
                logger.info("File suffix: %s", file_path.suffix)
                logger.info("Supported types: %s", cls.SUPPORTED_TYPES)

                doc = cls.process(str(file_path), expected_type)
                doc.status = ProcessStatus.SUCCESS
                documents.append(doc)
                logger.info("Successfully processed file: %s", file_path)
            except UnsupportedFileType as e:
                logger.warning("Unsupported file type: %s - %s", file_path, e)
                # Record the failure so callers can see which files were
                # rejected instead of having them silently dropped.
                documents.append(
                    Document(
                        name=file_path.name,
                        mime_type="unknown",
                        document_size=file_path.stat().st_size,
                        status=ProcessStatus.FAILED,
                        error_message=f"Unsupported file type: {e}",
                    )
                )
        return documents