File size: 8,425 Bytes
255cbd1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
"""
File type classification using multiple strategies
"""
import os
import mimetypes
from pathlib import Path
from typing import Optional, Tuple
from enum import Enum

from backend.models.enums import FileType


class FileClassifier:
    """
    Multi-strategy file type classifier

    Strategies (in the order ``classify_file`` tries them):
    1. MIME type detection (python-magic, only if it can be imported)
    2. Extension-based classification
    3. Magic number detection (leading bytes of the file)
    4. ``mimetypes`` standard-library lookup on the file name
    """

    # Extension to FileType mapping
    EXTENSION_MAP = {
        # Documents
        '.pdf': FileType.PDF,
        '.doc': FileType.DOC,
        '.docx': FileType.DOCX,

        # Spreadsheets
        '.xls': FileType.XLS,
        '.xlsx': FileType.XLSX,
        '.csv': FileType.CSV,

        # Images
        '.jpg': FileType.JPG,
        '.jpeg': FileType.JPEG,
        '.png': FileType.PNG,
        '.gif': FileType.GIF,
        '.webp': FileType.WEBP,

        # Videos
        '.mp4': FileType.MP4,
        '.avi': FileType.AVI,
        '.mov': FileType.MOV,
        '.mkv': FileType.MKV,
    }

    # MIME type to FileType mapping
    MIME_MAP = {
        # Documents
        'application/pdf': FileType.PDF,
        'application/msword': FileType.DOC,
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': FileType.DOCX,

        # Spreadsheets
        'application/vnd.ms-excel': FileType.XLS,
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': FileType.XLSX,
        'text/csv': FileType.CSV,
        'text/comma-separated-values': FileType.CSV,

        # Images
        'image/jpeg': FileType.JPG,
        'image/png': FileType.PNG,
        'image/gif': FileType.GIF,
        'image/webp': FileType.WEBP,

        # Videos
        'video/mp4': FileType.MP4,
        'video/x-msvideo': FileType.AVI,
        'video/quicktime': FileType.MOV,
        'video/x-matroska': FileType.MKV,
    }

    # Magic numbers for common file types (first few bytes).
    # A value of None means the prefix alone is ambiguous and
    # _classify_by_magic_number has to look further.
    MAGIC_NUMBERS = {
        b'%PDF': FileType.PDF,
        b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1': FileType.DOC,  # OLE compound (old DOC)
        b'PK\x03\x04': None,  # ZIP-based (DOCX, XLSX - need further check)
        b'\xff\xd8\xff': FileType.JPG,
        b'\x89PNG\r\n\x1a\n': FileType.PNG,
        b'GIF87a': FileType.GIF,
        b'GIF89a': FileType.GIF,
        b'RIFF': None,  # Could be AVI or WEBP
    }

    # File types that are stored inside a ZIP container. Only these may be
    # inferred from the extension once the ZIP signature has been seen.
    _ZIP_BASED_TYPES = (FileType.DOCX, FileType.XLSX)

    def __init__(self):
        # Initialize mimetypes
        mimetypes.init()

        # python-magic is optional; without it strategy 1 is skipped.
        # self.magic is always defined so attribute access never fails.
        self.magic = None
        self.magic_available = False
        try:
            import magic
        except ImportError:
            pass
        else:
            self.magic = magic
            self.magic_available = True

    def classify_file(self, file_path: str) -> Tuple[FileType, Optional[str]]:
        """
        Classify file using multiple strategies

        Args:
            file_path: Path to the file

        Returns:
            Tuple of (FileType, mime_type). ``(FileType.UNKNOWN, None)`` is
            returned when the path is not a regular file or when no strategy
            recognises it. ``mime_type`` may be None even for a recognised
            file type.
        """
        path = Path(file_path)

        # Only regular files can be classified. A directory named
        # "report.pdf" must not be reported as a PDF by the extension check.
        if not path.is_file():
            return FileType.UNKNOWN, None

        # Strategy 1: MIME type detection (python-magic).
        # _detect_mime_with_magic already swallows detection errors and
        # returns None, so no extra try/except is needed here.
        if self.magic_available:
            mime_type = self._detect_mime_with_magic(file_path)
            if mime_type:
                file_type = self.MIME_MAP.get(mime_type)
                if file_type:
                    return file_type, mime_type

        # Strategy 2: Extension-based classification
        file_type, mime_type = self._classify_by_extension(path)
        if file_type != FileType.UNKNOWN:
            return file_type, mime_type

        # Strategy 3: Magic number detection
        file_type = self._classify_by_magic_number(file_path)
        if file_type:
            # This point is only reached when the extension is not in
            # EXTENSION_MAP, so a MIME type guessed from the file name would
            # be missing or would contradict the detected content. Derive it
            # from the detected type and use the name only as a last resort.
            mime_type = self._canonical_mime(file_type)
            if mime_type is None:
                mime_type, _ = mimetypes.guess_type(str(path))
            return file_type, mime_type

        # Strategy 4: mimetypes library fallback
        mime_type, _ = mimetypes.guess_type(str(path))
        if mime_type:
            file_type = self.MIME_MAP.get(mime_type)
            if file_type:
                return file_type, mime_type

        # All strategies failed
        return FileType.UNKNOWN, None

    def _canonical_mime(self, file_type: FileType) -> Optional[str]:
        """
        Return the first MIME type in MIME_MAP that maps to ``file_type``,
        or None when MIME_MAP has no entry for it.
        """
        for mime_type, mapped_type in self.MIME_MAP.items():
            if mapped_type == file_type:
                return mime_type
        return None

    def _detect_mime_with_magic(self, file_path: str) -> Optional[str]:
        """
        Detect MIME type using python-magic

        Returns:
            The MIME type reported by python-magic, or None when python-magic
            is unavailable or detection raises.
        """
        if not self.magic_available:
            return None

        try:
            mime = self.magic.Magic(mime=True)
            return mime.from_file(file_path)
        except Exception:
            # Best effort: any failure here just means the caller falls
            # through to the next strategy.
            return None

    def _classify_by_extension(self, path: Path) -> Tuple[FileType, Optional[str]]:
        """
        Classify file by extension

        The extension is compared case-insensitively against EXTENSION_MAP.
        The MIME type comes from ``mimetypes.guess_type`` and may be None.
        """
        extension = path.suffix.lower()

        if extension in self.EXTENSION_MAP:
            file_type = self.EXTENSION_MAP[extension]
            mime_type, _ = mimetypes.guess_type(str(path))
            return file_type, mime_type

        return FileType.UNKNOWN, None

    def _classify_by_magic_number(self, file_path: str) -> Optional[FileType]:
        """
        Classify file by reading magic numbers

        Returns:
            The detected FileType, or None when the file cannot be read or
            its leading bytes match no known signature.
        """
        try:
            with open(file_path, 'rb') as f:
                header = f.read(16)  # Longest signature is 8 bytes; RIFF needs 12
        except OSError:
            return None

        for magic_bytes, file_type in self.MAGIC_NUMBERS.items():
            if not header.startswith(magic_bytes):
                continue

            if file_type is not None:
                return file_type

            # Special handling for ZIP-based formats
            if magic_bytes == b'PK\x03\x04':
                return self._identify_zip_based_file(file_path)

            # Special handling for RIFF (AVI or WEBP): the form type sits at
            # bytes 8-11. A short header yields a shorter slice that simply
            # fails both comparisons, so no length check is needed.
            if magic_bytes == b'RIFF':
                form_type = header[8:12]
                if form_type == b'AVI ':
                    return FileType.AVI
                if form_type == b'WEBP':
                    return FileType.WEBP

        return None

    def _identify_zip_based_file(self, file_path: str) -> Optional[FileType]:
        """
        Identify ZIP-based file types (DOCX, XLSX)

        Returns:
            FileType.DOCX or FileType.XLSX, or None when the archive cannot
            be identified as either.
        """
        import zipfile

        # Use extension as hint, but only for types that really are ZIP
        # containers: a ZIP archive named "x.pdf" is not a PDF.
        extension = Path(file_path).suffix.lower()
        hinted_type = self.EXTENSION_MAP.get(extension)
        if hinted_type in self._ZIP_BASED_TYPES:
            return hinted_type

        # Try to inspect ZIP contents
        try:
            with zipfile.ZipFile(file_path, 'r') as zip_file:
                names = zip_file.namelist()
        except Exception:
            # Deliberately broad: a file that merely starts with the ZIP
            # signature may be corrupt in arbitrary ways, and classification
            # is best effort.
            return None

        # Match on the leading directory so that unrelated entries such as
        # "password/..." are not mistaken for the "word/" marker.
        if any(name.startswith('word/') for name in names):
            return FileType.DOCX

        if any(name.startswith('xl/') for name in names):
            return FileType.XLSX

        return None

    def is_supported_type(self, file_type: FileType) -> bool:
        """
        Check if file type is supported for processing

        Every type except FileType.UNKNOWN counts as supported.
        """
        return file_type != FileType.UNKNOWN

    def get_category(self, file_type: FileType) -> str:
        """
        Get category for file type

        Returns:
            One of "document", "spreadsheet", "image", "video" or "unknown".
        """
        if file_type in (FileType.PDF, FileType.DOC, FileType.DOCX):
            return "document"
        if file_type in (FileType.XLS, FileType.XLSX, FileType.CSV):
            return "spreadsheet"
        if file_type in (FileType.JPG, FileType.JPEG, FileType.PNG, FileType.GIF, FileType.WEBP):
            return "image"
        if file_type in (FileType.MP4, FileType.AVI, FileType.MOV, FileType.MKV):
            return "video"
        return "unknown"