""" File type classification using multiple strategies """ import os import mimetypes from pathlib import Path from typing import Optional, Tuple from enum import Enum from backend.models.enums import FileType class FileClassifier: """ Multi-strategy file type classifier Strategies (in order): 1. MIME type detection (python-magic if available) 2. Extension-based classification 3. Magic number validation (for images) """ # Extension to FileType mapping EXTENSION_MAP = { # Documents '.pdf': FileType.PDF, '.doc': FileType.DOC, '.docx': FileType.DOCX, # Spreadsheets '.xls': FileType.XLS, '.xlsx': FileType.XLSX, '.csv': FileType.CSV, # Images '.jpg': FileType.JPG, '.jpeg': FileType.JPEG, '.png': FileType.PNG, '.gif': FileType.GIF, '.webp': FileType.WEBP, # Videos '.mp4': FileType.MP4, '.avi': FileType.AVI, '.mov': FileType.MOV, '.mkv': FileType.MKV, } # MIME type to FileType mapping MIME_MAP = { # Documents 'application/pdf': FileType.PDF, 'application/msword': FileType.DOC, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': FileType.DOCX, # Spreadsheets 'application/vnd.ms-excel': FileType.XLS, 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': FileType.XLSX, 'text/csv': FileType.CSV, 'text/comma-separated-values': FileType.CSV, # Images 'image/jpeg': FileType.JPG, 'image/png': FileType.PNG, 'image/gif': FileType.GIF, 'image/webp': FileType.WEBP, # Videos 'video/mp4': FileType.MP4, 'video/x-msvideo': FileType.AVI, 'video/quicktime': FileType.MOV, 'video/x-matroska': FileType.MKV, } # Magic numbers for common file types (first few bytes) MAGIC_NUMBERS = { b'%PDF': FileType.PDF, b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1': FileType.DOC, # OLE compound (old DOC) b'PK\x03\x04': None, # ZIP-based (DOCX, XLSX - need further check) b'\xff\xd8\xff': FileType.JPG, b'\x89PNG\r\n\x1a\n': FileType.PNG, b'GIF87a': FileType.GIF, b'GIF89a': FileType.GIF, b'RIFF': None, # Could be AVI or WEBP } def __init__(self): # Initialize mimetypes mimetypes.init() # Try to import python-magic (optional) self.magic_available = False try: import magic self.magic = magic self.magic_available = True except ImportError: pass def classify_file(self, file_path: str) -> Tuple[FileType, Optional[str]]: """ Classify file using multiple strategies Args: file_path: Path to the file Returns: Tuple of (FileType, mime_type) """ path = Path(file_path) if not path.exists(): return FileType.UNKNOWN, None # Strategy 1: MIME type detection (python-magic) if self.magic_available: try: mime_type = self._detect_mime_with_magic(file_path) if mime_type: file_type = self.MIME_MAP.get(mime_type) if file_type: return file_type, mime_type except Exception: pass # Fall through to next strategy # Strategy 2: Extension-based classification file_type, mime_type = self._classify_by_extension(path) if file_type != FileType.UNKNOWN: return file_type, mime_type # Strategy 3: Magic number detection file_type = self._classify_by_magic_number(file_path) if file_type: # Get mime type from system for magic-based detection mime_type, _ = mimetypes.guess_type(str(path)) return file_type, mime_type # Strategy 4: mimetypes library fallback mime_type, _ = mimetypes.guess_type(str(path)) if mime_type: file_type = self.MIME_MAP.get(mime_type) if file_type: return file_type, mime_type # All strategies failed return FileType.UNKNOWN, None def _detect_mime_with_magic(self, file_path: str) -> Optional[str]: """ Detect MIME type using python-magic """ if not self.magic_available: return None try: mime = self.magic.Magic(mime=True) return mime.from_file(file_path) except Exception: return None def _classify_by_extension(self, path: Path) -> Tuple[FileType, Optional[str]]: """ Classify file by extension """ extension = path.suffix.lower() if extension in self.EXTENSION_MAP: file_type = self.EXTENSION_MAP[extension] mime_type, _ = mimetypes.guess_type(str(path)) return file_type, mime_type return FileType.UNKNOWN, None def _classify_by_magic_number(self, file_path: str) -> Optional[FileType]: """ Classify file by reading magic numbers """ try: with open(file_path, 'rb') as f: header = f.read(16) # Read first 16 bytes # Check for exact matches for magic_bytes, file_type in self.MAGIC_NUMBERS.items(): if header.startswith(magic_bytes): if file_type is not None: return file_type # Special handling for ZIP-based formats if magic_bytes == b'PK\x03\x04': return self._identify_zip_based_file(file_path) # Special handling for RIFF (AVI or WEBP) if magic_bytes == b'RIFF': if len(header) > 12 and header[8:12] == b'AVI ': return FileType.AVI elif len(header) > 12 and header[8:12] == b'WEBP': return FileType.WEBP return None except (IOError, OSError): return None def _identify_zip_based_file(self, file_path: str) -> Optional[FileType]: """ Identify ZIP-based file types (DOCX, XLSX, etc.) """ path = Path(file_path) extension = path.suffix.lower() # Use extension as hint for ZIP-based formats if extension in self.EXTENSION_MAP: return self.EXTENSION_MAP[extension] # Try to inspect ZIP contents try: import zipfile with zipfile.ZipFile(file_path, 'r') as zip_file: names = zip_file.namelist() # Check for Word document markers if any('word/' in name for name in names): return FileType.DOCX # Check for Excel workbook markers if any('xl/' in name for name in names): return FileType.XLSX except (zipfile.BadZipFile, Exception): pass return None def is_supported_type(self, file_type: FileType) -> bool: """ Check if file type is supported for processing """ return file_type != FileType.UNKNOWN def get_category(self, file_type: FileType) -> str: """ Get category for file type """ if file_type in [FileType.PDF, FileType.DOC, FileType.DOCX]: return "document" elif file_type in [FileType.XLS, FileType.XLSX, FileType.CSV]: return "spreadsheet" elif file_type in [FileType.JPG, FileType.JPEG, FileType.PNG, FileType.GIF, FileType.WEBP]: return "image" elif file_type in [FileType.MP4, FileType.AVI, FileType.MOV, FileType.MKV]: return "video" else: return "unknown"