Spaces:

Divs0910
/

Digi-Biz

Sleeping

File size: 8,425 Bytes

255cbd1

"""
File type classification using multiple strategies
"""
import os
import mimetypes
from pathlib import Path
from typing import Optional, Tuple
from enum import Enum

from backend.models.enums import FileType


class FileClassifier:
    """
    Multi-strategy file type classifier

    ``classify_file`` tries the following strategies in order and returns
    the first conclusive result:

    1. MIME type detection (python-magic, skipped if it cannot be imported)
    2. Extension-based classification (``EXTENSION_MAP``)
    3. Magic number detection on the first bytes of the file
    4. ``mimetypes`` standard-library fallback
    """
    
    # Extension to FileType mapping
    EXTENSION_MAP = {
        # Documents
        '.pdf': FileType.PDF,
        '.doc': FileType.DOC,
        '.docx': FileType.DOCX,
        
        # Spreadsheets
        '.xls': FileType.XLS,
        '.xlsx': FileType.XLSX,
        '.csv': FileType.CSV,
        
        # Images
        '.jpg': FileType.JPG,
        '.jpeg': FileType.JPEG,
        '.png': FileType.PNG,
        '.gif': FileType.GIF,
        '.webp': FileType.WEBP,
        
        # Videos
        '.mp4': FileType.MP4,
        '.avi': FileType.AVI,
        '.mov': FileType.MOV,
        '.mkv': FileType.MKV,
    }
    
    # MIME type to FileType mapping
    MIME_MAP = {
        # Documents
        'application/pdf': FileType.PDF,
        'application/msword': FileType.DOC,
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': FileType.DOCX,
        
        # Spreadsheets
        'application/vnd.ms-excel': FileType.XLS,
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': FileType.XLSX,
        'text/csv': FileType.CSV,
        'text/comma-separated-values': FileType.CSV,
        
        # Images
        'image/jpeg': FileType.JPG,
        'image/png': FileType.PNG,
        'image/gif': FileType.GIF,
        'image/webp': FileType.WEBP,
        
        # Videos
        'video/mp4': FileType.MP4,
        'video/x-msvideo': FileType.AVI,
        'video/quicktime': FileType.MOV,
        'video/x-matroska': FileType.MKV,
    }
    
    # Magic numbers for common file types (first few bytes).
    # A value of None marks a container signature that needs a second look.
    MAGIC_NUMBERS = {
        b'%PDF': FileType.PDF,
        b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1': FileType.DOC,  # OLE compound (old DOC)
        b'PK\x03\x04': None,  # ZIP-based (DOCX, XLSX - need further check)
        b'\xff\xd8\xff': FileType.JPG,
        b'\x89PNG\r\n\x1a\n': FileType.PNG,
        b'GIF87a': FileType.GIF,
        b'GIF89a': FileType.GIF,
        b'RIFF': None,  # Could be AVI or WEBP
    }
    
    # File types stored in a ZIP container; only these are acceptable as an
    # extension hint once a ZIP signature has been seen.
    ZIP_BASED_TYPES = (FileType.DOCX, FileType.XLSX)
    
    def __init__(self):
        """
        Initialise the mimetypes database and probe for python-magic

        Sets ``self.magic_available`` and, when the import succeeds,
        ``self.magic`` (the imported module).
        """
        # Initialize mimetypes
        mimetypes.init()
        
        # Try to import python-magic (optional)
        self.magic_available = False
        try:
            import magic
            self.magic = magic
            self.magic_available = True
        except ImportError:
            pass
    
    def classify_file(self, file_path: str) -> Tuple[FileType, Optional[str]]:
        """
        Classify file using multiple strategies
        
        Args:
            file_path: Path to the file
            
        Returns:
            Tuple of (FileType, mime_type). ``(FileType.UNKNOWN, None)`` is
            returned when the path is not an existing regular file or when
            no strategy recognises it.
        """
        path = Path(file_path)
        
        # Only regular files can be classified; a directory called
        # "reports.pdf" must not be reported as a PDF by the extension check.
        if not path.is_file():
            return FileType.UNKNOWN, None
        
        # Strategy 1: MIME type detection (python-magic).
        # The helper returns None when python-magic is unavailable or fails,
        # so no extra exception handling is needed here.
        mime_type = self._detect_mime_with_magic(file_path)
        if mime_type:
            file_type = self.MIME_MAP.get(mime_type)
            if file_type:
                return file_type, mime_type
        
        # Strategy 2: Extension-based classification
        file_type, mime_type = self._classify_by_extension(path)
        if file_type != FileType.UNKNOWN:
            return file_type, mime_type
        
        # Strategy 3: Magic number detection
        file_type = self._classify_by_magic_number(file_path)
        if file_type:
            return file_type, self._mime_for_type(file_type, path)
        
        # Strategy 4: mimetypes library fallback
        mime_type, _ = mimetypes.guess_type(str(path))
        if mime_type:
            file_type = self.MIME_MAP.get(mime_type)
            if file_type:
                return file_type, mime_type
        
        # All strategies failed
        return FileType.UNKNOWN, None
    
    def _mime_for_type(self, file_type: FileType, path: Path) -> Optional[str]:
        """
        Pick a MIME type for a file whose type was detected from its content

        The first ``MIME_MAP`` entry that maps to ``file_type`` is preferred,
        because the file's extension has already failed classification at
        this point. ``mimetypes.guess_type`` is used only as a fallback.
        """
        for mime_type, mapped_type in self.MIME_MAP.items():
            if mapped_type == file_type:
                return mime_type
        
        guessed, _ = mimetypes.guess_type(str(path))
        return guessed
    
    def _detect_mime_with_magic(self, file_path: str) -> Optional[str]:
        """
        Detect MIME type using python-magic

        Returns:
            The MIME string reported by python-magic, or None when the
            library is unavailable or detection raises.
        """
        if not self.magic_available:
            return None
        
        try:
            # A fresh Magic instance is created for every call.
            mime = self.magic.Magic(mime=True)
            return mime.from_file(file_path)
        except Exception:
            # Detection is best-effort; the caller falls through to the
            # next strategy on None.
            return None
    
    def _classify_by_extension(self, path: Path) -> Tuple[FileType, Optional[str]]:
        """
        Classify file by extension

        The suffix is lower-cased before the ``EXTENSION_MAP`` lookup.

        Returns:
            ``(FileType, mime_type)`` where mime_type comes from
            ``mimetypes.guess_type``, or ``(FileType.UNKNOWN, None)`` for an
            unmapped extension.
        """
        file_type = self.EXTENSION_MAP.get(path.suffix.lower())
        if file_type is None:
            return FileType.UNKNOWN, None
        
        mime_type, _ = mimetypes.guess_type(str(path))
        return file_type, mime_type
    
    def _classify_by_magic_number(self, file_path: str) -> Optional[FileType]:
        """
        Classify file by reading magic numbers

        Returns:
            The detected FileType, or None when the file cannot be read or
            its first bytes match no conclusive signature.
        """
        try:
            with open(file_path, 'rb') as f:
                header = f.read(16)  # Read first 16 bytes
        except OSError:
            return None
        
        for magic_bytes, file_type in self.MAGIC_NUMBERS.items():
            if not header.startswith(magic_bytes):
                continue
            
            if file_type is not None:
                return file_type
            
            # Special handling for ZIP-based formats
            if magic_bytes == b'PK\x03\x04':
                return self._identify_zip_based_file(file_path)
            
            # Special handling for RIFF (AVI or WEBP): the form type sits
            # in bytes 8-11, so a 12-byte header is enough. A shorter header
            # yields a shorter slice that matches neither constant.
            if magic_bytes == b'RIFF':
                form_type = header[8:12]
                if form_type == b'AVI ':
                    return FileType.AVI
                if form_type == b'WEBP':
                    return FileType.WEBP
        
        return None
    
    def _identify_zip_based_file(self, file_path: str) -> Optional[FileType]:
        """
        Identify ZIP-based file types (DOCX, XLSX, etc.)

        The extension is used as a hint only when it names a ZIP-based type;
        otherwise the archive's member names are inspected.

        Returns:
            FileType.DOCX, FileType.XLSX, or None when the archive cannot be
            read or contains neither a ``word/`` nor an ``xl/`` entry.
        """
        import zipfile
        
        # Use extension as hint, but only for formats that really are ZIP
        # containers: a ZIP archive named "x.pdf" is not a PDF.
        hinted = self.EXTENSION_MAP.get(Path(file_path).suffix.lower())
        if hinted is not None and hinted in self.ZIP_BASED_TYPES:
            return hinted
        
        # Try to inspect ZIP contents
        try:
            with zipfile.ZipFile(file_path, 'r') as zip_file:
                names = zip_file.namelist()
        except Exception:
            # Deliberately broad: the archive may be corrupt or unreadable,
            # and identification is best-effort.
            return None
        
        # Match on the leading path component so that unrelated members
        # such as "password/notes.txt" are not mistaken for "word/".
        if any(name.startswith('word/') for name in names):
            return FileType.DOCX
        
        if any(name.startswith('xl/') for name in names):
            return FileType.XLSX
        
        return None
    
    def is_supported_type(self, file_type: FileType) -> bool:
        """
        Check if file type is supported for processing

        Every type other than ``FileType.UNKNOWN`` counts as supported.
        """
        return file_type != FileType.UNKNOWN
    
    def get_category(self, file_type: FileType) -> str:
        """
        Get category for file type

        Returns:
            One of "document", "spreadsheet", "image", "video" or "unknown".
        """
        categories = (
            ("document", (FileType.PDF, FileType.DOC, FileType.DOCX)),
            ("spreadsheet", (FileType.XLS, FileType.XLSX, FileType.CSV)),
            ("image", (FileType.JPG, FileType.JPEG, FileType.PNG, FileType.GIF, FileType.WEBP)),
            ("video", (FileType.MP4, FileType.AVI, FileType.MOV, FileType.MKV)),
        )
        
        # Membership uses equality, so values that compare equal to a
        # FileType member are categorised the same way as the member.
        for category, members in categories:
            if file_type in members:
                return category
        
        return "unknown"