meccatronis
/

android_data_recovery

Model card Files Files and versions

xet

Community

meccatronis committed on Feb 11

Commit

68ef616

verified ·

1 Parent(s): 1ec3d37

Upload core/file_analyzer.py with huggingface_hub

Browse files

Files changed (1) hide show

core/file_analyzer.py +522 -0

core/file_analyzer.py ADDED Viewed

	@@ -0,0 +1,522 @@

+"""
+File Analyzer Module
+====================
+Analyzes files for recovery potential and extracts metadata.
+Supports various file formats including images, videos, documents, and databases.
+"""
+import os
+import logging
+import struct
+from typing import Dict, Optional, Any, List, Tuple
+from dataclasses import dataclass
+from datetime import datetime
+from enum import Enum
+logger = logging.getLogger(__name__)
+class FileHealth(Enum):
+    """File health status
+    Coarse verdict produced by FileAnalyzer for a single file. The string
+    values are the lowercase names of the members.
+    """
+    # Structural checks passed (also used for non-empty files of a type
+    # that has no dedicated check).
+    HEALTHY = "healthy"
+    # Recognisable file that failed a check, e.g. a missing end marker or a
+    # failed SQLite integrity check.
+    DAMAGED = "damaged"
+    # Missing, unreadable, empty, or the leading bytes are wrong.
+    CORRUPTED = "corrupted"
+    # Only part of the expected structure was found (used for MP4 files in
+    # which no 'moov' atom was located).
+    PARTIAL = "partial"
+    # The analysis itself could not reach a verdict.
+    UNKNOWN = "unknown"
+@dataclass
+class FileMetadata:
+    """Metadata extracted from a file"""
+    file_type: str = ""
+    mime_type: str = ""
+    extension: str = ""
+    size: int = 0
+    created: Optional[datetime] = None
+    modified: Optional[datetime] = None
+    width: int = 0
+    height: int = 0
+    duration: float = 0.0
+    bitrate: int = 0
+    codec: str = ""
+    artist: str = ""
+    title: str = ""
+    album: str = ""
+    camera_make: str = ""
+    camera_model: str = ""
+    gps_latitude: float = 0.0
+    gps_longitude: float = 0.0
+    extra: Dict[str, Any] = None
+    def __post_init__(self):
+        if self.extra is None:
+            self.extra = {}
+class FileAnalyzer:
+    """
+    Analyzes files for recovery and extracts metadata.
+    Features:
+    - File signature detection
+    - Metadata extraction (EXIF, ID3, etc.)
+    - File health assessment
+    - Recovery potential estimation
+    """
+    # File signatures (magic bytes)
+    SIGNATURES = {
+        # Images
+        b'\xff\xd8\xff\xe0': ('image/jpeg', 'jpg', 'JPEG Image'),
+        b'\xff\xd8\xff\xe1': ('image/jpeg', 'jpg', 'JPEG Image with EXIF'),
+        b'\xff\xd8\xff\xdb': ('image/jpeg', 'jpg', 'JPEG Image'),
+        b'\x89PNG\r\n\x1a\n': ('image/png', 'png', 'PNG Image'),
+        b'GIF87a': ('image/gif', 'gif', 'GIF Image'),
+        b'GIF89a': ('image/gif', 'gif', 'GIF Image'),
+        b'BM': ('image/bmp', 'bmp', 'BMP Image'),
+        b'RIFF': ('image/webp', 'webp', 'WebP Image'),  # Need further check
+        b'\x00\x00\x01\x00': ('image/x-icon', 'ico', 'ICO Icon'),
+        # Videos
+        b'\x00\x00\x00\x18ftyp': ('video/mp4', 'mp4', 'MP4 Video'),
+        b'\x00\x00\x00\x1cftyp': ('video/mp4', 'mp4', 'MP4 Video'),
+        b'\x00\x00\x00\x20ftyp': ('video/mp4', 'mp4', 'MP4 Video'),
+        b'\x1aE\xdf\xa3': ('video/x-matroska', 'mkv', 'Matroska Video'),
+        b'\x00\x00\x00\x14ftyp3gp': ('video/3gpp', '3gp', '3GP Video'),
+        b'FLV\x01': ('video/x-flv', 'flv', 'Flash Video'),
+        # Audio
+        b'ID3': ('audio/mpeg', 'mp3', 'MP3 Audio'),
+        b'\xff\xfb': ('audio/mpeg', 'mp3', 'MP3 Audio'),
+        b'\xff\xf3': ('audio/mpeg', 'mp3', 'MP3 Audio'),
+        b'\xff\xf2': ('audio/mpeg', 'mp3', 'MP3 Audio'),
+        b'OggS': ('audio/ogg', 'ogg', 'OGG Audio'),
+        b'fLaC': ('audio/flac', 'flac', 'FLAC Audio'),
+        # Documents
+        b'%PDF': ('application/pdf', 'pdf', 'PDF Document'),
+        b'PK\x03\x04': ('application/zip', 'zip', 'ZIP Archive'),  # Also docx, xlsx
+        b'\xd0\xcf\x11\xe0': ('application/msword', 'doc', 'MS Office Document'),
+        b'{\rtf': ('application/rtf', 'rtf', 'RTF Document'),
+        # Databases
+        b'SQLite format 3': ('application/x-sqlite3', 'db', 'SQLite Database'),
+        # Archives
+        b'\x1f\x8b\x08': ('application/gzip', 'gz', 'GZIP Archive'),
+        b'Rar!\x1a\x07': ('application/x-rar', 'rar', 'RAR Archive'),
+        b'7z\xbc\xaf\x27\x1c': ('application/x-7z-compressed', '7z', '7-Zip Archive'),
+    }
+    # JPEG markers
+    JPEG_MARKERS = {
+        0xD8: 'SOI',   # Start of Image
+        0xE0: 'APP0',  # JFIF
+        0xE1: 'APP1',  # EXIF
+        0xDB: 'DQT',   # Define Quantization Table
+        0xC0: 'SOF0',  # Start of Frame (Baseline)
+        0xC2: 'SOF2',  # Start of Frame (Progressive)
+        0xC4: 'DHT',   # Define Huffman Table
+        0xDA: 'SOS',   # Start of Scan
+        0xD9: 'EOI',   # End of Image
+    }
+    def __init__(self):
+        """Initialize File Analyzer."""
+        pass
+    def analyze_file(self, filepath: str) -> Tuple[FileMetadata, FileHealth]:
+        """
+        Analyze a file and extract metadata.
+        Args:
+            filepath: Path to the file
+        Returns:
+            Tuple of (FileMetadata, FileHealth)
+        """
+        metadata = FileMetadata()
+        health = FileHealth.UNKNOWN
+        if not os.path.exists(filepath):
+            return metadata, FileHealth.CORRUPTED
+        metadata.size = os.path.getsize(filepath)
+        # Get file times
+        try:
+            stat = os.stat(filepath)
+            metadata.modified = datetime.fromtimestamp(stat.st_mtime)
+            metadata.created = datetime.fromtimestamp(stat.st_ctime)
+        except Exception:
+            pass
+        # Read file header
+        try:
+            with open(filepath, 'rb') as f:
+                header = f.read(32)
+        except Exception as e:
+            logger.error(f"Error reading file: {e}")
+            return metadata, FileHealth.CORRUPTED
+        # Identify file type
+        file_info = self._identify_file_type(header)
+        if file_info:
+            metadata.mime_type, metadata.extension, metadata.file_type = file_info
+        # Extract type-specific metadata
+        if metadata.mime_type.startswith('image/'):
+            metadata, health = self._analyze_image(filepath, metadata)
+        elif metadata.mime_type.startswith('video/'):
+            metadata, health = self._analyze_video(filepath, metadata)
+        elif metadata.mime_type.startswith('audio/'):
+            metadata, health = self._analyze_audio(filepath, metadata)
+        elif metadata.mime_type == 'application/pdf':
+            metadata, health = self._analyze_pdf(filepath, metadata)
+        elif metadata.mime_type == 'application/x-sqlite3':
+            metadata, health = self._analyze_sqlite(filepath, metadata)
+        else:
+            health = FileHealth.HEALTHY if metadata.size > 0 else FileHealth.CORRUPTED
+        return metadata, health
+    def _identify_file_type(self, header: bytes) -> Optional[Tuple[str, str, str]]:
+        """
+        Identify file type from header bytes.
+        Args:
+            header: First bytes of the file
+        Returns:
+            Tuple of (mime_type, extension, description) or None
+        """
+        for signature, info in self.SIGNATURES.items():
+            if header.startswith(signature):
+                return info
+        # Check for JPEG (various markers)
+        if header[:2] == b'\xff\xd8':
+            return ('image/jpeg', 'jpg', 'JPEG Image')
+        return None
+    def _analyze_image(self, filepath: str, metadata: FileMetadata) -> Tuple[FileMetadata, FileHealth]:
+        """Analyze an image file."""
+        health = FileHealth.HEALTHY
+        try:
+            # Try to use PIL for detailed analysis
+            from PIL import Image
+            from PIL.ExifTags import TAGS, GPSTAGS
+            with Image.open(filepath) as img:
+                metadata.width = img.width
+                metadata.height = img.height
+                # Extract EXIF data
+                exif_data = img._getexif()
+                if exif_data:
+                    for tag_id, value in exif_data.items():
+                        tag = TAGS.get(tag_id, tag_id)
+                        if tag == 'Make':
+                            metadata.camera_make = str(value)
+                        elif tag == 'Model':
+                            metadata.camera_model = str(value)
+                        elif tag == 'DateTime':
+                            try:
+                                metadata.created = datetime.strptime(value, '%Y:%m:%d %H:%M:%S')
+                            except Exception:
+                                pass
+                        elif tag == 'GPSInfo':
+                            gps = self._parse_gps_info(value)
+                            if gps:
+                                metadata.gps_latitude = gps[0]
+                                metadata.gps_longitude = gps[1]
+                # Verify image integrity
+                img.verify()
+        except Exception as e:
+            logger.debug(f"PIL analysis failed: {e}")
+            # Fallback to basic analysis
+            health = self._check_jpeg_integrity(filepath) if metadata.mime_type == 'image/jpeg' else FileHealth.UNKNOWN
+        return metadata, health
+    def _parse_gps_info(self, gps_info: Dict) -> Optional[Tuple[float, float]]:
+        """Parse GPS information from EXIF data."""
+        try:
+            def convert_to_degrees(value):
+                d = float(value[0])
+                m = float(value[1])
+                s = float(value[2])
+                return d + (m / 60.0) + (s / 3600.0)
+            lat = convert_to_degrees(gps_info[2])
+            if gps_info[1] == 'S':
+                lat = -lat
+            lon = convert_to_degrees(gps_info[4])
+            if gps_info[3] == 'W':
+                lon = -lon
+            return (lat, lon)
+        except Exception:
+            return None
+    def _check_jpeg_integrity(self, filepath: str) -> FileHealth:
+        """Check JPEG file integrity by scanning markers."""
+        try:
+            with open(filepath, 'rb') as f:
+                # Check SOI marker
+                if f.read(2) != b'\xff\xd8':
+                    return FileHealth.CORRUPTED
+                # Scan for EOI marker
+                f.seek(-2, 2)  # Go to end
+                if f.read(2) == b'\xff\xd9':
+                    return FileHealth.HEALTHY
+                # EOI not found at end, might be damaged
+                return FileHealth.DAMAGED
+        except Exception:
+            return FileHealth.CORRUPTED
+    def _analyze_video(self, filepath: str, metadata: FileMetadata) -> Tuple[FileMetadata, FileHealth]:
+        """Analyze a video file."""
+        health = FileHealth.HEALTHY
+        # Basic analysis - check file structure
+        try:
+            with open(filepath, 'rb') as f:
+                header = f.read(32)
+                # Check for MP4/MOV
+                if b'ftyp' in header:
+                    # Read moov atom for metadata
+                    f.seek(0)
+                    content = f.read(min(1024 * 1024, metadata.size))  # First 1MB
+                    if b'moov' in content:
+                        health = FileHealth.HEALTHY
+                    else:
+                        health = FileHealth.PARTIAL
+        except Exception as e:
+            logger.debug(f"Video analysis failed: {e}")
+            health = FileHealth.UNKNOWN
+        return metadata, health
+    def _analyze_audio(self, filepath: str, metadata: FileMetadata) -> Tuple[FileMetadata, FileHealth]:
+        """Analyze an audio file."""
+        health = FileHealth.HEALTHY
+        try:
+            with open(filepath, 'rb') as f:
+                header = f.read(128)
+                # Check for ID3 tag
+                if header[:3] == b'ID3':
+                    # Parse ID3v2 header
+                    version = header[3]
+                    flags = header[5]
+                    size = self._decode_syncsafe_int(header[6:10])
+                    # Read ID3 frames
+                    id3_data = f.read(size)
+                    metadata = self._parse_id3_tags(id3_data, metadata)
+                elif header[:2] in [b'\xff\xfb', b'\xff\xf3', b'\xff\xf2']:
+                    # MP3 without ID3, check frame sync
+                    health = FileHealth.HEALTHY
+        except Exception as e:
+            logger.debug(f"Audio analysis failed: {e}")
+            health = FileHealth.UNKNOWN
+        return metadata, health
+    def _decode_syncsafe_int(self, data: bytes) -> int:
+        """Decode ID3v2 syncsafe integer."""
+        return (data[0] << 21) | (data[1] << 14) | (data[2] << 7) | data[3]
+    def _parse_id3_tags(self, data: bytes, metadata: FileMetadata) -> FileMetadata:
+        """Parse ID3v2 tags."""
+        pos = 0
+        while pos < len(data) - 10:
+            frame_id = data[pos:pos+4].decode('latin-1', errors='ignore')
+            if not frame_id.strip() or frame_id[0] == '\x00':
+                break
+            frame_size = struct.unpack('>I', data[pos+4:pos+8])[0]
+            frame_data = data[pos+10:pos+10+frame_size]
+            try:
+                # Skip encoding byte
+                text = frame_data[1:].decode('utf-8', errors='ignore').strip('\x00')
+                if frame_id == 'TIT2':
+                    metadata.title = text
+                elif frame_id == 'TPE1':
+                    metadata.artist = text
+                elif frame_id == 'TALB':
+                    metadata.album = text
+            except Exception:
+                pass
+            pos += 10 + frame_size
+        return metadata
+    def _analyze_pdf(self, filepath: str, metadata: FileMetadata) -> Tuple[FileMetadata, FileHealth]:
+        """Analyze a PDF file."""
+        health = FileHealth.HEALTHY
+        try:
+            with open(filepath, 'rb') as f:
+                # Check header
+                header = f.read(8)
+                if not header.startswith(b'%PDF'):
+                    return metadata, FileHealth.CORRUPTED
+                # Check for EOF marker
+                f.seek(-1024, 2)
+                tail = f.read()
+                if b'%%EOF' in tail:
+                    health = FileHealth.HEALTHY
+                else:
+                    health = FileHealth.DAMAGED
+        except Exception:
+            health = FileHealth.UNKNOWN
+        return metadata, health
+    def _analyze_sqlite(self, filepath: str, metadata: FileMetadata) -> Tuple[FileMetadata, FileHealth]:
+        """Analyze a SQLite database file."""
+        health = FileHealth.HEALTHY
+        try:
+            import sqlite3
+            conn = sqlite3.connect(filepath)
+            cursor = conn.cursor()
+            # Run integrity check
+            cursor.execute("PRAGMA integrity_check")
+            result = cursor.fetchone()
+            if result[0] == 'ok':
+                health = FileHealth.HEALTHY
+            else:
+                health = FileHealth.DAMAGED
+            # Get table count
+            cursor.execute("SELECT COUNT(*) FROM sqlite_master WHERE type='table'")
+            table_count = cursor.fetchone()[0]
+            metadata.extra['table_count'] = table_count
+            conn.close()
+        except Exception as e:
+            logger.debug(f"SQLite analysis failed: {e}")
+            health = FileHealth.CORRUPTED
+        return metadata, health
+    def estimate_recovery_chance(self, filepath: str) -> float:
+        """
+        Estimate the chance of successful recovery.
+        Args:
+            filepath: Path to the file
+        Returns:
+            Recovery chance as percentage (0-100)
+        """
+        metadata, health = self.analyze_file(filepath)
+        if health == FileHealth.HEALTHY:
+            return 100.0
+        elif health == FileHealth.DAMAGED:
+            return 75.0
+        elif health == FileHealth.PARTIAL:
+            return 50.0
+        elif health == FileHealth.CORRUPTED:
+            return 25.0
+        else:
+            return 50.0
+    def get_file_preview(self, filepath: str, max_size: int = 1024) -> bytes:
+        """
+        Get a preview of file contents.
+        Args:
+            filepath: Path to the file
+            max_size: Maximum preview size in bytes
+        Returns:
+            Preview bytes
+        """
+        try:
+            with open(filepath, 'rb') as f:
+                return f.read(max_size)
+        except Exception:
+            return b''
+    def compare_files(self, file1: str, file2: str) -> Dict[str, Any]:
+        """
+        Compare two files.
+        Args:
+            file1: Path to first file
+            file2: Path to second file
+        Returns:
+            Comparison results
+        """
+        import hashlib
+        result = {
+            'identical': False,
+            'size_match': False,
+            'type_match': False,
+            'hash_match': False,
+        }
+        # Compare sizes
+        size1 = os.path.getsize(file1) if os.path.exists(file1) else 0
+        size2 = os.path.getsize(file2) if os.path.exists(file2) else 0
+        result['size_match'] = size1 == size2
+        # Compare types
+        meta1, _ = self.analyze_file(file1)
+        meta2, _ = self.analyze_file(file2)
+        result['type_match'] = meta1.mime_type == meta2.mime_type
+        # Compare hashes
+        def get_hash(filepath):
+            hasher = hashlib.md5()
+            try:
+                with open(filepath, 'rb') as f:
+                    for chunk in iter(lambda: f.read(8192), b''):
+                        hasher.update(chunk)
+                return hasher.hexdigest()
+            except Exception:
+                return None
+        hash1 = get_hash(file1)
+        hash2 = get_hash(file2)
+        result['hash_match'] = hash1 == hash2 and hash1 is not None
+        result['identical'] = all([
+            result['size_match'],
+            result['type_match'],
+            result['hash_match']
+        ])
+        return result