# SolarImageDownloader/src/storage/validation_service.py
# Hosting-page residue captured with this file (not part of the module):
#   AK51's picture / Upload 13308 files / b610d23 verified
"""File integrity and validation service for NASA solar images."""
import hashlib
import logging
from pathlib import Path
from typing import Optional, Tuple
from PIL import Image
import io
from ..models import ImageMetadata
class ValidationService:
    """Validates downloaded images for integrity and format.

    Every public method reports failure through its return value (a
    ``(bool, message)`` tuple, ``None`` or ``False``) instead of raising,
    so callers can run checks in bulk without wrapping each call.

    The class-level constants below hold the limits used by the checks;
    subclasses may override them.
    """

    # Inclusive pixel bounds accepted by validate_image_content().
    MIN_DIMENSION = 100
    MAX_DIMENSION = 10000
    # Pillow mode strings accepted by validate_image_content().
    ALLOWED_MODES = ('RGB', 'L', 'RGBA')
    # Number of bytes read per iteration when hashing a file.
    HASH_CHUNK_SIZE = 65536

    def __init__(self):
        """Initialize validation service."""
        self.logger = logging.getLogger(__name__)

    def validate_image_format(self, file_path: Path) -> Tuple[bool, Optional[str]]:
        """
        Validate that a file is a JPEG image whose pixel data can be decoded.

        Args:
            file_path: Path to the image file

        Returns:
            Tuple of (is_valid, error_message); error_message is None on success
        """
        try:
            with Image.open(file_path) as img:
                # Check if it's a JPEG
                if img.format != 'JPEG':
                    return False, f"Expected JPEG format, got {img.format}"
                img.verify()

            # verify() does not decode the pixel data and leaves the image
            # object unusable, so reopen the file and force a full decode.
            # This is what catches truncated downloads.
            with Image.open(file_path) as img:
                img.load()

            self.logger.debug("Image format validation passed: %s", file_path)
            return True, None
        except Exception as e:
            # Pillow raises many unrelated exception types for bad files;
            # all of them mean "not a valid image" here.
            error_msg = f"Image format validation failed: {str(e)}"
            self.logger.warning("%s for %s", error_msg, file_path)
            return False, error_msg

    def calculate_file_hash(self, file_path: Path, algorithm: str = 'md5') -> Optional[str]:
        """
        Calculate hash of a file for integrity checking.

        Args:
            file_path: Path to the file
            algorithm: Hash algorithm name accepted by hashlib.new()
                ('md5', 'sha256', etc.)

        Returns:
            Hex digest of the hash, or None if the file cannot be read or
            the algorithm is not supported
        """
        try:
            hash_obj = hashlib.new(algorithm)
            with open(file_path, 'rb') as f:
                # Read in chunks so large files are never held in memory
                for chunk in iter(lambda: f.read(self.HASH_CHUNK_SIZE), b""):
                    hash_obj.update(chunk)
            return hash_obj.hexdigest()
        except Exception as e:
            self.logger.error("Error calculating hash for %s: %s", file_path, e)
            return None

    def validate_file_size(self, file_path: Path, expected_size: int, tolerance: int = 0) -> Tuple[bool, Optional[str]]:
        """
        Validate file size matches expected size.

        Args:
            file_path: Path to the file
            expected_size: Expected file size in bytes
            tolerance: Allowed absolute difference in bytes

        Returns:
            Tuple of (is_valid, error_message); error_message is None on success
        """
        try:
            actual_size = file_path.stat().st_size
            size_diff = abs(actual_size - expected_size)

            if size_diff > tolerance:
                error_msg = f"Size mismatch: expected {expected_size}, got {actual_size} (diff: {size_diff})"
                self.logger.warning("File size validation failed: %s for %s", error_msg, file_path)
                return False, error_msg

            self.logger.debug("File size validation passed: %s (%s bytes)", file_path, actual_size)
            return True, None
        except Exception as e:
            error_msg = f"Error checking file size: {str(e)}"
            self.logger.error("%s for %s", error_msg, file_path)
            return False, error_msg

    def validate_image_content(self, file_path: Path) -> Tuple[bool, Optional[str]]:
        """
        Validate image dimensions and color mode.

        The image must be between MIN_DIMENSION and MAX_DIMENSION pixels
        (inclusive) on both axes and use one of ALLOWED_MODES.

        Args:
            file_path: Path to the image file

        Returns:
            Tuple of (is_valid, error_message); error_message is None on success
        """
        try:
            with Image.open(file_path) as img:
                width, height = img.size
                mode = img.mode

            if width < self.MIN_DIMENSION or height < self.MIN_DIMENSION:
                return False, f"Image too small: {width}x{height}"
            if width > self.MAX_DIMENSION or height > self.MAX_DIMENSION:
                return False, f"Image too large: {width}x{height}"
            if mode not in self.ALLOWED_MODES:
                return False, f"Unexpected color mode: {mode}"

            self.logger.debug(
                "Image content validation passed: %s (%sx%s, %s)",
                file_path, width, height, mode,
            )
            return True, None
        except Exception as e:
            error_msg = f"Image content validation failed: {str(e)}"
            self.logger.warning("%s for %s", error_msg, file_path)
            return False, error_msg

    def comprehensive_validation(self, file_path: Path, expected_size: Optional[int] = None) -> Tuple[bool, list]:
        """
        Perform comprehensive validation of an image file.

        A missing, unreadable or empty file ends validation immediately.
        Otherwise the size (when expected_size is given), format and
        content checks all run, and every failure is collected.

        Args:
            file_path: Path to the image file
            expected_size: Expected file size in bytes (optional)

        Returns:
            Tuple of (all_valid, list_of_errors)
        """
        errors = []

        # Check if file exists
        if not file_path.exists():
            errors.append(f"File does not exist: {file_path}")
            self.logger.warning("Comprehensive validation failed for %s: %s", file_path, errors)
            return False, errors

        # The file may vanish or be unreadable between exists() and stat();
        # report that as an error rather than raising.
        try:
            actual_size = file_path.stat().st_size
        except OSError as e:
            errors.append(f"Error checking file size: {str(e)}")
            self.logger.warning("Comprehensive validation failed for %s: %s", file_path, errors)
            return False, errors

        # Check if file is not empty
        if actual_size == 0:
            errors.append(f"File is empty: {file_path}")
            self.logger.warning("Comprehensive validation failed for %s: %s", file_path, errors)
            return False, errors

        # Validate file size if expected size provided
        if expected_size is not None:
            size_valid, size_error = self.validate_file_size(file_path, expected_size)
            if not size_valid:
                errors.append(size_error)

        # Validate image format
        format_valid, format_error = self.validate_image_format(file_path)
        if not format_valid:
            errors.append(format_error)

        # Validate image content
        content_valid, content_error = self.validate_image_content(file_path)
        if not content_valid:
            errors.append(content_error)

        all_valid = not errors
        if all_valid:
            self.logger.info("Comprehensive validation passed: %s", file_path)
        else:
            self.logger.warning("Comprehensive validation failed for %s: %s", file_path, errors)
        return all_valid, errors

    def repair_corrupted_image(self, file_path: Path) -> bool:
        """
        Attempt to repair a corrupted image file by re-encoding it as JPEG.

        The re-encoded image is written to a temporary sibling file and
        only replaces the original after it passes validate_image_format().
        On any failure the original file is left untouched in place.

        Args:
            file_path: Path to the corrupted image

        Returns:
            True if repair was successful, False otherwise
        """
        temp_path = None
        try:
            # Keep the full original name in the temp name so files that
            # differ only by extension cannot collide.
            temp_path = file_path.with_name(file_path.name + '.repair.tmp')

            # Re-saving can fix minor corruption. The format is passed
            # explicitly because the temp suffix is not an image extension.
            with Image.open(file_path) as img:
                img.save(temp_path, 'JPEG', quality=95)

            # Validate the repaired image before touching the original
            is_valid, _ = self.validate_image_format(temp_path)
            if not is_valid:
                self.logger.warning("Failed to repair image: %s", file_path)
                return False

            # Swap the repaired file in only once it is known to be good
            temp_path.replace(file_path)
            self.logger.info("Successfully repaired image: %s", file_path)
            return True
        except Exception as e:
            self.logger.error("Error attempting to repair %s: %s", file_path, e)
            return False
        finally:
            if temp_path is not None:
                self._discard_file(temp_path)

    def get_image_info(self, file_path: Path) -> Optional[dict]:
        """
        Extract detailed information about an image file.

        Args:
            file_path: Path to the image file

        Returns:
            Dictionary with keys 'filename', 'format', 'mode', 'size',
            'width', 'height', 'file_size', 'has_transparency' and
            'has_exif', or None if the file cannot be opened or inspected
        """
        try:
            with Image.open(file_path) as img:
                width, height = img.size
                return {
                    'filename': file_path.name,
                    'format': img.format,
                    'mode': img.mode,
                    'size': img.size,
                    'width': width,
                    'height': height,
                    'file_size': file_path.stat().st_size,
                    'has_transparency': img.mode in ['RGBA', 'LA'] or 'transparency' in img.info,
                    'has_exif': self._has_exif(img),
                }
        except Exception as e:
            self.logger.error("Error getting image info for %s: %s", file_path, e)
            return None

    def _has_exif(self, img) -> bool:
        """Return True when the opened image exposes non-empty EXIF data."""
        # Prefer the public accessor; fall back to the legacy private one
        # for image objects that only provide that.
        reader = getattr(img, 'getexif', None) or getattr(img, '_getexif', None)
        if reader is None:
            return False
        try:
            return bool(reader())
        except Exception as e:
            # Unreadable EXIF should not discard the rest of the image info.
            self.logger.debug("Could not read EXIF data: %s", e)
            return False

    def _discard_file(self, path: Path) -> None:
        """Delete a temporary file if it exists; never raises."""
        try:
            path.unlink()
        except FileNotFoundError:
            # Already moved into place or never created.
            pass
        except OSError as e:
            self.logger.warning("Could not remove temporary file %s: %s", path, e)