| """ | |
| Image cleanup utility to remove orphaned images (not referenced in database). | |
| Can be run as a scheduled job or manually. | |
| """ | |
| import os | |
| import sqlite3 | |
| from pathlib import Path | |
| from typing import Set | |
| import logging | |
| logger = logging.getLogger(__name__) | |


def get_referenced_images(db_path: str) -> Set[str]:
    """
    Get the set of all image filenames referenced in the database.

    Args:
        db_path: Path to SQLite database

    Returns:
        Set of image filenames (basenames only)
    """
    conn = sqlite3.connect(db_path, timeout=10)
    try:
        cur = conn.cursor()
        # Check if the image_path column exists
        cur.execute("PRAGMA table_info(predictions)")
        columns = [row[1] for row in cur.fetchall()]
        if "image_path" not in columns:
            # Column doesn't exist yet, return empty set
            return set()
        # Get all non-empty image_path values
        cur.execute("SELECT DISTINCT image_path FROM predictions WHERE image_path IS NOT NULL AND image_path != ''")
        rows = cur.fetchall()
        # Extract just the filenames (basenames)
        referenced = set()
        for row in rows:
            if row[0]:
                filename = os.path.basename(row[0])
                if filename:
                    referenced.add(filename)
        return referenced
    finally:
        conn.close()
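
# Illustrative example (hypothetical path and filenames, not from this module):
#   get_referenced_images("data/app.db") -> {"cat_01.jpg", "dog_02.png"}
# Only basenames are returned, so a stored path like "uploads/cat_01.jpg" still
# matches the file "cat_01.jpg" on disk during cleanup.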


def cleanup_orphaned_images(images_dir: str, db_path: str, dry_run: bool = True) -> dict:
    """
    Remove image files that are not referenced in the database.

    Args:
        images_dir: Directory containing images
        db_path: Path to SQLite database
        dry_run: If True, only report what would be deleted without actually deleting

    Returns:
        Dict with cleanup statistics
    """
    if not os.path.exists(images_dir):
        logger.warning(f"Images directory does not exist: {images_dir}")
        return {
            "total_images": 0,
            "referenced": 0,
            "orphaned": 0,
            "deleted": 0,
            "errors": 0,
        }

    # Get referenced images from database
    referenced = get_referenced_images(db_path)
    logger.info(f"Found {len(referenced)} referenced images in database")

    # Get all image files in directory
    image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".bmp"}
    all_images = []
    for file_path in Path(images_dir).iterdir():
        if file_path.is_file() and file_path.suffix.lower() in image_extensions:
            all_images.append(file_path.name)

    total_images = len(all_images)
    logger.info(f"Found {total_images} image files in directory")

    # Find orphaned images
    orphaned = [img for img in all_images if img not in referenced]

    stats = {
        "total_images": total_images,
        "referenced": len(referenced),
        "orphaned": len(orphaned),
        "deleted": 0,
        "errors": 0,
    }

    if not orphaned:
        logger.info("No orphaned images found")
        return stats

    logger.info(f"Found {len(orphaned)} orphaned images")

    # Delete orphaned images
    for filename in orphaned:
        file_path = os.path.join(images_dir, filename)
        try:
            if not dry_run:
                os.remove(file_path)
                logger.debug(f"Deleted orphaned image: {filename}")
            else:
                logger.debug(f"Would delete orphaned image: {filename}")
            stats["deleted"] += 1
        except Exception as e:
            logger.error(f"Failed to delete {filename}: {e}")
            stats["errors"] += 1

    if dry_run:
        logger.info(f"DRY RUN: Would delete {stats['deleted']} orphaned images")
    else:
        logger.info(f"Deleted {stats['deleted']} orphaned images")

    return stats
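
# Example usage (paths are illustrative, not part of the original module):
#   report = cleanup_orphaned_images("data/images", "data/app.db", dry_run=True)
#   print(report)  # review the counts, then re-run with dry_run=False to delete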


def cleanup_old_images(images_dir: str, db_path: str, days_old: int = 30, dry_run: bool = True) -> dict:
    """
    Remove images older than the specified number of days that are not
    referenced in recent predictions.

    Args:
        images_dir: Directory containing images
        db_path: Path to SQLite database
        days_old: Remove images older than this many days
        dry_run: If True, only report what would be deleted

    Returns:
        Dict with cleanup statistics
    """
    import datetime

    if not os.path.exists(images_dir):
        return {
            "total_images": 0,
            "old_images": 0,
            "deleted": 0,
            "errors": 0,
        }

    # Calculate cutoff date
    cutoff_date = datetime.datetime.now(datetime.UTC) - datetime.timedelta(days=days_old)
    cutoff_iso = cutoff_date.isoformat()

    # Get images referenced after the cutoff
    conn = sqlite3.connect(db_path, timeout=10)
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT DISTINCT image_path
            FROM predictions
            WHERE image_path IS NOT NULL
              AND image_path != ''
              AND ts >= ?
        """, (cutoff_iso,))
        recent_images = {os.path.basename(row[0]) for row in cur.fetchall() if row[0]}
    finally:
        conn.close()

    # Find old image files that are not referenced by recent predictions
    image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".bmp"}
    total_images = 0
    old_images = []
    for file_path in Path(images_dir).iterdir():
        if file_path.is_file() and file_path.suffix.lower() in image_extensions:
            total_images += 1
            # Check the file modification time against the cutoff
            mtime = datetime.datetime.fromtimestamp(file_path.stat().st_mtime, tz=datetime.UTC)
            if mtime < cutoff_date and file_path.name not in recent_images:
                old_images.append(file_path.name)

    stats = {
        "total_images": total_images,
        "old_images": len(old_images),
        "deleted": 0,
        "errors": 0,
    }

    for filename in old_images:
        file_path = os.path.join(images_dir, filename)
        try:
            if not dry_run:
                os.remove(file_path)
            # Count would-be deletions in dry-run mode as well, so the report
            # matches the behaviour of cleanup_orphaned_images()
            stats["deleted"] += 1
        except Exception as e:
            logger.error(f"Failed to delete {filename}: {e}")
            stats["errors"] += 1

    if dry_run:
        logger.info(f"DRY RUN: Would delete {stats['deleted']} old images")
    else:
        logger.info(f"Deleted {stats['deleted']} old images")

    return stats
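

if __name__ == "__main__":
    # Minimal manual/cron entry point sketch, since the module docstring says the
    # utility can be run as a scheduled job or manually. The IMAGES_DIR and
    # DB_PATH environment variables and the default paths below are assumptions
    # for illustration; they are not defined by the original module.
    logging.basicConfig(level=logging.INFO)
    report = cleanup_orphaned_images(
        images_dir=os.environ.get("IMAGES_DIR", "data/images"),
        db_path=os.environ.get("DB_PATH", "data/app.db"),
        dry_run=True,  # flip to False once the dry-run report looks correct
    )
    logger.info(f"Cleanup report: {report}")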