emotion-detection-api / app /image_cleanup.py
HimAJ's picture
upload 32 files for the ml
1e4fc28 verified
"""
Image cleanup utility to remove orphaned images (not referenced in database).
Can be run as a scheduled job or manually.
"""
import os
import sqlite3
from pathlib import Path
from typing import Set
import logging
logger = logging.getLogger(__name__)
def get_referenced_images(db_path: str) -> Set[str]:
"""
Get set of all image filenames referenced in the database.
Returns:
Set of image filenames (basenames only)
"""
conn = sqlite3.connect(db_path, timeout=10)
try:
cur = conn.cursor()
# Check if image_path column exists
cur.execute("PRAGMA table_info(predictions)")
columns = [row[1] for row in cur.fetchall()]
if "image_path" not in columns:
# Column doesn't exist yet, return empty set
return set()
# Get all non-empty image_path values
cur.execute("SELECT DISTINCT image_path FROM predictions WHERE image_path IS NOT NULL AND image_path != ''")
rows = cur.fetchall()
# Extract just the filenames (basenames)
referenced = set()
for row in rows:
if row[0]:
filename = os.path.basename(row[0])
if filename:
referenced.add(filename)
return referenced
finally:
conn.close()
def cleanup_orphaned_images(images_dir: str, db_path: str, dry_run: bool = True) -> dict:
"""
Remove image files that are not referenced in the database.
Args:
images_dir: Directory containing images
db_path: Path to SQLite database
dry_run: If True, only report what would be deleted without actually deleting
Returns:
Dict with cleanup statistics
"""
if not os.path.exists(images_dir):
logger.warning(f"Images directory does not exist: {images_dir}")
return {
"total_images": 0,
"referenced": 0,
"orphaned": 0,
"deleted": 0,
"errors": 0,
}
# Get referenced images from database
referenced = get_referenced_images(db_path)
logger.info(f"Found {len(referenced)} referenced images in database")
# Get all image files in directory
image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".bmp"}
all_images = []
for file_path in Path(images_dir).iterdir():
if file_path.is_file() and file_path.suffix.lower() in image_extensions:
all_images.append(file_path.name)
total_images = len(all_images)
logger.info(f"Found {total_images} image files in directory")
# Find orphaned images
orphaned = [img for img in all_images if img not in referenced]
stats = {
"total_images": total_images,
"referenced": len(referenced),
"orphaned": len(orphaned),
"deleted": 0,
"errors": 0,
}
if not orphaned:
logger.info("No orphaned images found")
return stats
logger.info(f"Found {len(orphaned)} orphaned images")
# Delete orphaned images
for filename in orphaned:
file_path = os.path.join(images_dir, filename)
try:
if not dry_run:
os.remove(file_path)
logger.debug(f"Deleted orphaned image: {filename}")
else:
logger.debug(f"Would delete orphaned image: {filename}")
stats["deleted"] += 1
except Exception as e:
logger.error(f"Failed to delete {filename}: {e}")
stats["errors"] += 1
if dry_run:
logger.info(f"DRY RUN: Would delete {stats['deleted']} orphaned images")
else:
logger.info(f"Deleted {stats['deleted']} orphaned images")
return stats
def cleanup_old_images(images_dir: str, db_path: str, days_old: int = 30, dry_run: bool = True) -> dict:
"""
Remove images older than specified days that are not referenced in recent predictions.
Args:
images_dir: Directory containing images
db_path: Path to SQLite database
days_old: Remove images older than this many days
dry_run: If True, only report what would be deleted
Returns:
Dict with cleanup statistics
"""
import datetime
if not os.path.exists(images_dir):
return {
"total_images": 0,
"old_images": 0,
"deleted": 0,
"errors": 0,
}
# Calculate cutoff date
cutoff_date = datetime.datetime.now(datetime.UTC) - datetime.timedelta(days=days_old)
cutoff_iso = cutoff_date.isoformat()
# Get images referenced after cutoff
conn = sqlite3.connect(db_path, timeout=10)
try:
cur = conn.cursor()
cur.execute("""
SELECT DISTINCT image_path
FROM predictions
WHERE image_path IS NOT NULL
AND image_path != ''
AND ts >= ?
""", (cutoff_iso,))
recent_images = {os.path.basename(row[0]) for row in cur.fetchall() if row[0]}
finally:
conn.close()
# Find old images
image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".bmp"}
old_images = []
for file_path in Path(images_dir).iterdir():
if file_path.is_file() and file_path.suffix.lower() in image_extensions:
# Check file modification time
mtime = datetime.datetime.fromtimestamp(file_path.stat().st_mtime, tz=datetime.UTC)
if mtime < cutoff_date:
# Only delete if not in recent images
if file_path.name not in recent_images:
old_images.append(file_path.name)
stats = {
"total_images": len(list(Path(images_dir).iterdir())),
"old_images": len(old_images),
"deleted": 0,
"errors": 0,
}
for filename in old_images:
file_path = os.path.join(images_dir, filename)
try:
if not dry_run:
os.remove(file_path)
stats["deleted"] += 1
except Exception as e:
logger.error(f"Failed to delete {filename}: {e}")
stats["errors"] += 1
return stats