childYb / app.py
rethinks's picture
Upload app.py
f5557f4 verified
"""
Photo Selection Web App
Flask-based frontend for testing the photo selection pipeline
Now with AUTOMATIC selection - no target number needed!
Two-Stage Workflow with Review Step:
1. Upload reference photos of your child (2-3 photos)
2. Upload all event photos (e.g., 1000 photos)
3. System filters to find photos containing your child
4. USER REVIEWS filtered photos (can remove false positives)
5. Quality-based selection runs on confirmed photos
6. Final results shown
"""
import os
import json
import uuid
import shutil
from pathlib import Path
from datetime import datetime
# Load environment variables from .env file
try:
from dotenv import load_dotenv
load_dotenv()
except ImportError:
pass # dotenv not installed, use system env vars
from flask import Flask, render_template, request, jsonify, send_from_directory, send_file, session, redirect, Response
from werkzeug.utils import secure_filename
from werkzeug.exceptions import RequestEntityTooLarge
import numpy as np
from PIL import Image
import threading
import time
# Supabase integration
from supabase_storage import (
is_supabase_available,
save_dataset_to_supabase,
load_dataset_from_supabase,
list_datasets_from_supabase,
delete_dataset_from_supabase
)
# HEIC support
try:
from pillow_heif import register_heif_opener
register_heif_opener()
except ImportError:
pass
app = Flask(__name__, static_folder='static', template_folder='templates')

# Session-signing key. A deployment should provide its own secret through the
# environment (FLASK_SECRET_KEY or SECRET_KEY); the historical hard-coded value
# is kept only as a fallback so existing development setups keep working.
app.secret_key = (
    os.environ.get('FLASK_SECRET_KEY')
    or os.environ.get('SECRET_KEY')
    or 'photo_selector_secret_key_2024'
)

# Configuration: all working directories live next to this file.
_APP_DIR = os.path.dirname(os.path.abspath(__file__))
UPLOAD_FOLDER = os.path.join(_APP_DIR, 'uploads')
RESULTS_FOLDER = os.path.join(_APP_DIR, 'results')
REFERENCE_FOLDER = os.path.join(_APP_DIR, 'references')
OUTPUT_FOLDER = os.path.join(_APP_DIR, 'selected_photos')  # Auto-save location
DATASETS_FOLDER = os.path.join(_APP_DIR, 'datasets')  # Saved datasets
ALLOWED_EXTENSIONS = {'jpg', 'jpeg', 'png', 'heic', 'heif', 'webp'}
MAX_CONTENT_LENGTH = 5 * 1024 * 1024 * 1024  # 5GB max (for large photo batches)

app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = MAX_CONTENT_LENGTH
app.config['MAX_FORM_MEMORY_SIZE'] = 5 * 1024 * 1024 * 1024  # 5GB for form data
app.config['MAX_FORM_PARTS'] = 10000  # Allow up to 10000 files in one upload

# Create directories (OUTPUT_FOLDER is not created here; it is created on
# demand when photos are auto-saved).
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(RESULTS_FOLDER, exist_ok=True)
os.makedirs(REFERENCE_FOLDER, exist_ok=True)
os.makedirs(DATASETS_FOLDER, exist_ok=True)

# In-memory, per-process state shared by the request handlers and the
# background processing functions.
# Store processing status (job_id -> status dict)
processing_jobs = {}
# Store face matchers for sessions (reuse to avoid reloading model)
face_matchers = {}
# Store chunked upload sessions
upload_sessions = {}
# Error handler for large uploads
@app.errorhandler(RequestEntityTooLarge)
def handle_large_upload(error):
    """Answer an over-sized upload with a JSON error body and HTTP 413."""
    payload = {
        'error': 'Upload too large. Try uploading fewer files at once (max ~500 files per batch).'
    }
    return jsonify(payload), 413
def allowed_file(filename):
    """Return True when ``filename`` has an extension listed in ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot, compared case-insensitively;
    names without any dot are rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS
def create_thumbnail(image_path, thumb_path, size=(300, 300)):
    """Create a JPEG thumbnail for display, honouring the EXIF orientation.

    Args:
        image_path: Path of the source image (any format Pillow can open).
        thumb_path: Destination path; the thumbnail is always written as JPEG.
        size: Maximum (width, height) of the thumbnail; aspect ratio is kept.

    Returns:
        True if the thumbnail was written, False if any step failed (the
        error is printed, never raised).
    """
    from PIL import ImageOps

    exif_orientation_tag = 0x0112  # standard EXIF "Orientation" tag id

    try:
        with Image.open(image_path) as img:
            # Apply EXIF orientation before creating the thumbnail. This is
            # best-effort: a missing or malformed EXIF block must not prevent
            # the thumbnail from being produced.
            try:
                orientation = img.getexif().get(exif_orientation_tag)
                # Only transpose when actually required, so un-rotated images
                # stay lazily loaded for the thumbnail() call below.
                if orientation not in (None, 1):
                    # exif_transpose handles all eight orientations,
                    # including the mirrored ones.
                    img = ImageOps.exif_transpose(img)
            except Exception as exif_error:
                print(f"Warning: could not apply EXIF orientation: {exif_error}")

            # JPEG output cannot store alpha/palette modes.
            if img.mode != 'RGB':
                img = img.convert('RGB')
            img.thumbnail(size, Image.Resampling.LANCZOS)
            img.save(thumb_path, 'JPEG', quality=85)
        return True
    except Exception as e:
        print(f"Error creating thumbnail: {e}")
        return False
def get_thumbnail_name(filename):
    """
    Build the thumbnail file name for ``filename``.

    The original extension is folded into the name so that two sources that
    differ only by extension do not map to the same thumbnail:
        IMG_5801.HEIC -> thumb_IMG_5801_HEIC.jpg
        IMG_5801.jpg  -> thumb_IMG_5801_jpg.jpg
    A name without any dot simply becomes thumb_<name>.jpg.
    """
    if '.' not in filename:
        return f"thumb_{filename}.jpg"
    # Split on the LAST dot only, so dotted stems are preserved.
    stem, _, extension = filename.rpartition('.')
    return f"thumb_{stem}_{extension}.jpg"
def _photo_timestamp_or_none(photo_path):
    """Best-effort lookup of a photo's timestamp as a POSIX float.

    Delegates to photo_selector.utils.get_photo_timestamp and returns None when
    that helper cannot be imported, raises, or yields no datetime.
    """
    try:
        from photo_selector.utils import get_photo_timestamp
        dt = get_photo_timestamp(photo_path)
        return dt.timestamp() if dt else None
    except Exception:
        # A missing timestamp is tolerated by the caller (such entries sort
        # as 0), so a failure here must not abort the whole job.
        return None


def process_photos_face_filter_only(job_id, upload_dir, session_id=None):
    """
    Phase 1: Face filtering only.

    Scans every allowed image in ``upload_dir`` with the session's face
    matcher, creates thumbnails for the matches, and stores the outcome so the
    user can review it before quality selection.

    Progress and results are reported through ``processing_jobs[job_id]``
    ('status', 'progress', 'message', 'total_photos', 'photos_checked',
    'review_data'); the review data is also written to
    ``RESULTS_FOLDER/<job_id>_review.json``. On success the job status becomes
    'review_pending'; on any failure it becomes 'error' with the message set.

    Args:
        job_id: Key of an existing entry in ``processing_jobs``.
        upload_dir: Directory containing the photos to scan.
        session_id: Key into ``face_matchers``. The job fails with
            'No reference photos loaded' when it is missing or when its
            matcher reports zero references.

    Returns:
        None. All output goes through ``processing_jobs`` and the review file.
    """
    try:
        print(f"\n{'='*60}")
        print(f"[Job {job_id}] PHASE 1: Face Filtering Started")
        print(f"{'='*60}")
        processing_jobs[job_id]['status'] = 'processing'
        processing_jobs[job_id]['progress'] = 5
        processing_jobs[job_id]['message'] = 'Loading face recognition AI...'
        print(f"[Job {job_id}] Loading InsightFace face recognition model...")
        # Not referenced below; kept so that a broken photo_selector install
        # still surfaces at this point, as it did before.
        from photo_selector.face_matcher import FaceMatcher  # noqa: F401

        # Get face matcher (only usable if it already holds reference photos)
        face_matcher = None
        if session_id and session_id in face_matchers:
            face_matcher = face_matchers[session_id]
            if face_matcher.get_reference_count() == 0:
                face_matcher = None
        if face_matcher is None:
            print(f"[Job {job_id}] ERROR: No reference photos loaded!")
            processing_jobs[job_id]['status'] = 'error'
            processing_jobs[job_id]['message'] = 'No reference photos loaded'
            return

        ref_count = face_matcher.get_reference_count()
        print(f"[Job {job_id}] Reference photos loaded: {ref_count}")
        processing_jobs[job_id]['progress'] = 10
        processing_jobs[job_id]['message'] = 'Scanning photos for your child using InsightFace...'

        # Get all photo files (skip previously generated thumbnails)
        photo_files = [
            f for f in os.listdir(upload_dir)
            if allowed_file(f) and not f.startswith('thumb_')
        ]
        total_photos = len(photo_files)
        print(f"[Job {job_id}] Total photos to scan: {total_photos}")
        processing_jobs[job_id]['total_photos'] = total_photos
        processing_jobs[job_id]['message'] = f'Scanning {total_photos} photos for your child...'

        # Create thumbnails directory - always in uploads/<job_id>/thumbnails
        # This ensures thumbnails work for both browser upload and local folder mode
        is_local_folder = processing_jobs[job_id].get('is_local_folder', False)
        if is_local_folder:
            thumbs_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails')
        else:
            thumbs_dir = os.path.join(upload_dir, 'thumbnails')
        os.makedirs(thumbs_dir, exist_ok=True)

        # Get all photo paths
        photo_paths = [os.path.join(upload_dir, fn) for fn in photo_files]

        # Progress callback to update photos_checked
        def progress_callback(current, total, message):
            processing_jobs[job_id]['photos_checked'] = current
            processing_jobs[job_id]['message'] = f'Checked {current}/{total} photos...'
            # Scanning owns the 30-70% band; thumbnail creation continues
            # from 70%, so the bar never moves backwards between the phases.
            progress_pct = 30 + int((current / total) * 40) if total > 0 else 30
            processing_jobs[job_id]['progress'] = progress_pct

        # Run face filtering
        print(f"[Job {job_id}] Starting face detection and matching...")
        processing_jobs[job_id]['progress'] = 30
        filter_results = face_matcher.filter_photos(photo_paths, progress_callback=progress_callback)
        if 'error' in filter_results:
            print(f"[Job {job_id}] ERROR: Face matching failed - {filter_results['error']}")
            processing_jobs[job_id]['status'] = 'error'
            processing_jobs[job_id]['message'] = f"Face matching error: {filter_results['error']}"
            return

        # Print statistics
        stats = filter_results.get('statistics', {})
        matched_results = filter_results.get('matched_photos', [])
        matched_count = len(matched_results)
        unmatched_count = len(filter_results.get('unmatched_photos', []))
        print(f"\n[Job {job_id}] Face Filtering Results:")
        print(f" - Photos with your child: {matched_count}")
        print(f" - Photos without match: {unmatched_count}")
        print(f" - Photos with no faces: {stats.get('no_faces', 0)}")
        # Handle match_rate which may be a string or float
        match_rate = stats.get('match_rate', 0)
        if isinstance(match_rate, str):
            print(f" - Match rate: {match_rate}")
        else:
            print(f" - Match rate: {match_rate:.1%}")

        processing_jobs[job_id]['progress'] = 70
        processing_jobs[job_id]['message'] = f'Creating thumbnails: 0/{matched_count}'
        print(f"[Job {job_id}] Creating thumbnails for {matched_count} matched photos...")

        # Prepare filtered photo data
        filtered_photos = []
        for i, match in enumerate(matched_results):
            filename = os.path.basename(match['path'])
            thumb_name = get_thumbnail_name(filename)
            thumb_path = os.path.join(thumbs_dir, thumb_name)
            create_thumbnail(match['path'], thumb_path)
            filtered_photos.append({
                'filename': filename,
                'thumbnail': thumb_name,
                'face_match_score': match['similarity'],
                'num_faces': match['num_faces'],
                'matched_face_idx': match.get('matched_face_idx', 0),
                'face_bboxes': match.get('face_bboxes', [])  # Cached face locations for scoring
            })
            done = i + 1
            # Progress update every 10 photos or on last photo (70-95% band)
            if done % 10 == 0 or done == matched_count:
                processing_jobs[job_id]['progress'] = 70 + int((done / matched_count) * 25)
                processing_jobs[job_id]['message'] = f'Creating thumbnails: {done}/{matched_count}'
                print(f"[Job {job_id}] Thumbnails created: {done}/{matched_count}")

        # Sort by face match score (highest first)
        filtered_photos.sort(key=lambda x: x['face_match_score'], reverse=True)

        # Prepare unmatched photos data (photos where target was NOT found)
        unmatched_photos = []
        for unmatch in filter_results.get('unmatched_photos', []):
            unmatched_photos.append({
                'filename': os.path.basename(unmatch['path']),
                'best_similarity': unmatch.get('best_similarity', 0),
                'num_faces': unmatch.get('num_faces', 0),
                'timestamp': _photo_timestamp_or_none(unmatch['path'])
            })
        # Also include photos with no faces detected
        for no_face in filter_results.get('no_faces_photos', []):
            unmatched_photos.append({
                'filename': os.path.basename(no_face['path']),
                'best_similarity': 0,
                'num_faces': 0,
                'timestamp': _photo_timestamp_or_none(no_face['path'])
            })
        # Also include photos that had processing errors
        for error_photo in filter_results.get('error_photos', []):
            unmatched_photos.append({
                'filename': os.path.basename(error_photo['path']),
                'best_similarity': 0,
                'num_faces': 0,
                'timestamp': _photo_timestamp_or_none(error_photo['path']),
                'error': error_photo.get('error', 'Processing error')
            })
        # Sort unmatched by timestamp (unknown timestamps sort first, as 0)
        unmatched_photos.sort(key=lambda x: x.get('timestamp') or 0)

        # Store results for review
        review_data = {
            'total_uploaded': total_photos,
            'filtered_photos': filtered_photos,
            'unmatched_photos': unmatched_photos,
            'statistics': stats,
            'reference_count': face_matcher.get_reference_count()
        }

        # Save review data (default=str stringifies anything json cannot encode)
        review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
        with open(review_file, 'w') as f:
            json.dump(review_data, f, indent=2, default=str)

        processing_jobs[job_id]['progress'] = 100
        processing_jobs[job_id]['status'] = 'review_pending'
        processing_jobs[job_id]['message'] = f'Found your child in {len(filtered_photos)} of {total_photos} photos!'
        processing_jobs[job_id]['review_data'] = review_data
        print(f"\n[Job {job_id}] PHASE 1 COMPLETE!")
        print(f" - Found {len(filtered_photos)} photos of your child")
        print(f" - Status: review_pending (waiting for user to confirm)")
        print(f" - Review data saved to: {review_file}")
        print(f"{'='*60}\n")
    except Exception as e:
        # Top-level boundary of the background job: report instead of raising.
        print(f"[Job {job_id}] EXCEPTION: {str(e)}")
        processing_jobs[job_id]['status'] = 'error'
        processing_jobs[job_id]['message'] = str(e)
        import traceback
        traceback.print_exc()
def process_drive_with_parallel_face_detection(job_id, folder_id, upload_dir, face_matcher):
    """
    HYBRID APPROACH: Download files from Google Drive while running face detection in parallel.

    Flow:
    - ``download_folder`` downloads files and reports each finished file
      through ``on_file_ready``, which puts its path on a queue.
    - Four worker threads take paths from the queue and run
      ``face_matcher.check_photo_for_target`` on them as they arrive.
    - When both are finished, thumbnails are created for the matches and the
      review data is stored in ``processing_jobs[job_id]`` and written to
      ``RESULTS_FOLDER/<job_id>_review.json``.

    A file whose detection raises is recorded as an error photo (and still
    acknowledged on the queue), so one bad file can neither disappear from the
    results nor block completion. If the download itself fails, the workers
    are told to stop and the job is marked 'error'.

    Args:
        job_id: Key of an existing entry in ``processing_jobs``.
        folder_id: Google Drive folder identifier passed to ``download_folder``.
        upload_dir: Local directory the files are downloaded into.
        face_matcher: Matcher providing ``check_photo_for_target`` and
            ``get_reference_count``.

    Returns:
        None. All output goes through ``processing_jobs`` and the review file.
    """
    import queue
    import threading

    print(f"\n{'='*60}")
    print(f"[Job {job_id}] HYBRID MODE: Parallel Download + Face Detection")
    print(f"{'='*60}")

    # Shared state (result lists and counters are guarded by results_lock)
    file_queue = queue.Queue()
    results_lock = threading.Lock()
    matched_photos = []
    unmatched_photos = []
    no_faces_photos = []
    error_photos = []

    # Counters are one-element lists so the nested functions can mutate them.
    download_complete = threading.Event()
    abort = threading.Event()  # set when the download fails; workers stop early
    total_files = [0]
    downloaded_count = [0]
    processed_count = [0]

    def classify(filepath, result):
        """Map a detection result to (destination list, entry to append)."""
        if 'error' in result:
            return error_photos, {'path': filepath, 'error': result['error']}
        if result['num_faces'] == 0:
            return no_faces_photos, {'path': filepath, 'num_faces': 0}
        if result['contains_target']:
            return matched_photos, {
                'path': filepath,
                'similarity': result['best_match_similarity'],
                'num_faces': result['num_faces'],
                'all_similarities': result.get('all_face_similarities', []),
                'face_bboxes': result.get('face_bboxes', [])
            }
        return unmatched_photos, {
            'path': filepath,
            'best_similarity': result['best_match_similarity'],
            'num_faces': result['num_faces']
        }

    # Face detection worker
    def face_detection_worker():
        """Process files from queue as they become available."""
        while not abort.is_set():
            # Wait for file or check if download is complete
            try:
                filepath = file_queue.get(timeout=1.0)
            except queue.Empty:
                # Check if download is complete and queue is empty
                if download_complete.is_set() and file_queue.empty():
                    break
                continue
            try:
                if filepath is None:  # Poison pill
                    break
                # Process the file; a failure becomes an error entry rather
                # than a silently dropped photo.
                try:
                    result = face_matcher.check_photo_for_target(filepath)
                    bucket, entry = classify(filepath, result)
                except Exception as e:
                    print(f"[Job {job_id}] Face detection error: {e}")
                    bucket = error_photos
                    entry = {'path': filepath, 'error': str(e)}
                with results_lock:
                    processed_count[0] += 1
                    checked = processed_count[0]
                    bucket.append(entry)
                    # Update progress (use unified message format)
                    if checked % 10 == 0 and not abort.is_set():
                        # After downloads complete, show scan-only progress
                        if download_complete.is_set():
                            pct = 30 + int((checked / max(total_files[0], 1)) * 40)
                            processing_jobs[job_id]['progress'] = min(pct, 70)
                            processing_jobs[job_id]['message'] = f'Scanning faces: {checked}/{total_files[0]}'
                        processing_jobs[job_id]['photos_checked'] = checked
                        print(f"[Job {job_id}] [HYBRID] Downloaded: {downloaded_count[0]}, Face checked: {checked}, Matched: {len(matched_photos)}")
            except Exception as e:
                print(f"[Job {job_id}] Face detection error: {e}")
            finally:
                # Every get() must be acknowledged, even on failure, or
                # file_queue.join() in the main flow would block forever.
                file_queue.task_done()

    # Callback when file is downloaded
    def on_file_ready(filepath):
        """Called by download_folder when each file is ready."""
        with results_lock:
            downloaded_count[0] += 1
        file_queue.put(filepath)

    # Progress callback for download
    def download_progress(current, total, _filename):
        total_files[0] = total
        # Download owns the 5-30% band; guard against an empty folder.
        pct = 5 + int((current / total) * 25) if total > 0 else 5
        processing_jobs[job_id]['progress'] = pct
        processing_jobs[job_id]['message'] = f'Downloading: {current}/{total}, Scanning: {processed_count[0]}'
        processing_jobs[job_id]['total_files'] = total

    try:
        processing_jobs[job_id]['status'] = 'processing'
        processing_jobs[job_id]['progress'] = 5
        processing_jobs[job_id]['message'] = 'Starting parallel download and face detection...'

        # Start face detection workers (use multiple threads for better throughput)
        num_workers = 4  # Face detection threads
        workers = []
        for _ in range(num_workers):
            t = threading.Thread(target=face_detection_worker)
            t.daemon = True
            t.start()
            workers.append(t)
        print(f"[Job {job_id}] Started {num_workers} face detection workers")

        # Start download (this will call on_file_ready for each file)
        print(f"[Job {job_id}] Starting Google Drive download with parallel face detection...")
        try:
            download_folder(
                folder_id,
                upload_dir,
                progress_callback=download_progress,
                file_ready_callback=on_file_ready
            )
        except BaseException:
            # Stop the workers instead of letting them scan (and overwrite
            # the job's error message) after the job has already failed.
            abort.set()
            raise
        finally:
            # Signal download complete - always, so idle workers can exit.
            download_complete.set()
        print(f"[Job {job_id}] Download complete. Waiting for face detection to finish...")

        # Wait for queue to be processed
        file_queue.join()
        # Send poison pills to stop workers
        for _ in workers:
            file_queue.put(None)
        # Wait for workers to finish
        for t in workers:
            t.join(timeout=5.0)

        print(f"\n[Job {job_id}] HYBRID Face Detection Results:")
        print(f" - Photos with your child: {len(matched_photos)}")
        print(f" - Photos without match: {len(unmatched_photos)}")
        print(f" - Photos with no faces: {len(no_faces_photos)}")
        print(f" - Photos with errors: {len(error_photos)}")
        if error_photos:
            print(f" [ERRORS] First 5 error photos:")
            for ep in error_photos[:5]:
                print(f" - {os.path.basename(ep['path'])}: {ep.get('error', 'Unknown error')}")

        # Now create thumbnails and prepare review data
        processing_jobs[job_id]['progress'] = 75
        processing_jobs[job_id]['message'] = f'Creating thumbnails for {len(matched_photos)} photos...'
        thumbs_dir = os.path.join(upload_dir, 'thumbnails')
        os.makedirs(thumbs_dir, exist_ok=True)

        filtered_photos = []
        for i, match in enumerate(matched_photos):
            filename = os.path.basename(match['path'])
            thumb_name = get_thumbnail_name(filename)
            thumb_path = os.path.join(thumbs_dir, thumb_name)
            create_thumbnail(match['path'], thumb_path)
            filtered_photos.append({
                'filename': filename,
                'thumbnail': thumb_name,
                'face_match_score': match['similarity'],
                'num_faces': match['num_faces'],
                'face_bboxes': match.get('face_bboxes', [])
            })
            if (i + 1) % 20 == 0:
                processing_jobs[job_id]['message'] = f'Creating thumbnails: {i + 1}/{len(matched_photos)}'

        # Sort by face match score
        filtered_photos.sort(key=lambda x: x['face_match_score'], reverse=True)

        # Prepare unmatched data
        unmatched_data = []
        for unmatch in unmatched_photos:
            unmatched_data.append({
                'filename': os.path.basename(unmatch['path']),
                'best_similarity': unmatch.get('best_similarity', 0),
                'num_faces': unmatch.get('num_faces', 0)
            })
        for no_face in no_faces_photos:
            unmatched_data.append({
                'filename': os.path.basename(no_face['path']),
                'best_similarity': 0,
                'num_faces': 0
            })
        # Also add error photos to unmatched (so they're visible to user)
        for error_photo in error_photos:
            unmatched_data.append({
                'filename': os.path.basename(error_photo['path']),
                'best_similarity': 0,
                'num_faces': 0,
                'error': error_photo.get('error', 'Processing error')
            })

        # Store results
        review_data = {
            'total_uploaded': total_files[0],
            'filtered_photos': filtered_photos,
            'unmatched_photos': unmatched_data,
            'statistics': {
                'total_scanned': total_files[0],
                'matched': len(matched_photos),
                'unmatched': len(unmatched_photos),
                'no_faces': len(no_faces_photos),
                'errors': len(error_photos),
                'match_rate': f"{(len(matched_photos) / max(total_files[0], 1) * 100):.1f}%"
            },
            'reference_count': face_matcher.get_reference_count()
        }

        # Save review data
        review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
        with open(review_file, 'w') as f:
            json.dump(review_data, f, indent=2, default=str)

        processing_jobs[job_id]['progress'] = 100
        processing_jobs[job_id]['status'] = 'review_pending'
        processing_jobs[job_id]['message'] = f'Found your child in {len(filtered_photos)} of {total_files[0]} photos!'
        processing_jobs[job_id]['review_data'] = review_data
        print(f"\n[Job {job_id}] HYBRID MODE COMPLETE!")
        print(f" - Found {len(filtered_photos)} photos of your child")
        print(f"{'='*60}\n")
    except Exception as e:
        # Top-level boundary of the background job: report instead of raising.
        print(f"[Job {job_id}] HYBRID EXCEPTION: {str(e)}")
        processing_jobs[job_id]['status'] = 'error'
        processing_jobs[job_id]['message'] = str(e)
        import traceback
        traceback.print_exc()
def _group_photos_by_month(photos):
    """Bucket photo dicts by their 'month' value ('Unknown' when missing or empty)."""
    grouped = {}
    for photo in photos:
        grouped.setdefault(photo.get('month') or 'Unknown', []).append(photo)
    return grouped


def _copy_photos_to_month_folders(upload_dir, dest_base, label, photos_by_month):
    """Copy each month's photos into dest_base/<month>/ and return how many were copied."""
    copied = 0
    for month, photos in photos_by_month.items():
        month_folder = os.path.join(dest_base, month)
        os.makedirs(month_folder, exist_ok=True)
        print(f" [{label}/{month}] Saving {len(photos)} photos...")
        for photo in photos:
            src_path = os.path.join(upload_dir, photo['filename'])
            dst_path = os.path.join(month_folder, photo['filename'])
            # Sources that no longer exist are skipped and not counted.
            if os.path.exists(src_path):
                shutil.copy2(src_path, dst_path)
                copied += 1
    return copied


def _write_month_file_listing(f, title, marker, noun, photos_by_month):
    """Write one 'FILES BY MONTH' section of the summary, best score first."""
    f.write("\n" + "=" * 60 + "\n")
    f.write(f" {title}\n")
    f.write("=" * 60 + "\n")
    for month, photos in sorted(photos_by_month.items()):
        f.write(f"\n[{month}] - {len(photos)} {noun} photos:\n")
        for photo in sorted(photos, key=lambda x: x.get('score') or 0, reverse=True):
            score = (photo.get('score') or 0) * 100
            cluster = photo.get('cluster_id', -1)
            f.write(f" {marker} {photo['filename']} (Score: {score:.0f}%, Cluster: {cluster})\n")


def save_photos_by_month(job_id, upload_dir, selected_photos, rejected_photos, month_stats):
    """
    Automatically save both selected and not-selected photos organized by month.

    Creates folder structure:
        selected_photos/
        └── {job_id}_{timestamp}/
            ├── selected/
            │   ├── Jan/
            │   │   ├── photo1.jpg
            │   │   └── photo2.jpg
            │   ├── Feb/
            │   │   └── photo3.jpg
            │   └── ...
            ├── not_selected/
            │   ├── Jan/
            │   │   └── photo4.jpg
            │   ├── Feb/
            │   │   └── photo5.jpg
            │   └── ...
            └── summary.txt

    Args:
        job_id: The job identifier
        upload_dir: Source directory containing original photos
        selected_photos: List of selected photo dicts with 'filename' and 'month' keys
            ('score' and 'cluster_id' are used in the summary when present)
        rejected_photos: List of rejected photo dicts with 'filename' and 'month' keys
        month_stats: List of per-month dicts with 'month', 'selected' and
            'total_photos' keys, used for the summary table

    Returns:
        Path to the output folder, or None if anything failed (the error and
        traceback are printed, never raised).
    """
    try:
        # Create output folder with timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_base = os.path.join(OUTPUT_FOLDER, f"{job_id}_{timestamp}")
        os.makedirs(output_base, exist_ok=True)
        print(f"\n{'='*60}")
        print(f" AUTO-SAVING PHOTOS BY MONTH (SELECTED & NOT SELECTED)")
        print(f"{'='*60}")
        print(f" Output folder: {output_base}")

        # Create selected and not_selected folders
        selected_base = os.path.join(output_base, "selected")
        not_selected_base = os.path.join(output_base, "not_selected")
        os.makedirs(selected_base, exist_ok=True)
        os.makedirs(not_selected_base, exist_ok=True)

        # Group both lists by month
        selected_by_month = _group_photos_by_month(selected_photos)
        rejected_by_month = _group_photos_by_month(rejected_photos)

        # Copy SELECTED photos to month folders
        print(f"\n --- SELECTED PHOTOS ---")
        total_selected_copied = _copy_photos_to_month_folders(
            upload_dir, selected_base, "selected", selected_by_month
        )
        # Copy NOT SELECTED photos to month folders
        print(f"\n --- NOT SELECTED PHOTOS ---")
        total_rejected_copied = _copy_photos_to_month_folders(
            upload_dir, not_selected_base, "not_selected", rejected_by_month
        )

        # Create summary file. UTF-8 is explicit so that non-ASCII filenames
        # cannot fail on platforms whose default text encoding is narrower.
        summary_path = os.path.join(output_base, "summary.txt")
        with open(summary_path, 'w', encoding='utf-8') as f:
            f.write("=" * 60 + "\n")
            f.write(" PHOTO SELECTION SUMMARY\n")
            f.write("=" * 60 + "\n\n")
            f.write(f"Job ID: {job_id}\n")
            f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Total Selected: {total_selected_copied} photos\n")
            f.write(f"Total Not Selected: {total_rejected_copied} photos\n")
            f.write(f"Grand Total: {total_selected_copied + total_rejected_copied} photos\n\n")
            f.write("-" * 40 + "\n")
            f.write(" BREAKDOWN BY MONTH\n")
            f.write("-" * 40 + "\n\n")
            f.write(f"{'Month':<12} {'Selected':>10} {'Not Selected':>14} {'Total':>8}\n")
            f.write(f"{'-'*12} {'-'*10} {'-'*14} {'-'*8}\n")
            for stat in month_stats:
                month = stat['month']
                selected = stat['selected']
                total = stat['total_photos']
                not_selected = total - selected
                f.write(f"{month:<12} {selected:>10} {not_selected:>14} {total:>8}\n")
            # Selected files by month
            _write_month_file_listing(f, "SELECTED FILES BY MONTH", "+", "selected", selected_by_month)
            # Not selected files by month
            _write_month_file_listing(f, "NOT SELECTED FILES BY MONTH", "-", "not selected", rejected_by_month)

        print(f"\n SUMMARY:")
        print(f" - Selected photos saved: {total_selected_copied}")
        print(f" - Not selected photos saved: {total_rejected_copied}")
        print(f" - Total photos saved: {total_selected_copied + total_rejected_copied}")
        print(f" - Summary written to: {summary_path}")
        print(f"{'='*60}\n")
        return output_base
    except Exception as e:
        # Auto-save is a convenience step: report the failure, do not raise.
        print(f"[ERROR] Failed to save photos by month: {str(e)}")
        import traceback
        traceback.print_exc()
        return None
def process_photos_quality_selection(job_id, upload_dir, quality_mode, similarity_threshold, confirmed_photos, face_data_cache=None, embedding_model='siglip'):
"""
Phase 2: Month-based category-aware photo selection.
Selects ~40 best photos per month with category diversity.
Args:
face_data_cache: Dict of filename -> {'num_faces': int, 'face_bboxes': list}
Cached face data from Step 2 to avoid re-detection
embedding_model: 'siglip' or 'clip' - which embedding model to use
"""
face_data_cache = face_data_cache or {}
try:
print(f"\n{'='*60}")
print(f"[Job {job_id}] PHASE 2: Monthly Category-Aware Selection Started")
print(f"{'='*60}")
print(f"[Job {job_id}] Confirmed photos: {len(confirmed_photos)}")
print(f"[Job {job_id}] Quality mode: {quality_mode}")
print(f"[Job {job_id}] Similarity threshold: {similarity_threshold}")
print(f"[Job {job_id}] Embedding model: {embedding_model.upper()}")
processing_jobs[job_id]['status'] = 'processing'
processing_jobs[job_id]['progress'] = 5
processing_jobs[job_id]['message'] = f'Loading {embedding_model.upper()} model...'
# Import the appropriate embedder based on selection
from photo_selector.monthly_selector import MonthlyPhotoSelector
if embedding_model == 'clip':
from photo_selector.clip_embeddings import CLIPEmbedder as Embedder
model_display_name = 'CLIP'
else:
from photo_selector.siglip_embeddings import SigLIPEmbedder as Embedder
model_display_name = 'SigLIP'
# Determine target per month based on quality mode
if quality_mode == 'keep_more':
target_per_month = 60 # More photos per month
elif quality_mode == 'strict':
target_per_month = 25 # Fewer, higher quality
else: # balanced
target_per_month = 40 # Default
print(f"[Job {job_id}] Target per month: {target_per_month}")
# Step 1: Generate embeddings for confirmed photos (with caching)
processing_jobs[job_id]['progress'] = 10
processing_jobs[job_id]['message'] = f'Checking embedding cache...'
print(f"[Job {job_id}] Processing {len(confirmed_photos)} photos for {model_display_name} embeddings...")
# Import cache functions
from supabase_storage import (
compute_file_hash,
get_cached_embeddings_batch,
save_embeddings_batch,
is_supabase_available
)
# Step 1a: Compute hashes for all files
file_hashes = {} # filename -> hash
hash_to_filename = {} # hash -> filename (for reverse lookup)
print(f"[Job {job_id}] Computing file hashes...")
for i, filename in enumerate(confirmed_photos):
filepath = os.path.join(upload_dir, filename)
if os.path.exists(filepath):
file_hash = compute_file_hash(filepath)
if file_hash:
file_hashes[filename] = file_hash
hash_to_filename[file_hash] = filename
# Update progress (10-15%)
if i % 100 == 0:
progress = 10 + int((i / len(confirmed_photos)) * 5)
processing_jobs[job_id]['progress'] = progress
print(f"[Job {job_id}] Computed {len(file_hashes)} hashes")
# Step 1b: Check cache for existing embeddings
embeddings = {}
cached_count = 0
uncached_filenames = []
if is_supabase_available() and file_hashes:
processing_jobs[job_id]['message'] = f'Checking embedding cache...'
all_hashes = list(file_hashes.values())
# Query cache in batches (Supabase has query limits)
cached_embeddings = {}
batch_size = 500
for i in range(0, len(all_hashes), batch_size):
batch_hashes = all_hashes[i:i + batch_size]
batch_result = get_cached_embeddings_batch(batch_hashes, embedding_model)
cached_embeddings.update(batch_result)
# Map cached embeddings back to filenames
for filename, file_hash in file_hashes.items():
if file_hash in cached_embeddings:
embeddings[filename] = cached_embeddings[file_hash]
cached_count += 1
else:
uncached_filenames.append(filename)
print(f"[Job {job_id}] Cache hit: {cached_count}/{len(file_hashes)} embeddings")
else:
uncached_filenames = list(file_hashes.keys())
print(f"[Job {job_id}] Cache not available, computing all embeddings")
# Step 1c: Compute embeddings for uncached files only
newly_computed = {}
if uncached_filenames:
processing_jobs[job_id]['message'] = f'Analyzing {len(uncached_filenames)} photos with {model_display_name}...'
print(f"[Job {job_id}] Computing {model_display_name} embeddings for {len(uncached_filenames)} uncached photos...")
embedder = Embedder()
for i, filename in enumerate(uncached_filenames):
filepath = os.path.join(upload_dir, filename)
if os.path.exists(filepath):
img = embedder.load_image(filepath)
if img is not None:
embedding = embedder.get_embedding(img)
if embedding is not None:
embeddings[filename] = embedding
newly_computed[filename] = embedding
img.close()
# Update progress (15-30%)
progress = 15 + int((i / len(uncached_filenames)) * 15)
processing_jobs[job_id]['progress'] = progress
print(f"[Job {job_id}] Computed {len(newly_computed)} new embeddings")
# Step 1d: Save newly computed embeddings to cache
if newly_computed and is_supabase_available():
processing_jobs[job_id]['message'] = 'Saving embeddings to cache...'
saved = save_embeddings_batch(newly_computed, file_hashes, embedding_model)
print(f"[Job {job_id}] Saved {saved} embeddings to cache")
print(f"[Job {job_id}] Total embeddings: {len(embeddings)} (cached: {cached_count}, computed: {len(newly_computed)})")
# Step 2: Initialize monthly selector
processing_jobs[job_id]['progress'] = 35
processing_jobs[job_id]['message'] = 'Grouping photos by month...'
# Note: duplicate_threshold is for CLIP embedding similarity (0.85 catches exact near-dupes)
# diversity_threshold ensures we don't select visually similar photos (different scenes)
# This is separate from face similarity_threshold (0.4-0.5 for face matching)
selector = MonthlyPhotoSelector(
target_per_month=target_per_month,
duplicate_threshold=0.85, # Remove exact duplicates (same moment, slight angle change)
diversity_threshold=0.75 # Ensure selected photos are visually diverse
)
# Step 3: Group photos by month (only confirmed photos)
# We need to manually build the photos_by_month structure for confirmed photos
from collections import defaultdict
MONTH_NAMES = {
1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr",
5: "May", 6: "Jun", 7: "Jul", 8: "Aug",
9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec"
}
photos_by_month = defaultdict(list)
# Debug: Track timestamp extraction success
timestamp_found = 0
timestamp_missing = 0
for filename in confirmed_photos:
filepath = os.path.join(upload_dir, filename)
if not os.path.exists(filepath):
print(f"[TIMESTAMP DEBUG] File not found: {filepath}")
continue
dt = selector.get_photo_date(filepath)
if dt:
timestamp_found += 1
else:
timestamp_missing += 1
# Get cached face data if available
cached_face = face_data_cache.get(filename, {})
photo_info = {
'filename': filename,
'filepath': filepath,
'date': dt.isoformat() if dt else None,
'month': MONTH_NAMES.get(dt.month, "Unknown") if dt else "Unknown",
'timestamp': dt.timestamp() if dt else None,
# Cached face data from Step 2 (avoids re-detection)
'num_faces': cached_face.get('num_faces'),
'face_bboxes': cached_face.get('face_bboxes', [])
}
photos_by_month[photo_info['month']].append(photo_info)
# Sort months in calendar order
month_order = list(MONTH_NAMES.values()) + ['Unknown']
photos_by_month = {m: photos_by_month[m] for m in month_order if m in photos_by_month}
print(f"[TIMESTAMP DEBUG] Timestamps found: {timestamp_found}, missing: {timestamp_missing}")
print(f"[Job {job_id}] Photos grouped into {len(photos_by_month)} months:")
for month, photos in photos_by_month.items():
print(f" - {month}: {len(photos)} photos")
# Step 4: Select best photos from each month (categories detected AFTER selection for speed)
processing_jobs[job_id]['progress'] = 60
processing_jobs[job_id]['message'] = 'Selecting best photos per month...'
def progress_callback(msg):
processing_jobs[job_id]['message'] = msg
selection_results = selector.select_all_months(photos_by_month, embeddings, progress_callback)
selected_photos = selection_results['selected']
month_stats = selection_results['month_stats']
summary = selection_results['summary']
print(f"\n[Job {job_id}] Selection Results:")
print(f" - Total photos: {summary['total_photos']}")
print(f" - Selected: {summary['total_selected']}")
print(f" - Selection rate: {summary['selection_rate']*100:.1f}%")
# Step 5: Detect categories ONLY for selected photos (much faster than all photos)
processing_jobs[job_id]['progress'] = 75
processing_jobs[job_id]['message'] = 'Detecting categories for selected photos...'
print(f"[Job {job_id}] Detecting categories for {len(selected_photos)} selected photos...")
selected_paths = [p['filepath'] for p in selected_photos]
if selected_paths:
selector._ensure_category_detector()
categories = selector.category_detector.detect_categories_batch(selected_paths)
for photo in selected_photos:
# categories dict is keyed by filename, not filepath
cat, conf = categories.get(photo['filename'], ('unknown', 0.0))
photo['category'] = cat
photo['category_confidence'] = conf
# Update month_stats with category breakdown from selected photos only
for stat in month_stats:
month_name = stat['month']
month_selected = [p for p in selected_photos if p.get('month') == month_name]
cat_breakdown = {}
for p in month_selected:
cat = p.get('category', 'unknown')
cat_breakdown[cat] = cat_breakdown.get(cat, 0) + 1
stat['categories'] = cat_breakdown
# Step 6: Build rejected list (photos not selected)
# Note: rejection_reason is already set by monthly_selector.py
selected_filenames = {p['filename'] for p in selected_photos}
rejected_photos = []
for month, photos in photos_by_month.items():
for photo in photos:
if photo['filename'] not in selected_filenames:
# Keep existing rejection_reason from monthly_selector, or set default
if not photo.get('rejection_reason'):
photo['rejection_reason'] = 'Not selected for month quota'
rejected_photos.append(photo)
# Create thumbnails directory
thumbs_dir = os.path.join(upload_dir, 'thumbnails')
os.makedirs(thumbs_dir, exist_ok=True)
# Calculate total thumbnails to create
total_thumbnails = len(selected_photos) + len(rejected_photos)
thumbnails_created = 0
processing_jobs[job_id]['progress'] = 85
processing_jobs[job_id]['message'] = f'Creating thumbnails: 0/{total_thumbnails}'
# Build final results structure
results = {
'selected': [],
'rejected': [],
'summary': {
'total_photos': summary['total_photos'],
'selected_count': summary['total_selected'],
'rejected_count': len(rejected_photos),
'selection_rate': summary['selection_rate'],
'face_filtering': {
'total_photos': processing_jobs[job_id].get('total_uploaded', len(confirmed_photos)),
'after_face_filter': len(confirmed_photos),
'user_confirmed': len(confirmed_photos)
},
'total_processed': len(confirmed_photos)
},
'month_stats': month_stats,
'rejection_breakdown': {}
}
# Count rejection reasons
rejection_counts = defaultdict(int)
# Compute cluster stats for display on photo cards (per-month)
# Cluster IDs are assigned per-month, so we need to track (month, cluster_id) pairs
# Count total photos per (month, cluster_id)
cluster_total_counts = defaultdict(int)
for month, photos in photos_by_month.items():
for photo in photos:
cid = photo.get('cluster_id', -1)
if cid != -1:
cluster_total_counts[(month, cid)] += 1
# Count selected photos per (month, cluster_id)
cluster_selected_counts = defaultdict(int)
for photo in selected_photos:
month = photo.get('month', 'Unknown')
cid = photo.get('cluster_id', -1)
if cid != -1:
cluster_selected_counts[(month, cid)] += 1
# Process selected photos
for photo in selected_photos:
filename = photo['filename']
thumb_name = get_thumbnail_name(filename)
thumb_path = os.path.join(thumbs_dir, thumb_name)
create_thumbnail(os.path.join(upload_dir, filename), thumb_path)
# Update thumbnail counter
thumbnails_created += 1
if thumbnails_created % 10 == 0 or thumbnails_created == total_thumbnails:
processing_jobs[job_id]['message'] = f'Creating thumbnails: {thumbnails_created}/{total_thumbnails}'
# Get embedding for this photo (convert to list for JSON serialization)
photo_embedding = embeddings.get(filename)
embedding_list = photo_embedding.tolist() if photo_embedding is not None else None
# Get cluster stats for this photo (per-month)
cid = photo.get('cluster_id', -1)
month = photo.get('month', 'Unknown')
cluster_total = cluster_total_counts.get((month, cid), 0) if cid != -1 else 0
cluster_selected = cluster_selected_counts.get((month, cid), 0) if cid != -1 else 0
results['selected'].append({
'filename': filename,
'thumbnail': thumb_name,
'score': float(photo.get('total', 0)),
'face_quality': float(photo.get('face_quality', 0)),
'aesthetic_quality': float(photo.get('aesthetic_quality', 0)),
'emotional_signal': float(photo.get('emotional_signal', 0)),
'uniqueness': float(photo.get('uniqueness', 0)),
'bucket': photo.get('month', 'unknown'),
'month': month,
'category': photo.get('category', 'unknown'),
'num_faces': int(photo.get('num_faces', 0)),
'cluster_id': cid,
'original_cluster_id': photo.get('original_cluster_id', cid),
'cluster_total': cluster_total,
'cluster_selected': cluster_selected,
'event_id': photo.get('event_id', -1),
'max_similarity': float(photo.get('max_similarity', 0)),
'embedding': embedding_list,
'selection_reason': f"Best in {photo.get('category', 'category')} for {month}",
'selection_detail': f"Selected from {month} - Category: {photo.get('category', 'unknown')}"
})
# Process rejected photos
for photo in rejected_photos:
filename = photo['filename']
thumb_name = get_thumbnail_name(filename)
thumb_path = os.path.join(thumbs_dir, thumb_name)
create_thumbnail(os.path.join(upload_dir, filename), thumb_path)
# Update thumbnail counter
thumbnails_created += 1
if thumbnails_created % 10 == 0 or thumbnails_created == total_thumbnails:
processing_jobs[job_id]['message'] = f'Creating thumbnails: {thumbnails_created}/{total_thumbnails}'
# Use actual rejection reason from monthly_selector
rejection_reason = photo.get('rejection_reason', 'Better photos selected')
# Categorize rejection reasons for breakdown chart
if 'Event' in rejection_reason:
breakdown_category = "Same event"
elif 'Cluster' in rejection_reason:
breakdown_category = "Same cluster"
elif 'similar' in rejection_reason.lower():
breakdown_category = "Too similar"
elif 'Target' in rejection_reason:
breakdown_category = "Target reached"
else:
breakdown_category = "Other"
rejection_counts[breakdown_category] += 1
# Get embedding for this photo (convert to list for JSON serialization)
photo_embedding = embeddings.get(filename)
embedding_list = photo_embedding.tolist() if photo_embedding is not None else None
# Get cluster stats for this photo (per-month)
cid = photo.get('cluster_id', -1)
month = photo.get('month', 'Unknown')
cluster_total = cluster_total_counts.get((month, cid), 0) if cid != -1 else 0
cluster_selected = cluster_selected_counts.get((month, cid), 0) if cid != -1 else 0
results['rejected'].append({
'filename': filename,
'thumbnail': thumb_name,
'score': float(photo.get('total', 0)),
'face_quality': float(photo.get('face_quality', 0)),
'aesthetic_quality': float(photo.get('aesthetic_quality', 0)),
'bucket': photo.get('month', 'unknown'),
'month': month,
'category': photo.get('category', 'unknown'),
'cluster_id': cid,
'original_cluster_id': photo.get('original_cluster_id', cid),
'cluster_total': cluster_total,
'cluster_selected': cluster_selected,
'event_id': photo.get('event_id', -1),
'max_similarity': float(photo.get('max_similarity', 0)),
'embedding': embedding_list,
'rejection_reason': rejection_reason,
'reason': rejection_reason,
'reason_detail': f"Category: {photo.get('category', 'unknown')}"
})
results['rejection_breakdown'] = dict(rejection_counts)
# Add face filtering count to breakdown (photos where target face was not detected)
face_filter_data = results['summary'].get('face_filtering', {})
total_uploaded = face_filter_data.get('total_photos', 0)
after_face_filter = face_filter_data.get('after_face_filter', 0)
face_filtered_out = total_uploaded - after_face_filter
if face_filtered_out > 0:
results['rejection_breakdown']['Face not detected'] = face_filtered_out
# Sort by score
results['selected'].sort(key=lambda x: x['score'], reverse=True)
results['rejected'].sort(key=lambda x: x['score'], reverse=True)
# Save results
results_file = os.path.join(RESULTS_FOLDER, f"{job_id}.json")
with open(results_file, 'w') as f:
json.dump(results, f, indent=2, default=str)
processing_jobs[job_id]['status'] = 'complete'
processing_jobs[job_id]['progress'] = 100
processing_jobs[job_id]['message'] = 'Selection complete!'
processing_jobs[job_id]['results'] = results
print(f"\n[Job {job_id}] PHASE 2 COMPLETE!")
print(f" - Final selection: {len(results['selected'])} photos")
print(f" - Filtered out: {len(results['rejected'])} photos")
print(f" - Results saved to: {results_file}")
print(f"\n=== Month Distribution ===")
for stat in month_stats:
print(f" {stat['month']}: {stat['selected']}/{stat['total_photos']} ({stat['category_summary']})")
print(f"{'='*60}\n")
# Auto-save disabled - uncomment below to re-enable
# output_folder = save_photos_by_month(job_id, upload_dir, selected_photos, rejected_photos, month_stats)
# if output_folder:
# processing_jobs[job_id]['output_folder'] = output_folder
# print(f"[Job {job_id}] Photos auto-saved to: {output_folder}")
except Exception as e:
print(f"[Job {job_id}] EXCEPTION: {str(e)}")
processing_jobs[job_id]['status'] = 'error'
processing_jobs[job_id]['message'] = str(e)
import traceback
traceback.print_exc()
def process_photos_automatic(job_id, upload_dir, quality_mode, similarity_threshold, session_id=None):
    """
    Run the complete selection pipeline on an upload folder, with no review step.

    Used when no reference photos are loaded. Every photo in ``upload_dir`` is
    embedded (SigLIP), bucketed by month, clustered, scored and handed to
    ``SmartPhotoSelector``. Progress and the final payload are published via
    ``processing_jobs[job_id]``; the payload is also written to
    ``RESULTS_FOLDER/<job_id>.json``.

    Args:
        job_id: Key of the job entry in ``processing_jobs``.
        upload_dir: Folder holding the uploaded photos; thumbnails are written
            to its ``thumbnails`` subfolder.
        quality_mode: Passed through to ``SmartPhotoSelector``.
        similarity_threshold: Passed through to ``SmartPhotoSelector``.
        session_id: Accepted for call compatibility; not used by this function.

    Returns:
        None. Any exception is caught, its traceback printed, and the job is
        marked ``status='error'`` with the exception text as its message.
    """
    def _set_stage(progress, message):
        # Look the job up on every call so updates always hit the live entry.
        processing_jobs[job_id]['progress'] = progress
        processing_jobs[job_id]['message'] = message

    try:
        processing_jobs[job_id]['status'] = 'processing'
        _set_stage(5, 'Loading AI models...')

        # Pipeline components are imported lazily, only when a job actually runs.
        from photo_selector.siglip_embeddings import SigLIPEmbedder
        from photo_selector.temporal import TemporalSegmenter
        from photo_selector.clustering import PhotoClusterer, BucketClusterManager
        from photo_selector.scoring import PhotoScorer, ClusterScorer
        from photo_selector.auto_selector import SmartPhotoSelector, SelectionReason

        # Step 1: Embeddings (SigLIP for better visual understanding)
        _set_stage(20, 'Analyzing photos with SigLIP AI...')
        embedder = SigLIPEmbedder()
        embeddings = embedder.process_folder(upload_dir)

        # Step 2: Temporal segmentation into monthly buckets
        _set_stage(40, 'Organizing by date...')
        segmenter = TemporalSegmenter(bucket_type="monthly")
        buckets = segmenter.segment_folder(upload_dir)
        # Clustering only needs a rough target; the auto-selector decides the
        # real count later.
        estimated_target = max(10, len(embeddings) // 3)
        targets = segmenter.calculate_target_per_bucket(buckets, estimated_target)

        # Step 3: Clustering (HDBSCAN with timestamp-weighted features, 24h gap splitting)
        # min_cluster_size=5 reduces single-photo clusters by requiring at least 5 similar photos
        _set_stage(50, 'Grouping similar photos (adaptive clustering)...')
        clusterer = BucketClusterManager(
            PhotoClusterer(min_cluster_size=5, temporal_gap_hours=24.0, timestamp_weight=0.3)
        )
        cluster_results = clusterer.cluster_all_buckets(buckets, embeddings, targets)

        # Step 4: Score ALL photos, one cluster at a time
        _set_stage(60, 'Scoring photo quality...')
        scorer = ClusterScorer(PhotoScorer())
        all_scores = {}
        for bucket_key, bucket_data in cluster_results.items():
            filenames = bucket_data['filenames']
            labels = np.array(bucket_data['labels'])
            bucket_embeddings = np.array([embeddings[fn] for fn in filenames])
            for cluster_id in np.unique(labels):
                cluster_mask = labels == cluster_id
                member_indices = np.where(cluster_mask)[0]
                cluster_paths = [
                    os.path.join(upload_dir, filenames[i]) for i in member_indices
                ]
                cluster_embs = bucket_embeddings[cluster_mask]
                for score in scorer.score_cluster(cluster_paths, cluster_embs):
                    score['bucket'] = bucket_key
                    score['cluster'] = int(cluster_id)
                    score['cluster_key'] = f"{bucket_key}_cluster_{cluster_id}"
                    all_scores[score['filename']] = score

        # Step 5: AUTOMATIC SELECTION
        _set_stage(75, 'AI deciding which photos to keep...')
        auto_selector = SmartPhotoSelector(
            quality_mode=quality_mode,
            similarity_threshold=similarity_threshold
        )
        selection_results = auto_selector.process_all_photos(
            all_scores, embeddings, cluster_results
        )

        _set_stage(90, 'Preparing results...')
        thumbs_dir = os.path.join(upload_dir, 'thumbnails')
        os.makedirs(thumbs_dir, exist_ok=True)

        def _make_thumbnail(filename):
            # Write the thumbnail next to the uploads and return its file name.
            thumb_name = get_thumbnail_name(filename)
            create_thumbnail(
                os.path.join(upload_dir, filename),
                os.path.join(thumbs_dir, thumb_name)
            )
            return thumb_name

        def _reason_text(reason, fallback):
            # Reasons may arrive as SelectionReason members or as plain values.
            if isinstance(reason, SelectionReason):
                return reason.value
            return str(reason) if reason else fallback

        results = {
            'selected': [],
            'rejected': [],
            'summary': selection_results['summary'],
            'rejection_breakdown': selection_results['rejection_breakdown'],
            'bucket_stats': selection_results['bucket_stats']
        }

        # Process selected photos
        for photo in selection_results['selected']:
            filename = photo['filename']
            thumb_name = _make_thumbnail(filename)
            reason_text = _reason_text(photo.get('selection_reason', None), 'High quality photo')
            results['selected'].append({
                'filename': filename,
                'thumbnail': thumb_name,
                'score': float(photo.get('total', 0)),
                'face_quality': float(photo.get('face_quality', 0)),
                'aesthetic_quality': float(photo.get('aesthetic_quality', 0)),
                'emotional_signal': float(photo.get('emotional_signal', 0)),
                'uniqueness': float(photo.get('uniqueness', 0)),
                'bucket': photo.get('bucket', 'unknown'),
                'num_faces': int(photo.get('num_faces', 0)),
                'selection_reason': reason_text,
                'selection_detail': photo.get('selection_detail', reason_text)
            })

        # Process rejected photos
        for photo in selection_results['rejected']:
            filename = photo['filename']
            thumb_name = _make_thumbnail(filename)
            reason_text = _reason_text(
                photo.get('rejection_reason', None), 'Did not meet quality threshold'
            )
            results['rejected'].append({
                'filename': filename,
                'thumbnail': thumb_name,
                'score': float(photo.get('total', 0)),
                'face_quality': float(photo.get('face_quality', 0)),
                'aesthetic_quality': float(photo.get('aesthetic_quality', 0)),
                'bucket': photo.get('bucket', 'unknown'),
                'reason': reason_text,
                'reason_detail': photo.get('rejection_detail', '')
            })

        # Highest score first in both lists
        for list_key in ('selected', 'rejected'):
            results[list_key].sort(key=lambda entry: entry['score'], reverse=True)

        # Save results
        results_file = os.path.join(RESULTS_FOLDER, f"{job_id}.json")
        with open(results_file, 'w') as f:
            json.dump(results, f, indent=2, default=str)

        # Attach the payload BEFORE flipping the status: a client polling from
        # another thread must never observe 'complete' with results still None.
        processing_jobs[job_id]['results'] = results
        _set_stage(100, 'Selection complete!')
        processing_jobs[job_id]['status'] = 'complete'
    except Exception as e:
        processing_jobs[job_id]['status'] = 'error'
        processing_jobs[job_id]['message'] = str(e)
        import traceback
        traceback.print_exc()
@app.route('/')
def index():
    """Serve the landing page by rendering the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page
@app.route('/preload_model')
def preload_model():
    """
    Warm up the face model by constructing a throwaway ``FaceMatcher``.

    Returns:
        JSON ``{'success': bool, 'message': str}``. Every failure, including a
        failure to import the face-matching module, is reported through this
        payload rather than as an unhandled server error.
    """
    try:
        # Import inside the try block so a missing/broken dependency is
        # reported as JSON like every other failure of this endpoint.
        from photo_selector.face_matcher import FaceMatcher

        # Create a temporary matcher to trigger model download/load
        temp_matcher = FaceMatcher(similarity_threshold=0.5)
        if temp_matcher.is_initialized:
            return jsonify({'success': True, 'message': 'Model loaded'})
        return jsonify({'success': False, 'message': 'Model failed to initialize'})
    except Exception as e:
        return jsonify({'success': False, 'message': str(e)})
@app.route('/step1')
def step1_reference():
    """Step 1 page: upload reference photos of the target person."""
    # First visit: mint a short session id and store it in the Flask session.
    if 'session_id' not in session:
        fresh_uuid = str(uuid.uuid4())
        session['session_id'] = fresh_uuid[:8]
    current_id = session['session_id']
    return render_template('step1_reference.html', session_id=current_id)
@app.route('/step2')
def step2_upload():
    """Step 2 page: upload all event photos."""
    session_id = session.get('session_id')
    # Without a session there is nothing to attach uploads to; show the landing page.
    if not session_id:
        return render_template('index.html')

    # Number of reference faces already loaded for this session (0 if none).
    if session_id in face_matchers:
        ref_count = face_matchers[session_id].get_reference_count()
    else:
        ref_count = 0

    return render_template(
        'step2_upload.html',
        session_id=session_id,
        reference_count=ref_count,
    )
@app.route('/upload_reference', methods=['POST'])
def upload_reference():
    """
    Handle reference photo uploads (2-3 photos of the target person).

    Each accepted file is saved under ``REFERENCE_FOLDER/<session_id>``, added
    to the session's ``FaceMatcher`` and given a 150x150 preview thumbnail.

    Returns:
        JSON with the session id, one result entry per accepted file, and the
        total number of reference faces now loaded; or a JSON error with
        status 400 when the request carries no files.
    """
    from photo_selector.face_matcher import FaceMatcher

    if 'files' not in request.files:
        return jsonify({'error': 'No files provided'}), 400
    files = request.files.getlist('files')
    if not files or files[0].filename == '':
        return jsonify({'error': 'No files selected'}), 400

    # Get or create session ID
    session_id = session.get('session_id')
    if not session_id:
        session_id = str(uuid.uuid4())[:8]
        session['session_id'] = session_id

    # Create reference directory for this session
    ref_dir = os.path.join(REFERENCE_FOLDER, session_id)
    os.makedirs(ref_dir, exist_ok=True)

    # Initialize face matcher for this session if not exists
    if session_id not in face_matchers:
        face_matchers[session_id] = FaceMatcher(similarity_threshold=0.5)
    matcher = face_matchers[session_id]

    # Process each reference photo
    results = []
    for file in files:
        if not (file and allowed_file(file.filename)):
            continue

        filename = secure_filename(file.filename)
        if not filename or not allowed_file(filename):
            # secure_filename() keeps ASCII only, so a name such as a purely
            # non-ASCII one can come back empty or without its extension.
            # Fall back to a generated name that keeps the original extension.
            _, dot, ext = file.filename.rpartition('.')
            suffix = f".{ext}" if dot else ''
            filename = secure_filename(f"reference_{uuid.uuid4().hex[:8]}{suffix}")

        filepath = os.path.join(ref_dir, filename)
        file.save(filepath)

        # Add to face matcher
        result = matcher.add_reference_photo(filepath)
        result['filename'] = filename

        # Create thumbnail for preview
        thumb_name = get_thumbnail_name(filename)
        thumb_path = os.path.join(ref_dir, thumb_name)
        create_thumbnail(filepath, thumb_path, size=(150, 150))
        result['thumbnail'] = thumb_name
        results.append(result)

    return jsonify({
        'session_id': session_id,
        'results': results,
        'total_references': matcher.get_reference_count(),
        'message': f'Loaded {matcher.get_reference_count()} reference face(s)'
    })
@app.route('/reference_status')
def reference_status():
    """Report how many reference faces the current session has loaded."""
    session_id = session.get('session_id')
    has_matcher = bool(session_id) and session_id in face_matchers

    if has_matcher:
        matcher = face_matchers[session_id]
        status = {
            'session_id': session_id,
            'reference_count': matcher.get_reference_count(),
            # At least one reference face is needed before filtering can run.
            'ready': matcher.get_reference_count() >= 1
        }
    else:
        status = {
            'session_id': session_id,
            'reference_count': 0,
            'ready': False
        }
    return jsonify(status)
@app.route('/clear_references', methods=['POST'])
def clear_references():
    """
    Clear all reference photos for the current session.

    Resets the session's in-memory matcher (if one exists) and removes the
    session's folder under ``REFERENCE_FOLDER``. A request without a session
    is a no-op that still reports zero references.
    """
    session_id = session.get('session_id')
    if session_id:
        if session_id in face_matchers:
            face_matchers[session_id].clear_references()

        # Delete reference files. Only build the path when a session id
        # exists: os.path.join() raises TypeError on None. Files are removed
        # even if no matcher is held in memory for this session.
        ref_dir = os.path.join(REFERENCE_FOLDER, session_id)
        if os.path.exists(ref_dir):
            shutil.rmtree(ref_dir)

    return jsonify({'message': 'References cleared', 'reference_count': 0})
@app.route('/reference_thumbnail/<filename>')
def get_reference_thumbnail(filename):
    """Serve a file from the current session's reference folder."""
    session_id = session.get('session_id')
    if session_id:
        session_ref_dir = os.path.join(REFERENCE_FOLDER, session_id)
        return send_from_directory(session_ref_dir, filename)
    # No session cookie means there is no reference folder to serve from.
    return jsonify({'error': 'No session'}), 404
# ============== CHUNKED UPLOAD ENDPOINTS ==============
# These endpoints allow uploading large batches of photos in smaller chunks
# to avoid 413 (Request Entity Too Large) errors on Hugging Face Spaces
@app.route('/upload_init', methods=['POST'])
def upload_init():
    """
    Initialize a chunked upload session.

    Expects a JSON object with optional ``total_files``, ``quality_mode`` and
    ``similarity_threshold``. Creates a fresh upload folder and registers the
    session in ``upload_sessions``.

    Returns:
        JSON with the new upload ``session_id``; or a JSON error with status
        400 when the request body is not a JSON object.
    """
    # silent=True turns a missing/invalid JSON body into None instead of an
    # exception, so the failure can be reported in this API's JSON format.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Invalid JSON body'}), 400

    total_files = data.get('total_files', 0)
    quality_mode = data.get('quality_mode', 'balanced')
    similarity_threshold = data.get('similarity_threshold', 0.92)

    # Create a unique session ID for this upload
    upload_session_id = str(uuid.uuid4())[:8]
    upload_dir = os.path.join(UPLOAD_FOLDER, upload_session_id)
    os.makedirs(upload_dir, exist_ok=True)

    # Get face matcher session
    face_session_id = session.get('session_id')

    # Store session info
    upload_sessions[upload_session_id] = {
        'upload_dir': upload_dir,
        'total_files': total_files,
        'uploaded_files': [],
        'quality_mode': quality_mode,
        'similarity_threshold': similarity_threshold,
        'face_session_id': face_session_id,
        'created_at': time.time()
    }
    print(f"\n[Upload Session {upload_session_id}] Initialized for {total_files} files")

    return jsonify({
        'session_id': upload_session_id,
        'message': 'Upload session initialized'
    })
@app.route('/upload_chunk', methods=['POST'])
def upload_chunk():
    """
    Handle one chunk of files in a chunked upload.

    Form fields: ``session_id`` (from ``/upload_init``) and an optional
    ``chunk_index`` used only for logging. Accepted files are saved into the
    session's upload folder under a sanitized, de-duplicated name.

    Returns:
        JSON with the number of files saved from this chunk and the running
        total; or a JSON error with status 400 for a missing file part or an
        unknown upload session.
    """
    if 'files' not in request.files:
        return jsonify({'error': 'No files provided'}), 400
    session_id = request.form.get('session_id')
    if not session_id or session_id not in upload_sessions:
        return jsonify({'error': 'Invalid upload session'}), 400

    upload_info = upload_sessions[session_id]
    upload_dir = upload_info['upload_dir']
    files = request.files.getlist('files')

    saved_count = 0
    for file in files:
        if not (file and allowed_file(file.filename)):
            continue

        filename = secure_filename(file.filename)
        if not filename or not allowed_file(filename):
            # secure_filename() keeps ASCII only, so e.g. a non-ASCII photo
            # name can come back empty or stripped of its extension. Generate
            # a name that keeps the original extension instead.
            _, dot, orig_ext = file.filename.rpartition('.')
            suffix = f".{orig_ext}" if dot else ''
            filename = secure_filename(f"photo_{uuid.uuid4().hex[:8]}{suffix}")

        # Handle duplicate filenames
        base, ext = os.path.splitext(filename)
        counter = 1
        while os.path.exists(os.path.join(upload_dir, filename)):
            filename = f"{base}_{counter}{ext}"
            counter += 1

        file.save(os.path.join(upload_dir, filename))
        upload_info['uploaded_files'].append(filename)
        saved_count += 1

    chunk_index = request.form.get('chunk_index', '?')
    print(f"[Upload Session {session_id}] Chunk {chunk_index}: saved {saved_count} files (total: {len(upload_info['uploaded_files'])})")

    return jsonify({
        'success': True,
        'saved': saved_count,
        'total_uploaded': len(upload_info['uploaded_files'])
    })
@app.route('/upload_complete', methods=['POST'])
def upload_complete():
    """
    Complete a chunked upload and start processing in a background thread.

    Expects a JSON object with the upload ``session_id``. The upload session
    is turned into a job in ``processing_jobs`` (reusing the same id) and is
    processed with face filtering when the user's session has reference
    photos loaded, otherwise with quality selection only.

    Returns:
        JSON with ``job_id``, a status ``message`` and ``total_files``; or a
        JSON error with status 400 for an unknown session or an upload that
        contains no valid image files.
    """
    # silent=True: a missing/invalid JSON body yields None rather than raising.
    data = request.get_json(silent=True)
    session_id = data.get('session_id') if isinstance(data, dict) else None
    if not isinstance(session_id, str) or not session_id or session_id not in upload_sessions:
        return jsonify({'error': 'Invalid upload session'}), 400

    upload_info = upload_sessions[session_id]
    upload_dir = upload_info['upload_dir']
    saved_files = upload_info['uploaded_files']
    quality_mode = upload_info['quality_mode']
    similarity_threshold = upload_info['similarity_threshold']
    face_session_id = upload_info['face_session_id']

    if not saved_files:
        # Best-effort cleanup: the folder may already be gone.
        shutil.rmtree(upload_dir, ignore_errors=True)
        upload_sessions.pop(session_id, None)
        return jsonify({'error': 'No valid image files uploaded'}), 400

    # Check if we have reference photos loaded
    has_references = False
    ref_count = 0
    if face_session_id and face_session_id in face_matchers:
        ref_count = face_matchers[face_session_id].get_reference_count()
        has_references = ref_count > 0

    # Create job (use same session_id as job_id for simplicity)
    job_id = session_id

    # Initialize job
    processing_jobs[job_id] = {
        'status': 'queued',
        'progress': 30,  # Start at 30% since upload is done
        'message': 'Starting AI processing...',
        'total_files': len(saved_files),
        'total_uploaded': len(saved_files),
        'upload_dir': upload_dir,
        'session_id': face_session_id,
        'has_reference_photos': has_references,
        'reference_count': ref_count,
        'quality_mode': quality_mode,
        'similarity_threshold': similarity_threshold,
        'results': None
    }

    # Clean up upload session
    upload_sessions.pop(session_id, None)

    # Decide which processing mode to use
    if has_references:
        print(f"\n[Job {job_id}] NEW JOB (Chunked Upload) - Face Filtering Mode")
        print(f" - Files uploaded: {len(saved_files)}")
        print(f" - Reference photos: {ref_count}")
        thread = threading.Thread(
            target=process_photos_face_filter_only,
            args=(job_id, upload_dir, face_session_id)
        )
        message = f'Scanning {len(saved_files)} photos to find your child using {ref_count} reference(s)...'
    else:
        print(f"\n[Job {job_id}] NEW JOB (Chunked Upload) - No Face Filtering")
        print(f" - Files uploaded: {len(saved_files)}")
        thread = threading.Thread(
            target=process_photos_quality_selection,
            args=(job_id, upload_dir, quality_mode, similarity_threshold)
        )
        message = f'Selecting best photos from {len(saved_files)} images...'

    # Publish the initial message BEFORE starting the worker; writing it
    # afterwards could overwrite a progress message the worker already set.
    processing_jobs[job_id]['message'] = message
    thread.daemon = True
    thread.start()

    return jsonify({
        'job_id': job_id,
        'message': message,
        'total_files': len(saved_files)
    })
# ============== END CHUNKED UPLOAD ENDPOINTS ==============
# ============== GOOGLE DRIVE IMPORT ENDPOINTS ==============
# Import Google Drive module
# Google Drive support is optional: the flag stays False unless the helper
# module imports cleanly and reports that it is usable.
GDRIVE_SERVICE_ACCOUNT_AVAILABLE = False
try:
    from google_drive import (
        is_drive_available,
        extract_folder_id,
        list_images_in_folder,
        download_folder,
        get_folder_info,
        get_drive_service,
    )
    GDRIVE_SERVICE_ACCOUNT_AVAILABLE = is_drive_available()
except ImportError:
    # Module (or one of its dependencies) is missing; keep the feature off.
    pass
@app.route('/check_drive_status')
def check_drive_status():
    """Report whether the Google Drive Service Account integration is usable."""
    if GDRIVE_SERVICE_ACCOUNT_AVAILABLE:
        status_text = 'Service Account configured'
    else:
        status_text = 'Service Account not configured'
    return jsonify({
        'available': GDRIVE_SERVICE_ACCOUNT_AVAILABLE,
        'message': status_text
    })
@app.route('/preview_drive_folder', methods=['POST'])
def preview_drive_folder():
    """
    Preview the contents of a Google Drive folder before importing.

    Expects a JSON object with ``folder_url``.

    Returns:
        JSON with the folder id, folder name, image count and up to five
        preview images; or a JSON error with status 400 when Drive is not
        configured, the URL is missing/invalid, or the folder cannot be read.
    """
    if not GDRIVE_SERVICE_ACCOUNT_AVAILABLE:
        return jsonify({'error': 'Google Drive Service Account not configured'}), 400

    # Tolerate a missing, malformed or non-object JSON body: it is treated
    # as "no folder URL given" instead of raising AttributeError.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    folder_url = str(data.get('folder_url') or '').strip()
    if not folder_url:
        return jsonify({'error': 'Please provide a folder URL'}), 400

    try:
        folder_id = extract_folder_id(folder_url)
        info = get_folder_info(folder_id)
        if not info.get('success'):
            return jsonify({'error': info.get('error', 'Could not access folder')}), 400
        return jsonify({
            'success': True,
            'folder_id': folder_id,
            'folder_name': info.get('folder_name', 'Unknown'),
            'image_count': info.get('image_count', 0),
            'preview_images': info.get('images', [])[:5]
        })
    except ValueError as e:
        # Handled separately so the response carries the exception text as-is.
        return jsonify({'error': str(e)}), 400
    except Exception as e:
        print(f"[Drive] Error previewing folder: {e}")
        return jsonify({'error': f'Could not access folder: {str(e)}'}), 400
@app.route('/import_from_drive', methods=['POST'])
def import_from_drive():
    """
    Import photos from a Google Drive folder (Step 2 - initial upload).

    Expects a JSON object with ``folder_url`` and optional ``quality_mode``
    and ``similarity_threshold``. Creates a job in ``processing_jobs`` and
    runs the download plus processing in a background thread:

    * with reference photos loaded: parallel download + face detection;
    * otherwise: download everything, then run quality selection.

    Returns:
        JSON with the new ``job_id`` and a status message; or a JSON error
        with status 400 when Drive is not configured or the input is invalid.
    """
    if not GDRIVE_SERVICE_ACCOUNT_AVAILABLE:
        return jsonify({'error': 'Google Drive Service Account not configured'}), 400

    # Tolerate a missing, malformed or non-object JSON body.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    folder_url = str(data.get('folder_url') or '').strip()
    quality_mode = data.get('quality_mode', 'balanced')
    try:
        similarity_threshold = float(data.get('similarity_threshold', 0.4))
    except (TypeError, ValueError):
        # Report bad client input as a 400 rather than an unhandled error.
        return jsonify({'error': 'similarity_threshold must be a number'}), 400
    if not folder_url:
        return jsonify({'error': 'Please provide a folder URL'}), 400

    # Get face session (step 1 stores it as 'session_id')
    face_session_id = session.get('session_id')
    has_references = False
    ref_count = 0
    if face_session_id and face_session_id in face_matchers:
        ref_count = face_matchers[face_session_id].get_reference_count()
        has_references = ref_count > 0

    try:
        folder_id = extract_folder_id(folder_url)
    except ValueError as e:
        return jsonify({'error': str(e)}), 400

    # Create job
    job_id = str(uuid.uuid4())[:8]
    upload_dir = os.path.join(UPLOAD_FOLDER, job_id)
    os.makedirs(upload_dir, exist_ok=True)
    os.makedirs(os.path.join(upload_dir, 'thumbnails'), exist_ok=True)

    # Initialize job
    processing_jobs[job_id] = {
        'status': 'downloading',
        'progress': 5,
        'message': 'Connecting to Google Drive...',
        'total_files': 0,
        'total_uploaded': 0,
        'upload_dir': upload_dir,
        'session_id': face_session_id,
        'has_reference_photos': has_references,
        'reference_count': ref_count,
        'quality_mode': quality_mode,
        'similarity_threshold': similarity_threshold,
        'results': None
    }

    # Start download in background thread
    def download_and_process():
        try:
            # HYBRID MODE: If we have face references, use parallel download + face detection
            if has_references:
                face_matcher = face_matchers.get(face_session_id)
                if face_matcher and face_matcher.get_reference_count() > 0:
                    print(f"[Job {job_id}] Using HYBRID MODE: Parallel download + face detection")
                    process_drive_with_parallel_face_detection(job_id, folder_id, upload_dir, face_matcher)
                    return

            # SEQUENTIAL MODE: Download all first, then process (for auto mode without face filtering)
            def progress_callback(current, total, _filename):
                # Guard against total == 0 so a progress report for an empty
                # folder cannot raise ZeroDivisionError.
                fraction = (current / total) if total else 0
                pct = int(5 + fraction * 25)  # 5% to 30%
                processing_jobs[job_id]['progress'] = pct
                processing_jobs[job_id]['message'] = f'Downloading from Drive: {current}/{total}'
                processing_jobs[job_id]['total_files'] = total
                processing_jobs[job_id]['total_uploaded'] = current

            print(f"[Job {job_id}] Starting Google Drive download from folder {folder_id}")
            result = download_folder(folder_id, upload_dir, progress_callback)
            if not result.get('success') and result.get('downloaded', 0) == 0:
                processing_jobs[job_id]['status'] = 'error'
                processing_jobs[job_id]['message'] = result.get('message', 'Download failed')
                return

            # Skipped files are counted together with downloaded ones.
            downloaded_count = result.get('downloaded', 0) + result.get('skipped', 0)
            downloaded_files = result.get('files', [])
            processing_jobs[job_id]['total_uploaded'] = downloaded_count
            processing_jobs[job_id]['total_files'] = downloaded_count
            print(f"[Job {job_id}] Downloaded {downloaded_count} photos from Google Drive")

            # No face filtering - use all downloaded photos (auto mode)
            processing_jobs[job_id]['message'] = f'Selecting best from {downloaded_count} photos...'
            process_photos_quality_selection(job_id, upload_dir, quality_mode, similarity_threshold, downloaded_files)
        except Exception as e:
            print(f"[Job {job_id}] Drive import error: {e}")
            import traceback
            traceback.print_exc()
            processing_jobs[job_id]['status'] = 'error'
            processing_jobs[job_id]['message'] = f'Import failed: {str(e)}'

    thread = threading.Thread(target=download_and_process)
    thread.daemon = True
    thread.start()

    return jsonify({
        'job_id': job_id,
        'message': 'Starting Google Drive import...'
    })
@app.route('/import_from_drive_reupload/<dataset_name>', methods=['POST'])
def import_from_drive_reupload(dataset_name):
    """Import photos from Google Drive folder for reupload (after server restart).

    Validates the request, creates a fresh job directory and starts a
    background thread that:

    1. downloads the Drive folder into the job's upload directory,
    2. loads the saved dataset ``dataset_name`` from Supabase,
    3. restores the reference-face matcher from the saved embeddings,
    4. pairs the downloaded files with the saved face-filter results, and
    5. writes the review JSON and marks the job ``review_pending``.

    The request body is expected to be JSON with a ``folder_url`` key.

    Returns:
        JSON ``{'job_id', 'message'}`` once the thread has been started, or a
        JSON error with status 400 when Drive access is not configured or
        the folder URL is missing or rejected by ``extract_folder_id``.
    """
    if not GDRIVE_SERVICE_ACCOUNT_AVAILABLE:
        return jsonify({'error': 'Google Drive Service Account not configured'}), 400

    # A missing, malformed or non-object JSON body must end in the regular
    # 400 below rather than an AttributeError on None.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    folder_url = str(data.get('folder_url') or '').strip()
    if not folder_url:
        return jsonify({'error': 'Please provide a folder URL'}), 400

    try:
        folder_id = extract_folder_id(folder_url)
    except ValueError as e:
        return jsonify({'error': str(e)}), 400

    # Create job
    job_id = str(uuid.uuid4())[:8]
    upload_dir = os.path.join(UPLOAD_FOLDER, job_id)
    os.makedirs(upload_dir, exist_ok=True)
    os.makedirs(os.path.join(upload_dir, 'thumbnails'), exist_ok=True)

    # Initialize job
    processing_jobs[job_id] = {
        'status': 'downloading',
        'progress': 5,
        'message': 'Connecting to Google Drive...'
    }

    # Start download and processing in background
    def download_and_process_reupload():
        try:
            def progress_callback(current, total, filename):
                # Download phase is mapped onto 5% .. 50%; guard against an
                # empty folder reporting total == 0.
                pct = int(5 + (current / total) * 45) if total else 5
                processing_jobs[job_id]['progress'] = pct
                processing_jobs[job_id]['message'] = f'Downloading from Drive: {current}/{total}'

            print(f"[Job {job_id}] Starting Google Drive reupload for dataset '{dataset_name}'")
            result = download_folder(folder_id, upload_dir, progress_callback)

            if not result.get('success') and result.get('downloaded', 0) == 0:
                processing_jobs[job_id]['status'] = 'error'
                processing_jobs[job_id]['message'] = result.get('message', 'Download failed')
                return

            uploaded_filenames = result.get('files', [])
            print(f"[Job {job_id}] Downloaded {len(uploaded_filenames)} photos")

            # Load dataset from Supabase
            processing_jobs[job_id]['message'] = 'Loading saved dataset...'
            processing_jobs[job_id]['progress'] = 55
            supabase_data = load_dataset_from_supabase(dataset_name)
            if not supabase_data:
                processing_jobs[job_id]['status'] = 'error'
                processing_jobs[job_id]['message'] = 'Dataset not found in Supabase'
                return

            metadata = supabase_data.get('metadata', {})
            face_results = supabase_data.get('face_results', {})
            embeddings_data = supabase_data.get('embeddings_data')

            # Load reference embeddings
            new_session_id = str(uuid.uuid4())[:8]
            if embeddings_data:
                import io
                from photo_selector.face_matcher import FaceMatcher
                # NOTE(security): allow_pickle=True will unpickle whatever the
                # stored blob contains. Only acceptable while the Supabase
                # bucket is fully trusted - TODO confirm whether the saved
                # arrays actually require pickling.
                data_np = np.load(io.BytesIO(embeddings_data), allow_pickle=True)
                matcher = FaceMatcher(similarity_threshold=float(data_np['threshold']))
                matcher.reference_embeddings = list(data_np['embeddings'])
                matcher.average_embedding = data_np['average']
                face_matchers[new_session_id] = matcher
                # Note: Can't set session here (background thread) - session_id stored in processing_jobs
                print(f"[Job {job_id}] Loaded {len(matcher.reference_embeddings)} reference embeddings")

            # Match uploaded files with saved face results
            # Google Drive filenames differ from browser upload:
            # 1. Duplicates: IMG_5197(1).JPG vs IMG_51971.JPG
            # 2. Spaces: IMG_6970 Copy.JPG vs IMG_6970_Copy.JPG
            import re

            def normalize_filename(filename):
                """Normalize Google Drive filename to match browser upload format."""
                # Step 1: Convert (N) suffix to N (Google Drive duplicate handling)
                match = re.match(r'^(.+)\((\d+)\)(\.[^.]+)$', filename)
                if match:
                    base, num, ext = match.groups()
                    filename = f"{base}{num}{ext}"
                # Step 2: Apply secure_filename (spaces -> underscores, etc.)
                return secure_filename(filename)

            filtered_photos = face_results.get('filtered_photos', [])
            uploaded_set = set(uploaded_filenames)
            # Create mapping: normalized_name -> actual_uploaded_name
            normalized_to_uploaded = {normalize_filename(f): f for f in uploaded_filenames}

            matched_photos = []
            for p in filtered_photos:
                saved_filename = p.get('filename')
                actual_filename = None
                # Try direct match first
                if saved_filename in uploaded_set:
                    actual_filename = saved_filename
                # Try normalized match (saved name matches normalized uploaded name)
                elif saved_filename in normalized_to_uploaded:
                    actual_filename = normalized_to_uploaded[saved_filename]
                if actual_filename:
                    # Use actual uploaded filename for the photo entry
                    photo_entry = p.copy()
                    photo_entry['filename'] = actual_filename
                    photo_entry['thumbnail'] = get_thumbnail_name(actual_filename)
                    matched_photos.append(photo_entry)

            # Debug: Find unmatched photos
            matched_saved = {
                p.get('filename') for p in filtered_photos
                if p.get('filename') in uploaded_set or p.get('filename') in normalized_to_uploaded
            }
            unmatched_from_saved = [
                p.get('filename') for p in filtered_photos
                if p.get('filename') not in matched_saved
            ]
            matched_uploaded = {m['filename'] for m in matched_photos}
            unmatched_from_uploaded = [f for f in uploaded_filenames if f not in matched_uploaded]

            print(f"[Job {job_id}] Matched {len(matched_photos)} of {len(filtered_photos)} photos")
            print(f"[Job {job_id}] DEBUG: {len(unmatched_from_saved)} saved photos NOT found in uploaded files:")
            for fname in unmatched_from_saved[:20]:  # Show first 20
                print(f" [SAVED NOT IN UPLOAD] '{fname}'")
            if len(unmatched_from_saved) > 20:
                print(f" ... and {len(unmatched_from_saved) - 20} more")
            print(f"[Job {job_id}] DEBUG: {len(unmatched_from_uploaded)} uploaded files NOT found in saved data:")
            for fname in unmatched_from_uploaded[:20]:  # Show first 20
                print(f" [UPLOAD NOT IN SAVED] '{fname}'")
            if len(unmatched_from_uploaded) > 20:
                print(f" ... and {len(unmatched_from_uploaded) - 20} more")

            # Create review data
            review_data = {
                'filtered_photos': matched_photos,
                'total_processed': len(uploaded_filenames),
                'match_count': len(matched_photos)
            }
            with open(os.path.join(RESULTS_FOLDER, f"{job_id}_review.json"), 'w') as f:
                json.dump(review_data, f)

            # Update processing job
            processing_jobs[job_id].update({
                'status': 'review_pending',
                'progress': 100,
                'message': 'Photos downloaded from Google Drive',
                'upload_dir': upload_dir,
                'session_id': new_session_id,
                'has_reference_photos': True,
                'reference_count': metadata.get('reference_count', 0),
                'quality_mode': metadata.get('quality_mode', 'balanced'),
                'similarity_threshold': metadata.get('similarity_threshold', 0.4),
                'confirmed_photos': [p['filename'] for p in matched_photos],
                'review_data': review_data,
                'total_photos': len(matched_photos),
                'from_dataset': dataset_name,
                'from_supabase': True,
                'redirect_url': f'/step3_review/{job_id}'
            })
            print(f"[Job {job_id}] Reupload complete - ready for review")
        except Exception as e:
            # Top-level boundary of the worker thread: record the failure on
            # the job so the status endpoint can report it.
            print(f"[Job {job_id}] Drive reupload error: {e}")
            import traceback
            traceback.print_exc()
            processing_jobs[job_id]['status'] = 'error'
            processing_jobs[job_id]['message'] = f'Import failed: {str(e)}'

    thread = threading.Thread(target=download_and_process_reupload)
    thread.daemon = True
    thread.start()

    return jsonify({
        'job_id': job_id,
        'message': 'Starting Google Drive import...'
    })
# ============== END GOOGLE DRIVE IMPORT ENDPOINTS ==============
@app.route('/upload', methods=['POST'])
def upload_files():
    """Handle file uploads and start processing.

    Saves every allowed file from the multipart ``files`` field into a new
    per-job directory under ``UPLOAD_FOLDER``, registers the job in
    ``processing_jobs`` and starts a worker thread:

    * face-filter-only mode when the current session has reference photos
      loaded (the client is then expected to go to the review page), or
    * full automatic mode otherwise.

    Form fields read: ``quality_mode`` (default ``'balanced'``) and
    ``similarity`` (default ``0.92``).

    Returns:
        JSON describing the started job, or a JSON error with status 400
        when no usable files or an invalid ``similarity`` were submitted.
    """
    if 'files' not in request.files:
        return jsonify({'error': 'No files provided'}), 400
    files = request.files.getlist('files')
    if not files or files[0].filename == '':
        return jsonify({'error': 'No files selected'}), 400

    # Get parameters - now using quality_mode instead of target
    quality_mode = request.form.get('quality_mode', 'balanced')
    try:
        similarity_threshold = float(request.form.get('similarity', 0.92))
    except (TypeError, ValueError):
        # A malformed form value is a client error, not a server crash.
        return jsonify({'error': 'Invalid similarity value'}), 400

    # Get session ID for face matching
    session_id = session.get('session_id')

    # Create job
    job_id = str(uuid.uuid4())[:8]
    upload_dir = os.path.join(UPLOAD_FOLDER, job_id)
    os.makedirs(upload_dir, exist_ok=True)

    # Save files
    saved_files = []
    for file in files:
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            # Handle duplicate filenames by appending _1, _2, ... to the stem
            base, ext = os.path.splitext(filename)
            counter = 1
            while os.path.exists(os.path.join(upload_dir, filename)):
                filename = f"{base}_{counter}{ext}"
                counter += 1
            file.save(os.path.join(upload_dir, filename))
            saved_files.append(filename)

    if not saved_files:
        shutil.rmtree(upload_dir)
        return jsonify({'error': 'No valid image files'}), 400

    # Check if we have reference photos loaded
    has_references = False
    ref_count = 0
    if session_id and session_id in face_matchers:
        ref_count = face_matchers[session_id].get_reference_count()
        has_references = ref_count > 0

    # Initialize job
    processing_jobs[job_id] = {
        'status': 'queued',
        'progress': 0,
        'message': 'Uploading files...',
        'total_files': len(saved_files),
        'total_uploaded': len(saved_files),
        'upload_dir': upload_dir,
        'session_id': session_id,
        'has_reference_photos': has_references,
        'reference_count': ref_count,
        'quality_mode': quality_mode,
        'similarity_threshold': similarity_threshold,
        'results': None
    }

    # Decide which processing mode to use
    if has_references:
        # With reference photos: Phase 1 = face filtering only, then review step
        print(f"\n[Job {job_id}] NEW JOB - Face Filtering Mode")
        print(f" - Files uploaded: {len(saved_files)}")
        print(f" - Reference photos: {ref_count}")
        print(f" - Session ID: {session_id}")
        thread = threading.Thread(
            target=process_photos_face_filter_only,
            args=(job_id, upload_dir, session_id)
        )
        message = f'Scanning {len(saved_files)} photos to find your child using {ref_count} reference(s)...'
    else:
        # Without reference photos: Full automatic processing (no review step)
        print(f"\n[Job {job_id}] NEW JOB - Full Automatic Mode")
        print(f" - Files uploaded: {len(saved_files)}")
        print(f" - Quality mode: {quality_mode}")
        print(f" - Similarity threshold: {similarity_threshold}")
        thread = threading.Thread(
            target=process_photos_automatic,
            args=(job_id, upload_dir, quality_mode, similarity_threshold, session_id)
        )
        message = 'Processing started - AI will automatically select the best photos!'
    thread.start()

    return jsonify({
        'job_id': job_id,
        'files_uploaded': len(saved_files),
        'has_reference_photos': has_references,
        'reference_count': ref_count,
        'message': message,
        'needs_review': has_references  # Client should redirect to review page
    })
@app.route('/upload_folder', methods=['POST'])
def upload_folder():
    """Process photos from a local folder path (for large batches).

    Reads ``folder_path``, ``quality_mode`` and ``similarity_threshold`` from
    the JSON body. The photos are NOT copied: the job's ``upload_dir`` points
    at the original folder and only a thumbnails directory is created under
    ``UPLOAD_FOLDER/<job_id>``. The job is flagged with ``is_local_folder``.

    Returns:
        JSON describing the started job, or a JSON error with status 400
        when the body is invalid, the folder does not exist, or it contains
        no files with a supported image extension.
    """
    # A missing, malformed or non-object JSON body must end in a 400 rather
    # than an AttributeError on None.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    folder_path = str(data.get('folder_path') or '').strip()
    quality_mode = data.get('quality_mode', 'balanced')
    try:
        similarity_threshold = float(data.get('similarity_threshold', 0.92))
    except (TypeError, ValueError):
        return jsonify({'error': 'Invalid similarity_threshold value'}), 400

    if not folder_path:
        return jsonify({'error': 'No folder path provided'}), 400
    # Validate folder exists
    if not os.path.isdir(folder_path):
        return jsonify({'error': f'Folder not found: {folder_path}'}), 400

    # Get session ID for face matching
    session_id = session.get('session_id')

    # Create job with reference to original folder
    job_id = str(uuid.uuid4())[:8]

    # Count valid image files
    image_extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp'}
    image_files = [f for f in os.listdir(folder_path)
                   if os.path.splitext(f.lower())[1] in image_extensions]
    if not image_files:
        return jsonify({'error': 'No valid image files found in folder'}), 400

    print(f"\n[Job {job_id}] LOCAL FOLDER MODE")
    print(f" - Folder: {folder_path}")
    print(f" - Images found: {len(image_files)}")

    # Check if we have reference photos loaded
    has_references = False
    ref_count = 0
    if session_id and session_id in face_matchers:
        ref_count = face_matchers[session_id].get_reference_count()
        has_references = ref_count > 0

    # Create thumbnails directory
    thumb_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails')
    os.makedirs(thumb_dir, exist_ok=True)

    # Initialize job - use original folder path as upload_dir
    processing_jobs[job_id] = {
        'status': 'queued',
        'progress': 0,
        'message': 'Preparing to process photos...',
        'total_files': len(image_files),
        'total_uploaded': len(image_files),
        'upload_dir': folder_path,  # Point to original folder
        'thumb_dir': thumb_dir,
        'session_id': session_id,
        'has_reference_photos': has_references,
        'reference_count': ref_count,
        'quality_mode': quality_mode,
        'similarity_threshold': similarity_threshold,
        'is_local_folder': True,  # Flag for local folder mode
        'results': None
    }

    # Decide which processing mode to use
    if has_references:
        print(f" - Reference photos: {ref_count}")
        print(f" - Mode: Face Filtering")
        thread = threading.Thread(
            target=process_photos_face_filter_only,
            args=(job_id, folder_path, session_id)
        )
        message = f'Scanning {len(image_files)} photos to find your child...'
    else:
        print(f" - Mode: Full Automatic")
        thread = threading.Thread(
            target=process_photos_automatic,
            args=(job_id, folder_path, quality_mode, similarity_threshold, session_id)
        )
        message = 'Processing started - AI will automatically select the best photos!'
    thread.start()

    return jsonify({
        'job_id': job_id,
        'files_found': len(image_files),
        'has_reference_photos': has_references,
        'reference_count': ref_count,
        'message': message,
        'needs_review': has_references
    })
@app.route('/status/<job_id>')
def get_status(job_id):
    """Get processing status.

    Returns:
        JSON with ``status``, ``progress``, ``message``, ``total_photos`` and
        ``photos_checked``; a ``summary`` is added once the job is complete
        and has in-memory results. Responds 404 for an unknown job.
    """
    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404

    job = processing_jobs[job_id]
    response = {
        'status': job['status'],
        'progress': job['progress'],
        'message': job['message'],
        'total_photos': job.get('total_photos', 0),
        'photos_checked': job.get('photos_checked', 0)
    }

    # Not every job dict is created with a 'results' key (the Drive reupload
    # job is not), so read it defensively instead of indexing.
    results = job.get('results')
    if job['status'] == 'complete' and results:
        response['summary'] = results['summary']
    return jsonify(response)
@app.route('/results/<job_id>')
def get_results(job_id):
    """Return the final results of a job as JSON.

    In-memory results of a completed job are preferred; otherwise the
    ``<job_id>.json`` file in ``RESULTS_FOLDER`` is used. Unknown jobs without
    a results file and completed jobs without any results yield 404, jobs
    still running yield 400, unexpected failures yield 500.
    """
    try:
        saved_path = os.path.join(RESULTS_FOLDER, f"{job_id}.json")
        job_known = job_id in processing_jobs

        if job_known:
            job = processing_jobs[job_id]
            if job['status'] != 'complete':
                return jsonify({
                    'error': 'Processing not complete',
                    'status': job['status'],
                    'message': job.get('message', '')
                }), 400
            # Memory first ...
            in_memory = job.get('results')
            if in_memory:
                return jsonify(in_memory)

        # ... then the results file (also the only source for unknown jobs)
        if os.path.exists(saved_path):
            with open(saved_path, 'r') as fh:
                return jsonify(json.load(fh))

        not_found = 'Results not found' if job_known else 'Job not found'
        return jsonify({'error': not_found}), 404
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
@app.route('/thumbnail/<job_id>/<filename>')
def get_thumbnail(job_id, filename):
    """Serve a thumbnail, creating it from the original photo if missing.

    Lookup order: the cached thumbnail (name from ``get_thumbnail_name``), a
    freshly generated one when the original exists in the job directory,
    then a file called ``filename`` inside the thumbnails directory.
    Responds 404 when none of these is available.
    """
    job_dir = os.path.join(UPLOAD_FOLDER, job_id)
    thumb_dir = os.path.join(job_dir, 'thumbnails')
    thumb_name = get_thumbnail_name(filename)
    thumb_path = os.path.join(thumb_dir, thumb_name)

    if not os.path.exists(thumb_path):
        # Generate thumbnail on-demand for unmatched photos
        source_path = os.path.join(job_dir, filename)
        if os.path.exists(source_path):
            os.makedirs(thumb_dir, exist_ok=True)
            create_thumbnail(source_path, thumb_path)

    if os.path.exists(thumb_path):
        return send_from_directory(thumb_dir, thumb_name)

    # Last resort: the requested name may already be a thumbnail file name
    if os.path.exists(os.path.join(thumb_dir, filename)):
        return send_from_directory(thumb_dir, filename)

    return jsonify({'error': 'Thumbnail not found'}), 404
@app.route('/photo/<job_id>/<filename>')
def get_photo(job_id, filename):
    """Serve full-size photos with proper EXIF rotation handling.

    * ``.heic`` / ``.heif`` files are converted to JPEG.
    * ``.jpg`` / ``.jpeg`` files are transposed according to their EXIF
      Orientation tag and re-encoded as JPEG (quality 90).
    * Every other format is served unchanged.

    If conversion fails, the original file is served as a fallback.
    Responds 404 when the file does not exist in the job's upload directory.
    """
    from io import BytesIO
    from PIL import ImageOps

    photo_dir = os.path.join(UPLOAD_FOLDER, job_id)
    filepath = os.path.join(photo_dir, filename)
    if not os.path.exists(filepath):
        return jsonify({'error': 'File not found'}), 404

    def _jpeg_response(image):
        """Encode *image* as an in-memory JPEG and wrap it in a response."""
        # Convert to RGB if needed (handles RGBA, P mode, etc.)
        if image.mode != 'RGB':
            image = image.convert('RGB')
        buffer = BytesIO()
        image.save(buffer, format='JPEG', quality=90)
        buffer.seek(0)
        return send_file(buffer, mimetype='image/jpeg')

    ext = os.path.splitext(filename)[1].lower()

    # Handle HEIC/HEIF - convert to JPEG
    if ext in ('.heic', '.heif'):
        try:
            # Context manager closes the underlying file handle promptly.
            with Image.open(filepath) as img:
                return _jpeg_response(img)
        except Exception as e:
            print(f"Error converting HEIC: {e}")
            return send_from_directory(photo_dir, filename)

    # Handle JPG/JPEG - apply EXIF rotation
    if ext in ('.jpg', '.jpeg'):
        try:
            with Image.open(filepath) as img:
                # exif_transpose covers all eight EXIF orientations
                # (including the mirrored ones) through Pillow's public API.
                upright = ImageOps.exif_transpose(img)
                return _jpeg_response(upright)
        except Exception as e:
            print(f"Error processing JPEG: {e}")
            return send_from_directory(photo_dir, filename)

    # Other formats - serve directly
    return send_from_directory(photo_dir, filename)
@app.route('/download/<job_id>')
def download_selected(job_id):
    """Download selected photos as zip with timestamp-sorted naming.

    Uses DISK-BASED ZIP creation (not memory) to handle large photo sets (1000+).
    The ZIP is created on disk, then streamed to the browser in chunks.
    This prevents memory issues and timeouts on large downloads.

    Photos with a timestamp are grouped by (year, month), sorted by time and
    stored as ``<Mon>_<n>_<original name>``; photos without one are stored as
    ``NoDate_<n>_<original name>``.

    Returns:
        A streaming ``application/zip`` response, or a JSON error: 404 for an
        unknown job / no selected photos / no upload directory / no files on
        disk, 400 when the job is not complete, 500 when ZIP creation fails.
    """
    import zipfile
    import tempfile
    from datetime import datetime
    from collections import defaultdict

    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404
    job = processing_jobs[job_id]
    if job['status'] != 'complete':
        return jsonify({'error': 'Processing not complete'}), 400

    # 'results' is initialised to None on upload, so a plain
    # .get('results', {}) could still hand back None.
    results = job.get('results') or {}
    selected = results.get('selected', [])
    upload_dir = job.get('upload_dir', '')
    if not selected:
        return jsonify({'error': 'No selected photos found'}), 404
    if not upload_dir:
        return jsonify({'error': 'Upload directory not found'}), 404

    print(f"[Download] Starting disk-based ZIP for {len(selected)} photos...")

    # Month abbreviations
    MONTH_ABBREV = {
        1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr",
        5: "May", 6: "Jun", 7: "Jul", 8: "Aug",
        9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec"
    }

    # Import timestamp extractor
    from photo_selector.utils import get_photo_timestamp

    # Group photos by month and sort by timestamp
    photos_by_month = defaultdict(list)
    photos_no_timestamp = []
    for photo in selected:
        filename = photo.get('filename', '')
        ts = photo.get('timestamp')
        # If no timestamp stored, try to extract it from the photo file
        if not ts:
            photo_path = os.path.join(upload_dir, filename)
            if os.path.exists(photo_path):
                dt = get_photo_timestamp(photo_path)
                if dt:
                    ts = dt.timestamp()
        if ts:
            dt = datetime.fromtimestamp(ts)
            # Group by year-month to handle multi-year datasets
            photos_by_month[(dt.year, dt.month)].append({
                'filename': filename,
                'timestamp': ts
            })
        else:
            photos_no_timestamp.append({'filename': filename, 'timestamp': 0})

    # Sort photos within each month by timestamp
    for month_photos in photos_by_month.values():
        month_photos.sort(key=lambda x: x['timestamp'])

    # Create ZIP file ON DISK (not in memory) to handle large photo sets.
    # mkstemp gives every request its own file, so two concurrent downloads
    # of the same job cannot overwrite or delete each other's archive.
    fd, temp_zip_path = tempfile.mkstemp(prefix=f'selected_photos_{job_id}_', suffix='.zip')
    os.close(fd)
    files_added = 0
    try:
        # Use ZIP_STORED (no compression) for faster creation with photos (already compressed)
        with zipfile.ZipFile(temp_zip_path, 'w', zipfile.ZIP_STORED) as zf:
            # Add photos with timestamps (sorted and renamed)
            for month_key in sorted(photos_by_month.keys()):
                _year, month = month_key
                month_abbrev = MONTH_ABBREV[month]
                for idx, photo in enumerate(photos_by_month[month_key], start=1):
                    original_filename = photo['filename']
                    photo_path = os.path.join(upload_dir, original_filename)
                    if os.path.exists(photo_path):
                        # Create new filename: Jan_1_originalname.jpg
                        base_name, ext = os.path.splitext(original_filename)
                        new_filename = f"{month_abbrev}_{idx}_{base_name}{ext}"
                        zf.write(photo_path, new_filename)
                        files_added += 1
                        # Log progress every 100 files
                        if files_added % 100 == 0:
                            print(f"[Download] Added {files_added} files to ZIP...")
                    else:
                        print(f"[Download] File not found: {photo_path}")

            # Add photos without timestamps at the end with "NoDate" prefix
            for idx, photo in enumerate(photos_no_timestamp, start=1):
                original_filename = photo['filename']
                photo_path = os.path.join(upload_dir, original_filename)
                if os.path.exists(photo_path):
                    base_name, ext = os.path.splitext(original_filename)
                    new_filename = f"NoDate_{idx}_{base_name}{ext}"
                    zf.write(photo_path, new_filename)
                    files_added += 1
                else:
                    print(f"[Download] File not found: {photo_path}")

        if files_added == 0:
            # Clean up empty zip
            if os.path.exists(temp_zip_path):
                os.remove(temp_zip_path)
            return jsonify({'error': f'No files found in {upload_dir}. Files may have been cleaned up.'}), 404

        # Get file size for logging
        zip_size_mb = os.path.getsize(temp_zip_path) / (1024 * 1024)
        print(f"[Download] ZIP created: {files_added} files, {zip_size_mb:.1f} MB")

        # Stream the file to browser and delete after sending
        def generate_and_cleanup():
            """Generator that streams ZIP file and deletes it after completion."""
            try:
                with open(temp_zip_path, 'rb') as f:
                    while True:
                        chunk = f.read(8192 * 16)  # 128KB chunks for faster streaming
                        if not chunk:
                            break
                        yield chunk
            finally:
                # Clean up temp file after streaming
                try:
                    if os.path.exists(temp_zip_path):
                        os.remove(temp_zip_path)
                        print(f"[Download] Cleaned up temp ZIP: {temp_zip_path}")
                except Exception as e:
                    print(f"[Download] Error cleaning up temp ZIP: {e}")

        # Return streaming response
        response = Response(
            generate_and_cleanup(),
            mimetype='application/zip',
            headers={
                'Content-Disposition': f'attachment; filename=selected_photos_{job_id}.zip',
                'Content-Length': str(os.path.getsize(temp_zip_path))
            }
        )
        return response
    except Exception as e:
        # Clean up on error
        if os.path.exists(temp_zip_path):
            os.remove(temp_zip_path)
        print(f"[Download] Error creating ZIP: {e}")
        return jsonify({'error': f'Error creating ZIP: {str(e)}'}), 500
@app.route('/download_filtered/<job_id>')
def download_filtered(job_id):
    """Download all filtered photos (after face matching, before quality selection).

    Uses DISK-BASED ZIP creation (not memory) to handle large photo sets (1000+).
    The list of photos comes from the job's in-memory ``review_data`` or,
    failing that, from ``<job_id>_review.json`` in ``RESULTS_FOLDER``. Files
    keep their original names inside the archive.

    Returns:
        A streaming ``application/zip`` response, or a JSON error: 404 for an
        unknown job / no filtered photos / no upload directory / no files on
        disk, 500 when ZIP creation fails.
    """
    import zipfile
    import tempfile

    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404
    job = processing_jobs[job_id]

    # Get filtered photos from review data
    filtered_photos = []
    if 'review_data' in job:
        filtered_photos = [p['filename'] for p in job['review_data'].get('filtered_photos', [])]
    else:
        # Try to load from file
        review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
        if os.path.exists(review_file):
            with open(review_file, 'r') as f:
                review_data = json.load(f)
            filtered_photos = [p['filename'] for p in review_data.get('filtered_photos', [])]
    if not filtered_photos:
        return jsonify({'error': 'No filtered photos found'}), 404

    # Same guard as the other download routes: a job without an upload
    # directory is reported as 404 instead of surfacing a KeyError as a 500.
    upload_dir = job.get('upload_dir', '')
    if not upload_dir:
        return jsonify({'error': 'Upload directory not found'}), 404

    print(f"[Download] Starting disk-based ZIP for {len(filtered_photos)} filtered photos...")

    # Create ZIP file ON DISK (not in memory) to handle large photo sets.
    # mkstemp gives every request its own file, so two concurrent downloads
    # of the same job cannot overwrite or delete each other's archive.
    fd, temp_zip_path = tempfile.mkstemp(prefix=f'filtered_photos_{job_id}_', suffix='.zip')
    os.close(fd)
    files_added = 0
    try:
        with zipfile.ZipFile(temp_zip_path, 'w', zipfile.ZIP_STORED) as zf:
            for filename in filtered_photos:
                photo_path = os.path.join(upload_dir, filename)
                if os.path.exists(photo_path):
                    zf.write(photo_path, filename)
                    files_added += 1
                    if files_added % 100 == 0:
                        print(f"[Download] Added {files_added} files to ZIP...")

        if files_added == 0:
            if os.path.exists(temp_zip_path):
                os.remove(temp_zip_path)
            return jsonify({'error': 'No files found. Files may have been cleaned up.'}), 404

        zip_size_mb = os.path.getsize(temp_zip_path) / (1024 * 1024)
        print(f"[Download] ZIP created: {files_added} files, {zip_size_mb:.1f} MB")

        # Stream the file and delete after sending
        def generate_and_cleanup():
            try:
                with open(temp_zip_path, 'rb') as f:
                    while True:
                        chunk = f.read(8192 * 16)  # 128KB chunks
                        if not chunk:
                            break
                        yield chunk
            finally:
                try:
                    if os.path.exists(temp_zip_path):
                        os.remove(temp_zip_path)
                        print(f"[Download] Cleaned up temp ZIP: {temp_zip_path}")
                except Exception as e:
                    print(f"[Download] Error cleaning up temp ZIP: {e}")

        return Response(
            generate_and_cleanup(),
            mimetype='application/zip',
            headers={
                'Content-Disposition': f'attachment; filename=filtered_photos_{job_id}.zip',
                'Content-Length': str(os.path.getsize(temp_zip_path))
            }
        )
    except Exception as e:
        if os.path.exists(temp_zip_path):
            os.remove(temp_zip_path)
        print(f"[Download] Error creating ZIP: {e}")
        return jsonify({'error': f'Error creating ZIP: {str(e)}'}), 500
@app.route('/download_unmatched/<job_id>')
def download_unmatched(job_id):
    """Download photos where target person was NOT detected, with timestamp-sorted naming.

    The list of photos comes from ``unmatched_photos`` in the job's in-memory
    ``review_data`` or, failing that, in ``<job_id>_review.json``. Photos
    with a timestamp are grouped by (year, month), sorted by time and stored
    as ``<Mon>_<n>_<original name>``; the rest are stored as
    ``NoDate_<n>_<original name>``. The ZIP is built on disk and streamed.

    Returns:
        A streaming ``application/zip`` response, or a JSON error: 404 for an
        unknown job / no upload directory / no unmatched photos / no files
        on disk, 500 when ZIP creation fails.
    """
    import zipfile
    import tempfile
    from datetime import datetime
    from collections import defaultdict

    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404
    job = processing_jobs[job_id]
    upload_dir = job.get('upload_dir', '')
    if not upload_dir:
        return jsonify({'error': 'Upload directory not found'}), 404

    # Get unmatched photos from review data
    unmatched_photos = []
    if 'review_data' in job:
        unmatched_photos = job['review_data'].get('unmatched_photos', [])
    else:
        # Try to load from file
        review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
        if os.path.exists(review_file):
            with open(review_file, 'r') as f:
                review_data = json.load(f)
            unmatched_photos = review_data.get('unmatched_photos', [])
    if not unmatched_photos:
        return jsonify({'error': 'No unmatched photos found'}), 404

    print(f"[Download] Starting disk-based ZIP for {len(unmatched_photos)} unmatched photos...")

    # Month abbreviations
    MONTH_ABBREV = {
        1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr",
        5: "May", 6: "Jun", 7: "Jul", 8: "Aug",
        9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec"
    }

    # Import timestamp extractor
    from photo_selector.utils import get_photo_timestamp

    # Group photos by month and sort by timestamp
    photos_by_month = defaultdict(list)
    photos_no_timestamp = []
    for photo in unmatched_photos:
        filename = photo.get('filename', '')
        ts = photo.get('timestamp')
        # If no timestamp stored, try to extract it from the photo file
        if not ts:
            photo_path = os.path.join(upload_dir, filename)
            if os.path.exists(photo_path):
                dt = get_photo_timestamp(photo_path)
                if dt:
                    ts = dt.timestamp()
        if ts:
            dt = datetime.fromtimestamp(ts)
            photos_by_month[(dt.year, dt.month)].append({
                'filename': filename,
                'timestamp': ts
            })
        else:
            photos_no_timestamp.append({'filename': filename})

    # Sort photos within each month by timestamp
    for month_photos in photos_by_month.values():
        month_photos.sort(key=lambda x: x['timestamp'])

    # Create ZIP file ON DISK (not in memory) to handle large photo sets.
    # mkstemp gives every request its own file, so two concurrent downloads
    # of the same job cannot overwrite or delete each other's archive.
    fd, temp_zip_path = tempfile.mkstemp(prefix=f'unmatched_photos_{job_id}_', suffix='.zip')
    os.close(fd)
    files_added = 0
    try:
        with zipfile.ZipFile(temp_zip_path, 'w', zipfile.ZIP_STORED) as zf:
            # Add photos with timestamps (sorted and renamed)
            for month_key in sorted(photos_by_month.keys()):
                _year, month = month_key
                month_abbrev = MONTH_ABBREV[month]
                for idx, photo in enumerate(photos_by_month[month_key], start=1):
                    original_filename = photo['filename']
                    photo_path = os.path.join(upload_dir, original_filename)
                    if os.path.exists(photo_path):
                        base_name, ext = os.path.splitext(original_filename)
                        new_filename = f"{month_abbrev}_{idx}_{base_name}{ext}"
                        zf.write(photo_path, new_filename)
                        files_added += 1
                        if files_added % 100 == 0:
                            print(f"[Download] Added {files_added} files to ZIP...")

            # Add photos without timestamps at the end
            for idx, photo in enumerate(photos_no_timestamp, start=1):
                original_filename = photo['filename']
                photo_path = os.path.join(upload_dir, original_filename)
                if os.path.exists(photo_path):
                    base_name, ext = os.path.splitext(original_filename)
                    new_filename = f"NoDate_{idx}_{base_name}{ext}"
                    zf.write(photo_path, new_filename)
                    files_added += 1

        if files_added == 0:
            if os.path.exists(temp_zip_path):
                os.remove(temp_zip_path)
            return jsonify({'error': 'No files found in upload directory'}), 404

        zip_size_mb = os.path.getsize(temp_zip_path) / (1024 * 1024)
        print(f"[Download] ZIP created: {files_added} files, {zip_size_mb:.1f} MB")

        # Stream the file and delete after sending
        def generate_and_cleanup():
            try:
                with open(temp_zip_path, 'rb') as f:
                    while True:
                        chunk = f.read(8192 * 16)  # 128KB chunks
                        if not chunk:
                            break
                        yield chunk
            finally:
                try:
                    if os.path.exists(temp_zip_path):
                        os.remove(temp_zip_path)
                        print(f"[Download] Cleaned up temp ZIP: {temp_zip_path}")
                except Exception as e:
                    print(f"[Download] Error cleaning up temp ZIP: {e}")

        return Response(
            generate_and_cleanup(),
            mimetype='application/zip',
            headers={
                'Content-Disposition': f'attachment; filename=unmatched_photos_{job_id}.zip',
                'Content-Length': str(os.path.getsize(temp_zip_path))
            }
        )
    except Exception as e:
        if os.path.exists(temp_zip_path):
            os.remove(temp_zip_path)
        print(f"[Download] Error creating ZIP: {e}")
        return jsonify({'error': f'Error creating ZIP: {str(e)}'}), 500
@app.route('/cleanup/<job_id>', methods=['POST'])
def cleanup_job(job_id):
    """Clean up job files.

    Removes the job's files on disk, its entry in ``processing_jobs`` and
    its results / review JSON files. Always answers with a JSON
    confirmation, even when the job is unknown.
    """
    if job_id in processing_jobs:
        job = processing_jobs[job_id]
        if job.get('is_local_folder'):
            # Local-folder jobs store the USER'S ORIGINAL folder in
            # 'upload_dir'; deleting it would destroy the source photos.
            # Only the artifacts generated under UPLOAD_FOLDER/<job_id>
            # (the thumbnails directory) belong to this app.
            generated_dir = os.path.join(UPLOAD_FOLDER, job_id)
            if os.path.isdir(generated_dir):
                shutil.rmtree(generated_dir)
        else:
            upload_dir = job.get('upload_dir')
            if upload_dir and os.path.exists(upload_dir):
                shutil.rmtree(upload_dir)
        del processing_jobs[job_id]

    results_file = os.path.join(RESULTS_FOLDER, f"{job_id}.json")
    if os.path.exists(results_file):
        os.remove(results_file)

    # Also clean up review file
    review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
    if os.path.exists(review_file):
        os.remove(review_file)

    return jsonify({'message': 'Cleaned up'})
# ==================== REVIEW WORKFLOW ROUTES ====================
@app.route('/step3_review/<job_id>')
def step3_review(job_id):
    """Render the review page (step 3) for a job.

    Unknown jobs get the index page. Jobs that are neither waiting for
    review nor complete get the step-2 upload page again.
    """
    job = processing_jobs[job_id] if job_id in processing_jobs else None
    if job is None:
        return render_template('index.html')

    ready_for_review = job['status'] in ('review_pending', 'complete')
    if ready_for_review:
        return render_template('step3_review.html', job_id=job_id)

    # Still processing or error - back to step 2
    return render_template(
        'step2_upload.html',
        session_id=session.get('session_id'),
        reference_count=job.get('reference_count', 0),
    )
@app.route('/review_data/<job_id>')
def get_review_data(job_id):
    """Return the review payload (filtered photos) of a job as JSON.

    The in-memory ``review_data`` of the job wins; otherwise the
    ``<job_id>_review.json`` file is read. Responds 404 when the job is
    unknown or no review data can be found.
    """
    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404

    try:
        cached = processing_jobs[job_id]['review_data']
    except KeyError:
        # Nothing in memory - fall through to the file on disk
        pass
    else:
        return jsonify(cached)

    review_path = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
    if not os.path.exists(review_path):
        return jsonify({'error': 'Review data not found'}), 404

    with open(review_path, 'r') as fh:
        payload = json.load(fh)
    return jsonify(payload)
@app.route('/review_thumbnail/<job_id>/<filename>')
def get_review_thumbnail(job_id, filename):
    """Serve a thumbnail for the review page.

    Looks in ``uploads/<job_id>/thumbnails`` first and, for known jobs, in a
    ``thumbnails`` folder inside the job's ``upload_dir`` second. When the
    file is in neither place the primary directory is still used for the
    response.
    """
    def _holds_file(directory):
        return os.path.exists(os.path.join(directory, filename))

    primary_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails')

    if not _holds_file(primary_dir) and job_id in processing_jobs:
        # Fallback: thumbnails kept inside the upload_dir (older jobs)
        job_upload_dir = processing_jobs[job_id].get('upload_dir', '')
        legacy_dir = os.path.join(job_upload_dir, 'thumbnails')
        if _holds_file(legacy_dir):
            return send_from_directory(legacy_dir, filename)

    return send_from_directory(primary_dir, filename)
@app.route('/review_photo/<job_id>/<filename>')
def get_review_photo(job_id, filename):
    """Serve full-size photo for review modal with EXIF rotation handling.

    This route used to carry a line-for-line copy of ``get_photo``. It now
    delegates to that view so both URLs always share one implementation
    (HEIC/HEIF conversion, JPEG EXIF rotation, direct serving of other
    formats, 404 for missing files).
    """
    return get_photo(job_id, filename)
@app.route('/confirm_selection/<job_id>', methods=['POST'])
def confirm_selection(job_id):
    """Accept the user's reviewed photo list and start quality-based selection.

    Expects a JSON body with ``selected_photos`` (non-empty) and an optional
    ``embedding_model`` ('siglip' or 'clip'; anything else falls back to
    'siglip'). Marks the job as processing and launches
    ``process_photos_quality_selection`` in a background thread.

    Returns:
        JSON with a status message and ``confirmed_count``; 404 if the job is
        unknown, 400 if no photos were supplied.
    """
    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404
    job = processing_jobs[job_id]

    # Get confirmed photos from request
    data = request.get_json()
    if not data or 'selected_photos' not in data:
        return jsonify({'error': 'No photos selected'}), 400
    confirmed_photos = data['selected_photos']
    if not confirmed_photos:
        return jsonify({'error': 'At least one photo must be selected'}), 400

    # Get embedding model selection (default to siglip)
    embedding_model = data.get('embedding_model', 'siglip')
    if embedding_model not in ['siglip', 'clip']:
        embedding_model = 'siglip'

    # Get processing parameters from job
    quality_mode = job.get('quality_mode', 'balanced')
    similarity_threshold = job.get('similarity_threshold', 0.92)
    upload_dir = job.get('upload_dir')

    # Load cached face data (to avoid re-detection in scoring). A job loaded
    # from a saved dataset may carry 'review_data': None, so treat a missing
    # and an empty value the same way and fall back to the review file.
    review_data = job.get('review_data')
    if not review_data:
        review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
        if os.path.exists(review_file):
            with open(review_file, 'r') as f:
                review_data = json.load(f)

    face_data_cache = {}
    for photo in (review_data or {}).get('filtered_photos', []):
        filename = photo.get('filename')
        if filename:
            face_data_cache[filename] = {
                'num_faces': photo.get('num_faces', 0),
                'face_bboxes': photo.get('face_bboxes', [])
            }
    print(f"[Job {job_id}] Loaded face data cache for {len(face_data_cache)} photos")

    # Update job status
    job['status'] = 'processing'
    job['progress'] = 0
    job['message'] = 'Starting quality-based selection...'
    job['confirmed_photos'] = confirmed_photos

    # Start phase 2 processing
    thread = threading.Thread(
        target=process_photos_quality_selection,
        args=(job_id, upload_dir, quality_mode, similarity_threshold, confirmed_photos, face_data_cache, embedding_model)
    )
    thread.start()

    return jsonify({
        'message': f'Processing {len(confirmed_photos)} confirmed photos...',
        'confirmed_count': len(confirmed_photos)
    })
@app.route('/step4_results/<job_id>')
def step4_results(job_id):
    """Step 4: render the final results page for a job.

    Unknown job ids fall back to the index page. The reference-photo count is
    read from the face matcher registered for the browser session; if that
    session has no matcher, the session id stored on the job is tried instead
    (jobs created by loading a saved dataset register their matcher there).
    """
    if job_id not in processing_jobs:
        return render_template('index.html')
    job = processing_jobs[job_id]

    # Check reference count from session, then from the job itself
    session_id = session.get('session_id')
    if not session_id or session_id not in face_matchers:
        session_id = job.get('session_id')

    ref_count = 0
    if session_id and session_id in face_matchers:
        ref_count = face_matchers[session_id].get_reference_count()

    return render_template('step4_results.html',
                           job_id=job_id,
                           reference_count=ref_count)
# ==================== TEST SINGLE MONTH ROUTES ====================
@app.route('/test-month')
def test_month_page():
    """Render the single-month photo selection test page."""
    template_name = 'test_month.html'
    return render_template(template_name)
@app.route('/test-month/start', methods=['POST'])
def test_month_start():
    """Start processing a single month folder that already exists on disk.

    Expects a JSON object with ``folder_path`` and optional ``target``
    (default 40) and ``organize_by_month`` (default False). Creates a job,
    prepares its thumbnail directory and runs ``process_test_month`` in a
    background thread.

    Returns:
        JSON describing the new job, or a 400 error when the body, the target
        or the folder is invalid, or when the folder has no image files.
    """
    # A missing or non-object body is treated as "no parameters" so the
    # caller gets a 400 below instead of an unhandled exception.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}

    folder_path = str(data.get('folder_path') or '').strip()
    try:
        target = int(data.get('target', 40))
    except (TypeError, ValueError):
        return jsonify({'error': 'target must be an integer'}), 400
    organize_by_month = data.get('organize_by_month', False)

    if not folder_path:
        return jsonify({'error': 'No folder path provided'}), 400
    if not os.path.isdir(folder_path):
        return jsonify({'error': f'Folder not found: {folder_path}'}), 400

    # Count valid image files
    extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp'}
    image_files = [f for f in os.listdir(folder_path)
                   if os.path.splitext(f.lower())[1] in extensions]
    if not image_files:
        return jsonify({'error': 'No valid image files found in folder'}), 400

    # Create job
    job_id = str(uuid.uuid4())[:8]

    # Create thumbnails directory
    thumb_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails')
    os.makedirs(thumb_dir, exist_ok=True)

    processing_jobs[job_id] = {
        'status': 'processing',
        'progress': 0,
        'message': 'Starting test...',
        'folder_path': folder_path,
        'thumb_dir': thumb_dir,
        'target': target,
        'total_files': len(image_files),
        'results': None,
        'organize_by_month': organize_by_month
    }

    # Start processing in background
    thread = threading.Thread(
        target=process_test_month,
        args=(job_id, folder_path, target, thumb_dir, organize_by_month)
    )
    thread.start()

    return jsonify({
        'job_id': job_id,
        'total_photos': len(image_files),
        'target': target,
        'organize_by_month': organize_by_month,
        'message': f'Processing {len(image_files)} photos...'
    })
@app.route('/test-month/upload', methods=['POST'])
def test_month_upload():
    """Handle uploaded photos for test-month (for HuggingFace deployment).

    Reads the multipart field ``photos`` plus the form fields ``target``
    (default 40) and ``organize_by_month`` ('true'/'false'). Files with a
    supported image extension are saved under a new job's ``photos``
    directory and ``process_test_month`` is started in a background thread.

    Returns:
        JSON describing the new job, or a 400 error when nothing usable was
        uploaded or ``target`` is not an integer.
    """
    if 'photos' not in request.files:
        return jsonify({'error': 'No photos uploaded'}), 400

    files = request.files.getlist('photos')
    try:
        target = int(request.form.get('target', 40))
    except (TypeError, ValueError):
        return jsonify({'error': 'target must be an integer'}), 400
    organize_by_month = request.form.get('organize_by_month', 'false').lower() == 'true'

    if not files:
        return jsonify({'error': 'No photos uploaded'}), 400

    # Filter valid image files
    extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp'}
    valid_files = [f for f in files if f.filename and
                   os.path.splitext(f.filename.lower())[1] in extensions]
    if not valid_files:
        return jsonify({'error': 'No valid image files uploaded'}), 400

    # Create job and upload directory
    job_id = str(uuid.uuid4())[:8]
    upload_dir = os.path.join(UPLOAD_FOLDER, job_id, 'photos')
    thumb_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails')
    os.makedirs(upload_dir, exist_ok=True)
    os.makedirs(thumb_dir, exist_ok=True)

    # Save uploaded files
    saved_files = []
    for f in valid_files:
        original_ext = os.path.splitext(f.filename.lower())[1]
        filename = secure_filename(f.filename)
        # secure_filename drops non-ASCII characters, which can strip the
        # whole stem and leave a name without its extension. Such a file
        # would be saved but later skipped by the extension filter, so give
        # it a generated name that keeps the validated extension.
        if os.path.splitext(filename.lower())[1] not in extensions:
            filename = f"photo_{uuid.uuid4().hex[:8]}{original_ext}"

        # Handle duplicate filenames
        base, ext = os.path.splitext(filename)
        counter = 1
        while os.path.exists(os.path.join(upload_dir, filename)):
            filename = f"{base}_{counter}{ext}"
            counter += 1

        filepath = os.path.join(upload_dir, filename)
        f.save(filepath)
        saved_files.append(filename)

    processing_jobs[job_id] = {
        'status': 'processing',
        'progress': 0,
        'message': 'Starting test...',
        'folder_path': upload_dir,  # Use upload dir as folder path
        'thumb_dir': thumb_dir,
        'target': target,
        'total_files': len(saved_files),
        'results': None,
        'is_upload': True,
        'organize_by_month': organize_by_month
    }

    # Start processing in background
    thread = threading.Thread(
        target=process_test_month,
        args=(job_id, upload_dir, target, thumb_dir, organize_by_month)
    )
    thread.start()

    return jsonify({
        'job_id': job_id,
        'total_photos': len(saved_files),
        'target': target,
        'organize_by_month': organize_by_month,
        'message': f'Processing {len(saved_files)} uploaded photos...'
    })
@app.route('/test-month/upload-init', methods=['POST'])
def test_month_upload_init():
    """Initialize a chunked upload for test-month.

    Expects a JSON object with optional ``total_files`` (default 0),
    ``target`` (default 40) and ``organize_by_month`` (default False).
    Creates the job's ``photos`` and ``thumbnails`` directories and records
    an entry in ``upload_sessions`` keyed by ``test_<job_id>``.

    Returns:
        JSON with ``session_id`` and ``job_id``, or a 400 error when
        ``target`` is not an integer.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}

    total_files = data.get('total_files', 0)
    # Coerce here, as the other test-month entry points do: the background
    # worker multiplies the target by a float and would fail on a string.
    try:
        target = int(data.get('target', 40))
    except (TypeError, ValueError):
        return jsonify({'error': 'target must be an integer'}), 400
    organize_by_month = data.get('organize_by_month', False)

    job_id = str(uuid.uuid4())[:8]
    upload_dir = os.path.join(UPLOAD_FOLDER, job_id, 'photos')
    thumb_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails')
    os.makedirs(upload_dir, exist_ok=True)
    os.makedirs(thumb_dir, exist_ok=True)

    # Store upload session
    session_id = f"test_{job_id}"
    upload_sessions[session_id] = {
        'job_id': job_id,
        'upload_dir': upload_dir,
        'thumb_dir': thumb_dir,
        'target': target,
        'organize_by_month': organize_by_month,
        'total_files': total_files,
        'uploaded_files': []
    }
    print(f"[Test-Month Upload {job_id}] Initialized for {total_files} files")

    return jsonify({
        'session_id': session_id,
        'job_id': job_id
    })
@app.route('/test-month/upload-chunk', methods=['POST'])
def test_month_upload_chunk():
    """Save one chunk of files for a chunked test-month upload.

    Reads the form field ``session_id`` and the multipart field ``files``.
    Files with a supported image extension are written to the session's
    upload directory and their saved names are appended to the session's
    ``uploaded_files`` list.

    Returns:
        JSON with the running ``uploaded`` count and the expected ``total``,
        or a 400 error for an unknown session.
    """
    session_id = request.form.get('session_id')
    if not session_id or session_id not in upload_sessions:
        return jsonify({'error': 'Invalid session'}), 400

    session_data = upload_sessions[session_id]
    upload_dir = session_data['upload_dir']
    files = request.files.getlist('files')
    extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp'}

    saved_count = 0
    for f in files:
        if not (f and f.filename):
            continue
        original_ext = os.path.splitext(f.filename.lower())[1]
        if original_ext not in extensions:
            continue

        filename = secure_filename(f.filename)
        # secure_filename drops non-ASCII characters, which can strip the
        # whole stem and leave a name without its extension. Such a file
        # would be saved but later skipped by the extension filter, so give
        # it a generated name that keeps the validated extension.
        if os.path.splitext(filename.lower())[1] not in extensions:
            filename = f"photo_{uuid.uuid4().hex[:8]}{original_ext}"

        # Handle duplicate filenames
        base, ext = os.path.splitext(filename)
        counter = 1
        while os.path.exists(os.path.join(upload_dir, filename)):
            filename = f"{base}_{counter}{ext}"
            counter += 1

        f.save(os.path.join(upload_dir, filename))
        session_data['uploaded_files'].append(filename)
        saved_count += 1

    chunk_index = request.form.get('chunk_index', '?')
    print(f"[Test-Month Upload {session_data['job_id']}] Chunk {chunk_index}: saved {saved_count} files (total: {len(session_data['uploaded_files'])})")

    return jsonify({
        'uploaded': len(session_data['uploaded_files']),
        'total': session_data['total_files']
    })
@app.route('/test-month/upload-complete', methods=['POST'])
def test_month_upload_complete():
    """Finish a chunked test-month upload and launch processing.

    Looks up the upload session named in the JSON body, removes it from
    ``upload_sessions``, registers a processing job for the uploaded files
    and starts ``process_test_month`` in a background thread. Responds with
    400 for an unknown session or when no files were saved.
    """
    payload = request.json
    session_id = payload.get('session_id')
    if not session_id or session_id not in upload_sessions:
        return jsonify({'error': 'Invalid session'}), 400

    # Taking the entry out of the registry doubles as session clean-up.
    upload_session = upload_sessions.pop(session_id)
    job_id = upload_session['job_id']
    upload_dir = upload_session['upload_dir']
    thumb_dir = upload_session['thumb_dir']
    target = upload_session['target']
    organize_by_month = upload_session['organize_by_month']
    saved_files = upload_session['uploaded_files']

    if not saved_files:
        return jsonify({'error': 'No valid image files uploaded'}), 400

    file_count = len(saved_files)
    print(f"[Test-Month Upload {job_id}] Complete: {file_count} files, starting processing...")

    # Register the job before the worker starts so status polling finds it.
    processing_jobs[job_id] = {
        'status': 'processing',
        'progress': 0,
        'message': 'Starting test...',
        'folder_path': upload_dir,
        'thumb_dir': thumb_dir,
        'target': target,
        'total_files': file_count,
        'results': None,
        'is_upload': True,
        'organize_by_month': organize_by_month
    }

    worker = threading.Thread(
        target=process_test_month,
        args=(job_id, upload_dir, target, thumb_dir, organize_by_month)
    )
    worker.start()

    response = {
        'job_id': job_id,
        'total_photos': file_count,
        'target': target,
        'organize_by_month': organize_by_month,
        'message': f'Processing {file_count} uploaded photos...'
    }
    return jsonify(response)
def process_test_month(job_id, folder_path, target, thumb_dir, organize_by_month=False):
    """Process photos for testing with category-aware selection.

    Used as the background-thread target of the test-month routes. Progress is
    reported by mutating ``processing_jobs[job_id]`` ('progress', 'message',
    'status') and the final payload is stored under its 'results' key; nothing
    is returned.

    If organize_by_month is True, groups photos by EXIF date and runs
    selection per month (same as main app Step 4).

    Args:
        job_id: Key of the job entry in ``processing_jobs``.
        folder_path: Directory whose image files are processed.
        target: Number of photos to select overall.
        thumb_dir: Directory that receives the generated thumbnails.
        organize_by_month: When True, run selection separately per month.

    Any exception is caught: the job's status becomes 'error', its message
    becomes the exception text, and the traceback is printed.
    """
    try:
        # Project imports are done lazily inside the worker.
        from photo_selector.monthly_selector import MonthlyPhotoSelector, CategoryDetector
        from photo_selector.siglip_embeddings import SigLIPEmbedder
        from photo_selector.scoring import PhotoScorer
        from datetime import datetime
        job = processing_jobs[job_id]
        # Get all photos
        extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp'}
        photo_files = [f for f in os.listdir(folder_path)
                       if os.path.splitext(f.lower())[1] in extensions]
        photo_paths = [os.path.join(folder_path, f) for f in photo_files]
        job['message'] = 'Loading SigLIP model...'
        job['progress'] = 5
        # Initialize embedder and selector
        embedder = SigLIPEmbedder()
        selector = MonthlyPhotoSelector()
        # Step 1: Generate embeddings
        job['message'] = f'Generating SigLIP embeddings for {len(photo_paths)} photos...'
        job['progress'] = 10
        # NOTE(review): the result is later read with .get(filename), so it is
        # presumably a dict keyed by bare filename - confirm against SigLIPEmbedder.
        embeddings = embedder.process_folder(folder_path)
        job['progress'] = 30
        # Step 2: Detect categories for all photos
        job['message'] = 'Detecting photo categories...'
        job['progress'] = 35
        selector._ensure_category_detector()
        categories = selector.category_detector.detect_categories_batch(photo_paths)
        job['progress'] = 45
        # Step 3: Score photos and add category + timestamp
        job['message'] = 'Scoring photos...'
        scorer = PhotoScorer()
        scored_photos = []
        for i, photo_path in enumerate(photo_paths):
            filename = os.path.basename(photo_path)
            scores = scorer.score_photo(photo_path)
            # Get category
            cat, conf = categories.get(filename, ('unknown', 0.0))
            # Get timestamp from EXIF
            dt = selector.get_photo_date(photo_path)
            scored_photos.append({
                'filename': filename,
                'filepath': photo_path,
                'total': scores.get('total', 0),
                'face_quality': scores.get('face_quality', 0),
                'aesthetic_quality': scores.get('aesthetic_quality', 0),
                'emotional_signal': scores.get('emotional_signal', 0),
                'uniqueness': scores.get('uniqueness', 0.5),
                'num_faces': scores.get('num_faces', 0),
                'category': cat,
                'category_confidence': conf,
                'timestamp': dt.timestamp() if dt else None
            })
            # Refresh progress every 10 photos; scoring covers the 45-65 band.
            if (i + 1) % 10 == 0:
                job['progress'] = 45 + int((i / len(photo_paths)) * 20)
                job['message'] = f'Scoring photos... {i + 1}/{len(photo_paths)}'
        job['progress'] = 70
        # Step 4: Run category-aware HDBSCAN selection
        if organize_by_month:
            # Group photos by month using EXIF dates
            job['message'] = 'Grouping photos by month...'
            # Month names for mapping
            MONTH_NAMES = ['January', 'February', 'March', 'April', 'May', 'June',
                           'July', 'August', 'September', 'October', 'November', 'December']
            # Buckets are keyed by month name only, so the same calendar month
            # from different years lands in one bucket; photos without a
            # timestamp go to 'Unknown'.
            photos_by_month = {}
            for photo in scored_photos:
                ts = photo.get('timestamp')
                if ts:
                    dt = datetime.fromtimestamp(ts)
                    month_name = MONTH_NAMES[dt.month - 1]
                else:
                    month_name = 'Unknown'
                photo['month'] = month_name
                if month_name not in photos_by_month:
                    photos_by_month[month_name] = []
                photos_by_month[month_name].append(photo)
            # Calculate target per month (proportional allocation)
            total_photos = len(scored_photos)
            selected = []
            month_stats = []
            for month_name, month_photos in photos_by_month.items():
                # Proportional target for this month
                month_proportion = len(month_photos) / total_photos
                # Truncated share with a floor of 1, so the per-month targets
                # need not add up to `target`.
                month_target = max(1, int(target * month_proportion))
                job['message'] = f'Processing {month_name} ({len(month_photos)} photos)...'
                # Get embeddings for this month's photos
                # (a photo with no embedding maps to None here)
                month_embeddings = {p['filename']: embeddings.get(p['filename']) for p in month_photos}
                # Run selection for this month
                month_selected = selector.select_hybrid_hdbscan(month_photos, month_embeddings, target=month_target)
                # Add month info to each selected photo
                for photo in month_selected:
                    photo['month'] = month_name
                selected.extend(month_selected)
                month_stats.append({
                    'month': month_name,
                    'total_photos': len(month_photos),
                    'selected': len(month_selected),
                    'target': month_target
                })
            print(f"[Test Month {job_id}] Organized by month: {len(photos_by_month)} months, {len(selected)} total selected")
        else:
            # Single batch selection (original behavior)
            job['message'] = 'Running category-aware clustering and selection...'
            selected = selector.select_hybrid_hdbscan(scored_photos, embeddings, target=target)
            # Add 'Unknown' month to all photos when not organized
            for photo in selected:
                photo['month'] = 'Unknown'
            for photo in scored_photos:
                photo['month'] = 'Unknown'
            month_stats = []
        job['progress'] = 85
        job['message'] = 'Creating thumbnails...'
        # Create thumbnails and build results
        selected_results = []
        for photo in selected:
            filename = photo['filename']
            filepath = photo['filepath']
            thumb_name = get_thumbnail_name(filename)
            thumb_path = os.path.join(thumb_dir, thumb_name)
            create_thumbnail(filepath, thumb_path)
            # Get embedding for this photo
            photo_emb = embeddings.get(filename)
            # .tolist() turns the embedding into plain lists for the results payload
            embedding_list = photo_emb.tolist() if photo_emb is not None else None
            # Format timestamp for display
            ts = photo.get('timestamp')
            datetime_str = ''
            if ts:
                dt = datetime.fromtimestamp(ts)
                datetime_str = dt.strftime('%Y-%m-%d %H:%M:%S')
            # Values are cast to built-in float/int; keys that are absent from
            # the photo dict fall back to the defaults given here.
            selected_results.append({
                'filename': filename,
                'thumbnail': thumb_name,
                'score': float(photo.get('total', 0)),
                'face_quality': float(photo.get('face_quality', 0)),
                'aesthetic_quality': float(photo.get('aesthetic_quality', 0)),
                'emotional_signal': float(photo.get('emotional_signal', 0)),
                'uniqueness': float(photo.get('uniqueness', 0)),
                'num_faces': int(photo.get('num_faces', 0)),
                'multi_face_bonus': float(photo.get('multi_face_bonus', 0)),
                'cluster_id': photo.get('cluster_id', -1),
                'max_similarity': float(photo.get('max_similarity', 0)),
                'category': photo.get('category', 'unknown'),
                'category_confidence': float(photo.get('category_confidence', 0)),
                'event_id': photo.get('event_id', -1),
                'selection_reason': photo.get('selection_reason', ''),
                'datetime': datetime_str,
                'embedding': embedding_list,
                'month': photo.get('month', 'Unknown')
            })
        # Build rejected list
        # (every scored photo whose filename is not among the selected ones)
        selected_filenames = {p['filename'] for p in selected}
        rejected_results = []
        for photo in scored_photos:
            if photo['filename'] not in selected_filenames:
                filename = photo['filename']
                filepath = photo['filepath']
                thumb_name = get_thumbnail_name(filename)
                thumb_path = os.path.join(thumb_dir, thumb_name)
                create_thumbnail(filepath, thumb_path)
                photo_emb = embeddings.get(filename)
                embedding_list = photo_emb.tolist() if photo_emb is not None else None
                # Format timestamp for display
                ts = photo.get('timestamp')
                datetime_str = ''
                if ts:
                    # Redundant with the import at the top of this function.
                    from datetime import datetime
                    dt = datetime.fromtimestamp(ts)
                    datetime_str = dt.strftime('%Y-%m-%d %H:%M:%S')
                # Rejected entries carry fewer score fields than selected ones,
                # and 'selection_reason' is filled from 'rejection_reason'.
                rejected_results.append({
                    'filename': filename,
                    'thumbnail': thumb_name,
                    'score': float(photo.get('total', 0)),
                    'face_quality': float(photo.get('face_quality', 0)),
                    'aesthetic_quality': float(photo.get('aesthetic_quality', 0)),
                    'num_faces': int(photo.get('num_faces', 0)),
                    'cluster_id': photo.get('cluster_id', -1),
                    'category': photo.get('category', 'unknown'),
                    'event_id': photo.get('event_id', -1),
                    'embedding': embedding_list,
                    'max_similarity': float(photo.get('max_similarity', 0)),
                    'selection_reason': photo.get('rejection_reason', 'Not selected'),
                    'datetime': datetime_str,
                    'month': photo.get('month', 'Unknown')
                })
        # Sort results
        # (highest total score first in both lists)
        selected_results.sort(key=lambda x: x['score'], reverse=True)
        rejected_results.sort(key=lambda x: x['score'], reverse=True)
        # Cluster distribution
        # (selected-photo count per cluster_id; -1 when the key was absent)
        cluster_counts = {}
        for photo in selected_results:
            cid = photo.get('cluster_id', -1)
            cluster_counts[cid] = cluster_counts.get(cid, 0) + 1
        # Category distribution
        category_counts = {}
        for photo in selected_results:
            cat = photo.get('category', 'unknown')
            category_counts[cat] = category_counts.get(cat, 0) + 1
        # Build results
        job['results'] = {
            'selected': selected_results,
            'rejected': rejected_results,
            'summary': {
                'total_photos': len(photo_paths),
                'selected_count': len(selected_results),
                'rejected_count': len(rejected_results),
                'target': target
            },
            'cluster_distribution': cluster_counts,
            'category_distribution': category_counts,
            'organized_by_month': organize_by_month,
            'month_stats': month_stats
        }
        # Results are stored before the status flips to 'complete'.
        job['status'] = 'complete'
        job['progress'] = 100
        job['message'] = f'Done! Selected {len(selected_results)} of {len(photo_paths)} photos'
        print(f"\n[Test Month {job_id}] Complete!")
        print(f" - Total: {len(photo_paths)}")
        print(f" - Selected: {len(selected_results)}")
        print(f" - Organized by month: {organize_by_month}")
        if month_stats:
            print(f" - Month stats: {month_stats}")
        print(f" - Clusters: {cluster_counts}")
        print(f" - Categories: {category_counts}")
    except Exception as e:
        # Surface the failure through the job entry so status polling sees it.
        processing_jobs[job_id]['status'] = 'error'
        processing_jobs[job_id]['message'] = str(e)
        import traceback
        traceback.print_exc()
@app.route('/test-month/status/<job_id>')
def test_month_status(job_id):
    """Report the status, progress and message of a test-month job."""
    try:
        job = processing_jobs[job_id]
    except KeyError:
        return jsonify({'error': 'Job not found'}), 404

    snapshot = {
        'status': job['status'],
        'progress': job['progress'],
        'message': job['message']
    }
    return jsonify(snapshot)
@app.route('/test-month/results/<job_id>')
def test_month_results(job_id):
    """Return the stored results of a finished test-month job.

    Responds with 404 for an unknown job and 400 (including the current
    status) while the job has not reached 'complete'.
    """
    try:
        job = processing_jobs[job_id]
    except KeyError:
        return jsonify({'error': 'Job not found'}), 404

    current_status = job['status']
    if current_status == 'complete':
        return jsonify(job['results'])
    return jsonify({'error': 'Not complete', 'status': current_status}), 400
@app.route('/test-month/thumbnail/<job_id>/<filename>')
def test_month_thumbnail(job_id, filename):
    """Serve a thumbnail from the job's ``thumbnails`` directory."""
    return send_from_directory(
        os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails'),
        filename
    )
@app.route('/test-month/download/<job_id>')
def test_month_download(job_id):
    """Bundle the selected photos of a finished test-month job into a ZIP.

    The archive is built in memory from the files found in the job's
    ``folder_path``; selected entries whose file no longer exists are
    skipped. Responds with 404/400 JSON errors when the job is unknown,
    unfinished, has no selection, or nothing could be archived.
    """
    import zipfile
    from io import BytesIO

    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404

    job = processing_jobs[job_id]
    if job['status'] != 'complete':
        return jsonify({'error': 'Processing not complete'}), 400

    selected = job.get('results', {}).get('selected', [])
    folder_path = job.get('folder_path', '')
    if not selected:
        return jsonify({'error': 'No selected photos'}), 404
    if not folder_path:
        return jsonify({'error': 'Folder path not found'}), 404

    archive = BytesIO()
    added = 0
    with zipfile.ZipFile(archive, 'w', zipfile.ZIP_DEFLATED) as zf:
        for entry in selected:
            name = entry.get('filename', '')
            source = os.path.join(folder_path, name)
            if not os.path.exists(source):
                continue
            # Store under the bare filename so the archive is flat.
            zf.write(source, name)
            added += 1

    if added == 0:
        return jsonify({'error': 'No files could be added to ZIP'}), 404

    archive.seek(0)
    return send_file(
        archive,
        mimetype='application/zip',
        as_attachment=True,
        download_name=f'test_selected_{job_id}.zip'
    )
# ============================================
# DATASET SAVE/LOAD ROUTES
# ============================================
@app.route('/datasets')
def datasets_page():
    """Render the saved datasets page."""
    template_name = 'datasets.html'
    return render_template(template_name)
@app.route('/api/datasets')
def list_datasets():
    """List all saved datasets (local + Supabase).

    Local datasets are the sub-folders of DATASETS_FOLDER that contain a
    ``metadata.json``; each entry is tagged with ``folder_name`` and
    ``source='local'``. Supabase datasets are appended with
    ``source='supabase'`` unless a local dataset has the same folder name.

    Returns:
        JSON with ``datasets`` (sorted by ``created_at``, newest first) and
        ``supabase_available``.
    """
    datasets = []
    seen_names = set()

    # 1. Get local datasets
    if os.path.exists(DATASETS_FOLDER):
        for name in os.listdir(DATASETS_FOLDER):
            meta_path = os.path.join(DATASETS_FOLDER, name, 'metadata.json')
            if not os.path.exists(meta_path):
                continue
            try:
                with open(meta_path, 'r') as f:
                    meta = json.load(f)
                meta['folder_name'] = name
                meta['source'] = 'local'
            except (OSError, ValueError, TypeError) as e:
                # Unreadable, malformed or non-object metadata: skip this
                # dataset but leave a trace instead of hiding the problem.
                print(f"[Datasets] Skipping '{name}': {e}")
                continue
            datasets.append(meta)
            seen_names.add(name)

    # 2. Get Supabase datasets (if available)
    if is_supabase_available():
        try:
            supabase_datasets = list_datasets_from_supabase()
            for meta in supabase_datasets:
                folder_name = meta.get('folder_name', '')
                # Only add if not already in local (local takes priority)
                if folder_name and folder_name not in seen_names:
                    meta['source'] = 'supabase'
                    datasets.append(meta)
        except Exception as e:
            print(f"[Datasets] Error fetching from Supabase: {e}")

    # Sort by date, newest first
    datasets.sort(key=lambda x: x.get('created_at', '') or '', reverse=True)
    return jsonify({'datasets': datasets, 'supabase_available': is_supabase_available()})
@app.route('/save_dataset/<job_id>', methods=['POST'])
def save_dataset(job_id):
    """Save dataset after Step 3 review.

    Writes a new folder under DATASETS_FOLDER containing the reference
    embeddings (when the job's face matcher is still in memory), the review
    JSON, the confirmed photo list, thumbnails, the original photos and a
    ``metadata.json``. When Supabase is available and embeddings were saved,
    the embeddings, face results and metadata are mirrored there too.

    The optional JSON body may carry ``name``; it is reduced to letters,
    digits, '_', '-' and spaces, and spaces become underscores in the folder
    name.

    Returns:
        JSON summary on success; 404 for an unknown job, 400 when the dataset
        name is already taken, 500 on any other failure.
    """
    import re

    created_path = None
    try:
        # Validate the job before touching the filesystem, so an unknown
        # job_id cannot leave an empty folder behind that blocks the name.
        if job_id not in processing_jobs:
            return jsonify({'error': 'Job not found'}), 404
        job = processing_jobs[job_id]
        session_id = job.get('session_id')

        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        dataset_name = data.get('name', f"dataset_{job_id}")

        # Validate name (alphanumeric, underscore, hyphen, space only)
        safe_name = re.sub(r'[^a-zA-Z0-9_\- ]', '', str(dataset_name)).strip()
        if not safe_name:
            safe_name = f"dataset_{job_id}"

        # Create folder name (replace spaces with underscores)
        folder_name = safe_name.replace(' ', '_')
        dataset_path = os.path.join(DATASETS_FOLDER, folder_name)

        # Check if already exists
        if os.path.exists(dataset_path):
            return jsonify({'error': f'Dataset "{safe_name}" already exists'}), 400
        os.makedirs(dataset_path, exist_ok=True)
        created_path = dataset_path

        # 1. Save reference embeddings
        matcher = face_matchers[session_id] if session_id and session_id in face_matchers else None
        if matcher is not None:
            embeddings_path = os.path.join(dataset_path, 'reference_embeddings.npz')
            np.savez_compressed(
                embeddings_path,
                embeddings=np.array(matcher.reference_embeddings),
                average=matcher.average_embedding,
                threshold=matcher.similarity_threshold
            )

        # 2. Copy face results from review JSON
        review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
        if os.path.exists(review_file):
            shutil.copy(review_file, os.path.join(dataset_path, 'face_results.json'))

        # 3. Save confirmed photos list
        confirmed_photos = job.get('confirmed_photos', [])
        if not confirmed_photos and os.path.exists(review_file):
            # Try loading from review JSON (Step 3) - contains filtered_photos
            with open(review_file, 'r') as f:
                review_data = json.load(f)
            confirmed_photos = [p['filename'] for p in review_data.get('filtered_photos', [])]

        # Fallback: Try loading from confirm step if not in memory
        if not confirmed_photos:
            results_file = os.path.join(RESULTS_FOLDER, f"{job_id}.json")
            if os.path.exists(results_file):
                with open(results_file, 'r') as f:
                    results_data = json.load(f)
                selected = results_data.get('selected_photos', [])
                rejected = results_data.get('rejected_photos', [])
                confirmed_photos = [p['filename'] for p in selected + rejected]

        with open(os.path.join(dataset_path, 'confirmed_photos.json'), 'w') as f:
            json.dump({'photos': confirmed_photos}, f)

        # 4. Copy thumbnails folder
        upload_dir = job.get('upload_dir', os.path.join(UPLOAD_FOLDER, job_id))
        thumb_dir = os.path.join(upload_dir, 'thumbnails')
        dataset_thumb_dir = os.path.join(dataset_path, 'thumbnails')
        if os.path.exists(thumb_dir):
            shutil.copytree(thumb_dir, dataset_thumb_dir)

        # 5. Copy original photos (for reload)
        photos_dir = os.path.join(dataset_path, 'photos')
        os.makedirs(photos_dir, exist_ok=True)
        for filename in confirmed_photos:
            # The confirmed list can originate from a client request, so only
            # plain file names are copied; anything with a path component
            # could read or write outside the intended directories.
            if not isinstance(filename, str) or os.path.basename(filename) != filename:
                continue
            src = os.path.join(upload_dir, filename)
            if os.path.exists(src):
                shutil.copy(src, os.path.join(photos_dir, filename))

        # 6. Save metadata
        metadata = {
            'name': safe_name,
            'created_at': datetime.now().isoformat(),
            'original_job_id': job_id,
            'session_id': session_id,
            'total_photos': len(confirmed_photos),
            'quality_mode': job.get('quality_mode', 'balanced'),
            'similarity_threshold': job.get('similarity_threshold', 0.4),
            'reference_count': len(matcher.reference_embeddings) if matcher is not None else 0
        }
        with open(os.path.join(dataset_path, 'metadata.json'), 'w') as f:
            json.dump(metadata, f, indent=2)
        print(f"[Dataset] Saved '{safe_name}' with {len(confirmed_photos)} photos locally")

        # The local dataset is complete from here on; keep it even if the
        # optional Supabase mirror or the response fails.
        created_path = None

        # 7. Also save to Supabase (for persistence across HF restarts)
        supabase_saved = False
        if is_supabase_available():
            try:
                # Read embeddings file as bytes
                embeddings_path = os.path.join(dataset_path, 'reference_embeddings.npz')
                embeddings_data = None
                if os.path.exists(embeddings_path):
                    with open(embeddings_path, 'rb') as f:
                        embeddings_data = f.read()

                # Read face results
                face_results_path = os.path.join(dataset_path, 'face_results.json')
                face_results = {}
                if os.path.exists(face_results_path):
                    with open(face_results_path, 'r') as f:
                        face_results = json.load(f)

                # Save to Supabase
                if embeddings_data:
                    supabase_saved = save_dataset_to_supabase(
                        folder_name,
                        embeddings_data,
                        face_results,
                        metadata
                    )
            except Exception as e:
                print(f"[Dataset] Supabase save error: {e}")

        return jsonify({
            'success': True,
            'name': safe_name,
            'folder_name': folder_name,
            'total_photos': len(confirmed_photos),
            'supabase_saved': supabase_saved
        })
    except Exception as e:
        import traceback
        traceback.print_exc()
        # Remove a half-written dataset so the same name can be retried.
        if created_path is not None:
            shutil.rmtree(created_path, ignore_errors=True)
        return jsonify({'error': str(e)}), 500
@app.route('/load_dataset/<dataset_name>')
def load_dataset(dataset_name):
    """Load a saved dataset and redirect to review or selection.

    A dataset found locally is copied into a fresh upload directory, its
    reference embeddings are restored into ``face_matchers`` under a new
    session id, and a ``review_pending`` job is registered. A dataset found
    only in Supabase redirects to the re-upload page, because photos are not
    stored there.

    Query parameters:
        goto: 'select' redirects to Step 4; anything else (default 'review')
            redirects to the Step 3 review page.

    Returns:
        A redirect on success; 400 for an invalid name, 404 when the dataset
        is not found, 500 on any other failure.
    """
    try:
        # dataset_name comes from the URL; only a direct child of
        # DATASETS_FOLDER is accepted ('.' and '..' would escape it).
        datasets_root = os.path.normpath(DATASETS_FOLDER)
        dataset_path = os.path.normpath(os.path.join(datasets_root, dataset_name))
        if os.path.dirname(dataset_path) != datasets_root:
            return jsonify({'error': 'Invalid dataset name'}), 400

        # Check if dataset exists locally
        if not os.path.exists(dataset_path):
            # Try loading from Supabase
            if not is_supabase_available():
                return jsonify({'error': 'Dataset not found'}), 404
            print("[Dataset] Not found locally, trying Supabase...")
            supabase_data = load_dataset_from_supabase(dataset_name)
            if not supabase_data:
                return jsonify({'error': 'Dataset not found in local or Supabase'}), 404
            # Redirect to re-upload page (photos not stored in Supabase)
            return redirect(f'/reupload_photos/{dataset_name}')

        # Load metadata
        with open(os.path.join(dataset_path, 'metadata.json'), 'r') as f:
            metadata = json.load(f)

        # Create new job ID
        job_id = str(uuid.uuid4())[:8]
        new_session_id = str(uuid.uuid4())[:8]

        # Set up upload directory with photos
        upload_dir = os.path.join(UPLOAD_FOLDER, job_id)
        os.makedirs(upload_dir, exist_ok=True)

        # Copy photos from dataset
        dataset_photos_dir = os.path.join(dataset_path, 'photos')
        if os.path.exists(dataset_photos_dir):
            for filename in os.listdir(dataset_photos_dir):
                src = os.path.join(dataset_photos_dir, filename)
                shutil.copy(src, os.path.join(upload_dir, filename))

        # Copy thumbnails
        dataset_thumb_dir = os.path.join(dataset_path, 'thumbnails')
        if os.path.exists(dataset_thumb_dir):
            shutil.copytree(dataset_thumb_dir, os.path.join(upload_dir, 'thumbnails'))

        # Load reference embeddings into face_matchers
        embeddings_path = os.path.join(dataset_path, 'reference_embeddings.npz')
        if os.path.exists(embeddings_path):
            from photo_selector.face_matcher import FaceMatcher
            # SECURITY NOTE(review): allow_pickle=True lets a crafted .npz run
            # arbitrary code on load. It is kept because the saved 'average'
            # entry may be an object array (e.g. None); this file must only
            # ever come from a trusted writer.
            data = np.load(embeddings_path, allow_pickle=True)
            matcher = FaceMatcher(similarity_threshold=float(data['threshold']))
            matcher.reference_embeddings = list(data['embeddings'])
            matcher.average_embedding = data['average']
            face_matchers[new_session_id] = matcher
            session['face_session_id'] = new_session_id

        # Load confirmed photos
        confirmed_file = os.path.join(dataset_path, 'confirmed_photos.json')
        confirmed_photos = []
        if os.path.exists(confirmed_file):
            with open(confirmed_file, 'r') as f:
                confirmed_photos = json.load(f).get('photos', [])

        # Load face results
        face_results_path = os.path.join(dataset_path, 'face_results.json')
        review_data = None
        if os.path.exists(face_results_path):
            with open(face_results_path, 'r') as f:
                review_data = json.load(f)

        # Create processing job
        processing_jobs[job_id] = {
            'status': 'review_pending',
            'progress': 100,
            'message': 'Dataset loaded - ready for review',
            'upload_dir': upload_dir,
            'session_id': new_session_id,
            'has_reference_photos': True,
            'reference_count': metadata.get('reference_count', 0),
            'quality_mode': metadata.get('quality_mode', 'balanced'),
            'similarity_threshold': metadata.get('similarity_threshold', 0.4),
            'confirmed_photos': confirmed_photos,
            'review_data': review_data,
            'total_photos': len(confirmed_photos),
            'from_dataset': dataset_name
        }

        # Copy face results to results folder for step3
        if review_data:
            with open(os.path.join(RESULTS_FOLDER, f"{job_id}_review.json"), 'w') as f:
                json.dump(review_data, f)
        print(f"[Dataset] Loaded '{dataset_name}' as job {job_id}")

        # Check which page to go to
        goto = request.args.get('goto', 'review')
        if goto == 'select':
            # Go directly to Step 4 - start quality selection
            return redirect(f'/step4_results/{job_id}?from_dataset=1')
        # Go to Step 3 - review page
        return redirect(f'/step3_review/{job_id}')
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
@app.route('/delete_dataset/<dataset_name>', methods=['DELETE'])
def delete_dataset(dataset_name):
    """Delete a saved dataset (local and Supabase).

    Returns:
        JSON flags telling which copies were removed; 400 for an invalid
        name, 404 when neither copy existed, 500 on any other failure.
    """
    try:
        # dataset_name comes from the URL and feeds a recursive delete, so it
        # must name a direct child of DATASETS_FOLDER: '..' would otherwise
        # remove the application directory and '.' the whole datasets folder.
        datasets_root = os.path.normpath(DATASETS_FOLDER)
        dataset_path = os.path.normpath(os.path.join(datasets_root, dataset_name))
        if os.path.dirname(dataset_path) != datasets_root:
            return jsonify({'error': 'Invalid dataset name'}), 400

        deleted_local = False
        deleted_supabase = False

        # Delete local
        if os.path.exists(dataset_path):
            shutil.rmtree(dataset_path)
            deleted_local = True
            print(f"[Dataset] Deleted '{dataset_name}' locally")

        # Delete from Supabase
        if is_supabase_available():
            deleted_supabase = delete_dataset_from_supabase(dataset_name)

        if not deleted_local and not deleted_supabase:
            return jsonify({'error': 'Dataset not found'}), 404
        return jsonify({'success': True, 'deleted_local': deleted_local, 'deleted_supabase': deleted_supabase})
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/dataset_thumbnail/<dataset_name>/<filename>')
def dataset_thumbnail(dataset_name, filename):
    """Serve a thumbnail from a saved dataset's ``thumbnails`` directory."""
    return send_from_directory(
        os.path.join(DATASETS_FOLDER, dataset_name, 'thumbnails'),
        filename
    )
# ============================================
# SUPABASE RE-UPLOAD ROUTES
# ============================================
@app.route('/reupload_photos/<dataset_name>')
def reupload_photos_page(dataset_name):
    """Render the photo re-upload page for a dataset stored in Supabase.

    Responds with a 500 JSON error when Supabase is unavailable and 404 when
    the dataset cannot be loaded from it.
    """
    if not is_supabase_available():
        return jsonify({'error': 'Supabase not available'}), 500

    # The dataset's metadata is fetched from Supabase for the template.
    supabase_data = load_dataset_from_supabase(dataset_name)
    if not supabase_data:
        return jsonify({'error': 'Dataset not found in Supabase'}), 404

    return render_template(
        'reupload_photos.html',
        dataset_name=dataset_name,
        metadata=supabase_data.get('metadata', {})
    )
@app.route('/download_from_gdrive/<dataset_name>', methods=['POST'])
def download_from_gdrive(dataset_name):
    """Download a photo zip from Google Drive and match it to a saved dataset.

    Expects a JSON body with a ``gdrive_link`` string. The archive is
    downloaded with gdown into a fresh job directory under UPLOAD_FOLDER,
    its image files are extracted flat (sub-folders are dropped), and the
    extracted filenames are matched against the ``filtered_photos`` stored
    for ``dataset_name`` in Supabase. A ``review_pending`` entry is then
    registered in ``processing_jobs``.

    Returns:
        JSON with ``job_id``, match counts and ``redirect_url`` on success;
        a 400 JSON error for an unusable link, a failed download or an
        invalid zip; 404 when the dataset is not in Supabase; 500 on any
        unexpected error.
    """
    try:
        import io
        import re
        import zipfile
        import gdown

        # silent=True turns a missing or non-JSON body into None instead of
        # raising, so the request falls through to the 400 response below.
        payload = request.get_json(silent=True)
        gdrive_link = payload.get('gdrive_link', '') if isinstance(payload, dict) else ''
        if not isinstance(gdrive_link, str):
            gdrive_link = ''
        print(f"[GDrive] Starting download for dataset '{dataset_name}'")
        print(f"[GDrive] Link: {gdrive_link}")

        # Extract file ID from Google Drive link
        file_id = None
        patterns = [
            r'/file/d/([a-zA-Z0-9_-]+)',
            r'id=([a-zA-Z0-9_-]+)',
            r'/d/([a-zA-Z0-9_-]+)'
        ]
        for pattern in patterns:
            match = re.search(pattern, gdrive_link)
            if match:
                file_id = match.group(1)
                break
        if not file_id:
            return jsonify({'error': 'Could not extract file ID from Google Drive link'}), 400
        print(f"[GDrive] File ID: {file_id}")

        # Look the dataset up before downloading so an unknown dataset fails
        # fast instead of after a potentially very large transfer.
        print(f"[GDrive] Loading dataset from Supabase...")
        supabase_data = load_dataset_from_supabase(dataset_name)
        if not supabase_data:
            return jsonify({'error': 'Dataset not found in Supabase'}), 404
        metadata = supabase_data.get('metadata', {})
        face_results = supabase_data.get('face_results', {})
        embeddings_data = supabase_data.get('embeddings_data')

        # Create job and upload directory
        job_id = str(uuid.uuid4())[:8]
        upload_dir = os.path.join(UPLOAD_FOLDER, job_id)
        os.makedirs(upload_dir, exist_ok=True)
        os.makedirs(os.path.join(upload_dir, 'thumbnails'), exist_ok=True)

        # Download using gdown (handles large files properly)
        zip_path = os.path.join(upload_dir, 'photos.zip')
        gdrive_url = f"https://drive.google.com/uc?id={file_id}"
        print(f"[GDrive] Downloading using gdown...")
        try:
            gdown.download(gdrive_url, zip_path, quiet=False, fuzzy=True)
        except Exception as e:
            print(f"[GDrive] gdown failed: {e}")
            # Second attempt with cookies disabled
            try:
                gdown.download(gdrive_url, zip_path, quiet=False, fuzzy=True, use_cookies=False)
            except Exception as e2:
                print(f"[GDrive] gdown retry failed: {e2}")
                shutil.rmtree(upload_dir, ignore_errors=True)
                return jsonify({'error': f'Download failed: {str(e2)}'}), 400

        # Check if file was downloaded
        if not os.path.exists(zip_path) or os.path.getsize(zip_path) < 1000:
            print(f"[GDrive] ERROR: Download failed or file too small")
            shutil.rmtree(upload_dir, ignore_errors=True)
            return jsonify({'error': 'Download failed. Make sure the file is shared with "Anyone with link".'}), 400
        print(f"[GDrive] Download complete: {os.path.getsize(zip_path) / 1024 / 1024:.1f} MB")

        # Extract zip file
        print(f"[GDrive] Extracting zip file...")
        uploaded_filenames = []
        seen_filenames = set()
        image_extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp', '.bmp', '.gif'}
        try:
            with zipfile.ZipFile(zip_path, 'r') as zf:
                for member in zf.namelist():
                    if member.endswith('/'):
                        continue
                    parts = member.split('/')
                    # macOS archives carry AppleDouble stubs such as
                    # "__MACOSX/._IMG.jpg". secure_filename() strips the
                    # leading "._", so extracting one would overwrite the
                    # real "IMG.jpg". Skip them along with hidden paths.
                    if '__MACOSX' in parts or any(part.startswith('.') for part in parts):
                        continue
                    ext = os.path.splitext(member.lower())[1]
                    if ext not in image_extensions:
                        continue
                    # Flatten: keep only the sanitized base name
                    filename = secure_filename(os.path.basename(member))
                    if not filename:
                        continue
                    filepath = os.path.join(upload_dir, filename)
                    # Stream the member to disk instead of loading it whole
                    with zf.open(member) as src, open(filepath, 'wb') as dst:
                        shutil.copyfileobj(src, dst)
                    # Same base name from another folder: the file on disk
                    # is replaced, but it must only be counted once.
                    if filename in seen_filenames:
                        continue
                    seen_filenames.add(filename)
                    uploaded_filenames.append(filename)
                    if len(uploaded_filenames) % 200 == 0:
                        print(f"[GDrive] Extracted {len(uploaded_filenames)} files...")
            print(f"[GDrive] Extracted {len(uploaded_filenames)} photos")
        except zipfile.BadZipFile:
            print(f"[GDrive] ERROR: Downloaded file is not a valid zip archive")
            shutil.rmtree(upload_dir, ignore_errors=True)
            return jsonify({'error': 'Downloaded file is not a valid zip archive'}), 400
        finally:
            # Clean up zip
            if os.path.exists(zip_path):
                os.remove(zip_path)

        # Load reference embeddings
        new_session_id = str(uuid.uuid4())[:8]
        if embeddings_data:
            from photo_selector.face_matcher import FaceMatcher
            # SECURITY NOTE: allow_pickle=True can execute arbitrary code if
            # the stored blob is ever attacker-controlled; it is kept here
            # because the saved file format is not visible from this route.
            data_np = np.load(io.BytesIO(embeddings_data), allow_pickle=True)
            matcher = FaceMatcher(similarity_threshold=float(data_np['threshold']))
            matcher.reference_embeddings = list(data_np['embeddings'])
            matcher.average_embedding = data_np['average']
            face_matchers[new_session_id] = matcher
            session['face_session_id'] = new_session_id
            print(f"[GDrive] Loaded {len(matcher.reference_embeddings)} reference embeddings")

        # Match uploaded files with saved face results
        filtered_photos = face_results.get('filtered_photos', [])
        matched_photos = [p for p in filtered_photos if p.get('filename') in seen_filenames]
        print(f"[GDrive] Matched {len(matched_photos)} of {len(filtered_photos)} photos")

        # Create review data
        review_data = {
            'filtered_photos': matched_photos,
            'total_processed': len(uploaded_filenames),
            'match_count': len(matched_photos)
        }
        with open(os.path.join(RESULTS_FOLDER, f"{job_id}_review.json"), 'w') as f:
            json.dump(review_data, f)

        # Create processing job
        processing_jobs[job_id] = {
            'status': 'review_pending',
            'progress': 100,
            'message': 'Photos downloaded from Google Drive',
            'upload_dir': upload_dir,
            'session_id': new_session_id,
            'has_reference_photos': True,
            'reference_count': metadata.get('reference_count', 0),
            'quality_mode': metadata.get('quality_mode', 'balanced'),
            'similarity_threshold': metadata.get('similarity_threshold', 0.4),
            'confirmed_photos': [p['filename'] for p in matched_photos],
            'review_data': review_data,
            'total_photos': len(matched_photos),
            'from_dataset': dataset_name,
            'from_supabase': True
        }
        print(f"[GDrive] SUCCESS! Redirecting to step3_review/{job_id}")
        return jsonify({
            'success': True,
            'job_id': job_id,
            'matched_photos': len(matched_photos),
            'total_uploaded': len(uploaded_filenames),
            'redirect_url': f'/step3_review/{job_id}'
        })
    except Exception as e:
        print(f"[GDrive] Error: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
# In-memory registry of chunked upload sessions that are still in progress.
# Each key is an upload_id; each value is that session's bookkeeping dict.
chunked_uploads = dict()
@app.route('/start_chunked_upload/<dataset_name>', methods=['POST'])
def start_chunked_upload(dataset_name):
    """Open a chunked upload session for re-uploading a dataset's photos.

    Reads the optional ``total_files`` and ``total_chunks`` fields from the
    JSON body (both default to 0), creates a fresh job directory under
    UPLOAD_FOLDER and records the session in ``chunked_uploads``.

    Returns:
        JSON ``{'success': True, 'upload_id': ...}``, or a 500 JSON error.
    """
    try:
        # Both fields are optional, so a missing or non-JSON body should
        # fall back to the defaults rather than fail on ``None.get``.
        payload = request.get_json(silent=True)
        if not isinstance(payload, dict):
            payload = {}
        total_files = payload.get('total_files', 0)
        total_chunks = payload.get('total_chunks', 0)

        upload_id = str(uuid.uuid4())[:8]
        job_id = str(uuid.uuid4())[:8]
        upload_dir = os.path.join(UPLOAD_FOLDER, job_id)
        os.makedirs(upload_dir, exist_ok=True)
        os.makedirs(os.path.join(upload_dir, 'thumbnails'), exist_ok=True)

        chunked_uploads[upload_id] = {
            'dataset_name': dataset_name,
            'job_id': job_id,
            'upload_dir': upload_dir,
            'total_files': total_files,
            'total_chunks': total_chunks,
            'received_chunks': set(),
            'uploaded_filenames': []
        }
        print(f"[Chunked] Started upload session {upload_id} for dataset '{dataset_name}' ({total_files} files, {total_chunks} chunks)")
        return jsonify({'success': True, 'upload_id': upload_id})
    except Exception as e:
        print(f"[Chunked] Error starting session: {e}")
        return jsonify({'error': str(e)}), 500
@app.route('/upload_reupload_chunk/<dataset_name>', methods=['POST'])
def upload_reupload_chunk(dataset_name):
    """Receive one chunk of photos for an open chunked upload session.

    Form fields: ``upload_id`` (session identifier), ``chunk_index``
    (integer, defaults to 0) and one or more ``photos`` files. Accepted
    files are written into the session's upload directory.

    Returns:
        JSON ``{'success': True, 'chunk': ..., 'files_saved': ...}`` where
        ``files_saved`` counts the files actually written; 400 for an
        unknown session, a bad chunk index or an empty chunk; 408 when the
        client disconnects mid-upload; 500 on unexpected errors.
    """
    from werkzeug.exceptions import ClientDisconnected
    try:
        upload_id = request.form.get('upload_id')
        try:
            chunk_index = int(request.form.get('chunk_index', 0))
        except (TypeError, ValueError):
            return jsonify({'error': 'Invalid chunk index'}), 400

        session_data = chunked_uploads.get(upload_id)
        if session_data is None:
            return jsonify({'error': 'Invalid upload session'}), 400
        upload_dir = session_data['upload_dir']

        files = request.files.getlist('photos')
        if not files:
            return jsonify({'error': 'No files in chunk'}), 400

        # Save files from this chunk
        recorded = session_data['uploaded_filenames']
        already_recorded = set(recorded)
        files_saved = 0
        for file in files:
            if not (file and allowed_file(file.filename)):
                continue
            filename = secure_filename(file.filename)
            if not filename:
                # Name sanitized down to nothing; saving would target the
                # upload directory itself.
                continue
            file.save(os.path.join(upload_dir, filename))
            files_saved += 1
            # A chunk retried after a timeout rewrites the same files;
            # record each name once so later counts stay accurate.
            if filename not in already_recorded:
                already_recorded.add(filename)
                recorded.append(filename)

        session_data['received_chunks'].add(chunk_index)
        print(f"[Chunked] Upload {upload_id}: Received chunk {chunk_index + 1}/{session_data['total_chunks']} ({files_saved} files)")
        return jsonify({'success': True, 'chunk': chunk_index, 'files_saved': files_saved})
    except ClientDisconnected:
        # Client disconnected during upload - this is expected on slow connections
        print(f"[Chunked] Client disconnected during chunk upload (timeout)")
        return jsonify({'error': 'Connection timeout - please retry'}), 408
    except Exception as e:
        print(f"[Chunked] Error receiving chunk: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
@app.route('/finish_chunked_upload/<dataset_name>', methods=['POST'])
def finish_chunked_upload(dataset_name):
    """Finalize a chunked upload and match its files to saved face results.

    Expects a JSON body with the ``upload_id`` of an open session. The
    filenames received for that session are matched against the
    ``filtered_photos`` stored for ``dataset_name`` in Supabase, the review
    data is written to RESULTS_FOLDER, a ``review_pending`` entry is added
    to ``processing_jobs`` and the upload session is discarded.

    Returns:
        JSON with ``job_id``, match counts and ``redirect_url`` on success;
        400 for an unknown session; 404 when the dataset is not in
        Supabase; 500 on unexpected errors.
    """
    try:
        # A missing/non-JSON body or a non-string id is just an invalid
        # session, not a server error.
        payload = request.get_json(silent=True)
        upload_id = payload.get('upload_id') if isinstance(payload, dict) else None
        if not isinstance(upload_id, str) or upload_id not in chunked_uploads:
            return jsonify({'error': 'Invalid upload session'}), 400

        session_data = chunked_uploads[upload_id]
        job_id = session_data['job_id']
        upload_dir = session_data['upload_dir']
        # Retried chunks can record a name more than once; keep the first
        # occurrence so the reported totals count distinct files.
        uploaded_filenames = list(dict.fromkeys(session_data['uploaded_filenames']))
        print(f"[Chunked] Finalizing upload {upload_id}: {len(uploaded_filenames)} files received")

        # Load dataset from Supabase
        print(f"[Chunked] Loading dataset from Supabase...")
        supabase_data = load_dataset_from_supabase(dataset_name)
        if not supabase_data:
            return jsonify({'error': 'Dataset not found in Supabase'}), 404
        metadata = supabase_data.get('metadata', {})
        face_results = supabase_data.get('face_results', {})
        embeddings_data = supabase_data.get('embeddings_data')

        # Load reference embeddings
        new_session_id = str(uuid.uuid4())[:8]
        if embeddings_data:
            import io
            from photo_selector.face_matcher import FaceMatcher
            # SECURITY NOTE: allow_pickle=True can execute arbitrary code if
            # the stored blob is ever attacker-controlled; it is kept here
            # because the saved file format is not visible from this route.
            data_np = np.load(io.BytesIO(embeddings_data), allow_pickle=True)
            matcher = FaceMatcher(similarity_threshold=float(data_np['threshold']))
            matcher.reference_embeddings = list(data_np['embeddings'])
            matcher.average_embedding = data_np['average']
            face_matchers[new_session_id] = matcher
            session['face_session_id'] = new_session_id
            print(f"[Chunked] Loaded {len(matcher.reference_embeddings)} reference embeddings")

        # Match uploaded files with saved face results
        filtered_photos = face_results.get('filtered_photos', [])
        uploaded_set = set(uploaded_filenames)
        matched_photos = [p for p in filtered_photos if p.get('filename') in uploaded_set]
        print(f"[Chunked] Matched {len(matched_photos)} of {len(filtered_photos)} photos")

        # Create review data
        review_data = {
            'filtered_photos': matched_photos,
            'total_processed': len(uploaded_filenames),
            'match_count': len(matched_photos)
        }
        with open(os.path.join(RESULTS_FOLDER, f"{job_id}_review.json"), 'w') as f:
            json.dump(review_data, f)

        # Create processing job
        processing_jobs[job_id] = {
            'status': 'review_pending',
            'progress': 100,
            'message': 'Photos matched with saved face results',
            'upload_dir': upload_dir,
            'session_id': new_session_id,
            'has_reference_photos': True,
            'reference_count': metadata.get('reference_count', 0),
            'quality_mode': metadata.get('quality_mode', 'balanced'),
            'similarity_threshold': metadata.get('similarity_threshold', 0.4),
            'confirmed_photos': [p['filename'] for p in matched_photos],
            'review_data': review_data,
            'total_photos': len(matched_photos),
            'from_dataset': dataset_name,
            'from_supabase': True
        }

        # Clean up session; pop() tolerates a concurrent finish having
        # already removed it.
        chunked_uploads.pop(upload_id, None)

        print(f"[Chunked] SUCCESS! Redirecting to step3_review/{job_id}")
        return jsonify({
            'success': True,
            'job_id': job_id,
            'matched_photos': len(matched_photos),
            'total_uploaded': len(uploaded_filenames),
            'redirect_url': f'/step3_review/{job_id}'
        })
    except Exception as e:
        print(f"[Chunked] Error finalizing: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
@app.route('/process_reupload/<dataset_name>', methods=['POST'])
def process_reupload(dataset_name):
    """Process re-uploaded photos using saved face results from Supabase.

    Accepts either a single ``zipfile`` upload (image members are extracted
    flat into the job directory) or a list of ``photos`` files. The stored
    filenames are matched against the ``filtered_photos`` saved for
    ``dataset_name``, the review data is written to RESULTS_FOLDER and a
    ``review_pending`` entry is added to ``processing_jobs``.

    Returns:
        JSON with ``job_id``, match counts and ``redirect_url`` on success;
        400 when no photos were sent or the zip is invalid; 404 when the
        dataset is not in Supabase; 408 when the client disconnects
        mid-upload; 500 on unexpected errors.
    """
    from werkzeug.exceptions import ClientDisconnected
    try:
        import zipfile

        print(f"[Reupload] Starting reupload for dataset '{dataset_name}'")

        # Load dataset from Supabase
        print(f"[Reupload] Loading dataset from Supabase...")
        supabase_data = load_dataset_from_supabase(dataset_name)
        if not supabase_data:
            print(f"[Reupload] ERROR: Dataset not found in Supabase")
            return jsonify({'error': 'Dataset not found in Supabase'}), 404
        metadata = supabase_data.get('metadata', {})
        face_results = supabase_data.get('face_results', {})
        embeddings_data = supabase_data.get('embeddings_data')
        print(f"[Reupload] Dataset loaded: {len(face_results.get('filtered_photos', []))} photos in face results")

        # Create new job
        job_id = str(uuid.uuid4())[:8]
        new_session_id = str(uuid.uuid4())[:8]
        upload_dir = os.path.join(UPLOAD_FOLDER, job_id)
        os.makedirs(upload_dir, exist_ok=True)
        os.makedirs(os.path.join(upload_dir, 'thumbnails'), exist_ok=True)

        # Check if zip file was uploaded
        zipfile_upload = request.files.get('zipfile')
        uploaded_filenames = []
        seen_filenames = set()

        if zipfile_upload and (zipfile_upload.filename or '').lower().endswith('.zip'):
            # Handle zip file upload
            print(f"[Reupload] Received zip file: {zipfile_upload.filename}")

            # Save zip temporarily
            zip_path = os.path.join(upload_dir, 'upload.zip')
            zipfile_upload.save(zip_path)
            print(f"[Reupload] Zip saved, extracting...")

            # Extract zip file
            image_extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp', '.bmp', '.gif'}
            try:
                with zipfile.ZipFile(zip_path, 'r') as zf:
                    for member in zf.namelist():
                        # Skip directories
                        if member.endswith('/'):
                            continue
                        parts = member.split('/')
                        # macOS archives carry AppleDouble stubs such as
                        # "__MACOSX/._IMG.jpg". secure_filename() strips the
                        # leading "._", so extracting one would overwrite
                        # the real "IMG.jpg". Skip them and hidden paths.
                        if '__MACOSX' in parts or any(part.startswith('.') for part in parts):
                            continue
                        # Check if it's an image
                        ext = os.path.splitext(member.lower())[1]
                        if ext not in image_extensions:
                            continue
                        # Extract with flat structure (no subdirectories)
                        filename = secure_filename(os.path.basename(member))
                        if not filename:
                            continue
                        filepath = os.path.join(upload_dir, filename)
                        # Stream the member to disk instead of loading it whole
                        with zf.open(member) as src, open(filepath, 'wb') as dst:
                            shutil.copyfileobj(src, dst)
                        # Same base name from another folder: the file on
                        # disk is replaced, but it is only counted once.
                        if filename in seen_filenames:
                            continue
                        seen_filenames.add(filename)
                        uploaded_filenames.append(filename)
                        if len(uploaded_filenames) % 200 == 0:
                            print(f"[Reupload] Extracted {len(uploaded_filenames)} files...")
                print(f"[Reupload] Extracted {len(uploaded_filenames)} photos from zip")
            except zipfile.BadZipFile:
                print(f"[Reupload] ERROR: Uploaded file is not a valid zip archive")
                return jsonify({'error': 'Uploaded file is not a valid zip archive'}), 400
            finally:
                # Clean up zip file
                if os.path.exists(zip_path):
                    os.remove(zip_path)
        else:
            # Handle individual photo uploads
            files = request.files.getlist('photos')
            if not files or (len(files) == 1 and files[0].filename == ''):
                print(f"[Reupload] ERROR: No photos uploaded")
                return jsonify({'error': 'No photos uploaded'}), 400

            print(f"[Reupload] Saving {len(files)} uploaded files (thumbnails skipped for speed)...")
            for i, file in enumerate(files):
                if file and allowed_file(file.filename):
                    filename = secure_filename(file.filename)
                    # A name sanitized down to nothing would make save()
                    # target the upload directory itself.
                    if filename:
                        file.save(os.path.join(upload_dir, filename))
                        if filename not in seen_filenames:
                            seen_filenames.add(filename)
                            uploaded_filenames.append(filename)
                # Log progress every 200 files
                if (i + 1) % 200 == 0:
                    print(f"[Reupload] Saved {i + 1}/{len(files)} files...")

        print(f"[Reupload] Saved {len(uploaded_filenames)} photos for dataset '{dataset_name}'")

        # Load reference embeddings
        print(f"[Reupload] Loading reference embeddings...")
        if embeddings_data:
            import io
            from photo_selector.face_matcher import FaceMatcher
            # Load directly from bytes using BytesIO (no temp file needed).
            # SECURITY NOTE: allow_pickle=True can execute arbitrary code if
            # the stored blob is ever attacker-controlled; it is kept here
            # because the saved file format is not visible from this route.
            data = np.load(io.BytesIO(embeddings_data), allow_pickle=True)
            matcher = FaceMatcher(similarity_threshold=float(data['threshold']))
            matcher.reference_embeddings = list(data['embeddings'])
            matcher.average_embedding = data['average']
            face_matchers[new_session_id] = matcher
            session['face_session_id'] = new_session_id
            print(f"[Reupload] Loaded {len(matcher.reference_embeddings)} reference embeddings")

        # Match uploaded files with saved face results
        print(f"[Reupload] Matching uploaded files with saved face results...")
        filtered_photos = face_results.get('filtered_photos', [])
        # Keep only the saved results whose file was actually uploaded
        matched_photos = [
            photo for photo in filtered_photos
            if photo.get('filename') in seen_filenames
        ]
        print(f"[Reupload] Matched {len(matched_photos)} of {len(filtered_photos)} photos from face results")

        # Create review data
        review_data = {
            'filtered_photos': matched_photos,
            'total_processed': len(uploaded_filenames),
            'match_count': len(matched_photos)
        }

        # Save review data
        with open(os.path.join(RESULTS_FOLDER, f"{job_id}_review.json"), 'w') as f:
            json.dump(review_data, f)
        print(f"[Reupload] Saved review data")

        # Create processing job - mark as ready for quality selection
        processing_jobs[job_id] = {
            'status': 'review_pending',
            'progress': 100,
            'message': 'Photos matched with saved face results',
            'upload_dir': upload_dir,
            'session_id': new_session_id,
            'has_reference_photos': True,
            'reference_count': metadata.get('reference_count', 0),
            'quality_mode': metadata.get('quality_mode', 'balanced'),
            'similarity_threshold': metadata.get('similarity_threshold', 0.4),
            'confirmed_photos': [p['filename'] for p in matched_photos],
            'review_data': review_data,
            'total_photos': len(matched_photos),
            'from_dataset': dataset_name,
            'from_supabase': True
        }
        print(f"[Reupload] SUCCESS! Redirecting to step3_review/{job_id}")
        return jsonify({
            'success': True,
            'job_id': job_id,
            'matched_photos': len(matched_photos),
            'total_uploaded': len(uploaded_filenames),
            'redirect_url': f'/step3_review/{job_id}'
        })
    except ClientDisconnected:
        print(f"[Reupload] Client disconnected during upload (timeout)")
        return jsonify({'error': 'Connection timeout - please retry with smaller batch or better connection'}), 408
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
    # The PORT environment variable selects the port; without it the server
    # listens on 7860 (the Hugging Face Spaces port). The banner prints the
    # port actually in use so the advertised URL is always reachable.
    port = int(os.environ.get('PORT', 7860))
    print(f"""
============================================
PHOTO SELECTION WEB APP
Open http://localhost:{port} in your browser
NEW: Automatic selection mode!
The AI decides which photos to keep.
TEST: /test-month for single folder testing
============================================
""")
    app.run(debug=False, host='0.0.0.0', port=port)