""" Photo Selection Web App Flask-based frontend for testing the photo selection pipeline Now with AUTOMATIC selection - no target number needed! Two-Stage Workflow with Review Step: 1. Upload reference photos of your child (2-3 photos) 2. Upload all event photos (e.g., 1000 photos) 3. System filters to find photos containing your child 4. USER REVIEWS filtered photos (can remove false positives) 5. Quality-based selection runs on confirmed photos 6. Final results shown """ import os import json import uuid import shutil from pathlib import Path from datetime import datetime # Load environment variables from .env file try: from dotenv import load_dotenv load_dotenv() except ImportError: pass # dotenv not installed, use system env vars from flask import Flask, render_template, request, jsonify, send_from_directory, send_file, session, redirect, Response from werkzeug.utils import secure_filename from werkzeug.exceptions import RequestEntityTooLarge import numpy as np from PIL import Image import threading import time # Supabase integration from supabase_storage import ( is_supabase_available, save_dataset_to_supabase, load_dataset_from_supabase, list_datasets_from_supabase, delete_dataset_from_supabase ) # HEIC support try: from pillow_heif import register_heif_opener register_heif_opener() except ImportError: pass app = Flask(__name__, static_folder='static', template_folder='templates') app.secret_key = 'photo_selector_secret_key_2024' # For session management # Configuration UPLOAD_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'uploads') RESULTS_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'results') REFERENCE_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'references') OUTPUT_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'selected_photos') # Auto-save location DATASETS_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'datasets') # Saved datasets ALLOWED_EXTENSIONS = {'jpg', 'jpeg', 'png', 'heic', 'heif', 'webp'} MAX_CONTENT_LENGTH = 5 * 1024 * 1024 * 1024 # 5GB max (for large photo batches) app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER app.config['MAX_CONTENT_LENGTH'] = MAX_CONTENT_LENGTH app.config['MAX_FORM_MEMORY_SIZE'] = 5 * 1024 * 1024 * 1024 # 5GB for form data app.config['MAX_FORM_PARTS'] = 10000 # Allow up to 10000 files in one upload # Create directories os.makedirs(UPLOAD_FOLDER, exist_ok=True) os.makedirs(RESULTS_FOLDER, exist_ok=True) os.makedirs(REFERENCE_FOLDER, exist_ok=True) os.makedirs(DATASETS_FOLDER, exist_ok=True) # Store processing status processing_jobs = {} # Store face matchers for sessions (reuse to avoid reloading model) face_matchers = {} # Store chunked upload sessions upload_sessions = {} # Error handler for large uploads @app.errorhandler(RequestEntityTooLarge) def handle_large_upload(error): return jsonify({ 'error': 'Upload too large. Try uploading fewer files at once (max ~500 files per batch).' }), 413 def allowed_file(filename): return '.' 
in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS def create_thumbnail(image_path, thumb_path, size=(300, 300)): """Create a thumbnail for display with proper EXIF rotation.""" from PIL import ExifTags try: with Image.open(image_path) as img: # Apply EXIF rotation before creating thumbnail try: for orientation in ExifTags.TAGS.keys(): if ExifTags.TAGS[orientation] == 'Orientation': break exif = img._getexif() if exif is not None: orientation_value = exif.get(orientation) if orientation_value == 3: img = img.rotate(180, expand=True) elif orientation_value == 6: img = img.rotate(270, expand=True) elif orientation_value == 8: img = img.rotate(90, expand=True) except (AttributeError, KeyError, IndexError): pass if img.mode != 'RGB': img = img.convert('RGB') img.thumbnail(size, Image.Resampling.LANCZOS) img.save(thumb_path, 'JPEG', quality=85) return True except Exception as e: print(f"Error creating thumbnail: {e}") return False def get_thumbnail_name(filename): """ Generate thumbnail name that includes the original extension to avoid collisions. Example: IMG_5801.HEIC -> thumb_IMG_5801_HEIC.jpg IMG_5801.jpg -> thumb_IMG_5801_jpg.jpg """ if '.' in filename: name, ext = filename.rsplit('.', 1) return f"thumb_{name}_{ext}.jpg" else: return f"thumb_{filename}.jpg" def process_photos_face_filter_only(job_id, upload_dir, session_id=None): """ Phase 1: Face filtering only. Scans all photos to find ones containing the target person. Returns filtered photos for user review before quality selection. """ try: print(f"\n{'='*60}") print(f"[Job {job_id}] PHASE 1: Face Filtering Started") print(f"{'='*60}") processing_jobs[job_id]['status'] = 'processing' processing_jobs[job_id]['progress'] = 5 processing_jobs[job_id]['message'] = 'Loading face recognition AI...' print(f"[Job {job_id}] Loading InsightFace face recognition model...") from photo_selector.face_matcher import FaceMatcher # Get face matcher face_matcher = None if session_id and session_id in face_matchers: face_matcher = face_matchers[session_id] if face_matcher.get_reference_count() == 0: face_matcher = None if face_matcher is None: print(f"[Job {job_id}] ERROR: No reference photos loaded!") processing_jobs[job_id]['status'] = 'error' processing_jobs[job_id]['message'] = 'No reference photos loaded' return ref_count = face_matcher.get_reference_count() print(f"[Job {job_id}] Reference photos loaded: {ref_count}") processing_jobs[job_id]['progress'] = 10 processing_jobs[job_id]['message'] = 'Scanning photos for your child using InsightFace...' # Get all photo files photo_files = [] for f in os.listdir(upload_dir): if allowed_file(f) and not f.startswith('thumb_'): photo_files.append(f) total_photos = len(photo_files) print(f"[Job {job_id}] Total photos to scan: {total_photos}") processing_jobs[job_id]['total_photos'] = total_photos processing_jobs[job_id]['message'] = f'Scanning {total_photos} photos for your child...' 
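        # Shape of the in-memory job record that the frontend polls while this phase runs
        # (values below are illustrative; the keys are the ones set in this function):
        #   processing_jobs[job_id] = {
        #       'status': 'processing',            # later 'review_pending' or 'error'
        #       'progress': 42,                    # 0-100
        #       'message': 'Checked 420/1000 photos...',
        #       'total_photos': 1000,
        #       'photos_checked': 420,
        #       'review_data': {...},              # attached once filtering finishes
        #   }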
        # Create thumbnails directory - always in uploads/<job_id>/thumbnails
        # This ensures thumbnails work for both browser upload and local folder mode
        is_local_folder = processing_jobs[job_id].get('is_local_folder', False)
        if is_local_folder:
            thumbs_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails')
        else:
            thumbs_dir = os.path.join(upload_dir, 'thumbnails')
        os.makedirs(thumbs_dir, exist_ok=True)

        # Get all photo paths
        photo_paths = [os.path.join(upload_dir, fn) for fn in photo_files]

        # Progress callback to update photos_checked
        def progress_callback(current, total, message):
            processing_jobs[job_id]['photos_checked'] = current
            processing_jobs[job_id]['message'] = f'Checked {current}/{total} photos...'
            # Update progress between 30-80%
            progress_pct = 30 + int((current / total) * 50) if total > 0 else 30
            processing_jobs[job_id]['progress'] = progress_pct

        # Run face filtering
        print(f"[Job {job_id}] Starting face detection and matching...")
        processing_jobs[job_id]['progress'] = 30
        filter_results = face_matcher.filter_photos(photo_paths, progress_callback=progress_callback)

        if 'error' in filter_results:
            print(f"[Job {job_id}] ERROR: Face matching failed - {filter_results['error']}")
            processing_jobs[job_id]['status'] = 'error'
            processing_jobs[job_id]['message'] = f"Face matching error: {filter_results['error']}"
            return

        # Print statistics
        stats = filter_results.get('statistics', {})
        matched_count = len(filter_results.get('matched_photos', []))
        unmatched_count = len(filter_results.get('unmatched_photos', []))
        print(f"\n[Job {job_id}] Face Filtering Results:")
        print(f" - Photos with your child: {matched_count}")
        print(f" - Photos without match: {unmatched_count}")
        print(f" - Photos with no faces: {stats.get('no_faces', 0)}")
        # Handle match_rate which may be a string or float
        match_rate = stats.get('match_rate', 0)
        if isinstance(match_rate, str):
            print(f" - Match rate: {match_rate}")
        else:
            print(f" - Match rate: {match_rate:.1%}")

        processing_jobs[job_id]['progress'] = 70
        processing_jobs[job_id]['message'] = f'Creating thumbnails: 0/{matched_count}'
        print(f"[Job {job_id}] Creating thumbnails for {matched_count} matched photos...")

        # Prepare filtered photo data
        filtered_photos = []
        for i, match in enumerate(filter_results['matched_photos']):
            filename = os.path.basename(match['path'])
            thumb_name = get_thumbnail_name(filename)
            thumb_path = os.path.join(thumbs_dir, thumb_name)
            create_thumbnail(match['path'], thumb_path)

            filtered_photos.append({
                'filename': filename,
                'thumbnail': thumb_name,
                'face_match_score': match['similarity'],
                'num_faces': match['num_faces'],
                'matched_face_idx': match.get('matched_face_idx', 0),
                'face_bboxes': match.get('face_bboxes', [])  # Cached face locations for scoring
            })

            # Progress update every 10 photos or on last photo
            if (i + 1) % 10 == 0 or (i + 1) == matched_count:
                progress = 70 + int((i / matched_count) * 25)
                processing_jobs[job_id]['progress'] = progress
                processing_jobs[job_id]['message'] = f'Creating thumbnails: {i + 1}/{matched_count}'
                print(f"[Job {job_id}] Thumbnails created: {i + 1}/{matched_count}")

        # Sort by face match score (highest first)
        filtered_photos.sort(key=lambda x: x['face_match_score'], reverse=True)

        # Prepare unmatched photos data (photos where target was NOT found)
        unmatched_photos = []
        for unmatch in filter_results.get('unmatched_photos', []):
            filename = os.path.basename(unmatch['path'])
            # Get timestamp from EXIF if available
            timestamp = None
            try:
                from photo_selector.utils import get_photo_timestamp
                dt = get_photo_timestamp(unmatch['path'])
                if dt:
                    timestamp =
dt.timestamp() except: pass unmatched_photos.append({ 'filename': filename, 'best_similarity': unmatch.get('best_similarity', 0), 'num_faces': unmatch.get('num_faces', 0), 'timestamp': timestamp }) # Also include photos with no faces detected for no_face in filter_results.get('no_faces_photos', []): filename = os.path.basename(no_face['path']) timestamp = None try: from photo_selector.utils import get_photo_timestamp dt = get_photo_timestamp(no_face['path']) if dt: timestamp = dt.timestamp() except: pass unmatched_photos.append({ 'filename': filename, 'best_similarity': 0, 'num_faces': 0, 'timestamp': timestamp }) # Also include photos that had processing errors for error_photo in filter_results.get('error_photos', []): filename = os.path.basename(error_photo['path']) timestamp = None try: from photo_selector.utils import get_photo_timestamp dt = get_photo_timestamp(error_photo['path']) if dt: timestamp = dt.timestamp() except: pass unmatched_photos.append({ 'filename': filename, 'best_similarity': 0, 'num_faces': 0, 'timestamp': timestamp, 'error': error_photo.get('error', 'Processing error') }) # Sort unmatched by timestamp unmatched_photos.sort(key=lambda x: x.get('timestamp') or 0) # Store results for review review_data = { 'total_uploaded': total_photos, 'filtered_photos': filtered_photos, 'unmatched_photos': unmatched_photos, 'statistics': filter_results['statistics'], 'reference_count': face_matcher.get_reference_count() } # Save review data review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json") with open(review_file, 'w') as f: json.dump(review_data, f, indent=2, default=str) processing_jobs[job_id]['progress'] = 100 processing_jobs[job_id]['status'] = 'review_pending' processing_jobs[job_id]['message'] = f'Found your child in {len(filtered_photos)} of {total_photos} photos!' processing_jobs[job_id]['review_data'] = review_data print(f"\n[Job {job_id}] PHASE 1 COMPLETE!") print(f" - Found {len(filtered_photos)} photos of your child") print(f" - Status: review_pending (waiting for user to confirm)") print(f" - Review data saved to: {review_file}") print(f"{'='*60}\n") except Exception as e: print(f"[Job {job_id}] EXCEPTION: {str(e)}") processing_jobs[job_id]['status'] = 'error' processing_jobs[job_id]['message'] = str(e) import traceback traceback.print_exc() def process_drive_with_parallel_face_detection(job_id, folder_id, upload_dir, face_matcher): """ HYBRID APPROACH: Download files from Google Drive while running face detection in parallel. This overlaps network I/O (downloading) with GPU compute (face detection) for faster processing. 
Flow: - Download thread: Downloads files and adds paths to queue - Face detection thread: Processes files from queue as they become ready - Both run simultaneously for maximum efficiency """ import queue import threading print(f"\n{'='*60}") print(f"[Job {job_id}] HYBRID MODE: Parallel Download + Face Detection") print(f"{'='*60}") # Shared state file_queue = queue.Queue() results_lock = threading.Lock() matched_photos = [] unmatched_photos = [] no_faces_photos = [] error_photos = [] # Counters download_complete = threading.Event() total_files = [0] downloaded_count = [0] processed_count = [0] # Face detection worker def face_detection_worker(): """Process files from queue as they become available.""" while True: try: # Wait for file or check if download is complete try: filepath = file_queue.get(timeout=1.0) except queue.Empty: # Check if download is complete and queue is empty if download_complete.is_set() and file_queue.empty(): break continue if filepath is None: # Poison pill break # Process the file result = face_matcher.check_photo_for_target(filepath) with results_lock: processed_count[0] += 1 if 'error' in result: error_photos.append({'path': filepath, 'error': result['error']}) elif result['num_faces'] == 0: no_faces_photos.append({'path': filepath, 'num_faces': 0}) elif result['contains_target']: matched_photos.append({ 'path': filepath, 'similarity': result['best_match_similarity'], 'num_faces': result['num_faces'], 'all_similarities': result.get('all_face_similarities', []), 'face_bboxes': result.get('face_bboxes', []) }) else: unmatched_photos.append({ 'path': filepath, 'best_similarity': result['best_match_similarity'], 'num_faces': result['num_faces'] }) # Update progress (use unified message format) if processed_count[0] % 10 == 0: # After downloads complete, show scan-only progress if download_complete.is_set(): pct = 30 + int((processed_count[0] / max(total_files[0], 1)) * 40) processing_jobs[job_id]['progress'] = min(pct, 70) processing_jobs[job_id]['message'] = f'Scanning faces: {processed_count[0]}/{total_files[0]}' processing_jobs[job_id]['photos_checked'] = processed_count[0] print(f"[Job {job_id}] [HYBRID] Downloaded: {downloaded_count[0]}, Face checked: {processed_count[0]}, Matched: {len(matched_photos)}") file_queue.task_done() except Exception as e: print(f"[Job {job_id}] Face detection error: {e}") continue # Callback when file is downloaded def on_file_ready(filepath): """Called by download_folder when each file is ready.""" with results_lock: downloaded_count[0] += 1 file_queue.put(filepath) # Progress callback for download def download_progress(current, total, _filename): total_files[0] = total pct = 5 + int((current / total) * 25) # 5-30% processing_jobs[job_id]['progress'] = pct processing_jobs[job_id]['message'] = f'Downloading: {current}/{total}, Scanning: {processed_count[0]}' processing_jobs[job_id]['total_files'] = total try: processing_jobs[job_id]['status'] = 'processing' processing_jobs[job_id]['progress'] = 5 processing_jobs[job_id]['message'] = 'Starting parallel download and face detection...' 
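        # Producer/consumer wiring used below:
        #   download_folder() --on_file_ready(path)--> file_queue --> face_detection_worker threads
        # Each worker pops a path, runs face_matcher.check_photo_for_target(), and appends the
        # result to the matched/unmatched/no_faces/error lists under results_lock. Workers stop
        # once download_complete is set and the queue drains, or when they receive a None poison pill.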
# Start face detection workers (use multiple threads for better throughput) num_workers = 4 # Face detection threads workers = [] for _ in range(num_workers): t = threading.Thread(target=face_detection_worker) t.daemon = True t.start() workers.append(t) print(f"[Job {job_id}] Started {num_workers} face detection workers") # Start download (this will call on_file_ready for each file) print(f"[Job {job_id}] Starting Google Drive download with parallel face detection...") download_folder( folder_id, upload_dir, progress_callback=download_progress, file_ready_callback=on_file_ready ) # Signal download complete download_complete.set() print(f"[Job {job_id}] Download complete. Waiting for face detection to finish...") # Wait for queue to be processed file_queue.join() # Send poison pills to stop workers for _ in workers: file_queue.put(None) # Wait for workers to finish for t in workers: t.join(timeout=5.0) print(f"\n[Job {job_id}] HYBRID Face Detection Results:") print(f" - Photos with your child: {len(matched_photos)}") print(f" - Photos without match: {len(unmatched_photos)}") print(f" - Photos with no faces: {len(no_faces_photos)}") print(f" - Photos with errors: {len(error_photos)}") if error_photos: print(f" [ERRORS] First 5 error photos:") for ep in error_photos[:5]: print(f" - {os.path.basename(ep['path'])}: {ep.get('error', 'Unknown error')}") # Now create thumbnails and prepare review data processing_jobs[job_id]['progress'] = 75 processing_jobs[job_id]['message'] = f'Creating thumbnails for {len(matched_photos)} photos...' thumbs_dir = os.path.join(upload_dir, 'thumbnails') os.makedirs(thumbs_dir, exist_ok=True) filtered_photos = [] for i, match in enumerate(matched_photos): filename = os.path.basename(match['path']) thumb_name = get_thumbnail_name(filename) thumb_path = os.path.join(thumbs_dir, thumb_name) create_thumbnail(match['path'], thumb_path) filtered_photos.append({ 'filename': filename, 'thumbnail': thumb_name, 'face_match_score': match['similarity'], 'num_faces': match['num_faces'], 'face_bboxes': match.get('face_bboxes', []) }) if (i + 1) % 20 == 0: processing_jobs[job_id]['message'] = f'Creating thumbnails: {i + 1}/{len(matched_photos)}' # Sort by face match score filtered_photos.sort(key=lambda x: x['face_match_score'], reverse=True) # Prepare unmatched data unmatched_data = [] for unmatch in unmatched_photos: filename = os.path.basename(unmatch['path']) unmatched_data.append({ 'filename': filename, 'best_similarity': unmatch.get('best_similarity', 0), 'num_faces': unmatch.get('num_faces', 0) }) for no_face in no_faces_photos: filename = os.path.basename(no_face['path']) unmatched_data.append({ 'filename': filename, 'best_similarity': 0, 'num_faces': 0 }) # Also add error photos to unmatched (so they're visible to user) for error_photo in error_photos: filename = os.path.basename(error_photo['path']) unmatched_data.append({ 'filename': filename, 'best_similarity': 0, 'num_faces': 0, 'error': error_photo.get('error', 'Processing error') }) # Store results review_data = { 'total_uploaded': total_files[0], 'filtered_photos': filtered_photos, 'unmatched_photos': unmatched_data, 'statistics': { 'total_scanned': total_files[0], 'matched': len(matched_photos), 'unmatched': len(unmatched_photos), 'no_faces': len(no_faces_photos), 'errors': len(error_photos), 'match_rate': f"{(len(matched_photos) / max(total_files[0], 1) * 100):.1f}%" }, 'reference_count': face_matcher.get_reference_count() } # Save review data review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json") 
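        # Illustrative contents of the review file written below (values are made up;
        # keys match the review_data dict assembled above):
        #   {
        #     "total_uploaded": 850,
        #     "filtered_photos": [{"filename": "IMG_0001.jpg", "thumbnail": "thumb_IMG_0001_jpg.jpg",
        #                          "face_match_score": 0.63, "num_faces": 2, "face_bboxes": [...]}, ...],
        #     "unmatched_photos": [...],
        #     "statistics": {...},
        #     "reference_count": 3
        #   }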
with open(review_file, 'w') as f: json.dump(review_data, f, indent=2, default=str) processing_jobs[job_id]['progress'] = 100 processing_jobs[job_id]['status'] = 'review_pending' processing_jobs[job_id]['message'] = f'Found your child in {len(filtered_photos)} of {total_files[0]} photos!' processing_jobs[job_id]['review_data'] = review_data print(f"\n[Job {job_id}] HYBRID MODE COMPLETE!") print(f" - Found {len(filtered_photos)} photos of your child") print(f"{'='*60}\n") except Exception as e: print(f"[Job {job_id}] HYBRID EXCEPTION: {str(e)}") processing_jobs[job_id]['status'] = 'error' processing_jobs[job_id]['message'] = str(e) import traceback traceback.print_exc() def save_photos_by_month(job_id, upload_dir, selected_photos, rejected_photos, month_stats): """ Automatically save both selected and not-selected photos organized by month. Creates folder structure: selected_photos/ └── {job_id}_{timestamp}/ ├── selected/ │ ├── Jan/ │ │ ├── photo1.jpg │ │ └── photo2.jpg │ ├── Feb/ │ │ └── photo3.jpg │ └── ... ├── not_selected/ │ ├── Jan/ │ │ └── photo4.jpg │ ├── Feb/ │ │ └── photo5.jpg │ └── ... └── summary.txt Args: job_id: The job identifier upload_dir: Source directory containing original photos selected_photos: List of selected photo dicts with 'filename' and 'month' keys rejected_photos: List of rejected photo dicts with 'filename' and 'month' keys month_stats: Statistics about each month's selection Returns: Path to the output folder """ try: # Create output folder with timestamp timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") output_base = os.path.join(OUTPUT_FOLDER, f"{job_id}_{timestamp}") os.makedirs(output_base, exist_ok=True) print(f"\n{'='*60}") print(f" AUTO-SAVING PHOTOS BY MONTH (SELECTED & NOT SELECTED)") print(f"{'='*60}") print(f" Output folder: {output_base}") # Create selected and not_selected folders selected_base = os.path.join(output_base, "selected") not_selected_base = os.path.join(output_base, "not_selected") os.makedirs(selected_base, exist_ok=True) os.makedirs(not_selected_base, exist_ok=True) # Group selected photos by month selected_by_month = {} for photo in selected_photos: month = photo.get('month', 'Unknown') if month not in selected_by_month: selected_by_month[month] = [] selected_by_month[month].append(photo) # Group rejected photos by month rejected_by_month = {} for photo in rejected_photos: month = photo.get('month', 'Unknown') if month not in rejected_by_month: rejected_by_month[month] = [] rejected_by_month[month].append(photo) # Copy SELECTED photos to month folders print(f"\n --- SELECTED PHOTOS ---") total_selected_copied = 0 for month, photos in selected_by_month.items(): month_folder = os.path.join(selected_base, month) os.makedirs(month_folder, exist_ok=True) print(f" [selected/{month}] Saving {len(photos)} photos...") for photo in photos: src_path = os.path.join(upload_dir, photo['filename']) dst_path = os.path.join(month_folder, photo['filename']) if os.path.exists(src_path): shutil.copy2(src_path, dst_path) total_selected_copied += 1 # Copy NOT SELECTED photos to month folders print(f"\n --- NOT SELECTED PHOTOS ---") total_rejected_copied = 0 for month, photos in rejected_by_month.items(): month_folder = os.path.join(not_selected_base, month) os.makedirs(month_folder, exist_ok=True) print(f" [not_selected/{month}] Saving {len(photos)} photos...") for photo in photos: src_path = os.path.join(upload_dir, photo['filename']) dst_path = os.path.join(month_folder, photo['filename']) if os.path.exists(src_path): shutil.copy2(src_path, 
dst_path) total_rejected_copied += 1 # Create summary file summary_path = os.path.join(output_base, "summary.txt") with open(summary_path, 'w') as f: f.write("=" * 60 + "\n") f.write(" PHOTO SELECTION SUMMARY\n") f.write("=" * 60 + "\n\n") f.write(f"Job ID: {job_id}\n") f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") f.write(f"Total Selected: {total_selected_copied} photos\n") f.write(f"Total Not Selected: {total_rejected_copied} photos\n") f.write(f"Grand Total: {total_selected_copied + total_rejected_copied} photos\n\n") f.write("-" * 40 + "\n") f.write(" BREAKDOWN BY MONTH\n") f.write("-" * 40 + "\n\n") f.write(f"{'Month':<12} {'Selected':>10} {'Not Selected':>14} {'Total':>8}\n") f.write(f"{'-'*12} {'-'*10} {'-'*14} {'-'*8}\n") for stat in month_stats: month = stat['month'] selected = stat['selected'] total = stat['total_photos'] not_selected = total - selected f.write(f"{month:<12} {selected:>10} {not_selected:>14} {total:>8}\n") # Selected files by month f.write("\n" + "=" * 60 + "\n") f.write(" SELECTED FILES BY MONTH\n") f.write("=" * 60 + "\n") for month, photos in sorted(selected_by_month.items()): f.write(f"\n[{month}] - {len(photos)} selected photos:\n") for photo in sorted(photos, key=lambda x: x.get('score', 0), reverse=True): score = photo.get('score', 0) * 100 cluster = photo.get('cluster_id', -1) f.write(f" + {photo['filename']} (Score: {score:.0f}%, Cluster: {cluster})\n") # Not selected files by month f.write("\n" + "=" * 60 + "\n") f.write(" NOT SELECTED FILES BY MONTH\n") f.write("=" * 60 + "\n") for month, photos in sorted(rejected_by_month.items()): f.write(f"\n[{month}] - {len(photos)} not selected photos:\n") for photo in sorted(photos, key=lambda x: x.get('score', 0), reverse=True): score = photo.get('score', 0) * 100 cluster = photo.get('cluster_id', -1) f.write(f" - {photo['filename']} (Score: {score:.0f}%, Cluster: {cluster})\n") print(f"\n SUMMARY:") print(f" - Selected photos saved: {total_selected_copied}") print(f" - Not selected photos saved: {total_rejected_copied}") print(f" - Total photos saved: {total_selected_copied + total_rejected_copied}") print(f" - Summary written to: {summary_path}") print(f"{'='*60}\n") return output_base except Exception as e: print(f"[ERROR] Failed to save photos by month: {str(e)}") import traceback traceback.print_exc() return None def process_photos_quality_selection(job_id, upload_dir, quality_mode, similarity_threshold, confirmed_photos, face_data_cache=None, embedding_model='siglip'): """ Phase 2: Month-based category-aware photo selection. Selects ~40 best photos per month with category diversity. Args: face_data_cache: Dict of filename -> {'num_faces': int, 'face_bboxes': list} Cached face data from Step 2 to avoid re-detection embedding_model: 'siglip' or 'clip' - which embedding model to use """ face_data_cache = face_data_cache or {} try: print(f"\n{'='*60}") print(f"[Job {job_id}] PHASE 2: Monthly Category-Aware Selection Started") print(f"{'='*60}") print(f"[Job {job_id}] Confirmed photos: {len(confirmed_photos)}") print(f"[Job {job_id}] Quality mode: {quality_mode}") print(f"[Job {job_id}] Similarity threshold: {similarity_threshold}") print(f"[Job {job_id}] Embedding model: {embedding_model.upper()}") processing_jobs[job_id]['status'] = 'processing' processing_jobs[job_id]['progress'] = 5 processing_jobs[job_id]['message'] = f'Loading {embedding_model.upper()} model...' 
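        # Embedding-cache flow implemented in Steps 1a-1d below:
        #   1a. compute_file_hash() for every confirmed photo (the hash keys the cache)
        #   1b. get_cached_embeddings_batch() against Supabase, queried 500 hashes at a time
        #   1c. run the SigLIP/CLIP embedder only for the cache misses
        #   1d. save_embeddings_batch() writes the newly computed vectors back to the cache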
# Import the appropriate embedder based on selection from photo_selector.monthly_selector import MonthlyPhotoSelector if embedding_model == 'clip': from photo_selector.clip_embeddings import CLIPEmbedder as Embedder model_display_name = 'CLIP' else: from photo_selector.siglip_embeddings import SigLIPEmbedder as Embedder model_display_name = 'SigLIP' # Determine target per month based on quality mode if quality_mode == 'keep_more': target_per_month = 60 # More photos per month elif quality_mode == 'strict': target_per_month = 25 # Fewer, higher quality else: # balanced target_per_month = 40 # Default print(f"[Job {job_id}] Target per month: {target_per_month}") # Step 1: Generate embeddings for confirmed photos (with caching) processing_jobs[job_id]['progress'] = 10 processing_jobs[job_id]['message'] = f'Checking embedding cache...' print(f"[Job {job_id}] Processing {len(confirmed_photos)} photos for {model_display_name} embeddings...") # Import cache functions from supabase_storage import ( compute_file_hash, get_cached_embeddings_batch, save_embeddings_batch, is_supabase_available ) # Step 1a: Compute hashes for all files file_hashes = {} # filename -> hash hash_to_filename = {} # hash -> filename (for reverse lookup) print(f"[Job {job_id}] Computing file hashes...") for i, filename in enumerate(confirmed_photos): filepath = os.path.join(upload_dir, filename) if os.path.exists(filepath): file_hash = compute_file_hash(filepath) if file_hash: file_hashes[filename] = file_hash hash_to_filename[file_hash] = filename # Update progress (10-15%) if i % 100 == 0: progress = 10 + int((i / len(confirmed_photos)) * 5) processing_jobs[job_id]['progress'] = progress print(f"[Job {job_id}] Computed {len(file_hashes)} hashes") # Step 1b: Check cache for existing embeddings embeddings = {} cached_count = 0 uncached_filenames = [] if is_supabase_available() and file_hashes: processing_jobs[job_id]['message'] = f'Checking embedding cache...' all_hashes = list(file_hashes.values()) # Query cache in batches (Supabase has query limits) cached_embeddings = {} batch_size = 500 for i in range(0, len(all_hashes), batch_size): batch_hashes = all_hashes[i:i + batch_size] batch_result = get_cached_embeddings_batch(batch_hashes, embedding_model) cached_embeddings.update(batch_result) # Map cached embeddings back to filenames for filename, file_hash in file_hashes.items(): if file_hash in cached_embeddings: embeddings[filename] = cached_embeddings[file_hash] cached_count += 1 else: uncached_filenames.append(filename) print(f"[Job {job_id}] Cache hit: {cached_count}/{len(file_hashes)} embeddings") else: uncached_filenames = list(file_hashes.keys()) print(f"[Job {job_id}] Cache not available, computing all embeddings") # Step 1c: Compute embeddings for uncached files only newly_computed = {} if uncached_filenames: processing_jobs[job_id]['message'] = f'Analyzing {len(uncached_filenames)} photos with {model_display_name}...' 
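            # Per-file pattern below: Embedder.load_image() -> get_embedding() -> img.close().
            # A photo that fails either step is simply left out of `embeddings` (and of
            # `newly_computed`) rather than aborting the whole job.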
print(f"[Job {job_id}] Computing {model_display_name} embeddings for {len(uncached_filenames)} uncached photos...") embedder = Embedder() for i, filename in enumerate(uncached_filenames): filepath = os.path.join(upload_dir, filename) if os.path.exists(filepath): img = embedder.load_image(filepath) if img is not None: embedding = embedder.get_embedding(img) if embedding is not None: embeddings[filename] = embedding newly_computed[filename] = embedding img.close() # Update progress (15-30%) progress = 15 + int((i / len(uncached_filenames)) * 15) processing_jobs[job_id]['progress'] = progress print(f"[Job {job_id}] Computed {len(newly_computed)} new embeddings") # Step 1d: Save newly computed embeddings to cache if newly_computed and is_supabase_available(): processing_jobs[job_id]['message'] = 'Saving embeddings to cache...' saved = save_embeddings_batch(newly_computed, file_hashes, embedding_model) print(f"[Job {job_id}] Saved {saved} embeddings to cache") print(f"[Job {job_id}] Total embeddings: {len(embeddings)} (cached: {cached_count}, computed: {len(newly_computed)})") # Step 2: Initialize monthly selector processing_jobs[job_id]['progress'] = 35 processing_jobs[job_id]['message'] = 'Grouping photos by month...' # Note: duplicate_threshold is for CLIP embedding similarity (0.85 catches exact near-dupes) # diversity_threshold ensures we don't select visually similar photos (different scenes) # This is separate from face similarity_threshold (0.4-0.5 for face matching) selector = MonthlyPhotoSelector( target_per_month=target_per_month, duplicate_threshold=0.85, # Remove exact duplicates (same moment, slight angle change) diversity_threshold=0.75 # Ensure selected photos are visually diverse ) # Step 3: Group photos by month (only confirmed photos) # We need to manually build the photos_by_month structure for confirmed photos from collections import defaultdict MONTH_NAMES = { 1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr", 5: "May", 6: "Jun", 7: "Jul", 8: "Aug", 9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec" } photos_by_month = defaultdict(list) # Debug: Track timestamp extraction success timestamp_found = 0 timestamp_missing = 0 for filename in confirmed_photos: filepath = os.path.join(upload_dir, filename) if not os.path.exists(filepath): print(f"[TIMESTAMP DEBUG] File not found: {filepath}") continue dt = selector.get_photo_date(filepath) if dt: timestamp_found += 1 else: timestamp_missing += 1 # Get cached face data if available cached_face = face_data_cache.get(filename, {}) photo_info = { 'filename': filename, 'filepath': filepath, 'date': dt.isoformat() if dt else None, 'month': MONTH_NAMES.get(dt.month, "Unknown") if dt else "Unknown", 'timestamp': dt.timestamp() if dt else None, # Cached face data from Step 2 (avoids re-detection) 'num_faces': cached_face.get('num_faces'), 'face_bboxes': cached_face.get('face_bboxes', []) } photos_by_month[photo_info['month']].append(photo_info) # Sort months in calendar order month_order = list(MONTH_NAMES.values()) + ['Unknown'] photos_by_month = {m: photos_by_month[m] for m in month_order if m in photos_by_month} print(f"[TIMESTAMP DEBUG] Timestamps found: {timestamp_found}, missing: {timestamp_missing}") print(f"[Job {job_id}] Photos grouped into {len(photos_by_month)} months:") for month, photos in photos_by_month.items(): print(f" - {month}: {len(photos)} photos") # Step 4: Select best photos from each month (categories detected AFTER selection for speed) processing_jobs[job_id]['progress'] = 60 processing_jobs[job_id]['message'] = 'Selecting best 
photos per month...' def progress_callback(msg): processing_jobs[job_id]['message'] = msg selection_results = selector.select_all_months(photos_by_month, embeddings, progress_callback) selected_photos = selection_results['selected'] month_stats = selection_results['month_stats'] summary = selection_results['summary'] print(f"\n[Job {job_id}] Selection Results:") print(f" - Total photos: {summary['total_photos']}") print(f" - Selected: {summary['total_selected']}") print(f" - Selection rate: {summary['selection_rate']*100:.1f}%") # Step 5: Detect categories ONLY for selected photos (much faster than all photos) processing_jobs[job_id]['progress'] = 75 processing_jobs[job_id]['message'] = 'Detecting categories for selected photos...' print(f"[Job {job_id}] Detecting categories for {len(selected_photos)} selected photos...") selected_paths = [p['filepath'] for p in selected_photos] if selected_paths: selector._ensure_category_detector() categories = selector.category_detector.detect_categories_batch(selected_paths) for photo in selected_photos: # categories dict is keyed by filename, not filepath cat, conf = categories.get(photo['filename'], ('unknown', 0.0)) photo['category'] = cat photo['category_confidence'] = conf # Update month_stats with category breakdown from selected photos only for stat in month_stats: month_name = stat['month'] month_selected = [p for p in selected_photos if p.get('month') == month_name] cat_breakdown = {} for p in month_selected: cat = p.get('category', 'unknown') cat_breakdown[cat] = cat_breakdown.get(cat, 0) + 1 stat['categories'] = cat_breakdown # Step 6: Build rejected list (photos not selected) # Note: rejection_reason is already set by monthly_selector.py selected_filenames = {p['filename'] for p in selected_photos} rejected_photos = [] for month, photos in photos_by_month.items(): for photo in photos: if photo['filename'] not in selected_filenames: # Keep existing rejection_reason from monthly_selector, or set default if not photo.get('rejection_reason'): photo['rejection_reason'] = 'Not selected for month quota' rejected_photos.append(photo) # Create thumbnails directory thumbs_dir = os.path.join(upload_dir, 'thumbnails') os.makedirs(thumbs_dir, exist_ok=True) # Calculate total thumbnails to create total_thumbnails = len(selected_photos) + len(rejected_photos) thumbnails_created = 0 processing_jobs[job_id]['progress'] = 85 processing_jobs[job_id]['message'] = f'Creating thumbnails: 0/{total_thumbnails}' # Build final results structure results = { 'selected': [], 'rejected': [], 'summary': { 'total_photos': summary['total_photos'], 'selected_count': summary['total_selected'], 'rejected_count': len(rejected_photos), 'selection_rate': summary['selection_rate'], 'face_filtering': { 'total_photos': processing_jobs[job_id].get('total_uploaded', len(confirmed_photos)), 'after_face_filter': len(confirmed_photos), 'user_confirmed': len(confirmed_photos) }, 'total_processed': len(confirmed_photos) }, 'month_stats': month_stats, 'rejection_breakdown': {} } # Count rejection reasons rejection_counts = defaultdict(int) # Compute cluster stats for display on photo cards (per-month) # Cluster IDs are assigned per-month, so we need to track (month, cluster_id) pairs # Count total photos per (month, cluster_id) cluster_total_counts = defaultdict(int) for month, photos in photos_by_month.items(): for photo in photos: cid = photo.get('cluster_id', -1) if cid != -1: cluster_total_counts[(month, cid)] += 1 # Count selected photos per (month, cluster_id) cluster_selected_counts 
= defaultdict(int) for photo in selected_photos: month = photo.get('month', 'Unknown') cid = photo.get('cluster_id', -1) if cid != -1: cluster_selected_counts[(month, cid)] += 1 # Process selected photos for photo in selected_photos: filename = photo['filename'] thumb_name = get_thumbnail_name(filename) thumb_path = os.path.join(thumbs_dir, thumb_name) create_thumbnail(os.path.join(upload_dir, filename), thumb_path) # Update thumbnail counter thumbnails_created += 1 if thumbnails_created % 10 == 0 or thumbnails_created == total_thumbnails: processing_jobs[job_id]['message'] = f'Creating thumbnails: {thumbnails_created}/{total_thumbnails}' # Get embedding for this photo (convert to list for JSON serialization) photo_embedding = embeddings.get(filename) embedding_list = photo_embedding.tolist() if photo_embedding is not None else None # Get cluster stats for this photo (per-month) cid = photo.get('cluster_id', -1) month = photo.get('month', 'Unknown') cluster_total = cluster_total_counts.get((month, cid), 0) if cid != -1 else 0 cluster_selected = cluster_selected_counts.get((month, cid), 0) if cid != -1 else 0 results['selected'].append({ 'filename': filename, 'thumbnail': thumb_name, 'score': float(photo.get('total', 0)), 'face_quality': float(photo.get('face_quality', 0)), 'aesthetic_quality': float(photo.get('aesthetic_quality', 0)), 'emotional_signal': float(photo.get('emotional_signal', 0)), 'uniqueness': float(photo.get('uniqueness', 0)), 'bucket': photo.get('month', 'unknown'), 'month': month, 'category': photo.get('category', 'unknown'), 'num_faces': int(photo.get('num_faces', 0)), 'cluster_id': cid, 'original_cluster_id': photo.get('original_cluster_id', cid), 'cluster_total': cluster_total, 'cluster_selected': cluster_selected, 'event_id': photo.get('event_id', -1), 'max_similarity': float(photo.get('max_similarity', 0)), 'embedding': embedding_list, 'selection_reason': f"Best in {photo.get('category', 'category')} for {month}", 'selection_detail': f"Selected from {month} - Category: {photo.get('category', 'unknown')}" }) # Process rejected photos for photo in rejected_photos: filename = photo['filename'] thumb_name = get_thumbnail_name(filename) thumb_path = os.path.join(thumbs_dir, thumb_name) create_thumbnail(os.path.join(upload_dir, filename), thumb_path) # Update thumbnail counter thumbnails_created += 1 if thumbnails_created % 10 == 0 or thumbnails_created == total_thumbnails: processing_jobs[job_id]['message'] = f'Creating thumbnails: {thumbnails_created}/{total_thumbnails}' # Use actual rejection reason from monthly_selector rejection_reason = photo.get('rejection_reason', 'Better photos selected') # Categorize rejection reasons for breakdown chart if 'Event' in rejection_reason: breakdown_category = "Same event" elif 'Cluster' in rejection_reason: breakdown_category = "Same cluster" elif 'similar' in rejection_reason.lower(): breakdown_category = "Too similar" elif 'Target' in rejection_reason: breakdown_category = "Target reached" else: breakdown_category = "Other" rejection_counts[breakdown_category] += 1 # Get embedding for this photo (convert to list for JSON serialization) photo_embedding = embeddings.get(filename) embedding_list = photo_embedding.tolist() if photo_embedding is not None else None # Get cluster stats for this photo (per-month) cid = photo.get('cluster_id', -1) month = photo.get('month', 'Unknown') cluster_total = cluster_total_counts.get((month, cid), 0) if cid != -1 else 0 cluster_selected = cluster_selected_counts.get((month, cid), 0) if cid != -1 
else 0 results['rejected'].append({ 'filename': filename, 'thumbnail': thumb_name, 'score': float(photo.get('total', 0)), 'face_quality': float(photo.get('face_quality', 0)), 'aesthetic_quality': float(photo.get('aesthetic_quality', 0)), 'bucket': photo.get('month', 'unknown'), 'month': month, 'category': photo.get('category', 'unknown'), 'cluster_id': cid, 'original_cluster_id': photo.get('original_cluster_id', cid), 'cluster_total': cluster_total, 'cluster_selected': cluster_selected, 'event_id': photo.get('event_id', -1), 'max_similarity': float(photo.get('max_similarity', 0)), 'embedding': embedding_list, 'rejection_reason': rejection_reason, 'reason': rejection_reason, 'reason_detail': f"Category: {photo.get('category', 'unknown')}" }) results['rejection_breakdown'] = dict(rejection_counts) # Add face filtering count to breakdown (photos where target face was not detected) face_filter_data = results['summary'].get('face_filtering', {}) total_uploaded = face_filter_data.get('total_photos', 0) after_face_filter = face_filter_data.get('after_face_filter', 0) face_filtered_out = total_uploaded - after_face_filter if face_filtered_out > 0: results['rejection_breakdown']['Face not detected'] = face_filtered_out # Sort by score results['selected'].sort(key=lambda x: x['score'], reverse=True) results['rejected'].sort(key=lambda x: x['score'], reverse=True) # Save results results_file = os.path.join(RESULTS_FOLDER, f"{job_id}.json") with open(results_file, 'w') as f: json.dump(results, f, indent=2, default=str) processing_jobs[job_id]['status'] = 'complete' processing_jobs[job_id]['progress'] = 100 processing_jobs[job_id]['message'] = 'Selection complete!' processing_jobs[job_id]['results'] = results print(f"\n[Job {job_id}] PHASE 2 COMPLETE!") print(f" - Final selection: {len(results['selected'])} photos") print(f" - Filtered out: {len(results['rejected'])} photos") print(f" - Results saved to: {results_file}") print(f"\n=== Month Distribution ===") for stat in month_stats: print(f" {stat['month']}: {stat['selected']}/{stat['total_photos']} ({stat['category_summary']})") print(f"{'='*60}\n") # Auto-save disabled - uncomment below to re-enable # output_folder = save_photos_by_month(job_id, upload_dir, selected_photos, rejected_photos, month_stats) # if output_folder: # processing_jobs[job_id]['output_folder'] = output_folder # print(f"[Job {job_id}] Photos auto-saved to: {output_folder}") except Exception as e: print(f"[Job {job_id}] EXCEPTION: {str(e)}") processing_jobs[job_id]['status'] = 'error' processing_jobs[job_id]['message'] = str(e) import traceback traceback.print_exc() def process_photos_automatic(job_id, upload_dir, quality_mode, similarity_threshold, session_id=None): """ Full automatic processing (no review step) - used when no reference photos loaded. Processes all photos with quality-based selection. """ try: processing_jobs[job_id]['status'] = 'processing' processing_jobs[job_id]['progress'] = 5 processing_jobs[job_id]['message'] = 'Loading AI models...' 
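        # Automatic pipeline (no face filtering), as implemented in Steps 1-5 below:
        #   SigLIP embeddings -> monthly temporal buckets -> HDBSCAN clustering per bucket
        #   -> per-cluster quality scoring -> SmartPhotoSelector keep/reject decisions.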
# Import pipeline components from photo_selector.siglip_embeddings import SigLIPEmbedder from photo_selector.temporal import TemporalSegmenter from photo_selector.clustering import PhotoClusterer, BucketClusterManager from photo_selector.scoring import PhotoScorer, ClusterScorer from photo_selector.auto_selector import SmartPhotoSelector, SelectionReason # Step 1: Embeddings (SigLIP for better visual understanding) processing_jobs[job_id]['progress'] = 20 processing_jobs[job_id]['message'] = 'Analyzing photos with SigLIP AI...' embedder = SigLIPEmbedder() embeddings = embedder.process_folder(upload_dir) processing_jobs[job_id]['progress'] = 40 processing_jobs[job_id]['message'] = 'Organizing by date...' # Step 2: Temporal segmentation segmenter = TemporalSegmenter(bucket_type="monthly") buckets = segmenter.segment_folder(upload_dir) # For clustering, use a reasonable estimate (will be refined by auto-selector) estimated_target = max(10, len(embeddings) // 3) targets = segmenter.calculate_target_per_bucket(buckets, estimated_target) processing_jobs[job_id]['progress'] = 50 processing_jobs[job_id]['message'] = 'Grouping similar photos (adaptive clustering)...' # Step 3: Clustering (HDBSCAN with timestamp-weighted features, 24h gap splitting) # min_cluster_size=5 reduces single-photo clusters by requiring at least 5 similar photos clusterer = BucketClusterManager(PhotoClusterer(min_cluster_size=5, temporal_gap_hours=24.0, timestamp_weight=0.3)) cluster_results = clusterer.cluster_all_buckets(buckets, embeddings, targets) processing_jobs[job_id]['progress'] = 60 processing_jobs[job_id]['message'] = 'Scoring photo quality...' # Step 4: Score ALL photos scorer = ClusterScorer(PhotoScorer()) all_scores = {} for bucket_key, bucket_data in cluster_results.items(): filenames = bucket_data['filenames'] labels = np.array(bucket_data['labels']) bucket_embeddings = np.array([embeddings[fn] for fn in filenames]) for cluster_id in np.unique(labels): cluster_mask = labels == cluster_id cluster_indices = np.where(cluster_mask)[0] cluster_filenames = [filenames[i] for i in cluster_indices] cluster_embs = bucket_embeddings[cluster_mask] cluster_paths = [os.path.join(upload_dir, fn) for fn in cluster_filenames] scores = scorer.score_cluster(cluster_paths, cluster_embs) for score in scores: score['bucket'] = bucket_key score['cluster'] = int(cluster_id) score['cluster_key'] = f"{bucket_key}_cluster_{cluster_id}" all_scores[score['filename']] = score processing_jobs[job_id]['progress'] = 75 processing_jobs[job_id]['message'] = 'AI deciding which photos to keep...' # Step 5: AUTOMATIC SELECTION auto_selector = SmartPhotoSelector( quality_mode=quality_mode, similarity_threshold=similarity_threshold ) selection_results = auto_selector.process_all_photos( all_scores, embeddings, cluster_results ) processing_jobs[job_id]['progress'] = 90 processing_jobs[job_id]['message'] = 'Preparing results...' 
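        # Results payload written to results/<job_id>.json below: 'selected' and 'rejected'
        # lists (per-photo scores, reasons, thumbnails) plus the auto-selector's 'summary',
        # 'rejection_breakdown' and 'bucket_stats'.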
# Create thumbnails directory thumbs_dir = os.path.join(upload_dir, 'thumbnails') os.makedirs(thumbs_dir, exist_ok=True) # Prepare results results = { 'selected': [], 'rejected': [], 'summary': selection_results['summary'], 'rejection_breakdown': selection_results['rejection_breakdown'], 'bucket_stats': selection_results['bucket_stats'] } # Process selected photos for photo in selection_results['selected']: filename = photo['filename'] thumb_name = get_thumbnail_name(filename) thumb_path = os.path.join(thumbs_dir, thumb_name) create_thumbnail(os.path.join(upload_dir, filename), thumb_path) reason = photo.get('selection_reason', None) if isinstance(reason, SelectionReason): reason_text = reason.value else: reason_text = str(reason) if reason else 'High quality photo' results['selected'].append({ 'filename': filename, 'thumbnail': thumb_name, 'score': float(photo.get('total', 0)), 'face_quality': float(photo.get('face_quality', 0)), 'aesthetic_quality': float(photo.get('aesthetic_quality', 0)), 'emotional_signal': float(photo.get('emotional_signal', 0)), 'uniqueness': float(photo.get('uniqueness', 0)), 'bucket': photo.get('bucket', 'unknown'), 'num_faces': int(photo.get('num_faces', 0)), 'selection_reason': reason_text, 'selection_detail': photo.get('selection_detail', reason_text) }) # Process rejected photos for photo in selection_results['rejected']: filename = photo['filename'] thumb_name = get_thumbnail_name(filename) thumb_path = os.path.join(thumbs_dir, thumb_name) create_thumbnail(os.path.join(upload_dir, filename), thumb_path) reason = photo.get('rejection_reason', None) if isinstance(reason, SelectionReason): reason_text = reason.value else: reason_text = str(reason) if reason else 'Did not meet quality threshold' results['rejected'].append({ 'filename': filename, 'thumbnail': thumb_name, 'score': float(photo.get('total', 0)), 'face_quality': float(photo.get('face_quality', 0)), 'aesthetic_quality': float(photo.get('aesthetic_quality', 0)), 'bucket': photo.get('bucket', 'unknown'), 'reason': reason_text, 'reason_detail': photo.get('rejection_detail', '') }) # Sort by score results['selected'].sort(key=lambda x: x['score'], reverse=True) results['rejected'].sort(key=lambda x: x['score'], reverse=True) # Save results results_file = os.path.join(RESULTS_FOLDER, f"{job_id}.json") with open(results_file, 'w') as f: json.dump(results, f, indent=2, default=str) processing_jobs[job_id]['status'] = 'complete' processing_jobs[job_id]['progress'] = 100 processing_jobs[job_id]['message'] = 'Selection complete!' 
processing_jobs[job_id]['results'] = results except Exception as e: processing_jobs[job_id]['status'] = 'error' processing_jobs[job_id]['message'] = str(e) import traceback traceback.print_exc() @app.route('/') def index(): """Main page - redirects to step 1 (reference upload).""" return render_template('index.html') @app.route('/preload_model') def preload_model(): """Pre-load the InsightFace model in the background.""" from photo_selector.face_matcher import FaceMatcher try: # Create a temporary matcher to trigger model download/load temp_matcher = FaceMatcher(similarity_threshold=0.5) if temp_matcher.is_initialized: return jsonify({'success': True, 'message': 'Model loaded'}) else: return jsonify({'success': False, 'message': 'Model failed to initialize'}) except Exception as e: return jsonify({'success': False, 'message': str(e)}) @app.route('/step1') def step1_reference(): """Step 1: Upload reference photos of target person.""" # Create a new session ID if not exists if 'session_id' not in session: session['session_id'] = str(uuid.uuid4())[:8] return render_template('step1_reference.html', session_id=session['session_id']) @app.route('/step2') def step2_upload(): """Step 2: Upload all event photos.""" session_id = session.get('session_id') if not session_id: return render_template('index.html') # Check if we have reference photos loaded ref_count = 0 if session_id in face_matchers: ref_count = face_matchers[session_id].get_reference_count() return render_template('step2_upload.html', session_id=session_id, reference_count=ref_count) @app.route('/upload_reference', methods=['POST']) def upload_reference(): """Handle reference photo uploads (2-3 photos of target person).""" from photo_selector.face_matcher import FaceMatcher if 'files' not in request.files: return jsonify({'error': 'No files provided'}), 400 files = request.files.getlist('files') if not files or files[0].filename == '': return jsonify({'error': 'No files selected'}), 400 # Get or create session ID session_id = session.get('session_id') if not session_id: session_id = str(uuid.uuid4())[:8] session['session_id'] = session_id # Create reference directory for this session ref_dir = os.path.join(REFERENCE_FOLDER, session_id) os.makedirs(ref_dir, exist_ok=True) # Initialize face matcher for this session if not exists if session_id not in face_matchers: face_matchers[session_id] = FaceMatcher(similarity_threshold=0.5) matcher = face_matchers[session_id] # Process each reference photo results = [] for file in files: if file and allowed_file(file.filename): filename = secure_filename(file.filename) filepath = os.path.join(ref_dir, filename) file.save(filepath) # Add to face matcher result = matcher.add_reference_photo(filepath) result['filename'] = filename # Create thumbnail for preview thumb_name = get_thumbnail_name(filename) thumb_path = os.path.join(ref_dir, thumb_name) create_thumbnail(filepath, thumb_path, size=(150, 150)) result['thumbnail'] = thumb_name results.append(result) return jsonify({ 'session_id': session_id, 'results': results, 'total_references': matcher.get_reference_count(), 'message': f'Loaded {matcher.get_reference_count()} reference face(s)' }) @app.route('/reference_status') def reference_status(): """Get current reference photo status.""" session_id = session.get('session_id') if not session_id or session_id not in face_matchers: return jsonify({ 'session_id': session_id, 'reference_count': 0, 'ready': False }) matcher = face_matchers[session_id] return jsonify({ 'session_id': session_id, 
        'reference_count': matcher.get_reference_count(),
        'ready': matcher.get_reference_count() >= 1
    })


@app.route('/clear_references', methods=['POST'])
def clear_references():
    """Clear all reference photos for current session."""
    session_id = session.get('session_id')

    if session_id and session_id in face_matchers:
        face_matchers[session_id].clear_references()

    # Delete reference files
    ref_dir = os.path.join(REFERENCE_FOLDER, session_id)
    if os.path.exists(ref_dir):
        shutil.rmtree(ref_dir)

    return jsonify({'message': 'References cleared', 'reference_count': 0})


@app.route('/reference_thumbnail/<filename>')
def get_reference_thumbnail(filename):
    """Serve reference photo thumbnails."""
    session_id = session.get('session_id')
    if not session_id:
        return jsonify({'error': 'No session'}), 404

    ref_dir = os.path.join(REFERENCE_FOLDER, session_id)
    return send_from_directory(ref_dir, filename)


# ============== CHUNKED UPLOAD ENDPOINTS ==============
# These endpoints allow uploading large batches of photos in smaller chunks
# to avoid 413 (Request Entity Too Large) errors on Hugging Face Spaces

@app.route('/upload_init', methods=['POST'])
def upload_init():
    """Initialize a chunked upload session."""
    data = request.json
    total_files = data.get('total_files', 0)
    quality_mode = data.get('quality_mode', 'balanced')
    similarity_threshold = data.get('similarity_threshold', 0.92)

    # Create a unique session ID for this upload
    upload_session_id = str(uuid.uuid4())[:8]
    upload_dir = os.path.join(UPLOAD_FOLDER, upload_session_id)
    os.makedirs(upload_dir, exist_ok=True)

    # Get face matcher session
    face_session_id = session.get('session_id')

    # Store session info
    upload_sessions[upload_session_id] = {
        'upload_dir': upload_dir,
        'total_files': total_files,
        'uploaded_files': [],
        'quality_mode': quality_mode,
        'similarity_threshold': similarity_threshold,
        'face_session_id': face_session_id,
        'created_at': time.time()
    }

    print(f"\n[Upload Session {upload_session_id}] Initialized for {total_files} files")

    return jsonify({
        'session_id': upload_session_id,
        'message': 'Upload session initialized'
    })


@app.route('/upload_chunk', methods=['POST'])
def upload_chunk():
    """Handle a chunk of files in a chunked upload."""
    if 'files' not in request.files:
        return jsonify({'error': 'No files provided'}), 400

    session_id = request.form.get('session_id')
    if not session_id or session_id not in upload_sessions:
        return jsonify({'error': 'Invalid upload session'}), 400

    upload_info = upload_sessions[session_id]
    upload_dir = upload_info['upload_dir']

    files = request.files.getlist('files')
    saved_count = 0

    for file in files:
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            # Handle duplicate filenames
            base, ext = os.path.splitext(filename)
            counter = 1
            while os.path.exists(os.path.join(upload_dir, filename)):
                filename = f"{base}_{counter}{ext}"
                counter += 1

            file.save(os.path.join(upload_dir, filename))
            upload_info['uploaded_files'].append(filename)
            saved_count += 1

    chunk_index = request.form.get('chunk_index', '?')
    print(f"[Upload Session {session_id}] Chunk {chunk_index}: saved {saved_count} files (total: {len(upload_info['uploaded_files'])})")

    return jsonify({
        'success': True,
        'saved': saved_count,
        'total_uploaded': len(upload_info['uploaded_files'])
    })


@app.route('/upload_complete', methods=['POST'])
def upload_complete():
    """Complete a chunked upload and start processing."""
    data = request.json
    session_id = data.get('session_id')

    if not session_id or session_id not in upload_sessions:
        return jsonify({'error': 'Invalid upload session'}), 400

    upload_info = upload_sessions[session_id]
    upload_dir = upload_info['upload_dir']
    saved_files = upload_info['uploaded_files']
    quality_mode = upload_info['quality_mode']
    similarity_threshold = upload_info['similarity_threshold']
    face_session_id = upload_info['face_session_id']

    if not saved_files:
        shutil.rmtree(upload_dir)
        del upload_sessions[session_id]
        return jsonify({'error': 'No valid image files uploaded'}), 400

    # Check if we have reference photos loaded
    has_references = False
    ref_count = 0
    if face_session_id and face_session_id in face_matchers:
        ref_count = face_matchers[face_session_id].get_reference_count()
        has_references = ref_count > 0

    # Create job (use same session_id as job_id for simplicity)
    job_id = session_id

    # Initialize job
    processing_jobs[job_id] = {
        'status': 'queued',
        'progress': 30,  # Start at 30% since upload is done
        'message': 'Starting AI processing...',
        'total_files': len(saved_files),
        'total_uploaded': len(saved_files),
        'upload_dir': upload_dir,
        'session_id': face_session_id,
        'has_reference_photos': has_references,
        'reference_count': ref_count,
        'quality_mode': quality_mode,
        'similarity_threshold': similarity_threshold,
        'results': None
    }

    # Clean up upload session
    del upload_sessions[session_id]

    # Decide which processing mode to use
    if has_references:
        print(f"\n[Job {job_id}] NEW JOB (Chunked Upload) - Face Filtering Mode")
        print(f" - Files uploaded: {len(saved_files)}")
        print(f" - Reference photos: {ref_count}")
        thread = threading.Thread(
            target=process_photos_face_filter_only,
            args=(job_id, upload_dir, face_session_id)
        )
        message = f'Scanning {len(saved_files)} photos to find your child using {ref_count} reference(s)...'
    else:
        print(f"\n[Job {job_id}] NEW JOB (Chunked Upload) - No Face Filtering")
        print(f" - Files uploaded: {len(saved_files)}")
        # No face filtering: pass the uploaded file list as the confirmed set,
        # mirroring the Google Drive import path below.
        thread = threading.Thread(
            target=process_photos_quality_selection,
            args=(job_id, upload_dir, quality_mode, similarity_threshold, saved_files)
        )
        message = f'Selecting best photos from {len(saved_files)} images...'
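    # The worker runs as a daemon thread so this request returns immediately; progress
    # and results are reported back to the client through processing_jobs[job_id].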
thread.daemon = True thread.start() processing_jobs[job_id]['message'] = message return jsonify({ 'job_id': job_id, 'message': message, 'total_files': len(saved_files) }) # ============== END CHUNKED UPLOAD ENDPOINTS ============== # ============== GOOGLE DRIVE IMPORT ENDPOINTS ============== # Import Google Drive module try: from google_drive import ( is_drive_available, extract_folder_id, list_images_in_folder, download_folder, get_folder_info, get_drive_service ) GDRIVE_SERVICE_ACCOUNT_AVAILABLE = is_drive_available() except ImportError: GDRIVE_SERVICE_ACCOUNT_AVAILABLE = False @app.route('/check_drive_status') def check_drive_status(): """Check if Google Drive Service Account is configured.""" return jsonify({ 'available': GDRIVE_SERVICE_ACCOUNT_AVAILABLE, 'message': 'Service Account configured' if GDRIVE_SERVICE_ACCOUNT_AVAILABLE else 'Service Account not configured' }) @app.route('/preview_drive_folder', methods=['POST']) def preview_drive_folder(): """Preview contents of a Google Drive folder before importing.""" if not GDRIVE_SERVICE_ACCOUNT_AVAILABLE: return jsonify({'error': 'Google Drive Service Account not configured'}), 400 data = request.get_json() folder_url = data.get('folder_url', '').strip() if not folder_url: return jsonify({'error': 'Please provide a folder URL'}), 400 try: folder_id = extract_folder_id(folder_url) info = get_folder_info(folder_id) if not info.get('success'): return jsonify({'error': info.get('error', 'Could not access folder')}), 400 return jsonify({ 'success': True, 'folder_id': folder_id, 'folder_name': info.get('folder_name', 'Unknown'), 'image_count': info.get('image_count', 0), 'preview_images': info.get('images', [])[:5] }) except ValueError as e: return jsonify({'error': str(e)}), 400 except Exception as e: print(f"[Drive] Error previewing folder: {e}") return jsonify({'error': f'Could not access folder: {str(e)}'}), 400 @app.route('/import_from_drive', methods=['POST']) def import_from_drive(): """Import photos from Google Drive folder (Step 2 - initial upload).""" if not GDRIVE_SERVICE_ACCOUNT_AVAILABLE: return jsonify({'error': 'Google Drive Service Account not configured'}), 400 data = request.get_json() folder_url = data.get('folder_url', '').strip() quality_mode = data.get('quality_mode', 'balanced') similarity_threshold = float(data.get('similarity_threshold', 0.4)) if not folder_url: return jsonify({'error': 'Please provide a folder URL'}), 400 # Get face session (step 1 stores it as 'session_id') face_session_id = session.get('session_id') has_references = False ref_count = 0 if face_session_id and face_session_id in face_matchers: ref_count = face_matchers[face_session_id].get_reference_count() has_references = ref_count > 0 try: folder_id = extract_folder_id(folder_url) except ValueError as e: return jsonify({'error': str(e)}), 400 # Create job job_id = str(uuid.uuid4())[:8] upload_dir = os.path.join(UPLOAD_FOLDER, job_id) os.makedirs(upload_dir, exist_ok=True) os.makedirs(os.path.join(upload_dir, 'thumbnails'), exist_ok=True) # Initialize job processing_jobs[job_id] = { 'status': 'downloading', 'progress': 5, 'message': 'Connecting to Google Drive...', 'total_files': 0, 'total_uploaded': 0, 'upload_dir': upload_dir, 'session_id': face_session_id, 'has_reference_photos': has_references, 'reference_count': ref_count, 'quality_mode': quality_mode, 'similarity_threshold': similarity_threshold, 'results': None } # Start download in background thread def download_and_process(): try: # HYBRID MODE: If we have face references, use parallel 
download + face detection if has_references: face_matcher = face_matchers.get(face_session_id) if face_matcher and face_matcher.get_reference_count() > 0: print(f"[Job {job_id}] Using HYBRID MODE: Parallel download + face detection") process_drive_with_parallel_face_detection(job_id, folder_id, upload_dir, face_matcher) return # SEQUENTIAL MODE: Download all first, then process (for auto mode without face filtering) def progress_callback(current, total, _filename): pct = int(5 + (current / total) * 25) # 5% to 30% processing_jobs[job_id]['progress'] = pct processing_jobs[job_id]['message'] = f'Downloading from Drive: {current}/{total}' processing_jobs[job_id]['total_files'] = total processing_jobs[job_id]['total_uploaded'] = current print(f"[Job {job_id}] Starting Google Drive download from folder {folder_id}") result = download_folder(folder_id, upload_dir, progress_callback) if not result.get('success') and result.get('downloaded', 0) == 0: processing_jobs[job_id]['status'] = 'error' processing_jobs[job_id]['message'] = result.get('message', 'Download failed') return downloaded_count = result.get('downloaded', 0) + result.get('skipped', 0) downloaded_files = result.get('files', []) processing_jobs[job_id]['total_uploaded'] = downloaded_count processing_jobs[job_id]['total_files'] = downloaded_count print(f"[Job {job_id}] Downloaded {downloaded_count} photos from Google Drive") # No face filtering - use all downloaded photos (auto mode) processing_jobs[job_id]['message'] = f'Selecting best from {downloaded_count} photos...' process_photos_quality_selection(job_id, upload_dir, quality_mode, similarity_threshold, downloaded_files) except Exception as e: print(f"[Job {job_id}] Drive import error: {e}") import traceback traceback.print_exc() processing_jobs[job_id]['status'] = 'error' processing_jobs[job_id]['message'] = f'Import failed: {str(e)}' thread = threading.Thread(target=download_and_process) thread.daemon = True thread.start() return jsonify({ 'job_id': job_id, 'message': 'Starting Google Drive import...' }) @app.route('/import_from_drive_reupload/', methods=['POST']) def import_from_drive_reupload(dataset_name): """Import photos from Google Drive folder for reupload (after server restart).""" if not GDRIVE_SERVICE_ACCOUNT_AVAILABLE: return jsonify({'error': 'Google Drive Service Account not configured'}), 400 data = request.get_json() folder_url = data.get('folder_url', '').strip() if not folder_url: return jsonify({'error': 'Please provide a folder URL'}), 400 try: folder_id = extract_folder_id(folder_url) except ValueError as e: return jsonify({'error': str(e)}), 400 # Create job job_id = str(uuid.uuid4())[:8] upload_dir = os.path.join(UPLOAD_FOLDER, job_id) os.makedirs(upload_dir, exist_ok=True) os.makedirs(os.path.join(upload_dir, 'thumbnails'), exist_ok=True) # Initialize job processing_jobs[job_id] = { 'status': 'downloading', 'progress': 5, 'message': 'Connecting to Google Drive...' 
} # Start download and processing in background def download_and_process_reupload(): try: def progress_callback(current, total, filename): pct = int(5 + (current / total) * 45) # 5% to 50% processing_jobs[job_id]['progress'] = pct processing_jobs[job_id]['message'] = f'Downloading from Drive: {current}/{total}' print(f"[Job {job_id}] Starting Google Drive reupload for dataset '{dataset_name}'") result = download_folder(folder_id, upload_dir, progress_callback) if not result.get('success') and result.get('downloaded', 0) == 0: processing_jobs[job_id]['status'] = 'error' processing_jobs[job_id]['message'] = result.get('message', 'Download failed') return uploaded_filenames = result.get('files', []) print(f"[Job {job_id}] Downloaded {len(uploaded_filenames)} photos") # Load dataset from Supabase processing_jobs[job_id]['message'] = 'Loading saved dataset...' processing_jobs[job_id]['progress'] = 55 supabase_data = load_dataset_from_supabase(dataset_name) if not supabase_data: processing_jobs[job_id]['status'] = 'error' processing_jobs[job_id]['message'] = 'Dataset not found in Supabase' return metadata = supabase_data.get('metadata', {}) face_results = supabase_data.get('face_results', {}) embeddings_data = supabase_data.get('embeddings_data') # Load reference embeddings new_session_id = str(uuid.uuid4())[:8] if embeddings_data: import io from photo_selector.face_matcher import FaceMatcher data_np = np.load(io.BytesIO(embeddings_data), allow_pickle=True) matcher = FaceMatcher(similarity_threshold=float(data_np['threshold'])) matcher.reference_embeddings = list(data_np['embeddings']) matcher.average_embedding = data_np['average'] face_matchers[new_session_id] = matcher # Note: Can't set session here (background thread) - session_id stored in processing_jobs print(f"[Job {job_id}] Loaded {len(matcher.reference_embeddings)} reference embeddings") # Match uploaded files with saved face results # Google Drive filenames differ from browser upload: # 1. Duplicates: IMG_5197(1).JPG vs IMG_51971.JPG # 2. Spaces: IMG_6970 Copy.JPG vs IMG_6970_Copy.JPG import re def normalize_filename(filename): """Normalize Google Drive filename to match browser upload format.""" # Step 1: Convert (N) suffix to N (Google Drive duplicate handling) match = re.match(r'^(.+)\((\d+)\)(\.[^.]+)$', filename) if match: base, num, ext = match.groups() filename = f"{base}{num}{ext}" # Step 2: Apply secure_filename (spaces -> underscores, etc.) 
return secure_filename(filename) filtered_photos = face_results.get('filtered_photos', []) uploaded_set = set(uploaded_filenames) saved_filenames_set = {p.get('filename') for p in filtered_photos} # Create mapping: normalized_name -> actual_uploaded_name normalized_to_uploaded = {normalize_filename(f): f for f in uploaded_filenames} matched_photos = [] for p in filtered_photos: saved_filename = p.get('filename') actual_filename = None # Try direct match first if saved_filename in uploaded_set: actual_filename = saved_filename # Try normalized match (saved name matches normalized uploaded name) elif saved_filename in normalized_to_uploaded: actual_filename = normalized_to_uploaded[saved_filename] if actual_filename: # Use actual uploaded filename for the photo entry photo_entry = p.copy() photo_entry['filename'] = actual_filename photo_entry['thumbnail'] = get_thumbnail_name(actual_filename) matched_photos.append(photo_entry) # Debug: Find unmatched photos matched_saved = {p.get('filename') for p in filtered_photos if p.get('filename') in uploaded_set or p.get('filename') in normalized_to_uploaded} unmatched_from_saved = [p.get('filename') for p in filtered_photos if p.get('filename') not in matched_saved] matched_uploaded = {m['filename'] for m in matched_photos} unmatched_from_uploaded = [f for f in uploaded_filenames if f not in matched_uploaded] print(f"[Job {job_id}] Matched {len(matched_photos)} of {len(filtered_photos)} photos") print(f"[Job {job_id}] DEBUG: {len(unmatched_from_saved)} saved photos NOT found in uploaded files:") for fname in unmatched_from_saved[:20]: # Show first 20 print(f" [SAVED NOT IN UPLOAD] '{fname}'") if len(unmatched_from_saved) > 20: print(f" ... and {len(unmatched_from_saved) - 20} more") print(f"[Job {job_id}] DEBUG: {len(unmatched_from_uploaded)} uploaded files NOT found in saved data:") for fname in unmatched_from_uploaded[:20]: # Show first 20 print(f" [UPLOAD NOT IN SAVED] '{fname}'") if len(unmatched_from_uploaded) > 20: print(f" ... and {len(unmatched_from_uploaded) - 20} more") # Create review data review_data = { 'filtered_photos': matched_photos, 'total_processed': len(uploaded_filenames), 'match_count': len(matched_photos) } with open(os.path.join(RESULTS_FOLDER, f"{job_id}_review.json"), 'w') as f: json.dump(review_data, f) # Update processing job processing_jobs[job_id].update({ 'status': 'review_pending', 'progress': 100, 'message': 'Photos downloaded from Google Drive', 'upload_dir': upload_dir, 'session_id': new_session_id, 'has_reference_photos': True, 'reference_count': metadata.get('reference_count', 0), 'quality_mode': metadata.get('quality_mode', 'balanced'), 'similarity_threshold': metadata.get('similarity_threshold', 0.4), 'confirmed_photos': [p['filename'] for p in matched_photos], 'review_data': review_data, 'total_photos': len(matched_photos), 'from_dataset': dataset_name, 'from_supabase': True, 'redirect_url': f'/step3_review/{job_id}' }) print(f"[Job {job_id}] Reupload complete - ready for review") except Exception as e: print(f"[Job {job_id}] Drive reupload error: {e}") import traceback traceback.print_exc() processing_jobs[job_id]['status'] = 'error' processing_jobs[job_id]['message'] = f'Import failed: {str(e)}' thread = threading.Thread(target=download_and_process_reupload) thread.daemon = True thread.start() return jsonify({ 'job_id': job_id, 'message': 'Starting Google Drive import...' 
}) # ============== END GOOGLE DRIVE IMPORT ENDPOINTS ============== @app.route('/upload', methods=['POST']) def upload_files(): """Handle file uploads and start processing.""" if 'files' not in request.files: return jsonify({'error': 'No files provided'}), 400 files = request.files.getlist('files') if not files or files[0].filename == '': return jsonify({'error': 'No files selected'}), 400 # Get parameters - now using quality_mode instead of target quality_mode = request.form.get('quality_mode', 'balanced') similarity_threshold = float(request.form.get('similarity', 0.92)) # Get session ID for face matching session_id = session.get('session_id') # Create job job_id = str(uuid.uuid4())[:8] upload_dir = os.path.join(UPLOAD_FOLDER, job_id) os.makedirs(upload_dir, exist_ok=True) # Save files saved_files = [] for file in files: if file and allowed_file(file.filename): filename = secure_filename(file.filename) # Handle duplicate filenames base, ext = os.path.splitext(filename) counter = 1 while os.path.exists(os.path.join(upload_dir, filename)): filename = f"{base}_{counter}{ext}" counter += 1 file.save(os.path.join(upload_dir, filename)) saved_files.append(filename) if not saved_files: shutil.rmtree(upload_dir) return jsonify({'error': 'No valid image files'}), 400 # Check if we have reference photos loaded has_references = False ref_count = 0 if session_id and session_id in face_matchers: ref_count = face_matchers[session_id].get_reference_count() has_references = ref_count > 0 # Initialize job processing_jobs[job_id] = { 'status': 'queued', 'progress': 0, 'message': 'Uploading files...', 'total_files': len(saved_files), 'total_uploaded': len(saved_files), 'upload_dir': upload_dir, 'session_id': session_id, 'has_reference_photos': has_references, 'reference_count': ref_count, 'quality_mode': quality_mode, 'similarity_threshold': similarity_threshold, 'results': None } # Decide which processing mode to use if has_references: # With reference photos: Phase 1 = face filtering only, then review step print(f"\n[Job {job_id}] NEW JOB - Face Filtering Mode") print(f" - Files uploaded: {len(saved_files)}") print(f" - Reference photos: {ref_count}") print(f" - Session ID: {session_id}") thread = threading.Thread( target=process_photos_face_filter_only, args=(job_id, upload_dir, session_id) ) message = f'Scanning {len(saved_files)} photos to find your child using {ref_count} reference(s)...' else: # Without reference photos: Full automatic processing (no review step) print(f"\n[Job {job_id}] NEW JOB - Full Automatic Mode") print(f" - Files uploaded: {len(saved_files)}") print(f" - Quality mode: {quality_mode}") print(f" - Similarity threshold: {similarity_threshold}") thread = threading.Thread( target=process_photos_automatic, args=(job_id, upload_dir, quality_mode, similarity_threshold, session_id) ) message = 'Processing started - AI will automatically select the best photos!' 
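# --- Illustrative client sketch (not part of the app) ---------------------
# How a caller might handle the /upload response built above: when reference
# photos are loaded the server expects a review step, otherwise the job runs
# end-to-end and the client only needs to poll /status/<job_id>. Assumes the
# `requests` library; base_url and the 2-second poll interval are arbitrary.
import time
import requests

def follow_upload(upload_response, base_url='http://localhost:5000'):
    job_id = upload_response['job_id']
    if upload_response.get('needs_review'):
        # Face-filtering mode: the user confirms photos on the review page
        return f'{base_url}/step3_review/{job_id}'
    while True:
        status = requests.get(f'{base_url}/status/{job_id}').json()
        if status['status'] in ('complete', 'error'):
            return status
        time.sleep(2)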
thread.start() return jsonify({ 'job_id': job_id, 'files_uploaded': len(saved_files), 'has_reference_photos': has_references, 'reference_count': ref_count, 'message': message, 'needs_review': has_references # Client should redirect to review page }) @app.route('/upload_folder', methods=['POST']) def upload_folder(): """Process photos from a local folder path (for large batches).""" data = request.get_json() folder_path = data.get('folder_path', '').strip() quality_mode = data.get('quality_mode', 'balanced') similarity_threshold = float(data.get('similarity_threshold', 0.92)) if not folder_path: return jsonify({'error': 'No folder path provided'}), 400 # Validate folder exists if not os.path.isdir(folder_path): return jsonify({'error': f'Folder not found: {folder_path}'}), 400 # Get session ID for face matching session_id = session.get('session_id') # Create job with reference to original folder job_id = str(uuid.uuid4())[:8] # Count valid image files image_extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp'} image_files = [f for f in os.listdir(folder_path) if os.path.splitext(f.lower())[1] in image_extensions] if not image_files: return jsonify({'error': 'No valid image files found in folder'}), 400 print(f"\n[Job {job_id}] LOCAL FOLDER MODE") print(f" - Folder: {folder_path}") print(f" - Images found: {len(image_files)}") # Check if we have reference photos loaded has_references = False ref_count = 0 if session_id and session_id in face_matchers: ref_count = face_matchers[session_id].get_reference_count() has_references = ref_count > 0 # Create thumbnails directory thumb_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails') os.makedirs(thumb_dir, exist_ok=True) # Initialize job - use original folder path as upload_dir processing_jobs[job_id] = { 'status': 'queued', 'progress': 0, 'message': 'Preparing to process photos...', 'total_files': len(image_files), 'total_uploaded': len(image_files), 'upload_dir': folder_path, # Point to original folder 'thumb_dir': thumb_dir, 'session_id': session_id, 'has_reference_photos': has_references, 'reference_count': ref_count, 'quality_mode': quality_mode, 'similarity_threshold': similarity_threshold, 'is_local_folder': True, # Flag for local folder mode 'results': None } # Decide which processing mode to use if has_references: print(f" - Reference photos: {ref_count}") print(f" - Mode: Face Filtering") thread = threading.Thread( target=process_photos_face_filter_only, args=(job_id, folder_path, session_id) ) message = f'Scanning {len(image_files)} photos to find your child...' else: print(f" - Mode: Full Automatic") thread = threading.Thread( target=process_photos_automatic, args=(job_id, folder_path, quality_mode, similarity_threshold, session_id) ) message = 'Processing started - AI will automatically select the best photos!' 
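# --- Illustrative call sketch (not part of the app) ------------------------
# The /upload_folder route above takes a JSON body instead of multipart
# files: the server reads photos directly from a folder on its own disk, so
# this only works when client and server share that filesystem. Assumes the
# `requests` library; the folder path shown is hypothetical.
import requests

def start_local_folder_job(folder_path, base_url='http://localhost:5000'):
    resp = requests.post(f'{base_url}/upload_folder',
                         json={'folder_path': folder_path,
                               'quality_mode': 'balanced',
                               'similarity_threshold': 0.92})
    return resp.json()  # contains job_id, files_found, needs_review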
    thread.start()

    return jsonify({
        'job_id': job_id,
        'files_found': len(image_files),
        'has_reference_photos': has_references,
        'reference_count': ref_count,
        'message': message,
        'needs_review': has_references
    })


@app.route('/status/<job_id>')
def get_status(job_id):
    """Get processing status."""
    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404

    job = processing_jobs[job_id]
    response = {
        'status': job['status'],
        'progress': job['progress'],
        'message': job['message'],
        'total_photos': job.get('total_photos', 0),
        'photos_checked': job.get('photos_checked', 0)
    }
    if job['status'] == 'complete' and job['results']:
        response['summary'] = job['results']['summary']
    return jsonify(response)


@app.route('/results/<job_id>')
def get_results(job_id):
    """Get processing results."""
    try:
        if job_id not in processing_jobs:
            # Try loading from file
            results_file = os.path.join(RESULTS_FOLDER, f"{job_id}.json")
            if os.path.exists(results_file):
                with open(results_file, 'r') as f:
                    return jsonify(json.load(f))
            return jsonify({'error': 'Job not found'}), 404

        job = processing_jobs[job_id]
        if job['status'] != 'complete':
            return jsonify({'error': 'Processing not complete', 'status': job['status'], 'message': job.get('message', '')}), 400

        # Try from memory first, then file
        if 'results' in job and job['results']:
            return jsonify(job['results'])

        # Fallback to file
        results_file = os.path.join(RESULTS_FOLDER, f"{job_id}.json")
        if os.path.exists(results_file):
            with open(results_file, 'r') as f:
                return jsonify(json.load(f))

        return jsonify({'error': 'Results not found'}), 404
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500


@app.route('/thumbnail/<job_id>/<filename>')
def get_thumbnail(job_id, filename):
    """Serve thumbnail images, generating on-demand if needed."""
    thumb_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails')
    thumb_name = get_thumbnail_name(filename)
    thumb_path = os.path.join(thumb_dir, thumb_name)

    # If thumbnail exists, serve it
    if os.path.exists(thumb_path):
        return send_from_directory(thumb_dir, thumb_name)

    # Generate thumbnail on-demand for unmatched photos
    original_path = os.path.join(UPLOAD_FOLDER, job_id, filename)
    if os.path.exists(original_path):
        os.makedirs(thumb_dir, exist_ok=True)
        create_thumbnail(original_path, thumb_path)
        if os.path.exists(thumb_path):
            return send_from_directory(thumb_dir, thumb_name)

    # Fallback - try to serve the original filename from thumbnails
    if os.path.exists(os.path.join(thumb_dir, filename)):
        return send_from_directory(thumb_dir, filename)

    return jsonify({'error': 'Thumbnail not found'}), 404


@app.route('/photo/<job_id>/<filename>')
def get_photo(job_id, filename):
    """Serve full-size photos with proper EXIF rotation handling."""
    from io import BytesIO
    from PIL import ExifTags

    photo_dir = os.path.join(UPLOAD_FOLDER, job_id)
    filepath = os.path.join(photo_dir, filename)
    if not os.path.exists(filepath):
        return jsonify({'error': 'File not found'}), 404

    ext = os.path.splitext(filename)[1].lower()

    # Handle HEIC/HEIF - convert to JPEG
    if ext in ['.heic', '.heif']:
        try:
            img = Image.open(filepath)
            img = img.convert('RGB')
            buffer = BytesIO()
            img.save(buffer, format='JPEG', quality=90)
            buffer.seek(0)
            return send_file(buffer, mimetype='image/jpeg')
        except Exception as e:
            print(f"Error converting HEIC: {e}")
            return send_from_directory(photo_dir, filename)

    # Handle JPG/JPEG - apply EXIF rotation
    if ext in ['.jpg', '.jpeg']:
        try:
            img = Image.open(filepath)
            # Get EXIF orientation and rotate if needed
            try:
                for orientation in ExifTags.TAGS.keys():
                    if
ExifTags.TAGS[orientation] == 'Orientation': break exif = img._getexif() if exif is not None: orientation_value = exif.get(orientation) if orientation_value == 3: img = img.rotate(180, expand=True) elif orientation_value == 6: img = img.rotate(270, expand=True) elif orientation_value == 8: img = img.rotate(90, expand=True) except (AttributeError, KeyError, IndexError): pass # Convert to RGB if needed (handles RGBA, P mode, etc.) if img.mode != 'RGB': img = img.convert('RGB') buffer = BytesIO() img.save(buffer, format='JPEG', quality=90) buffer.seek(0) return send_file(buffer, mimetype='image/jpeg') except Exception as e: print(f"Error processing JPEG: {e}") return send_from_directory(photo_dir, filename) # Other formats - serve directly return send_from_directory(photo_dir, filename) @app.route('/download/') def download_selected(job_id): """Download selected photos as zip with timestamp-sorted naming. Uses DISK-BASED ZIP creation (not memory) to handle large photo sets (1000+). The ZIP is created on disk, then streamed to the browser in chunks. This prevents memory issues and timeouts on large downloads. """ import zipfile import tempfile from datetime import datetime from collections import defaultdict if job_id not in processing_jobs: return jsonify({'error': 'Job not found'}), 404 job = processing_jobs[job_id] if job['status'] != 'complete': return jsonify({'error': 'Processing not complete'}), 400 results = job.get('results', {}) selected = results.get('selected', []) upload_dir = job.get('upload_dir', '') if not selected: return jsonify({'error': 'No selected photos found'}), 404 if not upload_dir: return jsonify({'error': 'Upload directory not found'}), 404 print(f"[Download] Starting disk-based ZIP for {len(selected)} photos...") # Month abbreviations MONTH_ABBREV = { 1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr", 5: "May", 6: "Jun", 7: "Jul", 8: "Aug", 9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec" } # Import timestamp extractor from photo_selector.utils import get_photo_timestamp # Group photos by month and sort by timestamp photos_by_month = defaultdict(list) photos_no_timestamp = [] for photo in selected: filename = photo.get('filename', '') ts = photo.get('timestamp') # If no timestamp stored, try to extract it from the photo file if not ts: photo_path = os.path.join(upload_dir, filename) if os.path.exists(photo_path): dt = get_photo_timestamp(photo_path) if dt: ts = dt.timestamp() if ts: dt = datetime.fromtimestamp(ts) month_key = (dt.year, dt.month) # Group by year-month to handle multi-year datasets photos_by_month[month_key].append({ 'filename': filename, 'timestamp': ts, 'datetime': dt }) else: photos_no_timestamp.append({'filename': filename, 'timestamp': 0}) # Sort photos within each month by timestamp for month_key in photos_by_month: photos_by_month[month_key].sort(key=lambda x: x['timestamp']) # Create ZIP file ON DISK (not in memory) to handle large photo sets temp_zip_path = os.path.join(tempfile.gettempdir(), f'selected_photos_{job_id}.zip') files_added = 0 try: # Use ZIP_STORED (no compression) for faster creation with photos (already compressed) with zipfile.ZipFile(temp_zip_path, 'w', zipfile.ZIP_STORED) as zf: # Add photos with timestamps (sorted and renamed) for month_key in sorted(photos_by_month.keys()): year, month = month_key month_abbrev = MONTH_ABBREV[month] photos = photos_by_month[month_key] for idx, photo in enumerate(photos, start=1): original_filename = photo['filename'] photo_path = os.path.join(upload_dir, original_filename) if os.path.exists(photo_path): # 
Create new filename: Jan_1_originalname.jpg ext = os.path.splitext(original_filename)[1] base_name = os.path.splitext(original_filename)[0] new_filename = f"{month_abbrev}_{idx}_{base_name}{ext}" zf.write(photo_path, new_filename) files_added += 1 # Log progress every 100 files if files_added % 100 == 0: print(f"[Download] Added {files_added} files to ZIP...") else: print(f"[Download] File not found: {photo_path}") # Add photos without timestamps at the end with "NoDate" prefix for idx, photo in enumerate(photos_no_timestamp, start=1): original_filename = photo['filename'] photo_path = os.path.join(upload_dir, original_filename) if os.path.exists(photo_path): ext = os.path.splitext(original_filename)[1] base_name = os.path.splitext(original_filename)[0] new_filename = f"NoDate_{idx}_{base_name}{ext}" zf.write(photo_path, new_filename) files_added += 1 else: print(f"[Download] File not found: {photo_path}") if files_added == 0: # Clean up empty zip if os.path.exists(temp_zip_path): os.remove(temp_zip_path) return jsonify({'error': f'No files found in {upload_dir}. Files may have been cleaned up.'}), 404 # Get file size for logging zip_size_mb = os.path.getsize(temp_zip_path) / (1024 * 1024) print(f"[Download] ZIP created: {files_added} files, {zip_size_mb:.1f} MB") # Stream the file to browser and delete after sending def generate_and_cleanup(): """Generator that streams ZIP file and deletes it after completion.""" try: with open(temp_zip_path, 'rb') as f: while True: chunk = f.read(8192 * 16) # 128KB chunks for faster streaming if not chunk: break yield chunk finally: # Clean up temp file after streaming try: if os.path.exists(temp_zip_path): os.remove(temp_zip_path) print(f"[Download] Cleaned up temp ZIP: {temp_zip_path}") except Exception as e: print(f"[Download] Error cleaning up temp ZIP: {e}") # Return streaming response response = Response( generate_and_cleanup(), mimetype='application/zip', headers={ 'Content-Disposition': f'attachment; filename=selected_photos_{job_id}.zip', 'Content-Length': str(os.path.getsize(temp_zip_path)) } ) return response except Exception as e: # Clean up on error if os.path.exists(temp_zip_path): os.remove(temp_zip_path) print(f"[Download] Error creating ZIP: {e}") return jsonify({'error': f'Error creating ZIP: {str(e)}'}), 500 @app.route('/download_filtered/') def download_filtered(job_id): """Download all filtered photos (after face matching, before quality selection). Uses DISK-BASED ZIP creation (not memory) to handle large photo sets (1000+). 
""" import zipfile import tempfile if job_id not in processing_jobs: return jsonify({'error': 'Job not found'}), 404 job = processing_jobs[job_id] # Get filtered photos from review data filtered_photos = [] if 'review_data' in job: filtered_photos = [p['filename'] for p in job['review_data'].get('filtered_photos', [])] else: # Try to load from file review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json") if os.path.exists(review_file): with open(review_file, 'r') as f: review_data = json.load(f) filtered_photos = [p['filename'] for p in review_data.get('filtered_photos', [])] if not filtered_photos: return jsonify({'error': 'No filtered photos found'}), 404 print(f"[Download] Starting disk-based ZIP for {len(filtered_photos)} filtered photos...") # Create ZIP file ON DISK (not in memory) to handle large photo sets temp_zip_path = os.path.join(tempfile.gettempdir(), f'filtered_photos_{job_id}.zip') files_added = 0 try: with zipfile.ZipFile(temp_zip_path, 'w', zipfile.ZIP_STORED) as zf: for filename in filtered_photos: photo_path = os.path.join(job['upload_dir'], filename) if os.path.exists(photo_path): zf.write(photo_path, filename) files_added += 1 if files_added % 100 == 0: print(f"[Download] Added {files_added} files to ZIP...") if files_added == 0: if os.path.exists(temp_zip_path): os.remove(temp_zip_path) return jsonify({'error': 'No files found. Files may have been cleaned up.'}), 404 zip_size_mb = os.path.getsize(temp_zip_path) / (1024 * 1024) print(f"[Download] ZIP created: {files_added} files, {zip_size_mb:.1f} MB") # Stream the file and delete after sending def generate_and_cleanup(): try: with open(temp_zip_path, 'rb') as f: while True: chunk = f.read(8192 * 16) # 128KB chunks if not chunk: break yield chunk finally: try: if os.path.exists(temp_zip_path): os.remove(temp_zip_path) print(f"[Download] Cleaned up temp ZIP: {temp_zip_path}") except Exception as e: print(f"[Download] Error cleaning up temp ZIP: {e}") return Response( generate_and_cleanup(), mimetype='application/zip', headers={ 'Content-Disposition': f'attachment; filename=filtered_photos_{job_id}.zip', 'Content-Length': str(os.path.getsize(temp_zip_path)) } ) except Exception as e: if os.path.exists(temp_zip_path): os.remove(temp_zip_path) print(f"[Download] Error creating ZIP: {e}") return jsonify({'error': f'Error creating ZIP: {str(e)}'}), 500 @app.route('/download_unmatched/') def download_unmatched(job_id): """Download photos where target person was NOT detected, with timestamp-sorted naming.""" import zipfile import tempfile from datetime import datetime from collections import defaultdict if job_id not in processing_jobs: return jsonify({'error': 'Job not found'}), 404 job = processing_jobs[job_id] upload_dir = job.get('upload_dir', '') if not upload_dir: return jsonify({'error': 'Upload directory not found'}), 404 # Get unmatched photos from review data unmatched_photos = [] if 'review_data' in job: unmatched_photos = job['review_data'].get('unmatched_photos', []) else: # Try to load from file review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json") if os.path.exists(review_file): with open(review_file, 'r') as f: review_data = json.load(f) unmatched_photos = review_data.get('unmatched_photos', []) if not unmatched_photos: return jsonify({'error': 'No unmatched photos found'}), 404 print(f"[Download] Starting disk-based ZIP for {len(unmatched_photos)} unmatched photos...") # Month abbreviations MONTH_ABBREV = { 1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr", 5: "May", 6: "Jun", 7: "Jul", 8: 
"Aug", 9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec" } # Import timestamp extractor from photo_selector.utils import get_photo_timestamp # Group photos by month and sort by timestamp photos_by_month = defaultdict(list) photos_no_timestamp = [] for photo in unmatched_photos: filename = photo.get('filename', '') ts = photo.get('timestamp') # If no timestamp stored, try to extract it from the photo file if not ts: photo_path = os.path.join(upload_dir, filename) if os.path.exists(photo_path): dt = get_photo_timestamp(photo_path) if dt: ts = dt.timestamp() if ts: dt = datetime.fromtimestamp(ts) month_key = (dt.year, dt.month) photos_by_month[month_key].append({ 'filename': filename, 'timestamp': ts }) else: photos_no_timestamp.append({'filename': filename}) # Sort photos within each month by timestamp for month_key in photos_by_month: photos_by_month[month_key].sort(key=lambda x: x['timestamp']) # Create ZIP file ON DISK (not in memory) to handle large photo sets temp_zip_path = os.path.join(tempfile.gettempdir(), f'unmatched_photos_{job_id}.zip') files_added = 0 try: with zipfile.ZipFile(temp_zip_path, 'w', zipfile.ZIP_STORED) as zf: # Add photos with timestamps (sorted and renamed) for month_key in sorted(photos_by_month.keys()): year, month = month_key month_abbrev = MONTH_ABBREV[month] photos = photos_by_month[month_key] for idx, photo in enumerate(photos, start=1): original_filename = photo['filename'] photo_path = os.path.join(upload_dir, original_filename) if os.path.exists(photo_path): ext = os.path.splitext(original_filename)[1] base_name = os.path.splitext(original_filename)[0] new_filename = f"{month_abbrev}_{idx}_{base_name}{ext}" zf.write(photo_path, new_filename) files_added += 1 if files_added % 100 == 0: print(f"[Download] Added {files_added} files to ZIP...") # Add photos without timestamps at the end for idx, photo in enumerate(photos_no_timestamp, start=1): original_filename = photo['filename'] photo_path = os.path.join(upload_dir, original_filename) if os.path.exists(photo_path): ext = os.path.splitext(original_filename)[1] base_name = os.path.splitext(original_filename)[0] new_filename = f"NoDate_{idx}_{base_name}{ext}" zf.write(photo_path, new_filename) files_added += 1 if files_added == 0: if os.path.exists(temp_zip_path): os.remove(temp_zip_path) return jsonify({'error': 'No files found in upload directory'}), 404 zip_size_mb = os.path.getsize(temp_zip_path) / (1024 * 1024) print(f"[Download] ZIP created: {files_added} files, {zip_size_mb:.1f} MB") # Stream the file and delete after sending def generate_and_cleanup(): try: with open(temp_zip_path, 'rb') as f: while True: chunk = f.read(8192 * 16) # 128KB chunks if not chunk: break yield chunk finally: try: if os.path.exists(temp_zip_path): os.remove(temp_zip_path) print(f"[Download] Cleaned up temp ZIP: {temp_zip_path}") except Exception as e: print(f"[Download] Error cleaning up temp ZIP: {e}") return Response( generate_and_cleanup(), mimetype='application/zip', headers={ 'Content-Disposition': f'attachment; filename=unmatched_photos_{job_id}.zip', 'Content-Length': str(os.path.getsize(temp_zip_path)) } ) except Exception as e: if os.path.exists(temp_zip_path): os.remove(temp_zip_path) print(f"[Download] Error creating ZIP: {e}") return jsonify({'error': f'Error creating ZIP: {str(e)}'}), 500 @app.route('/cleanup/', methods=['POST']) def cleanup_job(job_id): """Clean up job files.""" if job_id in processing_jobs: upload_dir = processing_jobs[job_id].get('upload_dir') if upload_dir and os.path.exists(upload_dir): 
            shutil.rmtree(upload_dir)
        del processing_jobs[job_id]

    results_file = os.path.join(RESULTS_FOLDER, f"{job_id}.json")
    if os.path.exists(results_file):
        os.remove(results_file)

    # Also clean up review file
    review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
    if os.path.exists(review_file):
        os.remove(review_file)

    return jsonify({'message': 'Cleaned up'})


# ==================== REVIEW WORKFLOW ROUTES ====================

@app.route('/step3_review/<job_id>')
def step3_review(job_id):
    """Step 3: Review filtered photos before quality selection."""
    if job_id not in processing_jobs:
        return render_template('index.html')

    job = processing_jobs[job_id]

    # Check if face filtering is complete
    if job['status'] not in ['review_pending', 'complete']:
        # Still processing or error - redirect back to step2
        return render_template('step2_upload.html',
                               session_id=session.get('session_id'),
                               reference_count=job.get('reference_count', 0))

    return render_template('step3_review.html', job_id=job_id)


@app.route('/review_data/<job_id>')
def get_review_data(job_id):
    """Get the filtered photos data for review."""
    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404

    job = processing_jobs[job_id]

    # Check if we have review data
    if 'review_data' in job:
        return jsonify(job['review_data'])

    # Try to load from file
    review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
    if os.path.exists(review_file):
        with open(review_file, 'r') as f:
            review_data = json.load(f)
        return jsonify(review_data)

    return jsonify({'error': 'Review data not found'}), 404


@app.route('/review_thumbnail/<job_id>/<filename>')
def get_review_thumbnail(job_id, filename):
    """Serve thumbnail for review page."""
    # Thumbnails are always stored in uploads/<job_id>/thumbnails
    thumb_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails')
    if os.path.exists(os.path.join(thumb_dir, filename)):
        return send_from_directory(thumb_dir, filename)

    # Fallback: check if thumbnails are in the upload_dir (for older jobs)
    if job_id in processing_jobs:
        job = processing_jobs[job_id]
        upload_dir = job.get('upload_dir', '')
        fallback_dir = os.path.join(upload_dir, 'thumbnails')
        if os.path.exists(os.path.join(fallback_dir, filename)):
            return send_from_directory(fallback_dir, filename)

    return send_from_directory(thumb_dir, filename)


@app.route('/review_photo/<job_id>/<filename>')
def get_review_photo(job_id, filename):
    """Serve full-size photo for review modal with EXIF rotation handling."""
    from io import BytesIO
    from PIL import ExifTags

    photo_dir = os.path.join(UPLOAD_FOLDER, job_id)
    filepath = os.path.join(photo_dir, filename)
    if not os.path.exists(filepath):
        return jsonify({'error': 'File not found'}), 404

    ext = os.path.splitext(filename)[1].lower()

    # Handle HEIC/HEIF - convert to JPEG
    if ext in ['.heic', '.heif']:
        try:
            img = Image.open(filepath)
            img = img.convert('RGB')
            buffer = BytesIO()
            img.save(buffer, format='JPEG', quality=90)
            buffer.seek(0)
            return send_file(buffer, mimetype='image/jpeg')
        except Exception as e:
            print(f"Error converting HEIC: {e}")
            return send_from_directory(photo_dir, filename)

    # Handle JPG/JPEG - apply EXIF rotation
    if ext in ['.jpg', '.jpeg']:
        try:
            img = Image.open(filepath)
            # Get EXIF orientation and rotate if needed
            try:
                for orientation in ExifTags.TAGS.keys():
                    if ExifTags.TAGS[orientation] == 'Orientation':
                        break
                exif = img._getexif()
                if exif is not None:
                    orientation_value = exif.get(orientation)
                    if orientation_value == 3:
                        img = img.rotate(180, expand=True)
                    elif orientation_value == 6:
                        img = img.rotate(270, expand=True)
                    elif orientation_value == 8:
                        img = img.rotate(90, expand=True)
            except (AttributeError, KeyError, IndexError):
                pass

            if img.mode != 'RGB':
                img = img.convert('RGB')

            buffer = BytesIO()
            img.save(buffer, format='JPEG', quality=90)
            buffer.seek(0)
            return send_file(buffer, mimetype='image/jpeg')
        except Exception as e:
            print(f"Error processing JPEG: {e}")
            return send_from_directory(photo_dir, filename)

    return send_from_directory(photo_dir, filename)


@app.route('/confirm_selection/<job_id>', methods=['POST'])
def confirm_selection(job_id):
    """User confirms their selection - proceed to quality-based selection."""
    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404

    job = processing_jobs[job_id]

    # Get confirmed photos from request
    data = request.get_json()
    if not data or 'selected_photos' not in data:
        return jsonify({'error': 'No photos selected'}), 400

    confirmed_photos = data['selected_photos']
    if len(confirmed_photos) == 0:
        return jsonify({'error': 'At least one photo must be selected'}), 400

    # Get embedding model selection (default to siglip)
    embedding_model = data.get('embedding_model', 'siglip')
    if embedding_model not in ['siglip', 'clip']:
        embedding_model = 'siglip'

    # Get processing parameters from job
    quality_mode = job.get('quality_mode', 'balanced')
    similarity_threshold = job.get('similarity_threshold', 0.92)
    upload_dir = job.get('upload_dir')

    # Load cached face data from review_data (to avoid re-detection in scoring)
    face_data_cache = {}
    if 'review_data' in job:
        for photo in job['review_data'].get('filtered_photos', []):
            filename = photo.get('filename')
            if filename:
                face_data_cache[filename] = {
                    'num_faces': photo.get('num_faces', 0),
                    'face_bboxes': photo.get('face_bboxes', [])
                }
    else:
        # Try loading from review file
        review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
        if os.path.exists(review_file):
            with open(review_file, 'r') as f:
                review_data = json.load(f)
            for photo in review_data.get('filtered_photos', []):
                filename = photo.get('filename')
                if filename:
                    face_data_cache[filename] = {
                        'num_faces': photo.get('num_faces', 0),
                        'face_bboxes': photo.get('face_bboxes', [])
                    }

    print(f"[Job {job_id}] Loaded face data cache for {len(face_data_cache)} photos")

    # Update job status
    job['status'] = 'processing'
    job['progress'] = 0
    job['message'] = 'Starting quality-based selection...'
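# --- Illustrative call sketch (not part of the app) ------------------------
# What a client (e.g. the Step 3 review page) would post to
# /confirm_selection/<job_id> above: the filenames the user kept plus the
# embedding model for Phase 2. Assumes the `requests` library; the filename
# list and base_url are hypothetical.
import requests

def confirm_review(job_id, kept_filenames, base_url='http://localhost:5000'):
    resp = requests.post(f'{base_url}/confirm_selection/{job_id}',
                         json={'selected_photos': kept_filenames,
                               'embedding_model': 'siglip'})  # or 'clip'
    return resp.json()  # e.g. {'confirmed_count': 120, 'message': '...'}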
job['confirmed_photos'] = confirmed_photos # Start phase 2 processing thread = threading.Thread( target=process_photos_quality_selection, args=(job_id, upload_dir, quality_mode, similarity_threshold, confirmed_photos, face_data_cache, embedding_model) ) thread.start() return jsonify({ 'message': f'Processing {len(confirmed_photos)} confirmed photos...', 'confirmed_count': len(confirmed_photos) }) @app.route('/step4_results/') def step4_results(job_id): """Step 4: Final results page.""" if job_id not in processing_jobs: return render_template('index.html') job = processing_jobs[job_id] # Check reference count from session session_id = session.get('session_id') ref_count = 0 if session_id and session_id in face_matchers: ref_count = face_matchers[session_id].get_reference_count() return render_template('step4_results.html', job_id=job_id, reference_count=ref_count) # ==================== TEST SINGLE MONTH ROUTES ==================== @app.route('/test-month') def test_month_page(): """Test page for single month photo selection.""" return render_template('test_month.html') @app.route('/test-month/start', methods=['POST']) def test_month_start(): """Start processing a single month folder.""" data = request.get_json() folder_path = data.get('folder_path', '').strip() target = int(data.get('target', 40)) organize_by_month = data.get('organize_by_month', False) if not folder_path: return jsonify({'error': 'No folder path provided'}), 400 if not os.path.isdir(folder_path): return jsonify({'error': f'Folder not found: {folder_path}'}), 400 # Count valid image files extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp'} image_files = [f for f in os.listdir(folder_path) if os.path.splitext(f.lower())[1] in extensions] if not image_files: return jsonify({'error': 'No valid image files found in folder'}), 400 # Create job job_id = str(uuid.uuid4())[:8] # Create thumbnails directory thumb_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails') os.makedirs(thumb_dir, exist_ok=True) processing_jobs[job_id] = { 'status': 'processing', 'progress': 0, 'message': 'Starting test...', 'folder_path': folder_path, 'thumb_dir': thumb_dir, 'target': target, 'total_files': len(image_files), 'results': None, 'organize_by_month': organize_by_month } # Start processing in background thread = threading.Thread( target=process_test_month, args=(job_id, folder_path, target, thumb_dir, organize_by_month) ) thread.start() return jsonify({ 'job_id': job_id, 'total_photos': len(image_files), 'target': target, 'organize_by_month': organize_by_month, 'message': f'Processing {len(image_files)} photos...' 
}) @app.route('/test-month/upload', methods=['POST']) def test_month_upload(): """Handle uploaded photos for test-month (for HuggingFace deployment).""" if 'photos' not in request.files: return jsonify({'error': 'No photos uploaded'}), 400 files = request.files.getlist('photos') target = int(request.form.get('target', 40)) organize_by_month = request.form.get('organize_by_month', 'false').lower() == 'true' if not files or len(files) == 0: return jsonify({'error': 'No photos uploaded'}), 400 # Filter valid image files extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp'} valid_files = [f for f in files if f.filename and os.path.splitext(f.filename.lower())[1] in extensions] if not valid_files: return jsonify({'error': 'No valid image files uploaded'}), 400 # Create job and upload directory job_id = str(uuid.uuid4())[:8] upload_dir = os.path.join(UPLOAD_FOLDER, job_id, 'photos') thumb_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails') os.makedirs(upload_dir, exist_ok=True) os.makedirs(thumb_dir, exist_ok=True) # Save uploaded files saved_files = [] for f in valid_files: filename = secure_filename(f.filename) # Handle duplicate filenames base, ext = os.path.splitext(filename) counter = 1 while os.path.exists(os.path.join(upload_dir, filename)): filename = f"{base}_{counter}{ext}" counter += 1 filepath = os.path.join(upload_dir, filename) f.save(filepath) saved_files.append(filename) processing_jobs[job_id] = { 'status': 'processing', 'progress': 0, 'message': 'Starting test...', 'folder_path': upload_dir, # Use upload dir as folder path 'thumb_dir': thumb_dir, 'target': target, 'total_files': len(saved_files), 'results': None, 'is_upload': True, 'organize_by_month': organize_by_month } # Start processing in background thread = threading.Thread( target=process_test_month, args=(job_id, upload_dir, target, thumb_dir, organize_by_month) ) thread.start() return jsonify({ 'job_id': job_id, 'total_photos': len(saved_files), 'target': target, 'organize_by_month': organize_by_month, 'message': f'Processing {len(saved_files)} uploaded photos...' 
}) @app.route('/test-month/upload-init', methods=['POST']) def test_month_upload_init(): """Initialize chunked upload for test-month.""" data = request.json total_files = data.get('total_files', 0) target = data.get('target', 40) organize_by_month = data.get('organize_by_month', False) job_id = str(uuid.uuid4())[:8] upload_dir = os.path.join(UPLOAD_FOLDER, job_id, 'photos') thumb_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails') os.makedirs(upload_dir, exist_ok=True) os.makedirs(thumb_dir, exist_ok=True) # Store upload session session_id = f"test_{job_id}" upload_sessions[session_id] = { 'job_id': job_id, 'upload_dir': upload_dir, 'thumb_dir': thumb_dir, 'target': target, 'organize_by_month': organize_by_month, 'total_files': total_files, 'uploaded_files': [] } print(f"[Test-Month Upload {job_id}] Initialized for {total_files} files") return jsonify({ 'session_id': session_id, 'job_id': job_id }) @app.route('/test-month/upload-chunk', methods=['POST']) def test_month_upload_chunk(): """Handle a chunk of files for test-month.""" session_id = request.form.get('session_id') if not session_id or session_id not in upload_sessions: return jsonify({'error': 'Invalid session'}), 400 session_data = upload_sessions[session_id] upload_dir = session_data['upload_dir'] files = request.files.getlist('files') extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp'} saved_count = 0 for f in files: if f and f.filename: ext = os.path.splitext(f.filename.lower())[1] if ext in extensions: filename = secure_filename(f.filename) # Handle duplicate filenames base, ext = os.path.splitext(filename) counter = 1 while os.path.exists(os.path.join(upload_dir, filename)): filename = f"{base}_{counter}{ext}" counter += 1 f.save(os.path.join(upload_dir, filename)) session_data['uploaded_files'].append(filename) saved_count += 1 chunk_index = request.form.get('chunk_index', '?') print(f"[Test-Month Upload {session_data['job_id']}] Chunk {chunk_index}: saved {saved_count} files (total: {len(session_data['uploaded_files'])})") return jsonify({ 'uploaded': len(session_data['uploaded_files']), 'total': session_data['total_files'] }) @app.route('/test-month/upload-complete', methods=['POST']) def test_month_upload_complete(): """Complete chunked upload and start processing for test-month.""" data = request.json session_id = data.get('session_id') if not session_id or session_id not in upload_sessions: return jsonify({'error': 'Invalid session'}), 400 session_data = upload_sessions[session_id] job_id = session_data['job_id'] upload_dir = session_data['upload_dir'] thumb_dir = session_data['thumb_dir'] target = session_data['target'] organize_by_month = session_data['organize_by_month'] saved_files = session_data['uploaded_files'] # Clean up session del upload_sessions[session_id] if not saved_files: return jsonify({'error': 'No valid image files uploaded'}), 400 print(f"[Test-Month Upload {job_id}] Complete: {len(saved_files)} files, starting processing...") # Create processing job processing_jobs[job_id] = { 'status': 'processing', 'progress': 0, 'message': 'Starting test...', 'folder_path': upload_dir, 'thumb_dir': thumb_dir, 'target': target, 'total_files': len(saved_files), 'results': None, 'is_upload': True, 'organize_by_month': organize_by_month } # Start processing in background thread = threading.Thread( target=process_test_month, args=(job_id, upload_dir, target, thumb_dir, organize_by_month) ) thread.start() return jsonify({ 'job_id': job_id, 'total_photos': len(saved_files), 'target': target, 
'organize_by_month': organize_by_month, 'message': f'Processing {len(saved_files)} uploaded photos...' }) def process_test_month(job_id, folder_path, target, thumb_dir, organize_by_month=False): """Process photos for testing with category-aware selection. If organize_by_month is True, groups photos by EXIF date and runs selection per month (same as main app Step 4). """ try: from photo_selector.monthly_selector import MonthlyPhotoSelector, CategoryDetector from photo_selector.siglip_embeddings import SigLIPEmbedder from photo_selector.scoring import PhotoScorer from datetime import datetime job = processing_jobs[job_id] # Get all photos extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp'} photo_files = [f for f in os.listdir(folder_path) if os.path.splitext(f.lower())[1] in extensions] photo_paths = [os.path.join(folder_path, f) for f in photo_files] job['message'] = 'Loading SigLIP model...' job['progress'] = 5 # Initialize embedder and selector embedder = SigLIPEmbedder() selector = MonthlyPhotoSelector() # Step 1: Generate embeddings job['message'] = f'Generating SigLIP embeddings for {len(photo_paths)} photos...' job['progress'] = 10 embeddings = embedder.process_folder(folder_path) job['progress'] = 30 # Step 2: Detect categories for all photos job['message'] = 'Detecting photo categories...' job['progress'] = 35 selector._ensure_category_detector() categories = selector.category_detector.detect_categories_batch(photo_paths) job['progress'] = 45 # Step 3: Score photos and add category + timestamp job['message'] = 'Scoring photos...' scorer = PhotoScorer() scored_photos = [] for i, photo_path in enumerate(photo_paths): filename = os.path.basename(photo_path) scores = scorer.score_photo(photo_path) # Get category cat, conf = categories.get(filename, ('unknown', 0.0)) # Get timestamp from EXIF dt = selector.get_photo_date(photo_path) scored_photos.append({ 'filename': filename, 'filepath': photo_path, 'total': scores.get('total', 0), 'face_quality': scores.get('face_quality', 0), 'aesthetic_quality': scores.get('aesthetic_quality', 0), 'emotional_signal': scores.get('emotional_signal', 0), 'uniqueness': scores.get('uniqueness', 0.5), 'num_faces': scores.get('num_faces', 0), 'category': cat, 'category_confidence': conf, 'timestamp': dt.timestamp() if dt else None }) if (i + 1) % 10 == 0: job['progress'] = 45 + int((i / len(photo_paths)) * 20) job['message'] = f'Scoring photos... {i + 1}/{len(photo_paths)}' job['progress'] = 70 # Step 4: Run category-aware HDBSCAN selection if organize_by_month: # Group photos by month using EXIF dates job['message'] = 'Grouping photos by month...' # Month names for mapping MONTH_NAMES = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'] photos_by_month = {} for photo in scored_photos: ts = photo.get('timestamp') if ts: dt = datetime.fromtimestamp(ts) month_name = MONTH_NAMES[dt.month - 1] else: month_name = 'Unknown' photo['month'] = month_name if month_name not in photos_by_month: photos_by_month[month_name] = [] photos_by_month[month_name].append(photo) # Calculate target per month (proportional allocation) total_photos = len(scored_photos) selected = [] month_stats = [] for month_name, month_photos in photos_by_month.items(): # Proportional target for this month month_proportion = len(month_photos) / total_photos month_target = max(1, int(target * month_proportion)) job['message'] = f'Processing {month_name} ({len(month_photos)} photos)...' 
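                # --- Worked example of the proportional allocation above (illustrative) ---
                # With an overall target of 40 and 300 scored photos, a month holding
                # 90 photos gets max(1, int(40 * 90 / 300)) = 12 picks, while a month
                # with only 3 photos still gets max(1, int(40 * 3 / 300)) = 1.
                # Because int() truncates, the per-month targets can sum to slightly
                # less than the overall target; the numbers here are hypothetical.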
# Get embeddings for this month's photos month_embeddings = {p['filename']: embeddings.get(p['filename']) for p in month_photos} # Run selection for this month month_selected = selector.select_hybrid_hdbscan(month_photos, month_embeddings, target=month_target) # Add month info to each selected photo for photo in month_selected: photo['month'] = month_name selected.extend(month_selected) month_stats.append({ 'month': month_name, 'total_photos': len(month_photos), 'selected': len(month_selected), 'target': month_target }) print(f"[Test Month {job_id}] Organized by month: {len(photos_by_month)} months, {len(selected)} total selected") else: # Single batch selection (original behavior) job['message'] = 'Running category-aware clustering and selection...' selected = selector.select_hybrid_hdbscan(scored_photos, embeddings, target=target) # Add 'Unknown' month to all photos when not organized for photo in selected: photo['month'] = 'Unknown' for photo in scored_photos: photo['month'] = 'Unknown' month_stats = [] job['progress'] = 85 job['message'] = 'Creating thumbnails...' # Create thumbnails and build results selected_results = [] for photo in selected: filename = photo['filename'] filepath = photo['filepath'] thumb_name = get_thumbnail_name(filename) thumb_path = os.path.join(thumb_dir, thumb_name) create_thumbnail(filepath, thumb_path) # Get embedding for this photo photo_emb = embeddings.get(filename) embedding_list = photo_emb.tolist() if photo_emb is not None else None # Format timestamp for display ts = photo.get('timestamp') datetime_str = '' if ts: dt = datetime.fromtimestamp(ts) datetime_str = dt.strftime('%Y-%m-%d %H:%M:%S') selected_results.append({ 'filename': filename, 'thumbnail': thumb_name, 'score': float(photo.get('total', 0)), 'face_quality': float(photo.get('face_quality', 0)), 'aesthetic_quality': float(photo.get('aesthetic_quality', 0)), 'emotional_signal': float(photo.get('emotional_signal', 0)), 'uniqueness': float(photo.get('uniqueness', 0)), 'num_faces': int(photo.get('num_faces', 0)), 'multi_face_bonus': float(photo.get('multi_face_bonus', 0)), 'cluster_id': photo.get('cluster_id', -1), 'max_similarity': float(photo.get('max_similarity', 0)), 'category': photo.get('category', 'unknown'), 'category_confidence': float(photo.get('category_confidence', 0)), 'event_id': photo.get('event_id', -1), 'selection_reason': photo.get('selection_reason', ''), 'datetime': datetime_str, 'embedding': embedding_list, 'month': photo.get('month', 'Unknown') }) # Build rejected list selected_filenames = {p['filename'] for p in selected} rejected_results = [] for photo in scored_photos: if photo['filename'] not in selected_filenames: filename = photo['filename'] filepath = photo['filepath'] thumb_name = get_thumbnail_name(filename) thumb_path = os.path.join(thumb_dir, thumb_name) create_thumbnail(filepath, thumb_path) photo_emb = embeddings.get(filename) embedding_list = photo_emb.tolist() if photo_emb is not None else None # Format timestamp for display ts = photo.get('timestamp') datetime_str = '' if ts: from datetime import datetime dt = datetime.fromtimestamp(ts) datetime_str = dt.strftime('%Y-%m-%d %H:%M:%S') rejected_results.append({ 'filename': filename, 'thumbnail': thumb_name, 'score': float(photo.get('total', 0)), 'face_quality': float(photo.get('face_quality', 0)), 'aesthetic_quality': float(photo.get('aesthetic_quality', 0)), 'num_faces': int(photo.get('num_faces', 0)), 'cluster_id': photo.get('cluster_id', -1), 'category': photo.get('category', 'unknown'), 'event_id': 
                    photo.get('event_id', -1),
                    'embedding': embedding_list,
                    'max_similarity': float(photo.get('max_similarity', 0)),
                    'selection_reason': photo.get('rejection_reason', 'Not selected'),
                    'datetime': datetime_str,
                    'month': photo.get('month', 'Unknown')
                })

        # Sort results
        selected_results.sort(key=lambda x: x['score'], reverse=True)
        rejected_results.sort(key=lambda x: x['score'], reverse=True)

        # Cluster distribution
        cluster_counts = {}
        for photo in selected_results:
            cid = photo.get('cluster_id', -1)
            cluster_counts[cid] = cluster_counts.get(cid, 0) + 1

        # Category distribution
        category_counts = {}
        for photo in selected_results:
            cat = photo.get('category', 'unknown')
            category_counts[cat] = category_counts.get(cat, 0) + 1

        # Build results
        job['results'] = {
            'selected': selected_results,
            'rejected': rejected_results,
            'summary': {
                'total_photos': len(photo_paths),
                'selected_count': len(selected_results),
                'rejected_count': len(rejected_results),
                'target': target
            },
            'cluster_distribution': cluster_counts,
            'category_distribution': category_counts,
            'organized_by_month': organize_by_month,
            'month_stats': month_stats
        }

        job['status'] = 'complete'
        job['progress'] = 100
        job['message'] = f'Done! Selected {len(selected_results)} of {len(photo_paths)} photos'

        print(f"\n[Test Month {job_id}] Complete!")
        print(f" - Total: {len(photo_paths)}")
        print(f" - Selected: {len(selected_results)}")
        print(f" - Organized by month: {organize_by_month}")
        if month_stats:
            print(f" - Month stats: {month_stats}")
        print(f" - Clusters: {cluster_counts}")
        print(f" - Categories: {category_counts}")

    except Exception as e:
        processing_jobs[job_id]['status'] = 'error'
        processing_jobs[job_id]['message'] = str(e)
        import traceback
        traceback.print_exc()


@app.route('/test-month/status/<job_id>')
def test_month_status(job_id):
    """Get test month job status."""
    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404

    job = processing_jobs[job_id]
    return jsonify({
        'status': job['status'],
        'progress': job['progress'],
        'message': job['message']
    })


@app.route('/test-month/results/<job_id>')
def test_month_results(job_id):
    """Get test month results."""
    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404

    job = processing_jobs[job_id]
    if job['status'] != 'complete':
        return jsonify({'error': 'Not complete', 'status': job['status']}), 400

    return jsonify(job['results'])


@app.route('/test-month/thumbnail/<job_id>/<filename>')
def test_month_thumbnail(job_id, filename):
    """Serve test month thumbnails."""
    thumb_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails')
    return send_from_directory(thumb_dir, filename)


@app.route('/test-month/download/<job_id>')
def test_month_download(job_id):
    """Download selected photos from test-month as ZIP."""
    import zipfile
    from io import BytesIO

    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404

    job = processing_jobs[job_id]
    if job['status'] != 'complete':
        return jsonify({'error': 'Processing not complete'}), 400

    results = job.get('results', {})
    selected = results.get('selected', [])
    folder_path = job.get('folder_path', '')

    if not selected:
        return jsonify({'error': 'No selected photos'}), 404
    if not folder_path:
        return jsonify({'error': 'Folder path not found'}), 404

    # Create zip file
    memory_file = BytesIO()
    files_added = 0
    with zipfile.ZipFile(memory_file, 'w', zipfile.ZIP_DEFLATED) as zf:
        for photo in selected:
            filename = photo.get('filename', '')
            # Build full path from folder_path + filename
            photo_path = os.path.join(folder_path, filename)
            if os.path.exists(photo_path):
# ============================================
# DATASET SAVE/LOAD ROUTES
# ============================================

@app.route('/datasets')
def datasets_page():
    """Show saved datasets page."""
    return render_template('datasets.html')


@app.route('/api/datasets')
def list_datasets():
    """List all saved datasets (local + Supabase)."""
    datasets = []
    seen_names = set()

    # 1. Get local datasets
    if os.path.exists(DATASETS_FOLDER):
        for name in os.listdir(DATASETS_FOLDER):
            meta_path = os.path.join(DATASETS_FOLDER, name, 'metadata.json')
            if os.path.exists(meta_path):
                try:
                    with open(meta_path, 'r') as f:
                        meta = json.load(f)
                    meta['folder_name'] = name
                    meta['source'] = 'local'
                    datasets.append(meta)
                    seen_names.add(name)
                except Exception:
                    pass

    # 2. Get Supabase datasets (if available)
    if is_supabase_available():
        try:
            supabase_datasets = list_datasets_from_supabase()
            for meta in supabase_datasets:
                folder_name = meta.get('folder_name', '')
                # Only add if not already in local (local takes priority)
                if folder_name and folder_name not in seen_names:
                    meta['source'] = 'supabase'
                    datasets.append(meta)
        except Exception as e:
            print(f"[Datasets] Error fetching from Supabase: {e}")

    # Sort by date, newest first
    datasets.sort(key=lambda x: x.get('created_at', '') or '', reverse=True)

    return jsonify({'datasets': datasets, 'supabase_available': is_supabase_available()})
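
# On-disk layout written by save_dataset() below (one folder per dataset under
# DATASETS_FOLDER), shown here for reference:
#
#     datasets/<folder_name>/
#         metadata.json              # name, created_at, counts, thresholds
#         reference_embeddings.npz   # embeddings, average, threshold
#         face_results.json          # copy of the job's review JSON
#         confirmed_photos.json      # {"photos": [filenames...]}
#         thumbnails/                # copied thumbnails
#         photos/                    # copies of the confirmed originals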
@app.route('/save_dataset/<job_id>', methods=['POST'])
def save_dataset(job_id):
    """Save dataset after Step 3 review."""
    try:
        data = request.get_json()
        dataset_name = data.get('name', f"dataset_{job_id}")

        # Validate name (alphanumeric, underscore, hyphen, space only)
        import re
        safe_name = re.sub(r'[^a-zA-Z0-9_\- ]', '', dataset_name).strip()
        if not safe_name:
            safe_name = f"dataset_{job_id}"

        # Create folder name (replace spaces with underscores)
        folder_name = safe_name.replace(' ', '_')
        dataset_path = os.path.join(DATASETS_FOLDER, folder_name)

        # Check if already exists
        if os.path.exists(dataset_path):
            return jsonify({'error': f'Dataset "{safe_name}" already exists'}), 400

        os.makedirs(dataset_path, exist_ok=True)

        # Get job data
        if job_id not in processing_jobs:
            return jsonify({'error': 'Job not found'}), 404
        job = processing_jobs[job_id]
        session_id = job.get('session_id')

        # 1. Save reference embeddings
        if session_id and session_id in face_matchers:
            matcher = face_matchers[session_id]
            embeddings_path = os.path.join(dataset_path, 'reference_embeddings.npz')
            np.savez_compressed(
                embeddings_path,
                embeddings=np.array(matcher.reference_embeddings),
                average=matcher.average_embedding,
                threshold=matcher.similarity_threshold
            )

        # 2. Copy face results from review JSON
        review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
        if os.path.exists(review_file):
            shutil.copy(review_file, os.path.join(dataset_path, 'face_results.json'))

        # 3. Save confirmed photos list
        confirmed_photos = job.get('confirmed_photos', [])
        if not confirmed_photos:
            # Try loading from review JSON (Step 3) - contains filtered_photos
            review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
            if os.path.exists(review_file):
                with open(review_file, 'r') as f:
                    review_data = json.load(f)
                filtered = review_data.get('filtered_photos', [])
                confirmed_photos = [p['filename'] for p in filtered]

        # Fallback: Try loading from confirm step if not in memory
        if not confirmed_photos:
            results_file = os.path.join(RESULTS_FOLDER, f"{job_id}.json")
            if os.path.exists(results_file):
                with open(results_file, 'r') as f:
                    results_data = json.load(f)
                selected = results_data.get('selected_photos', [])
                rejected = results_data.get('rejected_photos', [])
                confirmed_photos = [p['filename'] for p in selected + rejected]

        with open(os.path.join(dataset_path, 'confirmed_photos.json'), 'w') as f:
            json.dump({'photos': confirmed_photos}, f)

        # 4. Copy thumbnails folder
        upload_dir = job.get('upload_dir', os.path.join(UPLOAD_FOLDER, job_id))
        thumb_dir = os.path.join(upload_dir, 'thumbnails')
        dataset_thumb_dir = os.path.join(dataset_path, 'thumbnails')
        if os.path.exists(thumb_dir):
            shutil.copytree(thumb_dir, dataset_thumb_dir)

        # 5. Copy original photos (for reload)
        photos_dir = os.path.join(dataset_path, 'photos')
        os.makedirs(photos_dir, exist_ok=True)
        for filename in confirmed_photos:
            src = os.path.join(upload_dir, filename)
            if os.path.exists(src):
                shutil.copy(src, os.path.join(photos_dir, filename))

        # 6. Save metadata
        metadata = {
            'name': safe_name,
            'created_at': datetime.now().isoformat(),
            'original_job_id': job_id,
            'session_id': session_id,
            'total_photos': len(confirmed_photos),
            'quality_mode': job.get('quality_mode', 'balanced'),
            'similarity_threshold': job.get('similarity_threshold', 0.4),
            'reference_count': len(face_matchers[session_id].reference_embeddings) if session_id in face_matchers else 0
        }
        with open(os.path.join(dataset_path, 'metadata.json'), 'w') as f:
            json.dump(metadata, f, indent=2)

        print(f"[Dataset] Saved '{safe_name}' with {len(confirmed_photos)} photos locally")
        # 7. Also save to Supabase (for persistence across HF restarts)
        supabase_saved = False
        if is_supabase_available():
            try:
                # Read embeddings file as bytes
                embeddings_path = os.path.join(dataset_path, 'reference_embeddings.npz')
                embeddings_data = None
                if os.path.exists(embeddings_path):
                    with open(embeddings_path, 'rb') as f:
                        embeddings_data = f.read()

                # Read face results
                face_results_path = os.path.join(dataset_path, 'face_results.json')
                face_results = {}
                if os.path.exists(face_results_path):
                    with open(face_results_path, 'r') as f:
                        face_results = json.load(f)

                # Save to Supabase
                if embeddings_data:
                    supabase_saved = save_dataset_to_supabase(
                        folder_name, embeddings_data, face_results, metadata
                    )
            except Exception as e:
                print(f"[Dataset] Supabase save error: {e}")

        return jsonify({
            'success': True,
            'name': safe_name,
            'folder_name': folder_name,
            'total_photos': len(confirmed_photos),
            'supabase_saved': supabase_saved
        })

    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
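
# Illustrative only: saving and then re-loading a dataset from the command line.
# The job id and dataset name are hypothetical; assumes the server runs on
# localhost:7860.
#
#     curl -X POST http://localhost:7860/save_dataset/abc12345 \
#          -H 'Content-Type: application/json' \
#          -d '{"name": "birthday_party_2024"}'
#
#     # Later, reload it into Step 3 review (or append ?goto=select for Step 4):
#     curl -L http://localhost:7860/load_dataset/birthday_party_2024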
@app.route('/load_dataset/<dataset_name>')
def load_dataset(dataset_name):
    """Load a saved dataset and redirect to review or selection."""
    try:
        dataset_path = os.path.join(DATASETS_FOLDER, dataset_name)
        from_supabase = False

        # Check if dataset exists locally
        if not os.path.exists(dataset_path):
            # Try loading from Supabase
            if is_supabase_available():
                print(f"[Dataset] Not found locally, trying Supabase...")
                supabase_data = load_dataset_from_supabase(dataset_name)
                if supabase_data:
                    from_supabase = True
                    # Redirect to re-upload page (photos not stored in Supabase)
                    return redirect(f'/reupload_photos/{dataset_name}')
                else:
                    return jsonify({'error': 'Dataset not found in local or Supabase'}), 404
            else:
                return jsonify({'error': 'Dataset not found'}), 404

        # Load metadata
        with open(os.path.join(dataset_path, 'metadata.json'), 'r') as f:
            metadata = json.load(f)

        # Create new job ID
        job_id = str(uuid.uuid4())[:8]
        new_session_id = str(uuid.uuid4())[:8]

        # Set up upload directory with photos
        upload_dir = os.path.join(UPLOAD_FOLDER, job_id)
        os.makedirs(upload_dir, exist_ok=True)

        # Copy photos from dataset
        dataset_photos_dir = os.path.join(dataset_path, 'photos')
        if os.path.exists(dataset_photos_dir):
            for filename in os.listdir(dataset_photos_dir):
                src = os.path.join(dataset_photos_dir, filename)
                shutil.copy(src, os.path.join(upload_dir, filename))

        # Copy thumbnails
        dataset_thumb_dir = os.path.join(dataset_path, 'thumbnails')
        if os.path.exists(dataset_thumb_dir):
            shutil.copytree(dataset_thumb_dir, os.path.join(upload_dir, 'thumbnails'))

        # Load reference embeddings into face_matchers
        embeddings_path = os.path.join(dataset_path, 'reference_embeddings.npz')
        if os.path.exists(embeddings_path):
            from photo_selector.face_matcher import FaceMatcher
            data = np.load(embeddings_path, allow_pickle=True)
            matcher = FaceMatcher(similarity_threshold=float(data['threshold']))
            matcher.reference_embeddings = list(data['embeddings'])
            matcher.average_embedding = data['average']
            face_matchers[new_session_id] = matcher
            session['face_session_id'] = new_session_id

        # Load confirmed photos
        confirmed_file = os.path.join(dataset_path, 'confirmed_photos.json')
        confirmed_photos = []
        if os.path.exists(confirmed_file):
            with open(confirmed_file, 'r') as f:
                confirmed_photos = json.load(f).get('photos', [])

        # Load face results
        face_results_path = os.path.join(dataset_path, 'face_results.json')
        review_data = None
        if os.path.exists(face_results_path):
            with open(face_results_path, 'r') as f:
                review_data = json.load(f)

        # Create processing job
        processing_jobs[job_id] = {
            'status': 'review_pending',
            'progress': 100,
            'message': 'Dataset loaded - ready for review',
            'upload_dir': upload_dir,
            'session_id': new_session_id,
            'has_reference_photos': True,
            'reference_count': metadata.get('reference_count', 0),
            'quality_mode': metadata.get('quality_mode', 'balanced'),
            'similarity_threshold': metadata.get('similarity_threshold', 0.4),
            'confirmed_photos': confirmed_photos,
            'review_data': review_data,
            'total_photos': len(confirmed_photos),
            'from_dataset': dataset_name
        }

        # Copy face results to results folder for step3
        if review_data:
            with open(os.path.join(RESULTS_FOLDER, f"{job_id}_review.json"), 'w') as f:
                json.dump(review_data, f)

        print(f"[Dataset] Loaded '{dataset_name}' as job {job_id}")

        # Check which page to go to
        goto = request.args.get('goto', 'review')
        if goto == 'select':
            # Go directly to Step 4 - start quality selection
            return redirect(f'/step4_results/{job_id}?from_dataset=1')
        else:
            # Go to Step 3 - review page
            return redirect(f'/step3_review/{job_id}')

    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500


@app.route('/delete_dataset/<dataset_name>', methods=['DELETE'])
def delete_dataset(dataset_name):
    """Delete a saved dataset (local and Supabase)."""
    try:
        deleted_local = False
        deleted_supabase = False

        # Delete local
        dataset_path = os.path.join(DATASETS_FOLDER, dataset_name)
        if os.path.exists(dataset_path):
            shutil.rmtree(dataset_path)
            deleted_local = True
            print(f"[Dataset] Deleted '{dataset_name}' locally")

        # Delete from Supabase
        if is_supabase_available():
            deleted_supabase = delete_dataset_from_supabase(dataset_name)

        if not deleted_local and not deleted_supabase:
            return jsonify({'error': 'Dataset not found'}), 404

        return jsonify({'success': True, 'deleted_local': deleted_local, 'deleted_supabase': deleted_supabase})

    except Exception as e:
        return jsonify({'error': str(e)}), 500


@app.route('/dataset_thumbnail/<dataset_name>/<filename>')
def dataset_thumbnail(dataset_name, filename):
    """Serve dataset thumbnail."""
    thumb_dir = os.path.join(DATASETS_FOLDER, dataset_name, 'thumbnails')
    return send_from_directory(thumb_dir, filename)


# ============================================
# SUPABASE RE-UPLOAD ROUTES
# ============================================

@app.route('/reupload_photos/<dataset_name>')
def reupload_photos_page(dataset_name):
    """Show page to re-upload photos for a Supabase dataset."""
    # Get metadata from Supabase
    if not is_supabase_available():
        return jsonify({'error': 'Supabase not available'}), 500

    supabase_data = load_dataset_from_supabase(dataset_name)
    if not supabase_data:
        return jsonify({'error': 'Dataset not found in Supabase'}), 404

    metadata = supabase_data.get('metadata', {})
    return render_template('reupload_photos.html', dataset_name=dataset_name, metadata=metadata)


@app.route('/download_from_gdrive/<dataset_name>', methods=['POST'])
def download_from_gdrive(dataset_name):
    """Download zip from Google Drive and process photos."""
    try:
        import re
        import zipfile
        import gdown

        data = request.get_json()
        gdrive_link = data.get('gdrive_link', '')

        print(f"[GDrive] Starting download for dataset '{dataset_name}'")
        print(f"[GDrive] Link: {gdrive_link}")

        # Extract file ID from Google Drive link
        file_id = None
        patterns = [
            r'/file/d/([a-zA-Z0-9_-]+)',
            r'id=([a-zA-Z0-9_-]+)',
            r'/d/([a-zA-Z0-9_-]+)'
        ]
        for pattern in patterns:
            match = re.search(pattern, gdrive_link)
            if match:
                file_id = match.group(1)
                break

        if not file_id:
            return jsonify({'error': 'Could not extract file ID from Google Drive link'}), 400

        print(f"[GDrive] File ID: {file_id}")
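        # For reference, the patterns above are meant to cover the usual share-link
        # formats (FILE_ID is illustrative):
        #   https://drive.google.com/file/d/FILE_ID/view?usp=sharing
        #   https://drive.google.com/open?id=FILE_ID
        #   https://drive.google.com/uc?id=FILE_ID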
        # Create job and upload directory
        job_id = str(uuid.uuid4())[:8]
        upload_dir = os.path.join(UPLOAD_FOLDER, job_id)
        os.makedirs(upload_dir, exist_ok=True)
        os.makedirs(os.path.join(upload_dir, 'thumbnails'), exist_ok=True)

        # Download using gdown (handles large files properly)
        zip_path = os.path.join(upload_dir, 'photos.zip')
        gdrive_url = f"https://drive.google.com/uc?id={file_id}"

        print(f"[GDrive] Downloading using gdown...")
        try:
            gdown.download(gdrive_url, zip_path, quiet=False, fuzzy=True)
        except Exception as e:
            print(f"[GDrive] gdown failed: {e}")
            # Retry without cookies (can help with large public files)
            try:
                gdown.download(gdrive_url, zip_path, quiet=False, fuzzy=True, use_cookies=False)
            except Exception as e2:
                print(f"[GDrive] gdown retry failed: {e2}")
                return jsonify({'error': f'Download failed: {str(e2)}'}), 400

        # Check if file was downloaded
        if not os.path.exists(zip_path) or os.path.getsize(zip_path) < 1000:
            print(f"[GDrive] ERROR: Download failed or file too small")
            return jsonify({'error': 'Download failed. Make sure the file is shared with "Anyone with link".'}), 400

        print(f"[GDrive] Download complete: {os.path.getsize(zip_path) / 1024 / 1024:.1f} MB")

        # Extract zip file
        print(f"[GDrive] Extracting zip file...")
        uploaded_filenames = []
        image_extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp', '.bmp', '.gif'}
        try:
            with zipfile.ZipFile(zip_path, 'r') as zf:
                for member in zf.namelist():
                    if member.endswith('/') or '/__MACOSX' in member or member.startswith('.'):
                        continue
                    ext = os.path.splitext(member.lower())[1]
                    if ext in image_extensions:
                        filename = secure_filename(os.path.basename(member))
                        if filename:
                            with zf.open(member) as src:
                                filepath = os.path.join(upload_dir, filename)
                                with open(filepath, 'wb') as dst:
                                    dst.write(src.read())
                            uploaded_filenames.append(filename)
                            if len(uploaded_filenames) % 200 == 0:
                                print(f"[GDrive] Extracted {len(uploaded_filenames)} files...")
            print(f"[GDrive] Extracted {len(uploaded_filenames)} photos")
        finally:
            # Clean up zip
            if os.path.exists(zip_path):
                os.remove(zip_path)

        # Load dataset from Supabase
        print(f"[GDrive] Loading dataset from Supabase...")
        supabase_data = load_dataset_from_supabase(dataset_name)
        if not supabase_data:
            return jsonify({'error': 'Dataset not found in Supabase'}), 404

        metadata = supabase_data.get('metadata', {})
        face_results = supabase_data.get('face_results', {})
        embeddings_data = supabase_data.get('embeddings_data')

        # Load reference embeddings
        new_session_id = str(uuid.uuid4())[:8]
        if embeddings_data:
            import io
            from photo_selector.face_matcher import FaceMatcher
            data_np = np.load(io.BytesIO(embeddings_data), allow_pickle=True)
            matcher = FaceMatcher(similarity_threshold=float(data_np['threshold']))
            matcher.reference_embeddings = list(data_np['embeddings'])
            matcher.average_embedding = data_np['average']
            face_matchers[new_session_id] = matcher
            session['face_session_id'] = new_session_id
            print(f"[GDrive] Loaded {len(matcher.reference_embeddings)} reference embeddings")

        # Match uploaded files with saved face results
        filtered_photos = face_results.get('filtered_photos', [])
        uploaded_set = set(uploaded_filenames)
        matched_photos = [p for p in filtered_photos if p.get('filename') in uploaded_set]
        print(f"[GDrive] Matched {len(matched_photos)} of {len(filtered_photos)} photos")

        # Create review data
        review_data = {
            'filtered_photos': matched_photos,
            'total_processed': len(uploaded_filenames),
            'match_count': len(matched_photos)
        }
        with open(os.path.join(RESULTS_FOLDER, f"{job_id}_review.json"), 'w') as f:
            json.dump(review_data, f)

        # Create processing job
        processing_jobs[job_id] = {
            'status': 'review_pending',
            'progress': 100,
            'message': 'Photos downloaded from Google Drive',
            'upload_dir': upload_dir,
            'session_id': new_session_id,
            'has_reference_photos': True,
            'reference_count': metadata.get('reference_count', 0),
            'quality_mode': metadata.get('quality_mode', 'balanced'),
            'similarity_threshold': metadata.get('similarity_threshold', 0.4),
            'confirmed_photos': [p['filename'] for p in matched_photos],
            'review_data': review_data,
            'total_photos': len(matched_photos),
            'from_dataset': dataset_name,
            'from_supabase': True
        }

        print(f"[GDrive] SUCCESS! Redirecting to step3_review/{job_id}")
        return jsonify({
            'success': True,
            'job_id': job_id,
            'matched_photos': len(matched_photos),
            'total_uploaded': len(uploaded_filenames),
            'redirect_url': f'/step3_review/{job_id}'
        })

    except Exception as e:
        print(f"[GDrive] Error: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
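
# ============================================
# CHUNKED RE-UPLOAD (three-step protocol)
# ============================================
# The routes below let the browser re-send a large photo set in smaller batches
# instead of one giant multipart request:
#   1. POST /start_chunked_upload/<dataset_name>   -> returns an upload_id
#   2. POST /upload_reupload_chunk/<dataset_name>  -> repeated, one batch of files each
#   3. POST /finish_chunked_upload/<dataset_name>  -> matches the received files
#      against the face results saved in Supabase and creates the review job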
# Store chunked upload sessions
chunked_uploads = {}


@app.route('/start_chunked_upload/<dataset_name>', methods=['POST'])
def start_chunked_upload(dataset_name):
    """Start a chunked upload session."""
    try:
        data = request.get_json()
        total_files = data.get('total_files', 0)
        total_chunks = data.get('total_chunks', 0)

        upload_id = str(uuid.uuid4())[:8]
        job_id = str(uuid.uuid4())[:8]
        upload_dir = os.path.join(UPLOAD_FOLDER, job_id)
        os.makedirs(upload_dir, exist_ok=True)
        os.makedirs(os.path.join(upload_dir, 'thumbnails'), exist_ok=True)

        chunked_uploads[upload_id] = {
            'dataset_name': dataset_name,
            'job_id': job_id,
            'upload_dir': upload_dir,
            'total_files': total_files,
            'total_chunks': total_chunks,
            'received_chunks': set(),
            'uploaded_filenames': []
        }

        print(f"[Chunked] Started upload session {upload_id} for dataset '{dataset_name}' ({total_files} files, {total_chunks} chunks)")
        return jsonify({'success': True, 'upload_id': upload_id})

    except Exception as e:
        print(f"[Chunked] Error starting session: {e}")
        return jsonify({'error': str(e)}), 500


@app.route('/upload_reupload_chunk/<dataset_name>', methods=['POST'])
def upload_reupload_chunk(dataset_name):
    """Receive a chunk of photos for reupload."""
    from werkzeug.exceptions import ClientDisconnected
    try:
        upload_id = request.form.get('upload_id')
        chunk_index = int(request.form.get('chunk_index', 0))

        if upload_id not in chunked_uploads:
            return jsonify({'error': 'Invalid upload session'}), 400

        session_data = chunked_uploads[upload_id]
        upload_dir = session_data['upload_dir']

        files = request.files.getlist('photos')
        if not files:
            return jsonify({'error': 'No files in chunk'}), 400

        # Save files from this chunk
        for file in files:
            if file and allowed_file(file.filename):
                filename = secure_filename(file.filename)
                filepath = os.path.join(upload_dir, filename)
                file.save(filepath)
                session_data['uploaded_filenames'].append(filename)

        session_data['received_chunks'].add(chunk_index)

        print(f"[Chunked] Upload {upload_id}: Received chunk {chunk_index + 1}/{session_data['total_chunks']} ({len(files)} files)")
        return jsonify({'success': True, 'chunk': chunk_index, 'files_saved': len(files)})

    except ClientDisconnected:
        # Client disconnected during upload - this is expected on slow connections
        print(f"[Chunked] Client disconnected during chunk upload (timeout)")
        return jsonify({'error': 'Connection timeout - please retry'}), 408
    except Exception as e:
        print(f"[Chunked] Error receiving chunk: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
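
# A minimal client-side sketch of the three-step protocol above. This is
# illustrative only: it is not called anywhere in the app, the host and chunk
# size are assumptions, and it requires the `requests` package (imported lazily
# so the app itself does not depend on it).
def _example_chunked_reupload(dataset_name, photo_paths, base_url='http://localhost:7860', chunk_size=50):
    """Illustrative only: drive start -> chunks -> finish for a re-upload."""
    import requests  # lazy import; only needed if this sketch is actually run

    chunks = [photo_paths[i:i + chunk_size] for i in range(0, len(photo_paths), chunk_size)]

    # 1. Open an upload session
    resp = requests.post(
        f'{base_url}/start_chunked_upload/{dataset_name}',
        json={'total_files': len(photo_paths), 'total_chunks': len(chunks)}
    )
    upload_id = resp.json()['upload_id']

    # 2. Send each chunk as multipart form data under the 'photos' field
    for index, chunk in enumerate(chunks):
        files = [('photos', (os.path.basename(p), open(p, 'rb'))) for p in chunk]
        try:
            requests.post(
                f'{base_url}/upload_reupload_chunk/{dataset_name}',
                data={'upload_id': upload_id, 'chunk_index': index},
                files=files
            )
        finally:
            for _, (_, fh) in files:
                fh.close()

    # 3. Finalize: the server matches filenames against the saved face results
    done = requests.post(
        f'{base_url}/finish_chunked_upload/{dataset_name}',
        json={'upload_id': upload_id}
    )
    return done.json().get('redirect_url')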
"""Finalize chunked upload and process photos.""" try: data = request.get_json() upload_id = data.get('upload_id') if upload_id not in chunked_uploads: return jsonify({'error': 'Invalid upload session'}), 400 session_data = chunked_uploads[upload_id] job_id = session_data['job_id'] upload_dir = session_data['upload_dir'] uploaded_filenames = session_data['uploaded_filenames'] print(f"[Chunked] Finalizing upload {upload_id}: {len(uploaded_filenames)} files received") # Load dataset from Supabase print(f"[Chunked] Loading dataset from Supabase...") supabase_data = load_dataset_from_supabase(dataset_name) if not supabase_data: return jsonify({'error': 'Dataset not found in Supabase'}), 404 metadata = supabase_data.get('metadata', {}) face_results = supabase_data.get('face_results', {}) embeddings_data = supabase_data.get('embeddings_data') # Load reference embeddings new_session_id = str(uuid.uuid4())[:8] if embeddings_data: import io from photo_selector.face_matcher import FaceMatcher data_np = np.load(io.BytesIO(embeddings_data), allow_pickle=True) matcher = FaceMatcher(similarity_threshold=float(data_np['threshold'])) matcher.reference_embeddings = list(data_np['embeddings']) matcher.average_embedding = data_np['average'] face_matchers[new_session_id] = matcher session['face_session_id'] = new_session_id print(f"[Chunked] Loaded {len(matcher.reference_embeddings)} reference embeddings") # Match uploaded files with saved face results filtered_photos = face_results.get('filtered_photos', []) uploaded_set = set(uploaded_filenames) matched_photos = [p for p in filtered_photos if p.get('filename') in uploaded_set] print(f"[Chunked] Matched {len(matched_photos)} of {len(filtered_photos)} photos") # Create review data review_data = { 'filtered_photos': matched_photos, 'total_processed': len(uploaded_filenames), 'match_count': len(matched_photos) } with open(os.path.join(RESULTS_FOLDER, f"{job_id}_review.json"), 'w') as f: json.dump(review_data, f) # Create processing job processing_jobs[job_id] = { 'status': 'review_pending', 'progress': 100, 'message': 'Photos matched with saved face results', 'upload_dir': upload_dir, 'session_id': new_session_id, 'has_reference_photos': True, 'reference_count': metadata.get('reference_count', 0), 'quality_mode': metadata.get('quality_mode', 'balanced'), 'similarity_threshold': metadata.get('similarity_threshold', 0.4), 'confirmed_photos': [p['filename'] for p in matched_photos], 'review_data': review_data, 'total_photos': len(matched_photos), 'from_dataset': dataset_name, 'from_supabase': True } # Clean up session del chunked_uploads[upload_id] print(f"[Chunked] SUCCESS! 
Redirecting to step3_review/{job_id}") return jsonify({ 'success': True, 'job_id': job_id, 'matched_photos': len(matched_photos), 'total_uploaded': len(uploaded_filenames), 'redirect_url': f'/step3_review/{job_id}' }) except Exception as e: print(f"[Chunked] Error finalizing: {e}") import traceback traceback.print_exc() return jsonify({'error': str(e)}), 500 @app.route('/process_reupload/', methods=['POST']) def process_reupload(dataset_name): """Process re-uploaded photos using saved face results from Supabase.""" from werkzeug.exceptions import ClientDisconnected try: print(f"[Reupload] Starting reupload for dataset '{dataset_name}'") # Load dataset from Supabase print(f"[Reupload] Loading dataset from Supabase...") supabase_data = load_dataset_from_supabase(dataset_name) if not supabase_data: print(f"[Reupload] ERROR: Dataset not found in Supabase") return jsonify({'error': 'Dataset not found in Supabase'}), 404 metadata = supabase_data.get('metadata', {}) face_results = supabase_data.get('face_results', {}) embeddings_data = supabase_data.get('embeddings_data') print(f"[Reupload] Dataset loaded: {len(face_results.get('filtered_photos', []))} photos in face results") # Create new job job_id = str(uuid.uuid4())[:8] new_session_id = str(uuid.uuid4())[:8] upload_dir = os.path.join(UPLOAD_FOLDER, job_id) os.makedirs(upload_dir, exist_ok=True) os.makedirs(os.path.join(upload_dir, 'thumbnails'), exist_ok=True) # Check if zip file was uploaded zipfile_upload = request.files.get('zipfile') uploaded_filenames = [] if zipfile_upload and zipfile_upload.filename.lower().endswith('.zip'): # Handle zip file upload import zipfile print(f"[Reupload] Received zip file: {zipfile_upload.filename}") # Save zip temporarily zip_path = os.path.join(upload_dir, 'upload.zip') zipfile_upload.save(zip_path) print(f"[Reupload] Zip saved, extracting...") # Extract zip file try: with zipfile.ZipFile(zip_path, 'r') as zf: # Get list of image files in zip image_extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp', '.bmp', '.gif'} for member in zf.namelist(): # Skip directories and hidden files if member.endswith('/') or '/__MACOSX' in member or member.startswith('.'): continue # Check if it's an image ext = os.path.splitext(member.lower())[1] if ext in image_extensions: # Extract with flat structure (no subdirectories) filename = secure_filename(os.path.basename(member)) if filename: # Read from zip and save to upload_dir with zf.open(member) as src: filepath = os.path.join(upload_dir, filename) with open(filepath, 'wb') as dst: dst.write(src.read()) uploaded_filenames.append(filename) if len(uploaded_filenames) % 200 == 0: print(f"[Reupload] Extracted {len(uploaded_filenames)} files...") print(f"[Reupload] Extracted {len(uploaded_filenames)} photos from zip") finally: # Clean up zip file if os.path.exists(zip_path): os.remove(zip_path) else: # Handle individual photo uploads files = request.files.getlist('photos') if not files or (len(files) == 1 and files[0].filename == ''): print(f"[Reupload] ERROR: No photos uploaded") return jsonify({'error': 'No photos uploaded'}), 400 print(f"[Reupload] Saving {len(files)} uploaded files (thumbnails skipped for speed)...") for i, file in enumerate(files): if file and allowed_file(file.filename): filename = secure_filename(file.filename) filepath = os.path.join(upload_dir, filename) file.save(filepath) uploaded_filenames.append(filename) # Log progress every 200 files if (i + 1) % 200 == 0: print(f"[Reupload] Saved {i + 1}/{len(files)} files...") print(f"[Reupload] 
        print(f"[Reupload] Saved {len(uploaded_filenames)} photos for dataset '{dataset_name}'")

        # Load reference embeddings
        print(f"[Reupload] Loading reference embeddings...")
        if embeddings_data:
            import io
            from photo_selector.face_matcher import FaceMatcher
            # Load directly from bytes using BytesIO (no temp file needed)
            data = np.load(io.BytesIO(embeddings_data), allow_pickle=True)
            matcher = FaceMatcher(similarity_threshold=float(data['threshold']))
            matcher.reference_embeddings = list(data['embeddings'])
            matcher.average_embedding = data['average']
            face_matchers[new_session_id] = matcher
            session['face_session_id'] = new_session_id
            print(f"[Reupload] Loaded {len(matcher.reference_embeddings)} reference embeddings")

        # Match uploaded files with saved face results
        print(f"[Reupload] Matching uploaded files with saved face results...")
        filtered_photos = face_results.get('filtered_photos', [])

        # Create a set for faster lookup
        uploaded_set = set(uploaded_filenames)

        # Filter to only photos that were uploaded
        matched_photos = []
        for photo in filtered_photos:
            if photo.get('filename') in uploaded_set:
                matched_photos.append(photo)

        print(f"[Reupload] Matched {len(matched_photos)} of {len(filtered_photos)} photos from face results")

        # Create review data
        review_data = {
            'filtered_photos': matched_photos,
            'total_processed': len(uploaded_filenames),
            'match_count': len(matched_photos)
        }

        # Save review data
        with open(os.path.join(RESULTS_FOLDER, f"{job_id}_review.json"), 'w') as f:
            json.dump(review_data, f)
        print(f"[Reupload] Saved review data")

        # Create processing job - mark as ready for quality selection
        processing_jobs[job_id] = {
            'status': 'review_pending',
            'progress': 100,
            'message': 'Photos matched with saved face results',
            'upload_dir': upload_dir,
            'session_id': new_session_id,
            'has_reference_photos': True,
            'reference_count': metadata.get('reference_count', 0),
            'quality_mode': metadata.get('quality_mode', 'balanced'),
            'similarity_threshold': metadata.get('similarity_threshold', 0.4),
            'confirmed_photos': [p['filename'] for p in matched_photos],
            'review_data': review_data,
            'total_photos': len(matched_photos),
            'from_dataset': dataset_name,
            'from_supabase': True
        }

        print(f"[Reupload] SUCCESS! Redirecting to step3_review/{job_id}")
        return jsonify({
            'success': True,
            'job_id': job_id,
            'matched_photos': len(matched_photos),
            'total_uploaded': len(uploaded_filenames),
            'redirect_url': f'/step3_review/{job_id}'
        })

    except ClientDisconnected:
        print(f"[Reupload] Client disconnected during upload (timeout)")
        return jsonify({'error': 'Connection timeout - please retry with smaller batch or better connection'}), 408
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500


if __name__ == '__main__':
    print("""
    ============================================
    PHOTO SELECTION WEB APP
    Open http://localhost:7860 in your browser
    (or the port set via the PORT environment variable)
    NEW: Automatic selection mode!
    The AI decides which photos to keep.
    TEST: /test-month for single folder testing
    ============================================
    """)
    # Use port 7860 for Hugging Face Spaces; override with PORT for local runs
    port = int(os.environ.get('PORT', 7860))
    app.run(debug=False, host='0.0.0.0', port=port)
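
# Illustrative only: running the server locally vs. on a Space. The module
# filename ("app.py") is an assumption; adjust to the actual filename.
#
#     # Local development on port 5000:
#     PORT=5000 python app.py
#
#     # Hugging Face Spaces / default:
#     python app.py        # serves on 0.0.0.0:7860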