Spaces:

rethinks
/

childYb

Running

File size: 188,180 Bytes

"""

Photo Selection Web App

Flask-based frontend for testing the photo selection pipeline

Now with AUTOMATIC selection - no target number needed!



Two-Stage Workflow with Review Step:

1. Upload reference photos of your child (2-3 photos)

2. Upload all event photos (e.g., 1000 photos)

3. System filters to find photos containing your child

4. USER REVIEWS filtered photos (can remove false positives)

5. Quality-based selection runs on confirmed photos

6. Final results shown

"""

import os
import json
import uuid
import shutil
from pathlib import Path
from datetime import datetime

# Load environment variables from .env file
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass  # dotenv not installed, use system env vars

from flask import Flask, render_template, request, jsonify, send_from_directory, send_file, session, redirect, Response
from werkzeug.utils import secure_filename
from werkzeug.exceptions import RequestEntityTooLarge
import numpy as np
from PIL import Image
import threading
import time

# Supabase integration
from supabase_storage import (
    is_supabase_available,
    save_dataset_to_supabase,
    load_dataset_from_supabase,
    list_datasets_from_supabase,
    delete_dataset_from_supabase
)

# HEIC support
try:
    from pillow_heif import register_heif_opener
    register_heif_opener()
except ImportError:
    pass

app = Flask(__name__, static_folder='static', template_folder='templates')
app.secret_key = 'photo_selector_secret_key_2024'  # For session management

# Configuration
UPLOAD_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'uploads')
RESULTS_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'results')
REFERENCE_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'references')
OUTPUT_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'selected_photos')  # Auto-save location
DATASETS_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'datasets')  # Saved datasets
ALLOWED_EXTENSIONS = {'jpg', 'jpeg', 'png', 'heic', 'heif', 'webp'}
MAX_CONTENT_LENGTH = 5 * 1024 * 1024 * 1024  # 5GB max (for large photo batches)

app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = MAX_CONTENT_LENGTH
app.config['MAX_FORM_MEMORY_SIZE'] = 5 * 1024 * 1024 * 1024  # 5GB for form data
app.config['MAX_FORM_PARTS'] = 10000  # Allow up to 10000 files in one upload

# Create directories
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(RESULTS_FOLDER, exist_ok=True)
os.makedirs(REFERENCE_FOLDER, exist_ok=True)
os.makedirs(DATASETS_FOLDER, exist_ok=True)

# Store processing status
processing_jobs = {}

# Store face matchers for sessions (reuse to avoid reloading model)
face_matchers = {}

# Store chunked upload sessions
upload_sessions = {}


# Error handler for large uploads
@app.errorhandler(RequestEntityTooLarge)
def handle_large_upload(error):
    return jsonify({
        'error': 'Upload too large. Try uploading fewer files at once (max ~500 files per batch).'
    }), 413


def allowed_file(filename):
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS


def create_thumbnail(image_path, thumb_path, size=(300, 300)):
    """Create a thumbnail for display with proper EXIF rotation."""
    from PIL import ExifTags
    try:
        with Image.open(image_path) as img:
            # Apply EXIF rotation before creating thumbnail
            try:
                for orientation in ExifTags.TAGS.keys():
                    if ExifTags.TAGS[orientation] == 'Orientation':
                        break
                exif = img._getexif()
                if exif is not None:
                    orientation_value = exif.get(orientation)
                    if orientation_value == 3:
                        img = img.rotate(180, expand=True)
                    elif orientation_value == 6:
                        img = img.rotate(270, expand=True)
                    elif orientation_value == 8:
                        img = img.rotate(90, expand=True)
            except (AttributeError, KeyError, IndexError):
                pass

            if img.mode != 'RGB':
                img = img.convert('RGB')
            img.thumbnail(size, Image.Resampling.LANCZOS)
            img.save(thumb_path, 'JPEG', quality=85)
        return True
    except Exception as e:
        print(f"Error creating thumbnail: {e}")
        return False


def get_thumbnail_name(filename):
    """

    Generate thumbnail name that includes the original extension to avoid collisions.



    Example: IMG_5801.HEIC -> thumb_IMG_5801_HEIC.jpg

             IMG_5801.jpg  -> thumb_IMG_5801_jpg.jpg

    """
    if '.' in filename:
        name, ext = filename.rsplit('.', 1)
        return f"thumb_{name}_{ext}.jpg"
    else:
        return f"thumb_{filename}.jpg"


def process_photos_face_filter_only(job_id, upload_dir, session_id=None):
    """

    Phase 1: Face filtering only.

    Scans all photos to find ones containing the target person.

    Returns filtered photos for user review before quality selection.

    """
    try:
        print(f"\n{'='*60}")
        print(f"[Job {job_id}] PHASE 1: Face Filtering Started")
        print(f"{'='*60}")

        processing_jobs[job_id]['status'] = 'processing'
        processing_jobs[job_id]['progress'] = 5
        processing_jobs[job_id]['message'] = 'Loading face recognition AI...'

        print(f"[Job {job_id}] Loading InsightFace face recognition model...")

        from photo_selector.face_matcher import FaceMatcher

        # Get face matcher
        face_matcher = None
        if session_id and session_id in face_matchers:
            face_matcher = face_matchers[session_id]
            if face_matcher.get_reference_count() == 0:
                face_matcher = None

        if face_matcher is None:
            print(f"[Job {job_id}] ERROR: No reference photos loaded!")
            processing_jobs[job_id]['status'] = 'error'
            processing_jobs[job_id]['message'] = 'No reference photos loaded'
            return

        ref_count = face_matcher.get_reference_count()
        print(f"[Job {job_id}] Reference photos loaded: {ref_count}")

        processing_jobs[job_id]['progress'] = 10
        processing_jobs[job_id]['message'] = 'Scanning photos for your child using InsightFace...'

        # Get all photo files
        photo_files = []
        for f in os.listdir(upload_dir):
            if allowed_file(f) and not f.startswith('thumb_'):
                photo_files.append(f)

        total_photos = len(photo_files)
        print(f"[Job {job_id}] Total photos to scan: {total_photos}")
        processing_jobs[job_id]['total_photos'] = total_photos
        processing_jobs[job_id]['message'] = f'Scanning {total_photos} photos for your child...'

        # Create thumbnails directory - always in uploads/<job_id>/thumbnails
        # This ensures thumbnails work for both browser upload and local folder mode
        is_local_folder = processing_jobs[job_id].get('is_local_folder', False)
        if is_local_folder:
            thumbs_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails')
        else:
            thumbs_dir = os.path.join(upload_dir, 'thumbnails')
        os.makedirs(thumbs_dir, exist_ok=True)

        # Get all photo paths
        photo_paths = [os.path.join(upload_dir, fn) for fn in photo_files]

        # Progress callback to update photos_checked
        def progress_callback(current, total, message):
            processing_jobs[job_id]['photos_checked'] = current
            processing_jobs[job_id]['message'] = f'Checked {current}/{total} photos...'
            # Update progress between 30-80%
            progress_pct = 30 + int((current / total) * 50) if total > 0 else 30
            processing_jobs[job_id]['progress'] = progress_pct

        # Run face filtering
        print(f"[Job {job_id}] Starting face detection and matching...")
        processing_jobs[job_id]['progress'] = 30
        filter_results = face_matcher.filter_photos(photo_paths, progress_callback=progress_callback)

        if 'error' in filter_results:
            print(f"[Job {job_id}] ERROR: Face matching failed - {filter_results['error']}")
            processing_jobs[job_id]['status'] = 'error'
            processing_jobs[job_id]['message'] = f"Face matching error: {filter_results['error']}"
            return

        # Print statistics
        stats = filter_results.get('statistics', {})
        matched_count = len(filter_results.get('matched_photos', []))
        unmatched_count = len(filter_results.get('unmatched_photos', []))

        print(f"\n[Job {job_id}] Face Filtering Results:")
        print(f"  - Photos with your child: {matched_count}")
        print(f"  - Photos without match: {unmatched_count}")
        print(f"  - Photos with no faces: {stats.get('no_faces', 0)}")
        # Handle match_rate which may be a string or float
        match_rate = stats.get('match_rate', 0)
        if isinstance(match_rate, str):
            print(f"  - Match rate: {match_rate}")
        else:
            print(f"  - Match rate: {match_rate:.1%}")

        processing_jobs[job_id]['progress'] = 70
        processing_jobs[job_id]['message'] = f'Creating thumbnails: 0/{matched_count}'

        print(f"[Job {job_id}] Creating thumbnails for {matched_count} matched photos...")

        # Prepare filtered photo data
        filtered_photos = []
        for i, match in enumerate(filter_results['matched_photos']):
            filename = os.path.basename(match['path'])
            thumb_name = get_thumbnail_name(filename)
            thumb_path = os.path.join(thumbs_dir, thumb_name)

            create_thumbnail(match['path'], thumb_path)

            filtered_photos.append({
                'filename': filename,
                'thumbnail': thumb_name,
                'face_match_score': match['similarity'],
                'num_faces': match['num_faces'],
                'matched_face_idx': match.get('matched_face_idx', 0),
                'face_bboxes': match.get('face_bboxes', [])  # Cached face locations for scoring
            })

            # Progress update every 10 photos or on last photo
            if (i + 1) % 10 == 0 or (i + 1) == matched_count:
                progress = 70 + int((i / matched_count) * 25)
                processing_jobs[job_id]['progress'] = progress
                processing_jobs[job_id]['message'] = f'Creating thumbnails: {i + 1}/{matched_count}'
                print(f"[Job {job_id}] Thumbnails created: {i + 1}/{matched_count}")

        # Sort by face match score (highest first)
        filtered_photos.sort(key=lambda x: x['face_match_score'], reverse=True)

        # Prepare unmatched photos data (photos where target was NOT found)
        unmatched_photos = []
        for unmatch in filter_results.get('unmatched_photos', []):
            filename = os.path.basename(unmatch['path'])
            # Get timestamp from EXIF if available
            timestamp = None
            try:
                from photo_selector.utils import get_photo_timestamp
                dt = get_photo_timestamp(unmatch['path'])
                if dt:
                    timestamp = dt.timestamp()
            except:
                pass
            unmatched_photos.append({
                'filename': filename,
                'best_similarity': unmatch.get('best_similarity', 0),
                'num_faces': unmatch.get('num_faces', 0),
                'timestamp': timestamp
            })

        # Also include photos with no faces detected
        for no_face in filter_results.get('no_faces_photos', []):
            filename = os.path.basename(no_face['path'])
            timestamp = None
            try:
                from photo_selector.utils import get_photo_timestamp
                dt = get_photo_timestamp(no_face['path'])
                if dt:
                    timestamp = dt.timestamp()
            except:
                pass
            unmatched_photos.append({
                'filename': filename,
                'best_similarity': 0,
                'num_faces': 0,
                'timestamp': timestamp
            })

        # Also include photos that had processing errors
        for error_photo in filter_results.get('error_photos', []):
            filename = os.path.basename(error_photo['path'])
            timestamp = None
            try:
                from photo_selector.utils import get_photo_timestamp
                dt = get_photo_timestamp(error_photo['path'])
                if dt:
                    timestamp = dt.timestamp()
            except:
                pass
            unmatched_photos.append({
                'filename': filename,
                'best_similarity': 0,
                'num_faces': 0,
                'timestamp': timestamp,
                'error': error_photo.get('error', 'Processing error')
            })

        # Sort unmatched by timestamp
        unmatched_photos.sort(key=lambda x: x.get('timestamp') or 0)

        # Store results for review
        review_data = {
            'total_uploaded': total_photos,
            'filtered_photos': filtered_photos,
            'unmatched_photos': unmatched_photos,
            'statistics': filter_results['statistics'],
            'reference_count': face_matcher.get_reference_count()
        }

        # Save review data
        review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
        with open(review_file, 'w') as f:
            json.dump(review_data, f, indent=2, default=str)

        processing_jobs[job_id]['progress'] = 100
        processing_jobs[job_id]['status'] = 'review_pending'
        processing_jobs[job_id]['message'] = f'Found your child in {len(filtered_photos)} of {total_photos} photos!'
        processing_jobs[job_id]['review_data'] = review_data

        print(f"\n[Job {job_id}] PHASE 1 COMPLETE!")
        print(f"  - Found {len(filtered_photos)} photos of your child")
        print(f"  - Status: review_pending (waiting for user to confirm)")
        print(f"  - Review data saved to: {review_file}")
        print(f"{'='*60}\n")

    except Exception as e:
        print(f"[Job {job_id}] EXCEPTION: {str(e)}")
        processing_jobs[job_id]['status'] = 'error'
        processing_jobs[job_id]['message'] = str(e)
        import traceback
        traceback.print_exc()


def process_drive_with_parallel_face_detection(job_id, folder_id, upload_dir, face_matcher):
    """

    HYBRID APPROACH: Download files from Google Drive while running face detection in parallel.



    This overlaps network I/O (downloading) with GPU compute (face detection) for faster processing.



    Flow:

    - Download thread: Downloads files and adds paths to queue

    - Face detection thread: Processes files from queue as they become ready

    - Both run simultaneously for maximum efficiency

    """
    import queue
    import threading

    print(f"\n{'='*60}")
    print(f"[Job {job_id}] HYBRID MODE: Parallel Download + Face Detection")
    print(f"{'='*60}")

    # Shared state
    file_queue = queue.Queue()
    results_lock = threading.Lock()
    matched_photos = []
    unmatched_photos = []
    no_faces_photos = []
    error_photos = []

    # Counters
    download_complete = threading.Event()
    total_files = [0]
    downloaded_count = [0]
    processed_count = [0]

    # Face detection worker
    def face_detection_worker():
        """Process files from queue as they become available."""
        while True:
            try:
                # Wait for file or check if download is complete
                try:
                    filepath = file_queue.get(timeout=1.0)
                except queue.Empty:
                    # Check if download is complete and queue is empty
                    if download_complete.is_set() and file_queue.empty():
                        break
                    continue

                if filepath is None:  # Poison pill
                    break

                # Process the file
                result = face_matcher.check_photo_for_target(filepath)

                with results_lock:
                    processed_count[0] += 1

                    if 'error' in result:
                        error_photos.append({'path': filepath, 'error': result['error']})
                    elif result['num_faces'] == 0:
                        no_faces_photos.append({'path': filepath, 'num_faces': 0})
                    elif result['contains_target']:
                        matched_photos.append({
                            'path': filepath,
                            'similarity': result['best_match_similarity'],
                            'num_faces': result['num_faces'],
                            'all_similarities': result.get('all_face_similarities', []),
                            'face_bboxes': result.get('face_bboxes', [])
                        })
                    else:
                        unmatched_photos.append({
                            'path': filepath,
                            'best_similarity': result['best_match_similarity'],
                            'num_faces': result['num_faces']
                        })

                    # Update progress (use unified message format)
                    if processed_count[0] % 10 == 0:
                        # After downloads complete, show scan-only progress
                        if download_complete.is_set():
                            pct = 30 + int((processed_count[0] / max(total_files[0], 1)) * 40)
                            processing_jobs[job_id]['progress'] = min(pct, 70)
                            processing_jobs[job_id]['message'] = f'Scanning faces: {processed_count[0]}/{total_files[0]}'
                        processing_jobs[job_id]['photos_checked'] = processed_count[0]
                        print(f"[Job {job_id}] [HYBRID] Downloaded: {downloaded_count[0]}, Face checked: {processed_count[0]}, Matched: {len(matched_photos)}")

                file_queue.task_done()

            except Exception as e:
                print(f"[Job {job_id}] Face detection error: {e}")
                continue

    # Callback when file is downloaded
    def on_file_ready(filepath):
        """Called by download_folder when each file is ready."""
        with results_lock:
            downloaded_count[0] += 1
        file_queue.put(filepath)

    # Progress callback for download
    def download_progress(current, total, _filename):
        total_files[0] = total
        pct = 5 + int((current / total) * 25)  # 5-30%
        processing_jobs[job_id]['progress'] = pct
        processing_jobs[job_id]['message'] = f'Downloading: {current}/{total}, Scanning: {processed_count[0]}'
        processing_jobs[job_id]['total_files'] = total

    try:
        processing_jobs[job_id]['status'] = 'processing'
        processing_jobs[job_id]['progress'] = 5
        processing_jobs[job_id]['message'] = 'Starting parallel download and face detection...'

        # Start face detection workers (use multiple threads for better throughput)
        num_workers = 4  # Face detection threads
        workers = []
        for _ in range(num_workers):
            t = threading.Thread(target=face_detection_worker)
            t.daemon = True
            t.start()
            workers.append(t)

        print(f"[Job {job_id}] Started {num_workers} face detection workers")

        # Start download (this will call on_file_ready for each file)
        print(f"[Job {job_id}] Starting Google Drive download with parallel face detection...")

        download_folder(
            folder_id,
            upload_dir,
            progress_callback=download_progress,
            file_ready_callback=on_file_ready
        )

        # Signal download complete
        download_complete.set()
        print(f"[Job {job_id}] Download complete. Waiting for face detection to finish...")

        # Wait for queue to be processed
        file_queue.join()

        # Send poison pills to stop workers
        for _ in workers:
            file_queue.put(None)

        # Wait for workers to finish
        for t in workers:
            t.join(timeout=5.0)

        print(f"\n[Job {job_id}] HYBRID Face Detection Results:")
        print(f"  - Photos with your child: {len(matched_photos)}")
        print(f"  - Photos without match: {len(unmatched_photos)}")
        print(f"  - Photos with no faces: {len(no_faces_photos)}")
        print(f"  - Photos with errors: {len(error_photos)}")
        if error_photos:
            print(f"  [ERRORS] First 5 error photos:")
            for ep in error_photos[:5]:
                print(f"    - {os.path.basename(ep['path'])}: {ep.get('error', 'Unknown error')}")

        # Now create thumbnails and prepare review data
        processing_jobs[job_id]['progress'] = 75
        processing_jobs[job_id]['message'] = f'Creating thumbnails for {len(matched_photos)} photos...'

        thumbs_dir = os.path.join(upload_dir, 'thumbnails')
        os.makedirs(thumbs_dir, exist_ok=True)

        filtered_photos = []
        for i, match in enumerate(matched_photos):
            filename = os.path.basename(match['path'])
            thumb_name = get_thumbnail_name(filename)
            thumb_path = os.path.join(thumbs_dir, thumb_name)

            create_thumbnail(match['path'], thumb_path)

            filtered_photos.append({
                'filename': filename,
                'thumbnail': thumb_name,
                'face_match_score': match['similarity'],
                'num_faces': match['num_faces'],
                'face_bboxes': match.get('face_bboxes', [])
            })

            if (i + 1) % 20 == 0:
                processing_jobs[job_id]['message'] = f'Creating thumbnails: {i + 1}/{len(matched_photos)}'

        # Sort by face match score
        filtered_photos.sort(key=lambda x: x['face_match_score'], reverse=True)

        # Prepare unmatched data
        unmatched_data = []
        for unmatch in unmatched_photos:
            filename = os.path.basename(unmatch['path'])
            unmatched_data.append({
                'filename': filename,
                'best_similarity': unmatch.get('best_similarity', 0),
                'num_faces': unmatch.get('num_faces', 0)
            })

        for no_face in no_faces_photos:
            filename = os.path.basename(no_face['path'])
            unmatched_data.append({
                'filename': filename,
                'best_similarity': 0,
                'num_faces': 0
            })

        # Also add error photos to unmatched (so they're visible to user)
        for error_photo in error_photos:
            filename = os.path.basename(error_photo['path'])
            unmatched_data.append({
                'filename': filename,
                'best_similarity': 0,
                'num_faces': 0,
                'error': error_photo.get('error', 'Processing error')
            })

        # Store results
        review_data = {
            'total_uploaded': total_files[0],
            'filtered_photos': filtered_photos,
            'unmatched_photos': unmatched_data,
            'statistics': {
                'total_scanned': total_files[0],
                'matched': len(matched_photos),
                'unmatched': len(unmatched_photos),
                'no_faces': len(no_faces_photos),
                'errors': len(error_photos),
                'match_rate': f"{(len(matched_photos) / max(total_files[0], 1) * 100):.1f}%"
            },
            'reference_count': face_matcher.get_reference_count()
        }

        # Save review data
        review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
        with open(review_file, 'w') as f:
            json.dump(review_data, f, indent=2, default=str)

        processing_jobs[job_id]['progress'] = 100
        processing_jobs[job_id]['status'] = 'review_pending'
        processing_jobs[job_id]['message'] = f'Found your child in {len(filtered_photos)} of {total_files[0]} photos!'
        processing_jobs[job_id]['review_data'] = review_data

        print(f"\n[Job {job_id}] HYBRID MODE COMPLETE!")
        print(f"  - Found {len(filtered_photos)} photos of your child")
        print(f"{'='*60}\n")

    except Exception as e:
        print(f"[Job {job_id}] HYBRID EXCEPTION: {str(e)}")
        processing_jobs[job_id]['status'] = 'error'
        processing_jobs[job_id]['message'] = str(e)
        import traceback
        traceback.print_exc()


def save_photos_by_month(job_id, upload_dir, selected_photos, rejected_photos, month_stats):
    """

    Automatically save both selected and not-selected photos organized by month.



    Creates folder structure:

    selected_photos/

    └── {job_id}_{timestamp}/

        ├── selected/

        │   ├── Jan/

        │   │   ├── photo1.jpg

        │   │   └── photo2.jpg

        │   ├── Feb/

        │   │   └── photo3.jpg

        │   └── ...

        ├── not_selected/

        │   ├── Jan/

        │   │   └── photo4.jpg

        │   ├── Feb/

        │   │   └── photo5.jpg

        │   └── ...

        └── summary.txt



    Args:

        job_id: The job identifier

        upload_dir: Source directory containing original photos

        selected_photos: List of selected photo dicts with 'filename' and 'month' keys

        rejected_photos: List of rejected photo dicts with 'filename' and 'month' keys

        month_stats: Statistics about each month's selection



    Returns:

        Path to the output folder

    """
    try:
        # Create output folder with timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_base = os.path.join(OUTPUT_FOLDER, f"{job_id}_{timestamp}")
        os.makedirs(output_base, exist_ok=True)

        print(f"\n{'='*60}")
        print(f"  AUTO-SAVING PHOTOS BY MONTH (SELECTED & NOT SELECTED)")
        print(f"{'='*60}")
        print(f"  Output folder: {output_base}")

        # Create selected and not_selected folders
        selected_base = os.path.join(output_base, "selected")
        not_selected_base = os.path.join(output_base, "not_selected")
        os.makedirs(selected_base, exist_ok=True)
        os.makedirs(not_selected_base, exist_ok=True)

        # Group selected photos by month
        selected_by_month = {}
        for photo in selected_photos:
            month = photo.get('month', 'Unknown')
            if month not in selected_by_month:
                selected_by_month[month] = []
            selected_by_month[month].append(photo)

        # Group rejected photos by month
        rejected_by_month = {}
        for photo in rejected_photos:
            month = photo.get('month', 'Unknown')
            if month not in rejected_by_month:
                rejected_by_month[month] = []
            rejected_by_month[month].append(photo)

        # Copy SELECTED photos to month folders
        print(f"\n  --- SELECTED PHOTOS ---")
        total_selected_copied = 0
        for month, photos in selected_by_month.items():
            month_folder = os.path.join(selected_base, month)
            os.makedirs(month_folder, exist_ok=True)

            print(f"  [selected/{month}] Saving {len(photos)} photos...")

            for photo in photos:
                src_path = os.path.join(upload_dir, photo['filename'])
                dst_path = os.path.join(month_folder, photo['filename'])

                if os.path.exists(src_path):
                    shutil.copy2(src_path, dst_path)
                    total_selected_copied += 1

        # Copy NOT SELECTED photos to month folders
        print(f"\n  --- NOT SELECTED PHOTOS ---")
        total_rejected_copied = 0
        for month, photos in rejected_by_month.items():
            month_folder = os.path.join(not_selected_base, month)
            os.makedirs(month_folder, exist_ok=True)

            print(f"  [not_selected/{month}] Saving {len(photos)} photos...")

            for photo in photos:
                src_path = os.path.join(upload_dir, photo['filename'])
                dst_path = os.path.join(month_folder, photo['filename'])

                if os.path.exists(src_path):
                    shutil.copy2(src_path, dst_path)
                    total_rejected_copied += 1

        # Create summary file
        summary_path = os.path.join(output_base, "summary.txt")
        with open(summary_path, 'w') as f:
            f.write("=" * 60 + "\n")
            f.write("  PHOTO SELECTION SUMMARY\n")
            f.write("=" * 60 + "\n\n")
            f.write(f"Job ID: {job_id}\n")
            f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Total Selected: {total_selected_copied} photos\n")
            f.write(f"Total Not Selected: {total_rejected_copied} photos\n")
            f.write(f"Grand Total: {total_selected_copied + total_rejected_copied} photos\n\n")

            f.write("-" * 40 + "\n")
            f.write("  BREAKDOWN BY MONTH\n")
            f.write("-" * 40 + "\n\n")
            f.write(f"{'Month':<12} {'Selected':>10} {'Not Selected':>14} {'Total':>8}\n")
            f.write(f"{'-'*12} {'-'*10} {'-'*14} {'-'*8}\n")

            for stat in month_stats:
                month = stat['month']
                selected = stat['selected']
                total = stat['total_photos']
                not_selected = total - selected
                f.write(f"{month:<12} {selected:>10} {not_selected:>14} {total:>8}\n")

            # Selected files by month
            f.write("\n" + "=" * 60 + "\n")
            f.write("  SELECTED FILES BY MONTH\n")
            f.write("=" * 60 + "\n")

            for month, photos in sorted(selected_by_month.items()):
                f.write(f"\n[{month}] - {len(photos)} selected photos:\n")
                for photo in sorted(photos, key=lambda x: x.get('score', 0), reverse=True):
                    score = photo.get('score', 0) * 100
                    cluster = photo.get('cluster_id', -1)
                    f.write(f"  + {photo['filename']} (Score: {score:.0f}%, Cluster: {cluster})\n")

            # Not selected files by month
            f.write("\n" + "=" * 60 + "\n")
            f.write("  NOT SELECTED FILES BY MONTH\n")
            f.write("=" * 60 + "\n")

            for month, photos in sorted(rejected_by_month.items()):
                f.write(f"\n[{month}] - {len(photos)} not selected photos:\n")
                for photo in sorted(photos, key=lambda x: x.get('score', 0), reverse=True):
                    score = photo.get('score', 0) * 100
                    cluster = photo.get('cluster_id', -1)
                    f.write(f"  - {photo['filename']} (Score: {score:.0f}%, Cluster: {cluster})\n")

        print(f"\n  SUMMARY:")
        print(f"  - Selected photos saved: {total_selected_copied}")
        print(f"  - Not selected photos saved: {total_rejected_copied}")
        print(f"  - Total photos saved: {total_selected_copied + total_rejected_copied}")
        print(f"  - Summary written to: {summary_path}")
        print(f"{'='*60}\n")

        return output_base

    except Exception as e:
        print(f"[ERROR] Failed to save photos by month: {str(e)}")
        import traceback
        traceback.print_exc()
        return None


def process_photos_quality_selection(job_id, upload_dir, quality_mode, similarity_threshold, confirmed_photos, face_data_cache=None, embedding_model='siglip'):
    """

    Phase 2: Month-based category-aware photo selection.

    Selects ~40 best photos per month with category diversity.



    Args:

        face_data_cache: Dict of filename -> {'num_faces': int, 'face_bboxes': list}

                        Cached face data from Step 2 to avoid re-detection

        embedding_model: 'siglip' or 'clip' - which embedding model to use

    """
    face_data_cache = face_data_cache or {}
    try:
        print(f"\n{'='*60}")
        print(f"[Job {job_id}] PHASE 2: Monthly Category-Aware Selection Started")
        print(f"{'='*60}")
        print(f"[Job {job_id}] Confirmed photos: {len(confirmed_photos)}")
        print(f"[Job {job_id}] Quality mode: {quality_mode}")
        print(f"[Job {job_id}] Similarity threshold: {similarity_threshold}")
        print(f"[Job {job_id}] Embedding model: {embedding_model.upper()}")

        processing_jobs[job_id]['status'] = 'processing'
        processing_jobs[job_id]['progress'] = 5
        processing_jobs[job_id]['message'] = f'Loading {embedding_model.upper()} model...'

        # Import the appropriate embedder based on selection
        from photo_selector.monthly_selector import MonthlyPhotoSelector
        if embedding_model == 'clip':
            from photo_selector.clip_embeddings import CLIPEmbedder as Embedder
            model_display_name = 'CLIP'
        else:
            from photo_selector.siglip_embeddings import SigLIPEmbedder as Embedder
            model_display_name = 'SigLIP'

        # Determine target per month based on quality mode
        if quality_mode == 'keep_more':
            target_per_month = 60  # More photos per month
        elif quality_mode == 'strict':
            target_per_month = 25  # Fewer, higher quality
        else:  # balanced
            target_per_month = 40  # Default

        print(f"[Job {job_id}] Target per month: {target_per_month}")

        # Step 1: Generate embeddings for confirmed photos (with caching)
        processing_jobs[job_id]['progress'] = 10
        processing_jobs[job_id]['message'] = f'Checking embedding cache...'

        print(f"[Job {job_id}] Processing {len(confirmed_photos)} photos for {model_display_name} embeddings...")

        # Import cache functions
        from supabase_storage import (
            compute_file_hash,
            get_cached_embeddings_batch,
            save_embeddings_batch,
            is_supabase_available
        )

        # Step 1a: Compute hashes for all files
        file_hashes = {}  # filename -> hash
        hash_to_filename = {}  # hash -> filename (for reverse lookup)

        print(f"[Job {job_id}] Computing file hashes...")
        for i, filename in enumerate(confirmed_photos):
            filepath = os.path.join(upload_dir, filename)
            if os.path.exists(filepath):
                file_hash = compute_file_hash(filepath)
                if file_hash:
                    file_hashes[filename] = file_hash
                    hash_to_filename[file_hash] = filename

            # Update progress (10-15%)
            if i % 100 == 0:
                progress = 10 + int((i / len(confirmed_photos)) * 5)
                processing_jobs[job_id]['progress'] = progress

        print(f"[Job {job_id}] Computed {len(file_hashes)} hashes")

        # Step 1b: Check cache for existing embeddings
        embeddings = {}
        cached_count = 0
        uncached_filenames = []

        if is_supabase_available() and file_hashes:
            processing_jobs[job_id]['message'] = f'Checking embedding cache...'
            all_hashes = list(file_hashes.values())

            # Query cache in batches (Supabase has query limits)
            cached_embeddings = {}
            batch_size = 500
            for i in range(0, len(all_hashes), batch_size):
                batch_hashes = all_hashes[i:i + batch_size]
                batch_result = get_cached_embeddings_batch(batch_hashes, embedding_model)
                cached_embeddings.update(batch_result)

            # Map cached embeddings back to filenames
            for filename, file_hash in file_hashes.items():
                if file_hash in cached_embeddings:
                    embeddings[filename] = cached_embeddings[file_hash]
                    cached_count += 1
                else:
                    uncached_filenames.append(filename)

            print(f"[Job {job_id}] Cache hit: {cached_count}/{len(file_hashes)} embeddings")
        else:
            uncached_filenames = list(file_hashes.keys())
            print(f"[Job {job_id}] Cache not available, computing all embeddings")

        # Step 1c: Compute embeddings for uncached files only
        newly_computed = {}
        if uncached_filenames:
            processing_jobs[job_id]['message'] = f'Analyzing {len(uncached_filenames)} photos with {model_display_name}...'
            print(f"[Job {job_id}] Computing {model_display_name} embeddings for {len(uncached_filenames)} uncached photos...")

            embedder = Embedder()

            for i, filename in enumerate(uncached_filenames):
                filepath = os.path.join(upload_dir, filename)
                if os.path.exists(filepath):
                    img = embedder.load_image(filepath)
                    if img is not None:
                        embedding = embedder.get_embedding(img)
                        if embedding is not None:
                            embeddings[filename] = embedding
                            newly_computed[filename] = embedding
                        img.close()

                # Update progress (15-30%)
                progress = 15 + int((i / len(uncached_filenames)) * 15)
                processing_jobs[job_id]['progress'] = progress

            print(f"[Job {job_id}] Computed {len(newly_computed)} new embeddings")

            # Step 1d: Save newly computed embeddings to cache
            if newly_computed and is_supabase_available():
                processing_jobs[job_id]['message'] = 'Saving embeddings to cache...'
                saved = save_embeddings_batch(newly_computed, file_hashes, embedding_model)
                print(f"[Job {job_id}] Saved {saved} embeddings to cache")

        print(f"[Job {job_id}] Total embeddings: {len(embeddings)} (cached: {cached_count}, computed: {len(newly_computed)})")

        # Step 2: Initialize monthly selector
        processing_jobs[job_id]['progress'] = 35
        processing_jobs[job_id]['message'] = 'Grouping photos by month...'

        # Note: duplicate_threshold is for CLIP embedding similarity (0.85 catches exact near-dupes)
        # diversity_threshold ensures we don't select visually similar photos (different scenes)
        # This is separate from face similarity_threshold (0.4-0.5 for face matching)
        selector = MonthlyPhotoSelector(
            target_per_month=target_per_month,
            duplicate_threshold=0.85,    # Remove exact duplicates (same moment, slight angle change)
            diversity_threshold=0.75     # Ensure selected photos are visually diverse
        )

        # Step 3: Group photos by month (only confirmed photos)
        # We need to manually build the photos_by_month structure for confirmed photos
        from collections import defaultdict

        MONTH_NAMES = {
            1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr",
            5: "May", 6: "Jun", 7: "Jul", 8: "Aug",
            9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec"
        }

        photos_by_month = defaultdict(list)

        # Debug: Track timestamp extraction success
        timestamp_found = 0
        timestamp_missing = 0

        for filename in confirmed_photos:
            filepath = os.path.join(upload_dir, filename)
            if not os.path.exists(filepath):
                print(f"[TIMESTAMP DEBUG] File not found: {filepath}")
                continue

            dt = selector.get_photo_date(filepath)
            if dt:
                timestamp_found += 1
            else:
                timestamp_missing += 1

            # Get cached face data if available
            cached_face = face_data_cache.get(filename, {})

            photo_info = {
                'filename': filename,
                'filepath': filepath,
                'date': dt.isoformat() if dt else None,
                'month': MONTH_NAMES.get(dt.month, "Unknown") if dt else "Unknown",
                'timestamp': dt.timestamp() if dt else None,
                # Cached face data from Step 2 (avoids re-detection)
                'num_faces': cached_face.get('num_faces'),
                'face_bboxes': cached_face.get('face_bboxes', [])
            }

            photos_by_month[photo_info['month']].append(photo_info)

        # Sort months in calendar order
        month_order = list(MONTH_NAMES.values()) + ['Unknown']
        photos_by_month = {m: photos_by_month[m] for m in month_order if m in photos_by_month}

        print(f"[TIMESTAMP DEBUG] Timestamps found: {timestamp_found}, missing: {timestamp_missing}")
        print(f"[Job {job_id}] Photos grouped into {len(photos_by_month)} months:")
        for month, photos in photos_by_month.items():
            print(f"  - {month}: {len(photos)} photos")

        # Step 4: Select best photos from each month (categories detected AFTER selection for speed)
        processing_jobs[job_id]['progress'] = 60
        processing_jobs[job_id]['message'] = 'Selecting best photos per month...'

        def progress_callback(msg):
            processing_jobs[job_id]['message'] = msg

        selection_results = selector.select_all_months(photos_by_month, embeddings, progress_callback)

        selected_photos = selection_results['selected']
        month_stats = selection_results['month_stats']
        summary = selection_results['summary']

        print(f"\n[Job {job_id}] Selection Results:")
        print(f"  - Total photos: {summary['total_photos']}")
        print(f"  - Selected: {summary['total_selected']}")
        print(f"  - Selection rate: {summary['selection_rate']*100:.1f}%")

        # Step 5: Detect categories ONLY for selected photos (much faster than all photos)
        processing_jobs[job_id]['progress'] = 75
        processing_jobs[job_id]['message'] = 'Detecting categories for selected photos...'

        print(f"[Job {job_id}] Detecting categories for {len(selected_photos)} selected photos...")
        selected_paths = [p['filepath'] for p in selected_photos]
        if selected_paths:
            selector._ensure_category_detector()
            categories = selector.category_detector.detect_categories_batch(selected_paths)
            for photo in selected_photos:
                # categories dict is keyed by filename, not filepath
                cat, conf = categories.get(photo['filename'], ('unknown', 0.0))
                photo['category'] = cat
                photo['category_confidence'] = conf

        # Update month_stats with category breakdown from selected photos only
        for stat in month_stats:
            month_name = stat['month']
            month_selected = [p for p in selected_photos if p.get('month') == month_name]
            cat_breakdown = {}
            for p in month_selected:
                cat = p.get('category', 'unknown')
                cat_breakdown[cat] = cat_breakdown.get(cat, 0) + 1
            stat['categories'] = cat_breakdown

        # Step 6: Build rejected list (photos not selected)
        # Note: rejection_reason is already set by monthly_selector.py
        selected_filenames = {p['filename'] for p in selected_photos}
        rejected_photos = []

        for month, photos in photos_by_month.items():
            for photo in photos:
                if photo['filename'] not in selected_filenames:
                    # Keep existing rejection_reason from monthly_selector, or set default
                    if not photo.get('rejection_reason'):
                        photo['rejection_reason'] = 'Not selected for month quota'
                    rejected_photos.append(photo)

        # Create thumbnails directory
        thumbs_dir = os.path.join(upload_dir, 'thumbnails')
        os.makedirs(thumbs_dir, exist_ok=True)

        # Calculate total thumbnails to create
        total_thumbnails = len(selected_photos) + len(rejected_photos)
        thumbnails_created = 0

        processing_jobs[job_id]['progress'] = 85
        processing_jobs[job_id]['message'] = f'Creating thumbnails: 0/{total_thumbnails}'

        # Build final results structure
        results = {
            'selected': [],
            'rejected': [],
            'summary': {
                'total_photos': summary['total_photos'],
                'selected_count': summary['total_selected'],
                'rejected_count': len(rejected_photos),
                'selection_rate': summary['selection_rate'],
                'face_filtering': {
                    'total_photos': processing_jobs[job_id].get('total_uploaded', len(confirmed_photos)),
                    'after_face_filter': len(confirmed_photos),
                    'user_confirmed': len(confirmed_photos)
                },
                'total_processed': len(confirmed_photos)
            },
            'month_stats': month_stats,
            'rejection_breakdown': {}
        }

        # Count rejection reasons
        rejection_counts = defaultdict(int)

        # Compute cluster stats for display on photo cards (per-month)
        # Cluster IDs are assigned per-month, so we need to track (month, cluster_id) pairs
        # Count total photos per (month, cluster_id)
        cluster_total_counts = defaultdict(int)
        for month, photos in photos_by_month.items():
            for photo in photos:
                cid = photo.get('cluster_id', -1)
                if cid != -1:
                    cluster_total_counts[(month, cid)] += 1

        # Count selected photos per (month, cluster_id)
        cluster_selected_counts = defaultdict(int)
        for photo in selected_photos:
            month = photo.get('month', 'Unknown')
            cid = photo.get('cluster_id', -1)
            if cid != -1:
                cluster_selected_counts[(month, cid)] += 1

        # Process selected photos
        for photo in selected_photos:
            filename = photo['filename']
            thumb_name = get_thumbnail_name(filename)
            thumb_path = os.path.join(thumbs_dir, thumb_name)

            create_thumbnail(os.path.join(upload_dir, filename), thumb_path)

            # Update thumbnail counter
            thumbnails_created += 1
            if thumbnails_created % 10 == 0 or thumbnails_created == total_thumbnails:
                processing_jobs[job_id]['message'] = f'Creating thumbnails: {thumbnails_created}/{total_thumbnails}'

            # Get embedding for this photo (convert to list for JSON serialization)
            photo_embedding = embeddings.get(filename)
            embedding_list = photo_embedding.tolist() if photo_embedding is not None else None

            # Get cluster stats for this photo (per-month)
            cid = photo.get('cluster_id', -1)
            month = photo.get('month', 'Unknown')
            cluster_total = cluster_total_counts.get((month, cid), 0) if cid != -1 else 0
            cluster_selected = cluster_selected_counts.get((month, cid), 0) if cid != -1 else 0

            results['selected'].append({
                'filename': filename,
                'thumbnail': thumb_name,
                'score': float(photo.get('total', 0)),
                'face_quality': float(photo.get('face_quality', 0)),
                'aesthetic_quality': float(photo.get('aesthetic_quality', 0)),
                'emotional_signal': float(photo.get('emotional_signal', 0)),
                'uniqueness': float(photo.get('uniqueness', 0)),
                'bucket': photo.get('month', 'unknown'),
                'month': month,
                'category': photo.get('category', 'unknown'),
                'num_faces': int(photo.get('num_faces', 0)),
                'cluster_id': cid,
                'original_cluster_id': photo.get('original_cluster_id', cid),
                'cluster_total': cluster_total,
                'cluster_selected': cluster_selected,
                'event_id': photo.get('event_id', -1),
                'max_similarity': float(photo.get('max_similarity', 0)),
                'embedding': embedding_list,
                'selection_reason': f"Best in {photo.get('category', 'category')} for {month}",
                'selection_detail': f"Selected from {month} - Category: {photo.get('category', 'unknown')}"
            })

        # Process rejected photos
        for photo in rejected_photos:
            filename = photo['filename']
            thumb_name = get_thumbnail_name(filename)
            thumb_path = os.path.join(thumbs_dir, thumb_name)

            create_thumbnail(os.path.join(upload_dir, filename), thumb_path)

            # Update thumbnail counter
            thumbnails_created += 1
            if thumbnails_created % 10 == 0 or thumbnails_created == total_thumbnails:
                processing_jobs[job_id]['message'] = f'Creating thumbnails: {thumbnails_created}/{total_thumbnails}'

            # Use actual rejection reason from monthly_selector
            rejection_reason = photo.get('rejection_reason', 'Better photos selected')

            # Categorize rejection reasons for breakdown chart
            if 'Event' in rejection_reason:
                breakdown_category = "Same event"
            elif 'Cluster' in rejection_reason:
                breakdown_category = "Same cluster"
            elif 'similar' in rejection_reason.lower():
                breakdown_category = "Too similar"
            elif 'Target' in rejection_reason:
                breakdown_category = "Target reached"
            else:
                breakdown_category = "Other"
            rejection_counts[breakdown_category] += 1

            # Get embedding for this photo (convert to list for JSON serialization)
            photo_embedding = embeddings.get(filename)
            embedding_list = photo_embedding.tolist() if photo_embedding is not None else None

            # Get cluster stats for this photo (per-month)
            cid = photo.get('cluster_id', -1)
            month = photo.get('month', 'Unknown')
            cluster_total = cluster_total_counts.get((month, cid), 0) if cid != -1 else 0
            cluster_selected = cluster_selected_counts.get((month, cid), 0) if cid != -1 else 0

            results['rejected'].append({
                'filename': filename,
                'thumbnail': thumb_name,
                'score': float(photo.get('total', 0)),
                'face_quality': float(photo.get('face_quality', 0)),
                'aesthetic_quality': float(photo.get('aesthetic_quality', 0)),
                'bucket': photo.get('month', 'unknown'),
                'month': month,
                'category': photo.get('category', 'unknown'),
                'cluster_id': cid,
                'original_cluster_id': photo.get('original_cluster_id', cid),
                'cluster_total': cluster_total,
                'cluster_selected': cluster_selected,
                'event_id': photo.get('event_id', -1),
                'max_similarity': float(photo.get('max_similarity', 0)),
                'embedding': embedding_list,
                'rejection_reason': rejection_reason,
                'reason': rejection_reason,
                'reason_detail': f"Category: {photo.get('category', 'unknown')}"
            })

        results['rejection_breakdown'] = dict(rejection_counts)

        # Add face filtering count to breakdown (photos where target face was not detected)
        face_filter_data = results['summary'].get('face_filtering', {})
        total_uploaded = face_filter_data.get('total_photos', 0)
        after_face_filter = face_filter_data.get('after_face_filter', 0)
        face_filtered_out = total_uploaded - after_face_filter
        if face_filtered_out > 0:
            results['rejection_breakdown']['Face not detected'] = face_filtered_out

        # Sort by score
        results['selected'].sort(key=lambda x: x['score'], reverse=True)
        results['rejected'].sort(key=lambda x: x['score'], reverse=True)

        # Save results
        results_file = os.path.join(RESULTS_FOLDER, f"{job_id}.json")
        with open(results_file, 'w') as f:
            json.dump(results, f, indent=2, default=str)

        processing_jobs[job_id]['status'] = 'complete'
        processing_jobs[job_id]['progress'] = 100
        processing_jobs[job_id]['message'] = 'Selection complete!'
        processing_jobs[job_id]['results'] = results

        print(f"\n[Job {job_id}] PHASE 2 COMPLETE!")
        print(f"  - Final selection: {len(results['selected'])} photos")
        print(f"  - Filtered out: {len(results['rejected'])} photos")
        print(f"  - Results saved to: {results_file}")
        print(f"\n=== Month Distribution ===")
        for stat in month_stats:
            print(f"  {stat['month']}: {stat['selected']}/{stat['total_photos']} ({stat['category_summary']})")
        print(f"{'='*60}\n")

        # Auto-save disabled - uncomment below to re-enable
        # output_folder = save_photos_by_month(job_id, upload_dir, selected_photos, rejected_photos, month_stats)
        # if output_folder:
        #     processing_jobs[job_id]['output_folder'] = output_folder
        #     print(f"[Job {job_id}] Photos auto-saved to: {output_folder}")

    except Exception as e:
        print(f"[Job {job_id}] EXCEPTION: {str(e)}")
        processing_jobs[job_id]['status'] = 'error'
        processing_jobs[job_id]['message'] = str(e)
        import traceback
        traceback.print_exc()


def process_photos_automatic(job_id, upload_dir, quality_mode, similarity_threshold, session_id=None):
    """

    Full automatic processing (no review step) - used when no reference photos loaded.

    Processes all photos with quality-based selection.

    """
    try:
        processing_jobs[job_id]['status'] = 'processing'
        processing_jobs[job_id]['progress'] = 5
        processing_jobs[job_id]['message'] = 'Loading AI models...'

        # Import pipeline components
        from photo_selector.siglip_embeddings import SigLIPEmbedder
        from photo_selector.temporal import TemporalSegmenter
        from photo_selector.clustering import PhotoClusterer, BucketClusterManager
        from photo_selector.scoring import PhotoScorer, ClusterScorer
        from photo_selector.auto_selector import SmartPhotoSelector, SelectionReason

        # Step 1: Embeddings (SigLIP for better visual understanding)
        processing_jobs[job_id]['progress'] = 20
        processing_jobs[job_id]['message'] = 'Analyzing photos with SigLIP AI...'

        embedder = SigLIPEmbedder()
        embeddings = embedder.process_folder(upload_dir)

        processing_jobs[job_id]['progress'] = 40
        processing_jobs[job_id]['message'] = 'Organizing by date...'

        # Step 2: Temporal segmentation
        segmenter = TemporalSegmenter(bucket_type="monthly")
        buckets = segmenter.segment_folder(upload_dir)

        # For clustering, use a reasonable estimate (will be refined by auto-selector)
        estimated_target = max(10, len(embeddings) // 3)
        targets = segmenter.calculate_target_per_bucket(buckets, estimated_target)

        processing_jobs[job_id]['progress'] = 50
        processing_jobs[job_id]['message'] = 'Grouping similar photos (adaptive clustering)...'

        # Step 3: Clustering (HDBSCAN with timestamp-weighted features, 24h gap splitting)
        # min_cluster_size=5 reduces single-photo clusters by requiring at least 5 similar photos
        clusterer = BucketClusterManager(PhotoClusterer(min_cluster_size=5, temporal_gap_hours=24.0, timestamp_weight=0.3))
        cluster_results = clusterer.cluster_all_buckets(buckets, embeddings, targets)

        processing_jobs[job_id]['progress'] = 60
        processing_jobs[job_id]['message'] = 'Scoring photo quality...'

        # Step 4: Score ALL photos
        scorer = ClusterScorer(PhotoScorer())
        all_scores = {}

        for bucket_key, bucket_data in cluster_results.items():
            filenames = bucket_data['filenames']
            labels = np.array(bucket_data['labels'])
            bucket_embeddings = np.array([embeddings[fn] for fn in filenames])

            for cluster_id in np.unique(labels):
                cluster_mask = labels == cluster_id
                cluster_indices = np.where(cluster_mask)[0]
                cluster_filenames = [filenames[i] for i in cluster_indices]
                cluster_embs = bucket_embeddings[cluster_mask]
                cluster_paths = [os.path.join(upload_dir, fn) for fn in cluster_filenames]

                scores = scorer.score_cluster(cluster_paths, cluster_embs)

                for score in scores:
                    score['bucket'] = bucket_key
                    score['cluster'] = int(cluster_id)
                    score['cluster_key'] = f"{bucket_key}_cluster_{cluster_id}"
                    all_scores[score['filename']] = score

        processing_jobs[job_id]['progress'] = 75
        processing_jobs[job_id]['message'] = 'AI deciding which photos to keep...'

        # Step 5: AUTOMATIC SELECTION
        auto_selector = SmartPhotoSelector(
            quality_mode=quality_mode,
            similarity_threshold=similarity_threshold
        )

        selection_results = auto_selector.process_all_photos(
            all_scores, embeddings, cluster_results
        )

        processing_jobs[job_id]['progress'] = 90
        processing_jobs[job_id]['message'] = 'Preparing results...'

        # Create thumbnails directory
        thumbs_dir = os.path.join(upload_dir, 'thumbnails')
        os.makedirs(thumbs_dir, exist_ok=True)

        # Prepare results
        results = {
            'selected': [],
            'rejected': [],
            'summary': selection_results['summary'],
            'rejection_breakdown': selection_results['rejection_breakdown'],
            'bucket_stats': selection_results['bucket_stats']
        }

        # Process selected photos
        for photo in selection_results['selected']:
            filename = photo['filename']
            thumb_name = get_thumbnail_name(filename)
            thumb_path = os.path.join(thumbs_dir, thumb_name)

            create_thumbnail(os.path.join(upload_dir, filename), thumb_path)

            reason = photo.get('selection_reason', None)
            if isinstance(reason, SelectionReason):
                reason_text = reason.value
            else:
                reason_text = str(reason) if reason else 'High quality photo'

            results['selected'].append({
                'filename': filename,
                'thumbnail': thumb_name,
                'score': float(photo.get('total', 0)),
                'face_quality': float(photo.get('face_quality', 0)),
                'aesthetic_quality': float(photo.get('aesthetic_quality', 0)),
                'emotional_signal': float(photo.get('emotional_signal', 0)),
                'uniqueness': float(photo.get('uniqueness', 0)),
                'bucket': photo.get('bucket', 'unknown'),
                'num_faces': int(photo.get('num_faces', 0)),
                'selection_reason': reason_text,
                'selection_detail': photo.get('selection_detail', reason_text)
            })

        # Process rejected photos
        for photo in selection_results['rejected']:
            filename = photo['filename']
            thumb_name = get_thumbnail_name(filename)
            thumb_path = os.path.join(thumbs_dir, thumb_name)

            create_thumbnail(os.path.join(upload_dir, filename), thumb_path)

            reason = photo.get('rejection_reason', None)
            if isinstance(reason, SelectionReason):
                reason_text = reason.value
            else:
                reason_text = str(reason) if reason else 'Did not meet quality threshold'

            results['rejected'].append({
                'filename': filename,
                'thumbnail': thumb_name,
                'score': float(photo.get('total', 0)),
                'face_quality': float(photo.get('face_quality', 0)),
                'aesthetic_quality': float(photo.get('aesthetic_quality', 0)),
                'bucket': photo.get('bucket', 'unknown'),
                'reason': reason_text,
                'reason_detail': photo.get('rejection_detail', '')
            })

        # Sort by score
        results['selected'].sort(key=lambda x: x['score'], reverse=True)
        results['rejected'].sort(key=lambda x: x['score'], reverse=True)

        # Save results
        results_file = os.path.join(RESULTS_FOLDER, f"{job_id}.json")
        with open(results_file, 'w') as f:
            json.dump(results, f, indent=2, default=str)

        processing_jobs[job_id]['status'] = 'complete'
        processing_jobs[job_id]['progress'] = 100
        processing_jobs[job_id]['message'] = 'Selection complete!'
        processing_jobs[job_id]['results'] = results

    except Exception as e:
        processing_jobs[job_id]['status'] = 'error'
        processing_jobs[job_id]['message'] = str(e)
        import traceback
        traceback.print_exc()


@app.route('/')
def index():
    """Main page - redirects to step 1 (reference upload)."""
    return render_template('index.html')


@app.route('/preload_model')
def preload_model():
    """Pre-load the InsightFace model in the background."""
    from photo_selector.face_matcher import FaceMatcher
    try:
        # Create a temporary matcher to trigger model download/load
        temp_matcher = FaceMatcher(similarity_threshold=0.5)
        if temp_matcher.is_initialized:
            return jsonify({'success': True, 'message': 'Model loaded'})
        else:
            return jsonify({'success': False, 'message': 'Model failed to initialize'})
    except Exception as e:
        return jsonify({'success': False, 'message': str(e)})


@app.route('/step1')
def step1_reference():
    """Step 1: Upload reference photos of target person."""
    # Create a new session ID if not exists
    if 'session_id' not in session:
        session['session_id'] = str(uuid.uuid4())[:8]
    return render_template('step1_reference.html', session_id=session['session_id'])


@app.route('/step2')
def step2_upload():
    """Step 2: Upload all event photos."""
    session_id = session.get('session_id')
    if not session_id:
        return render_template('index.html')

    # Check if we have reference photos loaded
    ref_count = 0
    if session_id in face_matchers:
        ref_count = face_matchers[session_id].get_reference_count()

    return render_template('step2_upload.html',
                          session_id=session_id,
                          reference_count=ref_count)


@app.route('/upload_reference', methods=['POST'])
def upload_reference():
    """Handle reference photo uploads (2-3 photos of target person)."""
    from photo_selector.face_matcher import FaceMatcher

    if 'files' not in request.files:
        return jsonify({'error': 'No files provided'}), 400

    files = request.files.getlist('files')
    if not files or files[0].filename == '':
        return jsonify({'error': 'No files selected'}), 400

    # Get or create session ID
    session_id = session.get('session_id')
    if not session_id:
        session_id = str(uuid.uuid4())[:8]
        session['session_id'] = session_id

    # Create reference directory for this session
    ref_dir = os.path.join(REFERENCE_FOLDER, session_id)
    os.makedirs(ref_dir, exist_ok=True)

    # Initialize face matcher for this session if not exists
    if session_id not in face_matchers:
        face_matchers[session_id] = FaceMatcher(similarity_threshold=0.5)

    matcher = face_matchers[session_id]

    # Process each reference photo
    results = []
    for file in files:
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            filepath = os.path.join(ref_dir, filename)
            file.save(filepath)

            # Add to face matcher
            result = matcher.add_reference_photo(filepath)
            result['filename'] = filename

            # Create thumbnail for preview
            thumb_name = get_thumbnail_name(filename)
            thumb_path = os.path.join(ref_dir, thumb_name)
            create_thumbnail(filepath, thumb_path, size=(150, 150))
            result['thumbnail'] = thumb_name

            results.append(result)

    return jsonify({
        'session_id': session_id,
        'results': results,
        'total_references': matcher.get_reference_count(),
        'message': f'Loaded {matcher.get_reference_count()} reference face(s)'
    })


@app.route('/reference_status')
def reference_status():
    """Get current reference photo status."""
    session_id = session.get('session_id')
    if not session_id or session_id not in face_matchers:
        return jsonify({
            'session_id': session_id,
            'reference_count': 0,
            'ready': False
        })

    matcher = face_matchers[session_id]
    return jsonify({
        'session_id': session_id,
        'reference_count': matcher.get_reference_count(),
        'ready': matcher.get_reference_count() >= 1
    })


@app.route('/clear_references', methods=['POST'])
def clear_references():
    """Clear all reference photos for current session."""
    session_id = session.get('session_id')

    if session_id and session_id in face_matchers:
        face_matchers[session_id].clear_references()

        # Delete reference files
        ref_dir = os.path.join(REFERENCE_FOLDER, session_id)
        if os.path.exists(ref_dir):
            shutil.rmtree(ref_dir)

    return jsonify({'message': 'References cleared', 'reference_count': 0})


@app.route('/reference_thumbnail/<filename>')
def get_reference_thumbnail(filename):
    """Serve reference photo thumbnails."""
    session_id = session.get('session_id')
    if not session_id:
        return jsonify({'error': 'No session'}), 404
    ref_dir = os.path.join(REFERENCE_FOLDER, session_id)
    return send_from_directory(ref_dir, filename)


# ============== CHUNKED UPLOAD ENDPOINTS ==============
# These endpoints allow uploading large batches of photos in smaller chunks
# to avoid 413 (Request Entity Too Large) errors on Hugging Face Spaces

@app.route('/upload_init', methods=['POST'])
def upload_init():
    """Initialize a chunked upload session."""
    data = request.json
    total_files = data.get('total_files', 0)
    quality_mode = data.get('quality_mode', 'balanced')
    similarity_threshold = data.get('similarity_threshold', 0.92)

    # Create a unique session ID for this upload
    upload_session_id = str(uuid.uuid4())[:8]
    upload_dir = os.path.join(UPLOAD_FOLDER, upload_session_id)
    os.makedirs(upload_dir, exist_ok=True)

    # Get face matcher session
    face_session_id = session.get('session_id')

    # Store session info
    upload_sessions[upload_session_id] = {
        'upload_dir': upload_dir,
        'total_files': total_files,
        'uploaded_files': [],
        'quality_mode': quality_mode,
        'similarity_threshold': similarity_threshold,
        'face_session_id': face_session_id,
        'created_at': time.time()
    }

    print(f"\n[Upload Session {upload_session_id}] Initialized for {total_files} files")

    return jsonify({
        'session_id': upload_session_id,
        'message': 'Upload session initialized'
    })


@app.route('/upload_chunk', methods=['POST'])
def upload_chunk():
    """Handle a chunk of files in a chunked upload."""
    if 'files' not in request.files:
        return jsonify({'error': 'No files provided'}), 400

    session_id = request.form.get('session_id')
    if not session_id or session_id not in upload_sessions:
        return jsonify({'error': 'Invalid upload session'}), 400

    upload_info = upload_sessions[session_id]
    upload_dir = upload_info['upload_dir']

    files = request.files.getlist('files')
    saved_count = 0

    for file in files:
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            # Handle duplicate filenames
            base, ext = os.path.splitext(filename)
            counter = 1
            while os.path.exists(os.path.join(upload_dir, filename)):
                filename = f"{base}_{counter}{ext}"
                counter += 1

            file.save(os.path.join(upload_dir, filename))
            upload_info['uploaded_files'].append(filename)
            saved_count += 1

    chunk_index = request.form.get('chunk_index', '?')
    print(f"[Upload Session {session_id}] Chunk {chunk_index}: saved {saved_count} files (total: {len(upload_info['uploaded_files'])})")

    return jsonify({
        'success': True,
        'saved': saved_count,
        'total_uploaded': len(upload_info['uploaded_files'])
    })


@app.route('/upload_complete', methods=['POST'])
def upload_complete():
    """Complete a chunked upload and start processing."""
    data = request.json
    session_id = data.get('session_id')

    if not session_id or session_id not in upload_sessions:
        return jsonify({'error': 'Invalid upload session'}), 400

    upload_info = upload_sessions[session_id]
    upload_dir = upload_info['upload_dir']
    saved_files = upload_info['uploaded_files']
    quality_mode = upload_info['quality_mode']
    similarity_threshold = upload_info['similarity_threshold']
    face_session_id = upload_info['face_session_id']

    if not saved_files:
        shutil.rmtree(upload_dir)
        del upload_sessions[session_id]
        return jsonify({'error': 'No valid image files uploaded'}), 400

    # Check if we have reference photos loaded
    has_references = False
    ref_count = 0
    if face_session_id and face_session_id in face_matchers:
        ref_count = face_matchers[face_session_id].get_reference_count()
        has_references = ref_count > 0

    # Create job (use same session_id as job_id for simplicity)
    job_id = session_id

    # Initialize job
    processing_jobs[job_id] = {
        'status': 'queued',
        'progress': 30,  # Start at 30% since upload is done
        'message': 'Starting AI processing...',
        'total_files': len(saved_files),
        'total_uploaded': len(saved_files),
        'upload_dir': upload_dir,
        'session_id': face_session_id,
        'has_reference_photos': has_references,
        'reference_count': ref_count,
        'quality_mode': quality_mode,
        'similarity_threshold': similarity_threshold,
        'results': None
    }

    # Clean up upload session
    del upload_sessions[session_id]

    # Decide which processing mode to use
    if has_references:
        print(f"\n[Job {job_id}] NEW JOB (Chunked Upload) - Face Filtering Mode")
        print(f"  - Files uploaded: {len(saved_files)}")
        print(f"  - Reference photos: {ref_count}")
        thread = threading.Thread(
            target=process_photos_face_filter_only,
            args=(job_id, upload_dir, face_session_id)
        )
        message = f'Scanning {len(saved_files)} photos to find your child using {ref_count} reference(s)...'
    else:
        print(f"\n[Job {job_id}] NEW JOB (Chunked Upload) - No Face Filtering")
        print(f"  - Files uploaded: {len(saved_files)}")
        thread = threading.Thread(
            target=process_photos_quality_selection,
            args=(job_id, upload_dir, quality_mode, similarity_threshold)
        )
        message = f'Selecting best photos from {len(saved_files)} images...'

    thread.daemon = True
    thread.start()

    processing_jobs[job_id]['message'] = message

    return jsonify({
        'job_id': job_id,
        'message': message,
        'total_files': len(saved_files)
    })


# ============== END CHUNKED UPLOAD ENDPOINTS ==============


# ============== GOOGLE DRIVE IMPORT ENDPOINTS ==============

# Import Google Drive module
try:
    from google_drive import (
        is_drive_available, extract_folder_id, list_images_in_folder,
        download_folder, get_folder_info, get_drive_service
    )
    GDRIVE_SERVICE_ACCOUNT_AVAILABLE = is_drive_available()
except ImportError:
    GDRIVE_SERVICE_ACCOUNT_AVAILABLE = False


@app.route('/check_drive_status')
def check_drive_status():
    """Check if Google Drive Service Account is configured."""
    return jsonify({
        'available': GDRIVE_SERVICE_ACCOUNT_AVAILABLE,
        'message': 'Service Account configured' if GDRIVE_SERVICE_ACCOUNT_AVAILABLE else 'Service Account not configured'
    })


@app.route('/preview_drive_folder', methods=['POST'])
def preview_drive_folder():
    """Preview contents of a Google Drive folder before importing."""
    if not GDRIVE_SERVICE_ACCOUNT_AVAILABLE:
        return jsonify({'error': 'Google Drive Service Account not configured'}), 400

    data = request.get_json()
    folder_url = data.get('folder_url', '').strip()

    if not folder_url:
        return jsonify({'error': 'Please provide a folder URL'}), 400

    try:
        folder_id = extract_folder_id(folder_url)
        info = get_folder_info(folder_id)

        if not info.get('success'):
            return jsonify({'error': info.get('error', 'Could not access folder')}), 400

        return jsonify({
            'success': True,
            'folder_id': folder_id,
            'folder_name': info.get('folder_name', 'Unknown'),
            'image_count': info.get('image_count', 0),
            'preview_images': info.get('images', [])[:5]
        })
    except ValueError as e:
        return jsonify({'error': str(e)}), 400
    except Exception as e:
        print(f"[Drive] Error previewing folder: {e}")
        return jsonify({'error': f'Could not access folder: {str(e)}'}), 400


@app.route('/import_from_drive', methods=['POST'])
def import_from_drive():
    """Import photos from Google Drive folder (Step 2 - initial upload)."""
    if not GDRIVE_SERVICE_ACCOUNT_AVAILABLE:
        return jsonify({'error': 'Google Drive Service Account not configured'}), 400

    data = request.get_json()
    folder_url = data.get('folder_url', '').strip()
    quality_mode = data.get('quality_mode', 'balanced')
    similarity_threshold = float(data.get('similarity_threshold', 0.4))

    if not folder_url:
        return jsonify({'error': 'Please provide a folder URL'}), 400

    # Get face session (step 1 stores it as 'session_id')
    face_session_id = session.get('session_id')
    has_references = False
    ref_count = 0
    if face_session_id and face_session_id in face_matchers:
        ref_count = face_matchers[face_session_id].get_reference_count()
        has_references = ref_count > 0

    try:
        folder_id = extract_folder_id(folder_url)
    except ValueError as e:
        return jsonify({'error': str(e)}), 400

    # Create job
    job_id = str(uuid.uuid4())[:8]
    upload_dir = os.path.join(UPLOAD_FOLDER, job_id)
    os.makedirs(upload_dir, exist_ok=True)
    os.makedirs(os.path.join(upload_dir, 'thumbnails'), exist_ok=True)

    # Initialize job
    processing_jobs[job_id] = {
        'status': 'downloading',
        'progress': 5,
        'message': 'Connecting to Google Drive...',
        'total_files': 0,
        'total_uploaded': 0,
        'upload_dir': upload_dir,
        'session_id': face_session_id,
        'has_reference_photos': has_references,
        'reference_count': ref_count,
        'quality_mode': quality_mode,
        'similarity_threshold': similarity_threshold,
        'results': None
    }

    # Start download in background thread
    def download_and_process():
        try:
            # HYBRID MODE: If we have face references, use parallel download + face detection
            if has_references:
                face_matcher = face_matchers.get(face_session_id)
                if face_matcher and face_matcher.get_reference_count() > 0:
                    print(f"[Job {job_id}] Using HYBRID MODE: Parallel download + face detection")
                    process_drive_with_parallel_face_detection(job_id, folder_id, upload_dir, face_matcher)
                    return

            # SEQUENTIAL MODE: Download all first, then process (for auto mode without face filtering)
            def progress_callback(current, total, _filename):
                pct = int(5 + (current / total) * 25)  # 5% to 30%
                processing_jobs[job_id]['progress'] = pct
                processing_jobs[job_id]['message'] = f'Downloading from Drive: {current}/{total}'
                processing_jobs[job_id]['total_files'] = total
                processing_jobs[job_id]['total_uploaded'] = current

            print(f"[Job {job_id}] Starting Google Drive download from folder {folder_id}")

            result = download_folder(folder_id, upload_dir, progress_callback)

            if not result.get('success') and result.get('downloaded', 0) == 0:
                processing_jobs[job_id]['status'] = 'error'
                processing_jobs[job_id]['message'] = result.get('message', 'Download failed')
                return

            downloaded_count = result.get('downloaded', 0) + result.get('skipped', 0)
            downloaded_files = result.get('files', [])
            processing_jobs[job_id]['total_uploaded'] = downloaded_count
            processing_jobs[job_id]['total_files'] = downloaded_count

            print(f"[Job {job_id}] Downloaded {downloaded_count} photos from Google Drive")

            # No face filtering - use all downloaded photos (auto mode)
            processing_jobs[job_id]['message'] = f'Selecting best from {downloaded_count} photos...'
            process_photos_quality_selection(job_id, upload_dir, quality_mode, similarity_threshold, downloaded_files)

        except Exception as e:
            print(f"[Job {job_id}] Drive import error: {e}")
            import traceback
            traceback.print_exc()
            processing_jobs[job_id]['status'] = 'error'
            processing_jobs[job_id]['message'] = f'Import failed: {str(e)}'

    thread = threading.Thread(target=download_and_process)
    thread.daemon = True
    thread.start()

    return jsonify({
        'job_id': job_id,
        'message': 'Starting Google Drive import...'
    })


@app.route('/import_from_drive_reupload/<dataset_name>', methods=['POST'])
def import_from_drive_reupload(dataset_name):
    """Import photos from Google Drive folder for reupload (after server restart)."""
    if not GDRIVE_SERVICE_ACCOUNT_AVAILABLE:
        return jsonify({'error': 'Google Drive Service Account not configured'}), 400

    data = request.get_json()
    folder_url = data.get('folder_url', '').strip()

    if not folder_url:
        return jsonify({'error': 'Please provide a folder URL'}), 400

    try:
        folder_id = extract_folder_id(folder_url)
    except ValueError as e:
        return jsonify({'error': str(e)}), 400

    # Create job
    job_id = str(uuid.uuid4())[:8]
    upload_dir = os.path.join(UPLOAD_FOLDER, job_id)
    os.makedirs(upload_dir, exist_ok=True)
    os.makedirs(os.path.join(upload_dir, 'thumbnails'), exist_ok=True)

    # Initialize job
    processing_jobs[job_id] = {
        'status': 'downloading',
        'progress': 5,
        'message': 'Connecting to Google Drive...'
    }

    # Start download and processing in background
    def download_and_process_reupload():
        try:
            def progress_callback(current, total, filename):
                pct = int(5 + (current / total) * 45)  # 5% to 50%
                processing_jobs[job_id]['progress'] = pct
                processing_jobs[job_id]['message'] = f'Downloading from Drive: {current}/{total}'

            print(f"[Job {job_id}] Starting Google Drive reupload for dataset '{dataset_name}'")

            result = download_folder(folder_id, upload_dir, progress_callback)

            if not result.get('success') and result.get('downloaded', 0) == 0:
                processing_jobs[job_id]['status'] = 'error'
                processing_jobs[job_id]['message'] = result.get('message', 'Download failed')
                return

            uploaded_filenames = result.get('files', [])
            print(f"[Job {job_id}] Downloaded {len(uploaded_filenames)} photos")

            # Load dataset from Supabase
            processing_jobs[job_id]['message'] = 'Loading saved dataset...'
            processing_jobs[job_id]['progress'] = 55

            supabase_data = load_dataset_from_supabase(dataset_name)
            if not supabase_data:
                processing_jobs[job_id]['status'] = 'error'
                processing_jobs[job_id]['message'] = 'Dataset not found in Supabase'
                return

            metadata = supabase_data.get('metadata', {})
            face_results = supabase_data.get('face_results', {})
            embeddings_data = supabase_data.get('embeddings_data')

            # Load reference embeddings
            new_session_id = str(uuid.uuid4())[:8]
            if embeddings_data:
                import io
                from photo_selector.face_matcher import FaceMatcher
                data_np = np.load(io.BytesIO(embeddings_data), allow_pickle=True)
                matcher = FaceMatcher(similarity_threshold=float(data_np['threshold']))
                matcher.reference_embeddings = list(data_np['embeddings'])
                matcher.average_embedding = data_np['average']
                face_matchers[new_session_id] = matcher
                # Note: Can't set session here (background thread) - session_id stored in processing_jobs
                print(f"[Job {job_id}] Loaded {len(matcher.reference_embeddings)} reference embeddings")

            # Match uploaded files with saved face results
            # Google Drive filenames differ from browser upload:
            # 1. Duplicates: IMG_5197(1).JPG vs IMG_51971.JPG
            # 2. Spaces: IMG_6970 Copy.JPG vs IMG_6970_Copy.JPG
            import re
            def normalize_filename(filename):
                """Normalize Google Drive filename to match browser upload format."""
                # Step 1: Convert (N) suffix to N (Google Drive duplicate handling)
                match = re.match(r'^(.+)\((\d+)\)(\.[^.]+)$', filename)
                if match:
                    base, num, ext = match.groups()
                    filename = f"{base}{num}{ext}"
                # Step 2: Apply secure_filename (spaces -> underscores, etc.)
                return secure_filename(filename)

            filtered_photos = face_results.get('filtered_photos', [])
            uploaded_set = set(uploaded_filenames)
            saved_filenames_set = {p.get('filename') for p in filtered_photos}

            # Create mapping: normalized_name -> actual_uploaded_name
            normalized_to_uploaded = {normalize_filename(f): f for f in uploaded_filenames}

            matched_photos = []
            for p in filtered_photos:
                saved_filename = p.get('filename')
                actual_filename = None

                # Try direct match first
                if saved_filename in uploaded_set:
                    actual_filename = saved_filename
                # Try normalized match (saved name matches normalized uploaded name)
                elif saved_filename in normalized_to_uploaded:
                    actual_filename = normalized_to_uploaded[saved_filename]

                if actual_filename:
                    # Use actual uploaded filename for the photo entry
                    photo_entry = p.copy()
                    photo_entry['filename'] = actual_filename
                    photo_entry['thumbnail'] = get_thumbnail_name(actual_filename)
                    matched_photos.append(photo_entry)

            # Debug: Find unmatched photos
            matched_saved = {p.get('filename') for p in filtered_photos if p.get('filename') in uploaded_set or p.get('filename') in normalized_to_uploaded}
            unmatched_from_saved = [p.get('filename') for p in filtered_photos if p.get('filename') not in matched_saved]
            matched_uploaded = {m['filename'] for m in matched_photos}
            unmatched_from_uploaded = [f for f in uploaded_filenames if f not in matched_uploaded]

            print(f"[Job {job_id}] Matched {len(matched_photos)} of {len(filtered_photos)} photos")
            print(f"[Job {job_id}] DEBUG: {len(unmatched_from_saved)} saved photos NOT found in uploaded files:")
            for fname in unmatched_from_saved[:20]:  # Show first 20
                print(f"  [SAVED NOT IN UPLOAD] '{fname}'")
            if len(unmatched_from_saved) > 20:
                print(f"  ... and {len(unmatched_from_saved) - 20} more")

            print(f"[Job {job_id}] DEBUG: {len(unmatched_from_uploaded)} uploaded files NOT found in saved data:")
            for fname in unmatched_from_uploaded[:20]:  # Show first 20
                print(f"  [UPLOAD NOT IN SAVED] '{fname}'")
            if len(unmatched_from_uploaded) > 20:
                print(f"  ... and {len(unmatched_from_uploaded) - 20} more")

            # Create review data
            review_data = {
                'filtered_photos': matched_photos,
                'total_processed': len(uploaded_filenames),
                'match_count': len(matched_photos)
            }

            with open(os.path.join(RESULTS_FOLDER, f"{job_id}_review.json"), 'w') as f:
                json.dump(review_data, f)

            # Update processing job
            processing_jobs[job_id].update({
                'status': 'review_pending',
                'progress': 100,
                'message': 'Photos downloaded from Google Drive',
                'upload_dir': upload_dir,
                'session_id': new_session_id,
                'has_reference_photos': True,
                'reference_count': metadata.get('reference_count', 0),
                'quality_mode': metadata.get('quality_mode', 'balanced'),
                'similarity_threshold': metadata.get('similarity_threshold', 0.4),
                'confirmed_photos': [p['filename'] for p in matched_photos],
                'review_data': review_data,
                'total_photos': len(matched_photos),
                'from_dataset': dataset_name,
                'from_supabase': True,
                'redirect_url': f'/step3_review/{job_id}'
            })

            print(f"[Job {job_id}] Reupload complete - ready for review")

        except Exception as e:
            print(f"[Job {job_id}] Drive reupload error: {e}")
            import traceback
            traceback.print_exc()
            processing_jobs[job_id]['status'] = 'error'
            processing_jobs[job_id]['message'] = f'Import failed: {str(e)}'

    thread = threading.Thread(target=download_and_process_reupload)
    thread.daemon = True
    thread.start()

    return jsonify({
        'job_id': job_id,
        'message': 'Starting Google Drive import...'
    })


# ============== END GOOGLE DRIVE IMPORT ENDPOINTS ==============


@app.route('/upload', methods=['POST'])
def upload_files():
    """Handle file uploads and start processing."""
    if 'files' not in request.files:
        return jsonify({'error': 'No files provided'}), 400

    files = request.files.getlist('files')
    if not files or files[0].filename == '':
        return jsonify({'error': 'No files selected'}), 400

    # Get parameters - now using quality_mode instead of target
    quality_mode = request.form.get('quality_mode', 'balanced')
    similarity_threshold = float(request.form.get('similarity', 0.92))

    # Get session ID for face matching
    session_id = session.get('session_id')

    # Create job
    job_id = str(uuid.uuid4())[:8]
    upload_dir = os.path.join(UPLOAD_FOLDER, job_id)
    os.makedirs(upload_dir, exist_ok=True)

    # Save files
    saved_files = []
    for file in files:
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            # Handle duplicate filenames
            base, ext = os.path.splitext(filename)
            counter = 1
            while os.path.exists(os.path.join(upload_dir, filename)):
                filename = f"{base}_{counter}{ext}"
                counter += 1

            file.save(os.path.join(upload_dir, filename))
            saved_files.append(filename)

    if not saved_files:
        shutil.rmtree(upload_dir)
        return jsonify({'error': 'No valid image files'}), 400

    # Check if we have reference photos loaded
    has_references = False
    ref_count = 0
    if session_id and session_id in face_matchers:
        ref_count = face_matchers[session_id].get_reference_count()
        has_references = ref_count > 0

    # Initialize job
    processing_jobs[job_id] = {
        'status': 'queued',
        'progress': 0,
        'message': 'Uploading files...',
        'total_files': len(saved_files),
        'total_uploaded': len(saved_files),
        'upload_dir': upload_dir,
        'session_id': session_id,
        'has_reference_photos': has_references,
        'reference_count': ref_count,
        'quality_mode': quality_mode,
        'similarity_threshold': similarity_threshold,
        'results': None
    }

    # Decide which processing mode to use
    if has_references:
        # With reference photos: Phase 1 = face filtering only, then review step
        print(f"\n[Job {job_id}] NEW JOB - Face Filtering Mode")
        print(f"  - Files uploaded: {len(saved_files)}")
        print(f"  - Reference photos: {ref_count}")
        print(f"  - Session ID: {session_id}")
        thread = threading.Thread(
            target=process_photos_face_filter_only,
            args=(job_id, upload_dir, session_id)
        )
        message = f'Scanning {len(saved_files)} photos to find your child using {ref_count} reference(s)...'
    else:
        # Without reference photos: Full automatic processing (no review step)
        print(f"\n[Job {job_id}] NEW JOB - Full Automatic Mode")
        print(f"  - Files uploaded: {len(saved_files)}")
        print(f"  - Quality mode: {quality_mode}")
        print(f"  - Similarity threshold: {similarity_threshold}")
        thread = threading.Thread(
            target=process_photos_automatic,
            args=(job_id, upload_dir, quality_mode, similarity_threshold, session_id)
        )
        message = 'Processing started - AI will automatically select the best photos!'

    thread.start()

    return jsonify({
        'job_id': job_id,
        'files_uploaded': len(saved_files),
        'has_reference_photos': has_references,
        'reference_count': ref_count,
        'message': message,
        'needs_review': has_references  # Client should redirect to review page
    })


@app.route('/upload_folder', methods=['POST'])
def upload_folder():
    """Process photos from a local folder path (for large batches)."""
    data = request.get_json()
    folder_path = data.get('folder_path', '').strip()
    quality_mode = data.get('quality_mode', 'balanced')
    similarity_threshold = float(data.get('similarity_threshold', 0.92))

    if not folder_path:
        return jsonify({'error': 'No folder path provided'}), 400

    # Validate folder exists
    if not os.path.isdir(folder_path):
        return jsonify({'error': f'Folder not found: {folder_path}'}), 400

    # Get session ID for face matching
    session_id = session.get('session_id')

    # Create job with reference to original folder
    job_id = str(uuid.uuid4())[:8]

    # Count valid image files
    image_extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp'}
    image_files = [f for f in os.listdir(folder_path)
                   if os.path.splitext(f.lower())[1] in image_extensions]

    if not image_files:
        return jsonify({'error': 'No valid image files found in folder'}), 400

    print(f"\n[Job {job_id}] LOCAL FOLDER MODE")
    print(f"  - Folder: {folder_path}")
    print(f"  - Images found: {len(image_files)}")

    # Check if we have reference photos loaded
    has_references = False
    ref_count = 0
    if session_id and session_id in face_matchers:
        ref_count = face_matchers[session_id].get_reference_count()
        has_references = ref_count > 0

    # Create thumbnails directory
    thumb_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails')
    os.makedirs(thumb_dir, exist_ok=True)

    # Initialize job - use original folder path as upload_dir
    processing_jobs[job_id] = {
        'status': 'queued',
        'progress': 0,
        'message': 'Preparing to process photos...',
        'total_files': len(image_files),
        'total_uploaded': len(image_files),
        'upload_dir': folder_path,  # Point to original folder
        'thumb_dir': thumb_dir,
        'session_id': session_id,
        'has_reference_photos': has_references,
        'reference_count': ref_count,
        'quality_mode': quality_mode,
        'similarity_threshold': similarity_threshold,
        'is_local_folder': True,  # Flag for local folder mode
        'results': None
    }

    # Decide which processing mode to use
    if has_references:
        print(f"  - Reference photos: {ref_count}")
        print(f"  - Mode: Face Filtering")
        thread = threading.Thread(
            target=process_photos_face_filter_only,
            args=(job_id, folder_path, session_id)
        )
        message = f'Scanning {len(image_files)} photos to find your child...'
    else:
        print(f"  - Mode: Full Automatic")
        thread = threading.Thread(
            target=process_photos_automatic,
            args=(job_id, folder_path, quality_mode, similarity_threshold, session_id)
        )
        message = 'Processing started - AI will automatically select the best photos!'

    thread.start()

    return jsonify({
        'job_id': job_id,
        'files_found': len(image_files),
        'has_reference_photos': has_references,
        'reference_count': ref_count,
        'message': message,
        'needs_review': has_references
    })


@app.route('/status/<job_id>')
def get_status(job_id):
    """Get processing status."""
    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404

    job = processing_jobs[job_id]
    response = {
        'status': job['status'],
        'progress': job['progress'],
        'message': job['message'],
        'total_photos': job.get('total_photos', 0),
        'photos_checked': job.get('photos_checked', 0)
    }

    if job['status'] == 'complete' and job['results']:
        response['summary'] = job['results']['summary']

    return jsonify(response)


@app.route('/results/<job_id>')
def get_results(job_id):
    """Get processing results."""
    try:
        if job_id not in processing_jobs:
            # Try loading from file
            results_file = os.path.join(RESULTS_FOLDER, f"{job_id}.json")
            if os.path.exists(results_file):
                with open(results_file, 'r') as f:
                    return jsonify(json.load(f))
            return jsonify({'error': 'Job not found'}), 404

        job = processing_jobs[job_id]
        if job['status'] != 'complete':
            return jsonify({'error': 'Processing not complete', 'status': job['status'], 'message': job.get('message', '')}), 400

        # Try from memory first, then file
        if 'results' in job and job['results']:
            return jsonify(job['results'])

        # Fallback to file
        results_file = os.path.join(RESULTS_FOLDER, f"{job_id}.json")
        if os.path.exists(results_file):
            with open(results_file, 'r') as f:
                return jsonify(json.load(f))

        return jsonify({'error': 'Results not found'}), 404
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500


@app.route('/thumbnail/<job_id>/<filename>')
def get_thumbnail(job_id, filename):
    """Serve thumbnail images, generating on-demand if needed."""
    thumb_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails')
    thumb_name = get_thumbnail_name(filename)
    thumb_path = os.path.join(thumb_dir, thumb_name)

    # If thumbnail exists, serve it
    if os.path.exists(thumb_path):
        return send_from_directory(thumb_dir, thumb_name)

    # Generate thumbnail on-demand for unmatched photos
    original_path = os.path.join(UPLOAD_FOLDER, job_id, filename)
    if os.path.exists(original_path):
        os.makedirs(thumb_dir, exist_ok=True)
        create_thumbnail(original_path, thumb_path)
        if os.path.exists(thumb_path):
            return send_from_directory(thumb_dir, thumb_name)

    # Fallback - try to serve the original filename from thumbnails
    if os.path.exists(os.path.join(thumb_dir, filename)):
        return send_from_directory(thumb_dir, filename)

    return jsonify({'error': 'Thumbnail not found'}), 404


@app.route('/photo/<job_id>/<filename>')
def get_photo(job_id, filename):
    """Serve full-size photos with proper EXIF rotation handling."""
    from io import BytesIO
    from PIL import ExifTags

    photo_dir = os.path.join(UPLOAD_FOLDER, job_id)
    filepath = os.path.join(photo_dir, filename)

    if not os.path.exists(filepath):
        return jsonify({'error': 'File not found'}), 404

    ext = os.path.splitext(filename)[1].lower()

    # Handle HEIC/HEIF - convert to JPEG
    if ext in ['.heic', '.heif']:
        try:
            img = Image.open(filepath)
            img = img.convert('RGB')
            buffer = BytesIO()
            img.save(buffer, format='JPEG', quality=90)
            buffer.seek(0)
            return send_file(buffer, mimetype='image/jpeg')
        except Exception as e:
            print(f"Error converting HEIC: {e}")
            return send_from_directory(photo_dir, filename)

    # Handle JPG/JPEG - apply EXIF rotation
    if ext in ['.jpg', '.jpeg']:
        try:
            img = Image.open(filepath)

            # Get EXIF orientation and rotate if needed
            try:
                for orientation in ExifTags.TAGS.keys():
                    if ExifTags.TAGS[orientation] == 'Orientation':
                        break
                exif = img._getexif()
                if exif is not None:
                    orientation_value = exif.get(orientation)
                    if orientation_value == 3:
                        img = img.rotate(180, expand=True)
                    elif orientation_value == 6:
                        img = img.rotate(270, expand=True)
                    elif orientation_value == 8:
                        img = img.rotate(90, expand=True)
            except (AttributeError, KeyError, IndexError):
                pass

            # Convert to RGB if needed (handles RGBA, P mode, etc.)
            if img.mode != 'RGB':
                img = img.convert('RGB')

            buffer = BytesIO()
            img.save(buffer, format='JPEG', quality=90)
            buffer.seek(0)
            return send_file(buffer, mimetype='image/jpeg')
        except Exception as e:
            print(f"Error processing JPEG: {e}")
            return send_from_directory(photo_dir, filename)

    # Other formats - serve directly
    return send_from_directory(photo_dir, filename)


@app.route('/download/<job_id>')
def download_selected(job_id):
    """Download selected photos as zip with timestamp-sorted naming.



    Uses DISK-BASED ZIP creation (not memory) to handle large photo sets (1000+).

    The ZIP is created on disk, then streamed to the browser in chunks.

    This prevents memory issues and timeouts on large downloads.

    """
    import zipfile
    import tempfile
    from datetime import datetime
    from collections import defaultdict

    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404

    job = processing_jobs[job_id]
    if job['status'] != 'complete':
        return jsonify({'error': 'Processing not complete'}), 400

    results = job.get('results', {})
    selected = results.get('selected', [])
    upload_dir = job.get('upload_dir', '')

    if not selected:
        return jsonify({'error': 'No selected photos found'}), 404

    if not upload_dir:
        return jsonify({'error': 'Upload directory not found'}), 404

    print(f"[Download] Starting disk-based ZIP for {len(selected)} photos...")

    # Month abbreviations
    MONTH_ABBREV = {
        1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr",
        5: "May", 6: "Jun", 7: "Jul", 8: "Aug",
        9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec"
    }

    # Import timestamp extractor
    from photo_selector.utils import get_photo_timestamp

    # Group photos by month and sort by timestamp
    photos_by_month = defaultdict(list)
    photos_no_timestamp = []

    for photo in selected:
        filename = photo.get('filename', '')
        ts = photo.get('timestamp')

        # If no timestamp stored, try to extract it from the photo file
        if not ts:
            photo_path = os.path.join(upload_dir, filename)
            if os.path.exists(photo_path):
                dt = get_photo_timestamp(photo_path)
                if dt:
                    ts = dt.timestamp()

        if ts:
            dt = datetime.fromtimestamp(ts)
            month_key = (dt.year, dt.month)  # Group by year-month to handle multi-year datasets
            photos_by_month[month_key].append({
                'filename': filename,
                'timestamp': ts,
                'datetime': dt
            })
        else:
            photos_no_timestamp.append({'filename': filename, 'timestamp': 0})

    # Sort photos within each month by timestamp
    for month_key in photos_by_month:
        photos_by_month[month_key].sort(key=lambda x: x['timestamp'])

    # Create ZIP file ON DISK (not in memory) to handle large photo sets
    temp_zip_path = os.path.join(tempfile.gettempdir(), f'selected_photos_{job_id}.zip')
    files_added = 0

    try:
        # Use ZIP_STORED (no compression) for faster creation with photos (already compressed)
        with zipfile.ZipFile(temp_zip_path, 'w', zipfile.ZIP_STORED) as zf:
            # Add photos with timestamps (sorted and renamed)
            for month_key in sorted(photos_by_month.keys()):
                year, month = month_key
                month_abbrev = MONTH_ABBREV[month]
                photos = photos_by_month[month_key]

                for idx, photo in enumerate(photos, start=1):
                    original_filename = photo['filename']
                    photo_path = os.path.join(upload_dir, original_filename)

                    if os.path.exists(photo_path):
                        # Create new filename: Jan_1_originalname.jpg
                        ext = os.path.splitext(original_filename)[1]
                        base_name = os.path.splitext(original_filename)[0]
                        new_filename = f"{month_abbrev}_{idx}_{base_name}{ext}"

                        zf.write(photo_path, new_filename)
                        files_added += 1

                        # Log progress every 100 files
                        if files_added % 100 == 0:
                            print(f"[Download] Added {files_added} files to ZIP...")
                    else:
                        print(f"[Download] File not found: {photo_path}")

            # Add photos without timestamps at the end with "NoDate" prefix
            for idx, photo in enumerate(photos_no_timestamp, start=1):
                original_filename = photo['filename']
                photo_path = os.path.join(upload_dir, original_filename)

                if os.path.exists(photo_path):
                    ext = os.path.splitext(original_filename)[1]
                    base_name = os.path.splitext(original_filename)[0]
                    new_filename = f"NoDate_{idx}_{base_name}{ext}"

                    zf.write(photo_path, new_filename)
                    files_added += 1
                else:
                    print(f"[Download] File not found: {photo_path}")

        if files_added == 0:
            # Clean up empty zip
            if os.path.exists(temp_zip_path):
                os.remove(temp_zip_path)
            return jsonify({'error': f'No files found in {upload_dir}. Files may have been cleaned up.'}), 404

        # Get file size for logging
        zip_size_mb = os.path.getsize(temp_zip_path) / (1024 * 1024)
        print(f"[Download] ZIP created: {files_added} files, {zip_size_mb:.1f} MB")

        # Stream the file to browser and delete after sending
        def generate_and_cleanup():
            """Generator that streams ZIP file and deletes it after completion."""
            try:
                with open(temp_zip_path, 'rb') as f:
                    while True:
                        chunk = f.read(8192 * 16)  # 128KB chunks for faster streaming
                        if not chunk:
                            break
                        yield chunk
            finally:
                # Clean up temp file after streaming
                try:
                    if os.path.exists(temp_zip_path):
                        os.remove(temp_zip_path)
                        print(f"[Download] Cleaned up temp ZIP: {temp_zip_path}")
                except Exception as e:
                    print(f"[Download] Error cleaning up temp ZIP: {e}")

        # Return streaming response
        response = Response(
            generate_and_cleanup(),
            mimetype='application/zip',
            headers={
                'Content-Disposition': f'attachment; filename=selected_photos_{job_id}.zip',
                'Content-Length': str(os.path.getsize(temp_zip_path))
            }
        )
        return response

    except Exception as e:
        # Clean up on error
        if os.path.exists(temp_zip_path):
            os.remove(temp_zip_path)
        print(f"[Download] Error creating ZIP: {e}")
        return jsonify({'error': f'Error creating ZIP: {str(e)}'}), 500


@app.route('/download_filtered/<job_id>')
def download_filtered(job_id):
    """Download all filtered photos (after face matching, before quality selection).



    Uses DISK-BASED ZIP creation (not memory) to handle large photo sets (1000+).

    """
    import zipfile
    import tempfile

    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404

    job = processing_jobs[job_id]

    # Get filtered photos from review data
    filtered_photos = []
    if 'review_data' in job:
        filtered_photos = [p['filename'] for p in job['review_data'].get('filtered_photos', [])]
    else:
        # Try to load from file
        review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
        if os.path.exists(review_file):
            with open(review_file, 'r') as f:
                review_data = json.load(f)
            filtered_photos = [p['filename'] for p in review_data.get('filtered_photos', [])]

    if not filtered_photos:
        return jsonify({'error': 'No filtered photos found'}), 404

    print(f"[Download] Starting disk-based ZIP for {len(filtered_photos)} filtered photos...")

    # Create ZIP file ON DISK (not in memory) to handle large photo sets
    temp_zip_path = os.path.join(tempfile.gettempdir(), f'filtered_photos_{job_id}.zip')
    files_added = 0

    try:
        with zipfile.ZipFile(temp_zip_path, 'w', zipfile.ZIP_STORED) as zf:
            for filename in filtered_photos:
                photo_path = os.path.join(job['upload_dir'], filename)
                if os.path.exists(photo_path):
                    zf.write(photo_path, filename)
                    files_added += 1
                    if files_added % 100 == 0:
                        print(f"[Download] Added {files_added} files to ZIP...")

        if files_added == 0:
            if os.path.exists(temp_zip_path):
                os.remove(temp_zip_path)
            return jsonify({'error': 'No files found. Files may have been cleaned up.'}), 404

        zip_size_mb = os.path.getsize(temp_zip_path) / (1024 * 1024)
        print(f"[Download] ZIP created: {files_added} files, {zip_size_mb:.1f} MB")

        # Stream the file and delete after sending
        def generate_and_cleanup():
            try:
                with open(temp_zip_path, 'rb') as f:
                    while True:
                        chunk = f.read(8192 * 16)  # 128KB chunks
                        if not chunk:
                            break
                        yield chunk
            finally:
                try:
                    if os.path.exists(temp_zip_path):
                        os.remove(temp_zip_path)
                        print(f"[Download] Cleaned up temp ZIP: {temp_zip_path}")
                except Exception as e:
                    print(f"[Download] Error cleaning up temp ZIP: {e}")

        return Response(
            generate_and_cleanup(),
            mimetype='application/zip',
            headers={
                'Content-Disposition': f'attachment; filename=filtered_photos_{job_id}.zip',
                'Content-Length': str(os.path.getsize(temp_zip_path))
            }
        )

    except Exception as e:
        if os.path.exists(temp_zip_path):
            os.remove(temp_zip_path)
        print(f"[Download] Error creating ZIP: {e}")
        return jsonify({'error': f'Error creating ZIP: {str(e)}'}), 500


@app.route('/download_unmatched/<job_id>')
def download_unmatched(job_id):
    """Download photos where target person was NOT detected, with timestamp-sorted naming."""
    import zipfile
    import tempfile
    from datetime import datetime
    from collections import defaultdict

    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404

    job = processing_jobs[job_id]
    upload_dir = job.get('upload_dir', '')

    if not upload_dir:
        return jsonify({'error': 'Upload directory not found'}), 404

    # Get unmatched photos from review data
    unmatched_photos = []
    if 'review_data' in job:
        unmatched_photos = job['review_data'].get('unmatched_photos', [])
    else:
        # Try to load from file
        review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
        if os.path.exists(review_file):
            with open(review_file, 'r') as f:
                review_data = json.load(f)
            unmatched_photos = review_data.get('unmatched_photos', [])

    if not unmatched_photos:
        return jsonify({'error': 'No unmatched photos found'}), 404

    print(f"[Download] Starting disk-based ZIP for {len(unmatched_photos)} unmatched photos...")

    # Month abbreviations
    MONTH_ABBREV = {
        1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr",
        5: "May", 6: "Jun", 7: "Jul", 8: "Aug",
        9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec"
    }

    # Import timestamp extractor
    from photo_selector.utils import get_photo_timestamp

    # Group photos by month and sort by timestamp
    photos_by_month = defaultdict(list)
    photos_no_timestamp = []

    for photo in unmatched_photos:
        filename = photo.get('filename', '')
        ts = photo.get('timestamp')

        # If no timestamp stored, try to extract it from the photo file
        if not ts:
            photo_path = os.path.join(upload_dir, filename)
            if os.path.exists(photo_path):
                dt = get_photo_timestamp(photo_path)
                if dt:
                    ts = dt.timestamp()

        if ts:
            dt = datetime.fromtimestamp(ts)
            month_key = (dt.year, dt.month)
            photos_by_month[month_key].append({
                'filename': filename,
                'timestamp': ts
            })
        else:
            photos_no_timestamp.append({'filename': filename})

    # Sort photos within each month by timestamp
    for month_key in photos_by_month:
        photos_by_month[month_key].sort(key=lambda x: x['timestamp'])

    # Create ZIP file ON DISK (not in memory) to handle large photo sets
    temp_zip_path = os.path.join(tempfile.gettempdir(), f'unmatched_photos_{job_id}.zip')
    files_added = 0

    try:
        with zipfile.ZipFile(temp_zip_path, 'w', zipfile.ZIP_STORED) as zf:
            # Add photos with timestamps (sorted and renamed)
            for month_key in sorted(photos_by_month.keys()):
                year, month = month_key
                month_abbrev = MONTH_ABBREV[month]
                photos = photos_by_month[month_key]

                for idx, photo in enumerate(photos, start=1):
                    original_filename = photo['filename']
                    photo_path = os.path.join(upload_dir, original_filename)

                    if os.path.exists(photo_path):
                        ext = os.path.splitext(original_filename)[1]
                        base_name = os.path.splitext(original_filename)[0]
                        new_filename = f"{month_abbrev}_{idx}_{base_name}{ext}"
                        zf.write(photo_path, new_filename)
                        files_added += 1
                        if files_added % 100 == 0:
                            print(f"[Download] Added {files_added} files to ZIP...")

            # Add photos without timestamps at the end
            for idx, photo in enumerate(photos_no_timestamp, start=1):
                original_filename = photo['filename']
                photo_path = os.path.join(upload_dir, original_filename)

                if os.path.exists(photo_path):
                    ext = os.path.splitext(original_filename)[1]
                    base_name = os.path.splitext(original_filename)[0]
                    new_filename = f"NoDate_{idx}_{base_name}{ext}"
                    zf.write(photo_path, new_filename)
                    files_added += 1

        if files_added == 0:
            if os.path.exists(temp_zip_path):
                os.remove(temp_zip_path)
            return jsonify({'error': 'No files found in upload directory'}), 404

        zip_size_mb = os.path.getsize(temp_zip_path) / (1024 * 1024)
        print(f"[Download] ZIP created: {files_added} files, {zip_size_mb:.1f} MB")

        # Stream the file and delete after sending
        def generate_and_cleanup():
            try:
                with open(temp_zip_path, 'rb') as f:
                    while True:
                        chunk = f.read(8192 * 16)  # 128KB chunks
                        if not chunk:
                            break
                        yield chunk
            finally:
                try:
                    if os.path.exists(temp_zip_path):
                        os.remove(temp_zip_path)
                        print(f"[Download] Cleaned up temp ZIP: {temp_zip_path}")
                except Exception as e:
                    print(f"[Download] Error cleaning up temp ZIP: {e}")

        return Response(
            generate_and_cleanup(),
            mimetype='application/zip',
            headers={
                'Content-Disposition': f'attachment; filename=unmatched_photos_{job_id}.zip',
                'Content-Length': str(os.path.getsize(temp_zip_path))
            }
        )

    except Exception as e:
        if os.path.exists(temp_zip_path):
            os.remove(temp_zip_path)
        print(f"[Download] Error creating ZIP: {e}")
        return jsonify({'error': f'Error creating ZIP: {str(e)}'}), 500


@app.route('/cleanup/<job_id>', methods=['POST'])
def cleanup_job(job_id):
    """Clean up job files."""
    if job_id in processing_jobs:
        upload_dir = processing_jobs[job_id].get('upload_dir')
        if upload_dir and os.path.exists(upload_dir):
            shutil.rmtree(upload_dir)
        del processing_jobs[job_id]

    results_file = os.path.join(RESULTS_FOLDER, f"{job_id}.json")
    if os.path.exists(results_file):
        os.remove(results_file)

    # Also clean up review file
    review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
    if os.path.exists(review_file):
        os.remove(review_file)

    return jsonify({'message': 'Cleaned up'})


# ==================== REVIEW WORKFLOW ROUTES ====================

@app.route('/step3_review/<job_id>')
def step3_review(job_id):
    """Step 3: Review filtered photos before quality selection."""
    if job_id not in processing_jobs:
        return render_template('index.html')

    job = processing_jobs[job_id]

    # Check if face filtering is complete
    if job['status'] not in ['review_pending', 'complete']:
        # Still processing or error - redirect back to step2
        return render_template('step2_upload.html',
                              session_id=session.get('session_id'),
                              reference_count=job.get('reference_count', 0))

    return render_template('step3_review.html', job_id=job_id)


@app.route('/review_data/<job_id>')
def get_review_data(job_id):
    """Get the filtered photos data for review."""
    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404

    job = processing_jobs[job_id]

    # Check if we have review data
    if 'review_data' in job:
        return jsonify(job['review_data'])

    # Try to load from file
    review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
    if os.path.exists(review_file):
        with open(review_file, 'r') as f:
            review_data = json.load(f)
        return jsonify(review_data)

    return jsonify({'error': 'Review data not found'}), 404


@app.route('/review_thumbnail/<job_id>/<filename>')
def get_review_thumbnail(job_id, filename):
    """Serve thumbnail for review page."""
    # Thumbnails are always stored in uploads/<job_id>/thumbnails
    thumb_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails')
    if os.path.exists(os.path.join(thumb_dir, filename)):
        return send_from_directory(thumb_dir, filename)

    # Fallback: check if thumbnails are in the upload_dir (for older jobs)
    if job_id in processing_jobs:
        job = processing_jobs[job_id]
        upload_dir = job.get('upload_dir', '')
        fallback_dir = os.path.join(upload_dir, 'thumbnails')
        if os.path.exists(os.path.join(fallback_dir, filename)):
            return send_from_directory(fallback_dir, filename)

    return send_from_directory(thumb_dir, filename)


@app.route('/review_photo/<job_id>/<filename>')
def get_review_photo(job_id, filename):
    """Serve full-size photo for review modal with EXIF rotation handling."""
    from io import BytesIO
    from PIL import ExifTags

    photo_dir = os.path.join(UPLOAD_FOLDER, job_id)
    filepath = os.path.join(photo_dir, filename)

    if not os.path.exists(filepath):
        return jsonify({'error': 'File not found'}), 404

    ext = os.path.splitext(filename)[1].lower()

    # Handle HEIC/HEIF - convert to JPEG
    if ext in ['.heic', '.heif']:
        try:
            img = Image.open(filepath)
            img = img.convert('RGB')
            buffer = BytesIO()
            img.save(buffer, format='JPEG', quality=90)
            buffer.seek(0)
            return send_file(buffer, mimetype='image/jpeg')
        except Exception as e:
            print(f"Error converting HEIC: {e}")
            return send_from_directory(photo_dir, filename)

    # Handle JPG/JPEG - apply EXIF rotation
    if ext in ['.jpg', '.jpeg']:
        try:
            img = Image.open(filepath)

            # Get EXIF orientation and rotate if needed
            try:
                for orientation in ExifTags.TAGS.keys():
                    if ExifTags.TAGS[orientation] == 'Orientation':
                        break
                exif = img._getexif()
                if exif is not None:
                    orientation_value = exif.get(orientation)
                    if orientation_value == 3:
                        img = img.rotate(180, expand=True)
                    elif orientation_value == 6:
                        img = img.rotate(270, expand=True)
                    elif orientation_value == 8:
                        img = img.rotate(90, expand=True)
            except (AttributeError, KeyError, IndexError):
                pass

            if img.mode != 'RGB':
                img = img.convert('RGB')

            buffer = BytesIO()
            img.save(buffer, format='JPEG', quality=90)
            buffer.seek(0)
            return send_file(buffer, mimetype='image/jpeg')
        except Exception as e:
            print(f"Error processing JPEG: {e}")
            return send_from_directory(photo_dir, filename)

    return send_from_directory(photo_dir, filename)


@app.route('/confirm_selection/<job_id>', methods=['POST'])
def confirm_selection(job_id):
    """User confirms their selection - proceed to quality-based selection."""
    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404

    job = processing_jobs[job_id]

    # Get confirmed photos from request
    data = request.get_json()
    if not data or 'selected_photos' not in data:
        return jsonify({'error': 'No photos selected'}), 400

    confirmed_photos = data['selected_photos']
    if len(confirmed_photos) == 0:
        return jsonify({'error': 'At least one photo must be selected'}), 400

    # Get embedding model selection (default to siglip)
    embedding_model = data.get('embedding_model', 'siglip')
    if embedding_model not in ['siglip', 'clip']:
        embedding_model = 'siglip'

    # Get processing parameters from job
    quality_mode = job.get('quality_mode', 'balanced')
    similarity_threshold = job.get('similarity_threshold', 0.92)
    upload_dir = job.get('upload_dir')

    # Load cached face data from review_data (to avoid re-detection in scoring)
    face_data_cache = {}
    if 'review_data' in job:
        for photo in job['review_data'].get('filtered_photos', []):
            filename = photo.get('filename')
            if filename:
                face_data_cache[filename] = {
                    'num_faces': photo.get('num_faces', 0),
                    'face_bboxes': photo.get('face_bboxes', [])
                }
    else:
        # Try loading from review file
        review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
        if os.path.exists(review_file):
            with open(review_file, 'r') as f:
                review_data = json.load(f)
            for photo in review_data.get('filtered_photos', []):
                filename = photo.get('filename')
                if filename:
                    face_data_cache[filename] = {
                        'num_faces': photo.get('num_faces', 0),
                        'face_bboxes': photo.get('face_bboxes', [])
                    }

    print(f"[Job {job_id}] Loaded face data cache for {len(face_data_cache)} photos")

    # Update job status
    job['status'] = 'processing'
    job['progress'] = 0
    job['message'] = 'Starting quality-based selection...'
    job['confirmed_photos'] = confirmed_photos

    # Start phase 2 processing
    thread = threading.Thread(
        target=process_photos_quality_selection,
        args=(job_id, upload_dir, quality_mode, similarity_threshold, confirmed_photos, face_data_cache, embedding_model)
    )
    thread.start()

    return jsonify({
        'message': f'Processing {len(confirmed_photos)} confirmed photos...',
        'confirmed_count': len(confirmed_photos)
    })


@app.route('/step4_results/<job_id>')
def step4_results(job_id):
    """Step 4: Final results page."""
    if job_id not in processing_jobs:
        return render_template('index.html')

    job = processing_jobs[job_id]

    # Check reference count from session
    session_id = session.get('session_id')
    ref_count = 0
    if session_id and session_id in face_matchers:
        ref_count = face_matchers[session_id].get_reference_count()

    return render_template('step4_results.html',
                          job_id=job_id,
                          reference_count=ref_count)


# ==================== TEST SINGLE MONTH ROUTES ====================

@app.route('/test-month')
def test_month_page():
    """Test page for single month photo selection."""
    return render_template('test_month.html')


@app.route('/test-month/start', methods=['POST'])
def test_month_start():
    """Start processing a single month folder."""
    data = request.get_json()
    folder_path = data.get('folder_path', '').strip()
    target = int(data.get('target', 40))
    organize_by_month = data.get('organize_by_month', False)

    if not folder_path:
        return jsonify({'error': 'No folder path provided'}), 400

    if not os.path.isdir(folder_path):
        return jsonify({'error': f'Folder not found: {folder_path}'}), 400

    # Count valid image files
    extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp'}
    image_files = [f for f in os.listdir(folder_path)
                   if os.path.splitext(f.lower())[1] in extensions]

    if not image_files:
        return jsonify({'error': 'No valid image files found in folder'}), 400

    # Create job
    job_id = str(uuid.uuid4())[:8]

    # Create thumbnails directory
    thumb_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails')
    os.makedirs(thumb_dir, exist_ok=True)

    processing_jobs[job_id] = {
        'status': 'processing',
        'progress': 0,
        'message': 'Starting test...',
        'folder_path': folder_path,
        'thumb_dir': thumb_dir,
        'target': target,
        'total_files': len(image_files),
        'results': None,
        'organize_by_month': organize_by_month
    }

    # Start processing in background
    thread = threading.Thread(
        target=process_test_month,
        args=(job_id, folder_path, target, thumb_dir, organize_by_month)
    )
    thread.start()

    return jsonify({
        'job_id': job_id,
        'total_photos': len(image_files),
        'target': target,
        'organize_by_month': organize_by_month,
        'message': f'Processing {len(image_files)} photos...'
    })


@app.route('/test-month/upload', methods=['POST'])
def test_month_upload():
    """Handle uploaded photos for test-month (for HuggingFace deployment)."""
    if 'photos' not in request.files:
        return jsonify({'error': 'No photos uploaded'}), 400

    files = request.files.getlist('photos')
    target = int(request.form.get('target', 40))
    organize_by_month = request.form.get('organize_by_month', 'false').lower() == 'true'

    if not files or len(files) == 0:
        return jsonify({'error': 'No photos uploaded'}), 400

    # Filter valid image files
    extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp'}
    valid_files = [f for f in files if f.filename and
                   os.path.splitext(f.filename.lower())[1] in extensions]

    if not valid_files:
        return jsonify({'error': 'No valid image files uploaded'}), 400

    # Create job and upload directory
    job_id = str(uuid.uuid4())[:8]
    upload_dir = os.path.join(UPLOAD_FOLDER, job_id, 'photos')
    thumb_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails')
    os.makedirs(upload_dir, exist_ok=True)
    os.makedirs(thumb_dir, exist_ok=True)

    # Save uploaded files
    saved_files = []
    for f in valid_files:
        filename = secure_filename(f.filename)
        # Handle duplicate filenames
        base, ext = os.path.splitext(filename)
        counter = 1
        while os.path.exists(os.path.join(upload_dir, filename)):
            filename = f"{base}_{counter}{ext}"
            counter += 1

        filepath = os.path.join(upload_dir, filename)
        f.save(filepath)
        saved_files.append(filename)

    processing_jobs[job_id] = {
        'status': 'processing',
        'progress': 0,
        'message': 'Starting test...',
        'folder_path': upload_dir,  # Use upload dir as folder path
        'thumb_dir': thumb_dir,
        'target': target,
        'total_files': len(saved_files),
        'results': None,
        'is_upload': True,
        'organize_by_month': organize_by_month
    }

    # Start processing in background
    thread = threading.Thread(
        target=process_test_month,
        args=(job_id, upload_dir, target, thumb_dir, organize_by_month)
    )
    thread.start()

    return jsonify({
        'job_id': job_id,
        'total_photos': len(saved_files),
        'target': target,
        'organize_by_month': organize_by_month,
        'message': f'Processing {len(saved_files)} uploaded photos...'
    })


@app.route('/test-month/upload-init', methods=['POST'])
def test_month_upload_init():
    """Initialize chunked upload for test-month."""
    data = request.json
    total_files = data.get('total_files', 0)
    target = data.get('target', 40)
    organize_by_month = data.get('organize_by_month', False)

    job_id = str(uuid.uuid4())[:8]
    upload_dir = os.path.join(UPLOAD_FOLDER, job_id, 'photos')
    thumb_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails')
    os.makedirs(upload_dir, exist_ok=True)
    os.makedirs(thumb_dir, exist_ok=True)

    # Store upload session
    session_id = f"test_{job_id}"
    upload_sessions[session_id] = {
        'job_id': job_id,
        'upload_dir': upload_dir,
        'thumb_dir': thumb_dir,
        'target': target,
        'organize_by_month': organize_by_month,
        'total_files': total_files,
        'uploaded_files': []
    }

    print(f"[Test-Month Upload {job_id}] Initialized for {total_files} files")

    return jsonify({
        'session_id': session_id,
        'job_id': job_id
    })


@app.route('/test-month/upload-chunk', methods=['POST'])
def test_month_upload_chunk():
    """Handle a chunk of files for test-month."""
    session_id = request.form.get('session_id')
    if not session_id or session_id not in upload_sessions:
        return jsonify({'error': 'Invalid session'}), 400

    session_data = upload_sessions[session_id]
    upload_dir = session_data['upload_dir']
    files = request.files.getlist('files')

    extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp'}
    saved_count = 0

    for f in files:
        if f and f.filename:
            ext = os.path.splitext(f.filename.lower())[1]
            if ext in extensions:
                filename = secure_filename(f.filename)
                # Handle duplicate filenames
                base, ext = os.path.splitext(filename)
                counter = 1
                while os.path.exists(os.path.join(upload_dir, filename)):
                    filename = f"{base}_{counter}{ext}"
                    counter += 1

                f.save(os.path.join(upload_dir, filename))
                session_data['uploaded_files'].append(filename)
                saved_count += 1

    chunk_index = request.form.get('chunk_index', '?')
    print(f"[Test-Month Upload {session_data['job_id']}] Chunk {chunk_index}: saved {saved_count} files (total: {len(session_data['uploaded_files'])})")

    return jsonify({
        'uploaded': len(session_data['uploaded_files']),
        'total': session_data['total_files']
    })


@app.route('/test-month/upload-complete', methods=['POST'])
def test_month_upload_complete():
    """Complete chunked upload and start processing for test-month."""
    data = request.json
    session_id = data.get('session_id')

    if not session_id or session_id not in upload_sessions:
        return jsonify({'error': 'Invalid session'}), 400

    session_data = upload_sessions[session_id]
    job_id = session_data['job_id']
    upload_dir = session_data['upload_dir']
    thumb_dir = session_data['thumb_dir']
    target = session_data['target']
    organize_by_month = session_data['organize_by_month']
    saved_files = session_data['uploaded_files']

    # Clean up session
    del upload_sessions[session_id]

    if not saved_files:
        return jsonify({'error': 'No valid image files uploaded'}), 400

    print(f"[Test-Month Upload {job_id}] Complete: {len(saved_files)} files, starting processing...")

    # Create processing job
    processing_jobs[job_id] = {
        'status': 'processing',
        'progress': 0,
        'message': 'Starting test...',
        'folder_path': upload_dir,
        'thumb_dir': thumb_dir,
        'target': target,
        'total_files': len(saved_files),
        'results': None,
        'is_upload': True,
        'organize_by_month': organize_by_month
    }

    # Start processing in background
    thread = threading.Thread(
        target=process_test_month,
        args=(job_id, upload_dir, target, thumb_dir, organize_by_month)
    )
    thread.start()

    return jsonify({
        'job_id': job_id,
        'total_photos': len(saved_files),
        'target': target,
        'organize_by_month': organize_by_month,
        'message': f'Processing {len(saved_files)} uploaded photos...'
    })


def process_test_month(job_id, folder_path, target, thumb_dir, organize_by_month=False):
    """Process photos for testing with category-aware selection.



    If organize_by_month is True, groups photos by EXIF date and runs

    selection per month (same as main app Step 4).

    """
    try:
        from photo_selector.monthly_selector import MonthlyPhotoSelector, CategoryDetector
        from photo_selector.siglip_embeddings import SigLIPEmbedder
        from photo_selector.scoring import PhotoScorer
        from datetime import datetime

        job = processing_jobs[job_id]

        # Get all photos
        extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp'}
        photo_files = [f for f in os.listdir(folder_path)
                       if os.path.splitext(f.lower())[1] in extensions]
        photo_paths = [os.path.join(folder_path, f) for f in photo_files]

        job['message'] = 'Loading SigLIP model...'
        job['progress'] = 5

        # Initialize embedder and selector
        embedder = SigLIPEmbedder()
        selector = MonthlyPhotoSelector()

        # Step 1: Generate embeddings
        job['message'] = f'Generating SigLIP embeddings for {len(photo_paths)} photos...'
        job['progress'] = 10
        embeddings = embedder.process_folder(folder_path)
        job['progress'] = 30

        # Step 2: Detect categories for all photos
        job['message'] = 'Detecting photo categories...'
        job['progress'] = 35
        selector._ensure_category_detector()
        categories = selector.category_detector.detect_categories_batch(photo_paths)
        job['progress'] = 45

        # Step 3: Score photos and add category + timestamp
        job['message'] = 'Scoring photos...'
        scorer = PhotoScorer()
        scored_photos = []

        for i, photo_path in enumerate(photo_paths):
            filename = os.path.basename(photo_path)
            scores = scorer.score_photo(photo_path)

            # Get category
            cat, conf = categories.get(filename, ('unknown', 0.0))

            # Get timestamp from EXIF
            dt = selector.get_photo_date(photo_path)

            scored_photos.append({
                'filename': filename,
                'filepath': photo_path,
                'total': scores.get('total', 0),
                'face_quality': scores.get('face_quality', 0),
                'aesthetic_quality': scores.get('aesthetic_quality', 0),
                'emotional_signal': scores.get('emotional_signal', 0),
                'uniqueness': scores.get('uniqueness', 0.5),
                'num_faces': scores.get('num_faces', 0),
                'category': cat,
                'category_confidence': conf,
                'timestamp': dt.timestamp() if dt else None
            })

            if (i + 1) % 10 == 0:
                job['progress'] = 45 + int((i / len(photo_paths)) * 20)
                job['message'] = f'Scoring photos... {i + 1}/{len(photo_paths)}'

        job['progress'] = 70

        # Step 4: Run category-aware HDBSCAN selection
        if organize_by_month:
            # Group photos by month using EXIF dates
            job['message'] = 'Grouping photos by month...'

            # Month names for mapping
            MONTH_NAMES = ['January', 'February', 'March', 'April', 'May', 'June',
                          'July', 'August', 'September', 'October', 'November', 'December']

            photos_by_month = {}
            for photo in scored_photos:
                ts = photo.get('timestamp')
                if ts:
                    dt = datetime.fromtimestamp(ts)
                    month_name = MONTH_NAMES[dt.month - 1]
                else:
                    month_name = 'Unknown'

                photo['month'] = month_name
                if month_name not in photos_by_month:
                    photos_by_month[month_name] = []
                photos_by_month[month_name].append(photo)

            # Calculate target per month (proportional allocation)
            total_photos = len(scored_photos)
            selected = []
            month_stats = []

            for month_name, month_photos in photos_by_month.items():
                # Proportional target for this month
                month_proportion = len(month_photos) / total_photos
                month_target = max(1, int(target * month_proportion))

                job['message'] = f'Processing {month_name} ({len(month_photos)} photos)...'

                # Get embeddings for this month's photos
                month_embeddings = {p['filename']: embeddings.get(p['filename']) for p in month_photos}

                # Run selection for this month
                month_selected = selector.select_hybrid_hdbscan(month_photos, month_embeddings, target=month_target)

                # Add month info to each selected photo
                for photo in month_selected:
                    photo['month'] = month_name

                selected.extend(month_selected)

                month_stats.append({
                    'month': month_name,
                    'total_photos': len(month_photos),
                    'selected': len(month_selected),
                    'target': month_target
                })

            print(f"[Test Month {job_id}] Organized by month: {len(photos_by_month)} months, {len(selected)} total selected")
        else:
            # Single batch selection (original behavior)
            job['message'] = 'Running category-aware clustering and selection...'
            selected = selector.select_hybrid_hdbscan(scored_photos, embeddings, target=target)
            # Add 'Unknown' month to all photos when not organized
            for photo in selected:
                photo['month'] = 'Unknown'
            for photo in scored_photos:
                photo['month'] = 'Unknown'
            month_stats = []

        job['progress'] = 85
        job['message'] = 'Creating thumbnails...'

        # Create thumbnails and build results
        selected_results = []
        for photo in selected:
            filename = photo['filename']
            filepath = photo['filepath']
            thumb_name = get_thumbnail_name(filename)
            thumb_path = os.path.join(thumb_dir, thumb_name)

            create_thumbnail(filepath, thumb_path)

            # Get embedding for this photo
            photo_emb = embeddings.get(filename)
            embedding_list = photo_emb.tolist() if photo_emb is not None else None

            # Format timestamp for display
            ts = photo.get('timestamp')
            datetime_str = ''
            if ts:
                dt = datetime.fromtimestamp(ts)
                datetime_str = dt.strftime('%Y-%m-%d %H:%M:%S')

            selected_results.append({
                'filename': filename,
                'thumbnail': thumb_name,
                'score': float(photo.get('total', 0)),
                'face_quality': float(photo.get('face_quality', 0)),
                'aesthetic_quality': float(photo.get('aesthetic_quality', 0)),
                'emotional_signal': float(photo.get('emotional_signal', 0)),
                'uniqueness': float(photo.get('uniqueness', 0)),
                'num_faces': int(photo.get('num_faces', 0)),
                'multi_face_bonus': float(photo.get('multi_face_bonus', 0)),
                'cluster_id': photo.get('cluster_id', -1),
                'max_similarity': float(photo.get('max_similarity', 0)),
                'category': photo.get('category', 'unknown'),
                'category_confidence': float(photo.get('category_confidence', 0)),
                'event_id': photo.get('event_id', -1),
                'selection_reason': photo.get('selection_reason', ''),
                'datetime': datetime_str,
                'embedding': embedding_list,
                'month': photo.get('month', 'Unknown')
            })

        # Build rejected list
        selected_filenames = {p['filename'] for p in selected}
        rejected_results = []

        for photo in scored_photos:
            if photo['filename'] not in selected_filenames:
                filename = photo['filename']
                filepath = photo['filepath']
                thumb_name = get_thumbnail_name(filename)
                thumb_path = os.path.join(thumb_dir, thumb_name)

                create_thumbnail(filepath, thumb_path)

                photo_emb = embeddings.get(filename)
                embedding_list = photo_emb.tolist() if photo_emb is not None else None

                # Format timestamp for display
                ts = photo.get('timestamp')
                datetime_str = ''
                if ts:
                    from datetime import datetime
                    dt = datetime.fromtimestamp(ts)
                    datetime_str = dt.strftime('%Y-%m-%d %H:%M:%S')

                rejected_results.append({
                    'filename': filename,
                    'thumbnail': thumb_name,
                    'score': float(photo.get('total', 0)),
                    'face_quality': float(photo.get('face_quality', 0)),
                    'aesthetic_quality': float(photo.get('aesthetic_quality', 0)),
                    'num_faces': int(photo.get('num_faces', 0)),
                    'cluster_id': photo.get('cluster_id', -1),
                    'category': photo.get('category', 'unknown'),
                    'event_id': photo.get('event_id', -1),
                    'embedding': embedding_list,
                    'max_similarity': float(photo.get('max_similarity', 0)),
                    'selection_reason': photo.get('rejection_reason', 'Not selected'),
                    'datetime': datetime_str,
                    'month': photo.get('month', 'Unknown')
                })

        # Sort results
        selected_results.sort(key=lambda x: x['score'], reverse=True)
        rejected_results.sort(key=lambda x: x['score'], reverse=True)

        # Cluster distribution
        cluster_counts = {}
        for photo in selected_results:
            cid = photo.get('cluster_id', -1)
            cluster_counts[cid] = cluster_counts.get(cid, 0) + 1

        # Category distribution
        category_counts = {}
        for photo in selected_results:
            cat = photo.get('category', 'unknown')
            category_counts[cat] = category_counts.get(cat, 0) + 1

        # Build results
        job['results'] = {
            'selected': selected_results,
            'rejected': rejected_results,
            'summary': {
                'total_photos': len(photo_paths),
                'selected_count': len(selected_results),
                'rejected_count': len(rejected_results),
                'target': target
            },
            'cluster_distribution': cluster_counts,
            'category_distribution': category_counts,
            'organized_by_month': organize_by_month,
            'month_stats': month_stats
        }

        job['status'] = 'complete'
        job['progress'] = 100
        job['message'] = f'Done! Selected {len(selected_results)} of {len(photo_paths)} photos'

        print(f"\n[Test Month {job_id}] Complete!")
        print(f"  - Total: {len(photo_paths)}")
        print(f"  - Selected: {len(selected_results)}")
        print(f"  - Organized by month: {organize_by_month}")
        if month_stats:
            print(f"  - Month stats: {month_stats}")
        print(f"  - Clusters: {cluster_counts}")
        print(f"  - Categories: {category_counts}")

    except Exception as e:
        processing_jobs[job_id]['status'] = 'error'
        processing_jobs[job_id]['message'] = str(e)
        import traceback
        traceback.print_exc()


@app.route('/test-month/status/<job_id>')
def test_month_status(job_id):
    """Get test month job status."""
    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404

    job = processing_jobs[job_id]
    return jsonify({
        'status': job['status'],
        'progress': job['progress'],
        'message': job['message']
    })


@app.route('/test-month/results/<job_id>')
def test_month_results(job_id):
    """Get test month results."""
    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404

    job = processing_jobs[job_id]
    if job['status'] != 'complete':
        return jsonify({'error': 'Not complete', 'status': job['status']}), 400

    return jsonify(job['results'])


@app.route('/test-month/thumbnail/<job_id>/<filename>')
def test_month_thumbnail(job_id, filename):
    """Serve test month thumbnails."""
    thumb_dir = os.path.join(UPLOAD_FOLDER, job_id, 'thumbnails')
    return send_from_directory(thumb_dir, filename)


@app.route('/test-month/download/<job_id>')
def test_month_download(job_id):
    """Download selected photos from test-month as ZIP."""
    import zipfile
    from io import BytesIO

    if job_id not in processing_jobs:
        return jsonify({'error': 'Job not found'}), 404

    job = processing_jobs[job_id]
    if job['status'] != 'complete':
        return jsonify({'error': 'Processing not complete'}), 400

    results = job.get('results', {})
    selected = results.get('selected', [])
    folder_path = job.get('folder_path', '')

    if not selected:
        return jsonify({'error': 'No selected photos'}), 404

    if not folder_path:
        return jsonify({'error': 'Folder path not found'}), 404

    # Create zip file
    memory_file = BytesIO()
    files_added = 0
    with zipfile.ZipFile(memory_file, 'w', zipfile.ZIP_DEFLATED) as zf:
        for photo in selected:
            filename = photo.get('filename', '')
            # Build full path from folder_path + filename
            photo_path = os.path.join(folder_path, filename)
            if os.path.exists(photo_path):
                zf.write(photo_path, filename)
                files_added += 1

    if files_added == 0:
        return jsonify({'error': 'No files could be added to ZIP'}), 404

    memory_file.seek(0)
    return send_file(
        memory_file,
        mimetype='application/zip',
        as_attachment=True,
        download_name=f'test_selected_{job_id}.zip'
    )


# ============================================
# DATASET SAVE/LOAD ROUTES
# ============================================

@app.route('/datasets')
def datasets_page():
    """Show saved datasets page."""
    return render_template('datasets.html')


@app.route('/api/datasets')
def list_datasets():
    """List all saved datasets (local + Supabase)."""
    datasets = []
    seen_names = set()

    # 1. Get local datasets
    if os.path.exists(DATASETS_FOLDER):
        for name in os.listdir(DATASETS_FOLDER):
            meta_path = os.path.join(DATASETS_FOLDER, name, 'metadata.json')
            if os.path.exists(meta_path):
                try:
                    with open(meta_path, 'r') as f:
                        meta = json.load(f)
                        meta['folder_name'] = name
                        meta['source'] = 'local'
                        datasets.append(meta)
                        seen_names.add(name)
                except:
                    pass

    # 2. Get Supabase datasets (if available)
    if is_supabase_available():
        try:
            supabase_datasets = list_datasets_from_supabase()
            for meta in supabase_datasets:
                folder_name = meta.get('folder_name', '')
                # Only add if not already in local (local takes priority)
                if folder_name and folder_name not in seen_names:
                    meta['source'] = 'supabase'
                    datasets.append(meta)
        except Exception as e:
            print(f"[Datasets] Error fetching from Supabase: {e}")

    # Sort by date, newest first
    datasets.sort(key=lambda x: x.get('created_at', '') or '', reverse=True)
    return jsonify({'datasets': datasets, 'supabase_available': is_supabase_available()})


@app.route('/save_dataset/<job_id>', methods=['POST'])
def save_dataset(job_id):
    """Save dataset after Step 3 review."""
    try:
        data = request.get_json()
        dataset_name = data.get('name', f"dataset_{job_id}")

        # Validate name (alphanumeric, underscore, hyphen, space only)
        import re
        safe_name = re.sub(r'[^a-zA-Z0-9_\- ]', '', dataset_name).strip()
        if not safe_name:
            safe_name = f"dataset_{job_id}"

        # Create folder name (replace spaces with underscores)
        folder_name = safe_name.replace(' ', '_')
        dataset_path = os.path.join(DATASETS_FOLDER, folder_name)

        # Check if already exists
        if os.path.exists(dataset_path):
            return jsonify({'error': f'Dataset "{safe_name}" already exists'}), 400

        os.makedirs(dataset_path, exist_ok=True)

        # Get job data
        if job_id not in processing_jobs:
            return jsonify({'error': 'Job not found'}), 404

        job = processing_jobs[job_id]
        session_id = job.get('session_id')

        # 1. Save reference embeddings
        if session_id and session_id in face_matchers:
            matcher = face_matchers[session_id]
            embeddings_path = os.path.join(dataset_path, 'reference_embeddings.npz')
            np.savez_compressed(
                embeddings_path,
                embeddings=np.array(matcher.reference_embeddings),
                average=matcher.average_embedding,
                threshold=matcher.similarity_threshold
            )

        # 2. Copy face results from review JSON
        review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
        if os.path.exists(review_file):
            shutil.copy(review_file, os.path.join(dataset_path, 'face_results.json'))

        # 3. Save confirmed photos list
        confirmed_photos = job.get('confirmed_photos', [])
        if not confirmed_photos:
            # Try loading from review JSON (Step 3) - contains filtered_photos
            review_file = os.path.join(RESULTS_FOLDER, f"{job_id}_review.json")
            if os.path.exists(review_file):
                with open(review_file, 'r') as f:
                    review_data = json.load(f)
                    filtered = review_data.get('filtered_photos', [])
                    confirmed_photos = [p['filename'] for p in filtered]

            # Fallback: Try loading from confirm step if not in memory
            if not confirmed_photos:
                results_file = os.path.join(RESULTS_FOLDER, f"{job_id}.json")
                if os.path.exists(results_file):
                    with open(results_file, 'r') as f:
                        results_data = json.load(f)
                        selected = results_data.get('selected_photos', [])
                        rejected = results_data.get('rejected_photos', [])
                        confirmed_photos = [p['filename'] for p in selected + rejected]

        with open(os.path.join(dataset_path, 'confirmed_photos.json'), 'w') as f:
            json.dump({'photos': confirmed_photos}, f)

        # 4. Copy thumbnails folder
        upload_dir = job.get('upload_dir', os.path.join(UPLOAD_FOLDER, job_id))
        thumb_dir = os.path.join(upload_dir, 'thumbnails')
        dataset_thumb_dir = os.path.join(dataset_path, 'thumbnails')
        if os.path.exists(thumb_dir):
            shutil.copytree(thumb_dir, dataset_thumb_dir)

        # 5. Copy original photos (for reload)
        photos_dir = os.path.join(dataset_path, 'photos')
        os.makedirs(photos_dir, exist_ok=True)
        for filename in confirmed_photos:
            src = os.path.join(upload_dir, filename)
            if os.path.exists(src):
                shutil.copy(src, os.path.join(photos_dir, filename))

        # 6. Save metadata
        metadata = {
            'name': safe_name,
            'created_at': datetime.now().isoformat(),
            'original_job_id': job_id,
            'session_id': session_id,
            'total_photos': len(confirmed_photos),
            'quality_mode': job.get('quality_mode', 'balanced'),
            'similarity_threshold': job.get('similarity_threshold', 0.4),
            'reference_count': len(face_matchers.get(session_id, {}).reference_embeddings) if session_id in face_matchers else 0
        }

        with open(os.path.join(dataset_path, 'metadata.json'), 'w') as f:
            json.dump(metadata, f, indent=2)

        print(f"[Dataset] Saved '{safe_name}' with {len(confirmed_photos)} photos locally")

        # 7. Also save to Supabase (for persistence across HF restarts)
        supabase_saved = False
        if is_supabase_available():
            try:
                # Read embeddings file as bytes
                embeddings_path = os.path.join(dataset_path, 'reference_embeddings.npz')
                embeddings_data = None
                if os.path.exists(embeddings_path):
                    with open(embeddings_path, 'rb') as f:
                        embeddings_data = f.read()

                # Read face results
                face_results_path = os.path.join(dataset_path, 'face_results.json')
                face_results = {}
                if os.path.exists(face_results_path):
                    with open(face_results_path, 'r') as f:
                        face_results = json.load(f)

                # Save to Supabase
                if embeddings_data:
                    supabase_saved = save_dataset_to_supabase(
                        folder_name,
                        embeddings_data,
                        face_results,
                        metadata
                    )
            except Exception as e:
                print(f"[Dataset] Supabase save error: {e}")

        return jsonify({
            'success': True,
            'name': safe_name,
            'folder_name': folder_name,
            'total_photos': len(confirmed_photos),
            'supabase_saved': supabase_saved
        })

    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500


@app.route('/load_dataset/<dataset_name>')
def load_dataset(dataset_name):
    """Load a saved dataset and redirect to review or selection."""
    try:
        dataset_path = os.path.join(DATASETS_FOLDER, dataset_name)
        from_supabase = False

        # Check if dataset exists locally
        if not os.path.exists(dataset_path):
            # Try loading from Supabase
            if is_supabase_available():
                print(f"[Dataset] Not found locally, trying Supabase...")
                supabase_data = load_dataset_from_supabase(dataset_name)
                if supabase_data:
                    from_supabase = True
                    # Redirect to re-upload page (photos not stored in Supabase)
                    return redirect(f'/reupload_photos/{dataset_name}')
                else:
                    return jsonify({'error': 'Dataset not found in local or Supabase'}), 404
            else:
                return jsonify({'error': 'Dataset not found'}), 404

        # Load metadata
        with open(os.path.join(dataset_path, 'metadata.json'), 'r') as f:
            metadata = json.load(f)

        # Create new job ID
        job_id = str(uuid.uuid4())[:8]
        new_session_id = str(uuid.uuid4())[:8]

        # Set up upload directory with photos
        upload_dir = os.path.join(UPLOAD_FOLDER, job_id)
        os.makedirs(upload_dir, exist_ok=True)

        # Copy photos from dataset
        dataset_photos_dir = os.path.join(dataset_path, 'photos')
        if os.path.exists(dataset_photos_dir):
            for filename in os.listdir(dataset_photos_dir):
                src = os.path.join(dataset_photos_dir, filename)
                shutil.copy(src, os.path.join(upload_dir, filename))

        # Copy thumbnails
        dataset_thumb_dir = os.path.join(dataset_path, 'thumbnails')
        if os.path.exists(dataset_thumb_dir):
            shutil.copytree(dataset_thumb_dir, os.path.join(upload_dir, 'thumbnails'))

        # Load reference embeddings into face_matchers
        embeddings_path = os.path.join(dataset_path, 'reference_embeddings.npz')
        if os.path.exists(embeddings_path):
            from photo_selector.face_matcher import FaceMatcher
            data = np.load(embeddings_path, allow_pickle=True)
            matcher = FaceMatcher(similarity_threshold=float(data['threshold']))
            matcher.reference_embeddings = list(data['embeddings'])
            matcher.average_embedding = data['average']
            face_matchers[new_session_id] = matcher
            session['face_session_id'] = new_session_id

        # Load confirmed photos
        confirmed_file = os.path.join(dataset_path, 'confirmed_photos.json')
        confirmed_photos = []
        if os.path.exists(confirmed_file):
            with open(confirmed_file, 'r') as f:
                confirmed_photos = json.load(f).get('photos', [])

        # Load face results
        face_results_path = os.path.join(dataset_path, 'face_results.json')
        review_data = None
        if os.path.exists(face_results_path):
            with open(face_results_path, 'r') as f:
                review_data = json.load(f)

        # Create processing job
        processing_jobs[job_id] = {
            'status': 'review_pending',
            'progress': 100,
            'message': 'Dataset loaded - ready for review',
            'upload_dir': upload_dir,
            'session_id': new_session_id,
            'has_reference_photos': True,
            'reference_count': metadata.get('reference_count', 0),
            'quality_mode': metadata.get('quality_mode', 'balanced'),
            'similarity_threshold': metadata.get('similarity_threshold', 0.4),
            'confirmed_photos': confirmed_photos,
            'review_data': review_data,
            'total_photos': len(confirmed_photos),
            'from_dataset': dataset_name
        }

        # Copy face results to results folder for step3
        if review_data:
            with open(os.path.join(RESULTS_FOLDER, f"{job_id}_review.json"), 'w') as f:
                json.dump(review_data, f)

        print(f"[Dataset] Loaded '{dataset_name}' as job {job_id}")

        # Check which page to go to
        goto = request.args.get('goto', 'review')

        if goto == 'select':
            # Go directly to Step 4 - start quality selection
            return redirect(f'/step4_results/{job_id}?from_dataset=1')
        else:
            # Go to Step 3 - review page
            return redirect(f'/step3_review/{job_id}')

    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500


@app.route('/delete_dataset/<dataset_name>', methods=['DELETE'])
def delete_dataset(dataset_name):
    """Delete a saved dataset (local and Supabase)."""
    try:
        deleted_local = False
        deleted_supabase = False

        # Delete local
        dataset_path = os.path.join(DATASETS_FOLDER, dataset_name)
        if os.path.exists(dataset_path):
            shutil.rmtree(dataset_path)
            deleted_local = True
            print(f"[Dataset] Deleted '{dataset_name}' locally")

        # Delete from Supabase
        if is_supabase_available():
            deleted_supabase = delete_dataset_from_supabase(dataset_name)

        if not deleted_local and not deleted_supabase:
            return jsonify({'error': 'Dataset not found'}), 404

        return jsonify({'success': True, 'deleted_local': deleted_local, 'deleted_supabase': deleted_supabase})

    except Exception as e:
        return jsonify({'error': str(e)}), 500


@app.route('/dataset_thumbnail/<dataset_name>/<filename>')
def dataset_thumbnail(dataset_name, filename):
    """Serve dataset thumbnail."""
    thumb_dir = os.path.join(DATASETS_FOLDER, dataset_name, 'thumbnails')
    return send_from_directory(thumb_dir, filename)


# ============================================
# SUPABASE RE-UPLOAD ROUTES
# ============================================

@app.route('/reupload_photos/<dataset_name>')
def reupload_photos_page(dataset_name):
    """Show page to re-upload photos for a Supabase dataset."""
    # Get metadata from Supabase
    if not is_supabase_available():
        return jsonify({'error': 'Supabase not available'}), 500

    supabase_data = load_dataset_from_supabase(dataset_name)
    if not supabase_data:
        return jsonify({'error': 'Dataset not found in Supabase'}), 404

    metadata = supabase_data.get('metadata', {})
    return render_template('reupload_photos.html',
                         dataset_name=dataset_name,
                         metadata=metadata)


@app.route('/download_from_gdrive/<dataset_name>', methods=['POST'])
def download_from_gdrive(dataset_name):
    """Download zip from Google Drive and process photos."""
    try:
        import re
        import zipfile
        import gdown

        data = request.get_json()
        gdrive_link = data.get('gdrive_link', '')

        print(f"[GDrive] Starting download for dataset '{dataset_name}'")
        print(f"[GDrive] Link: {gdrive_link}")

        # Extract file ID from Google Drive link
        file_id = None
        patterns = [
            r'/file/d/([a-zA-Z0-9_-]+)',
            r'id=([a-zA-Z0-9_-]+)',
            r'/d/([a-zA-Z0-9_-]+)'
        ]
        for pattern in patterns:
            match = re.search(pattern, gdrive_link)
            if match:
                file_id = match.group(1)
                break

        if not file_id:
            return jsonify({'error': 'Could not extract file ID from Google Drive link'}), 400

        print(f"[GDrive] File ID: {file_id}")

        # Create job and upload directory
        job_id = str(uuid.uuid4())[:8]
        upload_dir = os.path.join(UPLOAD_FOLDER, job_id)
        os.makedirs(upload_dir, exist_ok=True)
        os.makedirs(os.path.join(upload_dir, 'thumbnails'), exist_ok=True)

        # Download using gdown (handles large files properly)
        zip_path = os.path.join(upload_dir, 'photos.zip')
        gdrive_url = f"https://drive.google.com/uc?id={file_id}"

        print(f"[GDrive] Downloading using gdown...")
        try:
            gdown.download(gdrive_url, zip_path, quiet=False, fuzzy=True)
        except Exception as e:
            print(f"[GDrive] gdown failed: {e}")
            # Try with confirm flag for large files
            try:
                gdown.download(gdrive_url, zip_path, quiet=False, fuzzy=True, use_cookies=False)
            except Exception as e2:
                print(f"[GDrive] gdown retry failed: {e2}")
                return jsonify({'error': f'Download failed: {str(e2)}'}), 400

        # Check if file was downloaded
        if not os.path.exists(zip_path) or os.path.getsize(zip_path) < 1000:
            print(f"[GDrive] ERROR: Download failed or file too small")
            return jsonify({'error': 'Download failed. Make sure the file is shared with "Anyone with link".'}), 400

        print(f"[GDrive] Download complete: {os.path.getsize(zip_path) / 1024 / 1024:.1f} MB")

        # Extract zip file
        print(f"[GDrive] Extracting zip file...")
        uploaded_filenames = []
        image_extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp', '.bmp', '.gif'}

        try:
            with zipfile.ZipFile(zip_path, 'r') as zf:
                for member in zf.namelist():
                    if member.endswith('/') or '/__MACOSX' in member or member.startswith('.'):
                        continue
                    ext = os.path.splitext(member.lower())[1]
                    if ext in image_extensions:
                        filename = secure_filename(os.path.basename(member))
                        if filename:
                            with zf.open(member) as src:
                                filepath = os.path.join(upload_dir, filename)
                                with open(filepath, 'wb') as dst:
                                    dst.write(src.read())
                            uploaded_filenames.append(filename)

                            if len(uploaded_filenames) % 200 == 0:
                                print(f"[GDrive] Extracted {len(uploaded_filenames)} files...")

            print(f"[GDrive] Extracted {len(uploaded_filenames)} photos")
        finally:
            # Clean up zip
            if os.path.exists(zip_path):
                os.remove(zip_path)

        # Load dataset from Supabase
        print(f"[GDrive] Loading dataset from Supabase...")
        supabase_data = load_dataset_from_supabase(dataset_name)
        if not supabase_data:
            return jsonify({'error': 'Dataset not found in Supabase'}), 404

        metadata = supabase_data.get('metadata', {})
        face_results = supabase_data.get('face_results', {})
        embeddings_data = supabase_data.get('embeddings_data')

        # Load reference embeddings
        new_session_id = str(uuid.uuid4())[:8]
        if embeddings_data:
            import io
            from photo_selector.face_matcher import FaceMatcher
            data_np = np.load(io.BytesIO(embeddings_data), allow_pickle=True)
            matcher = FaceMatcher(similarity_threshold=float(data_np['threshold']))
            matcher.reference_embeddings = list(data_np['embeddings'])
            matcher.average_embedding = data_np['average']
            face_matchers[new_session_id] = matcher
            session['face_session_id'] = new_session_id
            print(f"[GDrive] Loaded {len(matcher.reference_embeddings)} reference embeddings")

        # Match uploaded files with saved face results
        filtered_photos = face_results.get('filtered_photos', [])
        uploaded_set = set(uploaded_filenames)
        matched_photos = [p for p in filtered_photos if p.get('filename') in uploaded_set]

        print(f"[GDrive] Matched {len(matched_photos)} of {len(filtered_photos)} photos")

        # Create review data
        review_data = {
            'filtered_photos': matched_photos,
            'total_processed': len(uploaded_filenames),
            'match_count': len(matched_photos)
        }

        with open(os.path.join(RESULTS_FOLDER, f"{job_id}_review.json"), 'w') as f:
            json.dump(review_data, f)

        # Create processing job
        processing_jobs[job_id] = {
            'status': 'review_pending',
            'progress': 100,
            'message': 'Photos downloaded from Google Drive',
            'upload_dir': upload_dir,
            'session_id': new_session_id,
            'has_reference_photos': True,
            'reference_count': metadata.get('reference_count', 0),
            'quality_mode': metadata.get('quality_mode', 'balanced'),
            'similarity_threshold': metadata.get('similarity_threshold', 0.4),
            'confirmed_photos': [p['filename'] for p in matched_photos],
            'review_data': review_data,
            'total_photos': len(matched_photos),
            'from_dataset': dataset_name,
            'from_supabase': True
        }

        print(f"[GDrive] SUCCESS! Redirecting to step3_review/{job_id}")
        return jsonify({
            'success': True,
            'job_id': job_id,
            'matched_photos': len(matched_photos),
            'total_uploaded': len(uploaded_filenames),
            'redirect_url': f'/step3_review/{job_id}'
        })

    except Exception as e:
        print(f"[GDrive] Error: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500


# Store chunked upload sessions
chunked_uploads = {}

@app.route('/start_chunked_upload/<dataset_name>', methods=['POST'])
def start_chunked_upload(dataset_name):
    """Start a chunked upload session."""
    try:
        data = request.get_json()
        total_files = data.get('total_files', 0)
        total_chunks = data.get('total_chunks', 0)

        upload_id = str(uuid.uuid4())[:8]
        job_id = str(uuid.uuid4())[:8]
        upload_dir = os.path.join(UPLOAD_FOLDER, job_id)
        os.makedirs(upload_dir, exist_ok=True)
        os.makedirs(os.path.join(upload_dir, 'thumbnails'), exist_ok=True)

        chunked_uploads[upload_id] = {
            'dataset_name': dataset_name,
            'job_id': job_id,
            'upload_dir': upload_dir,
            'total_files': total_files,
            'total_chunks': total_chunks,
            'received_chunks': set(),
            'uploaded_filenames': []
        }

        print(f"[Chunked] Started upload session {upload_id} for dataset '{dataset_name}' ({total_files} files, {total_chunks} chunks)")
        return jsonify({'success': True, 'upload_id': upload_id})
    except Exception as e:
        print(f"[Chunked] Error starting session: {e}")
        return jsonify({'error': str(e)}), 500


@app.route('/upload_reupload_chunk/<dataset_name>', methods=['POST'])
def upload_reupload_chunk(dataset_name):
    """Receive a chunk of photos for reupload."""
    from werkzeug.exceptions import ClientDisconnected
    try:
        upload_id = request.form.get('upload_id')
        chunk_index = int(request.form.get('chunk_index', 0))

        if upload_id not in chunked_uploads:
            return jsonify({'error': 'Invalid upload session'}), 400

        session_data = chunked_uploads[upload_id]
        upload_dir = session_data['upload_dir']

        files = request.files.getlist('photos')
        if not files:
            return jsonify({'error': 'No files in chunk'}), 400

        # Save files from this chunk
        for file in files:
            if file and allowed_file(file.filename):
                filename = secure_filename(file.filename)
                filepath = os.path.join(upload_dir, filename)
                file.save(filepath)
                session_data['uploaded_filenames'].append(filename)

        session_data['received_chunks'].add(chunk_index)
        print(f"[Chunked] Upload {upload_id}: Received chunk {chunk_index + 1}/{session_data['total_chunks']} ({len(files)} files)")

        return jsonify({'success': True, 'chunk': chunk_index, 'files_saved': len(files)})
    except ClientDisconnected:
        # Client disconnected during upload - this is expected on slow connections
        print(f"[Chunked] Client disconnected during chunk upload (timeout)")
        return jsonify({'error': 'Connection timeout - please retry'}), 408
    except Exception as e:
        print(f"[Chunked] Error receiving chunk: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500


@app.route('/finish_chunked_upload/<dataset_name>', methods=['POST'])
def finish_chunked_upload(dataset_name):
    """Finalize chunked upload and process photos."""
    try:
        data = request.get_json()
        upload_id = data.get('upload_id')

        if upload_id not in chunked_uploads:
            return jsonify({'error': 'Invalid upload session'}), 400

        session_data = chunked_uploads[upload_id]
        job_id = session_data['job_id']
        upload_dir = session_data['upload_dir']
        uploaded_filenames = session_data['uploaded_filenames']

        print(f"[Chunked] Finalizing upload {upload_id}: {len(uploaded_filenames)} files received")

        # Load dataset from Supabase
        print(f"[Chunked] Loading dataset from Supabase...")
        supabase_data = load_dataset_from_supabase(dataset_name)
        if not supabase_data:
            return jsonify({'error': 'Dataset not found in Supabase'}), 404

        metadata = supabase_data.get('metadata', {})
        face_results = supabase_data.get('face_results', {})
        embeddings_data = supabase_data.get('embeddings_data')

        # Load reference embeddings
        new_session_id = str(uuid.uuid4())[:8]
        if embeddings_data:
            import io
            from photo_selector.face_matcher import FaceMatcher
            data_np = np.load(io.BytesIO(embeddings_data), allow_pickle=True)
            matcher = FaceMatcher(similarity_threshold=float(data_np['threshold']))
            matcher.reference_embeddings = list(data_np['embeddings'])
            matcher.average_embedding = data_np['average']
            face_matchers[new_session_id] = matcher
            session['face_session_id'] = new_session_id
            print(f"[Chunked] Loaded {len(matcher.reference_embeddings)} reference embeddings")

        # Match uploaded files with saved face results
        filtered_photos = face_results.get('filtered_photos', [])
        uploaded_set = set(uploaded_filenames)
        matched_photos = [p for p in filtered_photos if p.get('filename') in uploaded_set]

        print(f"[Chunked] Matched {len(matched_photos)} of {len(filtered_photos)} photos")

        # Create review data
        review_data = {
            'filtered_photos': matched_photos,
            'total_processed': len(uploaded_filenames),
            'match_count': len(matched_photos)
        }

        with open(os.path.join(RESULTS_FOLDER, f"{job_id}_review.json"), 'w') as f:
            json.dump(review_data, f)

        # Create processing job
        processing_jobs[job_id] = {
            'status': 'review_pending',
            'progress': 100,
            'message': 'Photos matched with saved face results',
            'upload_dir': upload_dir,
            'session_id': new_session_id,
            'has_reference_photos': True,
            'reference_count': metadata.get('reference_count', 0),
            'quality_mode': metadata.get('quality_mode', 'balanced'),
            'similarity_threshold': metadata.get('similarity_threshold', 0.4),
            'confirmed_photos': [p['filename'] for p in matched_photos],
            'review_data': review_data,
            'total_photos': len(matched_photos),
            'from_dataset': dataset_name,
            'from_supabase': True
        }

        # Clean up session
        del chunked_uploads[upload_id]

        print(f"[Chunked] SUCCESS! Redirecting to step3_review/{job_id}")
        return jsonify({
            'success': True,
            'job_id': job_id,
            'matched_photos': len(matched_photos),
            'total_uploaded': len(uploaded_filenames),
            'redirect_url': f'/step3_review/{job_id}'
        })

    except Exception as e:
        print(f"[Chunked] Error finalizing: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500


@app.route('/process_reupload/<dataset_name>', methods=['POST'])
def process_reupload(dataset_name):
    """Process re-uploaded photos using saved face results from Supabase."""
    from werkzeug.exceptions import ClientDisconnected
    try:
        print(f"[Reupload] Starting reupload for dataset '{dataset_name}'")

        # Load dataset from Supabase
        print(f"[Reupload] Loading dataset from Supabase...")
        supabase_data = load_dataset_from_supabase(dataset_name)
        if not supabase_data:
            print(f"[Reupload] ERROR: Dataset not found in Supabase")
            return jsonify({'error': 'Dataset not found in Supabase'}), 404

        metadata = supabase_data.get('metadata', {})
        face_results = supabase_data.get('face_results', {})
        embeddings_data = supabase_data.get('embeddings_data')
        print(f"[Reupload] Dataset loaded: {len(face_results.get('filtered_photos', []))} photos in face results")

        # Create new job
        job_id = str(uuid.uuid4())[:8]
        new_session_id = str(uuid.uuid4())[:8]
        upload_dir = os.path.join(UPLOAD_FOLDER, job_id)
        os.makedirs(upload_dir, exist_ok=True)
        os.makedirs(os.path.join(upload_dir, 'thumbnails'), exist_ok=True)

        # Check if zip file was uploaded
        zipfile_upload = request.files.get('zipfile')
        uploaded_filenames = []

        if zipfile_upload and zipfile_upload.filename.lower().endswith('.zip'):
            # Handle zip file upload
            import zipfile
            print(f"[Reupload] Received zip file: {zipfile_upload.filename}")

            # Save zip temporarily
            zip_path = os.path.join(upload_dir, 'upload.zip')
            zipfile_upload.save(zip_path)
            print(f"[Reupload] Zip saved, extracting...")

            # Extract zip file
            try:
                with zipfile.ZipFile(zip_path, 'r') as zf:
                    # Get list of image files in zip
                    image_extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp', '.bmp', '.gif'}
                    for member in zf.namelist():
                        # Skip directories and hidden files
                        if member.endswith('/') or '/__MACOSX' in member or member.startswith('.'):
                            continue
                        # Check if it's an image
                        ext = os.path.splitext(member.lower())[1]
                        if ext in image_extensions:
                            # Extract with flat structure (no subdirectories)
                            filename = secure_filename(os.path.basename(member))
                            if filename:
                                # Read from zip and save to upload_dir
                                with zf.open(member) as src:
                                    filepath = os.path.join(upload_dir, filename)
                                    with open(filepath, 'wb') as dst:
                                        dst.write(src.read())
                                uploaded_filenames.append(filename)

                                if len(uploaded_filenames) % 200 == 0:
                                    print(f"[Reupload] Extracted {len(uploaded_filenames)} files...")

                print(f"[Reupload] Extracted {len(uploaded_filenames)} photos from zip")
            finally:
                # Clean up zip file
                if os.path.exists(zip_path):
                    os.remove(zip_path)
        else:
            # Handle individual photo uploads
            files = request.files.getlist('photos')
            if not files or (len(files) == 1 and files[0].filename == ''):
                print(f"[Reupload] ERROR: No photos uploaded")
                return jsonify({'error': 'No photos uploaded'}), 400

            print(f"[Reupload] Saving {len(files)} uploaded files (thumbnails skipped for speed)...")
            for i, file in enumerate(files):
                if file and allowed_file(file.filename):
                    filename = secure_filename(file.filename)
                    filepath = os.path.join(upload_dir, filename)
                    file.save(filepath)
                    uploaded_filenames.append(filename)

                    # Log progress every 200 files
                    if (i + 1) % 200 == 0:
                        print(f"[Reupload] Saved {i + 1}/{len(files)} files...")

            print(f"[Reupload] Saved {len(uploaded_filenames)} photos for dataset '{dataset_name}'")

        # Load reference embeddings
        print(f"[Reupload] Loading reference embeddings...")
        if embeddings_data:
            import io
            from photo_selector.face_matcher import FaceMatcher

            # Load directly from bytes using BytesIO (no temp file needed)
            data = np.load(io.BytesIO(embeddings_data), allow_pickle=True)
            matcher = FaceMatcher(similarity_threshold=float(data['threshold']))
            matcher.reference_embeddings = list(data['embeddings'])
            matcher.average_embedding = data['average']
            face_matchers[new_session_id] = matcher
            session['face_session_id'] = new_session_id
            print(f"[Reupload] Loaded {len(matcher.reference_embeddings)} reference embeddings")

        # Match uploaded files with saved face results
        print(f"[Reupload] Matching uploaded files with saved face results...")
        filtered_photos = face_results.get('filtered_photos', [])

        # Create a set for faster lookup
        uploaded_set = set(uploaded_filenames)

        # Filter to only photos that were uploaded
        matched_photos = []
        for photo in filtered_photos:
            if photo.get('filename') in uploaded_set:
                matched_photos.append(photo)

        print(f"[Reupload] Matched {len(matched_photos)} of {len(filtered_photos)} photos from face results")

        # Create review data
        review_data = {
            'filtered_photos': matched_photos,
            'total_processed': len(uploaded_filenames),
            'match_count': len(matched_photos)
        }

        # Save review data
        with open(os.path.join(RESULTS_FOLDER, f"{job_id}_review.json"), 'w') as f:
            json.dump(review_data, f)
        print(f"[Reupload] Saved review data")

        # Create processing job - mark as ready for quality selection
        processing_jobs[job_id] = {
            'status': 'review_pending',
            'progress': 100,
            'message': 'Photos matched with saved face results',
            'upload_dir': upload_dir,
            'session_id': new_session_id,
            'has_reference_photos': True,
            'reference_count': metadata.get('reference_count', 0),
            'quality_mode': metadata.get('quality_mode', 'balanced'),
            'similarity_threshold': metadata.get('similarity_threshold', 0.4),
            'confirmed_photos': [p['filename'] for p in matched_photos],
            'review_data': review_data,
            'total_photos': len(matched_photos),
            'from_dataset': dataset_name,
            'from_supabase': True
        }

        print(f"[Reupload] SUCCESS! Redirecting to step3_review/{job_id}")
        return jsonify({
            'success': True,
            'job_id': job_id,
            'matched_photos': len(matched_photos),
            'total_uploaded': len(uploaded_filenames),
            'redirect_url': f'/step3_review/{job_id}'
        })

    except ClientDisconnected:
        print(f"[Reupload] Client disconnected during upload (timeout)")
        return jsonify({'error': 'Connection timeout - please retry with smaller batch or better connection'}), 408
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500


if __name__ == '__main__':
    print("""

    ============================================

        PHOTO SELECTION WEB APP

        Open http://localhost:5000 in your browser



        NEW: Automatic selection mode!

        The AI decides which photos to keep.



        TEST: /test-month for single folder testing

    ============================================

    """)
    # Use port 7860 for Hugging Face Spaces, 5000 for local
    import os
    port = int(os.environ.get('PORT', 7860))
    app.run(debug=False, host='0.0.0.0', port=port)