rethinks committed on
Commit
00e6a0c
·
verified ·
1 Parent(s): d072264

Upload 6 files

Browse files
Files changed (6) hide show
  1. Dockerfile +38 -0
  2. README.md +18 -10
  3. app.py +0 -0
  4. requirements.txt +34 -0
  5. supabase_storage.py +324 -0
  6. test_single_month.py +149 -0
Dockerfile ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies: OpenCV runtime libraries (libgl1, libglib,
# libsm6, libxext6, libxrender), OpenMP runtime for hdbscan/sklearn,
# build-essential for compiling wheels, and git for pip VCS installs (CLIP).
RUN apt-get update && apt-get install -y \
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libgomp1 \
    build-essential \
    git \
    && rm -rf /var/lib/apt/lists/*

# Upgrade pip
RUN pip install --upgrade pip

# Copy requirements first (for Docker layer caching)
COPY requirements.txt .

# Install Python dependencies
# NOTE(review): Flask and gunicorn are already pinned in requirements.txt,
# so this pre-install looks redundant -- confirm before removing.
RUN pip install --no-cache-dir flask gunicorn && \
    pip install --no-cache-dir -r requirements.txt

# Copy app code
COPY . .

# Create necessary runtime directories expected by app.py
RUN mkdir -p uploads results references selected_photos thumbnails

# Expose port (7860 is the Hugging Face Spaces default)
EXPOSE 7860

# Run the app
CMD ["python", "app.py"]
README.md CHANGED
@@ -1,10 +1,18 @@
1
- ---
2
- title: ChildYb
3
- emoji: 🏆
4
- colorFrom: indigo
5
- colorTo: purple
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: CustomYB Photo Selector
3
+ emoji: 📸
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
+ # CustomYB - Smart Photo Selection
11
+
12
+ AI-powered photo selection for yearbooks. Upload photos and let AI select the best ones featuring your child.
13
+
14
+ ## Features
15
+ - Face recognition to find your child
16
+ - Quality scoring
17
+ - Duplicate removal
18
+ - Category detection (portrait, group, candid)
app.py ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ Flask>=3.0.0
3
+ Werkzeug>=3.0.1
4
+ gunicorn>=21.2.0
5
+
6
+ # Image processing
7
+ Pillow>=10.0.0
8
+ pillow-heif>=0.14.0
9
+ opencv-python-headless>=4.8.0
10
+
11
+ # Machine Learning
12
+ sentence-transformers>=2.2.2
13
+ torch>=2.1.0
14
+ torchvision>=0.16.0
15
+ hdbscan>=0.8.33
16
+ scikit-learn>=1.3.0
17
+ numpy>=1.26.0
18
+
19
+ # Face Recognition
20
+ insightface>=0.7.3
21
+ onnxruntime>=1.16.0
22
+
23
+ # Utilities
24
+ tqdm>=4.66.0
25
+ python-dotenv>=1.0.0
26
+
27
+ # Cloud Storage
28
+ supabase>=2.0.0
29
+
30
+ # CLIP
31
+ ftfy
32
+ regex
33
+ git+https://github.com/openai/CLIP.git
34
+
supabase_storage.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Supabase Storage Integration for Photo Selection App
3
+ Handles persistent storage of dataset metadata (not photos) in Supabase.
4
+ """
5
+
6
+ import os
7
+ import json
8
+ from typing import Optional, List, Dict, Any
9
+
10
+ # Supabase credentials
11
+ SUPABASE_URL = os.environ.get('SUPABASE_URL', 'https://cqnyibiopjcwuxmyqbgy.supabase.co')
12
+ SUPABASE_KEY = os.environ.get('SUPABASE_KEY', '')
13
+ BUCKET_NAME = 'datasets'
14
+
15
+ # Initialize Supabase client (lazy loading)
16
+ _supabase_client = None
17
+
18
def get_supabase_client():
    """Return a cached Supabase client, creating it on first use.

    Returns None when no SUPABASE_KEY is configured, when the supabase
    package is not installed, or when the connection attempt fails.
    """
    global _supabase_client

    # Without a key there is nothing to connect with.
    if not SUPABASE_KEY:
        print("[Supabase] No SUPABASE_KEY found in environment")
        return None

    # Reuse the cached client once it has been created.
    if _supabase_client is not None:
        return _supabase_client

    try:
        from supabase import create_client
        _supabase_client = create_client(SUPABASE_URL, SUPABASE_KEY)
        print(f"[Supabase] Connected to {SUPABASE_URL}")
        return _supabase_client
    except ImportError:
        print("[Supabase] supabase-py not installed. Run: pip install supabase")
    except Exception as e:
        print(f"[Supabase] Connection error: {e}")
    return None
39
+
40
+
41
def is_supabase_available() -> bool:
    """Report whether a usable Supabase client can be obtained."""
    client = get_supabase_client()
    return client is not None
44
+
45
+
46
+ def _get_dataset_registry(client) -> List[str]:
47
+ """Get the list of dataset names from the registry file."""
48
+ try:
49
+ storage = client.storage.from_(BUCKET_NAME)
50
+ response = storage.download("_registry.json")
51
+ registry = json.loads(response.decode('utf-8'))
52
+ return registry.get('datasets', [])
53
+ except Exception:
54
+ # Registry doesn't exist yet
55
+ return []
56
+
57
+
58
+ def _update_dataset_registry(client, dataset_name: str, action: str = 'add'):
59
+ """Update the registry file with dataset names."""
60
+ try:
61
+ storage = client.storage.from_(BUCKET_NAME)
62
+
63
+ # Get current registry
64
+ datasets = _get_dataset_registry(client)
65
+
66
+ if action == 'add' and dataset_name not in datasets:
67
+ datasets.append(dataset_name)
68
+ elif action == 'remove' and dataset_name in datasets:
69
+ datasets.remove(dataset_name)
70
+ else:
71
+ return # No changes needed
72
+
73
+ # Save updated registry
74
+ registry_data = json.dumps({'datasets': datasets}, indent=2).encode('utf-8')
75
+
76
+ # Try to update (upsert)
77
+ try:
78
+ storage.update(
79
+ path="_registry.json",
80
+ file=registry_data,
81
+ file_options={"content-type": "application/json"}
82
+ )
83
+ except Exception:
84
+ # File doesn't exist, create it
85
+ storage.upload(
86
+ path="_registry.json",
87
+ file=registry_data,
88
+ file_options={"content-type": "application/json"}
89
+ )
90
+
91
+ print(f"[Supabase] Registry updated: {action} '{dataset_name}'")
92
+ except Exception as e:
93
+ print(f"[Supabase] Error updating registry: {e}")
94
+
95
+
96
def _upload_or_update(storage, path: str, data: bytes, content_type: str):
    """Upload *data* to *path* in the bucket, overwriting any existing object.

    Supabase's upload() raises when the object already exists, so fall back
    to update() -- the same pattern _update_dataset_registry uses. This makes
    re-saving a dataset under an existing name work instead of failing.
    """
    options = {"content-type": content_type}
    try:
        return storage.upload(path=path, file=data, file_options=options)
    except Exception:
        # Object already exists -- overwrite it in place.
        return storage.update(path=path, file=data, file_options=options)


def save_dataset_to_supabase(
    dataset_name: str,
    embeddings_data: bytes,
    face_results: dict,
    metadata: dict
) -> bool:
    """
    Save dataset files to Supabase Storage.

    Re-saving an existing dataset overwrites its files instead of failing.

    Args:
        dataset_name: Unique name for the dataset (folder name)
        embeddings_data: Binary data of reference_embeddings.npz
        face_results: Dictionary of face detection results
        metadata: Dataset metadata dictionary

    Returns:
        True if successful, False otherwise
    """
    client = get_supabase_client()
    if not client:
        print("[Supabase] Client not available, skipping cloud save")
        return False

    try:
        storage = client.storage.from_(BUCKET_NAME)

        # 1. Upload reference embeddings (.npz binary blob)
        embeddings_path = f"{dataset_name}/reference_embeddings.npz"
        result = _upload_or_update(
            storage, embeddings_path, embeddings_data, "application/octet-stream"
        )
        print(f"[Supabase] Uploaded {embeddings_path}: {result}")

        # 2. Upload face detection results (JSON)
        face_results_path = f"{dataset_name}/face_results.json"
        result = _upload_or_update(
            storage, face_results_path,
            json.dumps(face_results, indent=2).encode('utf-8'),
            "application/json"
        )
        print(f"[Supabase] Uploaded {face_results_path}: {result}")

        # 3. Upload dataset metadata (JSON)
        metadata_path = f"{dataset_name}/metadata.json"
        _upload_or_update(
            storage, metadata_path,
            json.dumps(metadata, indent=2).encode('utf-8'),
            "application/json"
        )
        print(f"[Supabase] Uploaded {metadata_path}")

        # 4. Record the dataset name in the registry index file.
        _update_dataset_registry(client, dataset_name, action='add')

        print(f"[Supabase] Dataset '{dataset_name}' saved successfully")
        return True

    except Exception as e:
        print(f"[Supabase] Error saving dataset: {e}")
        return False
158
+
159
+
160
def load_dataset_from_supabase(dataset_name: str) -> Optional[Dict[str, Any]]:
    """
    Load dataset files from Supabase Storage.

    Args:
        dataset_name: Name of the dataset to load

    Returns:
        Dictionary with 'embeddings_data', 'face_results', 'metadata' or None if failed
    """
    client = get_supabase_client()
    if not client:
        print("[Supabase] Client not available")
        return None

    def _fetch(relative: str) -> bytes:
        """Download one file from the dataset folder and log it."""
        path = f"{dataset_name}/{relative}"
        data = client.storage.from_(BUCKET_NAME).download(path)
        print(f"[Supabase] Downloaded {path}")
        return data

    try:
        result = {}

        # 1. Raw embeddings bytes (.npz is kept binary for the caller).
        result['embeddings_data'] = _fetch("reference_embeddings.npz")

        # 2. Face detection results, parsed from JSON.
        result['face_results'] = json.loads(
            _fetch("face_results.json").decode('utf-8'))

        # 3. Dataset metadata, parsed from JSON.
        result['metadata'] = json.loads(
            _fetch("metadata.json").decode('utf-8'))

        print(f"[Supabase] Dataset '{dataset_name}' loaded successfully")
        return result

    except Exception as e:
        print(f"[Supabase] Error loading dataset: {e}")
        return None
202
+
203
+
204
def list_datasets_from_supabase() -> List[Dict[str, Any]]:
    """
    List all datasets stored in Supabase.

    Returns:
        List of dataset metadata dictionaries
    """
    client = get_supabase_client()
    if not client:
        print("[Supabase] Client not available")
        return []

    try:
        storage = client.storage.from_(BUCKET_NAME)

        # The registry file is the authoritative list of dataset folders.
        dataset_names = _get_dataset_registry(client)
        print(f"[Supabase] Registry contains: {dataset_names}")

        # Legacy fallback: datasets saved before the registry existed are
        # discovered by probing a short list of known folder names.
        if not dataset_names:
            print("[Supabase] Registry empty, checking for existing datasets...")
            for candidate in ['testing']:
                try:
                    storage.download(f"{candidate}/metadata.json")
                except Exception:
                    continue
                dataset_names.append(candidate)
                print(f"[Supabase] Found existing dataset: {candidate}")

        datasets = []
        for folder_name in dataset_names:
            try:
                raw = storage.download(f"{folder_name}/metadata.json")
                entry = json.loads(raw.decode('utf-8'))
                entry['folder_name'] = folder_name
                entry['source'] = 'supabase'
                datasets.append(entry)
                print(f"[Supabase] Loaded metadata for {folder_name}")
            except Exception as e:
                print(f"[Supabase] Could not load metadata for {folder_name}: {e}")
                # Metadata unreadable -- still list the dataset with a stub.
                datasets.append({
                    'name': folder_name,
                    'folder_name': folder_name,
                    'source': 'supabase',
                    'total_photos': 0,
                    'created_at': None
                })

        print(f"[Supabase] Found {len(datasets)} datasets")
        return datasets

    except Exception as e:
        print(f"[Supabase] Error listing datasets: {e}")
        import traceback
        traceback.print_exc()
        return []
266
+
267
+
268
def delete_dataset_from_supabase(dataset_name: str) -> bool:
    """
    Delete a dataset from Supabase Storage.

    Args:
        dataset_name: Name of the dataset to delete

    Returns:
        True if successful, False otherwise
    """
    client = get_supabase_client()
    if not client:
        print("[Supabase] Client not available")
        return False

    try:
        bucket = client.storage.from_(BUCKET_NAME)

        # Storage has no recursive folder delete: enumerate the folder's
        # objects and remove them in one batch call.
        entries = bucket.list(dataset_name)
        targets = [
            f"{dataset_name}/{entry['name']}"
            for entry in entries
            if entry.get('name')
        ]

        if targets:
            bucket.remove(targets)
            print(f"[Supabase] Deleted {len(targets)} files from '{dataset_name}'")

        # Drop the dataset from the registry index as well.
        _update_dataset_registry(client, dataset_name, action='remove')

        print(f"[Supabase] Dataset '{dataset_name}' deleted successfully")
        return True

    except Exception as e:
        print(f"[Supabase] Error deleting dataset: {e}")
        return False
303
+
304
+
305
def check_dataset_exists_in_supabase(dataset_name: str) -> bool:
    """
    Check if a dataset exists in Supabase.

    Args:
        dataset_name: Name of the dataset to check

    Returns:
        True if exists, False otherwise
    """
    client = get_supabase_client()
    if not client:
        return False

    try:
        # A dataset "exists" when its folder holds at least one object.
        files = client.storage.from_(BUCKET_NAME).list(dataset_name)
        return len(files) > 0
    except Exception:
        # Was a bare `except:`, which also swallows KeyboardInterrupt and
        # SystemExit; narrowed to Exception so those still propagate.
        return False
test_single_month.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test script: Select best 40 photos from a single month folder.
3
+ Usage: python test_single_month.py <folder_path> [target_count]
4
+
5
+ Example:
6
+ python test_single_month.py "C:/Photos/2024/January" 40
7
+ """
8
+
9
+ import sys
10
+ import os
11
+ from pathlib import Path
12
+
13
+ # Add project to path
14
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
15
+
16
+ from photo_selector.monthly_selector import MonthlyPhotoSelector
17
+
18
+
19
def test_single_month(folder_path: str, target: int = 40):
    """
    Test photo selection on a single folder.

    Runs the full pipeline -- CLIP embeddings, per-photo scoring, HDBSCAN
    clustering/selection -- and prints a ranked report plus the cluster
    distribution of the selected photos.

    Args:
        folder_path: Path to folder containing photos
        target: Number of photos to select (default 40)

    Returns:
        The list of selected photo dicts, or None when the folder is
        missing or contains no photos.
    """
    folder = Path(folder_path)

    if not folder.exists():
        print(f"Error: Folder not found: {folder}")
        return

    # Collect photos by extension (case-insensitive).
    extensions = {'.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp'}
    photos = [f for f in folder.iterdir() if f.suffix.lower() in extensions]

    print(f"\n{'='*60}")
    print(f"SINGLE MONTH TEST")
    print(f"{'='*60}")
    print(f"Folder: {folder}")
    print(f"Photos found: {len(photos)}")
    print(f"Target selection: {target}")
    print(f"{'='*60}\n")

    if len(photos) == 0:
        print("No photos found in folder!")
        return

    # Initialize selector (loads the CLIP model -- slow on first call).
    print("Initializing selector (loading CLIP model)...")
    selector = MonthlyPhotoSelector()

    # Step 1: Generate embeddings
    print(f"\n[Step 1] Generating CLIP embeddings for {len(photos)} photos...")
    photo_paths = [str(p) for p in photos]
    embeddings = selector.generate_embeddings(photo_paths)
    print(f"Generated embeddings for {len(embeddings)} photos")

    # Step 2: Score photos
    print(f"\n[Step 2] Scoring photos...")
    from photo_selector.scoring import PhotoScorer
    scorer = PhotoScorer()

    scored_photos = []
    for i, photo_path in enumerate(photo_paths):
        # Lightweight progress indicator every 10 photos.
        if (i + 1) % 10 == 0:
            print(f" Scoring {i + 1}/{len(photo_paths)}...")

        # Per-photo quality scores; missing keys default to 0
        # (uniqueness defaults to a neutral 0.5).
        # Fixed: removed dead `emb = embeddings.get(filename)` lookup --
        # the embedding was fetched here but never used.
        scores = scorer.score_photo(photo_path)

        scored_photos.append({
            'filename': Path(photo_path).name,
            'filepath': photo_path,
            'total': scores.get('total', 0),
            'face_quality': scores.get('face_quality', 0),
            'aesthetic_quality': scores.get('aesthetic_quality', 0),
            'emotional_signal': scores.get('emotional_signal', 0),
            'uniqueness': scores.get('uniqueness', 0.5),
            'num_faces': scores.get('num_faces', 0)
        })

    print(f"Scored {len(scored_photos)} photos")

    # Step 3: Cluster and select using HDBSCAN
    print(f"\n[Step 3] Running HDBSCAN clustering and selection...")
    selected = selector.select_hybrid_hdbscan(
        scored_photos,
        embeddings,
        target=target
    )

    # Results summary
    print(f"\n{'='*60}")
    print(f"RESULTS")
    print(f"{'='*60}")
    print(f"Total photos: {len(photos)}")
    print(f"Selected: {len(selected)}")
    print(f"{'='*60}\n")

    # Ranked table of selected photos.
    print("Selected photos (ranked by score):\n")
    print(f"{'#':<4} {'Score':>6} {'Faces':>6} {'Cluster':>8} {'Similarity':>10} {'Filename':<40}")
    print("-" * 80)

    for i, photo in enumerate(selected, 1):
        score = photo.get('total', 0) * 100
        faces = photo.get('num_faces', 0)
        cluster = photo.get('cluster_id', -1)
        # cluster_id < 0 marks photos chosen outside any HDBSCAN cluster.
        cluster_label = f"C{cluster}" if cluster >= 0 else "Fallback"
        similarity = photo.get('max_similarity', 0) * 100
        filename = photo.get('filename', '?')[:38]

        print(f"{i:<4} {score:>5.1f}% {faces:>6} {cluster_label:>8} {similarity:>9.1f}% {filename:<40}")

    # Histogram of how many selections came from each cluster.
    print(f"\n{'='*60}")
    print("CLUSTER DISTRIBUTION")
    print(f"{'='*60}")

    cluster_counts = {}
    for photo in selected:
        cid = photo.get('cluster_id', -1)
        cluster_counts[cid] = cluster_counts.get(cid, 0) + 1

    for cid in sorted(cluster_counts.keys()):
        label = f"Cluster {cid}" if cid >= 0 else "Fallback"
        count = cluster_counts[cid]
        bar = "█" * count
        print(f" {label:<12}: {count:>3} {bar}")

    print(f"\n{'='*60}")

    return selected
138
+
139
+
140
+ if __name__ == "__main__":
141
+ if len(sys.argv) < 2:
142
+ print(__doc__)
143
+ print("\nNo folder provided. Please specify a folder path.")
144
+ sys.exit(1)
145
+
146
+ folder_path = sys.argv[1]
147
+ target = int(sys.argv[2]) if len(sys.argv) > 2 else 40
148
+
149
+ test_single_month(folder_path, target)