brightening-eyes committed
Commit 835c41f · 1 Parent(s): 1b312ca

initial commit

Files changed (4)
  1. Dockerfile +17 -0
  2. main.py +304 -0
  3. requirements.txt +13 -0
  4. templates/index.html +92 -0
Dockerfile ADDED
@@ -0,0 +1,17 @@
FROM python:3.10.5

# Set the working directory to /code
WORKDIR /code

# Copy the current directory contents into the container at /code
COPY . .

# Upgrade pip
RUN python -m pip install --upgrade pip

# Install requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Start the FastAPI app on port 7860, the default port expected by Spaces
ENTRYPOINT ["uvicorn"]
CMD ["main:app", "--host", "0.0.0.0", "--port", "7860"]
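
# A quick local smoke test (assumption: Spaces builds and runs this image
# automatically, so these commands are only for local development):
#   docker build -t cloudzy-ai .
#   docker run -p 7860:7860 cloudzy-ai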
main.py ADDED
@@ -0,0 +1,304 @@
import os
import uuid
import shutil
import sqlite3
import json
import logging
import numpy as np
import chromadb
import cv2
from datetime import datetime
from typing import List, Optional
from contextlib import asynccontextmanager

# FastAPI & Utilities
from fastapi import FastAPI, UploadFile, File, BackgroundTasks, HTTPException, Request
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel

# AI Libraries
import torch
from PIL import Image
from sentence_transformers import SentenceTransformer
from transformers import BlipProcessor, BlipForConditionalGeneration
from insightface.app import FaceAnalysis

# --- CONFIGURATION ---
UPLOAD_DIR = "static/uploads"
DB_PATH = "photos.db"
CHROMA_PATH = "chroma_db"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Ensure directories exist
os.makedirs(UPLOAD_DIR, exist_ok=True)

# --- LOGGING ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("CloudzyAI")

# --- GLOBAL MODELS (Loaded on Startup) ---
ai_models = {}

@asynccontextmanager
async def lifespan(app: FastAPI):
    # 1. Load CLIP for Semantic Search (Text <-> Image)
    logger.info("Loading CLIP model...")
    ai_models["clip"] = SentenceTransformer('clip-ViT-B-32', device=device)

    # 2. Load BLIP for Captioning
    logger.info("Loading BLIP model...")
    ai_models["blip_processor"] = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    ai_models["blip_model"] = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

    # 3. Load InsightFace for Smart Analysis
    logger.info("Loading InsightFace model...")
    # 'buffalo_l' is a good default model pack. It downloads automatically on first run.
    app_face = FaceAnalysis(name='buffalo_l', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
    app_face.prepare(ctx_id=0, det_size=(640, 640))
    ai_models["face"] = app_face

    # 4. Initialize Database
    init_db()

    yield
    logger.info("Shutting down...")

app = FastAPI(lifespan=lifespan)
app.mount("/static", StaticFiles(directory="static"), name="static")
templates = Jinja2Templates(directory="templates")

# --- DATABASE SETUP (SQLite + ChromaDB) ---
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = chroma_client.get_or_create_collection(name="photo_embeddings")

def init_db():
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS photos (
            id TEXT PRIMARY KEY,
            filename TEXT,
            filepath TEXT,
            upload_date TEXT,
            caption TEXT,
            tags TEXT,
            smart_analysis TEXT,
            status TEXT
        )
    """)
    conn.commit()
    conn.close()

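# Design note: SQLite holds each photo's metadata while ChromaDB holds its CLIP
# image vector under the same id, so a nearest-neighbour hit in /search can be
# joined back to its SQLite row.
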
# --- Pydantic Models ---
class PhotoResponse(BaseModel):
    id: str
    filename: str
    url: str
    caption: Optional[str] = None
    tags: List[str] = []
    smart_features: Optional[dict] = None
    upload_date: str

# --- AI PROCESSING TASKS ---
def process_image_task(photo_id: str, file_path: str):
    """
    Background task that runs the AI pipeline:
    1. Generate Caption (BLIP)
    2. Analyze Faces (InsightFace)
    3. Create Embeddings (CLIP)
    4. Update DBs
    """
    logger.info(f"Starting AI analysis for {photo_id}")

    try:
        # Load Images
        pil_image = Image.open(file_path).convert("RGB")
        cv_image = cv2.imread(file_path)  # InsightFace needs OpenCV (BGR) format

        # A. Captioning (BLIP)
        inputs = ai_models["blip_processor"](pil_image, return_tensors="pt").to(device)
        out = ai_models["blip_model"].generate(**inputs)
        caption = ai_models["blip_processor"].decode(out[0], skip_special_tokens=True)

        # B. Smart Feature: Face Analysis (InsightFace)
        faces = ai_models["face"].get(cv_image)
        face_data = []
        tags = ["ai-generated"]

        if len(faces) > 0:
            avg_age = np.mean([face.age for face in faces])
            gender_counts = {"M": 0, "F": 0}
            for face in faces:
                gender = face.sex  # InsightFace exposes .sex as the string 'M' or 'F'
                gender_counts[gender] += 1
                face_data.append({
                    "age": int(face.age),
                    "gender": gender,
                    "confidence": float(face.det_score)
                })

            # Smart Tagging based on Analysis
            tags.append("person")
            tags.append(f"{len(faces)} people")
            if gender_counts["M"] > gender_counts["F"]:
                tags.append("mostly_male")
            if gender_counts["F"] > gender_counts["M"]:
                tags.append("mostly_female")
            if avg_age < 18:
                tags.append("youth")
            elif avg_age > 60:
                tags.append("senior")
            else:
                tags.append("adult")
        else:
            tags.append("scenery")  # Fallback tag
            face_data = {"message": "No faces detected"}

        # Combine caption words into tags (simple approach)
        tags.extend([word for word in caption.split() if len(word) > 4])
        tags = list(set(tags))  # de-duplicate

        # C. Embedding (CLIP)
        # We embed the IMAGE itself for semantic search
        embedding = ai_models["clip"].encode(pil_image).tolist()

        # D. Save Results
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute("""
            UPDATE photos
            SET caption = ?, tags = ?, smart_analysis = ?, status = 'completed'
            WHERE id = ?
        """, (caption, json.dumps(tags), json.dumps(face_data), photo_id))
        conn.commit()
        conn.close()

        # Save to ChromaDB
        collection.add(
            ids=[photo_id],
            embeddings=[embedding],
            metadatas=[{"caption": caption}]
        )

        logger.info(f"AI processing completed for {photo_id}")

    except Exception as e:
        logger.error(f"Error processing {photo_id}: {e}")
        conn = sqlite3.connect(DB_PATH)
        conn.execute("UPDATE photos SET status = 'failed' WHERE id = ?", (photo_id,))
        conn.commit()
        conn.close()

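# Note: process_image_task is deliberately synchronous; Starlette runs non-async
# background tasks in a worker thread, so the model inference above does not
# block the event loop.
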
# --- API ENDPOINTS ---

@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
    """Serve the UI"""
    return templates.TemplateResponse("index.html", {"request": request})

@app.post("/upload", response_model=PhotoResponse)
async def upload_photo(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
    """
    1. Validate file
    2. Save to disk
    3. Create DB record
    4. Trigger async AI task
    """
    if not file.content_type or not file.content_type.startswith("image/"):
        raise HTTPException(status_code=400, detail="File must be an image")

    file_id = str(uuid.uuid4())
    ext = file.filename.split(".")[-1]
    filename = f"{file_id}.{ext}"
    file_path = os.path.join(UPLOAD_DIR, filename)

    # Save file
    with open(file_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    # Initial DB Record
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute("""
        INSERT INTO photos (id, filename, filepath, upload_date, status)
        VALUES (?, ?, ?, ?, 'processing')
    """, (file_id, file.filename, file_path, datetime.now().isoformat()))
    conn.commit()
    conn.close()

    # Trigger AI in the background
    background_tasks.add_task(process_image_task, file_id, file_path)

    return {
        "id": file_id,
        "filename": file.filename,
        "url": f"/static/uploads/{filename}",
        "upload_date": datetime.now().isoformat()
    }

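# Clients can poll GET /photo/{photo_id} to watch progress: caption stays null
# and tags stay empty until the background task finishes.
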
@app.get("/photo/{photo_id}", response_model=PhotoResponse)
async def get_photo(photo_id: str):
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()
    row = cursor.execute("SELECT * FROM photos WHERE id = ?", (photo_id,)).fetchone()
    conn.close()

    if not row:
        raise HTTPException(status_code=404, detail="Photo not found")

    return {
        "id": row["id"],
        "filename": row["filename"],
        "url": f"/{row['filepath']}",
        "caption": row["caption"],
        "tags": json.loads(row["tags"]) if row["tags"] else [],
        "smart_features": json.loads(row["smart_analysis"]) if row["smart_analysis"] else None,
        "upload_date": row["upload_date"]
    }

@app.get("/search")
async def search_photos(q: str):
    """
    Semantic Search:
    1. Embed query text using CLIP.
    2. Search ChromaDB for nearest image vectors.
    3. Retrieve metadata from SQLite.
    """
    # Embed query text
    query_vec = ai_models["clip"].encode(q).tolist()

    # Query Vector DB
    results = collection.query(
        query_embeddings=[query_vec],
        n_results=5
    )

    ids = results["ids"][0]
    if not ids:
        return []

    # Fetch details from SQLite
    placeholders = ",".join("?" * len(ids))
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()

    # SQL IN (...) does not preserve ChromaDB's ranking, so re-order rows to match
    rows = cursor.execute(f"SELECT * FROM photos WHERE id IN ({placeholders})", ids).fetchall()
    conn.close()
    rows_by_id = {row["id"]: row for row in rows}
    rows = [rows_by_id[i] for i in ids if i in rows_by_id]

    # Format response
    response_data = []
    for row in rows:
        response_data.append({
            "id": row["id"],
            "url": f"/{row['filepath']}",
            "caption": row["caption"],
            "tags": json.loads(row["tags"]) if row["tags"] else [],
            "smart_features": json.loads(row["smart_analysis"]) if row["smart_analysis"] else None,
        })

    return response_data

if __name__ == '__main__':
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
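
# A minimal end-to-end client sketch (assumptions: the server is running locally
# on port 8000 and the `requests` package is installed; it is not listed in
# requirements.txt):
#
#   import requests
#   with open("photo.jpg", "rb") as f:
#       r = requests.post("http://localhost:8000/upload",
#                         files={"file": ("photo.jpg", f, "image/jpeg")})
#   photo_id = r.json()["id"]
#   print(requests.get(f"http://localhost:8000/photo/{photo_id}").json())
#   print(requests.get("http://localhost:8000/search",
#                      params={"q": "a dog in grass"}).json())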
requirements.txt ADDED
@@ -0,0 +1,13 @@
fastapi
uvicorn
python-multipart
chromadb
sentence-transformers
transformers
torch
pillow
insightface
onnxruntime
opencv-python
jinja2
numpy
templates/index.html ADDED
@@ -0,0 +1,92 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Cloudzy AI Photo Manager</title>
    <style>
        body { font-family: sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }
        .upload-box { border: 2px dashed #ccc; padding: 20px; text-align: center; margin-bottom: 20px; }
        .gallery { display: grid; grid-template-columns: repeat(auto-fill, minmax(200px, 1fr)); gap: 20px; }
        .photo-card { border: 1px solid #eee; padding: 10px; border-radius: 8px; }
        .photo-card img { width: 100%; height: 150px; object-fit: cover; border-radius: 4px; }
        .tags { font-size: 0.8em; color: #666; }
        .meta { font-size: 0.7em; color: #888; margin-top: 5px; }
    </style>
</head>
<body>
    <h1>Cloudzy AI Challenge</h1>

    <div class="upload-box">
        <h3>Upload Photo</h3>
        <input type="file" id="fileInput">
        <button onclick="uploadPhoto()">Upload</button>
        <p id="uploadStatus"></p>
    </div>

    <div style="margin-bottom: 20px;">
        <input type="text" id="searchInput" placeholder="Search (e.g., 'dog in grass' or 'happy person')..." style="width: 70%;">
        <button onclick="searchPhotos()">Semantic Search</button>
    </div>

    <div id="gallery" class="gallery"></div>

    <script>
        async function uploadPhoto() {
            const fileInput = document.getElementById('fileInput');
            const status = document.getElementById('uploadStatus');

            if (!fileInput.files[0]) return alert("Select a file!");

            const formData = new FormData();
            formData.append('file', fileInput.files[0]);

            status.innerText = "Uploading...";

            try {
                const res = await fetch('/upload', { method: 'POST', body: formData });
                const data = await res.json();
                status.innerText = "Upload successful! ID: " + data.id + ". Processing AI...";
                setTimeout(searchPhotos, 2000); // Re-run the current search (no-op if the search box is empty)
            } catch (e) {
                status.innerText = "Error uploading.";
            }
        }

        async function searchPhotos() {
            const query = document.getElementById('searchInput').value;
            const gallery = document.getElementById('gallery');

            // The backend has no "list all" endpoint, so an empty query just clears the view
            if (!query) { gallery.innerHTML = ""; return; }

            gallery.innerHTML = "Loading...";

            const res = await fetch(`/search?q=${encodeURIComponent(query)}`);
            const photos = await res.json();

            gallery.innerHTML = "";
            photos.forEach(photo => {
                const div = document.createElement('div');
                div.className = 'photo-card';

                // Parse smart features for display
                let faceInfo = "";
                if (photo.smart_features && Array.isArray(photo.smart_features)) {
                    faceInfo = `${photo.smart_features.length} Face(s) detected`;
                } else if (photo.smart_features && photo.smart_features.message) {
                    faceInfo = photo.smart_features.message;
                }

                div.innerHTML = `
                    <img src="${photo.url}" alt="photo">
                    <p><strong>${photo.caption || "Processing..."}</strong></p>
                    <div class="tags">${photo.tags.slice(0, 5).join(", ")}</div>
                    <div class="meta">${faceInfo}</div>
                `;
                gallery.appendChild(div);
            });
        }
    </script>
</body>
</html>