Fred808 committed on
Commit
0a3dd67
·
verified ·
1 Parent(s): 71669df

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +450 -497
app.py CHANGED
@@ -1,498 +1,451 @@
1
- import subprocess
2
- import torch
3
- from PIL import Image
4
- import requests
5
- from io import BytesIO
6
- from transformers import AutoProcessor, AutoModelForCausalLM
7
- import os
8
- import threading
9
- import time
10
- import urllib.parse
11
- from fastapi import FastAPI, UploadFile, File, HTTPException, Form
12
- from fastapi.responses import JSONResponse
13
-
14
- app = FastAPI(
15
- title="Florence-2 Image Captioning Server",
16
- description="Auto-captions images from middleware server using Florence-2"
17
- )
18
- import threading
19
- import time
20
- import urllib.parse
21
-
22
- # Attempt to install flash-attn
23
- try:
24
- subprocess.run('pip install flash-attn --no-build-isolation timm einops', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, check=True, shell=True)
25
- except subprocess.CalledProcessError as e:
26
- print(f"Error installing flash-attn: {e}")
27
- print("Continuing without flash-attn.")
28
-
29
- # Determine the device to use
30
- device = "cuda" if torch.cuda.is_available() else "cpu"
31
-
32
- # Load the base model and processor
33
- try:
34
- vision_language_model_base = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True).to(device).eval()
35
- vision_language_processor_base = AutoProcessor.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True)
36
- print("✓ Base model loaded successfully")
37
- except Exception as e:
38
- print(f"Error loading base model: {e}")
39
- vision_language_model_base = None
40
- vision_language_processor_base = None
41
-
42
- # Load the large model and processor
43
- try:
44
- vision_language_model_large = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-large', trust_remote_code=True).to(device).eval()
45
- vision_language_processor_large = AutoProcessor.from_pretrained('microsoft/Florence-2-large', trust_remote_code=True)
46
- print("✓ Large model loaded successfully")
47
- except Exception as e:
48
- print(f"Error loading large model: {e}")
49
- vision_language_model_large = None
50
- vision_language_processor_large = None
51
-
52
- def load_image_from_url(image_url):
53
- """Load an image from a URL."""
54
- try:
55
- response = requests.get(image_url, timeout=30)
56
- response.raise_for_status()
57
- image = Image.open(BytesIO(response.content))
58
- return image.convert('RGB')
59
- except Exception as e:
60
- raise ValueError(f"Error loading image from URL: {e}")
61
-
62
- def process_image_description(model, processor, image):
63
- """Process an image and generate description using the specified model."""
64
- if not isinstance(image, Image.Image):
65
- image = Image.fromarray(image)
66
-
67
- inputs = processor(text="<MORE_DETAILED_CAPTION>", images=image, return_tensors="pt").to(device)
68
- with torch.no_grad():
69
- generated_ids = model.generate(
70
- input_ids=inputs["input_ids"],
71
- pixel_values=inputs["pixel_values"],
72
- max_new_tokens=1024,
73
- early_stopping=False,
74
- do_sample=False,
75
- num_beams=3,
76
- )
77
- generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
78
- processed_description = processor.post_process_generation(
79
- generated_text,
80
- task="<MORE_DETAILED_CAPTION>",
81
- image_size=(image.width, image.height)
82
- )
83
- image_description = processed_description["<MORE_DETAILED_CAPTION>"]
84
- return image_description
85
-
86
- def describe_image(uploaded_image, model_choice):
87
- """Generate description from uploaded image."""
88
- if uploaded_image is None:
89
- return "Please upload an image."
90
-
91
- if model_choice == "Florence-2-base":
92
- if vision_language_model_base is None:
93
- return "Base model failed to load."
94
- model = vision_language_model_base
95
- processor = vision_language_processor_base
96
- elif model_choice == "Florence-2-large":
97
- if vision_language_model_large is None:
98
- return "Large model failed to load."
99
- model = vision_language_model_large
100
- processor = vision_language_processor_large
101
- else:
102
- return "Invalid model choice."
103
-
104
- try:
105
- return process_image_description(model, processor, uploaded_image)
106
- except Exception as e:
107
- return f"Error generating caption: {str(e)}"
108
-
109
- def describe_image_from_url(image_url, model_choice):
110
- """Generate description from image URL."""
111
- try:
112
- if not image_url:
113
- return {"error": "image_url is required"}
114
-
115
- if model_choice not in ["Florence-2-base", "Florence-2-large"]:
116
- return {"error": "Invalid model choice. Use 'Florence-2-base' or 'Florence-2-large'"}
117
-
118
- # Load image from URL
119
- image = load_image_from_url(image_url)
120
-
121
- # Select model and processor
122
- if model_choice == "Florence-2-base":
123
- if vision_language_model_base is None:
124
- return {"error": "Base model not available"}
125
- model = vision_language_model_base
126
- processor = vision_language_processor_base
127
- else:
128
- if vision_language_model_large is None:
129
- return {"error": "Large model not available"}
130
- model = vision_language_model_large
131
- processor = vision_language_processor_large
132
-
133
- # Generate caption
134
- caption = process_image_description(model, processor, image)
135
-
136
- return {
137
- "status": "success",
138
- "model": model_choice,
139
- "caption": caption,
140
- "image_size": {"width": image.width, "height": image.height}
141
- }
142
-
143
- except Exception as e:
144
- return {"error": f"Error processing image: {str(e)}"}
145
-
146
-
147
- IMAGE_SERVER_BASE = os.getenv("IMAGE_SERVER_BASE", " ")
148
- DATA_COLLECTION_BASE = os.getenv("DATA_COLLECTION_BASE", "https://fred808-flow.hf.space")
149
- REQUESTER_ID = os.getenv("FLO_REQUESTER_ID", f"florence-2-{os.getpid()}")
150
- MODEL_CHOICE = os.getenv("FLO_MODEL_CHOICE", "Florence-2-base")
151
-
152
-
153
- def sanitize_name(name: str, max_len: int = 200) -> str:
154
- """Sanitize a filename while preserving extension."""
155
- import re
156
- name = str(name).strip()
157
- # replace spaces with underscores
158
- name = re.sub(r"\s+", "_", name)
159
- # remove any characters not alphanumeric, dot, dash, or underscore
160
- name = re.sub(r"[^A-Za-z0-9_.-]", "", name)
161
- if len(name) > max_len:
162
- base, ext = os.path.splitext(name)
163
- name = base[: max_len - len(ext)] + ext
164
- return name or "file"
165
-
166
- def _build_download_url(course: str, video: str, frame: str) -> str:
167
- """Build download URL with proper encoding of all path segments."""
168
- # The middleware /download endpoint expects the 'file' parameter to be
169
- # a path relative to the course folder (e.g. "video_name/frame.jpg").
170
- # Frames live under a "{base_course}_frames" folder.
171
- base_course = course
172
- if not base_course.endswith("_frames"):
173
- course_dir = f"{base_course}_frames"
174
- else:
175
- course_dir = base_course
176
- base_course = course_dir[:-7] # strip _frames for consistency
177
-
178
- # Sanitize and encode path segments
179
- safe_course = sanitize_name(course_dir)
180
- safe_video = sanitize_name(video)
181
- safe_frame = sanitize_name(frame)
182
-
183
- file_param = f"{safe_video}/{safe_frame}"
184
- url = f"{IMAGE_SERVER_BASE.rstrip('/')}/download?course={urllib.parse.quote(safe_course, safe='')}&file={urllib.parse.quote(file_param, safe='')}"
185
- print(f"[BACKGROUND] Built URL: {url}")
186
- return url, safe_frame
187
-
188
-
189
- def _download_bytes(url: str, timeout: int = 30, chunk_size=32768):
190
- try:
191
- print(f"[BACKGROUND] Starting download: {url}")
192
- response = requests.get(url, timeout=timeout, stream=True)
193
- response.raise_for_status()
194
- content = BytesIO()
195
- total_size = int(response.headers.get('content-length', 0))
196
- print(f"[BACKGROUND] Total size: {total_size} bytes")
197
-
198
- bytes_read = 0
199
- for chunk in response.iter_content(chunk_size=chunk_size):
200
- if chunk:
201
- content.write(chunk)
202
- bytes_read += len(chunk)
203
- if total_size:
204
- print(f"\rDownloading: {bytes_read}/{total_size} bytes ({(bytes_read/total_size)*100:.1f}%)", end="", flush=True)
205
- print() # New line after progress
206
- print(f"[BACKGROUND] Download complete: {bytes_read} bytes")
207
- return content.getvalue(), response.headers.get('content-type')
208
- except Exception as e:
209
- print(f"[BACKGROUND] download failed {url}: {e}")
210
- return None, None
211
-
212
-
213
- def _post_submit(caption: str, image_name: str, course: str, image_url: str, image_bytes: bytes):
214
- submit_url = f"{DATA_COLLECTION_BASE.rstrip('/')}/submit"
215
- files = {'image': (image_name, image_bytes, 'application/octet-stream')}
216
- data = {'caption': caption, 'image_name': image_name, 'course': course, 'image_url': image_url}
217
-
218
- print(f"[BACKGROUND] Submitting to {submit_url}")
219
- print(f"[BACKGROUND] Image name: {image_name}")
220
- print(f"[BACKGROUND] Course: {course}")
221
- print(f"[BACKGROUND] Caption length: {len(caption)} chars")
222
-
223
- try:
224
- r = requests.post(submit_url, data=data, files=files, timeout=30)
225
- print(f"[BACKGROUND] Submit response status: {r.status}")
226
- try:
227
- resp = r.json()
228
- print(f"[BACKGROUND] Submit response JSON: {resp}")
229
- return r.status_code, resp
230
- except Exception:
231
- print(f"[BACKGROUND] Submit response text: {r.text}")
232
- return r.status_code, r.text
233
- except Exception as e:
234
- print(f"[BACKGROUND] Submit POST failed: {e}")
235
- return None, None
236
-
237
-
238
- def _release_frame(course: str, video: str, frame: str):
239
- try:
240
- release_url = f"{IMAGE_SERVER_BASE.rstrip('/')}/middleware/release/frame/{urllib.parse.quote(course, safe='')}/{urllib.parse.quote(video, safe='')}/{urllib.parse.quote(frame, safe='')}"
241
- requests.post(release_url, params={"requester_id": REQUESTER_ID}, timeout=10)
242
- except Exception as e:
243
- print(f"[BACKGROUND] release frame failed: {e}")
244
-
245
-
246
- def _release_course(course: str):
247
- try:
248
- release_url = f"{IMAGE_SERVER_BASE.rstrip('/')}/middleware/release/course/{urllib.parse.quote(course, safe='')}"
249
- requests.post(release_url, params={"requester_id": REQUESTER_ID}, timeout=10)
250
- except Exception as e:
251
- print(f"[BACKGROUND] release course failed: {e}")
252
-
253
-
254
- # Background worker implementation
255
- def background_worker():
256
- """Background worker that processes images from the middleware server."""
257
- print("[BACKGROUND] Starting worker, waiting for model...")
258
-
259
- # Wait for model to be ready
260
- waited = 0
261
- while waited < 120:
262
- if MODEL_CHOICE == "Florence-2-base" and vision_language_model_base:
263
- break
264
- elif MODEL_CHOICE == "Florence-2-large" and vision_language_model_large:
265
- break
266
- time.sleep(1)
267
- waited += 1
268
-
269
- if waited >= 120:
270
- print("[BACKGROUND] Model not available after timeout")
271
- return
272
-
273
- print(f"[BACKGROUND] Model {MODEL_CHOICE} ready, starting processing loop")
274
-
275
- while True:
276
- try:
277
- # Get next course
278
- courses_url = f"{IMAGE_SERVER_BASE}/courses"
279
- print(f"[BACKGROUND] Fetching courses from {courses_url}")
280
-
281
- try:
282
- r = requests.get(courses_url, timeout=15)
283
- r.raise_for_status()
284
- courses_data = r.json()
285
-
286
- if not courses_data.get('courses'):
287
- print("[BACKGROUND] No courses found, waiting...")
288
- time.sleep(3)
289
- continue
290
-
291
- # Get first course
292
- course_entry = courses_data['courses'][0]
293
- if isinstance(course_entry, dict):
294
- course = course_entry.get('course_folder')
295
- else:
296
- course = str(course_entry)
297
-
298
- if not course:
299
- print("[BACKGROUND] Invalid course entry")
300
- time.sleep(2)
301
- continue
302
-
303
- print(f"[BACKGROUND] Processing course: {course}")
304
-
305
- # Get images list
306
- images_url = f"{IMAGE_SERVER_BASE}/images/{urllib.parse.quote(course, safe='')}"
307
- r = requests.get(images_url, timeout=15)
308
- r.raise_for_status()
309
- images_data = r.json()
310
-
311
- if isinstance(images_data, dict):
312
- image_list = images_data.get('images', [])
313
- else:
314
- image_list = images_data
315
-
316
- if not image_list:
317
- print(f"[BACKGROUND] No images found for course {course}")
318
- time.sleep(2)
319
- continue
320
-
321
- print(f"[BACKGROUND] Found {len(image_list)} images")
322
-
323
- # Process images
324
- for img_entry in image_list:
325
- try:
326
- # Extract filename and metadata
327
- if isinstance(img_entry, dict):
328
- filename = img_entry.get('filename')
329
- if not filename:
330
- continue
331
- else:
332
- filename = str(img_entry)
333
-
334
- # Download image
335
- download_url = f"{IMAGE_SERVER_BASE}/images/{urllib.parse.quote(course, safe='')}/{urllib.parse.quote(filename, safe='')}"
336
- print(f"[BACKGROUND] Downloading {download_url}")
337
-
338
- img_bytes, _ = _download_bytes(download_url)
339
- if not img_bytes:
340
- print(f"[BACKGROUND] Failed to download {filename}")
341
- continue
342
-
343
- # Process with Florence
344
- try:
345
- pil_img = Image.open(BytesIO(img_bytes)).convert('RGB')
346
-
347
- if MODEL_CHOICE == "Florence-2-base":
348
- model = vision_language_model_base
349
- processor = vision_language_processor_base
350
- else:
351
- model = vision_language_model_large
352
- processor = vision_language_processor_large
353
-
354
- print(f"[BACKGROUND] Generating caption for {filename}")
355
- caption = process_image_description(model, processor, pil_img)
356
- print(f"[BACKGROUND] Generated caption for {filename}:")
357
- print("-" * 40)
358
- print(caption)
359
- print("-" * 40)
360
-
361
- # Submit result
362
- print(f"[BACKGROUND] Submitting caption to {DATA_COLLECTION_BASE}/submit")
363
- status, resp = _post_submit(caption, filename, course, download_url, img_bytes)
364
- if status and status < 400:
365
- print(f"[BACKGROUND] Successfully submitted {filename} (status={status})")
366
- if resp:
367
- print(f"[BACKGROUND] Response: {resp}")
368
- else:
369
- print(f"[BACKGROUND] Failed to submit {filename}: status={status}, response={resp}")
370
-
371
- except Exception as e:
372
- print(f"[BACKGROUND] Error processing {filename}: {e}")
373
- continue
374
- finally:
375
- # Clean up
376
- if 'pil_img' in locals():
377
- del pil_img
378
- if 'img_bytes' in locals():
379
- del img_bytes
380
-
381
- time.sleep(0.5) # Small delay between images
382
-
383
- except Exception as e:
384
- print(f"[BACKGROUND] Error in image loop: {e}")
385
- continue
386
-
387
- print(f"[BACKGROUND] Completed course {course}")
388
- time.sleep(1)
389
-
390
- except Exception as e:
391
- print(f"[BACKGROUND] Error in course loop: {e}")
392
- time.sleep(5)
393
- continue
394
-
395
- except Exception as e:
396
- print(f"[BACKGROUND] Main loop error: {e}")
397
- time.sleep(5)
398
-
399
-
400
- def _start_worker_thread():
401
- """Start the background worker thread."""
402
- t = threading.Thread(target=background_worker, daemon=True)
403
- t.start()
404
- return t
405
-
406
-
407
- # FastAPI endpoints for status/health
408
- @app.get("/")
409
- async def root():
410
- return {
411
- "name": "Florence-2 Image Captioning Server",
412
- "status": "running",
413
- "model_base": vision_language_model_base is not None,
414
- "model_large": vision_language_model_large is not None,
415
- "device": device
416
- }
417
-
418
- @app.get("/health")
419
- async def health():
420
- return {
421
- "status": "healthy",
422
- "model_base": vision_language_model_base is not None,
423
- "model_large": vision_language_model_large is not None,
424
- "device": device,
425
- "model_choice": MODEL_CHOICE
426
- }
427
-
428
- # Start background worker thread (daemon) so it doesn't block shutdown
429
- def _start_worker_thread():
430
- t = threading.Thread(target=background_worker, daemon=True)
431
- t.start()
432
-
433
- # Start background worker when FastAPI starts
434
- @app.on_event("startup")
435
- async def startup_event():
436
- _start_worker_thread()
437
-
438
-
439
- @app.get("/analyze")
440
- async def analyze_get(image_url: str = None, model_choice: str = None):
441
- """Analyze an image by URL. Usage: /analyze?image_url=https://...&model_choice=Florence-2-base"""
442
- try:
443
- mc = model_choice or MODEL_CHOICE
444
- if image_url:
445
- result = describe_image_from_url(image_url, mc)
446
- if isinstance(result, dict) and result.get("status") == "success":
447
- return JSONResponse(content={"success": True, "caption": result.get("caption"), "image_size": result.get("image_size")})
448
- else:
449
- return JSONResponse(status_code=400, content={"success": False, "error": result})
450
- else:
451
- raise HTTPException(status_code=400, detail="image_url query parameter is required")
452
- except HTTPException:
453
- raise
454
- except Exception as e:
455
- return JSONResponse(status_code=500, content={"success": False, "error": str(e)})
456
-
457
-
458
- @app.post("/analyze")
459
- async def analyze_post(file: UploadFile = File(None), model_choice: str = Form(None)):
460
- """Analyze an uploaded image (multipart/form-data). Returns caption JSON."""
461
- try:
462
- mc = model_choice or MODEL_CHOICE
463
- if file is None:
464
- raise HTTPException(status_code=400, detail="file is required")
465
-
466
- content = await file.read()
467
- try:
468
- pil_img = Image.open(BytesIO(content)).convert('RGB')
469
- except Exception as e:
470
- raise HTTPException(status_code=400, detail=f"Failed to read uploaded image: {e}")
471
-
472
- # Choose model
473
- if mc == "Florence-2-large":
474
- if vision_language_model_large is None:
475
- raise HTTPException(status_code=503, detail="Base model not loaded")
476
- model = vision_language_model_large
477
- processor = vision_language_processor_large
478
- else:
479
- if vision_language_model_large is None:
480
- raise HTTPException(status_code=503, detail="Large model not loaded")
481
- model = vision_language_model_large
482
- processor = vision_language_processor_large
483
-
484
- caption = process_image_description(model, processor, pil_img)
485
- return JSONResponse(content={"success": True, "caption": caption})
486
-
487
- except HTTPException:
488
- raise
489
- except Exception as e:
490
- return JSONResponse(status_code=500, content={"success": False, "error": str(e)})
491
-
492
- # Get the port from environment variable (for Hugging Face Spaces)
493
- port = int(os.environ.get("PORT", 7860))
494
-
495
- # Launch FastAPI with uvicorn when run directly
496
- if __name__ == "__main__":
497
- import uvicorn
498
  uvicorn.run(app, host="0.0.0.0", port=port)
 
1
+ import subprocess
2
+ import torch
3
+ from PIL import Image
4
+ import requests
5
+ from io import BytesIO
6
+ from transformers import AutoProcessor, AutoModelForCausalLM
7
+ import os
8
+ import threading
9
+ import time
10
+ import urllib.parse
11
+ from fastapi import FastAPI, UploadFile, File, HTTPException, Form
12
+ from fastapi.responses import JSONResponse
13
+
14
+ app = FastAPI(
15
+ title="Florence-2 Image Captioning Server",
16
+ description="Auto-captions images from middleware server using Florence-2"
17
+ )
18
+ import threading
19
+ import time
20
+ import urllib.parse
21
+
22
# Attempt to install flash-attn (best effort: the app keeps running without it).
try:
    subprocess.run(
        'pip install flash-attn --no-build-isolation timm einops',
        # Extend the inherited environment instead of replacing it. Passing a
        # bare dict as ``env`` gives the child ONLY that variable, dropping
        # PATH, HOME, proxy settings, etc. that pip may rely on.
        env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
        check=True,
        shell=True,
    )
except subprocess.CalledProcessError as e:
    print(f"Error installing flash-attn: {e}")
    print("Continuing without flash-attn.")
28
+
29
+ # Determine the device to use
30
+ device = "cuda" if torch.cuda.is_available() else "cpu"
31
+
32
+ # Load Florence-2-large model and processor
33
+ try:
34
+ vision_language_model = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-large', trust_remote_code=True).to(device).eval()
35
+ vision_language_processor = AutoProcessor.from_pretrained('microsoft/Florence-2-large', trust_remote_code=True)
36
+ print("✓ Florence-2-large model loaded successfully")
37
+ except Exception as e:
38
+ print(f"Error loading Florence-2-large model: {e}")
39
+ vision_language_model = None
40
+ vision_language_processor = None
41
+
42
+ def load_image_from_url(image_url):
43
+ """Load an image from a URL."""
44
+ try:
45
+ response = requests.get(image_url, timeout=30)
46
+ response.raise_for_status()
47
+ image = Image.open(BytesIO(response.content))
48
+ return image.convert('RGB')
49
+ except Exception as e:
50
+ raise ValueError(f"Error loading image from URL: {e}")
51
+
52
+ def process_image_description(model, processor, image):
53
+ """Process an image and generate description using the specified model."""
54
+ if not isinstance(image, Image.Image):
55
+ image = Image.fromarray(image)
56
+
57
+ inputs = processor(text="<MORE_DETAILED_CAPTION>", images=image, return_tensors="pt").to(device)
58
+ with torch.no_grad():
59
+ generated_ids = model.generate(
60
+ input_ids=inputs["input_ids"],
61
+ pixel_values=inputs["pixel_values"],
62
+ max_new_tokens=1024,
63
+ early_stopping=False,
64
+ do_sample=False,
65
+ num_beams=3,
66
+ )
67
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
68
+ processed_description = processor.post_process_generation(
69
+ generated_text,
70
+ task="<MORE_DETAILED_CAPTION>",
71
+ image_size=(image.width, image.height)
72
+ )
73
+ image_description = processed_description["<MORE_DETAILED_CAPTION>"]
74
+ return image_description
75
+
76
+ def describe_image(uploaded_image, model_choice):
77
+ """Generate description from uploaded image."""
78
+ if uploaded_image is None:
79
+ return "Please upload an image."
80
+
81
+ if vision_language_model is None:
82
+ return "Florence-2-large model failed to load."
83
+
84
+ model = vision_language_model
85
+ processor = vision_language_processor
86
+
87
+ try:
88
+ return process_image_description(model, processor, uploaded_image)
89
+ except Exception as e:
90
+ return f"Error generating caption: {str(e)}"
91
+
92
def describe_image_from_url(image_url, model_choice=None):
    """Generate a caption for the image found at ``image_url``.

    Args:
        image_url: URL of the image to caption. Must be non-empty.
        model_choice: Accepted for backward compatibility only. It does not
            select a model; the single loaded Florence-2-large model is
            always used.

    Returns:
        On success, a dict with keys ``status`` ("success"), ``model``,
        ``caption`` and ``image_size`` (``{"width", "height"}``).
        On any failure, a dict with a single ``error`` key. This function
        never raises.
    """
    try:
        if not image_url:
            return {"error": "image_url is required"}

        if vision_language_model is None:
            return {"error": "Florence-2-large model not available"}

        # Load image from URL
        image = load_image_from_url(image_url)

        # Generate caption with the loaded large model
        caption = process_image_description(
            vision_language_model, vision_language_processor, image
        )

        return {
            "status": "success",
            # Report the model that actually produced the caption rather than
            # echoing the caller's (ignored, possibly None) model_choice.
            "model": "Florence-2-large",
            "caption": caption,
            "image_size": {"width": image.width, "height": image.height},
        }

    except Exception as e:
        return {"error": f"Error processing image: {str(e)}"}
120
+
121
+
122
+ IMAGE_SERVER_BASE = os.getenv("IMAGE_SERVER_BASE", " ")
123
+ DATA_COLLECTION_BASE = os.getenv("DATA_COLLECTION_BASE", "https://fred808-flow.hf.space")
124
+ REQUESTER_ID = os.getenv("FLO_REQUESTER_ID", f"florence-2-{os.getpid()}")
125
+ MODEL_CHOICE = "Florence-2-large" # Always use large model
126
+
127
+
128
def sanitize_name(name: str, max_len: int = 200) -> str:
    """Sanitize a filename while preserving its extension where possible.

    Whitespace runs become a single underscore and every character other
    than ASCII letters, digits, ``_``, ``.`` and ``-`` is removed. Names
    longer than ``max_len`` are shortened by trimming the stem so the
    extension survives.

    Args:
        name: Raw name; converted with ``str()`` and stripped first.
        max_len: Maximum length of the returned name.

    Returns:
        The sanitized name, or ``"file"`` if nothing is left.
    """
    import re

    name = str(name).strip()
    # replace spaces with underscores
    name = re.sub(r"\s+", "_", name)
    # remove any characters not alphanumeric, dot, dash, or underscore
    name = re.sub(r"[^A-Za-z0-9_.-]", "", name)
    if len(name) > max_len:
        base, ext = os.path.splitext(name)
        keep = max_len - len(ext)
        if keep >= 0:
            name = base[:keep] + ext
        else:
            # The extension alone exceeds max_len. A negative slice bound
            # would trim the stem from the right and still return an
            # over-long name, so fall back to a hard cut.
            name = name[:max_len]
    return name or "file"
140
+
141
def _build_download_url(course: str, video: str, frame: str) -> "tuple[str, str]":
    """Build the middleware ``/download`` URL for one frame.

    The middleware /download endpoint expects the 'file' parameter to be a
    path relative to the course folder (e.g. "video_name/frame.jpg"), and
    frames live under a "{base_course}_frames" folder, so ``_frames`` is
    appended to ``course`` unless it is already present.

    Args:
        course: Course folder name, with or without the ``_frames`` suffix.
        video: Video folder name inside the course folder.
        frame: Frame file name.

    Returns:
        ``(url, safe_frame)``: the fully encoded download URL and the
        sanitized frame file name that was used to build it.
    """
    course_dir = course if course.endswith("_frames") else f"{course}_frames"

    # Sanitize and encode path segments
    safe_course = sanitize_name(course_dir)
    safe_video = sanitize_name(video)
    safe_frame = sanitize_name(frame)

    file_param = f"{safe_video}/{safe_frame}"
    url = (
        f"{IMAGE_SERVER_BASE.rstrip('/')}/download"
        f"?course={urllib.parse.quote(safe_course, safe='')}"
        f"&file={urllib.parse.quote(file_param, safe='')}"
    )
    print(f"[BACKGROUND] Built URL: {url}")
    return url, safe_frame
162
+
163
+
164
+ def _download_bytes(url: str, timeout: int = 30, chunk_size=32768):
165
+ try:
166
+ print(f"[BACKGROUND] Starting download: {url}")
167
+ response = requests.get(url, timeout=timeout, stream=True)
168
+ response.raise_for_status()
169
+ content = BytesIO()
170
+ total_size = int(response.headers.get('content-length', 0))
171
+ print(f"[BACKGROUND] Total size: {total_size} bytes")
172
+
173
+ bytes_read = 0
174
+ for chunk in response.iter_content(chunk_size=chunk_size):
175
+ if chunk:
176
+ content.write(chunk)
177
+ bytes_read += len(chunk)
178
+ if total_size:
179
+ print(f"\rDownloading: {bytes_read}/{total_size} bytes ({(bytes_read/total_size)*100:.1f}%)", end="", flush=True)
180
+ print() # New line after progress
181
+ print(f"[BACKGROUND] Download complete: {bytes_read} bytes")
182
+ return content.getvalue(), response.headers.get('content-type')
183
+ except Exception as e:
184
+ print(f"[BACKGROUND] download failed {url}: {e}")
185
+ return None, None
186
+
187
+
188
def _post_submit(caption: str, image_name: str, course: str, image_url: str, image_bytes: bytes):
    """POST a caption and its image to the data-collection ``/submit`` endpoint.

    The image is sent as the multipart file field ``image``; ``caption``,
    ``image_name``, ``course`` and ``image_url`` are sent as form fields.

    Returns:
        ``(status_code, body)`` where ``body`` is the decoded JSON response,
        or the raw response text when the body is not valid JSON.
        ``(None, None)`` if the request itself failed; the error is printed,
        not raised.
    """
    submit_url = f"{DATA_COLLECTION_BASE.rstrip('/')}/submit"
    files = {'image': (image_name, image_bytes, 'application/octet-stream')}
    data = {'caption': caption, 'image_name': image_name, 'course': course, 'image_url': image_url}

    print(f"[BACKGROUND] Submitting to {submit_url}")
    print(f"[BACKGROUND] Image name: {image_name}")
    print(f"[BACKGROUND] Course: {course}")
    print(f"[BACKGROUND] Caption length: {len(caption)} chars")

    try:
        r = requests.post(submit_url, data=data, files=files, timeout=30)
        # requests.Response exposes ``status_code``; the previous ``r.status``
        # raised AttributeError, which the handler below swallowed, so every
        # submission was reported as failed even when the server accepted it.
        print(f"[BACKGROUND] Submit response status: {r.status_code}")
        try:
            resp = r.json()
            print(f"[BACKGROUND] Submit response JSON: {resp}")
            return r.status_code, resp
        except Exception:
            print(f"[BACKGROUND] Submit response text: {r.text}")
            return r.status_code, r.text
    except Exception as e:
        print(f"[BACKGROUND] Submit POST failed: {e}")
        return None, None
211
+
212
+
213
def _release_frame(course: str, video: str, frame: str):
    """Ask the middleware to release one frame held by this requester.

    Best effort: any failure, including an HTTP error status, is printed
    and never raised.

    Args:
        course: Course folder name (URL-encoded into the path).
        video: Video folder name (URL-encoded into the path).
        frame: Frame file name (URL-encoded into the path).
    """
    try:
        release_url = f"{IMAGE_SERVER_BASE.rstrip('/')}/middleware/release/frame/{urllib.parse.quote(course, safe='')}/{urllib.parse.quote(video, safe='')}/{urllib.parse.quote(frame, safe='')}"
        response = requests.post(release_url, params={"requester_id": REQUESTER_ID}, timeout=10)
        # Surface 4xx/5xx replies through the log line below instead of
        # silently treating a rejected release as a success.
        response.raise_for_status()
    except Exception as e:
        print(f"[BACKGROUND] release frame failed: {e}")
219
+
220
+
221
def _release_course(course: str):
    """Ask the middleware to release a course held by this requester.

    Best effort: any failure, including an HTTP error status, is printed
    and never raised.

    Args:
        course: Course folder name (URL-encoded into the path).
    """
    try:
        release_url = f"{IMAGE_SERVER_BASE.rstrip('/')}/middleware/release/course/{urllib.parse.quote(course, safe='')}"
        response = requests.post(release_url, params={"requester_id": REQUESTER_ID}, timeout=10)
        # Surface 4xx/5xx replies through the log line below instead of
        # silently treating a rejected release as a success.
        response.raise_for_status()
    except Exception as e:
        print(f"[BACKGROUND] release course failed: {e}")
227
+
228
+
229
+ # Background worker implementation
230
def _wait_for_model(timeout_s: int = 120) -> bool:
    """Poll once per second until the model global is set; False on timeout."""
    for _ in range(timeout_s):
        if vision_language_model is not None:
            return True
        time.sleep(1)
    return vision_language_model is not None


def _process_image(base_url: str, course: str, img_entry) -> bool:
    """Download, caption and submit one image; return True if it was captioned.

    ``img_entry`` is either a dict carrying a ``filename`` key or a value
    that is used as the file name via ``str()``. Entries without a file
    name, failed downloads and captioning errors return False.
    """
    # Extract filename
    if isinstance(img_entry, dict):
        filename = img_entry.get('filename')
        if not filename:
            return False
    else:
        filename = str(img_entry)

    # Download image
    download_url = f"{base_url}/images/{urllib.parse.quote(course, safe='')}/{urllib.parse.quote(filename, safe='')}"
    print(f"[BACKGROUND] Downloading {download_url}")

    img_bytes, _ = _download_bytes(download_url)
    if not img_bytes:
        print(f"[BACKGROUND] Failed to download {filename}")
        return False

    # Process with Florence. The image and its bytes are locals of this
    # helper, so they are released when it returns; no manual ``del`` needed.
    try:
        pil_img = Image.open(BytesIO(img_bytes)).convert('RGB')

        print(f"[BACKGROUND] Generating caption for {filename}")
        caption = process_image_description(vision_language_model, vision_language_processor, pil_img)
        print(f"[BACKGROUND] Generated caption for {filename}:")
        print("-" * 40)
        print(caption)
        print("-" * 40)

        # Submit result
        print(f"[BACKGROUND] Submitting caption to {DATA_COLLECTION_BASE}/submit")
        status, resp = _post_submit(caption, filename, course, download_url, img_bytes)
        if status and status < 400:
            print(f"[BACKGROUND] Successfully submitted {filename} (status={status})")
            if resp:
                print(f"[BACKGROUND] Response: {resp}")
        else:
            print(f"[BACKGROUND] Failed to submit {filename}: status={status}, response={resp}")
    except Exception as e:
        print(f"[BACKGROUND] Error processing {filename}: {e}")
        return False
    return True


def _process_course(base_url: str, course: str) -> None:
    """Fetch the image list of ``course`` and process every entry in order.

    Raises whatever ``requests`` raises when the image list cannot be
    fetched or decoded; per-image errors are logged and skipped.
    """
    print(f"[BACKGROUND] Processing course: {course}")

    # Get images list
    images_url = f"{base_url}/images/{urllib.parse.quote(course, safe='')}"
    r = requests.get(images_url, timeout=15)
    r.raise_for_status()
    images_data = r.json()

    image_list = images_data.get('images', []) if isinstance(images_data, dict) else images_data
    if not image_list:
        print(f"[BACKGROUND] No images found for course {course}")
        time.sleep(2)
        return

    print(f"[BACKGROUND] Found {len(image_list)} images")

    for img_entry in image_list:
        try:
            if _process_image(base_url, course, img_entry):
                time.sleep(0.5)  # Small delay between images
        except Exception as e:
            print(f"[BACKGROUND] Error in image loop: {e}")

    print(f"[BACKGROUND] Completed course {course}")
    time.sleep(1)


def background_worker():
    """Background worker that processes images from the middleware server.

    Waits up to 120 seconds for the model to be available, then loops
    forever: fetch the course list, take the first course, caption each of
    its images and submit the results. Errors are logged and the loop
    retries after a pause. The function returns only if the model never
    becomes available or no image server is configured.
    """
    print("[BACKGROUND] Starting worker, waiting for model...")

    if not _wait_for_model():
        print("[BACKGROUND] Model not available after timeout")
        return

    # Normalize once, like the sibling URL helpers do: surrounding whitespace
    # or a trailing slash would otherwise yield " /courses" or "//courses".
    base_url = IMAGE_SERVER_BASE.strip().rstrip('/')
    if not base_url:
        # The value is read once at import time, so retrying cannot help.
        print("[BACKGROUND] IMAGE_SERVER_BASE is not configured; worker exiting")
        return

    print(f"[BACKGROUND] Model {MODEL_CHOICE} ready, starting processing loop")

    while True:
        try:
            # Get next course
            courses_url = f"{base_url}/courses"
            print(f"[BACKGROUND] Fetching courses from {courses_url}")

            r = requests.get(courses_url, timeout=15)
            r.raise_for_status()
            courses_data = r.json()

            if not courses_data.get('courses'):
                print("[BACKGROUND] No courses found, waiting...")
                time.sleep(3)
                continue

            # Get first course
            course_entry = courses_data['courses'][0]
            if isinstance(course_entry, dict):
                course = course_entry.get('course_folder')
            else:
                course = str(course_entry)

            if not course:
                print("[BACKGROUND] Invalid course entry")
                time.sleep(2)
                continue

            _process_course(base_url, course)

        except Exception as e:
            # A single handler replaces the former nested pair, whose outer
            # "Main loop error" branch was effectively unreachable.
            print(f"[BACKGROUND] Error in course loop: {e}")
            time.sleep(5)
367
+
368
+
369
+ def _start_worker_thread():
370
+ """Start the background worker thread."""
371
+ t = threading.Thread(target=background_worker, daemon=True)
372
+ t.start()
373
+ return t
374
+
375
+
376
+ # FastAPI endpoints for status/health
377
+ @app.get("/")
378
+ async def root():
379
+ return {
380
+ "name": "Florence-2 Image Captioning Server",
381
+ "status": "running",
382
+ "model": "Florence-2-large",
383
+ "model_loaded": vision_language_model is not None,
384
+ "device": device
385
+ }
386
+
387
+ @app.get("/health")
388
+ async def health():
389
+ return {
390
+ "status": "healthy",
391
+ "model": "Florence-2-large",
392
+ "model_loaded": vision_language_model is not None,
393
+ "device": device,
394
+ "model_choice": MODEL_CHOICE
395
+ }
396
+
397
+
398
+
399
+ @app.get("/analyze")
400
+ async def analyze_get(image_url: str = None, model_choice: str = None):
401
+ """Analyze an image by URL. Usage: /analyze?image_url=https://...&model_choice=Florence-2-base"""
402
+ try:
403
+ mc = model_choice or MODEL_CHOICE
404
+ if image_url:
405
+ result = describe_image_from_url(image_url, mc)
406
+ if isinstance(result, dict) and result.get("status") == "success":
407
+ return JSONResponse(content={"success": True, "caption": result.get("caption"), "image_size": result.get("image_size")})
408
+ else:
409
+ return JSONResponse(status_code=400, content={"success": False, "error": result})
410
+ else:
411
+ raise HTTPException(status_code=400, detail="image_url query parameter is required")
412
+ except HTTPException:
413
+ raise
414
+ except Exception as e:
415
+ return JSONResponse(status_code=500, content={"success": False, "error": str(e)})
416
+
417
+
418
+ @app.post("/analyze")
419
+ async def analyze_post(file: UploadFile = File(None), model_choice: str = Form(None)):
420
+ """Analyze an uploaded image (multipart/form-data). Returns caption JSON."""
421
+ try:
422
+ if file is None:
423
+ raise HTTPException(status_code=400, detail="file is required")
424
+
425
+ content = await file.read()
426
+ try:
427
+ pil_img = Image.open(BytesIO(content)).convert('RGB')
428
+ except Exception as e:
429
+ raise HTTPException(status_code=400, detail=f"Failed to read uploaded image: {e}")
430
+
431
+ if vision_language_model is None:
432
+ raise HTTPException(status_code=503, detail="Florence-2-large model not loaded")
433
+
434
+ model = vision_language_model
435
+ processor = vision_language_processor
436
+
437
+ caption = process_image_description(model, processor, pil_img)
438
+ return JSONResponse(content={"success": True, "caption": caption})
439
+
440
+ except HTTPException:
441
+ raise
442
+ except Exception as e:
443
+ return JSONResponse(status_code=500, content={"success": False, "error": str(e)})
444
+
445
+ # Get the port from environment variable (for Hugging Face Spaces)
446
+ port = int(os.environ.get("PORT", 7860))
447
+
448
+ # Launch FastAPI with uvicorn when run directly
449
+ if __name__ == "__main__":
450
+ import uvicorn
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
451
  uvicorn.run(app, host="0.0.0.0", port=port)