Samfredoly committed on
Commit
d9eb8bf
·
verified ·
1 Parent(s): a50498d

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +451 -0
app.py ADDED
@@ -0,0 +1,451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import torch
3
+ from PIL import Image
4
+ import requests
5
+ from io import BytesIO
6
+ from transformers import AutoProcessor, AutoModelForCausalLM
7
+ import os
8
+ import threading
9
+ import time
10
+ import urllib.parse
11
+ from fastapi import FastAPI, UploadFile, File, HTTPException, Form
12
+ from fastapi.responses import JSONResponse
13
+
14
# FastAPI application instance; the route handlers further down register on it
# through the @app.get / @app.post decorators, and uvicorn serves it when the
# module is run directly.
app = FastAPI(
    title="Florence-2 Image Captioning Server",
    description="Auto-captions images from middleware server using Florence-2"
)
18
+ import threading
19
+ import time
20
+ import urllib.parse
21
+
22
# Attempt to install flash-attn (plus timm and einops) when the module is imported.
#
# Two deliberate choices:
#   * The child process inherits the current environment with the extra flag
#     merged in. Passing a bare ``env`` dict replaces the environment wholesale,
#     which drops PATH/HOME and similar variables that pip relies on.
#   * pip is invoked as ``sys.executable -m pip`` with an argument list (no
#     shell), so the packages land in the interpreter running this app and no
#     shell parsing is involved.
import sys

try:
    subprocess.run(
        [
            sys.executable, "-m", "pip", "install",
            "flash-attn", "--no-build-isolation", "timm", "einops",
        ],
        env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
        check=True,
    )
except (subprocess.CalledProcessError, OSError) as e:
    # Best effort: the app keeps starting even when the install fails.
    print(f"Error installing flash-attn: {e}")
    print("Continuing without flash-attn.")
28
+
29
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Load the Florence-2-large model and its processor once, at import time.
# On any failure both globals end up as None; the rest of the module checks
# ``vision_language_model is None`` to detect that state.
try:
    vision_language_model = AutoModelForCausalLM.from_pretrained(
        'microsoft/Florence-2-large',
        trust_remote_code=True,
    )
    vision_language_model = vision_language_model.to(device).eval()
    vision_language_processor = AutoProcessor.from_pretrained(
        'microsoft/Florence-2-large',
        trust_remote_code=True,
    )
    print("✓ Florence-2-large model loaded successfully")
except Exception as e:
    print(f"Error loading Florence-2-large model: {e}")
    vision_language_model = vision_language_processor = None
41
+
42
def load_image_from_url(image_url):
    """Fetch ``image_url`` over HTTP and return the payload as an RGB PIL image.

    The request uses a 30-second timeout. Any failure (request error, HTTP
    error status, undecodable payload) is re-raised as ``ValueError`` whose
    message embeds the original error text.
    """
    try:
        http_reply = requests.get(image_url, timeout=30)
        http_reply.raise_for_status()
        payload = BytesIO(http_reply.content)
        rgb_image = Image.open(payload).convert('RGB')
    except Exception as e:
        raise ValueError(f"Error loading image from URL: {e}")
    return rgb_image
51
+
52
def process_image_description(model, processor, image):
    """Caption ``image`` with the given model/processor pair.

    ``image`` may be a PIL image or anything ``Image.fromarray`` accepts.
    The "<MORE_DETAILED_CAPTION>" task prompt is used for both generation and
    post-processing, and the value stored under that key in the
    post-processed result is returned.
    """
    task_prompt = "<MORE_DETAILED_CAPTION>"
    pil_image = image if isinstance(image, Image.Image) else Image.fromarray(image)

    # Tokenise the prompt, preprocess the pixels, and move both to the active device.
    batch = processor(text=task_prompt, images=pil_image, return_tensors="pt").to(device)

    generation_kwargs = dict(
        input_ids=batch["input_ids"],
        pixel_values=batch["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )
    # Inference only: gradient tracking is disabled while generating.
    with torch.no_grad():
        token_ids = model.generate(**generation_kwargs)

    # Special tokens are kept in the decoded text that is handed to post-processing.
    decoded = processor.batch_decode(token_ids, skip_special_tokens=False)[0]
    parsed = processor.post_process_generation(
        decoded,
        task=task_prompt,
        image_size=(pil_image.width, pil_image.height),
    )
    return parsed[task_prompt]
75
+
76
def describe_image(uploaded_image, model_choice):
    """Return a caption for ``uploaded_image``, or a human-readable message.

    ``model_choice`` is accepted but not consulted; the module-level
    Florence-2-large model is always used. Problems are reported through the
    returned string rather than by raising.
    """
    if uploaded_image is None:
        return "Please upload an image."
    if vision_language_model is None:
        return "Florence-2-large model failed to load."

    try:
        caption = process_image_description(
            vision_language_model,
            vision_language_processor,
            uploaded_image,
        )
    except Exception as e:
        return f"Error generating caption: {str(e)}"
    return caption
91
+
92
def describe_image_from_url(image_url, model_choice=None):
    """Download the image at ``image_url`` and caption it.

    Returns a dict. On success it carries "status", "model" (the
    ``model_choice`` value echoed back), "caption" and "image_size"; on any
    problem it carries a single "error" key. This function does not raise.
    """
    try:
        if not image_url:
            return {"error": "image_url is required"}
        if vision_language_model is None:
            return {"error": "Florence-2-large model not available"}

        # Fetch the picture, then caption it with the module-level large model.
        picture = load_image_from_url(image_url)
        caption_text = process_image_description(
            vision_language_model,
            vision_language_processor,
            picture,
        )

        dimensions = {"width": picture.width, "height": picture.height}
        return {
            "status": "success",
            "model": model_choice,
            "caption": caption_text,
            "image_size": dimensions,
        }
    except Exception as e:
        return {"error": f"Error processing image: {str(e)}"}
120
+
121
+
122
# Base URL of the middleware image server; the helpers below append paths such
# as /courses, /images/..., /download and /middleware/release/... to it.
# NOTE(review): the fallback is a single space, so without the environment
# variable every URL built from it starts with " " — confirm the intended default.
IMAGE_SERVER_BASE = os.getenv("IMAGE_SERVER_BASE", " ")
# Base URL of the service that receives captions (POSTed to its /submit path).
DATA_COLLECTION_BASE = os.getenv("DATA_COLLECTION_BASE", "https://fred808-flow.hf.space")
# Sent as the requester_id query parameter on release calls; the default embeds
# the current process id.
REQUESTER_ID = os.getenv("FLO_REQUESTER_ID", f"florence-2-{os.getpid()}")
MODEL_CHOICE = "Florence-2-large" # Always use large model
126
+
127
+
128
def sanitize_name(name: str, max_len: int = 200) -> str:
    """Return a filesystem/URL-friendly version of ``name``.

    Whitespace runs become underscores and every character other than ASCII
    letters, digits, ``_``, ``.`` and ``-`` is removed. Names longer than
    ``max_len`` are shortened by trimming the stem so the extension survives;
    if the extension alone is longer than ``max_len`` the whole name is cut
    to ``max_len`` characters so the limit is still honoured.

    Args:
        name: Value to sanitise; it is converted with ``str()`` first.
        max_len: Maximum length of the returned name.

    Returns:
        The sanitised name, or ``"file"`` when nothing is left.
    """
    import re

    name = str(name).strip()
    # replace whitespace runs with underscores
    name = re.sub(r"\s+", "_", name)
    # remove any characters not alphanumeric, dot, dash, or underscore
    name = re.sub(r"[^A-Za-z0-9_.-]", "", name)
    if len(name) > max_len:
        base, ext = os.path.splitext(name)
        if len(ext) <= max_len:
            name = base[: max_len - len(ext)] + ext
        else:
            # A negative slice bound here would trim from the wrong end and
            # leave the result longer than max_len, so cut the name directly.
            name = name[:max_len]
    return name or "file"
140
+
141
def _build_download_url(course: str, video: str, frame: str) -> "tuple[str, str]":
    """Build the middleware ``/download`` URL for one frame.

    The middleware /download endpoint expects the 'file' parameter to be a
    path relative to the course folder (e.g. "video_name/frame.jpg"), and
    frames live under a "{course}_frames" folder, so that suffix is appended
    when ``course`` does not already end with it.

    Args:
        course: Course folder name, with or without the ``_frames`` suffix.
        video: Video folder name inside the course folder.
        frame: Frame file name.

    Returns:
        ``(url, safe_frame)``: the download URL and the sanitised frame name
        that was embedded in it.
    """
    course_dir = course if course.endswith("_frames") else f"{course}_frames"

    # Sanitize each path segment before percent-encoding it.
    safe_course = sanitize_name(course_dir)
    safe_video = sanitize_name(video)
    safe_frame = sanitize_name(frame)

    file_param = f"{safe_video}/{safe_frame}"
    url = f"{IMAGE_SERVER_BASE.rstrip('/')}/download?course={urllib.parse.quote(safe_course, safe='')}&file={urllib.parse.quote(file_param, safe='')}"
    print(f"[BACKGROUND] Built URL: {url}")
    return url, safe_frame
162
+
163
+
164
def _download_bytes(url: str, timeout: int = 30, chunk_size=32768):
    """Stream ``url`` into memory, printing progress along the way.

    Args:
        url: Address to download.
        timeout: Timeout in seconds passed to ``requests.get``.
        chunk_size: Number of bytes requested per streamed chunk.

    Returns:
        ``(data, content_type)`` on success, where ``content_type`` is the
        response's Content-Type header (or None if absent). ``(None, None)``
        on any failure; the error is printed, not raised.
    """
    try:
        print(f"[BACKGROUND] Starting download: {url}")
        # The context manager closes the streamed response even when an error
        # interrupts the transfer part-way through.
        with requests.get(url, timeout=timeout, stream=True) as response:
            response.raise_for_status()
            content = BytesIO()
            total_size = int(response.headers.get('content-length', 0))
            print(f"[BACKGROUND] Total size: {total_size} bytes")

            bytes_read = 0
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    content.write(chunk)
                    bytes_read += len(chunk)
                    if total_size:
                        print(f"\rDownloading: {bytes_read}/{total_size} bytes ({(bytes_read/total_size)*100:.1f}%)", end="", flush=True)
            print()  # New line after progress
            print(f"[BACKGROUND] Download complete: {bytes_read} bytes")
            return content.getvalue(), response.headers.get('content-type')
    except Exception as e:
        print(f"[BACKGROUND] download failed {url}: {e}")
        return None, None
186
+
187
+
188
def _post_submit(caption: str, image_name: str, course: str, image_url: str, image_bytes: bytes):
    """POST a caption and its image to the data-collection ``/submit`` endpoint.

    The image is sent as the multipart file field ``image``; caption, image
    name, course and source URL are sent as form fields.

    Args:
        caption: Generated caption text.
        image_name: File name reported for the image.
        course: Course the image belongs to.
        image_url: URL the image was downloaded from.
        image_bytes: Raw image content.

    Returns:
        ``(status_code, body)`` where ``body`` is the parsed JSON response, or
        the raw response text when it is not JSON. ``(None, None)`` when the
        request itself fails.
    """
    submit_url = f"{DATA_COLLECTION_BASE.rstrip('/')}/submit"
    files = {'image': (image_name, image_bytes, 'application/octet-stream')}
    data = {'caption': caption, 'image_name': image_name, 'course': course, 'image_url': image_url}

    print(f"[BACKGROUND] Submitting to {submit_url}")
    print(f"[BACKGROUND] Image name: {image_name}")
    print(f"[BACKGROUND] Course: {course}")
    print(f"[BACKGROUND] Caption length: {len(caption)} chars")

    try:
        r = requests.post(submit_url, data=data, files=files, timeout=30)
        # requests.Response exposes ``status_code``; there is no ``status``
        # attribute, and reading it raised AttributeError, which the handler
        # below turned into a bogus "POST failed" for every submission.
        print(f"[BACKGROUND] Submit response status: {r.status_code}")
        try:
            resp = r.json()
            print(f"[BACKGROUND] Submit response JSON: {resp}")
            return r.status_code, resp
        except Exception:
            # Body was not JSON; hand back the raw text instead.
            print(f"[BACKGROUND] Submit response text: {r.text}")
            return r.status_code, r.text
    except Exception as e:
        print(f"[BACKGROUND] Submit POST failed: {e}")
        return None, None
211
+
212
+
213
def _release_frame(course: str, video: str, frame: str):
    """POST to the middleware's release/frame endpoint for one frame.

    Each path segment is percent-encoded and ``REQUESTER_ID`` is sent as the
    ``requester_id`` query parameter. Failures are printed and swallowed.
    """
    try:
        server_root = IMAGE_SERVER_BASE.rstrip('/')
        encoded_path = "/".join(
            urllib.parse.quote(segment, safe='') for segment in (course, video, frame)
        )
        release_url = f"{server_root}/middleware/release/frame/{encoded_path}"
        requests.post(release_url, params={"requester_id": REQUESTER_ID}, timeout=10)
    except Exception as e:
        print(f"[BACKGROUND] release frame failed: {e}")
219
+
220
+
221
def _release_course(course: str):
    """POST to the middleware's release/course endpoint for ``course``.

    The course name is percent-encoded and ``REQUESTER_ID`` is sent as the
    ``requester_id`` query parameter. Failures are printed and swallowed.
    """
    try:
        server_root = IMAGE_SERVER_BASE.rstrip('/')
        encoded_course = urllib.parse.quote(course, safe='')
        release_url = f"{server_root}/middleware/release/course/{encoded_course}"
        query = {"requester_id": REQUESTER_ID}
        requests.post(release_url, params=query, timeout=10)
    except Exception as e:
        print(f"[BACKGROUND] release course failed: {e}")
227
+
228
+
229
+ # Background worker implementation
230
def _wait_for_model(timeout_seconds: int = 120) -> bool:
    """Poll once per second until the model global is set; False on timeout."""
    for _ in range(timeout_seconds):
        if vision_language_model is not None:
            return True
        time.sleep(1)
    return False


def _caption_course_image(base_url: str, course: str, img_entry) -> bool:
    """Download, caption and submit one image entry; True if it was processed."""
    # Entries are either dicts carrying a 'filename' key or bare names.
    if isinstance(img_entry, dict):
        filename = img_entry.get('filename')
        if not filename:
            return False
    else:
        filename = str(img_entry)

    download_url = f"{base_url}/images/{urllib.parse.quote(course, safe='')}/{urllib.parse.quote(filename, safe='')}"
    print(f"[BACKGROUND] Downloading {download_url}")

    img_bytes, _ = _download_bytes(download_url)
    if not img_bytes:
        print(f"[BACKGROUND] Failed to download {filename}")
        return False

    try:
        pil_img = Image.open(BytesIO(img_bytes)).convert('RGB')

        print(f"[BACKGROUND] Generating caption for {filename}")
        caption = process_image_description(vision_language_model, vision_language_processor, pil_img)
        print(f"[BACKGROUND] Generated caption for {filename}:")
        print("-" * 40)
        print(caption)
        print("-" * 40)

        print(f"[BACKGROUND] Submitting caption to {DATA_COLLECTION_BASE}/submit")
        status, resp = _post_submit(caption, filename, course, download_url, img_bytes)
        if status and status < 400:
            print(f"[BACKGROUND] Successfully submitted {filename} (status={status})")
            if resp:
                print(f"[BACKGROUND] Response: {resp}")
        else:
            print(f"[BACKGROUND] Failed to submit {filename}: status={status}, response={resp}")
    except Exception as e:
        print(f"[BACKGROUND] Error processing {filename}: {e}")
        return False
    # The image and its bytes are locals of this helper, so they are released
    # on return — before the caller's pause between images.
    return True


def background_worker():
    """Background worker that processes images from the middleware server.

    Waits up to 120 seconds for the model global to be set, then loops
    forever: fetch the course list, take the first course, fetch its image
    list, and caption and submit every image. Errors are printed and followed
    by a short sleep; the loop never exits on its own.
    """
    print("[BACKGROUND] Starting worker, waiting for model...")

    if not _wait_for_model(120):
        print("[BACKGROUND] Model not available after timeout")
        return

    print(f"[BACKGROUND] Model {MODEL_CHOICE} ready, starting processing loop")

    # Normalise once so a trailing slash in the configured base does not yield
    # "//courses"-style URLs (the other helpers in this module do the same).
    base_url = IMAGE_SERVER_BASE.rstrip('/')

    while True:
        try:
            # Get next course
            courses_url = f"{base_url}/courses"
            print(f"[BACKGROUND] Fetching courses from {courses_url}")

            r = requests.get(courses_url, timeout=15)
            r.raise_for_status()
            courses_data = r.json()

            if not courses_data.get('courses'):
                print("[BACKGROUND] No courses found, waiting...")
                time.sleep(3)
                continue

            # Only the first listed course is handled per pass.
            course_entry = courses_data['courses'][0]
            if isinstance(course_entry, dict):
                course = course_entry.get('course_folder')
            else:
                course = str(course_entry)

            if not course:
                print("[BACKGROUND] Invalid course entry")
                time.sleep(2)
                continue

            print(f"[BACKGROUND] Processing course: {course}")

            # Get images list (either {"images": [...]} or a bare list)
            images_url = f"{base_url}/images/{urllib.parse.quote(course, safe='')}"
            r = requests.get(images_url, timeout=15)
            r.raise_for_status()
            images_data = r.json()

            if isinstance(images_data, dict):
                image_list = images_data.get('images', [])
            else:
                image_list = images_data

            if not image_list:
                print(f"[BACKGROUND] No images found for course {course}")
                time.sleep(2)
                continue

            print(f"[BACKGROUND] Found {len(image_list)} images")

            for img_entry in image_list:
                try:
                    if _caption_course_image(base_url, course, img_entry):
                        time.sleep(0.5)  # Small delay between images
                except Exception as e:
                    print(f"[BACKGROUND] Error in image loop: {e}")

            print(f"[BACKGROUND] Completed course {course}")
            time.sleep(1)

        except Exception as e:
            print(f"[BACKGROUND] Error in course loop: {e}")
            time.sleep(5)
367
+
368
+
369
def _start_worker_thread():
    """Launch ``background_worker`` on a daemon thread and return the thread."""
    worker = threading.Thread(target=background_worker, daemon=True)
    worker.start()
    return worker
374
+
375
+
376
+ # FastAPI endpoints for status/health
377
@app.get("/")
async def root():
    """Report the server name, model name, load state and active device."""
    model_ready = vision_language_model is not None
    info = {
        "name": "Florence-2 Image Captioning Server",
        "status": "running",
        "model": "Florence-2-large",
    }
    info["model_loaded"] = model_ready
    info["device"] = device
    return info
386
+
387
@app.get("/health")
async def health():
    """Health probe: always reports "healthy" plus model and device details."""
    model_ready = vision_language_model is not None
    report = {
        "status": "healthy",
        "model": "Florence-2-large",
    }
    report["model_loaded"] = model_ready
    report["device"] = device
    report["model_choice"] = MODEL_CHOICE
    return report
396
+
397
+
398
+
399
@app.get("/analyze")
async def analyze_get(image_url: str = None, model_choice: str = None):
    """Analyze an image by URL. Usage: /analyze?image_url=https://...

    ``model_choice`` is accepted for compatibility, but the loaded
    Florence-2-large model is always the one that runs.

    Responses:
        200: ``{"success": true, "caption": ..., "image_size": {...}}``
        400: ``image_url`` missing, or the image could not be processed
             (``error`` carries the dict returned by the captioning helper).
        500: unexpected failure.
    """
    # Local import: brings the helper into scope without touching the file's
    # import block.
    from fastapi.concurrency import run_in_threadpool

    if not image_url:
        raise HTTPException(status_code=400, detail="image_url query parameter is required")

    try:
        mc = model_choice or MODEL_CHOICE
        # The download and the model inference are blocking calls; running
        # them on a worker thread keeps the event loop free for other
        # requests (e.g. /health) in the meantime.
        # NOTE(review): inference can now overlap with other requests —
        # confirm that is acceptable for the deployed hardware.
        result = await run_in_threadpool(describe_image_from_url, image_url, mc)
        if isinstance(result, dict) and result.get("status") == "success":
            return JSONResponse(content={"success": True, "caption": result.get("caption"), "image_size": result.get("image_size")})
        return JSONResponse(status_code=400, content={"success": False, "error": result})
    except HTTPException:
        raise
    except Exception as e:
        return JSONResponse(status_code=500, content={"success": False, "error": str(e)})
416
+
417
+
418
@app.post("/analyze")
async def analyze_post(file: UploadFile = File(None), model_choice: str = Form(None)):
    """Analyze an uploaded image (multipart/form-data). Returns caption JSON.

    ``model_choice`` is accepted but not used; the loaded Florence-2-large
    model always runs.

    Responses:
        200: ``{"success": true, "caption": ...}``
        400: no file supplied, or the upload is not a readable image.
        503: the model failed to load.
        500: unexpected failure during captioning.
    """
    # Local import: brings the helper into scope without touching the file's
    # import block.
    from fastapi.concurrency import run_in_threadpool

    try:
        if file is None:
            raise HTTPException(status_code=400, detail="file is required")

        content = await file.read()
        try:
            pil_img = Image.open(BytesIO(content)).convert('RGB')
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Failed to read uploaded image: {e}")

        if vision_language_model is None:
            raise HTTPException(status_code=503, detail="Florence-2-large model not loaded")

        # Model inference is a blocking call; running it on a worker thread
        # keeps the event loop responsive while the caption is generated.
        # NOTE(review): inference can now overlap with other requests —
        # confirm that is acceptable for the deployed hardware.
        caption = await run_in_threadpool(
            process_image_description,
            vision_language_model,
            vision_language_processor,
            pil_img,
        )
        return JSONResponse(content={"success": True, "caption": caption})

    except HTTPException:
        raise
    except Exception as e:
        return JSONResponse(status_code=500, content={"success": False, "error": str(e)})
444
+
445
# Get the port from environment variable (for Hugging Face Spaces);
# falls back to 7860 when PORT is unset.
port = int(os.environ.get("PORT", 7860))

# Launch FastAPI with uvicorn when run directly
# NOTE(review): nothing in the visible file calls _start_worker_thread(), so
# the background captioning loop appears never to be started — confirm
# whether that is intentional.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=port)