Update app.py
Browse files
app.py
CHANGED
|
@@ -29,25 +29,15 @@ except subprocess.CalledProcessError as e:
|
|
| 29 |
# Determine the device to use
|
| 30 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 31 |
|
| 32 |
-
# Load
|
| 33 |
try:
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
print("✓
|
| 37 |
except Exception as e:
|
| 38 |
-
print(f"Error loading
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
# Load the large model and processor
|
| 43 |
-
try:
|
| 44 |
-
vision_language_model_large = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-large', trust_remote_code=True).to(device).eval()
|
| 45 |
-
vision_language_processor_large = AutoProcessor.from_pretrained('microsoft/Florence-2-large', trust_remote_code=True)
|
| 46 |
-
print("✓ Large model loaded successfully")
|
| 47 |
-
except Exception as e:
|
| 48 |
-
print(f"Error loading large model: {e}")
|
| 49 |
-
vision_language_model_large = None
|
| 50 |
-
vision_language_processor_large = None
|
| 51 |
|
| 52 |
def load_image_from_url(image_url):
|
| 53 |
"""Load an image from a URL."""
|
|
@@ -88,47 +78,32 @@ def describe_image(uploaded_image, model_choice):
|
|
| 88 |
if uploaded_image is None:
|
| 89 |
return "Please upload an image."
|
| 90 |
|
| 91 |
-
if
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
elif model_choice == "Florence-2-large":
|
| 97 |
-
if vision_language_model_large is None:
|
| 98 |
-
return "Large model failed to load."
|
| 99 |
-
model = vision_language_model_large
|
| 100 |
-
processor = vision_language_processor_large
|
| 101 |
-
else:
|
| 102 |
-
return "Invalid model choice."
|
| 103 |
|
| 104 |
try:
|
| 105 |
return process_image_description(model, processor, uploaded_image)
|
| 106 |
except Exception as e:
|
| 107 |
return f"Error generating caption: {str(e)}"
|
| 108 |
|
| 109 |
-
def describe_image_from_url(image_url, model_choice):
|
| 110 |
"""Generate description from image URL."""
|
| 111 |
try:
|
| 112 |
if not image_url:
|
| 113 |
return {"error": "image_url is required"}
|
| 114 |
|
| 115 |
-
if
|
| 116 |
-
return {"error": "
|
| 117 |
-
|
| 118 |
# Load image from URL
|
| 119 |
image = load_image_from_url(image_url)
|
| 120 |
|
| 121 |
-
#
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
return {"error": "Base model not available"}
|
| 125 |
-
model = vision_language_model_base
|
| 126 |
-
processor = vision_language_processor_base
|
| 127 |
-
else:
|
| 128 |
-
if vision_language_model_large is None:
|
| 129 |
-
return {"error": "Large model not available"}
|
| 130 |
-
model = vision_language_model_large
|
| 131 |
-
processor = vision_language_processor_large
|
| 132 |
|
| 133 |
# Generate caption
|
| 134 |
caption = process_image_description(model, processor, image)
|
|
@@ -147,7 +122,7 @@ def describe_image_from_url(image_url, model_choice):
|
|
| 147 |
IMAGE_SERVER_BASE = os.getenv("IMAGE_SERVER_BASE", " ")
|
| 148 |
DATA_COLLECTION_BASE = os.getenv("DATA_COLLECTION_BASE", "https://fred808-flow.hf.space")
|
| 149 |
REQUESTER_ID = os.getenv("FLO_REQUESTER_ID", f"florence-2-{os.getpid()}")
|
| 150 |
-
MODEL_CHOICE =
|
| 151 |
|
| 152 |
|
| 153 |
def sanitize_name(name: str, max_len: int = 200) -> str:
|
|
@@ -259,9 +234,7 @@ def background_worker():
|
|
| 259 |
# Wait for model to be ready
|
| 260 |
waited = 0
|
| 261 |
while waited < 120:
|
| 262 |
-
if
|
| 263 |
-
break
|
| 264 |
-
elif MODEL_CHOICE == "Florence-2-large" and vision_language_model_large:
|
| 265 |
break
|
| 266 |
time.sleep(1)
|
| 267 |
waited += 1
|
|
@@ -344,12 +317,8 @@ def background_worker():
|
|
| 344 |
try:
|
| 345 |
pil_img = Image.open(BytesIO(img_bytes)).convert('RGB')
|
| 346 |
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
processor = vision_language_processor_base
|
| 350 |
-
else:
|
| 351 |
-
model = vision_language_model_large
|
| 352 |
-
processor = vision_language_processor_large
|
| 353 |
|
| 354 |
print(f"[BACKGROUND] Generating caption for {filename}")
|
| 355 |
caption = process_image_description(model, processor, pil_img)
|
|
@@ -410,8 +379,8 @@ async def root():
|
|
| 410 |
return {
|
| 411 |
"name": "Florence-2 Image Captioning Server",
|
| 412 |
"status": "running",
|
| 413 |
-
"
|
| 414 |
-
"
|
| 415 |
"device": device
|
| 416 |
}
|
| 417 |
|
|
@@ -419,21 +388,12 @@ async def root():
|
|
| 419 |
async def health():
|
| 420 |
return {
|
| 421 |
"status": "healthy",
|
| 422 |
-
"
|
| 423 |
-
"
|
| 424 |
"device": device,
|
| 425 |
"model_choice": MODEL_CHOICE
|
| 426 |
}
|
| 427 |
|
| 428 |
-
# Start background worker thread (daemon) so it doesn't block shutdown
|
| 429 |
-
def _start_worker_thread():
|
| 430 |
-
t = threading.Thread(target=background_worker, daemon=True)
|
| 431 |
-
t.start()
|
| 432 |
-
|
| 433 |
-
# Start background worker when FastAPI starts
|
| 434 |
-
@app.on_event("startup")
|
| 435 |
-
async def startup_event():
|
| 436 |
-
_start_worker_thread()
|
| 437 |
|
| 438 |
|
| 439 |
@app.get("/analyze")
|
|
@@ -459,7 +419,6 @@ async def analyze_get(image_url: str = None, model_choice: str = None):
|
|
| 459 |
async def analyze_post(file: UploadFile = File(None), model_choice: str = Form(None)):
|
| 460 |
"""Analyze an uploaded image (multipart/form-data). Returns caption JSON."""
|
| 461 |
try:
|
| 462 |
-
mc = model_choice or MODEL_CHOICE
|
| 463 |
if file is None:
|
| 464 |
raise HTTPException(status_code=400, detail="file is required")
|
| 465 |
|
|
@@ -469,17 +428,11 @@ async def analyze_post(file: UploadFile = File(None), model_choice: str = Form(N
|
|
| 469 |
except Exception as e:
|
| 470 |
raise HTTPException(status_code=400, detail=f"Failed to read uploaded image: {e}")
|
| 471 |
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
processor = vision_language_processor_large
|
| 478 |
-
else:
|
| 479 |
-
if vision_language_model_large is None:
|
| 480 |
-
raise HTTPException(status_code=503, detail="Large model not loaded")
|
| 481 |
-
model = vision_language_model_large
|
| 482 |
-
processor = vision_language_processor_large
|
| 483 |
|
| 484 |
caption = process_image_description(model, processor, pil_img)
|
| 485 |
return JSONResponse(content={"success": True, "caption": caption})
|
|
|
|
| 29 |
# Determine the device to use
|
| 30 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 31 |
|
| 32 |
+
# Load Florence-2-large model and processor
|
| 33 |
try:
|
| 34 |
+
vision_language_model = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-large', trust_remote_code=True).to(device).eval()
|
| 35 |
+
vision_language_processor = AutoProcessor.from_pretrained('microsoft/Florence-2-large', trust_remote_code=True)
|
| 36 |
+
print("✓ Florence-2-large model loaded successfully")
|
| 37 |
except Exception as e:
|
| 38 |
+
print(f"Error loading Florence-2-large model: {e}")
|
| 39 |
+
vision_language_model = None
|
| 40 |
+
vision_language_processor = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
def load_image_from_url(image_url):
|
| 43 |
"""Load an image from a URL."""
|
|
|
|
| 78 |
if uploaded_image is None:
|
| 79 |
return "Please upload an image."
|
| 80 |
|
| 81 |
+
if vision_language_model is None:
|
| 82 |
+
return "Florence-2-large model failed to load."
|
| 83 |
+
|
| 84 |
+
model = vision_language_model
|
| 85 |
+
processor = vision_language_processor
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
try:
|
| 88 |
return process_image_description(model, processor, uploaded_image)
|
| 89 |
except Exception as e:
|
| 90 |
return f"Error generating caption: {str(e)}"
|
| 91 |
|
| 92 |
+
def describe_image_from_url(image_url, model_choice=None):
|
| 93 |
"""Generate description from image URL."""
|
| 94 |
try:
|
| 95 |
if not image_url:
|
| 96 |
return {"error": "image_url is required"}
|
| 97 |
|
| 98 |
+
if vision_language_model is None:
|
| 99 |
+
return {"error": "Florence-2-large model not available"}
|
| 100 |
+
|
| 101 |
# Load image from URL
|
| 102 |
image = load_image_from_url(image_url)
|
| 103 |
|
| 104 |
+
# Use the loaded large model
|
| 105 |
+
model = vision_language_model
|
| 106 |
+
processor = vision_language_processor
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
# Generate caption
|
| 109 |
caption = process_image_description(model, processor, image)
|
|
|
|
| 122 |
IMAGE_SERVER_BASE = os.getenv("IMAGE_SERVER_BASE", " ")
|
| 123 |
DATA_COLLECTION_BASE = os.getenv("DATA_COLLECTION_BASE", "https://fred808-flow.hf.space")
|
| 124 |
REQUESTER_ID = os.getenv("FLO_REQUESTER_ID", f"florence-2-{os.getpid()}")
|
| 125 |
+
MODEL_CHOICE = "Florence-2-large" # Always use large model
|
| 126 |
|
| 127 |
|
| 128 |
def sanitize_name(name: str, max_len: int = 200) -> str:
|
|
|
|
| 234 |
# Wait for model to be ready
|
| 235 |
waited = 0
|
| 236 |
while waited < 120:
|
| 237 |
+
if vision_language_model is not None:
|
|
|
|
|
|
|
| 238 |
break
|
| 239 |
time.sleep(1)
|
| 240 |
waited += 1
|
|
|
|
| 317 |
try:
|
| 318 |
pil_img = Image.open(BytesIO(img_bytes)).convert('RGB')
|
| 319 |
|
| 320 |
+
model = vision_language_model
|
| 321 |
+
processor = vision_language_processor
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
|
| 323 |
print(f"[BACKGROUND] Generating caption for {filename}")
|
| 324 |
caption = process_image_description(model, processor, pil_img)
|
|
|
|
| 379 |
return {
|
| 380 |
"name": "Florence-2 Image Captioning Server",
|
| 381 |
"status": "running",
|
| 382 |
+
"model": "Florence-2-large",
|
| 383 |
+
"model_loaded": vision_language_model is not None,
|
| 384 |
"device": device
|
| 385 |
}
|
| 386 |
|
|
|
|
| 388 |
async def health():
|
| 389 |
return {
|
| 390 |
"status": "healthy",
|
| 391 |
+
"model": "Florence-2-large",
|
| 392 |
+
"model_loaded": vision_language_model is not None,
|
| 393 |
"device": device,
|
| 394 |
"model_choice": MODEL_CHOICE
|
| 395 |
}
|
| 396 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
|
| 398 |
|
| 399 |
@app.get("/analyze")
|
|
|
|
| 419 |
async def analyze_post(file: UploadFile = File(None), model_choice: str = Form(None)):
|
| 420 |
"""Analyze an uploaded image (multipart/form-data). Returns caption JSON."""
|
| 421 |
try:
|
|
|
|
| 422 |
if file is None:
|
| 423 |
raise HTTPException(status_code=400, detail="file is required")
|
| 424 |
|
|
|
|
| 428 |
except Exception as e:
|
| 429 |
raise HTTPException(status_code=400, detail=f"Failed to read uploaded image: {e}")
|
| 430 |
|
| 431 |
+
if vision_language_model is None:
|
| 432 |
+
raise HTTPException(status_code=503, detail="Florence-2-large model not loaded")
|
| 433 |
+
|
| 434 |
+
model = vision_language_model
|
| 435 |
+
processor = vision_language_processor
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 436 |
|
| 437 |
caption = process_image_description(model, processor, pil_img)
|
| 438 |
return JSONResponse(content={"success": True, "caption": caption})
|