File size: 4,448 Bytes
ac185c8 133b29b ac185c8 133b29b ac185c8 133b29b ac185c8 133b29b ac185c8 3e5baf9 133b29b 6c687cf 30f9a65 ac185c8 30f9a65 ac185c8 133b29b ac185c8 133b29b 3e5baf9 133b29b 3e5baf9 133b29b 3e5baf9 133b29b 3e5baf9 133b29b c14f018 133b29b ac185c8 133b29b ac185c8 133b29b ac185c8 133b29b ac185c8 133b29b ac185c8 133b29b ac185c8 133b29b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import os
import json
import time
from typing import Dict
from PIL import Image
from io import BytesIO
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
import uvicorn
# Put Hugging Face Transformers into offline mode (no network lookups).
# NOTE(review): this assignment runs after `transformers` was imported above;
# confirm the library still picks the value up at this point. The loaders in
# load_florence_model() also pass local_files_only=True explicitly.
os.environ["TRANSFORMERS_OFFLINE"] = "1"
# Choose which scaled-dot-product-attention backends PyTorch may use:
# flash kernel off, math and memory-efficient kernels on.
# NOTE(review): these are CUDA backend toggles while DEVICE below is "cpu";
# presumably they have no effect on CPU inference - confirm.
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_math_sdp(True)
torch.backends.cuda.enable_mem_efficient_sdp(True)
# ==== CONFIGURATION ====
# Florence-2 configuration: model identifier passed to from_pretrained() and
# the torch device that the model and the processor outputs are moved to.
MODEL_ID = "microsoft/Florence-2-large"
DEVICE = "cpu"  # Using CPU instead of GPU
# FastAPI application object; the route and startup handlers below attach to it.
app = FastAPI(title="Florence-2 Image Captioning API")
# Module-level handles for the Florence-2 model and processor. Both start as
# None and are assigned by load_florence_model().
model = None
processor = None
def log_message(message: str):
    """Print *message* to stdout, prefixed with the local time in brackets."""
    # Timestamp format: YYYY-MM-DD HH:MM:SS (local time).
    now = time.strftime("%Y-%m-%d %H:%M:%S")
    line = "[{}] {}".format(now, message)
    print(line)
def load_florence_model():
    """Populate the module-level ``model`` and ``processor`` if needed.

    Does nothing when both globals are already set. Otherwise the processor
    and the model are loaded for ``MODEL_ID`` from local files only, the model
    is moved to ``DEVICE`` and switched to eval mode.

    Raises:
        Exception: any error raised while loading is logged and re-raised.
    """
    global model, processor

    # Guard clause: nothing to do once both halves are in memory.
    if model is not None and processor is not None:
        return

    # Options shared by the processor loader and the model loader.
    local_trusted = {"trust_remote_code": True, "local_files_only": True}

    try:
        log_message("[*] Loading Florence-2 model and processor...")
        processor = AutoProcessor.from_pretrained(MODEL_ID, **local_trusted)
        # NOTE(review): the Florence-2 model card loads the model through
        # AutoModelForCausalLM - confirm AutoModelForVision2Seq resolves the
        # remote-code class with the installed transformers version.
        model = AutoModelForVision2Seq.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float32,
            **local_trusted,
        ).to(DEVICE)
        model.eval()
        log_message("[ ] Florence-2 loaded and ready.")
    except Exception as exc:
        log_message(f"[ERROR] Failed to load Florence-2 model: {exc}")
        raise
def caption_image(image: Image.Image) -> str:
    """Return a detailed Florence-2 caption for *image*.

    The image is encoded together with the ``<MORE_DETAILED_CAPTION>`` task
    prompt, token ids are generated without gradient tracking, and the first
    decoded sequence is returned with surrounding whitespace stripped.

    This function does not raise on failure; it returns a sentinel string:
      * ``"Model not loaded."`` when the model or processor global is None,
      * ``"Captioning error."`` when encoding, generation or decoding raises
        (the exception is logged).
    """
    if processor is None or model is None:
        return "Model not loaded."

    # Generation settings: sampling combined with a 3-beam search.
    generation_options = {
        "max_new_tokens": 1350,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "num_beams": 3,
    }

    try:
        # Encode the task prompt and the image, then move tensors to DEVICE.
        encoded = processor(
            text="<MORE_DETAILED_CAPTION>",
            images=image,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        encoded = encoded.to(DEVICE)

        with torch.no_grad():
            token_ids = model.generate(
                input_ids=encoded["input_ids"],
                pixel_values=encoded["pixel_values"],
                **generation_options,
            )

        decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
        return decoded[0].strip()
    except Exception as exc:
        log_message(f"[!] Caption generation failed: {exc}")
        return "Captioning error."
@app.on_event("startup")
async def startup_event():
    """Startup hook registered on the app: load the Florence-2 model."""
    # Exceptions from load_florence_model() are not caught here.
    load_florence_model()
# HTTP status to report for each failure sentinel returned by caption_image().
_CAPTION_FAILURE_STATUS = {
    "Model not loaded.": 503,
    "Captioning error.": 500,
}


def _caption_error_response(status_code: int, message: str) -> JSONResponse:
    """Build the JSON error payload used by the /caption endpoint."""
    return JSONResponse(
        status_code=status_code,
        content={
            "status": "error",
            "message": message,
        },
    )


@app.post("/caption")
async def create_caption(file: UploadFile = File(...)) -> Dict:
    """
    API endpoint to receive an image and return its caption.

    Args:
        file: Uploaded image file (multipart form field ``file``).

    Returns:
        On success, a dict with ``"status": "success"``, the uploaded
        ``"filename"`` and the generated ``"caption"``.
        On failure, a JSONResponse with ``"status": "error"`` and a
        ``"message"``:
          * 400 - the upload could not be decoded as an image,
          * 503 - the captioning model is not loaded,
          * 500 - caption generation failed or another error occurred.
    """
    try:
        log_message(f"[API] Received image: {file.filename}")
        contents = await file.read()

        # A bad upload is the client's fault, so answer 400 rather than 500.
        # PIL reports unidentifiable or truncated data (including an empty
        # upload) as OSError subclasses.
        try:
            image = Image.open(BytesIO(contents)).convert("RGB")
        except OSError as e:
            error_msg = f"Invalid or unreadable image: {e}"
            log_message(f"[ERROR] {error_msg}")
            return _caption_error_response(400, error_msg)

        log_message(f"[API] Generating caption for {file.filename}")
        caption = caption_image(image)

        # caption_image() signals failure with sentinel strings instead of
        # raising; do not pass those off as a successful caption.
        failure_status = _CAPTION_FAILURE_STATUS.get(caption)
        if failure_status is not None:
            log_message(f"[ERROR] Captioning failed for {file.filename}: {caption}")
            return _caption_error_response(failure_status, caption)

        log_message(f"[API] Caption generated for {file.filename}: {caption[:100]}...")
        return {
            "status": "success",
            "filename": file.filename,
            "caption": caption
        }
    except Exception as e:
        # Last-resort boundary: report anything unexpected as a server error.
        error_msg = f"Error processing image: {str(e)}"
        log_message(f"[ERROR] {error_msg}")
        return _caption_error_response(500, error_msg)
if __name__ == "__main__":
    # Script entry point: log a banner, then serve the app with uvicorn.
    log_message("Starting Florence-2 Vision Analysis API Server")
    bind_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **bind_options)