Update app.py
app.py CHANGED
```diff
@@ -1,30 +1,31 @@
-import gradio as gr
-import subprocess
-import torch
+from fastapi import FastAPI, File, UploadFile, Form
+from fastapi.responses import JSONResponse
 from PIL import Image
+import torch
+import io
 from transformers import AutoProcessor, AutoModelForCausalLM
+import subprocess

-# Attempt to install flash-attn
+# Attempt to install flash-attn (if needed)
 try:
     subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, check=True, shell=True)
 except subprocess.CalledProcessError as e:
     print(f"Error installing flash-attn: {e}")
     print("Continuing without flash-attn.")

-# Determine
+# Determine device
 device = "cuda" if torch.cuda.is_available() else "cpu"

-# Load
+# Load Florence-2 Base
 try:
-    vision_language_model_base = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True,
-                                                                      attn_implementation="eager" ).to(device).eval()
+    vision_language_model_base = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True, attn_implementation="eager").to(device).eval()
     vision_language_processor_base = AutoProcessor.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True)
 except Exception as e:
     print(f"Error loading base model: {e}")
     vision_language_model_base = None
     vision_language_processor_base = None

-# Load
+# Load Florence-2 Large
 try:
     vision_language_model_large = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-large', trust_remote_code=True).to(device).eval()
     vision_language_processor_large = AutoProcessor.from_pretrained('microsoft/Florence-2-large', trust_remote_code=True)
@@ -33,71 +34,60 @@ except Exception as e:
     vision_language_model_large = None
     vision_language_processor_large = None

-def describe_image(uploaded_image, model_choice):
-    """
-    ...
-    """
+# Initialize FastAPI
+app = FastAPI()
+
+@app.post("/describe-image")
+async def describe_image(
+    file: UploadFile = File(...),
+    model_choice: str = Form("Base")
+):
+    if not file.filename.lower().endswith((".jpg", ".jpeg", ".png")):
+        return JSONResponse(status_code=400, content={"error": "Invalid image file type."})
+
+    try:
+        image_bytes = await file.read()
+        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+    except Exception as e:
+        return JSONResponse(status_code=400, content={"error": f"Failed to process image: {str(e)}"})

     if model_choice == "Base":
         if vision_language_model_base is None:
-            return "Base model not loaded."
+            return JSONResponse(status_code=500, content={"error": "Base model not loaded."})
         model = vision_language_model_base
         processor = vision_language_processor_base
     elif model_choice == "Large":
         if vision_language_model_large is None:
-            return "Large model not loaded."
+            return JSONResponse(status_code=500, content={"error": "Large model not loaded."})
         model = vision_language_model_large
         processor = vision_language_processor_large
     else:
-        return "Invalid model choice."
+        return JSONResponse(status_code=400, content={"error": "Invalid model choice."})

-    inputs = processor(text="<MORE_DETAILED_CAPTION>", images=uploaded_image, return_tensors="pt").to(device)
-    generated_ids = model.generate(
-        input_ids=inputs["input_ids"],
-        pixel_values=inputs["pixel_values"],
-        max_new_tokens=1024,
-        early_stopping=False,
-        do_sample=False,
-        num_beams=3,
-    )
-    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
-    processed_description = processor.post_process_generation(
-        generated_text,
-        task="<MORE_DETAILED_CAPTION>",
-        image_size=(uploaded_image.width, uploaded_image.height)
-    )
-    image_description = processed_description["<MORE_DETAILED_CAPTION>"]
-    print("\nImage description generated!:", image_description)
-    return image_description
+    try:
+        inputs = processor(text="<MORE_DETAILED_CAPTION>", images=image, return_tensors="pt").to(device)
+        with torch.no_grad():
+            generated_ids = model.generate(
+                input_ids=inputs["input_ids"],
+                pixel_values=inputs["pixel_values"],
+                max_new_tokens=1024,
+                early_stopping=False,
+                do_sample=False,
+                num_beams=3,
+            )
+        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+        processed_description = processor.post_process_generation(
+            generated_text,
+            task="<MORE_DETAILED_CAPTION>",
+            image_size=(image.width, image.height)
+        )
+        image_description = processed_description["<MORE_DETAILED_CAPTION>"]
+        return JSONResponse(content={"description": image_description})
+    except Exception as e:
+        return JSONResponse(status_code=500, content={"error": f"Image processing failed: {str(e)}"})

-# Description for the interface
-description = "Select the model to use for generating the image description. 'Base' is smaller and faster, while 'Large' is more accurate but slower."
-if device == "cpu":
-    description += " Note: Running on CPU, which may be slow for large models."
-
-interface = gr.Interface(
-    fn=describe_image,
-    inputs=[
-        gr.Image(label="Upload Image", type="pil"),
-        gr.Radio(["Base", "Large"], label="Model Choice", value="Base")
-    ],
-    outputs=gr.Textbox(label="Generated Caption", lines=4, show_copy_button=True),
-    live=False,
-    title="Florence-2 Models Image Captions",
-    description=description
-)
-
-interface.launch()
+@app.get("/health")
+def health():
+    return {"status": "ok", "device": device}
```
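The rewritten app.py exposes the FastAPI instance as `app` but, unlike the old Gradio script, never starts a server itself. A minimal launcher sketch, assuming the file keeps the name app.py and that uvicorn is installed (neither is pinned down by this commit):

```python
# run.py -- hypothetical launcher, not part of this commit
import uvicorn

if __name__ == "__main__":
    # "app:app" = module app.py, variable `app`; 8000 is uvicorn's default port
    uvicorn.run("app:app", host="0.0.0.0", port=8000)
```

Equivalently, `uvicorn app:app --host 0.0.0.0 --port 8000` from the shell.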
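Once a server is up, `/describe-image` expects a multipart file upload plus an optional `model_choice` form field, and `/health` reports the selected device. A client sketch using `requests`, assuming the launcher above is listening on localhost:8000 and that a test.jpg exists:

```python
# client.py -- illustrative only; host, port, and test.jpg are assumptions
import requests

BASE_URL = "http://localhost:8000"

# Liveness probe: should return {"status": "ok", "device": "cuda"} or {"...": "cpu"}
print(requests.get(f"{BASE_URL}/health").json())

# Caption an image with the Base model; pass model_choice="Large" for the slower, more accurate one
with open("test.jpg", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/describe-image",
        files={"file": ("test.jpg", f, "image/jpeg")},
        data={"model_choice": "Base"},
    )
print(resp.status_code, resp.json())  # 200 -> {"description": ...}; errors -> {"error": ...}
```

Note that the endpoint validates the uploaded *filename* extension (.jpg/.jpeg/.png), so the name given in `files` matters, not just the bytes.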
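The same routes can also be exercised in-process with FastAPI's TestClient, with no separate server. A sketch of a smoke test, assuming the Florence-2 weights actually loaded (otherwise the endpoint returns the 500 "not loaded" JSON shown in the diff):

```python
# test_app.py -- hypothetical smoke test, not part of this commit
import io
from fastapi.testclient import TestClient
from PIL import Image

from app import app  # the FastAPI instance defined in app.py

client = TestClient(app)

def test_health():
    assert client.get("/health").json()["status"] == "ok"

def test_rejects_bad_extension():
    # Fails the filename check before any model is touched
    resp = client.post(
        "/describe-image",
        files={"file": ("notes.txt", b"not an image", "text/plain")},
    )
    assert resp.status_code == 400

def test_describe_image_caption():
    # Tiny in-memory PNG; still runs a full beam-search generation, so slow on CPU
    buf = io.BytesIO()
    Image.new("RGB", (64, 64), "white").save(buf, format="PNG")
    resp = client.post(
        "/describe-image",
        files={"file": ("blank.png", buf.getvalue(), "image/png")},
        data={"model_choice": "Base"},
    )
    assert resp.status_code == 200 and "description" in resp.json()
```

TestClient is synchronous and drives the `async` route transparently, which keeps the test runnable with plain pytest.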