Update app.py
app.py CHANGED

@@ -33,17 +33,16 @@ class ImageAnalysisResponse(BaseModel):
 # ===== Load Florence-2 Base Model =====
 print("[INFO] Loading Florence-2 model on CPU...")
 try:
-    MODEL_ID = "microsoft/Florence-2-base"
+    MODEL_ID = "microsoft/Florence-2-base"
 
     # Load processor
     processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 
-    # Load model
+    # Load model
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         trust_remote_code=True,
         torch_dtype=torch.float32,
-        attn_implementation="eager",  # Force eager attention to avoid SDPA issues
     )
 
     # Move to device manually
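Note: this hunk drops the attn_implementation="eager" workaround. For reference, a minimal standalone sketch of the load path the file converges on; DEVICE is defined outside this diff and is assumed to be "cpu" here:

    import torch
    from transformers import AutoModelForCausalLM, AutoProcessor

    MODEL_ID = "microsoft/Florence-2-base"
    DEVICE = "cpu"  # assumption: app.py defines DEVICE outside this hunk

    # Florence-2 ships custom modeling code on the Hub, so both the
    # processor and the model need trust_remote_code=True.
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        torch_dtype=torch.float32,  # full precision for CPU inference
    ).to(DEVICE)  # moved to the device manually, as the context line notes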
@@ -60,19 +59,16 @@ except Exception as e:
 def download_image(url: str) -> Image.Image:
     """Download image from URL and return PIL Image"""
     try:
-        # Set headers to mimic browser request
         headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
         }
 
         response = requests.get(str(url), headers=headers, timeout=30)
         response.raise_for_status()
 
-        # Check content length
         if len(response.content) > MAX_IMAGE_SIZE:
-            raise ValueError(f"Image too large: {len(response.content)} bytes
+            raise ValueError(f"Image too large: {len(response.content)} bytes")
 
-        # Check if content is actually an image
         content_type = response.headers.get('content-type', '')
         if not content_type.startswith('image/'):
             raise ValueError(f"URL does not point to an image. Content-Type: {content_type}")
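A quick REPL check of the three guards in download_image (HTTP status, size cap, content type). The URL is a placeholder, and MAX_IMAGE_SIZE is defined outside this diff:

    try:
        img = download_image("https://example.com/sample.jpg")  # placeholder URL
        print(img.size, img.mode)
    except ValueError as e:
        # Raised when the payload exceeds MAX_IMAGE_SIZE or is not image/*
        print(f"rejected: {e}")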
@@ -91,56 +87,38 @@ def analyze_image(image: Image.Image) -> str:
         raise ValueError("Model not loaded properly")
 
     try:
-        print(f"[DEBUG] Input image size: {image.size}
+        print(f"[DEBUG] Input image size: {image.size}")
 
-        # Resize image
-        original_size = image.size
+        # Resize image
         image = image.resize(RESIZE_DIM, Image.LANCZOS)
-        print(f"[DEBUG] Resized image: {original_size} -> {image.size}")
 
-        # Prepare inputs
-        print(f"[DEBUG] Processing image with task: {TASK}")
-
+        # Prepare inputs - use the same approach that worked in the test
         inputs = processor(
             text=TASK,
             images=image,
             return_tensors="pt",
-            padding=True
-            truncation=True
+            padding=True
         )
 
         print(f"[DEBUG] Input keys: {list(inputs.keys())}")
-        print(f"[DEBUG]
-
-            print(f"[DEBUG] Pixel values shape: {inputs['pixel_values'].shape}")
-        else:
-            print("[DEBUG] Pixel values is None!")
-            raise ValueError("Pixel values are None - image processing failed")
+        print(f"[DEBUG] Input IDs shape: {inputs['input_ids'].shape}")
+        print(f"[DEBUG] Pixel values shape: {inputs['pixel_values'].shape}")
 
         # Move to device
-        inputs = {k: v.to(DEVICE)
-
-        # Ensure attention mask is set
-        if 'attention_mask' not in inputs:
-            inputs['attention_mask'] = torch.ones_like(inputs['input_ids'])
+        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
 
-
-        print(f"[DEBUG] Attention mask shape: {inputs['attention_mask'].shape}")
-        print(f"[DEBUG] Pixel values device: {inputs['pixel_values'].device}")
-
-        # Generate caption with error handling
+        # Generate caption - use the specific Florence-2 generation approach
         print("[DEBUG] Starting generation...")
         with torch.no_grad():
             generated_ids = model.generate(
                 input_ids=inputs["input_ids"],
-                attention_mask=inputs["attention_mask"],
                 pixel_values=inputs["pixel_values"],
-                max_new_tokens=
-                num_beams=
+                max_new_tokens=100,
+                num_beams=3,
                 do_sample=False,
                 early_stopping=True,
-
-
+                no_repeat_ngram_size=3,
+                length_penalty=1.0,
             )
 
         print("[DEBUG] Generation completed")
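The hunk ends at generation; the decode step sits below it, outside this diff. On the Florence-2 model card that step looks roughly like this, with TASK being the <MORE_DETAILED_CAPTION> prompt this service hardcodes:

    generated_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False
    )[0]
    # post_process_generation parses the raw output for the given task
    # and returns a dict keyed by the task token.
    parsed = processor.post_process_generation(
        generated_text, task=TASK, image_size=(image.width, image.height)
    )
    caption = parsed[TASK]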
@@ -190,19 +168,16 @@ async def analyze_image_endpoint(request: ImageAnalysisRequest):
     Always uses <MORE_DETAILED_CAPTION> task for detailed image descriptions
     """
     try:
-        # Validate model is loaded
         if not processor or not model:
             raise HTTPException(
                 status_code=503,
                 detail="Model not loaded. Please check server logs."
             )
 
-        # Download and process image
         print(f"[INFO] Processing image from: {request.image_url}")
         image = download_image(request.image_url)
         print(f"[INFO] Image downloaded successfully: {image.size}")
 
-        # Analyze image with hardcoded task
         caption = analyze_image(image)
         print(f"[INFO] Analysis complete")
 
@@ -222,8 +197,6 @@ async def analyze_image_endpoint(request: ImageAnalysisRequest):
         )
     except Exception as e:
         print(f"[ERROR] Unexpected error: {e}")
-        import traceback
-        print(f"[ERROR] Traceback: {traceback.format_exc()}")
         return ImageAnalysisResponse(
             caption="",
             success=False,
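End to end, the handler can be exercised with a small client. The route path is an assumption (the @app.post decorator is outside this diff); the port matches the default at the bottom of the file, and the response fields follow the ImageAnalysisResponse model used above:

    import requests

    resp = requests.post(
        "http://localhost:7860/analyze",  # assumed route; decorator not shown here
        json={"image_url": "https://example.com/sample.jpg"},  # placeholder URL
        timeout=120,  # CPU beam search can take a while
    )
    data = resp.json()
    print(data["success"], data["caption"])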
@@ -242,31 +215,6 @@ async def analyze_image_get(image_url: str):
     except Exception as e:
         raise HTTPException(status_code=400, detail=str(e))
 
-# ===== Test Endpoint =====
-@app.post("/test-processor")
-async def test_processor(request: ImageAnalysisRequest):
-    """Test endpoint to debug the processor without full model inference"""
-    try:
-        image = download_image(request.image_url)
-        print(f"[TEST] Image downloaded: {image.size}")
-
-        # Test just the processor
-        inputs = processor(
-            text=TASK,
-            images=image,
-            return_tensors="pt"
-        )
-
-        return {
-            "success": True,
-            "input_keys": list(inputs.keys()),
-            "input_ids_shape": inputs["input_ids"].shape if "input_ids" in inputs else None,
-            "pixel_values_shape": inputs["pixel_values"].shape if "pixel_values" in inputs else None,
-            "pixel_values_type": str(inputs["pixel_values"].dtype) if "pixel_values" in inputs else None
-        }
-    except Exception as e:
-        return {"success": False, "error": str(e)}
-
 # ===== Main Execution =====
 if __name__ == "__main__":
     port = int(os.getenv("PORT", 7860))
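The deleted /test-processor endpoint still works as a local smoke test; the same processor check, minus FastAPI, is a few lines (sample.jpg is a placeholder path):

    from PIL import Image

    image = Image.open("sample.jpg")  # placeholder path
    inputs = processor(text=TASK, images=image, return_tensors="pt")
    # Confirms the processor yields both text and vision tensors
    print(list(inputs.keys()))
    print(inputs["input_ids"].shape, inputs["pixel_values"].shape)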