Spaces:

Fred808
/

PIL2

Paused

App Files Community

Fred808 committed on Oct 12, 2025

Commit

f4f8231

verified ·

1 Parent(s): 05ab361

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -26

app.py CHANGED Viewed

@@ -33,7 +33,7 @@ class ImageAnalysisResponse(BaseModel):
 # ===== Load Florence-2 Base Model =====
 print("[INFO] Loading Florence-2 model on CPU...")
 try:
-    MODEL_ID = "microsoft/Florence-2-large"
     # Load processor
     processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
@@ -44,7 +44,6 @@ try:
         trust_remote_code=True,
         torch_dtype=torch.float32,
         attn_implementation="eager",  # Force eager attention to avoid SDPA issues
-        device_map=None  # Explicitly set to None for CPU
     )
     # Move to device manually
@@ -54,23 +53,8 @@ try:
     print("[INFO] Model loaded successfully!")
 except Exception as e:
     print(f"[ERROR] Failed to load model: {e}")
-    # Try fallback to base model if large fails
-    try:
-        print("[INFO] Trying Florence-2-base as fallback...")
-        MODEL_ID = "microsoft/Florence-2-base"
-        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            trust_remote_code=True,
-            torch_dtype=torch.float32,
-            attn_implementation="eager",
-            device_map=None
-        ).to(DEVICE).eval()
-        print("[INFO] Fallback model loaded successfully!")
-    except Exception as fallback_error:
-        print(f"[ERROR] Fallback also failed: {fallback_error}")
-        processor = None
-        model = None
 # ===== Helper Functions =====
 def download_image(url: str) -> Image.Image:
@@ -107,39 +91,75 @@ def analyze_image(image: Image.Image) -> str:
         raise ValueError("Model not loaded properly")
     try:
         # Resize image for faster processing
-        image = image.resize(RESIZE_DIM, Image.BILINEAR)
-        # Prepare inputs with hardcoded task
         inputs = processor(
             text=TASK,
             images=image,
-            return_tensors="pt"
-        ).to(DEVICE)
         # Generate caption with error handling
         with torch.no_grad():
             generated_ids = model.generate(
                 input_ids=inputs["input_ids"],
                 pixel_values=inputs["pixel_values"],
-                max_new_tokens=256,  # Reduced for stability
-                num_beams=3,
                 do_sample=False,
                 early_stopping=True,
-                pad_token_id=processor.tokenizer.eos_token_id
             )
         # Decode and clean output
         generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
         # Remove the task prompt from the beginning if present
         if generated_text.startswith(TASK):
             generated_text = generated_text[len(TASK):].strip()
         return generated_text
     except Exception as e:
         print(f"[ERROR] Exception in analyze_image: {e}")
         raise ValueError(f"Failed to analyze image: {e}")
 # ===== API Endpoints =====
@@ -202,6 +222,8 @@ async def analyze_image_endpoint(request: ImageAnalysisRequest):
         )
     except Exception as e:
         print(f"[ERROR] Unexpected error: {e}")
         return ImageAnalysisResponse(
             caption="",
             success=False,
@@ -220,6 +242,31 @@ async def analyze_image_get(image_url: str):
     except Exception as e:
         raise HTTPException(status_code=400, detail=str(e))
 # ===== Main Execution =====
 if __name__ == "__main__":
     port = int(os.getenv("PORT", 7860))

 # ===== Load Florence-2 Base Model =====
 print("[INFO] Loading Florence-2 model on CPU...")
 try:
+    MODEL_ID = "microsoft/Florence-2-base"  # Using base for better compatibility
     # Load processor
     processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
         trust_remote_code=True,
         torch_dtype=torch.float32,
         attn_implementation="eager",  # Force eager attention to avoid SDPA issues
     )
     # Move to device manually
     print("[INFO] Model loaded successfully!")
 except Exception as e:
     print(f"[ERROR] Failed to load model: {e}")
+    processor = None
+    model = None
 # ===== Helper Functions =====
 def download_image(url: str) -> Image.Image:
         raise ValueError("Model not loaded properly")
     try:
+        print(f"[DEBUG] Input image size: {image.size}, mode: {image.mode}")
         # Resize image for faster processing
+        original_size = image.size
+        image = image.resize(RESIZE_DIM, Image.LANCZOS)
+        print(f"[DEBUG] Resized image: {original_size} -> {image.size}")
+        # Prepare inputs with explicit attention mask handling
+        print(f"[DEBUG] Processing image with task: {TASK}")
         inputs = processor(
             text=TASK,
             images=image,
+            return_tensors="pt",
+            padding=True,
+            truncation=True
+        )
+        print(f"[DEBUG] Input keys: {list(inputs.keys())}")
+        print(f"[DEBUG] Pixel values type: {type(inputs.get('pixel_values'))}")
+        if inputs.get('pixel_values') is not None:
+            print(f"[DEBUG] Pixel values shape: {inputs['pixel_values'].shape}")
+        else:
+            print("[DEBUG] Pixel values is None!")
+            raise ValueError("Pixel values are None - image processing failed")
+        # Move to device
+        inputs = {k: v.to(DEVICE) if hasattr(v, 'to') else v for k, v in inputs.items()}
+        # Ensure attention mask is set
+        if 'attention_mask' not in inputs:
+            inputs['attention_mask'] = torch.ones_like(inputs['input_ids'])
+        print(f"[DEBUG] Input IDs shape: {inputs['input_ids'].shape}")
+        print(f"[DEBUG] Attention mask shape: {inputs['attention_mask'].shape}")
+        print(f"[DEBUG] Pixel values device: {inputs['pixel_values'].device}")
         # Generate caption with error handling
+        print("[DEBUG] Starting generation...")
         with torch.no_grad():
             generated_ids = model.generate(
                 input_ids=inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
                 pixel_values=inputs["pixel_values"],
+                max_new_tokens=128,  # Reduced for stability
+                num_beams=2,        # Reduced for CPU
                 do_sample=False,
                 early_stopping=True,
+                pad_token_id=processor.tokenizer.pad_token_id or processor.tokenizer.eos_token_id,
+                eos_token_id=processor.tokenizer.eos_token_id
             )
+        print("[DEBUG] Generation completed")
         # Decode and clean output
         generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        print(f"[DEBUG] Raw generated text: {repr(generated_text)}")
         # Remove the task prompt from the beginning if present
         if generated_text.startswith(TASK):
             generated_text = generated_text[len(TASK):].strip()
+        print(f"[INFO] Final caption: {generated_text}")
         return generated_text
     except Exception as e:
         print(f"[ERROR] Exception in analyze_image: {e}")
+        import traceback
+        print(f"[ERROR] Traceback: {traceback.format_exc()}")
         raise ValueError(f"Failed to analyze image: {e}")
 # ===== API Endpoints =====
         )
     except Exception as e:
         print(f"[ERROR] Unexpected error: {e}")
+        import traceback
+        print(f"[ERROR] Traceback: {traceback.format_exc()}")
         return ImageAnalysisResponse(
             caption="",
             success=False,
     except Exception as e:
         raise HTTPException(status_code=400, detail=str(e))
+# ===== Test Endpoint =====
+@app.post("/test-processor")
+async def test_processor(request: ImageAnalysisRequest):
+    """Test endpoint to debug the processor without full model inference"""
+    try:
+        image = download_image(request.image_url)
+        print(f"[TEST] Image downloaded: {image.size}")
+        # Test just the processor
+        inputs = processor(
+            text=TASK,
+            images=image,
+            return_tensors="pt"
+        )
+        return {
+            "success": True,
+            "input_keys": list(inputs.keys()),
+            "input_ids_shape": inputs["input_ids"].shape if "input_ids" in inputs else None,
+            "pixel_values_shape": inputs["pixel_values"].shape if "pixel_values" in inputs else None,
+            "pixel_values_type": str(inputs["pixel_values"].dtype) if "pixel_values" in inputs else None
+        }
+    except Exception as e:
+        return {"success": False, "error": str(e)}
 # ===== Main Execution =====
 if __name__ == "__main__":
     port = int(os.getenv("PORT", 7860))