Spaces:
Sleeping
Sleeping
File size: 6,494 Bytes
2a729e6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 | import os
import io
import base64
from pathlib import Path
from typing import Optional, List
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import cv2
import numpy as np
from PIL import Image
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize FastAPI app
app = FastAPI(
    title="OmniParser-v2.0 API",
    description="Extract UI elements and cursor coordinates from screenshots",
    version="1.0.0",
)

# Add CORS middleware.
# The API is open to every origin. Credentialed requests are disabled because
# a wildcard origin combined with allow_credentials=True is not a valid CORS
# configuration (FastAPI's docs require explicit origins in that case) and
# would let any site issue credentialed requests. None of the endpoints
# visible in this file use cookies or auth headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global OmniParser model (lazy loaded); populated by load_omniparser()
# on first use and reused afterwards.
omni_parser = None
class ParseRequest(BaseModel):
    """Request body for parsing a base64-encoded image."""

    # Base64 text of the image to parse (required).
    image_base64: str
    # Optional extraction flags; both default to True.
    extract_text: bool = True
    extract_icons: bool = True
class UIElement(BaseModel):
    """A single UI element reported by the parser."""

    element_id: int
    label: str
    # Bounding box given as [x1, y1, x2, y2].
    bbox: List[int]
    element_type: str
    confidence: float
class ParseResponse(BaseModel):
    """Response body returned by the parsing endpoints."""

    elements: List[UIElement]
    image_width: int
    image_height: int
    # Elapsed time in seconds, as measured by the endpoint.
    processing_time: float
    # Reported model name; defaults to the OmniParser release string.
    model_used: str = "OmniParser-v2.0"
def load_omniparser():
    """Return the shared OmniParser handle, creating it on the first call.

    The handle is stored in the module-level ``omni_parser`` global so later
    calls reuse it. Any exception raised while loading is logged and
    re-raised unchanged.
    """
    global omni_parser

    # Fast path: already initialised by an earlier call.
    if omni_parser is not None:
        return omni_parser

    try:
        logger.info("Loading OmniParser-v2.0 from HuggingFace...")
        # Placeholder handle that only demonstrates the expected structure;
        # swap in the real OmniParser initialisation here.
        omni_parser = dict(
            loaded=True,
            model_name="microsoft/OmniParser-v2.0",
        )
        logger.info("OmniParser loaded successfully")
    except Exception as e:
        logger.error(f"Failed to load OmniParser: {e}")
        raise

    return omni_parser
def extract_image_from_base64(image_base64: str) -> Image.Image:
    """Decode a base64 string into a fully loaded PIL image.

    Accepts either a bare base64 payload or a ``data:<mime>;base64,<payload>``
    URI. Whitespace (e.g. line wrapping) inside the payload is ignored; any
    other non-base64 character is rejected instead of being silently dropped.

    Args:
        image_base64: Base64 text of the image, optionally as a data URI.

    Returns:
        The decoded image, with its pixel data already loaded.

    Raises:
        ValueError: If the payload is not valid base64 or is not a readable
            image. The underlying exception is attached as ``__cause__``.
    """
    try:
        payload = image_base64.strip()

        # Clients frequently send a data URI; keep only the part after the
        # first comma, which is the actual base64 payload.
        if payload.startswith("data:"):
            _, separator, payload = payload.partition(",")
            if not separator:
                raise ValueError("data URI has no base64 payload")

        # Drop embedded whitespace, then decode strictly so that corrupt
        # input fails here rather than yielding garbage bytes.
        image_data = base64.b64decode("".join(payload.split()), validate=True)

        image = Image.open(io.BytesIO(image_data))
        # Image.open is lazy; force decoding now so truncated or corrupt
        # pixel data is reported by this function, not by a later caller.
        image.load()
    except Exception as e:
        raise ValueError(f"Failed to decode image: {e}") from e
    return image
def parse_ui_elements(image: Image.Image) -> List[UIElement]:
    """Return the UI elements found in ``image``.

    Currently a placeholder: the model is loaded, the image size is logged,
    and a fixed pair of mock elements is returned. Any exception is logged
    and re-raised unchanged.
    """
    try:
        # Make sure the model handle exists before doing any work.
        load_omniparser()

        logger.info(f"Processing image of size: {image.size}")

        # Mock detections as (id, label, bbox, type, confidence) rows;
        # replace with the real OmniParser output.
        mock_rows = (
            (1, "Button", [10, 10, 100, 50], "button", 0.95),
            (2, "Search", [150, 10, 400, 50], "textfield", 0.92),
        )
        return [
            UIElement(
                element_id=row_id,
                label=row_label,
                bbox=row_bbox,
                element_type=row_type,
                confidence=row_conf,
            )
            for row_id, row_label, row_bbox, row_type, row_conf in mock_rows
        ]
    except Exception as e:
        logger.error(f"Error parsing UI elements: {e}")
        raise
@app.get("/")
async def root():
    """Root endpoint: report service status and list the available endpoints."""
    return {
        "message": "OmniParser-v2.0 API",
        "status": "running",
        "endpoints": [
            "/docs - API documentation",
            "/health - Health check",
            "/parse - Parse UI elements from screenshot",
            # Previously missing from the listing although the route exists.
            "/parse-base64 - Parse UI elements from base64-encoded image",
        ],
    }
@app.get("/health")
async def health_check():
    """Health check: healthy when the model loads, HTTP 503 otherwise."""
    try:
        load_omniparser()
    except Exception as e:
        # Loading failed; report the reason with a 503 status.
        failure_body = {"status": "unhealthy", "error": str(e)}
        return JSONResponse(status_code=503, content=failure_body)

    return {"status": "healthy", "model": "OmniParser-v2.0"}
@app.post("/parse", response_model=ParseResponse)
async def parse_screenshot(file: UploadFile = File(...)):
    """
    Parse UI elements from a screenshot.

    - **file**: Image file (PNG, JPG, etc.)

    Returns the detected UI elements with bounding boxes, together with the
    image dimensions and the processing time in seconds.

    Errors:
    - **400**: the upload is empty or is not a readable image
    - **500**: the parser itself failed
    """
    import time

    # perf_counter is monotonic, so the duration cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()

    # Stage 1: read and decode the upload. Failures here are the client's
    # fault (empty body, not an image, truncated file) -> 400.
    try:
        contents = await file.read()
        if not contents:
            raise ValueError("Uploaded file is empty")
        image = Image.open(io.BytesIO(contents))
        # Image.open is lazy; decode now so corrupt data is caught here.
        image.load()
    except Exception as e:
        logger.error(f"Error in parse endpoint: {e}")
        raise HTTPException(status_code=400, detail=str(e)) from e

    # Stage 2: run the parser. Failures here are server-side -> 500.
    try:
        elements = parse_ui_elements(image)
    except Exception as e:
        logger.exception(f"Error in parse endpoint: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e

    processing_time = time.perf_counter() - start_time
    return ParseResponse(
        elements=elements,
        image_width=image.width,
        image_height=image.height,
        processing_time=processing_time,
    )
@app.post("/parse-base64", response_model=ParseResponse)
async def parse_base64(request: ParseRequest):
    """
    Parse UI elements from base64-encoded image.

    Request body:
    - **image_base64**: Base64-encoded image string
    - **extract_text**: Extract text from elements (default: True)
    - **extract_icons**: Extract icons (default: True)

    Note: **extract_text** and **extract_icons** are accepted but are not
    consulted by this handler yet.

    Errors:
    - **400**: the payload cannot be decoded into an image
    - **500**: the parser itself failed
    """
    import time

    # perf_counter is monotonic, so the duration cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()

    # Stage 1: decode the payload. Failures here are the client's fault
    # (bad base64, not an image, truncated data) -> 400.
    try:
        image = extract_image_from_base64(request.image_base64)
        # Force decoding in case the image was opened lazily; this is a
        # no-op when the pixel data is already loaded.
        image.load()
    except Exception as e:
        logger.error(f"Error in parse-base64 endpoint: {e}")
        raise HTTPException(status_code=400, detail=str(e)) from e

    # Stage 2: run the parser. Failures here are server-side -> 500.
    try:
        elements = parse_ui_elements(image)
    except Exception as e:
        logger.exception(f"Error in parse-base64 endpoint: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e

    processing_time = time.perf_counter() - start_time
    return ParseResponse(
        elements=elements,
        image_width=image.width,
        image_height=image.height,
        processing_time=processing_time,
    )
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces.
    import uvicorn

    bind_host, bind_port = "0.0.0.0", 8000
    uvicorn.run(app, host=bind_host, port=bind_port)
|