File size: 6,494 Bytes
2a729e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
import os
import io
import base64
from pathlib import Path
from typing import Optional, List
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import cv2
import numpy as np
from PIL import Image
import logging

# Configure logging: root logger at INFO, plus a module-level logger used by
# every function in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize FastAPI app; title/description/version are the metadata passed
# to the FastAPI constructor.
app = FastAPI(
    title="OmniParser-v2.0 API",
    description="Extract UI elements and cursor coordinates from screenshots",
    version="1.0.0"
)

# Add CORS middleware with fully permissive settings (any origin, method and
# header, credentials allowed).
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive — confirm this is intended before exposing the service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global OmniParser model (lazy loaded): stays None until load_omniparser()
# assigns it on first use.
omni_parser = None


class ParseRequest(BaseModel):
    """Request model for UI parsing from a base64-encoded image."""
    # Base64 text of the encoded image file (required).
    image_base64: str
    # Extraction flags, both defaulting to True.
    # NOTE(review): neither flag is read by any code visible in this file —
    # confirm whether the parser is expected to honour them.
    extract_text: bool = True
    extract_icons: bool = True


class UIElement(BaseModel):
    """Model for a single UI element reported by the parser."""
    # Identifier of the element within one parse result.
    element_id: int
    # Text label attached to the element (e.g. "Button", "Search").
    label: str
    # Bounding box as four integers.
    bbox: List[int]  # [x1, y1, x2, y2]
    # Category string for the element (e.g. "button", "textfield").
    element_type: str
    # Confidence score for the element.
    # NOTE(review): presumably in the range 0..1 — not enforced here.
    confidence: float


class ParseResponse(BaseModel):
    """Response model for parsing results returned by the parse endpoints."""
    # UI elements found in the image.
    elements: List[UIElement]
    # Dimensions of the parsed image, taken from the PIL image by the endpoints.
    image_width: int
    image_height: int
    # Elapsed time measured by the endpoint, in seconds.
    processing_time: float
    # Name of the model reported to the client; fixed default.
    model_used: str = "OmniParser-v2.0"


def load_omniparser():
    """Return the shared OmniParser handle, creating it on first use.

    The handle lives in the module-level ``omni_parser`` global, so only the
    first call performs any initialization. The "model" is currently a
    placeholder dict, not real model weights.

    Returns:
        The cached placeholder dict describing the model.

    Raises:
        Exception: Any error raised during initialization is logged and
            re-raised unchanged.
    """
    global omni_parser

    # Fast path: already initialized by an earlier call.
    if omni_parser is not None:
        return omni_parser

    try:
        logger.info("Loading OmniParser-v2.0 from HuggingFace...")
        # Placeholder that only demonstrates the structure; replace this with
        # the actual OmniParser import and initialization.
        placeholder_model = {
            "loaded": True,
            "model_name": "microsoft/OmniParser-v2.0",
        }
        omni_parser = placeholder_model
        logger.info("OmniParser loaded successfully")
    except Exception as e:
        logger.error(f"Failed to load OmniParser: {e}")
        raise

    return omni_parser


def extract_image_from_base64(image_base64: str) -> Image.Image:
    """Decode a base64-encoded image into a PIL image.

    Accepts either a bare base64 payload or a data URI such as
    ``data:image/png;base64,<payload>``; for a data URI only the text after
    the first comma is decoded.

    Args:
        image_base64: Base64 text of the encoded image file, optionally
            wrapped in a data URI.

    Returns:
        The image as opened by ``PIL.Image.open``.

    Raises:
        ValueError: If the payload cannot be base64-decoded or opened as an
            image. The underlying exception is attached as ``__cause__``.
    """
    try:
        payload = image_base64
        # Clients often send data URIs; drop the "data:...;base64," header so
        # only the base64 body reaches the decoder.
        if isinstance(payload, str) and payload.startswith("data:"):
            payload = payload.partition(",")[2]
        image_data = base64.b64decode(payload)
        return Image.open(io.BytesIO(image_data))
    except Exception as e:
        # Chain explicitly so the root cause stays visible in tracebacks.
        raise ValueError(f"Failed to decode image: {e}") from e


def parse_ui_elements(image: Image.Image) -> List[UIElement]:
    """Parse UI elements from an image.

    Placeholder implementation: it makes sure the model is loaded, logs the
    image size, and returns two hard-coded mock elements regardless of the
    image content. Replace the mock table with real OmniParser logic.

    Args:
        image: The PIL image to analyse; only its ``size`` is read here.

    Returns:
        A list of ``UIElement`` objects with ids starting at 1.

    Raises:
        Exception: Any error is logged and re-raised unchanged.
    """
    try:
        # Ensure the (lazy) model is available before doing any work.
        load_omniparser()

        logger.info(f"Processing image of size: {image.size}")

        # Mock detections for demonstration: (label, bbox, type, confidence).
        mock_detections = (
            ("Button", [10, 10, 100, 50], "button", 0.95),
            ("Search", [150, 10, 400, 50], "textfield", 0.92),
        )
        return [
            UIElement(
                element_id=position,
                label=text,
                bbox=box,
                element_type=kind,
                confidence=score,
            )
            for position, (text, box, kind, score) in enumerate(mock_detections, start=1)
        ]
    except Exception as e:
        logger.error(f"Error parsing UI elements: {e}")
        raise


@app.get("/")
async def root():
    """Return a short service banner and a list of endpoints."""
    # NOTE(review): /parse-base64 is not included in this listing.
    endpoint_summaries = [
        "/docs - API documentation",
        "/health - Health check",
        "/parse - Parse UI elements from screenshot",
    ]
    banner = {
        "message": "OmniParser-v2.0 API",
        "status": "running",
        "endpoints": endpoint_summaries,
    }
    return banner


@app.get("/health")
async def health_check():
    """Report whether the model can be loaded.

    Returns a "healthy" payload when ``load_omniparser`` succeeds, otherwise
    a 503 JSON response carrying the error text.
    """
    try:
        load_omniparser()
    except Exception as e:
        failure = {"status": "unhealthy", "error": str(e)}
        return JSONResponse(status_code=503, content=failure)
    else:
        return {"status": "healthy", "model": "OmniParser-v2.0"}


@app.post("/parse", response_model=ParseResponse)
async def parse_screenshot(file: UploadFile = File(...)):
    """
    Parse UI elements from an uploaded screenshot.

    - **file**: Image file (PNG, JPG, etc.)

    Returns the detected UI elements with their bounding boxes, the image
    width and height, and the processing time in seconds.
    """
    import time

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    try:
        # Read uploaded file and open it as an image
        contents = await file.read()
        image = Image.open(io.BytesIO(contents))

        # Parse UI elements
        elements = parse_ui_elements(image)

        # Calculate processing time
        processing_time = time.perf_counter() - start_time

        return ParseResponse(
            elements=elements,
            image_width=image.width,
            image_height=image.height,
            processing_time=processing_time
        )
    except Exception as e:
        # NOTE(review): every failure, including server-side ones, is reported
        # to the client as 400 — confirm whether some should map to 5xx.
        logger.error(f"Error in parse endpoint: {e}")
        raise HTTPException(status_code=400, detail=str(e)) from e


@app.post("/parse-base64", response_model=ParseResponse)
async def parse_base64(request: ParseRequest):
    """
    Parse UI elements from a base64-encoded image.

    Request body:

    - **image_base64**: Base64-encoded image string
    - **extract_text**: Extract text from elements (default: True)
    - **extract_icons**: Extract icons (default: True)

    Returns the detected UI elements with their bounding boxes, the image
    width and height, and the processing time in seconds.
    """
    import time

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    try:
        # Decode image
        image = extract_image_from_base64(request.image_base64)

        # Parse UI elements
        # NOTE(review): request.extract_text and request.extract_icons are
        # accepted but not consulted here — confirm the intended behaviour.
        elements = parse_ui_elements(image)

        # Calculate processing time
        processing_time = time.perf_counter() - start_time

        return ParseResponse(
            elements=elements,
            image_width=image.width,
            image_height=image.height,
            processing_time=processing_time
        )
    except Exception as e:
        # NOTE(review): every failure, including server-side ones, is reported
        # to the client as 400 — confirm whether some should map to 5xx.
        logger.error(f"Error in parse-base64 endpoint: {e}")
        raise HTTPException(status_code=400, detail=str(e)) from e


if __name__ == "__main__":
    # Script entry point: only runs when the file is executed directly.
    # uvicorn is imported here rather than at module top.
    import uvicorn
    # Serve the app on host 0.0.0.0, port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)