# Spaces: Running (appears to be file-viewer page-header residue, not part of the manifest)
# Manifest identity: environment name, version string, and a summary.
name: vision-coder
# Quoted so the version is always read as a string.
version: "0.1.0"
# Folded scalar (>): the indented lines below are joined with spaces into
# one paragraph.
description: >
  Screenshot-to-HTML reinforcement learning environment.
  An agent receives a UI screenshot and must generate HTML+CSS
  that visually reproduces the original page.
# Bind address, port, and launch command for the uvicorn-served app.
server:
  host: "0.0.0.0"
  port: 7860
  # The --host/--port flags repeat the host and port keys above; keep the
  # two in sync when either changes.
  start_command: "uvicorn openenv.server.app:app --host 0.0.0.0 --port 7860"
# Task catalogue: one entry per difficulty tier. Every task is limited to
# a single step, and each entry's reset_params repeats its difficulty.
tasks:
  - id: easy
    description: "Simple single-section web pages with minimal layout"
    difficulty: easy
    max_steps: 1
    reset_params:
      difficulty: easy

  - id: medium
    description: "Multi-section pages with navigation bars and content blocks"
    difficulty: medium
    max_steps: 1
    reset_params:
      difficulty: medium

  - id: hard
    description: "Complex pages with forms, tables, and rich interactive layouts"
    difficulty: hard
    max_steps: 1
    reset_params:
      difficulty: hard
# What the agent submits: a single text action containing raw HTML.
action_space:
  type: text
  description: "Raw HTML string (the agent must NOT include markdown fencing)"
  example: "<!DOCTYPE html><html><head>...</head><body>...</body></html>"
# Fields of the observation returned to the agent. Per the descriptions
# below, screenshot_b64 is populated after reset, while reward is only
# populated after step.
observation_space:
  screenshot_b64:
    type: string
    encoding: base64
    format: png
    description: "Base64-encoded PNG screenshot of the target UI (present after reset)"
  prompt:
    type: string
    description: "Task instruction for the agent"
  done:
    type: boolean
    description: "True after step(), False after reset()"
  # Per-observation reward value; the scoring recipe itself lives under the
  # separate top-level `reward` key.
  reward:
    type: number
    range: [0.0, 1.0]
    description: "Composite reward (present after step(), null after reset())"
# Composite reward definition. The component weights below sum to 11.0,
# which matches total_weight; update total_weight whenever a weight changes.
reward:
  range: [0.0, 1.0]
  normalisation: weighted_sum_divided_by_total_weight
  total_weight: 11.0
  components:
    # NOTE(review): this description says markdown fencing should be
    # present, but action_space says the agent must NOT include markdown
    # fencing. One of the two looks stale - confirm against the scorer.
    - name: format
      weight: 0.5
      description: "Markdown fencing present and <html>/DOCTYPE tags found"
    - name: validity
      weight: 0.5
      description: "HTML parseability, structural completeness, tag diversity (>=8 unique tags)"
    - name: structural
      weight: 0.5
      description: "DOM tag-sequence similarity + inline style property coverage vs. reference"
    - name: text_block
      weight: 3.0
      description: "Text block match rate + content similarity via Hungarian matching on IoU"
    - name: position
      weight: 1.0
      description: "Spatial layout accuracy — normalised centre-to-centre distance of matched blocks"
    - name: color
      weight: 1.5
      description: "Perceptual color accuracy via spatial CIEDE2000 on reference non-white pixels"
    - name: clip
      weight: 2.5
      description: "CLIP cosine similarity after rendering (openai/clip-vit-base-patch32, CPU)"
    - name: ssim
      weight: 1.5
      description: "Pixel-level SSIM at 320x240 RGB (skimage) for near-perfect discrimination"
# Environment variables declared for LLM access. All three are marked
# required.
environment_variables:
  - name: HF_TOKEN
    required: true
    description: "Hugging Face / API key for LLM calls (also checked as API_KEY)"
  # NOTE(review): marked required, yet the description documents a default
  # endpoint - confirm whether this should really be mandatory.
  - name: API_BASE_URL
    required: true
    description: "OpenAI-compatible LLM endpoint (default: https://router.huggingface.co/v1)"
  - name: MODEL_NAME
    required: true
    description: "Model identifier to use for LLM calls (must support vision/image inputs)"