# vision-coder-openenv / openenv.yaml
# Last commit by amaljoe88: "deploy: sync 712e5bc -> HF" (cf6c0e0)
# Environment name, version string, and a short summary.
name: vision-coder
version: "0.1.0"
# Folded scalar: the indented lines below are joined with spaces into one string.
description: >
  Screenshot-to-HTML reinforcement learning environment.
  An agent receives a UI screenshot and must generate HTML+CSS
  that visually reproduces the original page.
# HTTP server settings for the environment.
server:
  host: "0.0.0.0"
  port: 7860
  # The command repeats the host and port given above; keep them in sync
  # when changing either.
  start_command: "uvicorn openenv.server.app:app --host 0.0.0.0 --port 7860"
# Task catalogue: one entry per difficulty tier. Every task is limited to a
# single step, and each entry's reset_params repeats its difficulty value.
tasks:
  - id: easy
    description: "Simple single-section web pages with minimal layout"
    difficulty: easy
    max_steps: 1
    reset_params:
      difficulty: easy
  - id: medium
    description: "Multi-section pages with navigation bars and content blocks"
    difficulty: medium
    max_steps: 1
    reset_params:
      difficulty: medium
  - id: hard
    description: "Complex pages with forms, tables, and rich interactive layouts"
    difficulty: hard
    max_steps: 1
    reset_params:
      difficulty: hard
# Action submitted by the agent: a single text value holding the HTML document.
action_space:
  type: text
  description: "Raw HTML string (the agent must NOT include markdown fencing)"
  example: "<!DOCTYPE html><html><head>...</head><body>...</body></html>"
# Fields of the observation returned to the agent; one mapping per field.
observation_space:
  screenshot_b64:
    type: string
    encoding: base64
    format: png
    description: "Base64-encoded PNG screenshot of the target UI (present after reset)"
  prompt:
    type: string
    description: "Task instruction for the agent"
  done:
    type: boolean
    description: "True after step(), False after reset()"
  # Per-observation reward value; the scoring breakdown is declared in the
  # separate top-level `reward` section of this file.
  reward:
    type: number
    range: [0.0, 1.0]
    description: "Composite reward (present after step(), null after reset())"
# Composite reward definition. The component weights below sum to 11.0,
# which matches total_weight.
reward:
  range: [0.0, 1.0]
  normalisation: weighted_sum_divided_by_total_weight
  total_weight: 11.0
  components:
    # NOTE(review): action_space states the agent must NOT include markdown
    # fencing, while this description rewards fencing being present —
    # confirm which behaviour is intended.
    - name: format
      weight: 0.5
      description: "Markdown fencing present and <html>/DOCTYPE tags found"
    - name: validity
      weight: 0.5
      description: "HTML parseability, structural completeness, tag diversity (>=8 unique tags)"
    - name: structural
      weight: 0.5
      description: "DOM tag-sequence similarity + inline style property coverage vs. reference"
    - name: text_block
      weight: 3.0
      description: "Text block match rate + content similarity via Hungarian matching on IoU"
    - name: position
      weight: 1.0
      description: "Spatial layout accuracy — normalised centre-to-centre distance of matched blocks"
    - name: color
      weight: 1.5
      description: "Perceptual color accuracy via spatial CIEDE2000 on reference non-white pixels"
    - name: clip
      weight: 2.5
      description: "CLIP cosine similarity after rendering (openai/clip-vit-base-patch32, CPU)"
    - name: ssim
      weight: 1.5
      description: "Pixel-level SSIM at 320x240 RGB (skimage) for near-perfect discrimination"
# Environment variables declared for LLM access. Values are supplied at
# deploy time; no secrets are stored in this file.
environment_variables:
  - name: HF_TOKEN
    required: true
    description: "Hugging Face / API key for LLM calls (also checked as API_KEY)"
  # NOTE(review): marked required although the description names a default
  # endpoint — confirm whether this should be optional.
  - name: API_BASE_URL
    required: true
    description: "OpenAI-compatible LLM endpoint (default: https://router.huggingface.co/v1)"
  - name: MODEL_NAME
    required: true
    description: "Model identifier to use for LLM calls (must support vision/image inputs)"