---
# Environment manifest for "vision-coder": a screenshot-to-HTML reinforcement
# learning environment. It declares the server endpoint, the available tasks,
# the action/observation spaces, the reward composition, and the environment
# variables the deployment reads.

name: vision-coder
# Quoted so the version stays a string rather than being typed as a number.
version: "0.1.0"
description: >
  Screenshot-to-HTML reinforcement learning environment.
  An agent receives a UI screenshot and must generate HTML+CSS
  that visually reproduces the original page.

# The port appears twice (server.port and inside start_command); keep the two
# values in sync when changing either.
# NOTE(review): start_command is nested under `server` based on key order
# only — confirm against the consumer's schema that it is not a top-level key.
server:
  host: "0.0.0.0"
  port: 7860
  start_command: "uvicorn openenv.server.app:app --host 0.0.0.0 --port 7860"

# One task per difficulty tier. Every task is single-step (max_steps: 1),
# which matches observation_space.done being true after step().
tasks:
  - id: easy
    description: "Simple single-section web pages with minimal layout"
    difficulty: easy
    max_steps: 1
    reset_params:
      difficulty: easy
  - id: medium
    description: "Multi-section pages with navigation bars and content blocks"
    difficulty: medium
    max_steps: 1
    reset_params:
      difficulty: medium
  - id: hard
    description: "Complex pages with forms, tables, and rich interactive layouts"
    difficulty: hard
    max_steps: 1
    reset_params:
      difficulty: hard

# The agent's action is a single text payload containing HTML.
action_space:
  type: text
  description: "Raw HTML string (the agent must NOT include markdown fencing)"
  # NOTE(review): the example value contains only dots; it looks like HTML
  # markup was stripped at some point — confirm the intended sample document.
  example: "......"

# Fields returned to the agent. Per the descriptions below, screenshot_b64 is
# present after reset and reward is null until step() has been called.
observation_space:
  screenshot_b64:
    type: string
    encoding: base64
    format: png
    description: "Base64-encoded PNG screenshot of the target UI (present after reset)"
  prompt:
    type: string
    description: "Task instruction for the agent"
  done:
    type: boolean
    description: "True after step(), False after reset()"
  reward:
    type: number
    range: [0.0, 1.0]
    description: "Composite reward (present after step(), null after reset())"

# Composite reward: a weighted sum of the components divided by total_weight.
# The component weights listed here add up to 11.0, matching total_weight;
# update total_weight whenever a weight is added, removed, or changed.
reward:
  range: [0.0, 1.0]
  normalisation: weighted_sum_divided_by_total_weight
  total_weight: 11.0
  components:
    - name: format
      weight: 0.5
      # NOTE(review): this text says fencing is "present", while
      # action_space.description says the agent must NOT include markdown
      # fencing; the leading "/DOCTYPE" also looks truncated (a tag name may
      # have been stripped). Confirm the intended wording with the reward code.
      description: "Markdown fencing present and /DOCTYPE tags found"
    - name: validity
      weight: 0.5
      description: "HTML parseability, structural completeness, tag diversity (>=8 unique tags)"
    - name: structural
      weight: 0.5
      description: "DOM tag-sequence similarity + inline style property coverage vs. reference"
    - name: text_block
      weight: 3.0
      description: "Text block match rate + content similarity via Hungarian matching on IoU"
    - name: position
      weight: 1.0
      description: "Spatial layout accuracy — normalised centre-to-centre distance of matched blocks"
    - name: color
      weight: 1.5
      description: "Perceptual color accuracy via spatial CIEDE2000 on reference non-white pixels"
    - name: clip
      weight: 2.5
      description: "CLIP cosine similarity after rendering (openai/clip-vit-base-patch32, CPU)"
    - name: ssim
      weight: 1.5
      description: "Pixel-level SSIM at 320x240 RGB (skimage) for near-perfect discrimination"

# Variables read from the process environment. No secret values belong in
# this file; only the variable names and their meaning are declared.
environment_variables:
  - name: HF_TOKEN
    required: true
    description: "Hugging Face / API key for LLM calls (also checked as API_KEY)"
  - name: API_BASE_URL
    # NOTE(review): marked required although the description documents a
    # default value — confirm which of the two is intended.
    required: true
    description: "OpenAI-compatible LLM endpoint (default: https://router.huggingface.co/v1)"
  - name: MODEL_NAME
    required: true
    description: "Model identifier to use for LLM calls (must support vision/image inputs)"