Spaces:

amaljoe88
/

vision-coder-openenv

Running

App Files Files Community

vision-coder-openenv / openenv.yaml

amaljoe88

deploy: sync 712e5bc -> HF

cf6c0e0 about 1 month ago

raw

history blame contribute delete

3.1 kB

	# Environment identity. `version` is quoted so it stays a string rather than
	# being parsed as a number.
	name: vision-coder
	version: "0.1.0"
	# Folded scalar: the three lines below are joined with spaces into a single
	# paragraph, with one trailing newline kept.
	description: >
	  Screenshot-to-HTML reinforcement learning environment.
	  An agent receives a UI screenshot and must generate HTML+CSS
	  that visually reproduces the original page.

	# HTTP server settings for the environment.
	server:
	  host: "0.0.0.0"
	  port: 7860
	  # The launch command repeats the host and port above; keep the two in sync
	  # if either is changed.
	  start_command: "uvicorn openenv.server.app:app --host 0.0.0.0 --port 7860"

	# One task per difficulty tier. Every task is single-step (max_steps: 1) and
	# forwards its own difficulty through reset_params.
	tasks:
	  - id: easy
	    description: "Simple single-section web pages with minimal layout"
	    difficulty: easy
	    max_steps: 1
	    reset_params:
	      difficulty: easy

	  - id: medium
	    description: "Multi-section pages with navigation bars and content blocks"
	    difficulty: medium
	    max_steps: 1
	    reset_params:
	      difficulty: medium

	  - id: hard
	    description: "Complex pages with forms, tables, and rich interactive layouts"
	    difficulty: hard
	    max_steps: 1
	    reset_params:
	      difficulty: hard

	# What the agent submits on step(): a single text action holding raw HTML.
	action_space:
	  type: text
	  description: "Raw HTML string (the agent must NOT include markdown fencing)"
	  example: "<!DOCTYPE html><html><head>...</head><body>...</body></html>"

	# Fields returned to the agent. Each field name maps to its own attribute
	# block (type, optional encoding/format/range, description).
	observation_space:
	  screenshot_b64:
	    type: string
	    encoding: base64
	    format: png
	    description: "Base64-encoded PNG screenshot of the target UI (present after reset)"
	  prompt:
	    type: string
	    description: "Task instruction for the agent"
	  done:
	    type: boolean
	    description: "True after step(), False after reset()"
	  # Declared as a number, but the description says it is null after reset(),
	  # so consumers should treat this field as nullable.
	  reward:
	    type: number
	    range: [0.0, 1.0]
	    description: "Composite reward (present after step(), null after reset())"

	# Composite reward definition. The component weights below add up to 11.0,
	# which matches total_weight; update total_weight whenever a weight changes.
	# NOTE(review): the normalisation name suggests
	# sum(weight * component_score) / total_weight — confirm against the scorer.
	reward:
	  range: [0.0, 1.0]
	  normalisation: weighted_sum_divided_by_total_weight
	  total_weight: 11.0
	  components:
	    # NOTE(review): action_space says the agent must NOT include markdown
	    # fencing, while this description reads as if fencing is rewarded —
	    # confirm which is intended and align the two descriptions.
	    - name: format
	      weight: 0.5
	      description: "Markdown fencing present and <html>/DOCTYPE tags found"
	    - name: validity
	      weight: 0.5
	      description: "HTML parseability, structural completeness, tag diversity (>=8 unique tags)"
	    - name: structural
	      weight: 0.5
	      description: "DOM tag-sequence similarity + inline style property coverage vs. reference"
	    - name: text_block
	      weight: 3.0
	      description: "Text block match rate + content similarity via Hungarian matching on IoU"
	    - name: position
	      weight: 1.0
	      description: "Spatial layout accuracy — normalised centre-to-centre distance of matched blocks"
	    - name: color
	      weight: 1.5
	      description: "Perceptual color accuracy via spatial CIEDE2000 on reference non-white pixels"
	    - name: clip
	      weight: 2.5
	      description: "CLIP cosine similarity after rendering (openai/clip-vit-base-patch32, CPU)"
	    - name: ssim
	      weight: 1.5
	      description: "Pixel-level SSIM at 320x240 RGB (skimage) for near-perfect discrimination"

	# Environment variables read for LLM calls. All three are declared required.
	environment_variables:
	  - name: HF_TOKEN
	    required: true
	    description: "Hugging Face / API key for LLM calls (also checked as API_KEY)"
	  # NOTE(review): marked required although the description documents a
	  # default endpoint — confirm whether this should be optional.
	  - name: API_BASE_URL
	    required: true
	    description: "OpenAI-compatible LLM endpoint (default: https://router.huggingface.co/v1)"
	  - name: MODEL_NAME
	    required: true
	    description: "Model identifier to use for LLM calls (must support vision/image inputs)"