Spaces:

yashmarathe
/

data-cleaning-openenv

Sleeping

App Files Files Community

data-cleaning-openenv / .github /workflows /ci.yml

yashmarathe

refactor: move all root-level files to repo root

7492bfe about 2 months ago

raw

history blame contribute delete

9 kB

	name: CI – Docker Build, Server Tests, Inference Smoke Test

	# Three independent jobs run on every push / pull request targeting main:
	#   docker-build    - builds the image, starts it, and probes the HTTP API
	#                     (/health, /tasks, /reset, /grader, /step)
	#   unit-tests      - pytest, wheel build + install smoke test,
	#                     `openenv validate`, and openenv.yaml sanity checks
	#   inference-smoke - runs inference.py against a local uvicorn server,
	#                     checks its log format, then calls /baseline

	on:
	  push:
	    branches: [main]
	  pull_request:
	    branches: [main]

	# Least privilege: the jobs only check out the repository and run local
	# commands, so read access to the contents is sufficient.
	permissions:
	  contents: read

	env:
	  # Quoted so YAML keeps it a string (an unquoted 3.10 would load as 3.1).
	  PYTHON_VERSION: "3.11"

	jobs:
	  # --------------------------------------------------------------------------
	  # Job 1: Docker image builds successfully
	  # --------------------------------------------------------------------------
	  docker-build:
	    name: Docker Build
	    runs-on: ubuntu-latest
	    steps:
	      - uses: actions/checkout@v4

	      - name: Build Docker image
	        run: docker build -t data-cleaning-env .

	      - name: Start container
	        run: |
	          docker run -d --name env-server -p 7860:7860 data-cleaning-env
	          echo "Waiting for server startup..."
	          # Poll /health up to 30 times, 2s apart.
	          ready=false
	          for i in $(seq 1 30); do
	            if curl -sf http://localhost:7860/health > /dev/null 2>&1; then
	              echo "Server is up after ${i} attempt(s)"
	              ready=true
	              break
	            fi
	            sleep 2
	          done
	          # Fail here, with the container logs, instead of letting a later
	          # step die on an opaque curl error.
	          if [ "$ready" != "true" ]; then
	            echo "Server did not become healthy in time" >&2
	            docker logs env-server >&2 || true
	            exit 1
	          fi

	      - name: Health check
	        run: |
	          STATUS=$(curl -sf http://localhost:7860/health | python3 -c "import sys,json; print(json.load(sys.stdin)['status'])")
	          echo "Health: $STATUS"
	          [ "$STATUS" = "ok" ] || exit 1

	      - name: Test /tasks returns 4 tasks
	        run: |
	          COUNT=$(curl -sf http://localhost:7860/tasks | python3 -c "import sys,json; print(len(json.load(sys.stdin)['tasks']))")
	          echo "Task count: $COUNT"
	          [ "$COUNT" = "4" ] || exit 1

	      - name: Test /reset for each task
	        run: |
	          for task in easy medium hard expert; do
	            RESP=$(curl -sf -X POST http://localhost:7860/reset \
	              -H "Content-Type: application/json" -d "{\"task\":\"$task\"}")
	            EID=$(echo "$RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['state']['episode_id'])")
	            echo "$task: episode_id=$EID"
	            [ -n "$EID" ] || exit 1
	          done

	      - name: Test /grader returns score in [0,1]
	        run: |
	          RESP=$(curl -sf -X POST http://localhost:7860/reset \
	            -H "Content-Type: application/json" -d '{"task":"easy"}')
	          EID=$(echo "$RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['state']['episode_id'])")
	          GRADE=$(curl -sf -X POST http://localhost:7860/grader \
	            -H "Content-Type: application/json" -d "{\"episode_id\":\"$EID\"}")
	          echo "Grade response: $GRADE"
	          # The JSON is passed on stdin rather than spliced into the Python
	          # source, where quotes or backslashes in the payload would break it.
	          printf '%s' "$GRADE" | python3 -c "
	          import json, sys
	          d = json.load(sys.stdin)
	          assert 0.0 <= d['score'] <= 1.0, f'Score out of range: {d[\"score\"]}'
	          assert 'breakdown' in d, 'Missing breakdown'
	          for k in ['completeness', 'consistency', 'accuracy', 'format']:
	              assert k in d['breakdown'], f'Missing {k} in breakdown'
	          print('Grader: OK')
	          "

	      - name: Test /step with action
	        run: |
	          RESP=$(curl -sf -X POST http://localhost:7860/reset \
	            -H "Content-Type: application/json" -d '{"task":"easy"}')
	          EID=$(echo "$RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['state']['episode_id'])")
	          COL=$(echo "$RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['observation']['columns'][0])")
	          STEP=$(curl -sf -X POST http://localhost:7860/step \
	            -H "Content-Type: application/json" \
	            -d "{\"episode_id\":\"$EID\",\"action\":{\"action_type\":\"fill_missing\",\"column\":\"$COL\",\"strategy\":\"median\"}}")
	          # Response is read from stdin, not interpolated into the script.
	          printf '%s' "$STEP" | python3 -c "
	          import json, sys
	          d = json.load(sys.stdin)
	          assert 'reward' in d, 'Missing reward'
	          assert 'done' in d, 'Missing done'
	          assert 'observation' in d, 'Missing observation'
	          obs = d['observation']
	          assert 'sample_rows' in obs, 'Missing sample_rows'
	          assert 'action_history' in obs, 'Missing action_history'
	          assert 'budget_remaining' in obs, 'Missing budget_remaining'
	          print(f'Step: reward={d[\"reward\"]:.4f}, budget={obs[\"budget_remaining\"]}')
	          "

	      # Runs even when an earlier step failed so the container never lingers.
	      - name: Cleanup
	        if: always()
	        run: docker rm -f env-server 2>/dev/null || true

	  # --------------------------------------------------------------------------
	  # Job 2: Unit tests + openenv validate (no Docker needed)
	  # --------------------------------------------------------------------------
	  unit-tests:
	    name: Unit Tests & Validation
	    runs-on: ubuntu-latest
	    steps:
	      - uses: actions/checkout@v4

	      - name: Install uv
	        uses: astral-sh/setup-uv@v4

	      - name: Set up Python
	        run: uv python install ${{ env.PYTHON_VERSION }}

	      - name: Install dependencies
	        run: |
	          # Pin the venv to the interpreter installed above.
	          uv venv --python ${{ env.PYTHON_VERSION }} .venv
	          uv pip install --python .venv/bin/python -r requirements.txt
	          uv pip install --python .venv/bin/python openenv-core pytest

	      - name: Install package in dev mode
	        run: |
	          uv pip install --python .venv/bin/python -e .

	      - name: Run pytest suite
	        run: |
	          .venv/bin/python -m pytest tests/ -v

	      - name: Wheel install smoke test
	        run: |
	          uv pip install --python .venv/bin/python build
	          .venv/bin/python -m build --wheel
	          # Separate venv: the wheel must import without the editable install.
	          uv venv --python ${{ env.PYTHON_VERSION }} .venv-smoke
	          uv pip install --python .venv-smoke/bin/python dist/*.whl
	          .venv-smoke/bin/python -c "
	          import data_cleaning_env
	          from data_cleaning_env.models import ActionType, Observation, CleaningAction
	          from data_cleaning_env.grader import compute_quality_score
	          from data_cleaning_env.client import DataCleaningEnvClient
	          actions = list(ActionType)
	          assert len(actions) == 16, f'Expected 16 actions, got {len(actions)}'
	          print(f'Wheel smoke test: OK ({len(actions)} actions)')
	          "

	      - name: openenv validate
	        run: |
	          .venv/bin/openenv validate

	      - name: YAML validation
	        run: |
	          .venv/bin/python -c "
	          import yaml
	          with open('openenv.yaml') as f:
	              data = yaml.safe_load(f)
	          assert 'openenv' in data['tags']
	          assert data['license'] == 'MIT'
	          assert len(data['tasks']) == 4
	          print('YAML: OK')
	          "

	  # --------------------------------------------------------------------------
	  # Job 3: Inference smoke test (heuristic mode, no LLM credits needed)
	  # --------------------------------------------------------------------------
	  inference-smoke:
	    name: Inference Smoke Test
	    runs-on: ubuntu-latest
	    steps:
	      - uses: actions/checkout@v4

	      - name: Install uv
	        uses: astral-sh/setup-uv@v4

	      - name: Set up Python
	        run: uv python install ${{ env.PYTHON_VERSION }}

	      - name: Install dependencies
	        run: |
	          # Pin the venv to the interpreter installed above.
	          uv venv --python ${{ env.PYTHON_VERSION }} .venv
	          uv pip install --python .venv/bin/python -r requirements.txt
	          uv pip install --python .venv/bin/python -e .

	      - name: Start server in background
	        run: |
	          .venv/bin/uvicorn data_cleaning_env.server.app:app --host 0.0.0.0 --port 8000 &
	          echo "Waiting for startup..."
	          # Poll /health up to 30 times, 2s apart.
	          ready=false
	          for i in $(seq 1 30); do
	            if curl -sf http://localhost:8000/health > /dev/null 2>&1; then
	              echo "Server up after ${i} attempt(s)"
	              ready=true
	              break
	            fi
	            sleep 2
	          done
	          # Stop the job here if the server never answered.
	          if [ "$ready" != "true" ]; then
	            echo "Server did not become healthy in time" >&2
	            exit 1
	          fi

	      - name: Run inference (heuristic mode)
	        run: |
	          # Capture the exit status by hand: under bash -e a failing command
	          # substitution would abort the step before the output is printed.
	          status=0
	          OUTPUT=$(.venv/bin/python inference.py --url http://localhost:8000 2>&1) || status=$?
	          echo "$OUTPUT"
	          [ "$status" -eq 0 ] || exit "$status"

	          # The log is piped on stdin rather than pasted into the script, so
	          # quotes or backslashes in the output cannot corrupt the check.
	          printf '%s\n' "$OUTPUT" | python3 -c "
	          import re, sys

	          output = sys.stdin.read()
	          lines = [l for l in output.strip().split('\n') if l.startswith('[')]

	          start_re = re.compile(r'^\[START\] task=\S+ env=\S+ model=\S+$')
	          step_re = re.compile(r'^\[STEP\] step=\d+ action=.+ reward=-?\d+\.\d{2} done=(true|false) error=.+$')
	          end_re = re.compile(r'^\[END\] success=(true|false) steps=\d+ score=\d+\.\d{3} rewards=[\d,.\-]+$')

	          starts = [l for l in lines if start_re.match(l)]
	          ends = [l for l in lines if end_re.match(l)]
	          steps = [l for l in lines if step_re.match(l)]

	          assert len(starts) == 4, f'Expected 4 START, got {len(starts)}: {starts}'
	          assert len(ends) == 4, f'Expected 4 END, got {len(ends)}: {ends}'
	          assert len(steps) > 0, 'No STEP lines found'

	          print(f'Log format: {len(starts)} START, {len(steps)} STEP, {len(ends)} END -- COMPLIANT')
	          "

	      - name: Run baseline
	        run: |
	          RESP=$(curl -sf -X POST http://localhost:8000/baseline)
	          echo "Baseline: $RESP"
	          # Response is read from stdin, not interpolated into the script.
	          printf '%s' "$RESP" | python3 -c "
	          import json, sys
	          d = json.load(sys.stdin)
	          scores = d['baseline_scores']
	          assert len(scores) == 4, f'Expected 4 tasks, got {len(scores)}'
	          for task, score in scores.items():
	              assert 0.0 <= score <= 1.0, f'{task} score out of range: {score}'
	          print('Baseline: OK')
	          "