---
# CI workflow with three independent jobs:
#   1. docker-build     - build the image, start the container and probe the
#                         HTTP endpoints (/health, /tasks, /reset, /grader,
#                         /step) with curl.
#   2. unit-tests       - pytest, wheel build/install smoke test,
#                         `openenv validate` and a sanity check of openenv.yaml.
#   3. inference-smoke  - start the server with uvicorn, run inference.py
#                         against it, check its log format, then call /baseline.
#
# Conventions used by the shell steps:
#   * Data returned by the server is never pasted into Python source text. It
#     is handed to Python through an environment variable or a file and parsed
#     there, so quotes or backslashes in a response cannot break (or inject
#     into) the checking script.
#   * Embedded Python uses quoted heredocs (<<'PY') so the shell performs no
#     expansion inside the Python code.
name: CI – Docker Build, Server Tests, Inference Smoke Test

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

# No job writes to the repository, so a read-only token is sufficient.
permissions:
  contents: read

env:
  PYTHON_VERSION: "3.11"

jobs:
  # --------------------------------------------------------------------------
  # Job 1: Docker image builds successfully
  # --------------------------------------------------------------------------
  docker-build:
    name: Docker Build
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Build Docker image
        run: docker build -t data-cleaning-env .

      - name: Start container
        run: |
          docker run -d --name env-server -p 7860:7860 data-cleaning-env
          echo "Waiting for server startup..."
          # Poll /health up to 30 times, 2s apart (about 60s in total).
          for i in $(seq 1 30); do
            if curl -sf http://localhost:7860/health > /dev/null 2>&1; then
              echo "Server is up after ~$(( (i - 1) * 2 ))s"
              exit 0
            fi
            sleep 2
          done
          # Fail here, with the container logs, instead of letting a later
          # step fail with an unrelated-looking curl error.
          echo "::error::Container did not become healthy within 60s"
          docker logs env-server || true
          exit 1

      - name: Health check
        run: |
          STATUS=$(curl -sf http://localhost:7860/health | python3 -c "import sys,json; print(json.load(sys.stdin)['status'])")
          echo "Health: $STATUS"
          [ "$STATUS" = "ok" ] || exit 1

      - name: Test /tasks returns 4 tasks
        run: |
          COUNT=$(curl -sf http://localhost:7860/tasks | python3 -c "import sys,json; print(len(json.load(sys.stdin)['tasks']))")
          echo "Task count: $COUNT"
          [ "$COUNT" = "4" ] || exit 1

      - name: Test /reset for each task
        run: |
          for task in easy medium hard expert; do
            RESP=$(curl -sf -X POST http://localhost:7860/reset \
              -H "Content-Type: application/json" -d "{\"task\":\"$task\"}")
            EID=$(echo "$RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['state']['episode_id'])")
            echo "$task: episode_id=$EID"
            [ -n "$EID" ] || exit 1
          done

      - name: Test /grader returns score in [0,1]
        run: |
          RESP=$(curl -sf -X POST http://localhost:7860/reset \
            -H "Content-Type: application/json" -d '{"task":"easy"}')
          EID=$(echo "$RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['state']['episode_id'])")
          GRADE=$(curl -sf -X POST http://localhost:7860/grader \
            -H "Content-Type: application/json" -d "{\"episode_id\":\"$EID\"}")
          echo "Grade response: $GRADE"
          # The response travels via the environment, not via source text.
          GRADE="$GRADE" python3 - <<'PY'
          import json
          import os

          d = json.loads(os.environ['GRADE'])
          assert 0.0 <= d['score'] <= 1.0, f"Score out of range: {d['score']}"
          assert 'breakdown' in d, 'Missing breakdown'
          for k in ['completeness', 'consistency', 'accuracy', 'format']:
              assert k in d['breakdown'], f'Missing {k} in breakdown'
          print('Grader: OK')
          PY

      - name: Test /step with action
        run: |
          RESP=$(curl -sf -X POST http://localhost:7860/reset \
            -H "Content-Type: application/json" -d '{"task":"easy"}')
          EID=$(echo "$RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['state']['episode_id'])")
          COL=$(echo "$RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['observation']['columns'][0])")
          # Build the request body with json.dumps so a column name containing
          # quotes or backslashes still yields valid JSON.
          PAYLOAD=$(EID="$EID" COL="$COL" python3 - <<'PY'
          import json
          import os

          print(json.dumps({
              'episode_id': os.environ['EID'],
              'action': {
                  'action_type': 'fill_missing',
                  'column': os.environ['COL'],
                  'strategy': 'median',
              },
          }))
          PY
          )
          STEP=$(curl -sf -X POST http://localhost:7860/step \
            -H "Content-Type: application/json" -d "$PAYLOAD")
          STEP="$STEP" python3 - <<'PY'
          import json
          import os

          d = json.loads(os.environ['STEP'])
          assert 'reward' in d, 'Missing reward'
          assert 'done' in d, 'Missing done'
          assert 'observation' in d, 'Missing observation'
          obs = d['observation']
          assert 'sample_rows' in obs, 'Missing sample_rows'
          assert 'action_history' in obs, 'Missing action_history'
          assert 'budget_remaining' in obs, 'Missing budget_remaining'
          print(f"Step: reward={d['reward']:.4f}, budget={obs['budget_remaining']}")
          PY

      - name: Cleanup
        # Runs even when an earlier step failed so the container is removed.
        if: always()
        run: docker rm -f env-server 2>/dev/null || true

  # --------------------------------------------------------------------------
  # Job 2: Unit tests + openenv validate (no Docker needed)
  # --------------------------------------------------------------------------
  unit-tests:
    name: Unit Tests & Validation
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v4

      - name: Set up Python
        run: uv python install ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          # Request the interpreter installed above explicitly rather than
          # relying on uv's interpreter discovery order.
          uv venv --python "${PYTHON_VERSION}" .venv
          uv pip install --python .venv/bin/python -r requirements.txt
          uv pip install --python .venv/bin/python openenv-core pytest

      - name: Install package in dev mode
        run: |
          uv pip install --python .venv/bin/python -e .

      - name: Run pytest suite
        run: |
          .venv/bin/python -m pytest tests/ -v

      - name: Wheel install smoke test
        run: |
          uv pip install --python .venv/bin/python build
          .venv/bin/python -m build --wheel
          # A separate venv, so the import below exercises the built wheel and
          # not the editable install in .venv.
          uv venv --python "${PYTHON_VERSION}" .venv-smoke
          uv pip install --python .venv-smoke/bin/python dist/*.whl
          .venv-smoke/bin/python - <<'PY'
          import data_cleaning_env
          from data_cleaning_env.models import ActionType, Observation, CleaningAction
          from data_cleaning_env.grader import compute_quality_score
          from data_cleaning_env.client import DataCleaningEnvClient

          actions = list(ActionType)
          assert len(actions) == 16, f'Expected 16 actions, got {len(actions)}'
          print(f'Wheel smoke test: OK ({len(actions)} actions)')
          PY

      - name: openenv validate
        run: |
          .venv/bin/openenv validate

      - name: YAML validation
        run: |
          .venv/bin/python - <<'PY'
          import yaml

          with open('openenv.yaml') as f:
              data = yaml.safe_load(f)
          assert 'openenv' in data['tags']
          assert data['license'] == 'MIT'
          assert len(data['tasks']) == 4
          print('YAML: OK')
          PY

  # --------------------------------------------------------------------------
  # Job 3: Inference smoke test (heuristic mode, no LLM credits needed)
  # --------------------------------------------------------------------------
  inference-smoke:
    name: Inference Smoke Test
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v4

      - name: Set up Python
        run: uv python install ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          uv venv --python "${PYTHON_VERSION}" .venv
          uv pip install --python .venv/bin/python -r requirements.txt
          uv pip install --python .venv/bin/python -e .

      - name: Start server in background
        run: |
          .venv/bin/uvicorn data_cleaning_env.server.app:app --host 0.0.0.0 --port 8000 &
          echo "Waiting for startup..."
          # Poll /health up to 30 times, 2s apart (about 60s in total).
          for i in $(seq 1 30); do
            if curl -sf http://localhost:8000/health > /dev/null 2>&1; then
              echo "Server up after ~$(( (i - 1) * 2 ))s"
              exit 0
            fi
            sleep 2
          done
          echo "::error::Server on port 8000 did not become healthy within 60s"
          exit 1

      - name: Run inference (heuristic mode)
        run: |
          # pipefail: a failing inference.py must fail the step even though
          # its output is piped through tee.
          set -o pipefail
          LOG_FILE="${RUNNER_TEMP}/inference.log"
          # tee streams the output to the job log as it is produced and keeps
          # a copy on disk for the format check below.
          .venv/bin/python inference.py --url http://localhost:8000 2>&1 | tee "$LOG_FILE"
          LOG_FILE="$LOG_FILE" python3 - <<'PY'
          import os
          import re

          with open(os.environ['LOG_FILE'], encoding='utf-8', errors='replace') as fh:
              output = fh.read()

          # Only lines starting with '[' are candidates for the tagged format.
          lines = [l for l in output.strip().split('\n') if l.startswith('[')]
          start_re = re.compile(r'^\[START\] task=\S+ env=\S+ model=\S+$')
          step_re = re.compile(r'^\[STEP\] step=\d+ action=.+ reward=-?\d+\.\d{2} done=(true|false) error=.+$')
          end_re = re.compile(r'^\[END\] success=(true|false) steps=\d+ score=\d+\.\d{3} rewards=[\d,.\-]+$')
          starts = [l for l in lines if start_re.match(l)]
          ends = [l for l in lines if end_re.match(l)]
          steps = [l for l in lines if step_re.match(l)]
          assert len(starts) == 4, f'Expected 4 START, got {len(starts)}: {starts}'
          assert len(ends) == 4, f'Expected 4 END, got {len(ends)}: {ends}'
          assert len(steps) > 0, 'No STEP lines found'
          print(f'Log format: {len(starts)} START, {len(steps)} STEP, {len(ends)} END -- COMPLIANT')
          PY

      - name: Run baseline
        run: |
          RESP=$(curl -sf -X POST http://localhost:8000/baseline)
          echo "Baseline: $RESP"
          RESP="$RESP" python3 - <<'PY'
          import json
          import os

          d = json.loads(os.environ['RESP'])
          scores = d['baseline_scores']
          assert len(scores) == 4, f'Expected 4 tasks, got {len(scores)}'
          for task, score in scores.items():
              assert 0.0 <= score <= 1.0, f'{task} score out of range: {score}'
          print('Baseline: OK')
          PY