Spaces:
Sleeping
Sleeping
# Workflow: build the Docker image, exercise the HTTP server, run the unit
# tests, and smoke-test the inference script.
name: "CI – Docker Build, Server Tests, Inference Smoke Test"

# Triggered by pushes to main and by pull requests whose base branch is main.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

# Shared by every job that provisions an interpreter through uv.
env:
  PYTHON_VERSION: "3.11"

jobs:
# --------------------------------------------------------------------------
# Job 1: Docker image builds successfully
# --------------------------------------------------------------------------
  docker-build:
    name: Docker Build
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Build Docker image
        run: docker build -t data-cleaning-env .

      # Poll /health every 2s, up to 30 attempts. The step has to fail when the
      # server never answers; otherwise the loop simply falls through and the
      # real cause is hidden behind a confusing failure in a later step.
      - name: Start container
        run: |
          docker run -d --name env-server -p 7860:7860 data-cleaning-env
          echo "Waiting for server startup..."
          for i in $(seq 1 30); do
            if curl -sf http://localhost:7860/health > /dev/null 2>&1; then
              echo "Server is up after ${i} attempt(s)"
              exit 0
            fi
            sleep 2
          done
          echo "::error::Server did not answer /health after 30 attempts"
          docker logs env-server || true
          exit 1

      - name: Health check
        run: |
          STATUS=$(curl -sf http://localhost:7860/health | python3 -c "import sys,json; print(json.load(sys.stdin)['status'])")
          echo "Health: $STATUS"
          [ "$STATUS" = "ok" ] || exit 1

      - name: Test /tasks returns 4 tasks
        run: |
          COUNT=$(curl -sf http://localhost:7860/tasks | python3 -c "import sys,json; print(len(json.load(sys.stdin)['tasks']))")
          echo "Task count: $COUNT"
          [ "$COUNT" = "4" ] || exit 1

      - name: Test /reset for each task
        run: |
          for task in easy medium hard expert; do
            RESP=$(curl -sf -X POST http://localhost:7860/reset \
              -H "Content-Type: application/json" -d "{\"task\":\"$task\"}")
            EID=$(echo "$RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['state']['episode_id'])")
            echo "$task: episode_id=$EID"
            [ -n "$EID" ] || exit 1
          done

      - name: Test /grader returns score in [0,1]
        run: |
          RESP=$(curl -sf -X POST http://localhost:7860/reset \
            -H "Content-Type: application/json" -d '{"task":"easy"}')
          EID=$(echo "$RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['state']['episode_id'])")
          GRADE=$(curl -sf -X POST http://localhost:7860/grader \
            -H "Content-Type: application/json" -d "{\"episode_id\":\"$EID\"}")
          echo "Grade response: $GRADE"
          # Hand the JSON to Python through the environment. Splicing it into
          # the Python source text breaks on any quote or backslash escape in
          # the response. The quoted heredoc delimiter disables shell expansion.
          GRADE="$GRADE" python3 - <<'PY'
          import json
          import os
          d = json.loads(os.environ['GRADE'])
          assert 0.0 <= d['score'] <= 1.0, f'Score out of range: {d["score"]}'
          assert 'breakdown' in d, 'Missing breakdown'
          for k in ['completeness', 'consistency', 'accuracy', 'format']:
              assert k in d['breakdown'], f'Missing {k} in breakdown'
          print('Grader: OK')
          PY

      - name: Test /step with action
        run: |
          RESP=$(curl -sf -X POST http://localhost:7860/reset \
            -H "Content-Type: application/json" -d '{"task":"easy"}')
          EID=$(echo "$RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['state']['episode_id'])")
          COL=$(echo "$RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['observation']['columns'][0])")
          STEP=$(curl -sf -X POST http://localhost:7860/step \
            -H "Content-Type: application/json" \
            -d "{\"episode_id\":\"$EID\",\"action\":{\"action_type\":\"fill_missing\",\"column\":\"$COL\",\"strategy\":\"median\"}}")
          # Same reasoning as the grader step: pass the response as data, not
          # as part of the Python source.
          STEP="$STEP" python3 - <<'PY'
          import json
          import os
          d = json.loads(os.environ['STEP'])
          assert 'reward' in d, 'Missing reward'
          assert 'done' in d, 'Missing done'
          assert 'observation' in d, 'Missing observation'
          obs = d['observation']
          assert 'sample_rows' in obs, 'Missing sample_rows'
          assert 'action_history' in obs, 'Missing action_history'
          assert 'budget_remaining' in obs, 'Missing budget_remaining'
          print(f'Step: reward={d["reward"]:.4f}, budget={obs["budget_remaining"]}')
          PY

      # Runs even when an earlier step failed so the container never lingers.
      - name: Cleanup
        if: always()
        run: docker rm -f env-server 2>/dev/null || true
# --------------------------------------------------------------------------
# Job 2: Unit tests + openenv validate (no Docker needed)
# --------------------------------------------------------------------------
  unit-tests:
    name: Unit Tests & Validation
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v4

      - name: Set up Python
        run: uv python install ${{ env.PYTHON_VERSION }}

      # The venv is created with an explicit interpreter request so the tests
      # run on PYTHON_VERSION rather than on whatever uv discovers first.
      - name: Install dependencies
        run: |
          uv venv --python ${{ env.PYTHON_VERSION }} .venv
          uv pip install --python .venv/bin/python -r requirements.txt
          uv pip install --python .venv/bin/python openenv-core pytest

      - name: Install package in dev mode
        run: |
          uv pip install --python .venv/bin/python -e .

      - name: Run pytest suite
        run: |
          .venv/bin/python -m pytest tests/ -v

      # Build a wheel, install it into a second, clean venv, and confirm the
      # package imports and exposes the expected number of action types.
      - name: Wheel install smoke test
        run: |
          uv pip install --python .venv/bin/python build
          .venv/bin/python -m build --wheel
          uv venv --python ${{ env.PYTHON_VERSION }} .venv-smoke
          uv pip install --python .venv-smoke/bin/python dist/*.whl
          .venv-smoke/bin/python -c "
          import data_cleaning_env
          from data_cleaning_env.models import ActionType, Observation, CleaningAction
          from data_cleaning_env.grader import compute_quality_score
          from data_cleaning_env.client import DataCleaningEnvClient
          actions = list(ActionType)
          assert len(actions) == 16, f'Expected 16 actions, got {len(actions)}'
          print(f'Wheel smoke test: OK ({len(actions)} actions)')
          "

      - name: openenv validate
        run: |
          .venv/bin/openenv validate

      # NOTE(review): this step imports `yaml`, so PyYAML must come in through
      # requirements.txt or a dependency of openenv-core -- confirm.
      - name: YAML validation
        run: |
          .venv/bin/python -c "
          import yaml
          with open('openenv.yaml') as f:
              data = yaml.safe_load(f)
          assert 'openenv' in data['tags']
          assert data['license'] == 'MIT'
          assert len(data['tasks']) == 4
          print('YAML: OK')
          "
# --------------------------------------------------------------------------
# Job 3: Inference smoke test (heuristic mode, no LLM credits needed)
# --------------------------------------------------------------------------
  inference-smoke:
    name: Inference Smoke Test
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v4

      - name: Set up Python
        run: uv python install ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          uv venv --python ${{ env.PYTHON_VERSION }} .venv
          uv pip install --python .venv/bin/python -r requirements.txt
          uv pip install --python .venv/bin/python -e .

      # The server keeps running in the background for the following steps.
      # Its output goes to server.log so it can be printed if anything fails.
      # The step fails explicitly when /health never answers instead of letting
      # the poll loop fall through.
      - name: Start server in background
        run: |
          .venv/bin/uvicorn data_cleaning_env.server.app:app --host 0.0.0.0 --port 8000 > server.log 2>&1 &
          echo "Waiting for startup..."
          for i in $(seq 1 30); do
            if curl -sf http://localhost:8000/health > /dev/null 2>&1; then
              echo "Server up after ${i} attempt(s)"
              exit 0
            fi
            sleep 2
          done
          echo "::error::Server did not answer /health after 30 attempts"
          cat server.log || true
          exit 1

      - name: Run inference (heuristic mode)
        run: |
          # pipefail makes a failing inference.py fail the step while tee still
          # shows everything it printed before it died.
          set -o pipefail
          .venv/bin/python inference.py --url http://localhost:8000 2>&1 | tee inference.log
          # The log is read from a file, not spliced into the Python source,
          # so quotes and backslashes in the output cannot corrupt the checker.
          python3 - inference.log <<'PY'
          import re
          import sys
          with open(sys.argv[1], encoding='utf-8', errors='replace') as fh:
              output = fh.read()
          lines = [l for l in output.strip().split('\n') if l.startswith('[')]
          start_re = re.compile(r'^\[START\] task=\S+ env=\S+ model=\S+$')
          step_re = re.compile(r'^\[STEP\] step=\d+ action=.+ reward=-?\d+\.\d{2} done=(true|false) error=.+$')
          end_re = re.compile(r'^\[END\] success=(true|false) steps=\d+ score=\d+\.\d{3} rewards=[\d,.\-]+$')
          starts = [l for l in lines if start_re.match(l)]
          ends = [l for l in lines if end_re.match(l)]
          steps = [l for l in lines if step_re.match(l)]
          assert len(starts) == 4, f'Expected 4 START, got {len(starts)}: {starts}'
          assert len(ends) == 4, f'Expected 4 END, got {len(ends)}: {ends}'
          assert len(steps) > 0, 'No STEP lines found'
          print(f'Log format: {len(starts)} START, {len(steps)} STEP, {len(ends)} END -- COMPLIANT')
          PY

      - name: Run baseline
        run: |
          RESP=$(curl -sf -X POST http://localhost:8000/baseline)
          echo "Baseline: $RESP"
          # Pass the JSON as data through the environment; see the note in the
          # inference step about not embedding output in Python source.
          RESP="$RESP" python3 - <<'PY'
          import json
          import os
          d = json.loads(os.environ['RESP'])
          scores = d['baseline_scores']
          assert len(scores) == 4, f'Expected 4 tasks, got {len(scores)}'
          for task, score in scores.items():
              assert 0.0 <= score <= 1.0, f'{task} score out of range: {score}'
          print('Baseline: OK')
          PY

      - name: Show server log
        if: failure()
        run: cat server.log 2>/dev/null || true