# yashmarathe's picture
# refactor: move all root-level files to repo root
# 7492bfe
# CI workflow: builds the Docker image, runs the unit tests and validation,
# and smoke-tests inference on every push and pull request targeting main.
name: CI Docker Build, Server Tests, Inference Smoke Test

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

# Least privilege: the jobs below only need to read the repository contents.
permissions:
  contents: read

env:
  # Quoted so YAML keeps the version a string instead of parsing a float.
  PYTHON_VERSION: "3.11"

jobs:
# --------------------------------------------------------------------------
# Job 1: Docker image builds successfully
# --------------------------------------------------------------------------
  docker-build:
    name: Docker Build
    runs-on: ubuntu-latest
    defaults:
      run:
        # Naming bash explicitly makes GitHub run each step with
        # `-eo pipefail`, so a failed curl on the left of a pipe fails the step.
        shell: bash
    steps:
      - uses: actions/checkout@v4

      - name: Build Docker image
        run: docker build -t data-cleaning-env .

      - name: Start container
        run: |
          docker run -d --name env-server -p 7860:7860 data-cleaning-env
          echo "Waiting for server startup..."
          # Poll /health up to 30 times, sleeping 2s between attempts.
          for i in $(seq 1 30); do
            if curl -sf http://localhost:7860/health > /dev/null 2>&1; then
              echo "Server is up after ${i} attempt(s)"
              exit 0
            fi
            sleep 2
          done
          # Fail here instead of letting a later step trip over a dead server.
          echo "::error::Server did not become healthy in time"
          exit 1

      - name: Health check
        run: |
          STATUS=$(curl -sf http://localhost:7860/health | python3 -c "import sys,json; print(json.load(sys.stdin)['status'])")
          echo "Health: $STATUS"
          [ "$STATUS" = "ok" ] || exit 1

      - name: Test /tasks returns 4 tasks
        run: |
          COUNT=$(curl -sf http://localhost:7860/tasks | python3 -c "import sys,json; print(len(json.load(sys.stdin)['tasks']))")
          echo "Task count: $COUNT"
          [ "$COUNT" = "4" ] || exit 1

      - name: Test /reset for each task
        run: |
          for task in easy medium hard expert; do
            RESP=$(curl -sf -X POST http://localhost:7860/reset \
              -H "Content-Type: application/json" -d "{\"task\":\"$task\"}")
            EID=$(echo "$RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['state']['episode_id'])")
            echo "$task: episode_id=$EID"
            [ -n "$EID" ] || exit 1
          done

      - name: Test /grader returns score in [0,1]
        run: |
          RESP=$(curl -sf -X POST http://localhost:7860/reset \
            -H "Content-Type: application/json" -d '{"task":"easy"}')
          EID=$(echo "$RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['state']['episode_id'])")
          GRADE=$(curl -sf -X POST http://localhost:7860/grader \
            -H "Content-Type: application/json" -d "{\"episode_id\":\"$EID\"}")
          echo "Grade response: $GRADE"
          # The JSON is passed on stdin rather than pasted into the Python
          # source, so quotes or backslashes in the payload cannot break it.
          printf '%s' "$GRADE" | python3 -c '
          import json, sys
          d = json.load(sys.stdin)
          score = d["score"]
          assert 0.0 <= score <= 1.0, f"Score out of range: {score}"
          assert "breakdown" in d, "Missing breakdown"
          for k in ["completeness", "consistency", "accuracy", "format"]:
              assert k in d["breakdown"], f"Missing {k} in breakdown"
          print("Grader: OK")
          '

      - name: Test /step with action
        run: |
          RESP=$(curl -sf -X POST http://localhost:7860/reset \
            -H "Content-Type: application/json" -d '{"task":"easy"}')
          EID=$(echo "$RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['state']['episode_id'])")
          COL=$(echo "$RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['observation']['columns'][0])")
          STEP=$(curl -sf -X POST http://localhost:7860/step \
            -H "Content-Type: application/json" \
            -d "{\"episode_id\":\"$EID\",\"action\":{\"action_type\":\"fill_missing\",\"column\":\"$COL\",\"strategy\":\"median\"}}")
          # Same stdin hand-off as the grader check above.
          printf '%s' "$STEP" | python3 -c '
          import json, sys
          d = json.load(sys.stdin)
          assert "reward" in d, "Missing reward"
          assert "done" in d, "Missing done"
          assert "observation" in d, "Missing observation"
          obs = d["observation"]
          assert "sample_rows" in obs, "Missing sample_rows"
          assert "action_history" in obs, "Missing action_history"
          assert "budget_remaining" in obs, "Missing budget_remaining"
          reward = d["reward"]
          budget = obs["budget_remaining"]
          print(f"Step: reward={reward:.4f}, budget={budget}")
          '

      - name: Dump container logs on failure
        if: failure()
        run: docker logs env-server || true

      - name: Cleanup
        # Runs even when an earlier step failed so the container is removed.
        if: always()
        run: docker rm -f env-server 2>/dev/null || true
# --------------------------------------------------------------------------
# Job 2: Unit tests + openenv validate (no Docker needed)
# --------------------------------------------------------------------------
  unit-tests:
    name: Unit Tests & Validation
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v4

      - name: Set up Python
        run: uv python install ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          # Pin the venv to the interpreter installed above; without --python
          # uv uses whichever interpreter it discovers first.
          uv venv --python "${{ env.PYTHON_VERSION }}" .venv
          uv pip install --python .venv/bin/python -r requirements.txt
          uv pip install --python .venv/bin/python openenv-core pytest

      - name: Install package in dev mode
        run: |
          uv pip install --python .venv/bin/python -e .

      - name: Run pytest suite
        run: |
          .venv/bin/python -m pytest tests/ -v

      - name: Wheel install smoke test
        run: |
          uv pip install --python .venv/bin/python build
          .venv/bin/python -m build --wheel
          # Separate venv so the import check sees only the built wheel, not
          # the editable install.
          uv venv --python "${{ env.PYTHON_VERSION }}" .venv-smoke
          uv pip install --python .venv-smoke/bin/python dist/*.whl
          .venv-smoke/bin/python -c "
          import data_cleaning_env
          from data_cleaning_env.models import ActionType, Observation, CleaningAction
          from data_cleaning_env.grader import compute_quality_score
          from data_cleaning_env.client import DataCleaningEnvClient
          actions = list(ActionType)
          assert len(actions) == 16, f'Expected 16 actions, got {len(actions)}'
          print(f'Wheel smoke test: OK ({len(actions)} actions)')
          "

      - name: openenv validate
        run: |
          .venv/bin/openenv validate

      - name: YAML validation
        run: |
          .venv/bin/python -c "
          import yaml
          with open('openenv.yaml') as f:
              data = yaml.safe_load(f)
          assert 'openenv' in data['tags']
          assert data['license'] == 'MIT'
          assert len(data['tasks']) == 4
          print('YAML: OK')
          "
# --------------------------------------------------------------------------
# Job 3: Inference smoke test (heuristic mode, no LLM credits needed)
# --------------------------------------------------------------------------
  inference-smoke:
    name: Inference Smoke Test
    runs-on: ubuntu-latest
    defaults:
      run:
        # Naming bash explicitly makes GitHub run each step with
        # `-eo pipefail`; the `| tee` pipeline below relies on that.
        shell: bash
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v4

      - name: Set up Python
        run: uv python install ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          # Pin the venv to the interpreter installed above.
          uv venv --python "${{ env.PYTHON_VERSION }}" .venv
          uv pip install --python .venv/bin/python -r requirements.txt
          uv pip install --python .venv/bin/python -e .

      - name: Start server in background
        run: |
          # Server output goes to a file so it can be shown if the job fails.
          .venv/bin/uvicorn data_cleaning_env.server.app:app --host 0.0.0.0 --port 8000 \
            > "$RUNNER_TEMP/server.log" 2>&1 &
          echo "Waiting for startup..."
          # Poll /health up to 30 times, sleeping 2s between attempts.
          for i in $(seq 1 30); do
            if curl -sf http://localhost:8000/health > /dev/null 2>&1; then
              echo "Server up after ${i} attempt(s)"
              exit 0
            fi
            sleep 2
          done
          # Fail here instead of letting a later step trip over a dead server.
          echo "::error::Server did not become healthy in time"
          exit 1

      - name: Run inference (heuristic mode)
        run: |
          LOG="$RUNNER_TEMP/inference.log"
          # tee prints the output as it is produced, so it stays visible even
          # when inference.py exits non-zero; pipefail still fails the step.
          .venv/bin/python inference.py --url http://localhost:8000 2>&1 | tee "$LOG"
          # The log is read from stdin instead of being pasted into the Python
          # source, so quotes or backslashes in it cannot break the check.
          python3 -c '
          import re, sys
          output = sys.stdin.read()
          lines = [l for l in output.strip().split("\n") if l.startswith("[")]
          start_re = re.compile(r"^\[START\] task=\S+ env=\S+ model=\S+$")
          step_re = re.compile(r"^\[STEP\] step=\d+ action=.+ reward=-?\d+\.\d{2} done=(true|false) error=.+$")
          end_re = re.compile(r"^\[END\] success=(true|false) steps=\d+ score=\d+\.\d{3} rewards=[\d,.\-]+$")
          starts = [l for l in lines if start_re.match(l)]
          ends = [l for l in lines if end_re.match(l)]
          steps = [l for l in lines if step_re.match(l)]
          assert len(starts) == 4, f"Expected 4 START, got {len(starts)}: {starts}"
          assert len(ends) == 4, f"Expected 4 END, got {len(ends)}: {ends}"
          assert len(steps) > 0, "No STEP lines found"
          print(f"Log format: {len(starts)} START, {len(steps)} STEP, {len(ends)} END -- COMPLIANT")
          ' < "$LOG"

      - name: Run baseline
        run: |
          RESP=$(curl -sf -X POST http://localhost:8000/baseline)
          echo "Baseline: $RESP"
          # JSON is handed over on stdin for the same reason as the log above.
          printf '%s' "$RESP" | python3 -c '
          import json, sys
          d = json.load(sys.stdin)
          scores = d["baseline_scores"]
          assert len(scores) == 4, f"Expected 4 tasks, got {len(scores)}"
          for task, score in scores.items():
              assert 0.0 <= score <= 1.0, f"{task} score out of range: {score}"
          print("Baseline: OK")
          '

      - name: Dump server log on failure
        if: failure()
        run: cat "$RUNNER_TEMP/server.log" || true