# CI gate: runs the evaluation on the first 25 gold pairs and fails the build
# when factual or citation accuracy (read from eval/results.json) falls below
# the floors hard-coded in the "Enforce accuracy floor" step.
name: Gold-QA eval gate

on:
  pull_request:
    branches: [main]
  push:
    branches: [main]
  workflow_dispatch:

# Least privilege: no step in this workflow writes to the repository, so the
# token only needs to read contents (checkout, including LFS objects).
permissions:
  contents: read

jobs:
  eval:
    runs-on: ubuntu-latest
    timeout-minutes: 25
    defaults:
      run:
        # Naming bash explicitly makes GitHub run steps with `-eo pipefail`.
        # The implicit default is `bash -e` WITHOUT pipefail, which would let
        # `... | tail` and `... | tee` below hide a failing ingest/eval run.
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          lfs: true

      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
          cache: "pip"

      # Persist the HuggingFace cache directory between runs so the embedding
      # model does not have to be downloaded every time.
      - name: Cache HuggingFace model
        uses: actions/cache@v4
        with:
          path: ~/.cache/huggingface
          key: hf-bge-small-${{ runner.os }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # Instantiating the model once here populates ~/.cache/huggingface
      # (a no-op download when the cache step above restored it).
      - name: Pre-download embedding model
        run: |
          python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-small-en-v1.5')"

      # Only re-ingest when the Chroma SQLite file is absent from the checkout.
      # Output is trimmed to the last 30 lines; pipefail (see defaults above)
      # keeps an ingest failure fatal despite the pipe into `tail`.
      - name: Rebuild Chroma if missing
        run: |
          if [ ! -f rag/vectors/chroma.sqlite3 ]; then
            python -m rag.ingest 2>&1 | tail -30
          fi

      # NOTE(review): repository secrets are not provided to pull_request runs
      # triggered from forks, so these keys will be empty there — confirm
      # whether fork PRs are expected to pass this gate.
      - name: Run eval on first 25 gold pairs
        env:
          SARVAM_API_KEY: ${{ secrets.SARVAM_API_KEY }}
          VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
        # `tee` keeps a copy of the console output for the artifact upload;
        # pipefail makes the step fail when eval.run itself exits non-zero.
        run: python -m eval.run --limit 25 2>&1 | tee eval/ci_output.log

      - name: Enforce accuracy floor
        run: |
          python <<'PY'
          import json, sys

          # Close the results file deterministically instead of leaking the handle.
          with open("eval/results.json") as fh:
              d = json.load(fh)

          # A missing summary or metric is treated as 0.0, i.e. it fails the gate.
          s = d.get("summary", {})
          factual = s.get("factual_accuracy", 0.0)
          citation = s.get("citation_accuracy", 0.0)
          print(f"Factual:  {factual*100:.1f}%")
          print(f"Citation: {citation*100:.1f}%")

          # Minimum acceptable scores, as fractions in [0, 1].
          floor_factual = 0.65
          floor_citation = 0.55

          if factual < floor_factual:
              print(f"FAIL — factual below floor {floor_factual*100:.0f}%")
              sys.exit(1)
          if citation < floor_citation:
              print(f"FAIL — citation below floor {floor_citation*100:.0f}%")
              sys.exit(1)
          print("PASS")
          PY

      # Runs even when an earlier step failed so the logs and partial results
      # are still available for debugging.
      - name: Upload eval results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ github.run_number }}
          path: |
            eval/results.md
            eval/results.json
            eval/ci_output.log
          retention-days: 30