Spaces:
Sleeping
Sleeping
# Gold-QA eval gate
#
# Runs the evaluation harness (`python -m eval.run --limit 25`) and fails the
# workflow when the factual or citation accuracy reported in
# eval/results.json drops below a fixed floor.
#
# Triggers: pull requests targeting main, pushes to main, and manual dispatch.
name: Gold-QA eval gate

on:
  pull_request:
    branches: [main]
  push:
    branches: [main]
  workflow_dispatch:

# Least privilege: the job only checks out the repository and uploads
# artifacts, so read access to contents is sufficient.
permissions:
  contents: read

# Explicitly selecting bash makes GitHub run steps with
# `bash --noprofile --norc -eo pipefail {0}`. Without an explicit shell the
# runner uses `bash -e {0}` (no pipefail), so a failure on the left side of
# `cmd | tee ...` or `cmd | tail ...` would be masked by the exit status of
# tee/tail and the step would be reported as successful.
defaults:
  run:
    shell: bash

jobs:
  eval:
    runs-on: ubuntu-latest
    timeout-minutes: 25
    steps:
      - uses: actions/checkout@v4
        with:
          # Fetch Git LFS objects along with the checkout.
          lfs: true

      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is read as a string, not a float.
          python-version: "3.11"
          cache: "pip"

      - name: Cache HuggingFace model
        uses: actions/cache@v4
        with:
          path: ~/.cache/huggingface
          key: hf-bge-small-${{ runner.os }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # Instantiating the model once here populates the HuggingFace cache
      # directory restored/saved by the cache step above.
      - name: Pre-download embedding model
        run: python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-small-en-v1.5')"

      # Only re-ingest when the Chroma SQLite file is absent from the checkout.
      # `tail -30` keeps the log short; pipefail (see `defaults`) ensures an
      # ingest failure still fails this step.
      - name: Rebuild Chroma if missing
        run: |
          if [ ! -f rag/vectors/chroma.sqlite3 ]; then
            python -m rag.ingest 2>&1 | tail -30
          fi

      # NOTE(review): repository secrets are not exposed to pull_request runs
      # from forks; confirm eval.run behaves acceptably when these are empty.
      - name: Run eval on first 25 gold pairs
        env:
          SARVAM_API_KEY: ${{ secrets.SARVAM_API_KEY }}
          VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
        # Output is mirrored to eval/ci_output.log for the artifact upload;
        # pipefail (see `defaults`) propagates a non-zero exit from eval.run.
        run: python -m eval.run --limit 25 2>&1 | tee eval/ci_output.log

      # Reads summary.factual_accuracy and summary.citation_accuracy from
      # eval/results.json (fractions; printed as percentages) and exits
      # non-zero when either is below its floor. Missing or null values are
      # treated as 0.0, which fails the gate.
      - name: Enforce accuracy floor
        run: |
          python <<'PY'
          import json
          import sys

          with open("eval/results.json", encoding="utf-8") as fh:
              d = json.load(fh)

          s = d.get("summary") or {}
          factual = s.get("factual_accuracy") or 0.0
          citation = s.get("citation_accuracy") or 0.0
          print(f"Factual: {factual*100:.1f}%")
          print(f"Citation: {citation*100:.1f}%")

          floor_factual = 0.65
          floor_citation = 0.55
          if factual < floor_factual:
              print(f"FAIL — factual below floor {floor_factual*100:.0f}%")
              sys.exit(1)
          if citation < floor_citation:
              print(f"FAIL — citation below floor {floor_citation*100:.0f}%")
              sys.exit(1)
          print("PASS")
          PY

      # `always()` uploads the reports even when an earlier step failed, so
      # failing runs can still be inspected.
      - name: Upload eval results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ github.run_number }}
          path: |
            eval/results.md
            eval/results.json
            eval/ci_output.log
          retention-days: 30