rohitsar567's picture
Deploy v1 — single-Docker FastAPI + Next.js + RAG + voice + faithfulness
7863797 verified
Raw
History Blame Contribute Delete
2.49 kB
# Workflow identity and triggers for the gold-QA evaluation gate.
name: Gold-QA eval gate

# Generic YAML 1.1 parsers read the bare key `on` as a boolean; GitHub's
# workflow loader treats it as the trigger map, so it is left unquoted.
on:
  # Gate pull requests that target main.
  pull_request:
    branches: [main]
  # Re-run on every push to main.
  push:
    branches: [main]
  # Allow manual runs; no inputs are declared.
  workflow_dispatch:
jobs:
  # Single job: install the project, make sure the vector store exists, run the
  # evaluation on a 25-pair subset, enforce accuracy floors, upload the results.
  eval:
    runs-on: ubuntu-latest
    timeout-minutes: 25
    # Least privilege: no step in this job writes to the repository, so the
    # GITHUB_TOKEN only needs read access for the checkout.
    permissions:
      contents: read
    steps:
      # `lfs: true` also downloads Git LFS objects.
      # NOTE(review): presumably the vector store and/or gold data live in
      # LFS - confirm.
      - uses: actions/checkout@v4
        with:
          lfs: true

      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is never parsed as a float.
          python-version: "3.11"
          cache: "pip"

      # The key only varies by runner OS, so this cache is never invalidated
      # by a model or dependency change.
      - name: Cache HuggingFace model
        uses: actions/cache@v4
        with:
          path: ~/.cache/huggingface
          key: hf-bge-small-${{ runner.os }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # Instantiating the model once downloads it before the eval step runs
      # (and populates the cached directory above on a cache miss).
      - name: Pre-download embedding model
        run: >-
          python -c "from sentence_transformers import SentenceTransformer;
          SentenceTransformer('BAAI/bge-small-en-v1.5')"

      # Only re-ingest when the Chroma database file is absent.
      # `shell: bash` is explicit so the step runs with `-o pipefail`; with the
      # default shell the exit status of `python -m rag.ingest` would be
      # replaced by that of `tail`, hiding an ingest failure.
      - name: Rebuild Chroma if missing
        shell: bash
        run: |
          if [ ! -f rag/vectors/chroma.sqlite3 ]; then
            python -m rag.ingest 2>&1 | tail -30
          fi

      # `shell: bash` enables `-o pipefail`, so a crash in `eval.run` fails
      # this step instead of being masked by `tee` (which would let the floor
      # check below read a stale or missing results file).
      # NOTE(review): repository secrets are not passed to pull_request runs
      # from forks - confirm fork PRs are out of scope for this gate.
      - name: Run eval on first 25 gold pairs
        shell: bash
        env:
          SARVAM_API_KEY: ${{ secrets.SARVAM_API_KEY }}
          VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
        run: python -m eval.run --limit 25 2>&1 | tee eval/ci_output.log

      # Reads the summary written to eval/results.json and exits non-zero when
      # either metric is below its floor. A missing metric counts as 0.0 and
      # therefore fails the gate.
      - name: Enforce accuracy floor
        run: |
          python <<'PY'
          import json, sys

          with open("eval/results.json") as fh:
              d = json.load(fh)
          s = d.get("summary", {})
          factual = s.get("factual_accuracy", 0.0)
          citation = s.get("citation_accuracy", 0.0)
          print(f"Factual: {factual*100:.1f}%")
          print(f"Citation: {citation*100:.1f}%")

          # Minimum acceptable fractions (0-1) for each metric.
          floor_factual = 0.65
          floor_citation = 0.55
          if factual < floor_factual:
              print(f"FAIL — factual below floor {floor_factual*100:.0f}%")
              sys.exit(1)
          if citation < floor_citation:
              print(f"FAIL — citation below floor {floor_citation*100:.0f}%")
              sys.exit(1)
          print("PASS")
          PY

      # `always()` keeps the logs and results available when the gate fails.
      - name: Upload eval results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ github.run_number }}
          path: |
            eval/results.md
            eval/results.json
            eval/ci_output.log
          retention-days: 30