# CI gate: runs the gold-QA evaluation and fails the run when the reported
# accuracy figures fall below the floors enforced later in this workflow.
name: Gold-QA eval gate

# Triggers: pull requests targeting main, pushes to main, and manual runs.
on:
  pull_request:
    branches: [main]
  push:
    branches: [main]
  workflow_dispatch:

# Least privilege for GITHUB_TOKEN: the steps in this workflow only need to
# read the repository (checkout). Without this block the token inherits the
# repository/organization default, which may be read/write.
permissions:
  contents: read

jobs:
  # Single evaluation job: prepare the environment, run the eval, enforce
  # the accuracy floors, and upload the outputs.
  eval:
    # Abort the whole job if it runs longer than 25 minutes.
    timeout-minutes: 25
    runs-on: ubuntu-latest
    steps:
      # Check out the repository; `lfs: true` also downloads Git LFS objects.
      - uses: actions/checkout@v4
        with:
          lfs: true

      # Install the Python interpreter and enable the action's pip cache.
      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          cache: pip
          # Quoted so YAML keeps the version a string instead of a float.
          python-version: '3.11'

      # Cache ~/.cache/huggingface across runs. The key depends only on the
      # runner OS, so it does not change when the model or its revision does.
      - name: Cache HuggingFace model
        uses: actions/cache@v4
        with:
          key: hf-bge-small-${{ runner.os }}
          path: ~/.cache/huggingface

      # Upgrade pip first, then install the packages listed in
      # requirements.txt.
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # Instantiate the embedding model once up front; presumably this
      # fetches the model files before the later steps need them.
      - name: Pre-download embedding model
        run: >-
          python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-small-en-v1.5')"

      # Run the ingest module only when rag/vectors/chroma.sqlite3 is absent,
      # showing just the last 30 lines of its combined stdout/stderr.
      - name: Rebuild Chroma if missing
        run: |
          # A pipeline's exit status is that of its last command unless
          # pipefail is set, and the default step shell does not set it.
          # Without this, a failed ingest would be reported as success
          # because `tail` exits 0.
          set -o pipefail
          if [ ! -f rag/vectors/chroma.sqlite3 ]; then
            python -m rag.ingest 2>&1 | tail -30
          fi

      # Run the evaluation module with --limit 25, mirroring its combined
      # stdout/stderr to eval/ci_output.log while still showing it in the
      # job log.
      # NOTE(review): repository secrets are not provided to pull_request
      # runs from forks, so these variables would be empty there. Confirm
      # that is the intended behavior for external contributions.
      - name: Run eval on first 25 gold pairs
        env:
          SARVAM_API_KEY: ${{ secrets.SARVAM_API_KEY }}
          VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
        run: |
          # Propagate a non-zero exit from the eval through the pipe.
          # Otherwise `tee` succeeds, the step passes, and the gate below
          # could read a stale or partial eval/results.json.
          set -o pipefail
          python -m eval.run --limit 25 2>&1 | tee eval/ci_output.log

      # Read the "summary" object from eval/results.json and fail the job
      # when either accuracy metric is below its floor. Every problem found
      # is reported, and a missing or non-numeric metric is reported as such
      # rather than being treated as a 0% score.
      - name: Enforce accuracy floor
        run: |
          python <<'PY'
          import json
          import sys

          RESULTS_PATH = "eval/results.json"

          # summary key -> (display label, minimum acceptable fraction)
          FLOORS = {
              "factual_accuracy": ("Factual", 0.65),
              "citation_accuracy": ("Citation", 0.55),
          }

          try:
              with open(RESULTS_PATH, encoding="utf-8") as fh:
                  report = json.load(fh)
          except (OSError, ValueError) as exc:
              # Covers a missing file and malformed JSON alike.
              print(f"FAIL — cannot read {RESULTS_PATH}: {exc}")
              sys.exit(1)

          summary = report.get("summary") if isinstance(report, dict) else None
          if not isinstance(summary, dict):
              summary = {}

          failures = []
          for key, (label, floor) in FLOORS.items():
              score = summary.get(key)
              # bool is a subclass of int, so exclude it explicitly.
              if isinstance(score, bool) or not isinstance(score, (int, float)):
                  failures.append(
                      f"FAIL — {key} missing or non-numeric in {RESULTS_PATH}"
                  )
                  continue
              heading = f"{label}:"
              print(f"{heading:<9} {score * 100:.1f}%")
              # "not >=" instead of "<" so that a NaN score fails the gate.
              if not score >= floor:
                  failures.append(
                      f"FAIL — {label.lower()} below floor {floor * 100:.0f}%"
                  )

          if failures:
              print("\n".join(failures))
              sys.exit(1)
          print("PASS")
          PY

      # Publish the eval outputs as a build artifact. `always()` makes this
      # step run even when an earlier step failed, so the outputs of a
      # failing run are still uploaded if they exist.
      - name: Upload eval results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          # Artifact name carries the workflow run number.
          name: eval-results-${{ github.run_number }}
          retention-days: 30
          path: |
            eval/results.md
            eval/results.json
            eval/ci_output.log