# substrate/eval/eval_baseline.py
# Syed Taha
# refactor: update eval_baseline.py to integrate Ollama for local LLM calls and improve usage instructions
# 0d5ef14
"""
eval/eval_baseline.py
Tests raw LLM with NO retrieval context.
Establishes the floor that all RAG configurations must beat.
Scoring:
- For each query, call LLM with system prompt only (no retrieved chunks)
- Score = % of query's `keywords` found in the generated answer
- Pass if keyword coverage >= threshold (default 0.4 — slightly lower than
retrieval eval since generation may paraphrase rather than use exact terms)
Output: eval/results/baseline.json
Setup:
ollama pull llama2 # or mistral, llama3.1, etc.
ollama serve # start local server at localhost:11434
Usage:
python eval/eval_baseline.py
python eval/eval_baseline.py --tier 1 # single tier
python eval/eval_baseline.py --query T1-001 # single query
python eval/eval_baseline.py --model llama2 # override model
"""
import argparse
import json
import logging
import sys
import time
from pathlib import Path
import numpy as np
import requests
import yaml
from dotenv import load_dotenv
# Put the directory two levels above this file (the one that contains eval/)
# at the front of the import path.
sys.path.insert(0, str(Path(__file__).parent.parent))
# Configure logging once for the whole script: INFO level, HH:MM:SS
# timestamps, and the level name left-aligned in an 8-character column.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%H:%M:%S",
)
# Module-level logger shared by every function in this file.
log = logging.getLogger(__name__)
# Config
def load_config() -> dict:
    """Load the project configuration from ``config.yaml``.

    The path is relative, so the file is looked up in the current working
    directory.

    Returns:
        The document parsed by ``yaml.safe_load``. Callers in this file index
        it as a mapping (``cfg["generation"]``, ``cfg["evaluation"]``).

    Raises:
        OSError: If ``config.yaml`` cannot be opened.
    """
    # Explicit UTF-8: without it ``open`` decodes with the locale's preferred
    # encoding, which can corrupt or reject non-ASCII text in the config
    # (for example in the system prompt) on some platforms.
    with open("config.yaml", encoding="utf-8") as f:
        return yaml.safe_load(f)
def load_test_queries(path: str) -> list[dict]:
    """Load the evaluation queries from a YAML file.

    Args:
        path: Location of the YAML file holding the test queries.

    Returns:
        The value stored under the document's top-level ``queries`` key.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the document has no ``queries`` key.
    """
    # Explicit UTF-8 so query text is decoded identically on every platform
    # instead of depending on the locale's default encoding.
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)["queries"]
# Prompts
# System prompt loaded from config.yaml -> generation.system_prompt
# Edit there, not here.
def _load_system_prompt(cfg: dict) -> str:
# Try to get system_prompt
# If not found throw error and exit. instead of silently continuing
try :
return cfg["generation"]["system_prompt"]
except KeyError:
log.error("System prompt not found in config.yaml under generation.system_prompt")
sys.exit(1)
# Instruction template for the baseline condition. It is filled with
# str.format(query=...) and contains only the question: no retrieved context
# is injected into the prompt.
BASELINE_TEMPLATE = """Question: {query}
Answer:"""
# Ollama (local LLM)
def call_ollama(
    prompt: str,
    model: str,
    system_prompt: str = 'You are an expert software engineer.',
    max_tokens: int = 512,
    temperature: float = 0.1,
    *,
    base_url: str = "http://localhost:11434",
    timeout: float = 120,
) -> str | None:
    """Send one non-streaming chat request to an Ollama server.

    Setup:
        Install:    curl -fsSL https://ollama.ai/install.sh | sh
        Pull model: ollama pull llama3.1   (or mistral, phi3, etc.)
        Start:      ollama serve

    Args:
        prompt: Text sent as the ``user`` message.
        model: Ollama model name placed in the request's ``model`` field.
        system_prompt: Text sent as the ``system`` message.
        max_tokens: Sent to Ollama as the ``num_predict`` option.
        temperature: Sent to Ollama as the ``temperature`` option.
        base_url: Root URL of the Ollama server; ``/api/chat`` is appended.
            Defaults to the local server used previously.
        timeout: Seconds passed to ``requests.post`` as its timeout.

    Returns:
        The stripped ``message.content`` string of the reply, or ``None`` when
        the server answers with a non-200 status or any error other than a
        connection failure occurs.

    Raises:
        SystemExit: With exit code 1 when the server cannot be reached at all.
    """
    url = f"{base_url.rstrip('/')}/api/chat"
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "stream": False,
        "options": {"temperature": temperature, "num_predict": max_tokens},
    }
    try:
        resp = requests.post(url, json=payload, timeout=timeout)
        if resp.status_code == 200:
            return resp.json()["message"]["content"].strip()
        # Truncate the body so a large error page does not flood the log.
        log.error("Ollama error %d: %s", resp.status_code, resp.text[:200])
        return None
    except requests.exceptions.ConnectionError:
        # An unreachable server would fail every remaining query as well, so
        # stop the whole run instead of returning None.
        log.error(
            "Cannot connect to Ollama at %s. "
            "Is it running? Start with: ollama serve",
            base_url,
        )
        sys.exit(1)
    except Exception as e:
        # Deliberately broad: a timeout or malformed reply for one request is
        # reported as "no answer" (None) rather than aborting the caller.
        log.error("Ollama request error: %s", e)
        return None
# Scoring
def score_answer(answer: str, keywords: list[str]) -> dict:
    """Measure how many expected keywords occur in a generated answer.

    Matching is a case-insensitive substring test, so a keyword also counts
    when it appears inside a longer word. The query's ``keywords`` field
    (answer evaluation) is what should be passed here, not context_keywords.

    Args:
        answer: Text produced by the model.
        keywords: Terms expected to appear in the answer.

    Returns:
        When either argument is empty: ``score`` and ``passed`` set to None
        with empty ``found`` / ``missed`` lists. Otherwise: ``score`` (the
        fraction of keywords found), ``found`` and ``missed`` (keywords in
        their original order and casing); no ``passed`` key is included.
    """
    if not (answer and keywords):
        return {"score": None, "found": [], "missed": [], "passed": None}

    haystack = answer.lower()
    hits: list[str] = []
    misses: list[str] = []
    # One pass over the keywords, routing each into the matching bucket.
    for keyword in keywords:
        bucket = hits if keyword.lower() in haystack else misses
        bucket.append(keyword)

    return {
        "score": len(hits) / len(keywords),
        "found": hits,
        "missed": misses,
    }
# Main eval loop
def run_baseline(
    queries: list[dict],
    model: str,
    pass_threshold: float,
    cfg: dict | None = None,
) -> list[dict]:
    """Run every query against the raw LLM (no retrieval) and score the answers.

    Args:
        queries: Query records; each must provide ``id``, ``query`` and
            ``tier`` and may provide ``keywords``.
        model: Ollama model name forwarded to ``call_ollama``.
        pass_threshold: Minimum keyword-coverage score for a query to pass.
        cfg: Parsed configuration used to look up the system prompt. ``None``
            is treated as an empty mapping.

    Returns:
        One record per query, in input order, with the keys ``query_id``,
        ``tier``, ``query``, ``answer``, ``score``, ``passed``, ``found``,
        ``missed``, ``duration_s``, ``model`` and ``condition``. ``score`` and
        ``passed`` are None when no answer was returned or the query could not
        be scored.
    """
    if not queries:
        # Nothing to run; also keeps the prompt lookup below (which may exit)
        # from firing for an empty run, as before.
        return []

    # The system prompt does not depend on the query, so resolve it once
    # instead of on every iteration.
    system_prompt = _load_system_prompt(cfg or {})

    results = []
    total = len(queries)
    for i, q in enumerate(queries, 1):
        qid = q["id"]
        keywords = q.get("keywords", [])
        log.info("[%d/%d] %s — %s...", i, total, qid, q["query"][:60])
        prompt = BASELINE_TEMPLATE.format(query=q["query"])

        t0 = time.time()
        answer = call_ollama(prompt, model, system_prompt=system_prompt)
        duration = time.time() - t0

        # Start from the "no answer" shape; the scoring fields are overwritten
        # below once an answer is available. Key order matches the JSON output.
        record = {
            "query_id": qid,
            "tier": q["tier"],
            "query": q["query"],
            "answer": answer,
            "score": None,
            "passed": None,
            "found": [],
            "missed": keywords,
            "duration_s": duration,
            "model": model,
            "condition": "baseline_no_rag",
        }

        if answer is None:
            log.warning(" No answer returned — skipping")
            results.append(record)
            continue

        log.info(" Answer (%d chars, %.1fs): %s...",
                 len(answer), duration, answer[:80].replace("\n", " "))

        score_result = score_answer(answer, keywords)
        score = score_result["score"]
        passed = score >= pass_threshold if score is not None else None

        # A query that could not be scored is neither a pass nor a fail.
        if passed is None:
            status = "N/A"
        else:
            status = "PASS" if passed else "FAIL"
        log.info(" Score: %.2f (%d/%d keywords) — %s",
                 score or 0,
                 len(score_result["found"]),
                 len(keywords),
                 status)

        # The measured duration is always recorded. The previous code read an
        # undefined ``dry_run`` name here and raised NameError for every
        # answered query.
        record.update(
            score=score,
            passed=passed,
            found=score_result.get("found", []),
            missed=score_result.get("missed", []),
        )
        results.append(record)
    return results
# Report
def print_report(results: list[dict], pass_threshold: float) -> dict:
    """Log a summary of a baseline run and return the same figures as a dict.

    Records whose ``score`` is None count as skipped. All others are "scored"
    and are split into passed / failed on the truthiness of ``passed``.

    Args:
        results: Per-query records carrying ``score``, ``tier`` and
            (optionally) ``passed``.
        pass_threshold: Threshold used for the run; reported, not re-applied.

    Returns:
        A summary with ``condition``, ``total``, ``scored``, ``passed``,
        ``failed``, ``skipped``, ``pass_rate``, ``avg_score`` and
        ``pass_threshold``. Rate and average are 0.0 when nothing was scored.
    """
    rule = "=" * 70
    log.info("")
    log.info(rule)
    log.info("Baseline Evaluation — No RAG")
    log.info(rule)

    scored = [row for row in results if row["score"] is not None]
    skipped = [row for row in results if row["score"] is None]
    passed, failed = [], []
    for row in scored:
        (passed if row.get("passed") else failed).append(row)

    # Per-tier breakdown; tiers with no scored record are left out.
    for tier in sorted({row["tier"] for row in results}):
        tier_rows = [row for row in scored if row["tier"] == tier]
        if tier_rows:
            tier_passed = sum(1 for row in tier_rows if row.get("passed"))
            tier_avg = np.mean([row["score"] for row in tier_rows])
            log.info(" Tier %d: %d/%d passed (%.0f%%) avg kw score %.2f",
                     tier, tier_passed, len(tier_rows),
                     100 * tier_passed / len(tier_rows), tier_avg)

    pass_pct = 100 * len(passed) / len(scored) if scored else 0
    log.info("")
    log.info(" Total queries : %d", len(results))
    log.info(" Scored : %d", len(scored))
    log.info(" Passed : %d (%.1f%%)", len(passed), pass_pct)
    log.info(" Failed : %d", len(failed))
    log.info(" Skipped (error): %d", len(skipped))

    if scored:
        mean_score = np.mean([row["score"] for row in scored])
        log.info("")
        log.info(" Avg keyword score : %.3f (baseline — no RAG)", mean_score)
        log.info(" Pass threshold : %.1f", pass_threshold)
        summary_avg = float(mean_score)
        summary_rate = len(passed) / len(scored)
    else:
        summary_avg = 0.0
        summary_rate = 0.0

    log.info("")
    log.info(rule)

    return {
        "condition": "baseline_no_rag",
        "total": len(results),
        "scored": len(scored),
        "passed": len(passed),
        "failed": len(failed),
        "skipped": len(skipped),
        "pass_rate": summary_rate,
        "avg_score": summary_avg,
        "pass_threshold": pass_threshold,
    }
def save_results(summary: dict, per_query: list[dict], output_dir: Path) -> None:
    """Write the run summary and per-query records to ``baseline.json``.

    Args:
        summary: Aggregate figures, stored under the ``summary`` key.
        per_query: Individual query records, stored under ``per_query``.
        output_dir: Destination directory; created (with parents) if missing.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    target = output_dir / "baseline.json"
    payload = {"summary": summary, "per_query": per_query}
    with target.open("w") as handle:
        json.dump(payload, handle, indent=2)
    log.info("Results saved to %s", target)
# Main
def main() -> None:
    """Command-line entry point for the no-RAG baseline evaluation.

    Calls ``load_dotenv()``, loads ``config.yaml``, parses the CLI options,
    applies the optional ``--tier`` / ``--query`` filters, runs the baseline,
    logs the report and saves the results under the configured
    ``evaluation.results_dir``.

    Raises:
        SystemExit: With exit code 1 when the filters leave no query to run.
    """
    load_dotenv()
    cfg = load_config()
    default_model = cfg["generation"]["model"]

    parser = argparse.ArgumentParser(description="Baseline eval — raw LLM, no RAG (Ollama)")
    parser.add_argument("--model", type=str, default=default_model,
                        help=f"Ollama model ID (default: {default_model} from config.yaml)")
    parser.add_argument("--tier", type=int, default=None)
    parser.add_argument("--query", type=str, default=None)
    parser.add_argument("--threshold", type=float, default=0.4,
                        help="Keyword coverage pass threshold (default 0.4)")
    args = parser.parse_args()

    queries = load_test_queries(cfg["evaluation"]["test_queries_path"])
    # Compare against None rather than truthiness: an explicit ``--tier 0``
    # (or an empty ``--query``) is still a filter and must not silently run
    # the whole suite.
    if args.tier is not None:
        queries = [q for q in queries if q["tier"] == args.tier]
    if args.query is not None:
        queries = [q for q in queries if q["id"] == args.query]
    if not queries:
        log.error("No queries matched.")
        sys.exit(1)

    log.info("=" * 70)
    log.info("Substrate — Baseline Evaluation (No RAG)")
    log.info("Model : %s", args.model)
    log.info("Queries : %d", len(queries))
    log.info("Threshold : %.1f", args.threshold)
    log.info("=" * 70)

    per_query = run_baseline(
        queries, args.model,
        pass_threshold=args.threshold,
        cfg=cfg,
    )
    summary = print_report(per_query, args.threshold)
    save_results(summary, per_query, Path(cfg["evaluation"]["results_dir"]))
# Run the evaluation only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()