File size: 10,118 Bytes
246aed2
 
 
 
 
 
 
 
 
 
 
 
 
 
0d5ef14
 
 
 
246aed2
 
 
 
0d5ef14
246aed2
 
 
 
 
 
 
 
 
 
 
 
 
 
0d5ef14
 
246aed2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d5ef14
 
246aed2
 
 
0d5ef14
246aed2
 
 
0d5ef14
 
 
 
246aed2
0d5ef14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246aed2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d5ef14
246aed2
 
 
0d5ef14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246aed2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d5ef14
 
 
 
 
 
246aed2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d5ef14
246aed2
 
 
 
 
0d5ef14
246aed2
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
"""
eval/eval_baseline.py

Tests raw LLM with NO retrieval context.
Establishes the floor that all RAG configurations must beat.

Scoring:
  - For each query, call LLM with system prompt only (no retrieved chunks)
  - Score = % of query's `keywords` found in the generated answer
  - Pass if keyword coverage >= threshold (default 0.4 — slightly lower than
    retrieval eval since generation may paraphrase rather than use exact terms)

Output: eval/results/baseline.json

Setup:
  ollama pull llama2           # or mistral, llama3.1, etc.
  ollama serve                 # start local server at localhost:11434

Usage:
    python eval/eval_baseline.py
    python eval/eval_baseline.py --tier 1            # single tier
    python eval/eval_baseline.py --query T1-001      # single query
    python eval/eval_baseline.py --model llama2      # override model
"""

import argparse
import json
import logging
import sys
import time
from pathlib import Path

import numpy as np
import requests
import yaml
from dotenv import load_dotenv

sys.path.insert(0, str(Path(__file__).parent.parent))

# Root logger setup for the script: INFO level, "HH:MM:SS  LEVEL  message" lines.
logging.basicConfig(
    format="%(asctime)s  %(levelname)-8s  %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)
# Module-level logger used by every function in this file.
log = logging.getLogger(__name__)

# Config 
def load_config() -> dict:
    """Parse ``config.yaml`` (relative to the current working directory)."""
    with open("config.yaml") as config_file:
        parsed = yaml.safe_load(config_file)
    return parsed

def load_test_queries(path: str) -> list[dict]:
    """Parse the YAML file at *path* and return its top-level ``queries`` entry."""
    with open(path) as query_file:
        document = yaml.safe_load(query_file)
    return document["queries"]

# Prompts 

# System prompt loaded from config.yaml -> generation.system_prompt
# Edit there, not here.
def _load_system_prompt(cfg: dict) -> str:
    # Try to get system_prompt
    # If not found throw error and exit. instead of silently continuing
    try :
        return cfg["generation"]["system_prompt"]
    except KeyError:
        log.error("System prompt not found in config.yaml under generation.system_prompt")
        sys.exit(1)

# Instruction template — no context injected (baseline condition).
# Only placeholder is {query}; the model continues after "Answer:".
BASELINE_TEMPLATE = "Question: {query}\n\nAnswer:"

# Ollama (local LLM)
def call_ollama(
    prompt: str,
    model: str,
    system_prompt: str = 'You are an expert software engineer.',
    max_tokens: int = 512,
    temperature: float = 0.1,
) -> str | None:
    """
    Send one non-streaming chat request to the local Ollama server.

    *system_prompt* and *prompt* are posted as the system and user messages
    to http://localhost:11434/api/chat with a 120 second timeout;
    *temperature* and *max_tokens* (as ``num_predict``) go in ``options``.

    Returns the stripped reply text on HTTP 200. Any other status, or any
    error other than a connection failure, is logged and yields None. If the
    server cannot be reached, the process exits with status 1.

    Setup: install Ollama, `ollama pull <model>`, then `ollama serve`.
    """
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "stream": False,
        "options": {"temperature": temperature, "num_predict": max_tokens},
    }
    try:
        reply = requests.post(
            "http://localhost:11434/api/chat",
            json=payload,
            timeout=120,
        )
        if reply.status_code != 200:
            log.error("Ollama error %d: %s", reply.status_code, reply.text[:200])
            return None
        return reply.json()["message"]["content"].strip()
    except requests.exceptions.ConnectionError:
        # Nothing else in the run can succeed without the server, so stop here.
        log.error(
            "Cannot connect to Ollama at localhost:11434. "
            "Is it running? Start with: ollama serve"
        )
        sys.exit(1)
    except Exception as err:
        # Best-effort: a single bad response should not abort the whole eval.
        log.error("Ollama request error: %s", err)
        return None


# Scoring 
def score_answer(answer: str, keywords: list[str]) -> dict:
    """
    Measure what fraction of the expected keywords appear in the answer.

    Matching is case-insensitive substring containment, so a keyword also
    matches inside a longer word. Keywords are coerced with ``str()`` before
    matching so that non-string values (e.g. a bare ``404`` in the query
    file) do not crash the comparison; the original values are what is
    reported back.

    Args:
        answer: Generated answer text.
        keywords: Expected terms (the query's ``keywords`` field, not
            ``context_keywords``).

    Returns:
        A dict with keys ``score`` (found / total, or None when *answer* or
        *keywords* is empty), ``found``, ``missed`` and ``passed``.
        ``passed`` is always None here; the caller applies the threshold.
    """
    if not answer or not keywords:
        return {"score": None, "found": [], "missed": [], "passed": None}

    answer_lower = answer.lower()
    found: list = []
    missed: list = []
    # Single pass: each keyword lands in exactly one of the two lists.
    for kw in keywords:
        bucket = found if str(kw).lower() in answer_lower else missed
        bucket.append(kw)

    return {
        "score": len(found) / len(keywords),
        "found": found,
        "missed": missed,
        # Same key set as the early return above.
        "passed": None,
    }

# Main eval loop 
def run_baseline(
    queries: list[dict],
    model: str,
    pass_threshold: float,
    cfg: dict | None = None,
) -> list[dict]:
    """
    Run every query against the model with no retrieved context and score it.

    Args:
        queries: Query records. Each must have "id", "tier" and "query";
            "keywords" (expected answer terms) is optional.
        model: Ollama model ID passed to call_ollama.
        pass_threshold: Minimum keyword-coverage score counted as a pass.
        cfg: Parsed config. The system prompt is read from
            cfg["generation"]["system_prompt"]; if it is missing the process
            exits via _load_system_prompt.

    Returns:
        One result dict per query, in input order. When the model returns
        no answer, or the query cannot be scored, "score" and "passed" are
        None.
    """
    if not queries:
        return []

    # The system prompt is the same for every query: resolve it once, so a
    # missing prompt aborts before any model call is made.
    system_prompt = _load_system_prompt(cfg or {})

    results = []
    total = len(queries)

    for i, q in enumerate(queries, 1):
        qid = q["id"]
        # `or []` also covers a "keywords:" key that is present but null.
        keywords = q.get("keywords") or []
        log.info("[%d/%d] %s — %s...", i, total, qid, q["query"][:60])

        prompt = BASELINE_TEMPLATE.format(query=q["query"])

        t0 = time.time()
        answer = call_ollama(prompt, model, system_prompt=system_prompt)
        duration = time.time() - t0

        if answer is None:
            log.warning("  No answer returned — skipping")
            results.append({
                "query_id": qid,
                "tier": q["tier"],
                "query": q["query"],
                "answer": None,
                "score": None,
                "passed": None,
                "found": [],
                "missed": keywords,
                "duration_s": duration,
                "model": model,
                "condition": "baseline_no_rag",
            })
            continue

        log.info("  Answer (%d chars, %.1fs): %s...",
                 len(answer), duration, answer[:80].replace("\n", " "))

        score_result = score_answer(answer, keywords)
        score = score_result["score"]
        # A None score (empty answer or no keywords) is "unscored", not a fail.
        passed = None if score is None else score >= pass_threshold

        if passed is None:
            verdict = "N/A"
        elif passed:
            verdict = "PASS"
        else:
            verdict = "FAIL"
        log.info("  Score: %.2f (%d/%d keywords) — %s",
                 score or 0,
                 len(score_result["found"]),
                 len(keywords),
                 verdict)

        results.append({
            "query_id": qid,
            "tier": q["tier"],
            "query": q["query"],
            "answer": answer,
            "score": score,
            "passed": passed,
            "found": score_result.get("found", []),
            "missed": score_result.get("missed", []),
            # Previously referenced an undefined `dry_run` name (NameError).
            "duration_s": duration,
            "model": model,
            "condition": "baseline_no_rag",
        })

    return results


# Report 

def print_report(results: list[dict], pass_threshold: float) -> dict:
    """
    Log a per-tier and overall summary of the baseline run.

    Args:
        results: Per-query result dicts. Each must have "score" and "tier";
            "passed" is read with ``.get``. A result whose "score" is None
            counts as skipped and is excluded from pass/fail and averages.
        pass_threshold: Threshold that was applied; reported, not re-applied.

    Returns:
        Summary dict with counts ("total", "scored", "passed", "failed",
        "skipped"), "pass_rate" and "avg_score" (both 0.0 when nothing was
        scored), "pass_threshold", and the fixed "condition" label.
    """
    log.info("")
    log.info("=" * 70)
    log.info("Baseline Evaluation — No RAG")
    log.info("=" * 70)

    scored = [r for r in results if r["score"] is not None]
    skipped = [r for r in results if r["score"] is None]
    passed = [r for r in scored if r.get("passed")]
    failed = [r for r in scored if not r.get("passed")]

    # Per-tier breakdown; tiers with no scored queries are omitted.
    for tier in sorted({r["tier"] for r in results}):
        tier_scored = [r for r in scored if r["tier"] == tier]
        if not tier_scored:
            continue
        tier_passed = sum(1 for r in tier_scored if r.get("passed"))
        tier_avg = np.mean([r["score"] for r in tier_scored])
        log.info("  Tier %d: %d/%d passed (%.0f%%)  avg kw score %.2f",
                 tier, tier_passed, len(tier_scored),
                 100 * tier_passed / len(tier_scored), tier_avg)

    # Computed once and reused for both the log line and the summary.
    avg_score = float(np.mean([r["score"] for r in scored])) if scored else 0.0

    log.info("")
    log.info("  Total queries  : %d", len(results))
    log.info("  Scored         : %d", len(scored))
    log.info("  Passed         : %d (%.1f%%)",
             len(passed), 100 * len(passed) / len(scored) if scored else 0)
    log.info("  Failed         : %d", len(failed))
    log.info("  Skipped (error): %d", len(skipped))

    if scored:
        log.info("")
        log.info("  Avg keyword score : %.3f  (baseline — no RAG)", avg_score)
        # Two decimals so a threshold such as 0.45 is not shown rounded.
        log.info("  Pass threshold    : %.2f", pass_threshold)
        log.info("")

    log.info("=" * 70)

    return {
        "condition": "baseline_no_rag",
        "total": len(results),
        "scored": len(scored),
        "passed": len(passed),
        "failed": len(failed),
        "skipped": len(skipped),
        "pass_rate": len(passed) / len(scored) if scored else 0.0,
        "avg_score": avg_score,
        "pass_threshold": pass_threshold,
    }


def save_results(summary: dict, per_query: list[dict], output_dir: Path) -> None:
    """
    Write the run to ``<output_dir>/baseline.json``.

    The directory is created if needed. The file holds one JSON object with
    the keys "summary" and "per_query", indented by two spaces.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    destination = output_dir / "baseline.json"
    payload = {"summary": summary, "per_query": per_query}
    with destination.open("w") as out:
        json.dump(payload, out, indent=2)
    log.info("Results saved to %s", destination)


# Main 
def main() -> None:
    """
    Command-line entry point: load config, select queries, run, report, save.

    The config is loaded before arguments are parsed because the default for
    ``--model`` comes from config.yaml (generation.model). Exits with status
    1 when the --tier/--query filters leave no queries to run.
    """
    load_dotenv()

    cfg = load_config()
    default_model = cfg["generation"]["model"]

    parser = argparse.ArgumentParser(description="Baseline eval — raw LLM, no RAG (Ollama)")
    parser.add_argument("--model", type=str, default=default_model,
                        help=f"Ollama model ID (default: {default_model} from config.yaml)")
    parser.add_argument("--tier", type=int, default=None,
                        help="Only run queries of this tier")
    parser.add_argument("--query", type=str, default=None,
                        help="Only run the query with this ID (e.g. T1-001)")
    parser.add_argument("--threshold", type=float, default=0.4,
                        help="Keyword coverage pass threshold (default 0.4)")
    args = parser.parse_args()

    queries = load_test_queries(cfg["evaluation"]["test_queries_path"])

    # Compare against None: a plain truthiness test would silently ignore
    # `--tier 0` (and an empty --query) and run every query instead.
    if args.tier is not None:
        queries = [q for q in queries if q["tier"] == args.tier]
    if args.query is not None:
        queries = [q for q in queries if q["id"] == args.query]
    if not queries:
        log.error("No queries matched.")
        sys.exit(1)

    log.info("=" * 70)
    log.info("Substrate — Baseline Evaluation (No RAG)")
    log.info("Model     : %s", args.model)
    log.info("Queries   : %d", len(queries))
    # Two decimals so a threshold such as 0.45 is not shown rounded.
    log.info("Threshold : %.2f", args.threshold)
    log.info("=" * 70)

    per_query = run_baseline(
        queries, args.model,
        pass_threshold=args.threshold,
        cfg=cfg,
    )

    summary = print_report(per_query, args.threshold)
    save_results(summary, per_query, Path(cfg["evaluation"]["results_dir"]))


if __name__ == "__main__":
    main()