#!/usr/bin/env python3
"""
RAG Evaluation v2.0 — merged index (documentation + source code).
Uses embedding retrieval + QLoRA v1.0.

Usage:
    python scripts/rag_v20_evaluate.py
    python scripts/rag_v20_evaluate.py --base    # use base model
"""

import argparse
import importlib.util
import json
import pickle
import re
import sys
import time
from pathlib import Path

import numpy as np
from mlx_lm import load, generate
from mlx_lm.sample_utils import make_sampler

PROJECT_ROOT = Path(__file__).resolve().parent.parent
RAG_INDEX_DIR = PROJECT_ROOT / "data" / "rag_index_v20"

sys.path.insert(0, str(PROJECT_ROOT / "scripts"))

# Load the sibling evaluate.py by path so its test definitions can be reused
# here without requiring `scripts/` to be a package.
spec = importlib.util.spec_from_file_location(
    "evaluate_module", PROJECT_ROOT / "scripts" / "evaluate.py"
)
eval_mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(eval_mod)
TEST_CASES = eval_mod.TEST_CASES
CODE_COMPLETION_TESTS = eval_mod.CODE_COMPLETION_TESTS

# Retrieval / prompt-building limits.
MIN_SIMILARITY = 0.1        # retrieved chunks at or below this cosine score are dropped
MAX_CONTEXT_CHUNKS = 3      # how many retrieved chunks are placed in the prompt
MAX_CHUNK_CHARS = 500       # each chunk's text is cut to this many characters
MAX_PROMPT_CHARS = 3000     # the assembled prompt is cut to this many characters
MAX_JUDGED_CHARS = 1000     # only this much of the answer is shown to the judge

# Judge-score parsing.
DEFAULT_JUDGE_SCORE = 5     # used when the judge output has no number or judging fails
_JUDGE_SCORE_RE = re.compile(r'\b(\d+)(?:/10)?\b')

_emb_model = None


def get_emb_model():
    """Return the shared SentenceTransformer, creating it on first use.

    The import is deferred so that sentence_transformers is only loaded when
    retrieval is actually performed.
    """
    global _emb_model
    if _emb_model is None:
        from sentence_transformers import SentenceTransformer
        _emb_model = SentenceTransformer("all-MiniLM-L6-v2")
    return _emb_model


def load_index():
    """Load the RAG index from RAG_INDEX_DIR.

    Returns:
        A ``(chunks, embeddings)`` tuple: ``chunks`` is the list of JSON
        objects read line by line from ``chunks.jsonl``; ``embeddings`` is
        whatever object was pickled into ``embeddings.pkl``.
    """
    with open(RAG_INDEX_DIR / "chunks.jsonl", encoding="utf-8") as f:
        chunks = [json.loads(line) for line in f]
    # NOTE(security): pickle.load executes arbitrary code from the file.
    # Only use an embeddings.pkl that was produced locally / by a trusted job.
    with open(RAG_INDEX_DIR / "embeddings.pkl", "rb") as f:
        embeddings = pickle.load(f)
    return chunks, embeddings


def retrieve(query, chunks, embeddings, top_k=5):
    """Return the chunks most similar to ``query`` by cosine similarity.

    Args:
        query: Text to embed and search for.
        chunks: Sequence of chunk records, index-aligned with ``embeddings``.
        embeddings: 2-D array with one row per chunk.
        top_k: Maximum number of candidates to consider.

    Returns:
        A list of ``{"chunk": ..., "score": float}`` dicts, best match first,
        containing only candidates whose similarity exceeds MIN_SIMILARITY.
    """
    # `[-0:]` is the whole array, so a non-positive top_k must be handled
    # explicitly or it would return every chunk instead of none.
    if top_k <= 0 or not chunks:
        return []

    model = get_emb_model()
    query_vec = model.encode([query])[0]

    norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query_vec)
    # A zero-length vector would give 0/0 = NaN; argsort ranks NaN last, so
    # those rows would occupy the top-k slots and hide real matches. Their
    # dot product is 0 anyway, so dividing by 1 yields a similarity of 0.
    norms = np.where(norms == 0, 1.0, norms)
    similarities = np.dot(embeddings, query_vec) / norms

    top_indices = similarities.argsort()[-top_k:][::-1]
    return [
        {"chunk": chunks[idx], "score": float(similarities[idx])}
        for idx in top_indices
        if similarities[idx] > MIN_SIMILARITY
    ]


def build_rag_prompt(query, retrieved):
    """Build prompt with mixed doc + source context.

    Args:
        query: The question to put to the model.
        retrieved: Results as returned by ``retrieve``; only the first
            MAX_CONTEXT_CHUNKS entries are used and each chunk's ``answer``
            text is cut to MAX_CHUNK_CHARS characters.

    Returns:
        The full prompt string.
    """
    context_parts = []
    for r in retrieved[:MAX_CONTEXT_CHUNKS]:
        chunk = r["chunk"]
        snippet = chunk['answer'][:MAX_CHUNK_CHARS]
        if chunk.get("_source_group", "") == "source_code":
            src_file = chunk.get("source", "")
            context_parts.append(
                f"From kernel source ({src_file}):\n```c\n{snippet}\n```"
            )
        else:
            context_parts.append(f"From kernel documentation:\n{snippet}")

    context = "\n\n".join(context_parts)

    return f"""You are a Linux kernel expert. Use the following kernel documentation and source code to answer the question.

Context:
{context}

Question: {query}

Answer the question thoroughly based on the context above. If the context doesn't contain enough information, use your own knowledge of the Linux kernel."""


def _parse_judge_score(judge_text):
    """Extract an integer 0-10 from the judge output, or DEFAULT_JUDGE_SCORE."""
    score_match = _JUDGE_SCORE_RE.search(judge_text.strip())
    if not score_match:
        return DEFAULT_JUDGE_SCORE
    return max(0, min(10, int(score_match.group(1))))


def _judge_answer(model, tokenizer, question, response, sampler):
    """Ask the model to grade ``response``; return an integer score 0-10.

    Judging is best-effort: any ordinary failure falls back to
    DEFAULT_JUDGE_SCORE so one bad sample does not abort the run.
    KeyboardInterrupt / SystemExit are deliberately not caught.
    """
    judge_prompt = (
        f"You are an expert Linux kernel evaluator. "
        f"Rate the following answer on a scale of 0-10 based on correctness, completeness, and precision.\n\n"
        f"Question: {question}\n\n"
        f"Answer: {response[:MAX_JUDGED_CHARS]}\n\n"
        f"Output ONLY a number 0-10, nothing else."
    )
    try:
        judge_resp = generate(
            model, tokenizer, prompt=judge_prompt, max_tokens=10, sampler=sampler
        )
        return _parse_judge_score(judge_resp)
    except Exception as exc:
        print(f"warning: judge failed ({exc!r}); using default score",
              file=sys.stderr, flush=True)
        return DEFAULT_JUDGE_SCORE


def run_evaluation(use_base=False):
    """Run every test through retrieval + generation, judge it, and save a report.

    Args:
        use_base: When true, load the base model only; otherwise load the
            base model together with the ``kernel-lora-v1.0`` adapter.

    Returns:
        The report dict that is also written as JSON to
        ``PROJECT_ROOT/results/rag_v20_eval_<timestamp>.json``.
    """
    print("Loading RAG v2.0 index (doc + source merged)...", flush=True)
    chunks, embeddings = load_index()
    print(f"  Index: {len(chunks)} chunks, embedding dim: {embeddings.shape[1]}", flush=True)

    if use_base:
        print("Loading base model...", flush=True)
        model, tokenizer = load(str(PROJECT_ROOT / "models" / "qwen2.5-7b"))
        method_name = "RAG v2.0 + Base Model"
    else:
        print("Loading fine-tuned model (v1.0)...", flush=True)
        model, tokenizer = load(
            str(PROJECT_ROOT / "models" / "qwen2.5-7b"),
            adapter_path=str(PROJECT_ROOT / "lora_adapters" / "kernel-lora-v1.0")
        )
        method_name = "RAG v2.0 + QLoRA (v1.0)"

    sampler = make_sampler(temp=0.7)
    # Low temperature for grading; built once instead of once per test.
    judge_sampler = make_sampler(temp=0.1)
    print("  Model loaded\n", flush=True)

    all_tests = TEST_CASES + CODE_COMPLETION_TESTS
    print(f"Running {len(all_tests)} tests with {method_name}...\n", flush=True)

    results = []
    for test in all_tests:
        qid = test["id"]
        question = test.get("question", test.get("prompt", ""))
        kws = test.get("reference_keywords", [])

        print(f"  [{qid}] ", end="", flush=True)

        retrieved = retrieve(question, chunks, embeddings)
        rag_prompt = build_rag_prompt(question, retrieved)

        # Only the answer generation is timed, not retrieval or judging.
        start = time.perf_counter()
        response = generate(
            model, tokenizer,
            prompt=rag_prompt[:MAX_PROMPT_CHARS],
            max_tokens=300,
            sampler=sampler,
        )
        elapsed = time.perf_counter() - start

        # The answer is graded by the same model that produced it.
        judge_score = _judge_answer(model, tokenizer, question, response, judge_sampler)
        normalized_score = judge_score / 10.0

        response_lower = response.lower()
        found_keywords = [kw for kw in kws if kw.lower() in response_lower]

        results.append({
            "id": qid,
            "score": normalized_score,
            "keywords_matched": len(found_keywords),
            "keywords_total": len(kws),
            "retrieved_chunks": len(retrieved),
            "elapsed_sec": round(elapsed, 1),
        })
        print(f"Score: {normalized_score:.0%} | {elapsed:.1f}s | {len(retrieved)} chunks", flush=True)

    # Map each id to the category of the first test carrying that id, then
    # group the scores; this replaces a nested scan over all tests per result.
    category_by_id = {}
    for test in all_tests:
        category_by_id.setdefault(test["id"], test.get("category", "unknown"))

    categories = {}
    for r in results:
        categories.setdefault(category_by_id[r["id"]], []).append(r["score"])

    print("\n" + "=" * 60)
    print(f"RAG v2.0 Evaluation: {method_name}")
    print("=" * 60)

    all_scores = [r["score"] for r in results]
    # Guard against an empty test list instead of dividing by zero.
    overall = sum(all_scores) / len(all_scores) if all_scores else 0.0
    print(f"\nOverall: {overall:.1%}")

    category_means = {
        cat: sum(scores) / len(scores) for cat, scores in categories.items()
    }
    for cat, mean_score in sorted(category_means.items()):
        print(f"  {cat}: {mean_score:.1%}")

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    output = {
        "timestamp": timestamp,
        "method": method_name,
        "index_size": len(chunks),
        "embedding_dim": embeddings.shape[1],
        "overall_score": overall,
        "results": results,
        "categories": category_means,
    }

    output_path = PROJECT_ROOT / "results" / f"rag_v20_eval_{timestamp}.json"
    # Create the directory first: failing here would discard the whole run.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\nResults saved to {output_path}")

    return output


def main():
    """Parse command-line arguments and run the evaluation."""
    parser = argparse.ArgumentParser(description="RAG v2.0 Evaluation (merged index)")
    parser.add_argument("--base", action="store_true", help="Use base model instead of QLoRA")
    args = parser.parse_args()
    run_evaluation(use_base=args.base)


if __name__ == "__main__":
    main()