#!/usr/bin/env python3
"""
RAG Evaluation v2.0 — merged index (documentation + source code).
Uses embedding retrieval + QLoRA v1.0.
Usage: python scripts/rag_v20_evaluate.py
python scripts/rag_v20_evaluate.py --base # use base model
"""
import json, pickle, sys, time
from pathlib import Path
import numpy as np
from mlx_lm import load, generate
from mlx_lm.sample_utils import make_sampler
# Repository root: two levels up from this file (this file's directory, then
# its parent).
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Directory holding the merged v2.0 index files read by load_index().
RAG_INDEX_DIR = PROJECT_ROOT / "data" / "rag_index_v20"
# Put <root>/scripts first on the import path.
# NOTE(review): presumably so that evaluate.py's own sibling imports resolve
# when it is executed below — confirm against evaluate.py.
sys.path.insert(0, str(PROJECT_ROOT / "scripts"))
import importlib.util
# Load scripts/evaluate.py by file path under the module name
# "evaluate_module"; exec_module() runs that file's top-level code.
spec = importlib.util.spec_from_file_location("evaluate_module", PROJECT_ROOT / "scripts" / "evaluate.py")
eval_mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(eval_mod)
# Re-export the two test suites that run_evaluation() concatenates.
TEST_CASES = eval_mod.TEST_CASES
CODE_COMPLETION_TESTS = eval_mod.CODE_COMPLETION_TESTS
# Module-level cache for the sentence-embedding model; filled on first use.
_emb_model = None


def get_emb_model():
    """Return the shared ``all-MiniLM-L6-v2`` encoder, loading it on first call.

    The ``sentence_transformers`` import is deferred until the model is first
    requested. The loaded instance is stored in the module-level
    ``_emb_model`` and returned unchanged on every later call.
    """
    global _emb_model
    if _emb_model is not None:
        return _emb_model

    from sentence_transformers import SentenceTransformer

    _emb_model = SentenceTransformer("all-MiniLM-L6-v2")
    return _emb_model
def load_index():
    """Load the merged RAG index (chunk records and embeddings) from disk.

    Reads ``chunks.jsonl`` (one JSON object per line) and ``embeddings.pkl``
    from ``RAG_INDEX_DIR``.

    Returns:
        tuple: ``(chunks, embeddings)``. ``chunks`` is the list of decoded
        JSON objects in file order; ``embeddings`` is whatever object was
        pickled (the rest of this file uses it as a 2-D NumPy array with one
        row per chunk).

    Raises:
        FileNotFoundError: If either index file is missing.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default; skip blank lines (e.g. a trailing newline at
    # end of file), which would otherwise make json.loads raise.
    with open(RAG_INDEX_DIR / "chunks.jsonl", encoding="utf-8") as f:
        chunks = [json.loads(line) for line in f if line.strip()]
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only point RAG_INDEX_DIR at index files this project generated itself.
    with open(RAG_INDEX_DIR / "embeddings.pkl", "rb") as f:
        embeddings = pickle.load(f)
    return chunks, embeddings
def retrieve(query, chunks, embeddings, top_k=5, min_score=0.1):
    """Return the chunks most similar to *query* by cosine similarity.

    Args:
        query: Text to embed with the model returned by ``get_emb_model()``.
        chunks: Sequence of chunk records, indexed in parallel with the rows
            of ``embeddings``.
        embeddings: 2-D array-like with one embedding vector per row.
        top_k: Maximum number of results to return. Values ``<= 0`` yield an
            empty list.
        min_score: Results whose cosine similarity is not strictly greater
            than this value are dropped.

    Returns:
        list[dict]: Up to ``top_k`` items ``{"chunk": ..., "score": float}``,
        ordered from highest to lowest similarity.
    """
    # Guard first: with top_k == 0 the slice [-0:] would select the *whole*
    # array, and an empty index needs no (expensive) model load at all.
    if top_k <= 0 or len(embeddings) == 0:
        return []

    model = get_emb_model()
    query_vec = model.encode([query])[0]

    norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query_vec)
    with np.errstate(divide="ignore", invalid="ignore"):
        similarities = np.dot(embeddings, query_vec) / norms
    # A zero-norm vector gives 0/0 = NaN. argsort places NaN last, i.e. it
    # would be ranked as the best match and crowd real hits out of the top-k
    # before being filtered. Treat such rows as having no similarity instead.
    similarities = np.where(np.isnan(similarities), 0.0, similarities)

    top_indices = similarities.argsort()[::-1][:top_k]
    return [
        {"chunk": chunks[idx], "score": float(similarities[idx])}
        for idx in top_indices
        if similarities[idx] > min_score
    ]
def build_rag_prompt(query, retrieved, max_chunks=3, max_chars=500):
    """Build the generation prompt with mixed doc + source context.

    Args:
        query: The question placed in the prompt.
        retrieved: Retrieval results, best first. Each item is a dict whose
            ``"chunk"`` entry is a dict with an ``"answer"`` text and optional
            ``"_source_group"`` and ``"source"`` entries.
        max_chunks: Number of leading results to include as context.
        max_chars: Per-chunk character limit applied to the ``"answer"`` text.

    Returns:
        str: The instruction header, the context sections (separated by a
        blank line), the question, and the closing instruction.

    Raises:
        KeyError: If an included item lacks ``"chunk"`` or its chunk lacks
            ``"answer"``.
    """
    context_parts = []
    for r in retrieved[:max_chunks]:
        chunk = r["chunk"]
        snippet = chunk["answer"][:max_chars]
        if chunk.get("_source_group", "") == "source_code":
            # Source-code chunks are fenced as C and labelled with their file.
            src_file = chunk.get("source", "")
            context_parts.append(f"From kernel source ({src_file}):\n```c\n{snippet}\n```")
        else:
            context_parts.append(f"From kernel documentation:\n{snippet}")
    context = "\n\n".join(context_parts)
    # Assembled with explicit newlines so that re-indenting this function can
    # never change the text sent to the model.
    return (
        "You are a Linux kernel expert. Use the following kernel documentation and source code to answer the question.\n"
        "Context:\n"
        f"{context}\n"
        f"Question: {query}\n"
        "Answer the question thoroughly based on the context above. If the context doesn't contain enough information, use your own knowledge of the Linux kernel."
    )
def _parse_judge_score(judge_resp, default=5):
    """Return the 0-10 rating found in the judge model's reply, or *default*."""
    import re  # function-scope import: the module does not import `re` at top level

    score_match = re.search(r'\b(\d+)(?:/10)?\b', judge_resp.strip())
    if score_match is None:
        return default
    # Clamp so a reply such as "85" cannot push the normalized score past 100%.
    return max(0, min(10, int(score_match.group(1))))


def _mean(values):
    """Return the arithmetic mean of *values*, or 0.0 for an empty sequence."""
    return sum(values) / len(values) if values else 0.0


def run_evaluation(use_base=False):
    """Run every test through retrieval + generation and score the answers.

    For each entry of ``TEST_CASES + CODE_COMPLETION_TESTS`` this retrieves
    context from the merged index, generates an answer, asks the same model to
    rate that answer 0-10, and counts reference-keyword hits. A summary is
    printed and the results are written to
    ``results/rag_v20_eval_<timestamp>.json`` under ``PROJECT_ROOT``.

    Args:
        use_base: If true, evaluate the base model only; otherwise load the
            ``kernel-lora-v1.0`` adapter on top of it.

    Returns:
        dict: The structure that was written to the JSON file, with keys
        ``timestamp``, ``method``, ``index_size``, ``embedding_dim``,
        ``overall_score``, ``results`` and ``categories``.
    """
    print("Loading RAG v2.0 index (doc + source merged)...", flush=True)
    chunks, embeddings = load_index()
    print(f" Index: {len(chunks)} chunks, embedding dim: {embeddings.shape[1]}", flush=True)

    model_path = str(PROJECT_ROOT / "models" / "qwen2.5-7b")
    if use_base:
        print("Loading base model...", flush=True)
        model, tokenizer = load(model_path)
        method_name = "RAG v2.0 + Base Model"
    else:
        print("Loading fine-tuned model (v1.0)...", flush=True)
        model, tokenizer = load(
            model_path,
            adapter_path=str(PROJECT_ROOT / "lora_adapters" / "kernel-lora-v1.0"),
        )
        method_name = "RAG v2.0 + QLoRA (v1.0)"

    sampler = make_sampler(temp=0.7)
    # Built once: the judge sampler does not vary between tests.
    judge_sampler = make_sampler(temp=0.1)
    print(" Model loaded\n", flush=True)

    all_tests = TEST_CASES + CODE_COMPLETION_TESTS
    print(f"Running {len(all_tests)} tests with {method_name}...\n", flush=True)

    results = []
    categories = {}
    for test in all_tests:
        qid = test["id"]
        question = test.get("question", test.get("prompt", ""))
        kws = test.get("reference_keywords", [])
        print(f" [{qid}] ", end="", flush=True)

        retrieved = retrieve(question, chunks, embeddings)
        rag_prompt = build_rag_prompt(question, retrieved)

        start = time.time()
        response = generate(model, tokenizer, prompt=rag_prompt[:3000], max_tokens=300, sampler=sampler)
        elapsed = time.time() - start

        judge_prompt = (
            f"You are an expert Linux kernel evaluator. "
            f"Rate the following answer on a scale of 0-10 based on correctness, completeness, and precision.\n\n"
            f"Question: {question}\n\n"
            f"Answer: {response[:1000]}\n\n"
            f"Output ONLY a number 0-10, nothing else."
        )
        try:
            judge_resp = generate(model, tokenizer, prompt=judge_prompt, max_tokens=10, sampler=judge_sampler)
            judge_score = _parse_judge_score(judge_resp)
        except Exception as exc:
            # Best effort: a failed judge call falls back to the neutral score
            # instead of aborting the run. `Exception` (not a bare except)
            # keeps Ctrl-C and SystemExit working.
            print(f"(judge failed: {exc!r}) ", end="", file=sys.stderr, flush=True)
            judge_score = 5
        normalized_score = judge_score / 10.0

        response_lower = response.lower()
        keywords_matched = sum(1 for kw in kws if kw.lower() in response_lower)

        results.append({
            "id": qid,
            "score": normalized_score,
            "keywords_matched": keywords_matched,
            "keywords_total": len(kws),
            "retrieved_chunks": len(retrieved),
            "elapsed_sec": round(elapsed, 1),
        })
        # Record the category here rather than re-scanning all_tests by id
        # afterwards: linear instead of quadratic, and each score lands in its
        # own test's category even if two tests share an id.
        categories.setdefault(test.get("category", "unknown"), []).append(normalized_score)
        print(f"Score: {normalized_score:.0%} | {elapsed:.1f}s | {len(retrieved)} chunks", flush=True)

    # _mean() yields 0.0 for an empty test list instead of dividing by zero.
    overall = _mean([r["score"] for r in results])
    category_means = {cat: _mean(scores) for cat, scores in categories.items()}

    print("\n" + "=" * 60)
    print(f"RAG v2.0 Evaluation: {method_name}")
    print("=" * 60)
    print(f"\nOverall: {overall:.1%}")
    for cat, cat_mean in sorted(category_means.items()):
        print(f" {cat}: {cat_mean:.1%}")

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    output = {
        "timestamp": timestamp,
        "method": method_name,
        "index_size": len(chunks),
        "embedding_dim": embeddings.shape[1],
        "overall_score": overall,
        "results": results,
        "categories": category_means,
    }
    output_path = PROJECT_ROOT / "results" / f"rag_v20_eval_{timestamp}.json"
    # Create results/ on first run, and write UTF-8 explicitly because
    # ensure_ascii=False may emit non-ASCII characters.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\nResults saved to {output_path}")
    return output
# Script entry point: parse the command line and start the evaluation.
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(description="RAG v2.0 Evaluation (merged index)")
    cli.add_argument(
        "--base",
        action="store_true",
        help="Use base model instead of QLoRA",
    )
    options = cli.parse_args()
    run_evaluation(use_base=options.base)
|