File size: 7,035 Bytes
1a017d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env python3
"""
RAG Evaluation v2.0 — merged index (documentation + source code).
Uses embedding retrieval + QLoRA v1.0.

Usage: python scripts/rag_v20_evaluate.py
       python scripts/rag_v20_evaluate.py --base  # use base model
"""

import json, pickle, sys, time
from pathlib import Path
import numpy as np
from mlx_lm import load, generate
from mlx_lm.sample_utils import make_sampler

PROJECT_ROOT = Path(__file__).resolve().parent.parent
RAG_INDEX_DIR = PROJECT_ROOT / "data" / "rag_index_v20"

sys.path.insert(0, str(PROJECT_ROOT / "scripts"))
import importlib.util
spec = importlib.util.spec_from_file_location("evaluate_module", PROJECT_ROOT / "scripts" / "evaluate.py")
eval_mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(eval_mod)

TEST_CASES = eval_mod.TEST_CASES
CODE_COMPLETION_TESTS = eval_mod.CODE_COMPLETION_TESTS

_emb_model = None

def get_emb_model():
    global _emb_model
    if _emb_model is None:
        from sentence_transformers import SentenceTransformer
        _emb_model = SentenceTransformer("all-MiniLM-L6-v2")
    return _emb_model


def load_index():
    with open(RAG_INDEX_DIR / "chunks.jsonl") as f:
        chunks = [json.loads(line) for line in f]
    with open(RAG_INDEX_DIR / "embeddings.pkl", "rb") as f:
        embeddings = pickle.load(f)
    return chunks, embeddings


def retrieve(query, chunks, embeddings, top_k=5):
    model = get_emb_model()
    query_vec = model.encode([query])[0]
    similarities = np.dot(embeddings, query_vec) / (
        np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query_vec)
    )
    top_indices = similarities.argsort()[-top_k:][::-1]
    results = []
    for idx in top_indices:
        if similarities[idx] > 0.1:
            results.append({"chunk": chunks[idx], "score": float(similarities[idx])})
    return results


def build_rag_prompt(query, retrieved):
    """Build prompt with mixed doc + source context."""
    context_parts = []
    for r in retrieved[:3]:
        chunk = r["chunk"]
        source_group = chunk.get("_source_group", "")
        if source_group == "source_code":
            func_name = chunk.get("function", "")
            src_file = chunk.get("source", "")
            context_parts.append(f"From kernel source ({src_file}):\n```c\n{chunk['answer'][:500]}\n```")
        else:
            context_parts.append(f"From kernel documentation:\n{chunk['answer'][:500]}")
    context = "\n\n".join(context_parts)
    return f"""You are a Linux kernel expert. Use the following kernel documentation and source code to answer the question.

Context:
{context}

Question: {query}

Answer the question thoroughly based on the context above. If the context doesn't contain enough information, use your own knowledge of the Linux kernel."""


def run_evaluation(use_base=False):
    print("Loading RAG v2.0 index (doc + source merged)...", flush=True)
    chunks, embeddings = load_index()
    print(f"  Index: {len(chunks)} chunks, embedding dim: {embeddings.shape[1]}", flush=True)

    if use_base:
        print("Loading base model...", flush=True)
        model, tokenizer = load(str(PROJECT_ROOT / "models" / "qwen2.5-7b"))
        method_name = "RAG v2.0 + Base Model"
    else:
        print("Loading fine-tuned model (v1.0)...", flush=True)
        model, tokenizer = load(
            str(PROJECT_ROOT / "models" / "qwen2.5-7b"),
            adapter_path=str(PROJECT_ROOT / "lora_adapters" / "kernel-lora-v1.0")
        )
        method_name = "RAG v2.0 + QLoRA (v1.0)"

    sampler = make_sampler(temp=0.7)
    print("  Model loaded\n", flush=True)

    all_tests = TEST_CASES + CODE_COMPLETION_TESTS
    print(f"Running {len(all_tests)} tests with {method_name}...\n", flush=True)

    results = []
    for test in all_tests:
        qid = test["id"]
        question = test.get("question", test.get("prompt", ""))
        kws = test.get("reference_keywords", [])

        print(f"  [{qid}] ", end="", flush=True)

        retrieved = retrieve(question, chunks, embeddings)
        rag_prompt = build_rag_prompt(question, retrieved)

        start = time.time()
        response = generate(model, tokenizer, prompt=rag_prompt[:3000], max_tokens=300, sampler=sampler)
        elapsed = time.time() - start

        judge_prompt = (
            f"You are an expert Linux kernel evaluator. "
            f"Rate the following answer on a scale of 0-10 based on correctness, completeness, and precision.\n\n"
            f"Question: {question}\n\n"
            f"Answer: {response[:1000]}\n\n"
            f"Output ONLY a number 0-10, nothing else."
        )
        try:
            judge_resp = generate(model, tokenizer, prompt=judge_prompt, max_tokens=10, sampler=make_sampler(temp=0.1))
            import re
            score_match = re.search(r'\b(\d+)(?:/10)?\b', judge_resp.strip())
            judge_score = int(score_match.group(1)) if score_match else 5
            judge_score = max(0, min(10, judge_score))
        except:
            judge_score = 5

        normalized_score = judge_score / 10.0
        found_keywords = [kw for kw in kws if kw.lower() in response.lower()]

        results.append({
            "id": qid,
            "score": normalized_score,
            "keywords_matched": len(found_keywords),
            "keywords_total": len(kws),
            "retrieved_chunks": len(retrieved),
            "elapsed_sec": round(elapsed, 1),
        })

        print(f"Score: {normalized_score:.0%} | {elapsed:.1f}s | {len(retrieved)} chunks", flush=True)

    categories = {}
    for r in results:
        for test in all_tests:
            if test["id"] == r["id"]:
                cat = test.get("category", "unknown")
                categories.setdefault(cat, []).append(r["score"])
                break

    print("\n" + "=" * 60)
    print(f"RAG v2.0 Evaluation: {method_name}")
    print("=" * 60)

    all_scores = [r["score"] for r in results]
    overall = sum(all_scores) / len(all_scores)
    print(f"\nOverall: {overall:.1%}")

    for cat, scores in sorted(categories.items()):
        print(f"  {cat}: {sum(scores)/len(scores):.1%}")

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    output = {
        "timestamp": timestamp,
        "method": method_name,
        "index_size": len(chunks),
        "embedding_dim": embeddings.shape[1],
        "overall_score": overall,
        "results": results,
        "categories": {cat: sum(scores)/len(scores) for cat, scores in categories.items()},
    }

    output_path = PROJECT_ROOT / "results" / f"rag_v20_eval_{timestamp}.json"
    with open(output_path, "w") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print(f"\nResults saved to {output_path}")
    return output


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="RAG v2.0 Evaluation (merged index)")
    parser.add_argument("--base", action="store_true", help="Use base model instead of QLoRA")
    args = parser.parse_args()
    run_evaluation(use_base=args.base)