File size: 13,973 Bytes
d38c1d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
"""
Benchmark harness for the reasoning engine.

Train/test split β†’ teach train β†’ evaluate on held-out test.
Reports: exact match, F1, abstention rate, latency.

Usage:
    python3 benchmark.py                    # all datasets, 80/20 split
    python3 benchmark.py --split 0.9        # 90/10 split
    python3 benchmark.py --dataset hotpotqa # single dataset
    python3 benchmark.py --test-only        # skip training, test existing brain
"""

import sys, os, json, time, argparse, random, re
from pathlib import Path
from collections import defaultdict

sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
from brain import Brain

def load_brain(db_path):
    """Load brain β€” auto-detect CSR > LMDB > SQLite."""
    csr_path = os.path.join(db_path, 'cooc_csr', 'indptr.bin')
    lmdb_path = os.path.join(db_path, 'brain.lmdb')
    if os.path.exists(csr_path) and os.path.exists(lmdb_path):
        from brain_csr_adapter import BrainCSR
        return BrainCSR(db_path=db_path)
    if os.path.exists(lmdb_path):
        from brain_lmdb_adapter import BrainLMDB
        return BrainLMDB(db_path=db_path)
    return Brain(db_path=db_path)

DATA_DIR = Path.home() / "webmind-research" / "data"


def tokenize(text):
    """Simple tokenization for F1 scoring."""
    return re.findall(r'\w+', text.lower())


def exact_match(prediction, gold):
    """Normalized exact match."""
    return prediction.strip().lower() == gold.strip().lower()


def f1_score(prediction, gold):
    """Token-level F1 between prediction and gold."""
    pred_tokens = set(tokenize(prediction))
    gold_tokens = set(tokenize(gold))
    if not gold_tokens:
        return 1.0 if not pred_tokens else 0.0
    if not pred_tokens:
        return 0.0
    common = pred_tokens & gold_tokens
    if not common:
        return 0.0
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)


def load_qa_records(dataset_path):
    """Load records that have questions and answers (for evaluation)."""
    records = []
    with open(dataset_path) as f:
        for line in f:
            try:
                item = json.loads(line.strip())
            except json.JSONDecodeError:
                continue

            question = item.get('question', item.get('text', '')).strip()
            answer = item.get('answer', '').strip()

            if question and answer:
                records.append({'question': question, 'answer': answer})
    return records


def split_data(records, train_ratio=0.8, seed=42):
    """Deterministic train/test split."""
    rng = random.Random(seed)
    shuffled = list(records)
    rng.shuffle(shuffled)
    split_idx = int(len(shuffled) * train_ratio)
    return shuffled[:split_idx], shuffled[split_idx:]


def teach_records(brain, records):
    """Teach training records to brain."""
    for rec in records:
        q = rec['question']
        a = rec['answer']
        # Teach the answer as knowledge
        if len(a) < 50 and not a.endswith('.'):
            combined = f"{q.rstrip('?')} is {a}"
        else:
            combined = a
        brain.teach(combined, confidence=0.6)


def evaluate(brain, test_records, verbose=False):
    """Evaluate brain on held-out test records."""
    results = {
        'total': len(test_records),
        'exact_match': 0,
        'f1_sum': 0.0,
        'abstentions': 0,
        'latency_ms': [],
        'examples': [],
    }

    for rec in test_records:
        question = rec['question']
        gold = rec['answer']

        t0 = time.time()
        try:
            result = brain.ask(question, auto_learn=False)
        except TypeError:
            result = brain.ask(question)
        latency = (time.time() - t0) * 1000

        prediction = result['answer']
        strategy = result['strategy']
        confidence = result['confidence']

        results['latency_ms'].append(latency)

        if strategy == 'abstain':
            results['abstentions'] += 1
            em = 0
            f1 = 0.0
        else:
            em = 1 if exact_match(prediction, gold) else 0
            f1 = f1_score(prediction, gold)

        results['exact_match'] += em
        results['f1_sum'] += f1

        if verbose and len(results['examples']) < 10:
            results['examples'].append({
                'question': question[:80],
                'gold': gold[:80],
                'prediction': prediction[:80],
                'em': em,
                'f1': round(f1, 3),
                'confidence': round(confidence, 3),
                'strategy': strategy,
                'latency_ms': round(latency, 1),
            })

    n = results['total']
    results['exact_match_pct'] = round(100 * results['exact_match'] / n, 2) if n else 0
    results['f1_avg'] = round(results['f1_sum'] / n, 4) if n else 0
    results['abstention_rate'] = round(100 * results['abstentions'] / n, 2) if n else 0
    results['latency_avg_ms'] = round(sum(results['latency_ms']) / n, 1) if n else 0
    results['latency_p95_ms'] = round(sorted(results['latency_ms'])[int(n * 0.95)] if n else 0, 1)

    return results


def rlhf_epoch(brain, test_records, epoch_num):
    """
    One RLHF epoch: evaluate β†’ reinforce good edges β†’ weaken bad ones.

    For each test question:
      - Ask the brain (get answer + participating words)
      - Score against gold (F1)
      - If F1 > 0.5: boost co-occurrence edges for answer words (Γ—1.1)
      - If F1 < 0.2: weaken co-occurrence edges (Γ—0.9)
      - Between: no change (uncertain)

    Works with both BrainCSR (WAL) and dict-based Brain (_cooc).
    """
    f1_sum = 0.0
    em_count = 0
    reinforced = 0
    weakened = 0

    # Check which backend we're using
    has_wal = hasattr(brain, '_wal')
    has_cooc = hasattr(brain, '_cooc')

    for rec in test_records:
        question = rec['question']
        gold = rec['answer']

        try:
            result = brain.ask(question, auto_learn=False)
        except TypeError:
            result = brain.ask(question)
        prediction = result['answer']
        strategy = result['strategy']

        if strategy == 'abstain':
            continue

        f1 = f1_score(prediction, gold)
        f1_sum += f1
        if exact_match(prediction, gold):
            em_count += 1

        # Get word indices that participated in the answer
        content_words = [t for t in tokenize(prediction)
                         if t in brain._word_idx]
        word_indices = [brain._word_idx[w] for w in content_words]

        if not word_indices:
            continue

        # Reinforce or weaken based on quality
        if f1 > 0.5:
            # Good answer β€” reinforce edges
            for widx in word_indices:
                profile = brain._get_profile(widx)
                for neighbor, weight in profile.items():
                    if neighbor == widx:
                        continue
                    boost = weight * 0.5  # 50% boost (was 10%)
                    if has_wal:
                        brain._wal.add_edge(widx, neighbor, boost)
                    elif has_cooc and widx in brain._cooc:
                        brain._cooc[widx][neighbor] = weight * 1.5
                reinforced += 1
        elif f1 < 0.2:
            # Bad answer — weaken edges AND teach correct answer via Q→A map
            if hasattr(brain, 'correct'):
                brain.correct(question, gold)  # direct Q→A mapping + strong edge boost
            for widx in word_indices:
                profile = brain._get_profile(widx)
                for neighbor, weight in profile.items():
                    if neighbor == widx:
                        continue
                    penalty = -weight * 0.5  # 50% reduction (was 10%)
                    if has_wal:
                        brain._wal.add_edge(widx, neighbor, penalty)
                    elif has_cooc and widx in brain._cooc:
                        brain._cooc[widx][neighbor] = weight * 0.5
                weakened += 1

    n = len(test_records)
    f1_avg = f1_sum / n if n else 0
    em_pct = 100 * em_count / n if n else 0

    print(f"  Epoch {epoch_num}: F1={f1_avg:.4f} EM={em_pct:.1f}% "
          f"reinforced={reinforced} weakened={weakened}")

    return f1_avg, em_pct, reinforced, weakened


def run_benchmark(args):
    """Main benchmark flow."""
    # Find datasets
    if args.dataset == 'all':
        ds_paths = sorted(DATA_DIR.glob("*.jsonl"))
    else:
        ds_paths = [DATA_DIR / f"{args.dataset}.jsonl"]

    # Load all Q&A records
    all_records = []
    ds_counts = {}
    for ds_path in ds_paths:
        if not ds_path.exists() or ds_path.stat().st_size == 0:
            continue
        records = load_qa_records(ds_path)
        if records:
            ds_counts[ds_path.stem] = len(records)
            all_records.extend(records)

    if not all_records:
        print("No Q&A records found.")
        return

    print(f"Loaded {len(all_records):,} Q&A records from {len(ds_counts)} datasets")
    for ds, count in sorted(ds_counts.items(), key=lambda x: -x[1])[:10]:
        print(f"  {ds}: {count:,}")

    # Split
    train, test = split_data(all_records, train_ratio=args.split)
    print(f"\nSplit: {len(train):,} train / {len(test):,} test "
          f"({args.split:.0%}/{1-args.split:.0%})")

    # Cap test size for speed
    if args.test_limit and len(test) > args.test_limit:
        test = test[:args.test_limit]
        print(f"Test capped at {args.test_limit}")

    # Train
    if not args.test_only:
        db_path = args.db_path or '/tmp/benchmark_brain'
        os.makedirs(db_path, exist_ok=True)

        print(f"\nTraining on {len(train):,} records...")
        brain = Brain(db_path=db_path)
        brain.begin_bulk()

        t0 = time.time()
        teach_records(brain, train)
        brain.end_bulk()
        train_time = time.time() - t0

        print(f"Training: {train_time:.1f}s | "
              f"{len(brain._words):,} words | "
              f"{brain.db.count():,} neurons")
    else:
        db_path = args.db_path or os.path.expanduser('~/nexus-brain')
        brain = load_brain(db_path)
        print(f"\nUsing existing brain: {len(brain._words):,} words")

    # RLHF epochs (if requested)
    if args.epochs > 0:
        print(f"\n=== RLHF: {args.epochs} epochs ===")
        # Use a subset for RLHF (faster iterations)
        rlhf_set = test[:min(200, len(test))]
        epoch_results = []
        for epoch in range(1, args.epochs + 1):
            f1, em, reinforced, weakened = rlhf_epoch(brain, rlhf_set, epoch)
            epoch_results.append({'epoch': epoch, 'f1': f1, 'em': em,
                                  'reinforced': reinforced, 'weakened': weakened})
            if reinforced == 0 and weakened == 0:
                print(f"  Converged at epoch {epoch} (no changes)")
                break
        print(f"  RLHF complete. F1 improvement: "
              f"{epoch_results[0]['f1']:.4f} β†’ {epoch_results[-1]['f1']:.4f}")

    # Evaluate (final, after RLHF if any)
    print(f"\nEvaluating on {len(test):,} held-out questions...")
    t0 = time.time()
    results = evaluate(brain, test, verbose=True)
    eval_time = time.time() - t0

    # Report
    print(f"\n{'='*60}")
    print(f"BENCHMARK RESULTS")
    print(f"{'='*60}")
    print(f"  Exact Match:     {results['exact_match_pct']}%  "
          f"({results['exact_match']}/{results['total']})")
    print(f"  Token F1:        {results['f1_avg']}")
    print(f"  Abstention Rate: {results['abstention_rate']}%  "
          f"({results['abstentions']}/{results['total']})")
    print(f"  Latency (avg):   {results['latency_avg_ms']} ms")
    print(f"  Latency (p95):   {results['latency_p95_ms']} ms")
    print(f"  Eval time:       {eval_time:.1f}s")
    print(f"{'='*60}")

    if results['examples']:
        print(f"\nSample predictions:")
        for ex in results['examples'][:5]:
            status = "βœ“" if ex['em'] else ("βˆ…" if ex['strategy'] == 'abstain' else "βœ—")
            print(f"  {status} Q: {ex['question']}")
            print(f"    Gold: {ex['gold']}")
            print(f"    Pred: {ex['prediction']} "
                  f"[{ex['strategy']}, f1={ex['f1']}, {ex['latency_ms']}ms]")
            print()

    # Save results
    out_path = Path(db_path) / 'benchmark_results.json'
    with open(out_path, 'w') as f:
        json.dump({
            'exact_match_pct': results['exact_match_pct'],
            'f1_avg': results['f1_avg'],
            'abstention_rate': results['abstention_rate'],
            'latency_avg_ms': results['latency_avg_ms'],
            'latency_p95_ms': results['latency_p95_ms'],
            'total_test': results['total'],
            'total_train': len(train),
            'datasets': ds_counts,
            'split': args.split,
        }, f, indent=2)
    print(f"Results saved to {out_path}")

    brain.close()


def main():
    parser = argparse.ArgumentParser(description="Benchmark the reasoning engine")
    parser.add_argument("--dataset", default="all",
                        help="Dataset name or 'all'")
    parser.add_argument("--split", type=float, default=0.8,
                        help="Train ratio (default 0.8 = 80/20)")
    parser.add_argument("--test-limit", type=int, default=500,
                        help="Max test questions for speed (0=unlimited)")
    parser.add_argument("--test-only", action="store_true",
                        help="Skip training, evaluate existing brain")
    parser.add_argument("--db-path", default=None,
                        help="Brain DB path (default: /tmp/benchmark_brain)")
    parser.add_argument("--epochs", type=int, default=5,
                        help="RLHF epochs (0=skip, default=5)")
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    run_benchmark(args)


if __name__ == '__main__':
    main()