"""
Benchmark harness for the reasoning engine.
Train/test split → teach train → evaluate on held-out test.
Reports: exact match, F1, abstention rate, latency.
Usage:
python3 benchmark.py # all datasets, 80/20 split
python3 benchmark.py --split 0.9 # 90/10 split
python3 benchmark.py --dataset hotpotqa # single dataset
python3 benchmark.py --test-only # skip training, test existing brain
"""
import sys, os, json, time, argparse, random, re
from pathlib import Path
from collections import defaultdict
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
from brain import Brain
def load_brain(db_path):
"""Load brain — auto-detect CSR > LMDB > SQLite."""
csr_path = os.path.join(db_path, 'cooc_csr', 'indptr.bin')
lmdb_path = os.path.join(db_path, 'brain.lmdb')
if os.path.exists(csr_path) and os.path.exists(lmdb_path):
from brain_csr_adapter import BrainCSR
return BrainCSR(db_path=db_path)
if os.path.exists(lmdb_path):
from brain_lmdb_adapter import BrainLMDB
return BrainLMDB(db_path=db_path)
return Brain(db_path=db_path)
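# Backend priority in load_brain above: a CSR co-occurrence index (alongside the
# LMDB store) is preferred, then plain LMDB, with the SQLite-backed Brain as fallback.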
DATA_DIR = Path.home() / "webmind-research" / "data"
def tokenize(text):
"""Simple tokenization for F1 scoring."""
return re.findall(r'\w+', text.lower())
def exact_match(prediction, gold):
"""Normalized exact match."""
return prediction.strip().lower() == gold.strip().lower()
def f1_score(prediction, gold):
"""Token-level F1 between prediction and gold."""
pred_tokens = set(tokenize(prediction))
gold_tokens = set(tokenize(gold))
if not gold_tokens:
return 1.0 if not pred_tokens else 0.0
if not pred_tokens:
return 0.0
common = pred_tokens & gold_tokens
if not common:
return 0.0
precision = len(common) / len(pred_tokens)
recall = len(common) / len(gold_tokens)
return 2 * precision * recall / (precision + recall)
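# Worked example for f1_score above (illustrative values, not from any dataset):
#   prediction="the Eiffel Tower", gold="Eiffel Tower"
#   pred_tokens={the, eiffel, tower}, gold_tokens={eiffel, tower}, common={eiffel, tower}
#   precision = 2/3, recall = 2/2 = 1.0, F1 = 2*(2/3)*1.0 / (2/3 + 1.0) = 0.8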
def load_qa_records(dataset_path):
"""Load records that have questions and answers (for evaluation)."""
records = []
with open(dataset_path) as f:
for line in f:
try:
item = json.loads(line.strip())
except json.JSONDecodeError:
continue
question = item.get('question', item.get('text', '')).strip()
answer = item.get('answer', '').strip()
if question and answer:
records.append({'question': question, 'answer': answer})
return records
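# Expected JSONL format (one JSON object per line), as read above:
#   {"question": "...", "answer": "..."}   # 'text' is accepted in place of 'question'
# Lines that fail to parse, or that lack either field, are skipped.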
def split_data(records, train_ratio=0.8, seed=42):
"""Deterministic train/test split."""
rng = random.Random(seed)
shuffled = list(records)
rng.shuffle(shuffled)
split_idx = int(len(shuffled) * train_ratio)
return shuffled[:split_idx], shuffled[split_idx:]
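# Sanity check (illustrative): with 10 records and train_ratio=0.8, split_data
# returns 8 train / 2 test, and the same seed always reproduces the same shuffle.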
def teach_records(brain, records):
"""Teach training records to brain."""
for rec in records:
q = rec['question']
a = rec['answer']
# Teach the answer as knowledge
if len(a) < 50 and not a.endswith('.'):
combined = f"{q.rstrip('?')} is {a}"
else:
combined = a
brain.teach(combined, confidence=0.6)
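# Illustrative teaching example: for a short answer with no trailing period,
# e.g. question "What is the capital of France?" and answer "Paris", the brain is
# taught the combined statement "What is the capital of France is Paris";
# longer or sentence-like answers are taught verbatim.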
def evaluate(brain, test_records, verbose=False):
"""Evaluate brain on held-out test records."""
results = {
'total': len(test_records),
'exact_match': 0,
'f1_sum': 0.0,
'abstentions': 0,
'latency_ms': [],
'examples': [],
}
for rec in test_records:
question = rec['question']
gold = rec['answer']
t0 = time.time()
try:
result = brain.ask(question, auto_learn=False)
except TypeError:
result = brain.ask(question)
latency = (time.time() - t0) * 1000
        prediction = result.get('answer') or ''  # answer may be missing/None on abstain
strategy = result['strategy']
confidence = result['confidence']
results['latency_ms'].append(latency)
if strategy == 'abstain':
results['abstentions'] += 1
em = 0
f1 = 0.0
else:
em = 1 if exact_match(prediction, gold) else 0
f1 = f1_score(prediction, gold)
results['exact_match'] += em
results['f1_sum'] += f1
if verbose and len(results['examples']) < 10:
results['examples'].append({
'question': question[:80],
'gold': gold[:80],
'prediction': prediction[:80],
'em': em,
'f1': round(f1, 3),
'confidence': round(confidence, 3),
'strategy': strategy,
'latency_ms': round(latency, 1),
})
n = results['total']
results['exact_match_pct'] = round(100 * results['exact_match'] / n, 2) if n else 0
results['f1_avg'] = round(results['f1_sum'] / n, 4) if n else 0
results['abstention_rate'] = round(100 * results['abstentions'] / n, 2) if n else 0
results['latency_avg_ms'] = round(sum(results['latency_ms']) / n, 1) if n else 0
results['latency_p95_ms'] = round(sorted(results['latency_ms'])[int(n * 0.95)] if n else 0, 1)
return results
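# Note: the p95 latency reported above is approximated by indexing the sorted
# per-question latencies at int(n * 0.95) rather than interpolating between ranks.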
def rlhf_epoch(brain, test_records, epoch_num):
"""
One RLHF epoch: evaluate → reinforce good edges → weaken bad ones.
For each test question:
- Ask the brain (get answer + participating words)
- Score against gold (F1)
    - If F1 > 0.5: boost co-occurrence edges for answer words (×1.5)
    - If F1 < 0.2: weaken co-occurrence edges (×0.5) and, when available,
      teach the gold answer via brain.correct() (direct Q→A mapping)
    - In between: no change (uncertain)
Works with both BrainCSR (WAL) and dict-based Brain (_cooc).
"""
f1_sum = 0.0
em_count = 0
reinforced = 0
weakened = 0
# Check which backend we're using
has_wal = hasattr(brain, '_wal')
has_cooc = hasattr(brain, '_cooc')
for rec in test_records:
question = rec['question']
gold = rec['answer']
try:
result = brain.ask(question, auto_learn=False)
except TypeError:
result = brain.ask(question)
prediction = result['answer']
strategy = result['strategy']
if strategy == 'abstain':
continue
f1 = f1_score(prediction, gold)
f1_sum += f1
if exact_match(prediction, gold):
em_count += 1
# Get word indices that participated in the answer
content_words = [t for t in tokenize(prediction)
if t in brain._word_idx]
word_indices = [brain._word_idx[w] for w in content_words]
if not word_indices:
continue
# Reinforce or weaken based on quality
if f1 > 0.5:
# Good answer — reinforce edges
for widx in word_indices:
profile = brain._get_profile(widx)
for neighbor, weight in profile.items():
if neighbor == widx:
continue
boost = weight * 0.5 # 50% boost (was 10%)
if has_wal:
brain._wal.add_edge(widx, neighbor, boost)
elif has_cooc and widx in brain._cooc:
brain._cooc[widx][neighbor] = weight * 1.5
reinforced += 1
elif f1 < 0.2:
# Bad answer — weaken edges AND teach correct answer via Q→A map
if hasattr(brain, 'correct'):
brain.correct(question, gold) # direct Q→A mapping + strong edge boost
for widx in word_indices:
profile = brain._get_profile(widx)
for neighbor, weight in profile.items():
if neighbor == widx:
continue
penalty = -weight * 0.5 # 50% reduction (was 10%)
if has_wal:
brain._wal.add_edge(widx, neighbor, penalty)
elif has_cooc and widx in brain._cooc:
brain._cooc[widx][neighbor] = weight * 0.5
weakened += 1
n = len(test_records)
f1_avg = f1_sum / n if n else 0
em_pct = 100 * em_count / n if n else 0
print(f" Epoch {epoch_num}: F1={f1_avg:.4f} EM={em_pct:.1f}% "
f"reinforced={reinforced} weakened={weakened}")
return f1_avg, em_pct, reinforced, weakened
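# Edge-update arithmetic in rlhf_epoch above: on the WAL backend a delta of
# +0.5*weight (or -0.5*weight) is appended, which matches the dict backend's
# direct multiply by 1.5 (or 0.5) for the same co-occurrence edge.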
def run_benchmark(args):
"""Main benchmark flow."""
# Find datasets
if args.dataset == 'all':
ds_paths = sorted(DATA_DIR.glob("*.jsonl"))
else:
ds_paths = [DATA_DIR / f"{args.dataset}.jsonl"]
# Load all Q&A records
all_records = []
ds_counts = {}
for ds_path in ds_paths:
if not ds_path.exists() or ds_path.stat().st_size == 0:
continue
records = load_qa_records(ds_path)
if records:
ds_counts[ds_path.stem] = len(records)
all_records.extend(records)
if not all_records:
print("No Q&A records found.")
return
print(f"Loaded {len(all_records):,} Q&A records from {len(ds_counts)} datasets")
for ds, count in sorted(ds_counts.items(), key=lambda x: -x[1])[:10]:
print(f" {ds}: {count:,}")
# Split
    train, test = split_data(all_records, train_ratio=args.split, seed=args.seed)
print(f"\nSplit: {len(train):,} train / {len(test):,} test "
f"({args.split:.0%}/{1-args.split:.0%})")
# Cap test size for speed
if args.test_limit and len(test) > args.test_limit:
test = test[:args.test_limit]
print(f"Test capped at {args.test_limit}")
# Train
if not args.test_only:
db_path = args.db_path or '/tmp/benchmark_brain'
os.makedirs(db_path, exist_ok=True)
print(f"\nTraining on {len(train):,} records...")
brain = Brain(db_path=db_path)
brain.begin_bulk()
t0 = time.time()
teach_records(brain, train)
brain.end_bulk()
train_time = time.time() - t0
print(f"Training: {train_time:.1f}s | "
f"{len(brain._words):,} words | "
f"{brain.db.count():,} neurons")
else:
db_path = args.db_path or os.path.expanduser('~/nexus-brain')
brain = load_brain(db_path)
print(f"\nUsing existing brain: {len(brain._words):,} words")
# RLHF epochs (if requested)
if args.epochs > 0:
print(f"\n=== RLHF: {args.epochs} epochs ===")
# Use a subset for RLHF (faster iterations)
rlhf_set = test[:min(200, len(test))]
epoch_results = []
for epoch in range(1, args.epochs + 1):
f1, em, reinforced, weakened = rlhf_epoch(brain, rlhf_set, epoch)
epoch_results.append({'epoch': epoch, 'f1': f1, 'em': em,
'reinforced': reinforced, 'weakened': weakened})
if reinforced == 0 and weakened == 0:
print(f" Converged at epoch {epoch} (no changes)")
break
print(f" RLHF complete. F1 improvement: "
f"{epoch_results[0]['f1']:.4f}{epoch_results[-1]['f1']:.4f}")
# Evaluate (final, after RLHF if any)
print(f"\nEvaluating on {len(test):,} held-out questions...")
t0 = time.time()
results = evaluate(brain, test, verbose=True)
eval_time = time.time() - t0
# Report
print(f"\n{'='*60}")
print(f"BENCHMARK RESULTS")
print(f"{'='*60}")
print(f" Exact Match: {results['exact_match_pct']}% "
f"({results['exact_match']}/{results['total']})")
print(f" Token F1: {results['f1_avg']}")
print(f" Abstention Rate: {results['abstention_rate']}% "
f"({results['abstentions']}/{results['total']})")
print(f" Latency (avg): {results['latency_avg_ms']} ms")
print(f" Latency (p95): {results['latency_p95_ms']} ms")
print(f" Eval time: {eval_time:.1f}s")
print(f"{'='*60}")
if results['examples']:
print(f"\nSample predictions:")
for ex in results['examples'][:5]:
status = "✓" if ex['em'] else ("∅" if ex['strategy'] == 'abstain' else "✗")
print(f" {status} Q: {ex['question']}")
print(f" Gold: {ex['gold']}")
print(f" Pred: {ex['prediction']} "
f"[{ex['strategy']}, f1={ex['f1']}, {ex['latency_ms']}ms]")
print()
# Save results
out_path = Path(db_path) / 'benchmark_results.json'
with open(out_path, 'w') as f:
json.dump({
'exact_match_pct': results['exact_match_pct'],
'f1_avg': results['f1_avg'],
'abstention_rate': results['abstention_rate'],
'latency_avg_ms': results['latency_avg_ms'],
'latency_p95_ms': results['latency_p95_ms'],
'total_test': results['total'],
'total_train': len(train),
'datasets': ds_counts,
'split': args.split,
}, f, indent=2)
print(f"Results saved to {out_path}")
brain.close()
def main():
parser = argparse.ArgumentParser(description="Benchmark the reasoning engine")
parser.add_argument("--dataset", default="all",
help="Dataset name or 'all'")
parser.add_argument("--split", type=float, default=0.8,
help="Train ratio (default 0.8 = 80/20)")
parser.add_argument("--test-limit", type=int, default=500,
help="Max test questions for speed (0=unlimited)")
parser.add_argument("--test-only", action="store_true",
help="Skip training, evaluate existing brain")
parser.add_argument("--db-path", default=None,
help="Brain DB path (default: /tmp/benchmark_brain)")
parser.add_argument("--epochs", type=int, default=5,
help="RLHF epochs (0=skip, default=5)")
parser.add_argument("--seed", type=int, default=42)
args = parser.parse_args()
run_benchmark(args)
if __name__ == '__main__':
main()