# grapheneaffiliates — python/rag/train_qa.py (uploaded via huggingface_hub, rev 6dce783, verified)
"""
Train H4 attention model for extractive question-answering.
Uses the autoresearch pattern: modify -> run (2 min) -> measure -> keep/discard.
Metric: F1 score on validation QA pairs (not bpb).
The model learns to generate answer text given [context | question |] as input.
This is extractive QA — the answer is a span from the context.
Architecture: same H4LanguageModel from Phase 5/6, trained on QA-formatted text.
"""
import os
import math
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import sys
import re
from collections import Counter
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from h4_language_model import H4LanguageModel
from rag.prepare_qa import generate_sample_qa, prepare_training_data, format_qa_for_training, download_squad_dev
from rag.tokenizer import BPETokenizer
# ---------------------------------------------------------------------------
# Hyperparameters (AGENT MAY MODIFY THESE)
# ---------------------------------------------------------------------------
TIME_BUDGET = 600 # 10 minutes for QA fine-tuning
# Model (must match pre-trained checkpoint)
D_MODEL = 64  # residual-stream / embedding width
N_HEADS = 8  # attention heads per layer
N_LAYERS = 2  # number of transformer blocks
D_VALUE = 16  # per-head value dimension
D_FFN = 256  # feed-forward hidden width
DROPOUT = 0.0  # disabled for short fine-tuning runs
USE_BITLINEAR = True  # ternary (BitLinear) weights
# Pre-trained checkpoint (set to None to train from scratch)
PRETRAINED_CKPT = os.path.join(os.path.dirname(__file__), '..', '..', 'checkpoints', 'lm_pretrained.pt')
# Optimizer (lower LR for fine-tuning from pre-trained)
LR = 3e-3
WEIGHT_DECAY = 0.01
WARMUP_STEPS = 30  # linear warmup steps before cosine decay
GRAD_CLIP = 1.0  # max gradient norm (values <= 0 disable clipping)
# Training
BATCH_SIZE = 2  # NOTE(review): defined but unused — the loop trains one sequence per step
MAX_SEQ_LEN = 512  # BPE tokens; sequences longer than this are dropped
EVAL_INTERVAL = 500  # steps between validation F1/EM evaluations
# ---------------------------------------------------------------------------
# QA-specific metrics
# ---------------------------------------------------------------------------
def normalize_answer(s: str) -> str:
    """Normalize an answer for comparison: lowercase, strip, drop punctuation, collapse spaces."""
    lowered = s.lower().strip()
    no_punct = re.sub(r'[^\w\s]', '', lowered)
    return re.sub(r'\s+', ' ', no_punct)
def compute_f1(prediction: str, gold: str) -> float:
    """Token-level F1 between a predicted answer and the gold answer (SQuAD-style)."""
    pred = normalize_answer(prediction).split()
    ref = normalize_answer(gold).split()
    # Degenerate cases: an empty gold matches only an empty prediction.
    if not ref:
        return 1.0 if not pred else 0.0
    if not pred:
        return 0.0
    # Multiset intersection counts each shared token with multiplicity.
    overlap = sum((Counter(pred) & Counter(ref)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred)
    recall = overlap / len(ref)
    return 2 * precision * recall / (precision + recall)
def compute_exact_match(prediction: str, gold: str) -> float:
    """Return 1.0 when prediction equals gold after normalization, else 0.0."""
    return float(normalize_answer(prediction) == normalize_answer(gold))
# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
def _split_train_val(all_seqs):
    """Shuffle encoded sequences and split into train tensors / validation pairs.

    Args:
        all_seqs: list of (full_ids, input_len, qa_dict) where full_ids is the
            BPE-encoded [context SEP question SEP answer] and input_len marks
            where the answer begins.

    Returns:
        (train_seqs, val_pairs): train_seqs is a shuffled list of LongTensors
        over the full sequence; val_pairs holds (prompt_ids, gold_answer, qa)
        with the answer tokens stripped from the prompt.
    """
    rng = np.random.RandomState(42)  # fixed seed for a reproducible split
    indices = list(range(len(all_seqs)))
    rng.shuffle(indices)
    # Hold out 5% for validation, clamped to the 20..30 range.
    n_val = min(30, max(20, int(len(all_seqs) * 0.05)))
    val_indices = set(indices[:n_val])
    train_seqs = []
    val_pairs = []
    for i, (full_ids, input_len, qa) in enumerate(all_seqs):
        if i in val_indices:
            # Validation keeps only the prompt; the model must generate the answer.
            val_pairs.append((full_ids[:input_len], qa['answer'], qa))
        else:
            train_seqs.append(torch.tensor(full_ids, dtype=torch.long))
    rng.shuffle(train_seqs)
    return train_seqs, val_pairs
def _run_eval(model, tokenizer, val_pairs, *, temperature, top_k_sample,
              extra_tokens, max_gen_len, verbose=False):
    """Generate an answer for every validation pair and score F1/EM.

    Args:
        model: H4LanguageModel in eval mode.
        tokenizer: BPETokenizer used for encoding/decoding.
        val_pairs: list of (prompt_ids, gold_answer, qa_dict).
        temperature, top_k_sample: sampling parameters for model.generate.
        extra_tokens: slack added to the gold answer's BPE length when
            budgeting how many tokens to generate.
        max_gen_len: hard cap on generated tokens per answer.
        verbose: when True, print each question / gold / prediction triple.

    Returns:
        (avg_f1, avg_em), or (0.0, 0.0) when val_pairs is empty — avoids
        the ZeroDivisionError the inline version had.
    """
    f1_scores = []
    em_scores = []
    with torch.no_grad():
        for inp_ids, gold_answer, qa in val_pairs:
            inp_tensor = torch.tensor([inp_ids], dtype=torch.long)
            # Budget generation from the answer's BPE token length — not its
            # character length — so the cap is consistent across eval passes.
            max_answer_len = min(len(tokenizer.encode(gold_answer)) + extra_tokens, max_gen_len)
            generated = model.generate(
                inp_tensor,
                max_new_tokens=max_answer_len,
                temperature=temperature,
                top_k_sample=top_k_sample,
            )
            gen_ids = generated[0, len(inp_ids):]
            pred_answer = tokenizer.decode(gen_ids.tolist())
            f1 = compute_f1(pred_answer, gold_answer)
            em = compute_exact_match(pred_answer, gold_answer)
            f1_scores.append(f1)
            em_scores.append(em)
            if verbose:
                print(f" Q: {qa['question']}")
                print(f" Gold: {gold_answer}")
                print(f" Pred: {pred_answer[:80]}")
                print(f" F1={f1:.3f} EM={em:.1f}")
                print()
    if not f1_scores:
        return 0.0, 0.0
    return sum(f1_scores) / len(f1_scores), sum(em_scores) / len(em_scores)
def main():
    """Fine-tune the H4 language model on QA-formatted text and report F1/EM."""
    t_start = time.time()
    torch.manual_seed(42)
    np.random.seed(42)
    # Load QA data — use SQuAD if available, fall back to sample
    squad_qa = download_squad_dev()
    if len(squad_qa) > 100:
        print(f"Using SQuAD 2.0: {len(squad_qa)} QA pairs")
        all_qa = squad_qa
    else:
        print("SQuAD not available, using sample QA pairs")
        all_qa = generate_sample_qa()
    # Build BPE tokenizer from training data (first 2000 pairs keep it fast).
    tokenizer = BPETokenizer(max_vocab=4096)
    all_texts = [qa['context'] + ' ' + qa['question'] + ' ' + qa['answer'] for qa in all_qa[:2000]]
    tokenizer.build_vocab(all_texts)
    vocab_size = tokenizer.vocab_size
    # Prepare training sequences using BPE.
    # Format: [context SEP question SEP answer]
    # With BPE, seq_len=512 covers ~2500 chars — virtually all SQuAD contexts
    all_seqs = []
    for qa in all_qa:
        input_ids, answer_ids = tokenizer.encode_qa(qa['context'], qa['question'], qa['answer'])
        full_ids = input_ids + answer_ids
        if 5 < len(full_ids) <= MAX_SEQ_LEN:
            all_seqs.append((full_ids, len(input_ids), qa))
    print(f"BPE sequences that fit seq_len={MAX_SEQ_LEN}: {len(all_seqs)} "
          f"(from {len(all_qa)} total QA pairs)")
    # Split into train/val
    train_seqs, val_pairs = _split_train_val(all_seqs)
    if not train_seqs:
        # Fail fast with a clear message instead of a ZeroDivisionError in the loop.
        raise RuntimeError(f"No training sequences fit MAX_SEQ_LEN={MAX_SEQ_LEN}")
    avg_len = sum(len(s) for s in train_seqs) / len(train_seqs)
    print(f"Train: {len(train_seqs)} seqs (avg {avg_len:.0f} BPE tokens), Val: {len(val_pairs)} pairs")
    # Create model
    model = H4LanguageModel(
        vocab_size=vocab_size,
        d_model=D_MODEL,
        n_heads=N_HEADS,
        n_layers=N_LAYERS,
        d_value=D_VALUE,
        d_ffn=D_FFN,
        max_seq_len=MAX_SEQ_LEN,
        dropout=DROPOUT,
        use_bitlinear=USE_BITLINEAR,
    )
    params = model.count_params()
    print(f"Model: {params['trainable']:,} trainable params")
    # Note: pre-trained checkpoint skipped because vocab size changed with BPE.
    # The model trains from scratch but BPE makes this much more efficient.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY, betas=(0.9, 0.95))
    def lr_schedule(step):
        # Linear warmup, then cosine decay from 1.0 down to a 0.1 floor over
        # (5000 - WARMUP_STEPS) steps; held at the floor afterwards.
        if step < WARMUP_STEPS:
            return step / max(WARMUP_STEPS, 1)
        progress = (step - WARMUP_STEPS) / max(1, 5000 - WARMUP_STEPS)
        return 0.1 + 0.9 * 0.5 * (1 + math.cos(math.pi * min(progress, 1.0)))
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_schedule)
    # Training loop: run until TIME_BUDGET seconds of measured training time.
    step = 0
    total_training_time = 0.0
    best_f1 = 0.0
    model.train()
    print(f"\nTraining for {TIME_BUDGET}s, metric=F1")
    print(f"{'step':>6} {'loss':>8} {'val_f1':>8} {'val_em':>8} {'lr':>10}")
    print("-" * 48)
    while True:
        t0 = time.time()
        # Cycle deterministically through the pre-shuffled training sequences.
        seq = train_seqs[step % len(train_seqs)]
        # Next-token objective: input is seq[:-1], target is seq[1:].
        x = seq[:-1].unsqueeze(0)  # (1, T-1)
        y = seq[1:].unsqueeze(0)   # (1, T-1)
        # Defensive truncation (sequences were already filtered to MAX_SEQ_LEN).
        if x.shape[1] > MAX_SEQ_LEN:
            x = x[:, :MAX_SEQ_LEN]
            y = y[:, :MAX_SEQ_LEN]
        logits = model(x, use_tree=False)
        loss = F.cross_entropy(logits.view(-1, vocab_size), y.view(-1))
        optimizer.zero_grad()
        loss.backward()
        if GRAD_CLIP > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()
        scheduler.step()
        dt = time.time() - t0
        # Skip the first few steps when accounting time (warm-up effects).
        if step > 2:
            total_training_time += dt
        # Periodic validation with moderate sampling randomness.
        if step % EVAL_INTERVAL == 0:
            model.eval()
            avg_f1, avg_em = _run_eval(
                model, tokenizer, val_pairs,
                temperature=0.5, top_k_sample=10,
                extra_tokens=10, max_gen_len=50,
            )
            best_f1 = max(best_f1, avg_f1)
            current_lr = scheduler.get_last_lr()[0]
            print(f"{step:6d} {loss.item():8.4f} {avg_f1:8.3f} {avg_em:8.3f} {current_lr:10.6f}")
            model.train()
        step += 1
        if step > 2 and total_training_time >= TIME_BUDGET:
            break
    # Final evaluation with sample outputs (greedier sampling, longer budget).
    model.eval()
    print("\n" + "=" * 60)
    print("SAMPLE QA RESULTS:")
    avg_f1, avg_em = _run_eval(
        model, tokenizer, val_pairs,
        temperature=0.3, top_k_sample=5,
        extra_tokens=20, max_gen_len=100,
        verbose=True,
    )
    print("=" * 60)
    print("\n---")
    print(f"val_f1: {avg_f1:.4f}")
    print(f"val_em: {avg_em:.4f}")
    print(f"best_f1: {best_f1:.4f}")
    print(f"training_seconds: {total_training_time:.1f}")
    print(f"total_seconds: {time.time() - t_start:.1f}")
    print(f"num_steps: {step}")
    print(f"num_params: {params['trainable']}")
    print(f"ternary: {'yes' if USE_BITLINEAR else 'no'}")
# Script entry point: run QA fine-tuning when executed directly.
if __name__ == '__main__':
    main()