AutoResearch Agent
experiment 90: revert to cosine scheduler (mean 0.8535 > constant_with_warmup mean 0.8488 over multiple runs)
c8f8849

"""
train.py - Cross-dataset biomedical NER curriculum training.

This is the file the autoresearch agent modifies.

The agent can change:
- CURRICULUM: which datasets, in what order, for how many steps
- MIXING_RATIOS: when training on multiple datasets simultaneously
- Model hyperparameters: learning rate, weight decay, warmup, scheduler
- Fine-tuning strategy: which layers to freeze, LoRA, etc.
- Architecture tweaks: classifier head design, pooling, dropout

The agent must NOT change:
- The evaluation function or metric (F1 via seqeval)
- The target eval dataset
- The base model name (but can change how layers are used)
- The total training time budget (enforced by wallclock)

Output format (printed to stdout, grep-able):
    val_f1: <float>
    peak_vram_mb: <int>
"""
import os
import sys
import json
import time

import torch
import numpy as np
from pathlib import Path
from datasets import load_from_disk, concatenate_datasets, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    get_scheduler,
)
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler

# ── Constants (do not modify) ────────────────────────────────────────────────
CACHE_DIR = Path.home() / ".cache" / "openmed-autoresearch"
TOTAL_TIME_BUDGET = 300  # 5 minutes in seconds
TARGET_EVAL_DATASET = "ncbi_disease"  # The dataset we measure F1 on
SEED = 42

# ── Load metadata ────────────────────────────────────────────────────────────
with open(CACHE_DIR / "meta.json") as f:
    META = json.load(f)

UNIFIED_LABELS = META["unified_labels"]
LABEL2ID = {l: i for i, l in enumerate(UNIFIED_LABELS)}
ID2LABEL = {i: l for i, l in enumerate(UNIFIED_LABELS)}
NUM_LABELS = len(UNIFIED_LABELS)
BASE_MODEL = META["model"]
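# Illustration only (the real label list comes from meta.json): if
# UNIFIED_LABELS were ["O", "B-Disease", "I-Disease"], the maps above would be
# LABEL2ID = {"O": 0, "B-Disease": 1, "I-Disease": 2} and ID2LABEL its inverse.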
# ─────────────────────────────────────────────────────────────────────────────
# CURRICULUM CONFIGURATION - the agent experiments with this section
# ─────────────────────────────────────────────────────────────────────────────
# Each stage is: (dataset_names, proportion_of_time_budget, mixing_ratios_or_none)
# mixing_ratios is a dict {dataset_name: float} that sums to 1.0
# If only one dataset, mixing_ratios can be None
CURRICULUM = [
    # Stage 1: pretrain on bc5cdr_chem (25% of time)
    (["bc5cdr_chem"], 0.25, None),
    # Stage 2: pretrain on jnlpba (15% of time)
    (["jnlpba"], 0.15, None),
    # Stage 3: fine-tune on target (60% of time)
    ([TARGET_EVAL_DATASET], 0.60, None),
]
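# A hypothetical mixed-stage curriculum, for illustration only (not active in
# this experiment): spend 40% of the budget on a 70/30 blend of the target and
# jnlpba, then finish on the target alone.
#
# CURRICULUM = [
#     ([TARGET_EVAL_DATASET, "jnlpba"], 0.40,
#      {TARGET_EVAL_DATASET: 0.7, "jnlpba": 0.3}),
#     ([TARGET_EVAL_DATASET], 0.60, None),
# ]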
# ── Training hyperparameters ─────────────────────────────────────────────────
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1
BATCH_SIZE = 64
GRADIENT_ACCUMULATION_STEPS = 1
MAX_GRAD_NORM = 1.0
LR_SCHEDULER_TYPE = "cosine"
DROPOUT_OVERRIDE = None  # Set to a float to override model's default dropout
FP16 = True

# ── Layer freezing ───────────────────────────────────────────────────────────
# Freeze the first N transformer layers during training (0 = freeze nothing)
FREEZE_LAYERS = 0
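# Illustration only: on a 12-layer encoder, FREEZE_LAYERS = 6 would freeze the
# embeddings plus encoder layers 0-5, leaving the top half of the encoder and
# the classifier head trainable (see freeze_layers() below).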
# ─────────────────────────────────────────────────────────────────────────────
# END OF AGENT-MODIFIABLE CONFIGURATION
# ─────────────────────────────────────────────────────────────────────────────


def set_seed(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def load_ner_dataset(name: str, split: str) -> Dataset:
    """Load a preprocessed dataset split from cache."""
    ds = load_from_disk(str(CACHE_DIR / name))
    return ds[split]


def build_mixed_dataset(dataset_names, mixing_ratios=None, split="train"):
    """Build a mixed training dataset from multiple sources."""
    datasets = []
    for name in dataset_names:
        ds = load_ner_dataset(name, split)
        datasets.append((name, ds))
    if len(datasets) == 1:
        return datasets[0][1]
    if mixing_ratios is None:
        # Equal mixing
        mixing_ratios = {name: 1.0 / len(datasets) for name, _ in datasets}
    # Sample proportionally
    mixed_parts = []
    total_target = sum(len(ds) for _, ds in datasets)
    for name, ds in datasets:
        ratio = mixing_ratios.get(name, 0)
        n_samples = max(1, int(ratio * total_target))
        if n_samples >= len(ds):
            mixed_parts.append(ds)
        else:
            indices = np.random.choice(len(ds), size=n_samples, replace=False)
            mixed_parts.append(ds.select(indices.tolist()))
    return concatenate_datasets(mixed_parts).shuffle(seed=SEED)
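# Worked example (hypothetical sizes): with datasets A (8,000 rows) and B
# (2,000 rows) and mixing_ratios {"A": 0.7, "B": 0.3}, total_target is 10,000,
# so A contributes 7,000 sampled rows while B's requested 3,000 is capped at
# all 2,000 of its rows; the blend is then shuffled with the global SEED.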
def compute_f1(model, dataloader, device):
    """Compute entity-level F1 using seqeval."""
    from seqeval.metrics import f1_score as seq_f1

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=-1)
            # Keep only positions with real labels (-100 marks padding/subword tokens)
            for i in range(labels.shape[0]):
                pred_seq = []
                label_seq = []
                for j in range(labels.shape[1]):
                    if labels[i][j].item() != -100:
                        pred_seq.append(ID2LABEL[preds[i][j].item()])
                        label_seq.append(ID2LABEL[labels[i][j].item()])
                if label_seq:
                    all_preds.append(pred_seq)
                    all_labels.append(label_seq)
    if not all_labels:
        return 0.0
    return seq_f1(all_labels, all_preds, average="micro")
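# Illustration of seqeval's entity-level scoring: for a gold sequence
# ["B-Disease", "I-Disease", "O"], the prediction ["B-Disease", "O", "O"]
# scores 0.0 (the predicted span must match the gold span exactly), while an
# exact match scores 1.0. Partial overlaps earn no credit.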
def freeze_layers(model, n_layers):
    """Freeze the embeddings and first n_layers of the transformer."""
    if n_layers <= 0:
        return
    # Freeze embeddings
    for param in model.base_model.embeddings.parameters():
        param.requires_grad = False
    # Freeze encoder layers (the attribute name varies across architectures)
    encoder_layers = None
    if hasattr(model.base_model, "encoder"):
        if hasattr(model.base_model.encoder, "layer"):
            encoder_layers = model.base_model.encoder.layer
        elif hasattr(model.base_model.encoder, "layers"):
            encoder_layers = model.base_model.encoder.layers
    elif hasattr(model.base_model, "layers"):
        encoder_layers = model.base_model.layers
    if encoder_layers is not None:
        for i, layer in enumerate(encoder_layers):
            if i < n_layers:
                for param in layer.parameters():
                    param.requires_grad = False
def run_training_stage(model, tokenizer, dataset_names, mixing_ratios,
                       time_budget_seconds, device, scaler):
    """Run one curriculum stage within a time budget."""
    train_ds = build_mixed_dataset(dataset_names, mixing_ratios, split="train")
    collator = DataCollatorForTokenClassification(tokenizer, padding=True)
    train_loader = DataLoader(
        train_ds, batch_size=BATCH_SIZE, shuffle=True,
        collate_fn=collator, num_workers=0, pin_memory=True,
    )
    optimizer = torch.optim.AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
    )
    # Estimate total steps from the time budget (rough: assume ~0.3s per step)
    est_steps = max(10, int(time_budget_seconds / 0.3))
    scheduler = get_scheduler(
        LR_SCHEDULER_TYPE,
        optimizer=optimizer,
        num_warmup_steps=int(est_steps * WARMUP_RATIO),
        num_training_steps=est_steps,
    )
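    # Example with the defaults above: stage 1 gets 0.25 * 300 = 75s, so
    # est_steps = int(75 / 0.3) = 250 and the cosine schedule warms up for
    # int(250 * 0.1) = 25 optimizer steps before decaying.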
    model.train()
    start_time = time.time()
    step = 0
    accum_loss = 0.0
    data_iter = iter(train_loader)
    while (time.time() - start_time) < time_budget_seconds:
        # Get next batch, looping over the dataset if exhausted
        try:
            batch = next(data_iter)
        except StopIteration:
            data_iter = iter(train_loader)
            batch = next(data_iter)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        if FP16 and scaler is not None:
            with autocast(dtype=torch.float16):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss / GRADIENT_ACCUMULATION_STEPS
            scaler.scale(loss).backward()
        else:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss / GRADIENT_ACCUMULATION_STEPS
            loss.backward()
        accum_loss += loss.item()
        step += 1
        if step % GRADIENT_ACCUMULATION_STEPS == 0:
            if FP16 and scaler is not None:
                # Unscale before clipping so the norm is computed on true gradients
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
                scaler.step(optimizer)
                scaler.update()
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
                optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
    elapsed = time.time() - start_time
    avg_loss = accum_loss / max(step, 1)
    print(f"  Stage [{','.join(dataset_names)}]: {step} steps in {elapsed:.1f}s, avg_loss={avg_loss:.4f}",
          file=sys.stderr)
    return model
def main():
    set_seed(SEED)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}", file=sys.stderr)

    # ── Load model ────────────────────────────────────────────────────────────
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    model = AutoModelForTokenClassification.from_pretrained(
        BASE_MODEL,
        num_labels=NUM_LABELS,
        id2label=ID2LABEL,
        label2id=LABEL2ID,
        ignore_mismatched_sizes=True,
    )
    if DROPOUT_OVERRIDE is not None:
        if hasattr(model.config, "hidden_dropout_prob"):
            model.config.hidden_dropout_prob = DROPOUT_OVERRIDE
        if hasattr(model.config, "attention_probs_dropout_prob"):
            model.config.attention_probs_dropout_prob = DROPOUT_OVERRIDE
        if hasattr(model, "dropout"):
            model.dropout = torch.nn.Dropout(DROPOUT_OVERRIDE)
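        # Note: editing model.config after from_pretrained() does not rebuild
        # dropout modules that were already constructed; replacing model.dropout
        # above is what actually changes the classifier-head dropout at runtime.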
    freeze_layers(model, FREEZE_LAYERS)
    model.to(device)
    scaler = GradScaler() if FP16 else None

    # ── Run curriculum ────────────────────────────────────────────────────────
    print(f"Curriculum: {len(CURRICULUM)} stages", file=sys.stderr)
    total_start = time.time()
    for i, (ds_names, time_frac, mix_ratios) in enumerate(CURRICULUM):
        stage_budget = TOTAL_TIME_BUDGET * time_frac
        # Adjust for elapsed time to stay within the total budget
        elapsed = time.time() - total_start
        remaining = TOTAL_TIME_BUDGET - elapsed
        stage_budget = min(stage_budget, remaining - 10)  # leave 10s for eval
        if stage_budget <= 0:
            print(f"  Skipping stage {i+1}, no time remaining.", file=sys.stderr)
            break
        print(f"  Stage {i+1}/{len(CURRICULUM)}: {ds_names}, "
              f"budget={stage_budget:.0f}s", file=sys.stderr)
        model = run_training_stage(
            model, tokenizer, ds_names, mix_ratios,
            stage_budget, device, scaler,
        )

    # ── Evaluate on target ────────────────────────────────────────────────────
    print("Evaluating...", file=sys.stderr)
    eval_ds = load_ner_dataset(TARGET_EVAL_DATASET, "test")
    collator = DataCollatorForTokenClassification(tokenizer, padding=True)
    eval_loader = DataLoader(
        eval_ds, batch_size=32, shuffle=False,
        collate_fn=collator, num_workers=0, pin_memory=True,
    )
    f1 = compute_f1(model, eval_loader, device)
    peak_vram = torch.cuda.max_memory_allocated(device) // (1024 * 1024) if torch.cuda.is_available() else 0

    # ── Print results (grep-able) ─────────────────────────────────────────────
    print(f"val_f1: {f1:.6f}")
    print(f"peak_vram_mb: {peak_vram}")


if __name__ == "__main__":
    main()