# AutoResearch Agent
# experiment 90: revert to cosine scheduler (mean 0.8535 > constant_with_warmup mean 0.8488 over multiple runs)
# c8f8849
"""
train.py — Cross-dataset biomedical NER curriculum training.
This is the file the autoresearch agent modifies.
The agent can change:
- CURRICULUM: which datasets, in what order, for how many steps
- MIXING_RATIOS: when training on multiple datasets simultaneously
- Model hyperparameters: learning rate, weight decay, warmup, scheduler
- Fine-tuning strategy: which layers to freeze, LoRA, etc.
- Architecture tweaks: classifier head design, pooling, dropout
The agent must NOT change:
- The evaluation function or metric (F1 via seqeval)
- The target eval dataset
- The base model name (but can change how layers are used)
- The total training time budget (enforced by wallclock)
Output format (printed to stdout, grep-able):
val_f1: <float>
peak_vram_mb: <int>
"""
import os
import sys
import json
import time
import torch
import numpy as np
from pathlib import Path
from datasets import load_from_disk, concatenate_datasets, Dataset
from transformers import (
AutoTokenizer,
AutoModelForTokenClassification,
DataCollatorForTokenClassification,
get_scheduler,
)
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
# ── Constants (do not modify) ───────────────────────────────────────────────
CACHE_DIR = Path.home() / ".cache" / "openmed-autoresearch"  # preprocessed datasets + meta.json live here
TOTAL_TIME_BUDGET = 300  # 5 minutes in seconds
TARGET_EVAL_DATASET = "ncbi_disease"  # The dataset we measure F1 on
SEED = 42
# ── Load metadata ───────────────────────────────────────────────────────────
# meta.json is produced by the data-prep step; it carries the unified label
# space shared across all source datasets and the base model name.
with open(CACHE_DIR / "meta.json") as f:
    META = json.load(f)
UNIFIED_LABELS = META["unified_labels"]
LABEL2ID = {l: i for i, l in enumerate(UNIFIED_LABELS)}
ID2LABEL = {i: l for i, l in enumerate(UNIFIED_LABELS)}
NUM_LABELS = len(UNIFIED_LABELS)
BASE_MODEL = META["model"]
# ═══════════════════════════════════════════════════════════════════════════
# CURRICULUM CONFIGURATION — the agent experiments with this section
# ═══════════════════════════════════════════════════════════════════════════
# Each stage is: (dataset_names, proportion_of_time_budget, mixing_ratios_or_none)
# mixing_ratios is a dict {dataset_name: float} that sums to 1.0
# If only one dataset, mixing_ratios can be None
CURRICULUM = [
    # Stage 1: pretrain on bc5cdr_chem (25% of time)
    (["bc5cdr_chem"], 0.25, None),
    # Stage 2: pretrain on jnlpba (15% of time)
    (["jnlpba"], 0.15, None),
    # Stage 3: fine-tune on target (60% of time)
    ([TARGET_EVAL_DATASET], 0.60, None),
]
# ── Training hyperparameters ────────────────────────────────────────────────
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1  # fraction of estimated steps used for LR warmup
BATCH_SIZE = 64
GRADIENT_ACCUMULATION_STEPS = 1
MAX_GRAD_NORM = 1.0  # gradient-clipping threshold applied before each optimizer step
LR_SCHEDULER_TYPE = "cosine"  # experiment 90: cosine beat constant_with_warmup
DROPOUT_OVERRIDE = None  # Set to a float to override model's default dropout
FP16 = True  # mixed precision via torch.cuda.amp (autocast + GradScaler)
# ── Layer freezing ──────────────────────────────────────────────────────────
# Freeze the first N transformer layers during training (0 = freeze nothing)
FREEZE_LAYERS = 0
# ═══════════════════════════════════════════════════════════════════════════
# END OF AGENT-MODIFIABLE CONFIGURATION
# ═══════════════════════════════════════════════════════════════════════════
def set_seed(seed):
    """Make the torch and numpy RNGs deterministic (CPU and all CUDA devices)."""
    for seeder in (np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
def load_ner_dataset(name: str, split: str) -> Dataset:
    """Fetch one split of a dataset previously preprocessed into CACHE_DIR."""
    return load_from_disk(str(CACHE_DIR / name))[split]
def build_mixed_dataset(dataset_names, mixing_ratios=None, split="train"):
    """Build a (possibly mixed) training dataset from multiple sources.

    Args:
        dataset_names: list of cached dataset names to draw from.
        mixing_ratios: optional {name: fraction} dict; each dataset is
            subsampled to fraction * combined_pool_size examples.
            None means equal mixing across all sources.
        split: which split to load from each dataset.

    Returns:
        A single Dataset.  With one source it is returned untouched; with
        several, the sampled parts are concatenated and shuffled (seeded).
    """
    datasets = [(name, load_ner_dataset(name, split)) for name in dataset_names]
    if len(datasets) == 1:
        return datasets[0][1]
    if mixing_ratios is None:
        # Equal mixing
        mixing_ratios = {name: 1.0 / len(datasets) for name, _ in datasets}
    # Sample proportionally against the combined pool size.
    total_target = sum(len(ds) for _, ds in datasets)
    mixed_parts = []
    for name, ds in datasets:
        ratio = mixing_ratios.get(name, 0)
        if ratio <= 0:
            # BUGFIX: a zero/absent ratio previously still contributed one
            # example via max(1, ...); such datasets are now excluded.
            continue
        # Positive ratios contribute at least one example.
        n_samples = max(1, int(ratio * total_target))
        if n_samples >= len(ds):
            mixed_parts.append(ds)
        else:
            indices = np.random.choice(len(ds), size=n_samples, replace=False)
            mixed_parts.append(ds.select(indices.tolist()))
    return concatenate_datasets(mixed_parts).shuffle(seed=SEED)
def compute_f1(model, dataloader, device):
    """Compute entity-level micro F1 over `dataloader` using seqeval.

    NOTE: this is the fixed evaluation function — the module header forbids
    modifying it.  Positions labeled -100 are excluded from both the
    prediction and reference sequences.  Returns 0.0 if no labeled
    sequences were collected.
    """
    # Imported lazily so module import does not require seqeval.
    from seqeval.metrics import f1_score as seq_f1
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=-1)
            # Rebuild per-example label-string sequences, skipping ignored
            # (-100) positions so pred/gold stay aligned for seqeval.
            for i in range(labels.shape[0]):
                pred_seq = []
                label_seq = []
                for j in range(labels.shape[1]):
                    if labels[i][j].item() != -100:
                        pred_seq.append(ID2LABEL[preds[i][j].item()])
                        label_seq.append(ID2LABEL[labels[i][j].item()])
                if label_seq:
                    all_preds.append(pred_seq)
                    all_labels.append(label_seq)
    if not all_labels:
        return 0.0
    return seq_f1(all_labels, all_preds, average="micro")
def freeze_layers(model, n_layers):
    """Disable gradients for the embeddings and the first n_layers encoder layers.

    No-op when n_layers <= 0.  Handles the common backbone layouts:
    base_model.encoder.layer, base_model.encoder.layers, or base_model.layers.
    """
    if n_layers <= 0:
        return
    backbone = model.base_model
    # Embeddings are always frozen alongside the early layers.
    for param in backbone.embeddings.parameters():
        param.requires_grad = False
    # Locate the transformer layer stack for this architecture.
    stack = None
    if hasattr(backbone, "encoder"):
        enc = backbone.encoder
        if hasattr(enc, "layer"):
            stack = enc.layer
        elif hasattr(enc, "layers"):
            stack = enc.layers
    elif hasattr(backbone, "layers"):
        stack = backbone.layers
    if stack is None:
        return
    for idx, block in enumerate(stack):
        if idx >= n_layers:
            break
        for param in block.parameters():
            param.requires_grad = False
def run_training_stage(model, tokenizer, dataset_names, mixing_ratios,
                       time_budget_seconds, device, scaler):
    """Run one curriculum stage within a wallclock time budget.

    Args:
        model: token-classification model (mutated in place).
        tokenizer: tokenizer matching the model, used by the collator.
        dataset_names / mixing_ratios: passed to build_mixed_dataset.
        time_budget_seconds: stage stops once this much wallclock elapses.
        device: target torch device.
        scaler: GradScaler for FP16, or None for full precision.

    Returns:
        The (trained) model, for chaining.
    """
    train_ds = build_mixed_dataset(dataset_names, mixing_ratios, split="train")
    collator = DataCollatorForTokenClassification(tokenizer, padding=True)
    train_loader = DataLoader(
        train_ds, batch_size=BATCH_SIZE, shuffle=True,
        collate_fn=collator, num_workers=0, pin_memory=True,
    )
    # Only optimize trainable parameters (respects freeze_layers).
    optimizer = torch.optim.AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
    )
    # Estimate total steps from time budget (rough: assume ~0.3s per step)
    # so the scheduler's warmup/decay horizon roughly matches the stage.
    est_steps = max(10, int(time_budget_seconds / 0.3))
    scheduler = get_scheduler(
        LR_SCHEDULER_TYPE,
        optimizer=optimizer,
        num_warmup_steps=int(est_steps * WARMUP_RATIO),
        num_training_steps=est_steps,
    )
    model.train()
    # BUGFIX: clear gradients possibly left on the parameters by a previous
    # stage that hit its time budget mid-accumulation (when
    # GRADIENT_ACCUMULATION_STEPS > 1); otherwise they would leak into this
    # stage's first optimizer step.
    optimizer.zero_grad(set_to_none=True)
    start_time = time.time()
    step = 0
    accum_loss = 0.0
    data_iter = iter(train_loader)
    while (time.time() - start_time) < time_budget_seconds:
        # Get next batch, restarting the loader when exhausted.
        try:
            batch = next(data_iter)
        except StopIteration:
            data_iter = iter(train_loader)
            batch = next(data_iter)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        if FP16 and scaler is not None:
            with autocast(dtype=torch.float16):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss / GRADIENT_ACCUMULATION_STEPS
            scaler.scale(loss).backward()
        else:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss / GRADIENT_ACCUMULATION_STEPS
            loss.backward()
        accum_loss += loss.item()
        step += 1
        if step % GRADIENT_ACCUMULATION_STEPS == 0:
            if FP16 and scaler is not None:
                # Unscale before clipping so the norm is computed on the
                # true (unscaled) gradients.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
                scaler.step(optimizer)
                scaler.update()
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
                optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
    elapsed = time.time() - start_time
    # NOTE: accum_loss sums per-micro-batch losses already divided by
    # GRADIENT_ACCUMULATION_STEPS, so this is an average of scaled losses.
    avg_loss = accum_loss / max(step, 1)
    print(f" Stage [{','.join(dataset_names)}]: {step} steps in {elapsed:.1f}s, avg_loss={avg_loss:.4f}",
          file=sys.stderr)
    return model
def main():
    """Entry point: run the curriculum within the wallclock budget, then
    evaluate F1 on the target dataset and print grep-able results.
    """
    set_seed(SEED)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}", file=sys.stderr)
    # ── Load model ──────────────────────────────────────────────────────────
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    model = AutoModelForTokenClassification.from_pretrained(
        BASE_MODEL,
        num_labels=NUM_LABELS,
        id2label=ID2LABEL,
        label2id=LABEL2ID,
        # Head is re-initialized for the unified label set, so size
        # mismatches with the checkpoint's head are expected.
        ignore_mismatched_sizes=True,
    )
    if DROPOUT_OVERRIDE is not None:
        # Override dropout in the config and, where present, the already
        # built classifier-head dropout module (config changes alone do not
        # affect modules that were constructed at load time).
        if hasattr(model.config, "hidden_dropout_prob"):
            model.config.hidden_dropout_prob = DROPOUT_OVERRIDE
        if hasattr(model.config, "attention_probs_dropout_prob"):
            model.config.attention_probs_dropout_prob = DROPOUT_OVERRIDE
        if hasattr(model, "dropout"):
            model.dropout = torch.nn.Dropout(DROPOUT_OVERRIDE)
    freeze_layers(model, FREEZE_LAYERS)
    model.to(device)
    # NOTE(review): GradScaler is created whenever FP16 is set, even on a
    # CPU-only host — confirm torch disables it gracefully there.
    scaler = GradScaler() if FP16 else None
    # ── Run curriculum ──────────────────────────────────────────────────────
    print(f"Curriculum: {len(CURRICULUM)} stages", file=sys.stderr)
    total_start = time.time()
    for i, (ds_names, time_frac, mix_ratios) in enumerate(CURRICULUM):
        stage_budget = TOTAL_TIME_BUDGET * time_frac
        # Adjust for elapsed time to stay within total budget
        elapsed = time.time() - total_start
        remaining = TOTAL_TIME_BUDGET - elapsed
        stage_budget = min(stage_budget, remaining - 10)  # leave 10s for eval
        if stage_budget <= 0:
            print(f" Skipping stage {i+1}, no time remaining.", file=sys.stderr)
            break
        print(f" Stage {i+1}/{len(CURRICULUM)}: {ds_names}, "
              f"budget={stage_budget:.0f}s", file=sys.stderr)
        model = run_training_stage(
            model, tokenizer, ds_names, mix_ratios,
            stage_budget, device, scaler,
        )
    # ── Evaluate on target ──────────────────────────────────────────────────
    print("Evaluating...", file=sys.stderr)
    eval_ds = load_ner_dataset(TARGET_EVAL_DATASET, "test")
    collator = DataCollatorForTokenClassification(tokenizer, padding=True)
    eval_loader = DataLoader(
        eval_ds, batch_size=32, shuffle=False,
        collate_fn=collator, num_workers=0, pin_memory=True,
    )
    f1 = compute_f1(model, eval_loader, device)
    # Peak VRAM over the whole run, in MB; 0 on CPU-only hosts.
    peak_vram = torch.cuda.max_memory_allocated(device) // (1024 * 1024) if torch.cuda.is_available() else 0
    # ── Print results (grep-able) ───────────────────────────────────────────
    print(f"val_f1: {f1:.6f}")
    print(f"peak_vram_mb: {peak_vram}")


if __name__ == "__main__":
    main()