AutoResearch Agent
experiment 90: revert to cosine scheduler (mean 0.8535 > constant_with_warmup mean 0.8488 over multiple runs)
c8f8849

"""
train.py - Cross-dataset biomedical NER curriculum training.

This is the file the autoresearch agent modifies.

The agent can change:
- CURRICULUM: which datasets, in what order, for how many steps
- MIXING_RATIOS: when training on multiple datasets simultaneously
- Model hyperparameters: learning rate, weight decay, warmup, scheduler
- Fine-tuning strategy: which layers to freeze, LoRA, etc.
- Architecture tweaks: classifier head design, pooling, dropout

The agent must NOT change:
- The evaluation function or metric (F1 via seqeval)
- The target eval dataset
- The base model name (but can change how layers are used)
- The total training time budget (enforced by wallclock)

Output format (printed to stdout, grep-able):
    val_f1: <float>
    peak_vram_mb: <int>
"""
import os
import sys
import json
import time

import torch
import numpy as np
from pathlib import Path
from datasets import load_from_disk, concatenate_datasets, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    get_scheduler,
)
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler

# ── Constants (do not modify) ────────────────────────────────────────────────
CACHE_DIR = Path.home() / ".cache" / "openmed-autoresearch"
TOTAL_TIME_BUDGET = 300  # 5 minutes in seconds
TARGET_EVAL_DATASET = "ncbi_disease"  # The dataset we measure F1 on
SEED = 42

# ── Load metadata ────────────────────────────────────────────────────────────
with open(CACHE_DIR / "meta.json") as f:
    META = json.load(f)

UNIFIED_LABELS = META["unified_labels"]
LABEL2ID = {l: i for i, l in enumerate(UNIFIED_LABELS)}
ID2LABEL = {i: l for i, l in enumerate(UNIFIED_LABELS)}
NUM_LABELS = len(UNIFIED_LABELS)
BASE_MODEL = META["model"]
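# Illustration only (the real label list comes from meta.json): if
# UNIFIED_LABELS were ["O", "B-Disease", "I-Disease"], the maps above would be
# LABEL2ID = {"O": 0, "B-Disease": 1, "I-Disease": 2} and ID2LABEL its inverse.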
# ─────────────────────────────────────────────────────────────────────────────
# CURRICULUM CONFIGURATION - the agent experiments with this section
# ─────────────────────────────────────────────────────────────────────────────
# Each stage is: (dataset_names, proportion_of_time_budget, mixing_ratios_or_none)
# mixing_ratios is a dict {dataset_name: float} that sums to 1.0
# If only one dataset, mixing_ratios can be None
CURRICULUM = [
    # Stage 1: pretrain on bc5cdr_chem (25% of time)
    (["bc5cdr_chem"], 0.25, None),
    # Stage 2: pretrain on jnlpba (15% of time)
    (["jnlpba"], 0.15, None),
    # Stage 3: fine-tune on target (60% of time)
    ([TARGET_EVAL_DATASET], 0.60, None),
]
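# A hypothetical mixed-stage curriculum, for illustration only (not active in
# this experiment): spend 40% of the budget on a 70/30 blend of the target and
# jnlpba, then finish on the target alone.
#
# CURRICULUM = [
#     ([TARGET_EVAL_DATASET, "jnlpba"], 0.40,
#      {TARGET_EVAL_DATASET: 0.7, "jnlpba": 0.3}),
#     ([TARGET_EVAL_DATASET], 0.60, None),
# ]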
# ── Training hyperparameters ─────────────────────────────────────────────────
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1
BATCH_SIZE = 64
GRADIENT_ACCUMULATION_STEPS = 1
MAX_GRAD_NORM = 1.0
LR_SCHEDULER_TYPE = "cosine"
DROPOUT_OVERRIDE = None  # Set to a float to override model's default dropout
FP16 = True

# ── Layer freezing ───────────────────────────────────────────────────────────
# Freeze the first N transformer layers during training (0 = freeze nothing)
FREEZE_LAYERS = 0
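# Illustration only: on a 12-layer encoder, FREEZE_LAYERS = 6 would freeze the
# embeddings plus encoder layers 0-5, leaving the top half of the encoder and
# the classifier head trainable (see freeze_layers() below).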
# ─────────────────────────────────────────────────────────────────────────────
# END OF AGENT-MODIFIABLE CONFIGURATION
# ─────────────────────────────────────────────────────────────────────────────


def set_seed(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def load_ner_dataset(name: str, split: str) -> Dataset:
    """Load a preprocessed dataset split from cache."""
    ds = load_from_disk(str(CACHE_DIR / name))
    return ds[split]


def build_mixed_dataset(dataset_names, mixing_ratios=None, split="train"):
    """Build a mixed training dataset from multiple sources."""
    datasets = []
    for name in dataset_names:
        ds = load_ner_dataset(name, split)
        datasets.append((name, ds))
    if len(datasets) == 1:
        return datasets[0][1]
    if mixing_ratios is None:
        # Equal mixing
        mixing_ratios = {name: 1.0 / len(datasets) for name, _ in datasets}
    # Sample proportionally
    mixed_parts = []
    total_target = sum(len(ds) for _, ds in datasets)
    for name, ds in datasets:
        ratio = mixing_ratios.get(name, 0)
        n_samples = max(1, int(ratio * total_target))
        if n_samples >= len(ds):
            mixed_parts.append(ds)
        else:
            indices = np.random.choice(len(ds), size=n_samples, replace=False)
            mixed_parts.append(ds.select(indices.tolist()))
    return concatenate_datasets(mixed_parts).shuffle(seed=SEED)
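# Worked example (hypothetical sizes): with datasets A (8,000 rows) and B
# (2,000 rows) and mixing_ratios {"A": 0.7, "B": 0.3}, total_target is 10,000,
# so A contributes 7,000 sampled rows while B's requested 3,000 is capped at
# all 2,000 of its rows; the blend is then shuffled with the global SEED.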
def compute_f1(model, dataloader, device):
    """Compute entity-level F1 using seqeval."""
    from seqeval.metrics import f1_score as seq_f1

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=-1)
            # Keep only positions with real labels (-100 marks padding/subword tokens)
            for i in range(labels.shape[0]):
                pred_seq = []
                label_seq = []
                for j in range(labels.shape[1]):
                    if labels[i][j].item() != -100:
                        pred_seq.append(ID2LABEL[preds[i][j].item()])
                        label_seq.append(ID2LABEL[labels[i][j].item()])
                if label_seq:
                    all_preds.append(pred_seq)
                    all_labels.append(label_seq)
    if not all_labels:
        return 0.0
    return seq_f1(all_labels, all_preds, average="micro")
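# Illustration of seqeval's entity-level scoring: for a gold sequence
# ["B-Disease", "I-Disease", "O"], the prediction ["B-Disease", "O", "O"]
# scores 0.0 (the predicted span must match the gold span exactly), while an
# exact match scores 1.0. Partial overlaps earn no credit.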
def freeze_layers(model, n_layers):
    """Freeze the embeddings and first n_layers of the transformer."""
    if n_layers <= 0:
        return
    # Freeze embeddings
    for param in model.base_model.embeddings.parameters():
        param.requires_grad = False
    # Freeze encoder layers (the attribute name varies across architectures)
    encoder_layers = None
    if hasattr(model.base_model, "encoder"):
        if hasattr(model.base_model.encoder, "layer"):
            encoder_layers = model.base_model.encoder.layer
        elif hasattr(model.base_model.encoder, "layers"):
            encoder_layers = model.base_model.encoder.layers
    elif hasattr(model.base_model, "layers"):
        encoder_layers = model.base_model.layers
    if encoder_layers is not None:
        for i, layer in enumerate(encoder_layers):
            if i < n_layers:
                for param in layer.parameters():
                    param.requires_grad = False
def run_training_stage(model, tokenizer, dataset_names, mixing_ratios,
                       time_budget_seconds, device, scaler):
    """Run one curriculum stage within a time budget."""
    train_ds = build_mixed_dataset(dataset_names, mixing_ratios, split="train")
    collator = DataCollatorForTokenClassification(tokenizer, padding=True)
    train_loader = DataLoader(
        train_ds, batch_size=BATCH_SIZE, shuffle=True,
        collate_fn=collator, num_workers=0, pin_memory=True,
    )
    optimizer = torch.optim.AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
    )
    # Estimate total steps from the time budget (rough: assume ~0.3s per step)
    est_steps = max(10, int(time_budget_seconds / 0.3))
    scheduler = get_scheduler(
        LR_SCHEDULER_TYPE,
        optimizer=optimizer,
        num_warmup_steps=int(est_steps * WARMUP_RATIO),
        num_training_steps=est_steps,
    )
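    # Example with the defaults above: stage 1 gets 0.25 * 300 = 75s, so
    # est_steps = int(75 / 0.3) = 250 and the cosine schedule warms up for
    # int(250 * 0.1) = 25 optimizer steps before decaying.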
    model.train()
    start_time = time.time()
    step = 0
    accum_loss = 0.0
    data_iter = iter(train_loader)
    while (time.time() - start_time) < time_budget_seconds:
        # Get next batch, looping over the dataset if exhausted
        try:
            batch = next(data_iter)
        except StopIteration:
            data_iter = iter(train_loader)
            batch = next(data_iter)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        if FP16 and scaler is not None:
            with autocast(dtype=torch.float16):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss / GRADIENT_ACCUMULATION_STEPS
            scaler.scale(loss).backward()
        else:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss / GRADIENT_ACCUMULATION_STEPS
            loss.backward()
        accum_loss += loss.item()
        step += 1
        if step % GRADIENT_ACCUMULATION_STEPS == 0:
            if FP16 and scaler is not None:
                # Unscale before clipping so the norm is computed on true gradients
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
                scaler.step(optimizer)
                scaler.update()
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
                optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
    elapsed = time.time() - start_time
    avg_loss = accum_loss / max(step, 1)
    print(f"  Stage [{','.join(dataset_names)}]: {step} steps in {elapsed:.1f}s, avg_loss={avg_loss:.4f}",
          file=sys.stderr)
    return model
def main():
    set_seed(SEED)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}", file=sys.stderr)

    # ── Load model ────────────────────────────────────────────────────────────
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    model = AutoModelForTokenClassification.from_pretrained(
        BASE_MODEL,
        num_labels=NUM_LABELS,
        id2label=ID2LABEL,
        label2id=LABEL2ID,
        ignore_mismatched_sizes=True,
    )
    if DROPOUT_OVERRIDE is not None:
        if hasattr(model.config, "hidden_dropout_prob"):
            model.config.hidden_dropout_prob = DROPOUT_OVERRIDE
        if hasattr(model.config, "attention_probs_dropout_prob"):
            model.config.attention_probs_dropout_prob = DROPOUT_OVERRIDE
        if hasattr(model, "dropout"):
            model.dropout = torch.nn.Dropout(DROPOUT_OVERRIDE)
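        # Note: editing model.config after from_pretrained() does not rebuild
        # dropout modules that were already constructed; replacing model.dropout
        # above is what actually changes the classifier-head dropout at runtime.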
    freeze_layers(model, FREEZE_LAYERS)
    model.to(device)
    scaler = GradScaler() if FP16 else None

    # ── Run curriculum ────────────────────────────────────────────────────────
    print(f"Curriculum: {len(CURRICULUM)} stages", file=sys.stderr)
    total_start = time.time()
    for i, (ds_names, time_frac, mix_ratios) in enumerate(CURRICULUM):
        stage_budget = TOTAL_TIME_BUDGET * time_frac
        # Adjust for elapsed time to stay within the total budget
        elapsed = time.time() - total_start
        remaining = TOTAL_TIME_BUDGET - elapsed
        stage_budget = min(stage_budget, remaining - 10)  # leave 10s for eval
        if stage_budget <= 0:
            print(f"  Skipping stage {i+1}, no time remaining.", file=sys.stderr)
            break
        print(f"  Stage {i+1}/{len(CURRICULUM)}: {ds_names}, "
              f"budget={stage_budget:.0f}s", file=sys.stderr)
        model = run_training_stage(
            model, tokenizer, ds_names, mix_ratios,
            stage_budget, device, scaler,
        )

    # ── Evaluate on target ────────────────────────────────────────────────────
    print("Evaluating...", file=sys.stderr)
    eval_ds = load_ner_dataset(TARGET_EVAL_DATASET, "test")
    collator = DataCollatorForTokenClassification(tokenizer, padding=True)
    eval_loader = DataLoader(
        eval_ds, batch_size=32, shuffle=False,
        collate_fn=collator, num_workers=0, pin_memory=True,
    )
    f1 = compute_f1(model, eval_loader, device)
    peak_vram = torch.cuda.max_memory_allocated(device) // (1024 * 1024) if torch.cuda.is_available() else 0

    # ── Print results (grep-able) ─────────────────────────────────────────────
    print(f"val_f1: {f1:.6f}")
    print(f"peak_vram_mb: {peak_vram}")


if __name__ == "__main__":
    main()