#!/usr/bin/env python
"""Aether-domain gain: assistant-token CE on held-out curated-v3, base vs V7.

Same 4-bit base weights; toggle the LoRA via disable_adapter() so the only
difference is the adapter. CE is computed over ASSISTANT tokens only (the
Aether-domain answer), masking system+user. Lower CE = better domain fit.

~19% of curated-v3 was seen sub-epoch during the 1000-step run, so any large
gap here is genuine domain adaptation, not memorization.
"""
import json
import math
import random
import sys

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

BASE = "Qwen/Qwen2.5-7B-Instruct"                                 # base model id
ADAPTER = "/home/blockartica/training-data/aether-v7-qlora"       # V7 LoRA adapter dir
DATA = "/home/blockartica/training-data/aether-curated-v3.jsonl"  # one JSON object per line
N = 300      # number of examples to sample
SEQ = 1024   # maximum tokenized length of a usable example

# Fixed seed so the shuffled sample is reproducible across runs.
random.seed(1234)

# ── sample held-out curated-v3 examples (Aether-domain chat) ──────────
# NOTE(review): the sample is drawn from the whole DATA file; nothing in this
# script excludes examples that were seen during training — confirm that the
# "held-out" label is appropriate.
# Explicit UTF-8 so decoding does not depend on the machine's locale; blank
# lines (e.g. a trailing newline at EOF) are skipped instead of crashing
# json.loads.
with open(DATA, encoding="utf-8") as f:
    rows = [json.loads(line) for line in f if line.strip()]
random.shuffle(rows)
sample = rows[:N]
print(f"sampled {len(sample)} curated-v3 examples", flush=True)

tok = AutoTokenizer.from_pretrained(BASE)
# Fall back to EOS as the padding token when the tokenizer defines none.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

# Build (input_ids, labels) where labels mask everything but the final
# assistant turn — measures CE on the Aether-domain answer only.
def build(ex):
    """Tokenize one chat example into ``(input_ids, labels)`` tensors.

    Labels are -100 over the prompt (everything before the final assistant
    message, plus the generation prompt) and equal to the token ids over the
    final assistant message, so only that answer contributes to the loss.

    Args:
        ex: a dict with a ``"messages"`` list of ``{"role": ..., ...}`` dicts.

    Returns:
        A pair of tensors, each built from a single-row nested list, or
        ``None`` when the example is unusable: no assistant message with a
        preceding prompt, longer than ``SEQ`` tokens, or no answer tokens
        beyond the prompt.
    """
    msgs = ex["messages"]
    # Walk back from the end to the last assistant message.
    last = len(msgs) - 1
    while last > 0 and msgs[last]["role"] != "assistant":
        last -= 1
    # last <= 0 covers: empty conversation (last == -1), no assistant message
    # found, or an assistant message at index 0 with no prompt before it.
    if last <= 0:
        return None
    prompt_msgs = msgs[:last]
    # Tokenize only up to and including the final assistant message, so that
    # any messages following it are not scored as if they were the answer.
    full_ids = tok.apply_chat_template(
        msgs[:last + 1], tokenize=True, add_generation_prompt=False)
    prompt_ids = tok.apply_chat_template(
        prompt_msgs, tokenize=True, add_generation_prompt=True)
    if len(full_ids) > SEQ or len(full_ids) <= len(prompt_ids):
        return None
    # assumes prompt_ids is a token-level prefix of full_ids — TODO confirm
    # for the chat template in use.
    labels = [-100] * len(prompt_ids) + full_ids[len(prompt_ids):]
    return torch.tensor([full_ids]), torch.tensor([labels])


built = [b for b in (build(e) for e in sample) if b is not None]
print(f"usable (fit in {SEQ} tok, has assistant turn): {len(built)}", flush=True)
# Stop before the expensive model load; otherwise mean_ce() would fail later
# with a bare ZeroDivisionError.
if not built:
    raise SystemExit("no usable examples: nothing to score")

bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)
print("loading base 4-bit...", flush=True)
model = AutoModelForCausalLM.from_pretrained(
    BASE,
    quantization_config=bnb,
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",
)
print("attaching V7 adapter...", flush=True)
model = PeftModel.from_pretrained(model, ADAPTER)
model.eval()


@torch.no_grad()
def mean_ce():
    """Return ``(mean CE per scored token, scored token count)`` over ``built``.

    Runs the module-level ``model`` on every example in ``built``, one
    example per forward pass, and averages the loss weighted by each
    example's number of unmasked label tokens.
    """
    tot_loss, tot_tok = 0.0, 0
    for ids, labels in built:
        ids = ids.to("cuda:0")
        labels = labels.to("cuda:0")
        out = model(input_ids=ids, labels=labels)
        # out.loss is mean over non -100 tokens; reweight by token count
        ntok = (labels != -100).sum().item()
        if ntok == 0:
            continue
        tot_loss += out.loss.item() * ntok
        tot_tok += ntok
    return tot_loss / tot_tok, tot_tok


print("eval WITH adapter (V7)...", flush=True)
v7_ce, ntok = mean_ce()
print("eval WITHOUT adapter (base)...", flush=True)
# Same weights with the adapter switched off, so the adapter is the only
# difference between the two measurements.
with model.disable_adapter():
    base_ce, _ = mean_ce()

print("\n=== AETHER-DOMAIN HELD-OUT CE (assistant tokens only) ===")
print(f"examples: {len(built)} assistant tokens scored: {ntok}")
print(f"{'model':10}{'CE (nats)':>12}{'perplexity':>14}")
# One row per model: cross-entropy in nats and the matching perplexity exp(CE).
print(f"{'base':10}{base_ce:>12.4f}{math.exp(base_ce):>14.2f}")
print(f"{'V7':10}{v7_ce:>12.4f}{math.exp(v7_ce):>14.2f}")
# Both figures in the Δ row use the same convention, V7 relative to base, so
# an improvement is negative in both. The relative perplexity change is
# exp(v7)/exp(base) - 1 == exp(Δ) - 1, computed from the difference so the
# two perplexities are never exponentiated separately and divided.
delta_ce = v7_ce - base_ce
ppl_change_pct = 100 * (math.exp(delta_ce) - 1)
print(f"{'Δ':10}{delta_ce:>+12.4f} "
      f"({ppl_change_pct:+.1f}% perplexity)")
print("\nNote: ~19% of curated-v3 seen sub-epoch during training; a large")
print("CE drop here is domain adaptation, not memorization.")