aether-mind-v7.0 / evals /domain_ce_eval.py
BlockArtica's picture
v7.0 model card + config + tokenizer + eval artifacts (adapter follows)
1d10ecf verified
#!/usr/bin/env python
"""Aether-domain gain: assistant-token CE on held-out curated-v3, base vs V7.
Same 4-bit base weights; toggle the LoRA via disable_adapter() so the only
difference is the adapter. CE is computed over ASSISTANT tokens only (the
Aether-domain answer), masking system+user. Lower CE = better domain fit.
~19% of curated-v3 was seen sub-epoch during the 1000-step run, so any
large gap here is genuine domain adaptation, not memorization.
"""
import json, random, sys, math
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
# ── configuration ─────────────────────────────────────────────────────
BASE = "Qwen/Qwen2.5-7B-Instruct"  # base checkpoint; also the tokenizer source
ADAPTER = "/home/blockartica/training-data/aether-v7-qlora"  # V7 LoRA adapter
DATA = "/home/blockartica/training-data/aether-curated-v3.jsonl"  # one JSON object per line
N = 300  # number of examples drawn from DATA for this eval
SEQ = 1024  # maximum tokenized length of an example that will be scored
random.seed(1234)  # fixed seed so the shuffled sample is reproducible

# ── sample held-out curated-v3 examples (Aether-domain chat) ──────────
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default encoding, and skip blank lines (e.g. a trailing newline at EOF)
# instead of crashing in json.loads.
with open(DATA, encoding="utf-8") as f:
    rows = [json.loads(line) for line in f if line.strip()]
random.shuffle(rows)
sample = rows[:N]
print(f"sampled {len(sample)} curated-v3 examples", flush=True)

tok = AutoTokenizer.from_pretrained(BASE)
# Fall back to EOS as the padding token when the tokenizer defines none.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token
# Build (input_ids, labels) where labels mask everything but the final
# assistant turn — measures CE on the Aether-domain answer only.
def build(ex):
    """Tokenize one chat example, masking all but the last assistant turn.

    Args:
        ex: Mapping with a ``"messages"`` list; each message is read via its
            ``"role"`` key here and rendered by the tokenizer's chat template.

    Returns:
        ``(input_ids, labels)`` as two tensors of shape ``(1, seq_len)``.
        ``labels`` is ``-100`` over the prompt and equal to ``input_ids`` over
        the final assistant turn. Returns ``None`` when the example cannot be
        scored: no assistant turn after index 0, more than ``SEQ`` tokens, an
        empty answer span, or a prompt that is not a token-level prefix of
        the full conversation.
    """
    msgs = ex["messages"]
    # Index of the last assistant message. Index 0 is excluded on purpose: an
    # assistant turn with nothing before it has no prompt to condition on.
    # An empty message list yields no candidates and is rejected too.
    last = next(
        (i for i in range(len(msgs) - 1, 0, -1)
         if msgs[i]["role"] == "assistant"),
        None,
    )
    if last is None:
        return None

    # prompt = everything up to (not including) the last assistant msg,
    # plus the generation prompt that opens the assistant turn.
    prompt_ids = tok.apply_chat_template(
        msgs[:last], tokenize=True, add_generation_prompt=True)
    # Stop at the assistant turn itself: any messages that follow it are not
    # part of the answer and must not be scored as assistant tokens.
    full_ids = tok.apply_chat_template(
        msgs[:last + 1], tokenize=True, add_generation_prompt=False)

    n_prompt = len(prompt_ids)
    if len(full_ids) > SEQ or len(full_ids) <= n_prompt:
        return None
    # The mask below is positional, so it is only correct when the prompt
    # tokenization is an exact prefix of the full tokenization.
    if full_ids[:n_prompt] != prompt_ids:
        return None

    labels = [-100] * n_prompt + full_ids[n_prompt:]
    return torch.tensor([full_ids]), torch.tensor([labels])
# Tokenize the sample, dropping examples build() rejects (returns None for).
built = [pair for pair in map(build, sample) if pair is not None]
print(f"usable (fit in {SEQ} tok, has assistant turn): {len(built)}", flush=True)
# Fail fast: without usable examples there is nothing to score, so stop
# before loading the model rather than dividing by zero later on.
if not built:
    sys.exit("no usable examples to score; check DATA, N and SEQ")

# 4-bit NF4 quantization with double quantization; compute in bfloat16.
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)
print("loading base 4-bit...", flush=True)
model = AutoModelForCausalLM.from_pretrained(
    BASE,
    quantization_config=bnb,
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",
)
print("attaching V7 adapter...", flush=True)
# Wrap the quantized base with the adapter; the adapter is toggled off later
# via disable_adapter() to score the base model on the same weights.
model = PeftModel.from_pretrained(model, ADAPTER)
model.eval()
@torch.no_grad()
def mean_ce():
    """Return the token-weighted mean cross-entropy of ``model`` over ``built``.

    Each example is run through the model on its own; its mean loss is
    re-weighted by its number of scored target tokens so the result is a
    per-token average over the whole set rather than a per-example average.

    Returns:
        ``(mean_ce, n_tokens)``: mean cross-entropy per scored token, and the
        total number of scored tokens.

    Raises:
        ValueError: if no example contributed any scored token.
    """
    tot_loss, tot_tok = 0.0, 0
    for ids, labels in built:
        # The causal-LM loss in transformers predicts token t+1 from position
        # t, i.e. it averages over labels[:, 1:]; count exactly those targets
        # so the re-weighting matches what out.loss was averaged over.
        ntok = (labels[:, 1:] != -100).sum().item()
        if ntok == 0:
            # Nothing to score; the mean loss would be undefined anyway.
            continue
        ids = ids.to("cuda:0")
        labels = labels.to("cuda:0")
        out = model(input_ids=ids, labels=labels)
        tot_loss += out.loss.item() * ntok
        tot_tok += ntok
    if tot_tok == 0:
        raise ValueError("mean_ce: no target tokens to score (no usable examples)")
    return tot_loss / tot_tok, tot_tok
# Score the same examples twice on the same quantized weights: once with
# the adapter active, once with it disabled, so the adapter is the only
# difference between the two numbers.
print("eval WITH adapter (V7)...", flush=True)
v7_ce, ntok = mean_ce()
print("eval WITHOUT adapter (base)...", flush=True)
with model.disable_adapter():
    base_ce, _ = mean_ce()

# Both deltas are reported as V7 relative to base, so they share a sign:
# negative means V7 is better. exp(delta_ce) is the perplexity ratio
# ppl_V7 / ppl_base.
delta_ce = v7_ce - base_ce
ppl_change_pct = 100 * (math.exp(delta_ce) - 1)

print("\n=== AETHER-DOMAIN HELD-OUT CE (assistant tokens only) ===")
print(f"examples: {len(built)} assistant tokens scored: {ntok}")
print(f"{'model':10}{'CE (nats)':>12}{'perplexity':>14}")
print(f"{'base':10}{base_ce:>12.4f}{math.exp(base_ce):>14.2f}")
print(f"{'V7':10}{v7_ce:>12.4f}{math.exp(v7_ce):>14.2f}")
print(f"{'Δ':10}{delta_ce:>+12.4f} "
      f"({ppl_change_pct:+.1f}% perplexity)")
print("\nNote: ~19% of curated-v3 seen sub-epoch during training; a large")
print("CE drop here is domain adaptation, not memorization.")