Update app.py
Browse files
app.py
CHANGED
|
@@ -768,7 +768,9 @@ def _verdict(before, after, subject, new_answer, drift_thresh=0.05):
|
|
| 768 |
return eff, collateral, max_drift, ent_blowup, surgical
|
| 769 |
|
| 770 |
|
| 771 |
-
def edit_and_verify(subject, new_answer, method, strength, use_llm,
|
|
|
|
|
|
|
| 772 |
model, tok = get_handles("glassbox")
|
| 773 |
STATE["name"] = "glassbox"
|
| 774 |
model.reset()
|
|
@@ -800,10 +802,20 @@ def edit_and_verify(subject, new_answer, method, strength, use_llm, llm_model, a
|
|
| 800 |
"", "VERDICT: %s" % ("SURGICAL EDIT" if surgical else "COLLATERAL DAMAGE")]
|
| 801 |
L.append("(model is left in the edited state - inspect it in tabs 1-5, or hit Reset.)")
|
| 802 |
|
|
|
|
| 803 |
if use_llm:
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 807 |
|
| 808 |
|
| 809 |
def reset_glassbox():
|
|
@@ -812,13 +824,13 @@ def reset_glassbox():
|
|
| 812 |
return "Glass-box weights restored to pristine. Re-run any tab to confirm."
|
| 813 |
|
| 814 |
|
| 815 |
-
# --- optional: real LLM calls to verify the edit
|
| 816 |
-
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
-
|
| 822 |
payload = {c: {"prompt": before[c]["prompt"],
|
| 823 |
"before_top1": before[c]["top1"], "before_p_orig": round(before[c]["p_orig"], 3),
|
| 824 |
"after_top1": after[c]["top1"], "after_p_orig": round(after[c]["p_orig"], 3)}
|
|
@@ -826,39 +838,162 @@ def _llm_judge(before, after, subject, new_answer, llm_model, api_key):
|
|
| 826 |
sys = ("You audit knowledge edits to a small language model. The intended edit "
|
| 827 |
"is: make %s's capital '%s'. Given before/after predictions for every "
|
| 828 |
"known fact, decide if the edit was SURGICAL (target changed, all other "
|
| 829 |
-
"facts unchanged) or caused COLLATERAL damage. Reply ONLY as JSON
|
| 830 |
-
'{"verdict":"surgical|collateral",
|
| 831 |
-
'"damaged_facts":[...],"confidence":0-1,
|
| 832 |
-
) % (subject, new_answer)
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 836 |
try:
|
| 837 |
-
try:
|
| 838 |
import anthropic
|
| 839 |
client = anthropic.Anthropic(api_key=key)
|
| 840 |
msg = client.messages.create(**body)
|
| 841 |
text = "".join(b.text for b in msg.content if getattr(b, "type", "") == "text")
|
| 842 |
-
except ImportError:
|
| 843 |
import urllib.request
|
| 844 |
req = urllib.request.Request(
|
| 845 |
-
"https://api.anthropic.com/v1/messages",
|
| 846 |
-
data=json.dumps(body).encode(),
|
| 847 |
headers={"x-api-key": key, "anthropic-version": "2023-06-01",
|
| 848 |
"content-type": "application/json"})
|
| 849 |
with urllib.request.urlopen(req, timeout=30) as r:
|
| 850 |
data = json.loads(r.read())
|
| 851 |
text = "".join(b.get("text", "") for b in data.get("content", [])
|
| 852 |
if b.get("type") == "text")
|
| 853 |
-
|
| 854 |
-
if clean.startswith("json"):
|
| 855 |
-
clean = clean[4:].strip()
|
| 856 |
-
v = json.loads(clean)
|
| 857 |
-
return ("verdict=%s target_changed=%s confidence=%s\n damaged: %s\n reason: %s"
|
| 858 |
-
% (v.get("verdict"), v.get("target_changed"), v.get("confidence"),
|
| 859 |
-
v.get("damaged_facts") or "none", v.get("reason")))
|
| 860 |
except Exception as e:
|
| 861 |
-
return "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 862 |
|
| 863 |
|
| 864 |
# =============================================================================
|
|
@@ -994,6 +1129,46 @@ def upload_to_hf(repo_id, token, what, app_path=__file__):
|
|
| 994 |
return "Upload failed: %s" % e
|
| 995 |
|
| 996 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 997 |
# =============================================================================
|
| 998 |
# UI
|
| 999 |
# =============================================================================
|
|
@@ -1159,54 +1334,72 @@ the fact is read here. The peak line names the site.
|
|
| 1159 |
gr.Markdown("""
|
| 1160 |
### Edit a fact, then prove nothing else broke
|
| 1161 |
**What it does:** rewrites the value one fact-MLP key maps to (the exact thing
|
| 1162 |
-
ROME/MEMIT do on real models
|
|
|
|
| 1163 |
known fact to measure **efficacy** (target changed), **specificity** (others
|
| 1164 |
untouched), and **fluency** (no entropy collapse).
|
| 1165 |
|
| 1166 |
**Two methods, on purpose:**
|
| 1167 |
- `rank1` β the minimal, surgical update. Only the target fact moves β **SURGICAL**.
|
| 1168 |
-
- `broadcast` β a deliberately sloppy edit that smears the change across all facts β the harness catches the **COLLATERAL DAMAGE**. This proves the verifier actually works.
|
| 1169 |
|
| 1170 |
-
**
|
| 1171 |
-
|
| 1172 |
-
|
| 1173 |
-
|
|
|
|
|
|
|
|
|
|
| 1174 |
|
| 1175 |
Subjects: `france`, `germany`, `japan`. Answers: `paris, berlin, tokyo, london, rome`.
|
| 1176 |
After editing, the model stays edited β go look at it in tabs 1β5 (the logit lens
|
| 1177 |
will show the new answer rising; the trace still localises to L0). Hit **Reset**
|
| 1178 |
-
to restore.
|
|
|
|
| 1179 |
""")
|
| 1180 |
with gr.Row():
|
| 1181 |
ed_subj = gr.Textbox(value="france", label="subject")
|
| 1182 |
ed_new = gr.Textbox(value="london", label="new answer")
|
| 1183 |
ed_method = gr.Radio(["rank1", "broadcast"], value="rank1", label="method")
|
| 1184 |
ed_strength = gr.Slider(0.2, 2.0, value=1.0, step=0.1, label="strength")
|
| 1185 |
-
|
| 1186 |
-
|
| 1187 |
-
|
| 1188 |
-
|
| 1189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1190 |
with gr.Row():
|
| 1191 |
gr.Button("Edit & verify", variant="primary").click(
|
| 1192 |
edit_and_verify,
|
| 1193 |
-
[ed_subj, ed_new, ed_method, ed_strength, ed_llm,
|
|
|
|
|
|
|
| 1194 |
gr.Button("Reset model").click(reset_glassbox, outputs=ed_out)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1195 |
|
| 1196 |
# ---- TAB 7 -------------------------------------------------------------
|
| 1197 |
with gr.Tab("7 Β· Export / Upload to HF"):
|
| 1198 |
gr.Markdown("""
|
| 1199 |
-
### Ship
|
| 1200 |
**Export** writes a self-contained, reloadable repo: weights (`safetensors`),
|
| 1201 |
`config.json`, `vocab.json`, a standalone `modeling_glassbox.py` (reload with
|
| 1202 |
`from modeling_glassbox import load`), and a model card.
|
| 1203 |
|
| 1204 |
-
**Upload** pushes it to the Hub. Choose
|
| 1205 |
-
|
| 1206 |
-
- `space` β *this whole app* as a runnable Gradio Space (adds `requirements.txt`).
|
| 1207 |
-
- `both`.
|
| 1208 |
-
|
| 1209 |
-
Paste a **write** token (or set `HF_TOKEN`). Repo id like `Chris4K/glassbox-interp`.
|
| 1210 |
""")
|
| 1211 |
with gr.Row():
|
| 1212 |
hf_repo = gr.Textbox(value="Chris4K/glassbox-interp", label="repo id")
|
|
@@ -1219,13 +1412,38 @@ Paste a **write** token (or set `HF_TOKEN`). Repo id like `Chris4K/glassbox-inte
|
|
| 1219 |
gr.Button("Upload to HF", variant="primary").click(
|
| 1220 |
upload_to_hf, [hf_repo, hf_token, hf_what], hf_out)
|
| 1221 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1222 |
gr.Markdown("""
|
| 1223 |
---
|
| 1224 |
### Where this goes next
|
| 1225 |
-
- **
|
| 1226 |
-
- **
|
|
|
|
|
|
|
| 1227 |
- **Attribution view:** Geva-style "what does this neuron write to the vocab", per-head attention attribution.
|
| 1228 |
-
- **It already ships:** tab 7 pushes the model and this whole app (as a Space) to your Hub.
|
| 1229 |
""")
|
| 1230 |
|
| 1231 |
demo.load(lambda: load_model("glassbox"), outputs=load_status)
|
|
|
|
| 768 |
return eff, collateral, max_drift, ent_blowup, surgical
|
| 769 |
|
| 770 |
|
| 771 |
+
def edit_and_verify(subject, new_answer, method, strength, use_llm,
|
| 772 |
+
anthropic_key, anthropic_model, hf_token, hf_model,
|
| 773 |
+
local_url, local_model):
|
| 774 |
model, tok = get_handles("glassbox")
|
| 775 |
STATE["name"] = "glassbox"
|
| 776 |
model.reset()
|
|
|
|
| 802 |
"", "VERDICT: %s" % ("SURGICAL EDIT" if surgical else "COLLATERAL DAMAGE")]
|
| 803 |
L.append("(model is left in the edited state - inspect it in tabs 1-5, or hit Reset.)")
|
| 804 |
|
| 805 |
+
llm_report = ""
|
| 806 |
if use_llm:
|
| 807 |
+
providers = [
|
| 808 |
+
{"type": "anthropic", "key": anthropic_key, "model": anthropic_model},
|
| 809 |
+
{"type": "hf", "key": hf_token, "model": hf_model},
|
| 810 |
+
{"type": "local", "url": local_url, "model": local_model},
|
| 811 |
+
]
|
| 812 |
+
llm_report = _llm_judge_chain(before, after, subject, new_answer, providers)
|
| 813 |
+
L += ["", "-" * 60, "INDEPENDENT LLM REVIEW:", llm_report]
|
| 814 |
+
|
| 815 |
+
report = "\n".join(L)
|
| 816 |
+
_log_session(subject, new_answer, method, strength, before, after,
|
| 817 |
+
eff, collateral, max_drift, surgical, llm_report)
|
| 818 |
+
return report
|
| 819 |
|
| 820 |
|
| 821 |
def reset_glassbox():
|
|
|
|
| 824 |
return "Glass-box weights restored to pristine. Re-run any tab to confirm."
|
| 825 |
|
| 826 |
|
| 827 |
+
# --- optional: real LLM calls to verify the edit, with a 3-tier fallback chain
|
| 828 |
+
# Anthropic (Claude) -> Hugging Face Inference -> local OpenAI-compatible server
|
| 829 |
+
# (e.g. LM Studio). Tries each in order; the first provider that's configured
|
| 830 |
+
# AND reachable wins. This means you're never blocked on one vendor being down
|
| 831 |
+
# or on not having an Anthropic key at all - your own RTX 5090 can be the judge.
|
| 832 |
+
def _build_judge_prompt(before, after, subject, new_answer):
|
| 833 |
+
import json
|
| 834 |
payload = {c: {"prompt": before[c]["prompt"],
|
| 835 |
"before_top1": before[c]["top1"], "before_p_orig": round(before[c]["p_orig"], 3),
|
| 836 |
"after_top1": after[c]["top1"], "after_p_orig": round(after[c]["p_orig"], 3)}
|
|
|
|
| 838 |
sys = ("You audit knowledge edits to a small language model. The intended edit "
|
| 839 |
"is: make %s's capital '%s'. Given before/after predictions for every "
|
| 840 |
"known fact, decide if the edit was SURGICAL (target changed, all other "
|
| 841 |
+
"facts unchanged) or caused COLLATERAL damage. Reply ONLY as JSON, no "
|
| 842 |
+
'prose, no markdown fences: {"verdict":"surgical|collateral",'
|
| 843 |
+
'"target_changed":bool,"damaged_facts":[...],"confidence":0-1,'
|
| 844 |
+
'"reason":"one sentence"}.') % (subject, new_answer)
|
| 845 |
+
return sys, json.dumps(payload)
|
| 846 |
+
|
| 847 |
+
|
| 848 |
+
def _parse_verdict_json(text, provider_label):
    """Turn a judge model's raw reply into a one-paragraph verdict report.

    The reply is expected to contain a single JSON object with the keys
    ``verdict``, ``target_changed``, ``confidence``, ``damaged_facts`` and
    ``reason``. Missing keys are rendered as ``None`` (``damaged_facts`` as
    ``"none"`` when absent or empty).

    Args:
        text: raw text returned by the provider.
        provider_label: short tag put in square brackets at the start of the
            report so the reader can see which provider answered.

    Returns:
        The formatted report string.

    Raises:
        ValueError: if ``text`` is empty / not a string, contains no ``{``,
            does not hold valid JSON at the first ``{`` (``JSONDecodeError``
            is a ``ValueError`` subclass), or the JSON value is not an object.
    """
    import json

    if not isinstance(text, str) or not text.strip():
        raise ValueError("empty reply from judge")
    start = text.find("{")
    if start == -1:
        raise ValueError("no JSON object in judge reply: %r" % text[:200])
    # raw_decode parses exactly one JSON value starting at `start` and ignores
    # whatever follows, so markdown fences, a leading "json" tag, or trailing
    # prose (even prose containing '}') no longer break parsing.
    v, _ = json.JSONDecoder().raw_decode(text, start)
    if not isinstance(v, dict):
        raise ValueError("judge reply is not a JSON object: %r" % (v,))
    return ("[%s] verdict=%s target_changed=%s confidence=%s\n damaged: %s\n reason: %s"
            % (provider_label, v.get("verdict"), v.get("target_changed"),
               v.get("confidence"), v.get("damaged_facts") or "none",
               v.get("reason")))
|
| 860 |
+
|
| 861 |
+
|
| 862 |
+
def _try_anthropic(sys, user, cfg):
    """Ask an Anthropic model to judge the edit.

    Args:
        sys: system prompt text.
        user: user message text (the JSON payload of before/after facts).
        cfg: provider dict; reads ``"key"`` (falls back to the
            ``ANTHROPIC_API_KEY`` environment variable) and ``"model"``
            (falls back to ``"claude-sonnet-4-6"``).

    Returns:
        ``(report, None)`` on success, or ``(None, reason)`` when no key is
        configured or the call / parsing fails. Never raises.
    """
    import json
    import os

    key = ((cfg.get("key") or "").strip()
           or os.environ.get("ANTHROPIC_API_KEY", "").strip())
    if not key:
        return None, "anthropic: no key configured"
    # Strip first, then default: a whitespace-only model box must not turn
    # into an empty model name in the request.
    model = (cfg.get("model") or "").strip() or "claude-sonnet-4-6"
    body = {"model": model, "max_tokens": 400, "system": sys,
            "messages": [{"role": "user", "content": user}]}
    try:
        # Only the import decides SDK-vs-urllib; an ImportError raised later,
        # during the request itself, must not trigger a second HTTP call.
        try:
            import anthropic
        except ImportError:
            anthropic = None
        if anthropic is not None:
            client = anthropic.Anthropic(api_key=key)
            msg = client.messages.create(**body)
            text = "".join(b.text for b in msg.content
                           if getattr(b, "type", "") == "text")
        else:
            import urllib.request
            req = urllib.request.Request(
                "https://api.anthropic.com/v1/messages",
                data=json.dumps(body).encode(),
                headers={"x-api-key": key, "anthropic-version": "2023-06-01",
                         "content-type": "application/json"})
            with urllib.request.urlopen(req, timeout=30) as r:
                data = json.loads(r.read())
            text = "".join(b.get("text", "") for b in data.get("content", [])
                           if b.get("type") == "text")
        return _parse_verdict_json(text, "anthropic:" + model), None
    except Exception as e:
        # Best-effort provider: report the failure so the chain can move on.
        return None, "anthropic failed: %s" % e
|
| 888 |
+
|
| 889 |
+
|
| 890 |
+
def _try_hf(sys, user, cfg):
    """Ask a Hugging Face Inference chat model to judge the edit.

    Args:
        sys: system prompt text.
        user: user message text (the JSON payload of before/after facts).
        cfg: provider dict; reads ``"key"`` (falls back to the ``HF_TOKEN``
            environment variable) and ``"model"`` (falls back to
            ``"Qwen/Qwen2.5-7B-Instruct"``).

    Returns:
        ``(report, None)`` on success, or ``(None, reason)`` when no token is
        configured or the import / call / parsing fails. Never raises.
    """
    import os

    token = ((cfg.get("key") or "").strip()
             or os.environ.get("HF_TOKEN", "").strip())
    if not token:
        return None, "hf: no token configured"
    # Strip first, then default, so a whitespace-only model box falls back to
    # the default instead of sending an empty model id.
    model = (cfg.get("model") or "").strip() or "Qwen/Qwen2.5-7B-Instruct"
    try:
        from huggingface_hub import InferenceClient
        client = InferenceClient(model=model, token=token)
        resp = client.chat_completion(
            messages=[{"role": "system", "content": sys},
                      {"role": "user", "content": user}],
            max_tokens=400)
        text = resp.choices[0].message.content
        return _parse_verdict_json(text, "hf:" + model), None
    except Exception as e:
        # Covers a missing huggingface_hub install as well as API errors.
        return None, "hf failed: %s" % e
|
| 908 |
+
|
| 909 |
+
|
| 910 |
+
def _try_local(sys, user, cfg):
    """Ask any OpenAI-compatible /v1/chat/completions server to judge the edit
    - LM Studio, vLLM, Ollama (with its OpenAI shim), text-generation-webui, etc.

    Args:
        sys: system prompt text.
        user: user message text (the JSON payload of before/after facts).
        cfg: provider dict; reads ``"url"`` (server base URL, optionally
            already ending in ``/v1`` or ``/v1/chat/completions``) and
            ``"model"`` (falls back to ``"local-model"``).

    Returns:
        ``(report, None)`` on success, or ``(None, reason)`` when no usable
        URL is configured or the request / parsing fails. Never raises.
    """
    import json
    import urllib.request

    url = (cfg.get("url") or "").strip().rstrip("/")
    if not url:
        return None, "local: no URL configured"
    # The URL comes straight from a UI text box; urlopen would also accept
    # file:// and friends, so only plain HTTP(S) endpoints are allowed.
    if not url.lower().startswith(("http://", "https://")):
        return None, "local: URL must start with http:// or https:// (got %r)" % url
    model = (cfg.get("model") or "").strip() or "local-model"

    # OpenAI-style base URLs are often given with the /v1 suffix (or as the
    # full endpoint); avoid building .../v1/v1/chat/completions.
    if url.endswith("/chat/completions"):
        endpoint = url
    elif url.endswith("/v1"):
        endpoint = url + "/chat/completions"
    else:
        endpoint = url + "/v1/chat/completions"

    body = json.dumps({"model": model, "max_tokens": 400, "temperature": 0,
                       "messages": [{"role": "system", "content": sys},
                                    {"role": "user", "content": user}]}).encode()
    try:
        req = urllib.request.Request(
            endpoint, data=body,
            headers={"content-type": "application/json"})
        with urllib.request.urlopen(req, timeout=20) as r:
            data = json.loads(r.read())
        text = data["choices"][0]["message"]["content"]
        return _parse_verdict_json(text, "local:" + model + "@" + url), None
    except Exception as e:
        # Best-effort provider: report the failure so the chain can move on.
        return None, "local failed: %s" % e
|
| 931 |
+
|
| 932 |
+
|
| 933 |
+
def _llm_judge_chain(before, after, subject, new_answer, providers):
    """Run the LLM review through the providers in order; first success wins.

    Args:
        before, after: per-fact prediction dicts passed on to
            ``_build_judge_prompt``.
        subject, new_answer: the intended edit, passed on to
            ``_build_judge_prompt``.
        providers: iterable of provider dicts, each with a ``"type"`` of
            ``"anthropic"``, ``"hf"`` or ``"local"`` plus that provider's
            settings.

    Returns:
        The winning provider's report, prefixed with a ``(skipped: ...)`` line
        when earlier providers were tried and failed; or, if none succeeds, a
        message listing every failure reason.
    """
    sys, user = _build_judge_prompt(before, after, subject, new_answer)
    dispatch = {"anthropic": _try_anthropic, "hf": _try_hf, "local": _try_local}
    skipped = []
    for cfg in providers or []:
        kind = cfg.get("type")
        fn = dispatch.get(kind)
        if fn is None:
            # Surface misconfigured entries instead of dropping them silently.
            skipped.append("unknown provider type %r ignored" % (kind,))
            continue
        result, err = fn(sys, user, cfg)
        if result is not None:
            note = "(skipped: %s)\n" % "; ".join(skipped) if skipped else ""
            return note + result
        # Guard against a provider returning (None, None): join() needs str.
        skipped.append(err or "%s: no result" % kind)
    return ("all providers unavailable:\n " + "\n ".join(skipped) +
            "\n(configure at least one: Anthropic key, HF token, or a local "
            "OpenAI-compatible server URL like http://192.168.188.25:1234)")
|
| 950 |
+
|
| 951 |
+
|
| 952 |
+
# --- session log: every edit+verify run is appended here as JSON, so you can
|
| 953 |
+
# download it, or paste the markdown block straight into a future chat with
|
| 954 |
+
# Claude for review ("did all work, here's the log").
|
| 955 |
+
SESSION_LOG = []
|
| 956 |
+
|
| 957 |
+
|
| 958 |
+
def _log_session(subject, new_answer, method, strength, before, after,
                 eff, collateral, max_drift, surgical, llm_report):
    """Append one edit+verify run to the module-level ``SESSION_LOG`` list.

    Args:
        subject, new_answer, method, strength: the edit that was requested.
        before, after: per-fact dicts; only each fact's ``"top1"`` and
            ``"p_orig"`` (rounded to 4 places) are kept.
        eff: efficacy result, stored as a bool.
        collateral: damaged-facts value, stored as given.
        max_drift: largest drift observed, rounded to 4 places.
        surgical: truthy -> verdict ``"SURGICAL"``, else ``"COLLATERAL"``.
        llm_report: LLM review text; an empty string is stored as ``None``.
    """
    import datetime

    def _snapshot(facts):
        # Keep just the fields needed to review the run later.
        return {c: {"top1": facts[c]["top1"],
                    "p_orig": round(facts[c]["p_orig"], 4)}
                for c in facts}

    # datetime.utcnow() is deprecated; take an aware UTC time and drop the
    # tzinfo so the stamp keeps its "<naive ISO>Z" shape.
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    SESSION_LOG.append({
        "ts": now.isoformat() + "Z",
        "subject": subject, "new_answer": new_answer, "method": method,
        "strength": strength, "efficacy_pass": bool(eff),
        "collateral": collateral, "max_drift": round(max_drift, 4),
        "verdict": "SURGICAL" if surgical else "COLLATERAL",
        "before": _snapshot(before),
        "after": _snapshot(after),
        "llm_review": llm_report or None,
    })
|
| 973 |
+
|
| 974 |
+
|
| 975 |
+
def export_session_log():
    """Write ``SESSION_LOG`` to disk as JSON plus a paste-ready markdown file.

    Files are named ``edit_session_log.json`` / ``edit_session_log.md`` and go
    to ``/mnt/user-data/outputs``; if that directory cannot be created or
    written, the system temp directory is used instead.

    Returns:
        ``(json_path, status_message)`` on success, or ``(None, message)``
        when there is nothing to export or no location is writable.
    """
    import json
    import os
    import tempfile

    if not SESSION_LOG:
        return None, "No edits run yet this session - nothing to export."

    # Serialise up front so a failure cannot leave a truncated file behind.
    json_text = json.dumps(SESSION_LOG, indent=2)

    # also a markdown rendition meant to be pasted straight into a chat
    md = ["# Edit session log\n"]
    for i, e in enumerate(SESSION_LOG, 1):
        md.append("## Edit %d - %s (%s, %s, strength=%s)\n" %
                  (i, e["verdict"], e["subject"] + "->" + e["new_answer"],
                   e["method"], e["strength"]))
        md.append("- efficacy: %s, max collateral drift: %.4f, damaged: %s" %
                  ("pass" if e["efficacy_pass"] else "fail", e["max_drift"],
                   e["collateral"] or "none"))
        if e["llm_review"]:
            md.append("- LLM review: " + e["llm_review"].replace("\n", " "))
        md.append("")
    md_text = "\n".join(md)

    last_err = None
    # The preferred directory only exists on some hosts; fall back to the
    # temp dir rather than failing the whole export.
    for out_dir in ("/mnt/user-data/outputs", tempfile.gettempdir()):
        path = os.path.join(out_dir, "edit_session_log.json")
        md_path = os.path.join(out_dir, "edit_session_log.md")
        try:
            os.makedirs(out_dir, exist_ok=True)
            # `with` closes the handles; the original left both files open.
            with open(path, "w", encoding="utf-8") as fh:
                fh.write(json_text)
            with open(md_path, "w", encoding="utf-8") as fh:
                fh.write(md_text)
        except OSError as err:
            last_err = err
            continue
        return path, "Wrote %d edit(s) to %s and %s" % (len(SESSION_LOG), path, md_path)
    return None, "Could not write session log: %s" % last_err
|
| 997 |
|
| 998 |
|
| 999 |
# =============================================================================
|
|
|
|
| 1129 |
return "Upload failed: %s" % e
|
| 1130 |
|
| 1131 |
|
| 1132 |
+
# --- upload a REAL model (e.g. a VINDEX-edited Llama checkpoint), not the toy.
|
| 1133 |
+
# This does NOT load the model into memory (multi-GB Llama weights don't need
|
| 1134 |
+
# to round-trip through Python) - it just pushes whatever's already on disk.
|
| 1135 |
+
# Point it at the local folder produced by your save_pretrained()/VINDEX run:
|
| 1136 |
+
# expects the usual HF layout (config.json + .safetensors shards + tokenizer
|
| 1137 |
+
# files). Note: gated models (e.g. meta-llama/*) require the destination repo
|
| 1138 |
+
# to either be your own namespace or one you have write access to - the Hub's
|
| 1139 |
+
# license gate is independent of this upload step.
|
| 1140 |
+
def upload_local_checkpoint(local_dir, repo_id, token, private, commit_message):
    """Push an on-disk checkpoint folder to a Hugging Face model repo.

    The folder is uploaded as-is via ``HfApi.upload_folder``; nothing is
    loaded into memory here. A warning is prepended to the result when the
    folder has no ``config.json`` or no top-level ``.safetensors``/``.bin``
    file.

    Args:
        local_dir: path of the folder to upload.
        repo_id: destination repo id, e.g. ``"user/name"``.
        token: HF write token; falls back to the ``HF_TOKEN`` env variable.
        private: truthy to create the repo as private.
        commit_message: commit message; blank falls back to
            ``"upload checkpoint"``.

    Returns:
        A human-readable status string (success, validation error, or
        ``"Upload failed: ..."``). Never raises for upload errors.
    """
    import os

    # Validate the cheap inputs first so the user sees input errors even when
    # the optional huggingface_hub dependency is missing.
    local_dir = (local_dir or "").strip()
    repo_id = (repo_id or "").strip()
    if not local_dir or not os.path.isdir(local_dir):
        return "local_dir %r does not exist or is not a directory." % local_dir
    if not repo_id:
        return "Enter a repo id like 'Chris4K/vindex-llama3-edited'."
    token = (token or "").strip() or os.environ.get("HF_TOKEN", "")
    if not token:
        return "No HF token. Paste a write token or set HF_TOKEN."
    try:
        from huggingface_hub import HfApi
    except ImportError:
        return "huggingface_hub not installed. `pip install huggingface_hub`."

    try:
        files = sorted(os.listdir(local_dir))
    except OSError as e:
        # e.g. directory exists but is not readable
        return "Cannot read local_dir %r: %s" % (local_dir, e)

    has_cfg = os.path.exists(os.path.join(local_dir, "config.json"))
    has_weights = any(f.endswith((".safetensors", ".bin")) for f in files)
    warn = "" if (has_cfg and has_weights) else (
        "WARNING: folder is missing config.json or weight files - this may "
        "not be a loadable HF checkpoint. Uploading anyway.\n")

    # Strip first, then default, so a whitespace-only box is not sent as an
    # empty commit message.
    message = (commit_message or "").strip() or "upload checkpoint"
    shown = ", ".join(files[:12])
    if len(files) > 12:
        # Make the truncation visible instead of implying the list is complete.
        shown += " (+%d more)" % (len(files) - 12)

    api = HfApi(token=token)
    try:
        api.create_repo(repo_id, repo_type="model", private=bool(private), exist_ok=True)
        api.upload_folder(folder_path=local_dir, repo_id=repo_id, repo_type="model",
                          commit_message=message)
        return warn + ("Uploaded %s -> https://huggingface.co/%s\n"
                       "Files: %s" % (local_dir, repo_id, shown))
    except Exception as e:
        return warn + "Upload failed: %s" % e
|
| 1170 |
+
|
| 1171 |
+
|
| 1172 |
# =============================================================================
|
| 1173 |
# UI
|
| 1174 |
# =============================================================================
|
|
|
|
| 1334 |
gr.Markdown("""
|
| 1335 |
### Edit a fact, then prove nothing else broke
|
| 1336 |
**What it does:** rewrites the value one fact-MLP key maps to (the exact thing
|
| 1337 |
+
ROME/MEMIT do on real models β this is a literal `nn.Module` weight tensor,
|
| 1338 |
+
not a token or vocab change), then runs a verification battery over **every**
|
| 1339 |
known fact to measure **efficacy** (target changed), **specificity** (others
|
| 1340 |
untouched), and **fluency** (no entropy collapse).
|
| 1341 |
|
| 1342 |
**Two methods, on purpose:**
|
| 1343 |
- `rank1` β the minimal, surgical update. Only the target fact moves β **SURGICAL**.
|
| 1344 |
+
- `broadcast` β a deliberately sloppy edit that smears the change across all facts β the harness catches the **COLLATERAL DAMAGE**. This proves the verifier actually works, not just reports "ok" by default.
|
| 1345 |
|
| 1346 |
+
**Independent LLM review, with a fallback chain β not locked to one vendor:**
|
| 1347 |
+
tick the box and it tries, in order: **Anthropic** (Claude, if you give a key)
|
| 1348 |
+
β **Hugging Face Inference** (any hosted chat model, if you give an HF token)
|
| 1349 |
+
β **your own local server** (LM Studio / vLLM / Ollama's OpenAI shim β anything
|
| 1350 |
+
exposing `/v1/chat/completions`). The first one that's configured *and*
|
| 1351 |
+
reachable answers; the rest are skipped and noted. So your own RTX 5090 can
|
| 1352 |
+
be the judge with zero cloud calls if you just fill in the local URL.
|
| 1353 |
|
| 1354 |
Subjects: `france`, `germany`, `japan`. Answers: `paris, berlin, tokyo, london, rome`.
|
| 1355 |
After editing, the model stays edited β go look at it in tabs 1β5 (the logit lens
|
| 1356 |
will show the new answer rising; the trace still localises to L0). Hit **Reset**
|
| 1357 |
+
to restore. Every run is appended to a session log you can download below and
|
| 1358 |
+
paste into a future chat for review.
|
| 1359 |
""")
|
| 1360 |
with gr.Row():
|
| 1361 |
ed_subj = gr.Textbox(value="france", label="subject")
|
| 1362 |
ed_new = gr.Textbox(value="london", label="new answer")
|
| 1363 |
ed_method = gr.Radio(["rank1", "broadcast"], value="rank1", label="method")
|
| 1364 |
ed_strength = gr.Slider(0.2, 2.0, value=1.0, step=0.1, label="strength")
|
| 1365 |
+
ed_llm = gr.Checkbox(value=False, label="also run an independent LLM review")
|
| 1366 |
+
with gr.Accordion("LLM review providers (tried in this order)", open=False):
|
| 1367 |
+
with gr.Row():
|
| 1368 |
+
ed_a_model = gr.Textbox(value="claude-sonnet-4-6", label="1. Anthropic model")
|
| 1369 |
+
ed_a_key = gr.Textbox(value="", label="Anthropic API key", type="password")
|
| 1370 |
+
with gr.Row():
|
| 1371 |
+
ed_h_model = gr.Textbox(value="Qwen/Qwen2.5-7B-Instruct",
|
| 1372 |
+
label="2. HF Inference model")
|
| 1373 |
+
ed_h_key = gr.Textbox(value="", label="HF token", type="password")
|
| 1374 |
+
with gr.Row():
|
| 1375 |
+
ed_l_url = gr.Textbox(value="http://192.168.188.25:1234",
|
| 1376 |
+
label="3. Local server URL (LM Studio etc.)")
|
| 1377 |
+
ed_l_model = gr.Textbox(value="local-model", label="local model name")
|
| 1378 |
+
ed_out = gr.Textbox(label="edit + verification report", lines=24)
|
| 1379 |
with gr.Row():
|
| 1380 |
gr.Button("Edit & verify", variant="primary").click(
|
| 1381 |
edit_and_verify,
|
| 1382 |
+
[ed_subj, ed_new, ed_method, ed_strength, ed_llm,
|
| 1383 |
+
ed_a_key, ed_a_model, ed_h_key, ed_h_model, ed_l_url, ed_l_model],
|
| 1384 |
+
ed_out)
|
| 1385 |
gr.Button("Reset model").click(reset_glassbox, outputs=ed_out)
|
| 1386 |
+
gr.Markdown("**Session log** (every edit run above, appended):")
|
| 1387 |
+
with gr.Row():
|
| 1388 |
+
log_btn = gr.Button("Write session log to disk")
|
| 1389 |
+
log_file = gr.File(label="download")
|
| 1390 |
+
log_status = gr.Markdown()
|
| 1391 |
+
log_btn.click(lambda: export_session_log(), outputs=[log_file, log_status])
|
| 1392 |
|
| 1393 |
# ---- TAB 7 -------------------------------------------------------------
|
| 1394 |
with gr.Tab("7 Β· Export / Upload to HF"):
|
| 1395 |
gr.Markdown("""
|
| 1396 |
+
### Ship the toy glass-box
|
| 1397 |
**Export** writes a self-contained, reloadable repo: weights (`safetensors`),
|
| 1398 |
`config.json`, `vocab.json`, a standalone `modeling_glassbox.py` (reload with
|
| 1399 |
`from modeling_glassbox import load`), and a model card.
|
| 1400 |
|
| 1401 |
+
**Upload** pushes it to the Hub. Choose `model`, `space` (this whole app,
|
| 1402 |
+
runnable), or `both`. Paste a **write** token (or set `HF_TOKEN`).
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1403 |
""")
|
| 1404 |
with gr.Row():
|
| 1405 |
hf_repo = gr.Textbox(value="Chris4K/glassbox-interp", label="repo id")
|
|
|
|
| 1412 |
gr.Button("Upload to HF", variant="primary").click(
|
| 1413 |
upload_to_hf, [hf_repo, hf_token, hf_what], hf_out)
|
| 1414 |
|
| 1415 |
+
gr.Markdown("""
|
| 1416 |
+
---
|
| 1417 |
+
### Upload a REAL model β e.g. your VINDEX-edited Llama checkpoint
|
| 1418 |
+
This does **not** load the model into memory and does **not** assume any
|
| 1419 |
+
particular architecture β it just pushes whatever's already on disk at
|
| 1420 |
+
`local_dir` (the usual `save_pretrained()` layout: `config.json` +
|
| 1421 |
+
`*.safetensors` shards + tokenizer files) straight to a new repo. Large
|
| 1422 |
+
weights upload fine through `upload_folder`; for very large repos consider
|
| 1423 |
+
installing `hf_transfer` for faster throughput. If the base model is gated
|
| 1424 |
+
(e.g. `meta-llama/*`), the gate applies to the destination repo's license
|
| 1425 |
+
settings, not to this upload step.
|
| 1426 |
+
""")
|
| 1427 |
+
with gr.Row():
|
| 1428 |
+
rc_dir = gr.Textbox(value="", label="local checkpoint folder (on this machine)")
|
| 1429 |
+
rc_repo = gr.Textbox(value="", label="destination repo id, e.g. Chris4K/vindex-llama3-edited")
|
| 1430 |
+
with gr.Row():
|
| 1431 |
+
rc_token = gr.Textbox(value="", label="HF write token (optional)", type="password")
|
| 1432 |
+
rc_private = gr.Checkbox(value=True, label="private repo")
|
| 1433 |
+
rc_msg = gr.Textbox(value="upload edited checkpoint", label="commit message")
|
| 1434 |
+
rc_out = gr.Textbox(label="result", lines=6)
|
| 1435 |
+
gr.Button("Upload real checkpoint", variant="primary").click(
|
| 1436 |
+
upload_local_checkpoint, [rc_dir, rc_repo, rc_token, rc_private, rc_msg], rc_out)
|
| 1437 |
+
|
| 1438 |
gr.Markdown("""
|
| 1439 |
---
|
| 1440 |
### Where this goes next
|
| 1441 |
+
- **Closing the loop (what "self-improving" would actually require):** right now a human picks every edit; the verifier just grades it. A real closed loop needs a policy that *proposes* edits on its own (e.g. scanning eval failures for wrong facts), auto-applies, and auto-commits only on a SURGICAL verdict, rolling back otherwise. The hard part β the verifier β already exists here; the proposal step doesn't yet.
|
| 1442 |
+
- **A training-method angle worth taking seriously:** instead of accept/reject after the fact, feed the specificity battery's drift score back as a regularizer *during* the edit computation (closer to elastic weight consolidation, or the null-space projection AlphaEdit-style methods use) so collateral is penalized while solving, not caught after.
|
| 1443 |
+
- **Real-model MEMIT:** the edit loop here is exact because the glass-box's fact layer is literally keyβvalue. The same verify harness (efficacy / specificity / fluency + the multi-provider LLM judge) ports straight onto a gpt2/Llama MEMIT edit β the toy is the regression test you run first.
|
| 1444 |
+
- **Multi-hop & paraphrase generalization:** add `"the currency of france is"` so two relations share a subject, and have the LLM judge auto-generate paraphrase probes to test that an edit generalizes, not just memorizes the one prompt.
|
| 1445 |
- **Attribution view:** Geva-style "what does this neuron write to the vocab", per-head attention attribution.
|
| 1446 |
+
- **It already ships:** tab 7 pushes the toy model and this whole app (as a Space) to your Hub, or a real local checkpoint folder to its own repo.
|
| 1447 |
""")
|
| 1448 |
|
| 1449 |
demo.load(lambda: load_model("glassbox"), outputs=load_status)
|