Chris4K committed on
Commit
cbcdc9d
·
verified ·
1 Parent(s): 836f3d5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +271 -53
app.py CHANGED
@@ -768,7 +768,9 @@ def _verdict(before, after, subject, new_answer, drift_thresh=0.05):
768
  return eff, collateral, max_drift, ent_blowup, surgical
769
 
770
 
771
- def edit_and_verify(subject, new_answer, method, strength, use_llm, llm_model, api_key):
 
 
772
  model, tok = get_handles("glassbox")
773
  STATE["name"] = "glassbox"
774
  model.reset()
@@ -800,10 +802,20 @@ def edit_and_verify(subject, new_answer, method, strength, use_llm, llm_model, a
800
  "", "VERDICT: %s" % ("SURGICAL EDIT" if surgical else "COLLATERAL DAMAGE")]
801
  L.append("(model is left in the edited state - inspect it in tabs 1-5, or hit Reset.)")
802
 
 
803
  if use_llm:
804
- L += ["", "-" * 60, "INDEPENDENT LLM REVIEW:", _llm_judge(
805
- before, after, subject, new_answer, llm_model, api_key)]
806
- return "\n".join(L)
 
 
 
 
 
 
 
 
 
807
 
808
 
809
  def reset_glassbox():
@@ -812,13 +824,13 @@ def reset_glassbox():
812
  return "Glass-box weights restored to pristine. Re-run any tab to confirm."
813
 
814
 
815
- # --- optional: real LLM calls to verify the edit (independent of our metrics) -
816
- def _llm_judge(before, after, subject, new_answer, llm_model, api_key):
817
- import os, json
818
- key = (api_key or "").strip() or os.environ.get("ANTHROPIC_API_KEY", "")
819
- if not key:
820
- return ("(skipped - no API key. Paste an Anthropic key or set "
821
- "ANTHROPIC_API_KEY to have Claude independently judge the edit.)")
822
  payload = {c: {"prompt": before[c]["prompt"],
823
  "before_top1": before[c]["top1"], "before_p_orig": round(before[c]["p_orig"], 3),
824
  "after_top1": after[c]["top1"], "after_p_orig": round(after[c]["p_orig"], 3)}
@@ -826,39 +838,162 @@ def _llm_judge(before, after, subject, new_answer, llm_model, api_key):
826
  sys = ("You audit knowledge edits to a small language model. The intended edit "
827
  "is: make %s's capital '%s'. Given before/after predictions for every "
828
  "known fact, decide if the edit was SURGICAL (target changed, all other "
829
- "facts unchanged) or caused COLLATERAL damage. Reply ONLY as JSON: "
830
- '{"verdict":"surgical|collateral","target_changed":bool,'
831
- '"damaged_facts":[...],"confidence":0-1,"reason":"one sentence"}.'
832
- ) % (subject, new_answer)
833
- body = {"model": (llm_model or "claude-sonnet-4-6").strip(), "max_tokens": 400,
834
- "system": sys,
835
- "messages": [{"role": "user", "content": json.dumps(payload)}]}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
836
  try:
837
- try: # prefer the official SDK if present
838
  import anthropic
839
  client = anthropic.Anthropic(api_key=key)
840
  msg = client.messages.create(**body)
841
  text = "".join(b.text for b in msg.content if getattr(b, "type", "") == "text")
842
- except ImportError: # fall back to a raw HTTPS call
843
  import urllib.request
844
  req = urllib.request.Request(
845
- "https://api.anthropic.com/v1/messages",
846
- data=json.dumps(body).encode(),
847
  headers={"x-api-key": key, "anthropic-version": "2023-06-01",
848
  "content-type": "application/json"})
849
  with urllib.request.urlopen(req, timeout=30) as r:
850
  data = json.loads(r.read())
851
  text = "".join(b.get("text", "") for b in data.get("content", [])
852
  if b.get("type") == "text")
853
- clean = text.strip().strip("`")
854
- if clean.startswith("json"):
855
- clean = clean[4:].strip()
856
- v = json.loads(clean)
857
- return ("verdict=%s target_changed=%s confidence=%s\n damaged: %s\n reason: %s"
858
- % (v.get("verdict"), v.get("target_changed"), v.get("confidence"),
859
- v.get("damaged_facts") or "none", v.get("reason")))
860
  except Exception as e:
861
- return "(LLM review failed: %s)" % e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
862
 
863
 
864
  # =============================================================================
@@ -994,6 +1129,46 @@ def upload_to_hf(repo_id, token, what, app_path=__file__):
994
  return "Upload failed: %s" % e
995
 
996
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
997
  # =============================================================================
998
  # UI
999
  # =============================================================================
@@ -1159,54 +1334,72 @@ the fact is read here. The peak line names the site.
1159
  gr.Markdown("""
1160
  ### Edit a fact, then prove nothing else broke
1161
  **What it does:** rewrites the value one fact-MLP key maps to (the exact thing
1162
- ROME/MEMIT do on real models), then runs a verification battery over **every**
 
1163
  known fact to measure **efficacy** (target changed), **specificity** (others
1164
  untouched), and **fluency** (no entropy collapse).
1165
 
1166
  **Two methods, on purpose:**
1167
  - `rank1` β€” the minimal, surgical update. Only the target fact moves β†’ **SURGICAL**.
1168
- - `broadcast` β€” a deliberately sloppy edit that smears the change across all facts β†’ the harness catches the **COLLATERAL DAMAGE**. This proves the verifier actually works.
1169
 
1170
- **Optional independent review:** tick the box and paste an Anthropic key (or set
1171
- `ANTHROPIC_API_KEY`) to have **Claude** judge the before/after battery and return
1172
- its own surgical/collateral verdict β€” a second, model-based check on top of the
1173
- deterministic metrics.
 
 
 
1174
 
1175
  Subjects: `france`, `germany`, `japan`. Answers: `paris, berlin, tokyo, london, rome`.
1176
  After editing, the model stays edited β€” go look at it in tabs 1–5 (the logit lens
1177
  will show the new answer rising; the trace still localises to L0). Hit **Reset**
1178
- to restore.
 
1179
  """)
1180
  with gr.Row():
1181
  ed_subj = gr.Textbox(value="france", label="subject")
1182
  ed_new = gr.Textbox(value="london", label="new answer")
1183
  ed_method = gr.Radio(["rank1", "broadcast"], value="rank1", label="method")
1184
  ed_strength = gr.Slider(0.2, 2.0, value=1.0, step=0.1, label="strength")
1185
- with gr.Row():
1186
- ed_llm = gr.Checkbox(value=False, label="also ask Claude to verify")
1187
- ed_model = gr.Textbox(value="claude-sonnet-4-6", label="Claude model")
1188
- ed_key = gr.Textbox(value="", label="Anthropic API key (optional)", type="password")
1189
- ed_out = gr.Textbox(label="edit + verification report", lines=22)
 
 
 
 
 
 
 
 
 
1190
  with gr.Row():
1191
  gr.Button("Edit & verify", variant="primary").click(
1192
  edit_and_verify,
1193
- [ed_subj, ed_new, ed_method, ed_strength, ed_llm, ed_model, ed_key], ed_out)
 
 
1194
  gr.Button("Reset model").click(reset_glassbox, outputs=ed_out)
 
 
 
 
 
 
1195
 
1196
  # ---- TAB 7 -------------------------------------------------------------
1197
  with gr.Tab("7 Β· Export / Upload to HF"):
1198
  gr.Markdown("""
1199
- ### Ship it to the Hub
1200
  **Export** writes a self-contained, reloadable repo: weights (`safetensors`),
1201
  `config.json`, `vocab.json`, a standalone `modeling_glassbox.py` (reload with
1202
  `from modeling_glassbox import load`), and a model card.
1203
 
1204
- **Upload** pushes it to the Hub. Choose:
1205
- - `model` β€” the glass-box as a model repo.
1206
- - `space` β€” *this whole app* as a runnable Gradio Space (adds `requirements.txt`).
1207
- - `both`.
1208
-
1209
- Paste a **write** token (or set `HF_TOKEN`). Repo id like `Chris4K/glassbox-interp`.
1210
  """)
1211
  with gr.Row():
1212
  hf_repo = gr.Textbox(value="Chris4K/glassbox-interp", label="repo id")
@@ -1219,13 +1412,38 @@ Paste a **write** token (or set `HF_TOKEN`). Repo id like `Chris4K/glassbox-inte
1219
  gr.Button("Upload to HF", variant="primary").click(
1220
  upload_to_hf, [hf_repo, hf_token, hf_what], hf_out)
1221
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1222
  gr.Markdown("""
1223
  ---
1224
  ### Where this goes next
1225
- - **Real-model MEMIT:** the edit loop here is exact because the glass-box's fact layer is literally key→value. The same verify harness (efficacy / specificity / fluency + the Claude judge) ports straight onto a gpt2/Llama MEMIT edit — the toy is the regression test you run first.
1226
- - **Multi-hop & paraphrase generalization:** add `"the currency of france is"` so two relations share a subject, and have the Claude judge auto-generate paraphrase probes to test that an edit generalizes, not just memorizes the one prompt.
 
 
1227
  - **Attribution view:** Geva-style "what does this neuron write to the vocab", per-head attention attribution.
1228
- - **It already ships:** tab 7 pushes the model and this whole app (as a Space) to your Hub.
1229
  """)
1230
 
1231
  demo.load(lambda: load_model("glassbox"), outputs=load_status)
 
768
  return eff, collateral, max_drift, ent_blowup, surgical
769
 
770
 
771
+ def edit_and_verify(subject, new_answer, method, strength, use_llm,
772
+ anthropic_key, anthropic_model, hf_token, hf_model,
773
+ local_url, local_model):
774
  model, tok = get_handles("glassbox")
775
  STATE["name"] = "glassbox"
776
  model.reset()
 
802
  "", "VERDICT: %s" % ("SURGICAL EDIT" if surgical else "COLLATERAL DAMAGE")]
803
  L.append("(model is left in the edited state - inspect it in tabs 1-5, or hit Reset.)")
804
 
805
+ llm_report = ""
806
  if use_llm:
807
+ providers = [
808
+ {"type": "anthropic", "key": anthropic_key, "model": anthropic_model},
809
+ {"type": "hf", "key": hf_token, "model": hf_model},
810
+ {"type": "local", "url": local_url, "model": local_model},
811
+ ]
812
+ llm_report = _llm_judge_chain(before, after, subject, new_answer, providers)
813
+ L += ["", "-" * 60, "INDEPENDENT LLM REVIEW:", llm_report]
814
+
815
+ report = "\n".join(L)
816
+ _log_session(subject, new_answer, method, strength, before, after,
817
+ eff, collateral, max_drift, surgical, llm_report)
818
+ return report
819
 
820
 
821
  def reset_glassbox():
 
824
  return "Glass-box weights restored to pristine. Re-run any tab to confirm."
825
 
826
 
827
+ # --- optional: real LLM calls to verify the edit, with a 3-tier fallback chain
828
+ # Anthropic (Claude) -> Hugging Face Inference -> local OpenAI-compatible server
829
+ # (e.g. LM Studio). Tries each in order; the first provider that's configured
830
+ # AND reachable wins. This means you're never blocked on one vendor being down
831
+ # or on not having an Anthropic key at all - your own RTX 5090 can be the judge.
832
+ def _build_judge_prompt(before, after, subject, new_answer):
833
+ import json
834
  payload = {c: {"prompt": before[c]["prompt"],
835
  "before_top1": before[c]["top1"], "before_p_orig": round(before[c]["p_orig"], 3),
836
  "after_top1": after[c]["top1"], "after_p_orig": round(after[c]["p_orig"], 3)}
 
838
  sys = ("You audit knowledge edits to a small language model. The intended edit "
839
  "is: make %s's capital '%s'. Given before/after predictions for every "
840
  "known fact, decide if the edit was SURGICAL (target changed, all other "
841
+ "facts unchanged) or caused COLLATERAL damage. Reply ONLY as JSON, no "
842
+ 'prose, no markdown fences: {"verdict":"surgical|collateral",'
843
+ '"target_changed":bool,"damaged_facts":[...],"confidence":0-1,'
844
+ '"reason":"one sentence"}.') % (subject, new_answer)
845
+ return sys, json.dumps(payload)
846
+
847
+
848
def _parse_verdict_json(text, provider_label):
    """Turn a judge model's raw reply into a short, human-readable verdict.

    The reply is expected to contain one JSON object with the keys
    ``verdict``, ``target_changed``, ``damaged_facts``, ``confidence`` and
    ``reason``. Markdown code fences, a leading ``json`` language tag and any
    prose around the object are tolerated and stripped.

    Args:
        text: raw text returned by the judge model (``None`` counts as empty).
        provider_label: tag placed in front of the verdict, e.g. ``"hf:<model>"``.

    Returns:
        A formatted multi-line string summarising the verdict.

    Raises:
        ValueError: if the reply is empty, is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``), or decodes to
            something other than a JSON object.
    """
    import json

    clean = (text or "").strip().strip("`")
    if clean.lower().startswith("json"):
        # drop the language tag left over from a ```json fence
        clean = clean[4:].strip()
    if not clean:
        raise ValueError("empty reply from judge model")

    # Keep only the outermost {...} so surrounding prose does not break parsing.
    start, end = clean.find("{"), clean.rfind("}")
    if start != -1 and end > start:
        clean = clean[start:end + 1]

    v = json.loads(clean)
    if not isinstance(v, dict):
        # A bare list/string/number is valid JSON but not a verdict object.
        raise ValueError("expected a JSON object, got %s" % type(v).__name__)

    return ("[%s] verdict=%s target_changed=%s confidence=%s\n damaged: %s\n reason: %s"
            % (provider_label, v.get("verdict"), v.get("target_changed"),
               v.get("confidence"), v.get("damaged_facts") or "none", v.get("reason")))
860
+
861
+
862
def _try_anthropic(sys, user, cfg):
    """Ask an Anthropic model to judge the edit.

    Uses the ``anthropic`` SDK when it can be imported and otherwise falls
    back to a raw HTTPS POST against the Messages API.

    Args:
        sys: system prompt.
        user: user message (the serialized before/after battery).
        cfg: provider dict; reads ``"key"`` (falls back to the
            ``ANTHROPIC_API_KEY`` environment variable) and ``"model"``.

    Returns:
        ``(report, None)`` on success, ``(None, reason)`` when no key is
        configured or the call/parsing fails.
    """
    import os, json

    key = (cfg.get("key") or "").strip()
    if not key:
        key = os.environ.get("ANTHROPIC_API_KEY", "")
    if not key:
        return None, "anthropic: no key configured"

    model_name = (cfg.get("model") or "claude-sonnet-4-6").strip()
    body = {
        "model": model_name,
        "max_tokens": 400,
        "system": sys,
        "messages": [{"role": "user", "content": user}],
    }
    try:
        try:
            import anthropic
            reply = anthropic.Anthropic(api_key=key).messages.create(**body)
            pieces = [blk.text for blk in reply.content
                      if getattr(blk, "type", "") == "text"]
        except ImportError:
            # SDK not available: talk to the HTTP endpoint directly.
            import urllib.request
            request = urllib.request.Request(
                "https://api.anthropic.com/v1/messages",
                data=json.dumps(body).encode(),
                headers={"x-api-key": key, "anthropic-version": "2023-06-01",
                         "content-type": "application/json"})
            with urllib.request.urlopen(request, timeout=30) as resp:
                decoded = json.loads(resp.read())
            pieces = [blk.get("text", "") for blk in decoded.get("content", [])
                      if blk.get("type") == "text"]
        report = _parse_verdict_json("".join(pieces), "anthropic:" + model_name)
        return report, None
    except Exception as exc:
        return None, "anthropic failed: %s" % exc
888
+
889
+
890
def _try_hf(sys, user, cfg):
    """Ask a chat model served through Hugging Face Inference to judge the edit.

    Args:
        sys: system prompt.
        user: user message (the serialized before/after battery).
        cfg: provider dict; reads ``"key"`` (falls back to the ``HF_TOKEN``
            environment variable) and ``"model"``.

    Returns:
        ``(report, None)`` on success, ``(None, reason)`` when no token is
        configured or the call/parsing fails (including a missing
        ``huggingface_hub`` package).
    """
    import os

    token = (cfg.get("key") or "").strip()
    model = (cfg.get("model") or "Qwen/Qwen2.5-7B-Instruct").strip()
    token = token or os.environ.get("HF_TOKEN", "")
    if not token:
        return None, "hf: no token configured"

    conversation = [
        {"role": "system", "content": sys},
        {"role": "user", "content": user},
    ]
    try:
        from huggingface_hub import InferenceClient
        completion = InferenceClient(model=model, token=token).chat_completion(
            messages=conversation, max_tokens=400)
        answer = completion.choices[0].message.content
        return _parse_verdict_json(answer, "hf:" + model), None
    except Exception as exc:
        return None, "hf failed: %s" % exc
908
+
909
+
910
def _local_chat_endpoint(url):
    """Return the chat-completions endpoint for an OpenAI-compatible base URL.

    Accepts the base with or without a trailing ``/v1`` (and trailing
    slashes), so ``http://host:1234`` and ``http://host:1234/v1/`` both map to
    ``http://host:1234/v1/chat/completions``.
    """
    base = url.strip().rstrip("/")
    if base.lower().endswith("/v1"):
        base = base[:-3].rstrip("/")
    return base + "/v1/chat/completions"


def _try_local(sys, user, cfg):
    """Ask any OpenAI-compatible /v1/chat/completions server to judge the edit
    - LM Studio, vLLM, Ollama (with its OpenAI shim), text-generation-webui, etc.

    Args:
        sys: system prompt.
        user: user message (the serialized before/after battery).
        cfg: provider dict; reads ``"url"`` (server base URL) and ``"model"``.

    Returns:
        ``(report, None)`` on success, ``(None, reason)`` when no URL is
        configured, the URL is not http(s), or the call/parsing fails.
    """
    import json, urllib.request

    url = (cfg.get("url") or "").strip().rstrip("/")
    if not url:
        return None, "local: no URL configured"
    # The URL comes straight from a text box: only allow plain HTTP(S) so that
    # urlopen is never pointed at file:// or other handlers.
    if not url.lower().startswith(("http://", "https://")):
        return None, "local: URL must start with http:// or https://"

    model = (cfg.get("model") or "local-model").strip()
    body = json.dumps({"model": model, "max_tokens": 400, "temperature": 0,
                       "messages": [{"role": "system", "content": sys},
                                    {"role": "user", "content": user}]}).encode()
    try:
        req = urllib.request.Request(
            _local_chat_endpoint(url), data=body,
            headers={"content-type": "application/json"})
        with urllib.request.urlopen(req, timeout=20) as r:
            data = json.loads(r.read())
        text = data["choices"][0]["message"]["content"]
        return _parse_verdict_json(text, "local:" + model + "@" + url), None
    except Exception as e:
        return None, "local failed: %s" % e
931
+
932
+
933
def _llm_judge_chain(before, after, subject, new_answer, providers):
    """Run the judge prompt through the providers in order; first success wins.

    Args:
        before, after: per-fact prediction batteries taken before/after the edit.
        subject, new_answer: the intended edit, embedded in the judge prompt.
        providers: list of dicts, each with a ``"type"`` of ``"anthropic"``,
            ``"hf"`` or ``"local"`` plus that provider's settings. Entries
            with an unknown type are ignored.

    Returns:
        The first successful provider's report, prefixed with a
        ``(skipped: ...)`` line when earlier providers failed, or a message
        listing every failure when none succeeded.
    """
    system_prompt, user_prompt = _build_judge_prompt(before, after, subject, new_answer)
    handlers = {"anthropic": _try_anthropic, "hf": _try_hf, "local": _try_local}
    failures = []

    for provider in providers:
        handler = handlers.get(provider["type"])
        if handler is None:
            continue
        verdict, error = handler(system_prompt, user_prompt, provider)
        if verdict is None:
            failures.append(error)
            continue
        if failures:
            # tell the reader which earlier providers were passed over, and why
            return ("(skipped: %s)\n" % "; ".join(failures)) + verdict
        return verdict

    return ("all providers unavailable:\n " + "\n ".join(failures) +
            "\n(configure at least one: Anthropic key, HF token, or a local "
            "OpenAI-compatible server URL like http://192.168.188.25:1234)")
950
+
951
+
952
+ # --- session log: every edit+verify run is appended here as JSON, so you can
953
+ # download it, or paste the markdown block straight into a future chat with
954
+ # Claude for review ("did all work, here's the log").
955
+ SESSION_LOG = []
956
+
957
+
958
def _log_session(subject, new_answer, method, strength, before, after,
                 eff, collateral, max_drift, surgical, llm_report):
    """Append one edit+verify run to the module-level ``SESSION_LOG`` list.

    Args:
        subject, new_answer, method, strength: the edit that was requested.
        before, after: per-fact batteries; each entry's ``"top1"`` and
            ``"p_orig"`` are recorded (``p_orig`` rounded to 4 places).
        eff: efficacy result, stored as a bool.
        collateral: collateral-damage value from the verifier, stored as-is.
        max_drift: largest drift seen, rounded to 4 places.
        surgical: truthy -> verdict ``"SURGICAL"``, else ``"COLLATERAL"``.
        llm_report: LLM review text; an empty value is stored as ``None``.
    """
    import datetime

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # time and drop the tzinfo so the stored string keeps its "...Z" shape.
    now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

    def _snapshot(battery):
        # keep only the fields worth persisting for each fact
        return {c: {"top1": battery[c]["top1"],
                    "p_orig": round(battery[c]["p_orig"], 4)}
                for c in battery}

    SESSION_LOG.append({
        "ts": now_utc.isoformat() + "Z",
        "subject": subject, "new_answer": new_answer, "method": method,
        "strength": strength, "efficacy_pass": bool(eff),
        "collateral": collateral, "max_drift": round(max_drift, 4),
        "verdict": "SURGICAL" if surgical else "COLLATERAL",
        "before": _snapshot(before),
        "after": _snapshot(after),
        "llm_review": llm_report or None,
    })
973
+
974
+
975
def export_session_log():
    """Write ``SESSION_LOG`` to disk as JSON plus a paste-ready Markdown summary.

    Both files go to ``/mnt/user-data/outputs`` (created if missing).

    Returns:
        ``(json_path, status_message)`` on success, or ``(None, message)``
        when there is nothing to export or the files cannot be written.
    """
    import json, os

    if not SESSION_LOG:
        return None, "No edits run yet this session - nothing to export."

    path = "/mnt/user-data/outputs/edit_session_log.json"
    md_path = "/mnt/user-data/outputs/edit_session_log.md"

    # also a markdown rendition meant to be pasted straight into a chat
    md = ["# Edit session log\n"]
    for i, e in enumerate(SESSION_LOG, 1):
        md.append("## Edit %d - %s (%s, %s, strength=%s)\n" %
                  (i, e["verdict"], e["subject"] + "->" + e["new_answer"],
                   e["method"], e["strength"]))
        md.append("- efficacy: %s, max collateral drift: %.4f, damaged: %s" %
                  ("pass" if e["efficacy_pass"] else "fail", e["max_drift"],
                   e["collateral"] or "none"))
        if e["llm_review"]:
            md.append("- LLM review: " + e["llm_review"].replace("\n", " "))
        md.append("")

    try:
        os.makedirs("/mnt/user-data/outputs", exist_ok=True)
        # Context managers close the handles deterministically; explicit UTF-8
        # keeps non-ASCII LLM review text from failing on a non-UTF-8 locale.
        with open(path, "w", encoding="utf-8") as fh:
            json.dump(SESSION_LOG, fh, indent=2)
        with open(md_path, "w", encoding="utf-8") as fh:
            fh.write("\n".join(md))
    except OSError as err:
        # report in the status field instead of raising into the UI
        return None, "Could not write session log: %s" % err

    return path, "Wrote %d edit(s) to %s and %s" % (len(SESSION_LOG), path, md_path)
997
 
998
 
999
  # =============================================================================
 
1129
  return "Upload failed: %s" % e
1130
 
1131
 
1132
+ # --- upload a REAL model (e.g. a VINDEX-edited Llama checkpoint), not the toy.
1133
+ # This does NOT load the model into memory (multi-GB Llama weights don't need
1134
+ # to round-trip through Python) - it just pushes whatever's already on disk.
1135
+ # Point it at the local folder produced by your save_pretrained()/VINDEX run:
1136
+ # expects the usual HF layout (config.json + .safetensors shards + tokenizer
1137
+ # files). Note: gated models (e.g. meta-llama/*) require the destination repo
1138
+ # to either be your own namespace or one you have write access to - the Hub's
1139
+ # license gate is independent of this upload step.
1140
def upload_local_checkpoint(local_dir, repo_id, token, private, commit_message):
    """Push an on-disk checkpoint folder to a Hugging Face model repo.

    The folder is uploaded as-is; nothing is loaded into memory. A warning is
    prepended to the result when the folder has no ``config.json`` or no
    top-level ``.safetensors``/``.bin`` file, but the upload still proceeds.

    Args:
        local_dir: path of the checkpoint folder on this machine.
        repo_id: destination repo id.
        token: HF write token; falls back to the ``HF_TOKEN`` env variable.
        private: whether a newly created repo should be private.
        commit_message: commit message; defaults to ``"upload checkpoint"``.

    Returns:
        A status string: a validation message, a success line with up to 12
        file names, or ``"Upload failed: ..."``.
    """
    import os
    try:
        from huggingface_hub import HfApi
    except ImportError:
        return "huggingface_hub not installed. `pip install huggingface_hub`."

    folder = (local_dir or "").strip()
    target = (repo_id or "").strip()

    # --- input validation, cheapest checks first ---
    if not (folder and os.path.isdir(folder)):
        return "local_dir %r does not exist or is not a directory." % folder
    if not target:
        return "Enter a repo id like 'Chris4K/vindex-llama3-edited'."
    auth = (token or "").strip() or os.environ.get("HF_TOKEN", "")
    if not auth:
        return "No HF token. Paste a write token or set HF_TOKEN."

    # --- sanity check of the folder layout (warn only, never block) ---
    config_present = os.path.exists(os.path.join(folder, "config.json"))
    weights_present = any(name.endswith((".safetensors", ".bin"))
                          for name in os.listdir(folder))
    if config_present and weights_present:
        warn = ""
    else:
        warn = ("WARNING: folder is missing config.json or weight files - this may "
                "not be a loadable HF checkpoint. Uploading anyway.\n")

    api = HfApi(token=auth)
    try:
        api.create_repo(target, repo_type="model", private=bool(private), exist_ok=True)
        api.upload_folder(folder_path=folder, repo_id=target, repo_type="model",
                          commit_message=(commit_message or "upload checkpoint").strip())
        shown = ", ".join(sorted(os.listdir(folder))[:12])
        summary = "Uploaded %s -> https://huggingface.co/%s\nFiles: %s" % (folder, target, shown)
        return warn + summary
    except Exception as exc:
        return warn + "Upload failed: %s" % exc
1170
+
1171
+
1172
  # =============================================================================
1173
  # UI
1174
  # =============================================================================
 
1334
  gr.Markdown("""
1335
  ### Edit a fact, then prove nothing else broke
1336
  **What it does:** rewrites the value one fact-MLP key maps to (the exact thing
1337
+ ROME/MEMIT do on real models — this is a literal `nn.Module` weight tensor,
1338
+ not a token or vocab change), then runs a verification battery over **every**
1339
  known fact to measure **efficacy** (target changed), **specificity** (others
1340
  untouched), and **fluency** (no entropy collapse).
1341
 
1342
  **Two methods, on purpose:**
1343
  - `rank1` — the minimal, surgical update. Only the target fact moves → **SURGICAL**.
1344
+ - `broadcast` — a deliberately sloppy edit that smears the change across all facts → the harness catches the **COLLATERAL DAMAGE**. This proves the verifier actually works, not just reports "ok" by default.
1345
 
1346
+ **Independent LLM review, with a fallback chain — not locked to one vendor:**
1347
+ tick the box and it tries, in order: **Anthropic** (Claude, if you give a key)
1348
+ → **Hugging Face Inference** (any hosted chat model, if you give an HF token)
1349
+ → **your own local server** (LM Studio / vLLM / Ollama's OpenAI shim — anything
1350
+ exposing `/v1/chat/completions`). The first one that's configured *and*
1351
+ reachable answers; the rest are skipped and noted. So your own RTX 5090 can
1352
+ be the judge with zero cloud calls if you just fill in the local URL.
1353
 
1354
  Subjects: `france`, `germany`, `japan`. Answers: `paris, berlin, tokyo, london, rome`.
1355
  After editing, the model stays edited β€” go look at it in tabs 1–5 (the logit lens
1356
  will show the new answer rising; the trace still localises to L0). Hit **Reset**
1357
+ to restore. Every run is appended to a session log you can download below and
1358
+ paste into a future chat for review.
1359
  """)
1360
  with gr.Row():
1361
  ed_subj = gr.Textbox(value="france", label="subject")
1362
  ed_new = gr.Textbox(value="london", label="new answer")
1363
  ed_method = gr.Radio(["rank1", "broadcast"], value="rank1", label="method")
1364
  ed_strength = gr.Slider(0.2, 2.0, value=1.0, step=0.1, label="strength")
1365
+ ed_llm = gr.Checkbox(value=False, label="also run an independent LLM review")
1366
+ with gr.Accordion("LLM review providers (tried in this order)", open=False):
1367
+ with gr.Row():
1368
+ ed_a_model = gr.Textbox(value="claude-sonnet-4-6", label="1. Anthropic model")
1369
+ ed_a_key = gr.Textbox(value="", label="Anthropic API key", type="password")
1370
+ with gr.Row():
1371
+ ed_h_model = gr.Textbox(value="Qwen/Qwen2.5-7B-Instruct",
1372
+ label="2. HF Inference model")
1373
+ ed_h_key = gr.Textbox(value="", label="HF token", type="password")
1374
+ with gr.Row():
1375
+ ed_l_url = gr.Textbox(value="http://192.168.188.25:1234",
1376
+ label="3. Local server URL (LM Studio etc.)")
1377
+ ed_l_model = gr.Textbox(value="local-model", label="local model name")
1378
+ ed_out = gr.Textbox(label="edit + verification report", lines=24)
1379
  with gr.Row():
1380
  gr.Button("Edit & verify", variant="primary").click(
1381
  edit_and_verify,
1382
+ [ed_subj, ed_new, ed_method, ed_strength, ed_llm,
1383
+ ed_a_key, ed_a_model, ed_h_key, ed_h_model, ed_l_url, ed_l_model],
1384
+ ed_out)
1385
  gr.Button("Reset model").click(reset_glassbox, outputs=ed_out)
1386
+ gr.Markdown("**Session log** (every edit run above, appended):")
1387
+ with gr.Row():
1388
+ log_btn = gr.Button("Write session log to disk")
1389
+ log_file = gr.File(label="download")
1390
+ log_status = gr.Markdown()
1391
+ log_btn.click(lambda: export_session_log(), outputs=[log_file, log_status])
1392
 
1393
  # ---- TAB 7 -------------------------------------------------------------
1394
  with gr.Tab("7 Β· Export / Upload to HF"):
1395
  gr.Markdown("""
1396
+ ### Ship the toy glass-box
1397
  **Export** writes a self-contained, reloadable repo: weights (`safetensors`),
1398
  `config.json`, `vocab.json`, a standalone `modeling_glassbox.py` (reload with
1399
  `from modeling_glassbox import load`), and a model card.
1400
 
1401
+ **Upload** pushes it to the Hub. Choose `model`, `space` (this whole app,
1402
+ runnable), or `both`. Paste a **write** token (or set `HF_TOKEN`).
 
 
 
 
1403
  """)
1404
  with gr.Row():
1405
  hf_repo = gr.Textbox(value="Chris4K/glassbox-interp", label="repo id")
 
1412
  gr.Button("Upload to HF", variant="primary").click(
1413
  upload_to_hf, [hf_repo, hf_token, hf_what], hf_out)
1414
 
1415
+ gr.Markdown("""
1416
+ ---
1417
+ ### Upload a REAL model — e.g. your VINDEX-edited Llama checkpoint
1418
+ This does **not** load the model into memory and does **not** assume any
1419
+ particular architecture β€” it just pushes whatever's already on disk at
1420
+ `local_dir` (the usual `save_pretrained()` layout: `config.json` +
1421
+ `*.safetensors` shards + tokenizer files) straight to a new repo. Large
1422
+ weights upload fine through `upload_folder`; for very large repos consider
1423
+ installing `hf_transfer` for faster throughput. If the base model is gated
1424
+ (e.g. `meta-llama/*`), the gate applies to the destination repo's license
1425
+ settings, not to this upload step.
1426
+ """)
1427
+ with gr.Row():
1428
+ rc_dir = gr.Textbox(value="", label="local checkpoint folder (on this machine)")
1429
+ rc_repo = gr.Textbox(value="", label="destination repo id, e.g. Chris4K/vindex-llama3-edited")
1430
+ with gr.Row():
1431
+ rc_token = gr.Textbox(value="", label="HF write token (optional)", type="password")
1432
+ rc_private = gr.Checkbox(value=True, label="private repo")
1433
+ rc_msg = gr.Textbox(value="upload edited checkpoint", label="commit message")
1434
+ rc_out = gr.Textbox(label="result", lines=6)
1435
+ gr.Button("Upload real checkpoint", variant="primary").click(
1436
+ upload_local_checkpoint, [rc_dir, rc_repo, rc_token, rc_private, rc_msg], rc_out)
1437
+
1438
  gr.Markdown("""
1439
  ---
1440
  ### Where this goes next
1441
+ - **Closing the loop (what "self-improving" would actually require):** right now a human picks every edit; the verifier just grades it. A real closed loop needs a policy that *proposes* edits on its own (e.g. scanning eval failures for wrong facts), auto-applies, and auto-commits only on a SURGICAL verdict, rolling back otherwise. The hard part β€” the verifier β€” already exists here; the proposal step doesn't yet.
1442
+ - **A training-method angle worth taking seriously:** instead of accept/reject after the fact, feed the specificity battery's drift score back as a regularizer *during* the edit computation (closer to elastic weight consolidation, or the null-space projection AlphaEdit-style methods use) so collateral is penalized while solving, not caught after.
1443
+ - **Real-model MEMIT:** the edit loop here is exact because the glass-box's fact layer is literally key→value. The same verify harness (efficacy / specificity / fluency + the multi-provider LLM judge) ports straight onto a gpt2/Llama MEMIT edit — the toy is the regression test you run first.
1444
+ - **Multi-hop & paraphrase generalization:** add `"the currency of france is"` so two relations share a subject, and have the LLM judge auto-generate paraphrase probes to test that an edit generalizes, not just memorizes the one prompt.
1445
  - **Attribution view:** Geva-style "what does this neuron write to the vocab", per-head attention attribution.
1446
+ - **It already ships:** tab 7 pushes the toy model and this whole app (as a Space) to your Hub, or a real local checkpoint folder to its own repo.
1447
  """)
1448
 
1449
  demo.load(lambda: load_model("glassbox"), outputs=load_status)