AsadIsmail committed on
Commit
8aaa576
·
verified ·
1 Parent(s): 8e6525d

Update PRISM-Memory Space bundle

Browse files
README.md CHANGED
@@ -12,8 +12,20 @@ pinned: false
12
 
13
  **Hook:** Turn conversations into durable, searchable memory.
14
 
15
- This Space is the lightweight public demo for the single released
16
- `PRISM-Memory` extraction skill. It shows the best checkpoint only.
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  ## Inputs
19
 
@@ -21,13 +33,21 @@ The app reads:
21
 
22
  - `results/confirmed_exp15_summary.json`
23
  - `results/scenario_comparisons.json`
24
- - `MEMORY_EXTRACTION_SKILL.md`
 
 
 
 
 
 
25
 
26
  ## What It Shows
27
 
28
  1. The confirmed metrics for the released checkpoint
29
  2. Selected benchmark cases showing strengths and failure modes
30
- 3. The single canonical memory extraction skill to keep
 
 
31
 
32
  ## Local Run
33
 
@@ -35,3 +55,18 @@ The app reads:
35
  python -m pip install -r requirements.txt
36
  python app.py
37
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  **Hook:** Turn conversations into durable, searchable memory.
14
 
15
+ This Space is the public demo for the single released `PRISM-Memory`
16
+ checkpoint: `exp15_sft_qwen7b_4ep`, a LoRA adapter on top of
17
+ `Qwen/Qwen2.5-7B-Instruct`.
18
+
19
+ It matches the root repo story:
20
+
21
+ - one released checkpoint
22
+ - one extraction behavior
23
+ - one set of confirmed benchmark results
24
+ - one compact explanation of the training data and held-out examples
25
+
26
+ Within the GitHub repo, this directory is a first-class public surface. The
27
+ helper script `scripts/deploy_space.sh` bundles this folder with the release
28
+ artifacts needed for Hugging Face.
29
 
30
  ## Inputs
31
 
 
33
 
34
  - `results/confirmed_exp15_summary.json`
35
  - `results/scenario_comparisons.json`
36
+ - `results/readme_extraction_examples.json`
37
+ - `docs/release/extraction-skill.md`
38
+ - `docs/release/datasets.md`
39
+
40
+ When copied into a standalone Hugging Face Space repo, keep those files beside
41
+ `app.py` and `requirements.txt`, preserving the `docs/release/` and `results/`
42
+ subdirectories.
43
 
44
  ## What It Shows
45
 
46
  1. The confirmed metrics for the released checkpoint
47
  2. Selected benchmark cases showing strengths and failure modes
48
+ 3. Side-by-side held-out extraction examples against the GPT-4.1 reference
49
+ 4. A compact description of the synthetic conversation data and SFT labels
50
+ 5. The single canonical memory extraction skill to keep
51
 
52
  ## Local Run
53
 
 
55
  python -m pip install -r requirements.txt
56
  python app.py
57
  ```
58
+
59
+ ## From The Repo Root
60
+
61
+ ```bash
62
+ bash scripts/deploy_space.sh
63
+ ```
64
+
65
+ Useful bundled files:
66
+
67
+ - `docs/release/extraction-skill.md`
68
+ - `docs/release/datasets.md`
69
+ - `docs/release/extraction-examples.md`
70
+ - `docs/release/release-results.md`
71
+ - `results/confirmed_exp15_summary.json`
72
+ - `results/readme_extraction_examples.json`
app.py CHANGED
@@ -7,14 +7,32 @@ import gradio as gr
7
  import pandas as pd
8
 
9
  APP_DIR = Path(__file__).resolve().parent
10
- if (APP_DIR / "MEMORY_EXTRACTION_SKILL.md").exists() or (APP_DIR / "results").exists():
11
- ROOT = APP_DIR
12
- else:
13
- ROOT = APP_DIR.parent
 
 
 
 
 
 
 
 
 
 
14
  RESULTS_DIR = ROOT / "results"
15
  SUMMARY_PATH = RESULTS_DIR / "confirmed_exp15_summary.json"
16
  SCENARIO_PATH = RESULTS_DIR / "scenario_comparisons.json"
17
- SKILL_PATH = ROOT / "MEMORY_EXTRACTION_SKILL.md"
 
 
 
 
 
 
 
 
18
  LOCOMO_CATEGORY_NAMES = {
19
  "1": "factual",
20
  "2": "temporal",
@@ -38,6 +56,22 @@ def _load_json(path: Path, default):
38
  return json.loads(path.read_text())
39
 
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  def _load_summary() -> dict:
42
  return _load_json(SUMMARY_PATH, {"results": [], "failures": []})
43
 
@@ -46,10 +80,16 @@ def _load_scenarios() -> dict:
46
  return _load_json(SCENARIO_PATH, {"scenarios": []})
47
 
48
 
 
 
 
 
49
  def _load_skill() -> str:
50
- if not SKILL_PATH.exists():
51
- return "Skill document not found."
52
- return SKILL_PATH.read_text()
 
 
53
 
54
 
55
  def _best_result() -> dict | None:
@@ -62,13 +102,23 @@ def release_markdown() -> str:
62
  if not item:
63
  return "## No confirmed release result yet"
64
  checkpoint = Path(item["checkpoint"]).name
65
- return "\n\n".join(
 
 
66
  [
67
  "# PRISM-Memory",
 
68
  "**Turn conversations into durable, searchable memory.**",
 
69
  f"Released checkpoint: `{checkpoint}`",
70
- f"Confirmed LoCoMo: `{item['locomo']['mean']:.3f}`",
71
- f"Confirmed LongMemEval: `{item['lme']['mean']:.3f}`",
 
 
 
 
 
 
72
  ]
73
  )
74
 
@@ -151,6 +201,43 @@ def render_scenario(choice: str):
151
  return "\n".join(header), table
152
 
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  with gr.Blocks(title="PRISM-Memory Demo") as demo:
155
  gr.Markdown(release_markdown())
156
 
@@ -170,6 +257,16 @@ with gr.Blocks(title="PRISM-Memory Demo") as demo:
170
  picker.change(render_scenario, inputs=picker, outputs=[scenario_md, scenario_table])
171
  demo.load(fn=lambda: render_scenario(choices[0]), outputs=[scenario_md, scenario_table])
172
 
 
 
 
 
 
 
 
 
 
 
173
  with gr.Tab("Skill"):
174
  gr.Markdown(_load_skill())
175
 
 
7
  import pandas as pd
8
 
9
  APP_DIR = Path(__file__).resolve().parent
10
+
11
+
12
def _resolve_root() -> Path:
    """Pick the directory that holds the release artifacts the app reads.

    The app directory is tried first, then its parent. A directory qualifies
    as soon as it contains any one of the known marker files. When neither
    directory qualifies, the parent of the app directory is returned.
    """
    # Relative locations whose presence identifies a bundle root.
    markers = (
        Path("results") / "confirmed_exp15_summary.json",
        Path("docs") / "release" / "extraction-skill.md",
        Path("MEMORY_EXTRACTION_SKILL.md"),
    )
    for base in (APP_DIR, APP_DIR.parent):
        if any((base / marker).exists() for marker in markers):
            return base
    return APP_DIR.parent
21
+
22
+
23
# Every artifact the app reads is addressed relative to the detected root.
ROOT = _resolve_root()
RESULTS_DIR = ROOT.joinpath("results")
SUMMARY_PATH = RESULTS_DIR.joinpath("confirmed_exp15_summary.json")
SCENARIO_PATH = RESULTS_DIR.joinpath("scenario_comparisons.json")
README_EXAMPLE_PATH = RESULTS_DIR.joinpath("readme_extraction_examples.json")
# Candidate lists are ordered by preference: the docs/release/ location first,
# then a file of the upper-case name directly under ROOT.
SKILL_CANDIDATES = [
    ROOT.joinpath("docs", "release", "extraction-skill.md"),
    ROOT.joinpath("MEMORY_EXTRACTION_SKILL.md"),
]
DATASET_CANDIDATES = [
    ROOT.joinpath("docs", "release", "datasets.md"),
    ROOT.joinpath("DATASETS.md"),
]
36
  LOCOMO_CATEGORY_NAMES = {
37
  "1": "factual",
38
  "2": "temporal",
 
56
  return json.loads(path.read_text())
57
 
58
 
59
+ def _clean_markdown(text: str) -> str:
60
+ lines = text.splitlines()
61
+ if lines and lines[0].startswith("[Back to Repo]"):
62
+ lines = lines[1:]
63
+ while lines and not lines[0].strip():
64
+ lines = lines[1:]
65
+ return "\n".join(lines).strip()
66
+
67
+
68
+ def _load_markdown(candidates: list[Path], fallback: str) -> str:
69
+ for path in candidates:
70
+ if path.exists():
71
+ return _clean_markdown(path.read_text())
72
+ return fallback
73
+
74
+
75
def _load_summary() -> dict:
    """Load ``SUMMARY_PATH`` via ``_load_json``, passing an empty results/failures skeleton as the default."""
    empty_summary = {"results": [], "failures": []}
    return _load_json(SUMMARY_PATH, empty_summary)
77
 
 
80
  return _load_json(SCENARIO_PATH, {"scenarios": []})
81
 
82
 
83
def _load_readme_examples() -> dict:
    """Load ``README_EXAMPLE_PATH`` via ``_load_json``, passing an empty ``examples`` list as the default."""
    no_examples = {"examples": []}
    return _load_json(README_EXAMPLE_PATH, no_examples)
85
+
86
+
87
def _load_skill() -> str:
    """Return the extraction-skill Markdown, or a short notice when no candidate file is found."""
    missing_notice = "Skill document not found."
    return _load_markdown(SKILL_CANDIDATES, fallback=missing_notice)
89
+
90
+
91
def _load_datasets() -> str:
    """Return the dataset-summary Markdown, or a short notice when no candidate file is found."""
    missing_notice = "Dataset summary not found."
    return _load_markdown(DATASET_CANDIDATES, fallback=missing_notice)
93
 
94
 
95
  def _best_result() -> dict | None:
 
102
  if not item:
103
  return "## No confirmed release result yet"
104
  checkpoint = Path(item["checkpoint"]).name
105
+ locomo = item["locomo"]["mean"]
106
+ lme = item["lme"]["mean"]
107
+ return "\n".join(
108
  [
109
  "# PRISM-Memory",
110
+ "",
111
  "**Turn conversations into durable, searchable memory.**",
112
+ "",
113
  f"Released checkpoint: `{checkpoint}`",
114
+ "Base model: `Qwen/Qwen2.5-7B-Instruct`",
115
+ "",
116
+ "| Benchmark | PRISM-Memory `sft4` | GPT-4.1-based PropMem reference |",
117
+ "|---|---:|---:|",
118
+ f"| LongMemEval | `{lme:.3f}` | `0.465` |",
119
+ f"| LoCoMo | `{locomo:.3f}` | `0.536` |",
120
+ "",
121
+ "This Space shows the public release only: confirmed metrics, held-out benchmark cases, side-by-side extraction examples, the training-data summary, and the canonical extraction skill.",
122
  ]
123
  )
124
 
 
201
  return "\n".join(header), table
202
 
203
 
204
+ def _readme_example_label(item: dict) -> str:
205
+ return item["title"]
206
+
207
+
208
def readme_example_choices() -> list[str]:
    """List the dropdown labels of all loaded extraction examples, in file order."""
    payload = _load_readme_examples()
    return list(map(_readme_example_label, payload.get("examples", [])))
211
+
212
+
213
def render_readme_example(choice: str) -> str:
    """Render one held-out extraction example as a Markdown snippet.

    Args:
        choice: The example's dropdown label (its title) or its ``id``. When
            nothing matches, the first example is rendered instead.

    Returns:
        Markdown with the example's metadata, the quoted user turn, and the
        GPT-4.1 and PRISM-Memory proposition lists; or a short notice when no
        examples are loaded.

    Raises:
        KeyError: If the selected example lacks ``title``, ``session_date``,
            ``overlap_score``, ``note`` or ``user_message``.
    """
    examples = _load_readme_examples().get("examples", [])
    if not examples:
        return "No extraction examples available yet."

    # "id" is optional for matching: entries without one are matched by label
    # only instead of raising KeyError while scanning.
    item = next(
        (
            example
            for example in examples
            if _readme_example_label(example) == choice
            or ("id" in example and example["id"] == choice)
        ),
        examples[0],
    )

    # Two trailing spaces force a Markdown hard line break; without them the
    # three metadata lines collapse into a single paragraph when rendered.
    hard_break = "  "
    # Prefix every line of the turn so a multi-line message stays inside the
    # blockquote rather than spilling out after its first line.
    turn_lines = str(item["user_message"]).splitlines() or [""]
    quoted_turn = "\n".join(f"> {line}" for line in turn_lines)

    body = [
        f"### {item['title']}",
        "",
        f"**Session date:** `{item['session_date']}`" + hard_break,
        f"**Overlap score:** `{item['overlap_score']:.3f}`" + hard_break,
        f"**What this example shows:** {item['note']}",
        "",
        "**Turn**",
        "",
        quoted_turn,
        "",
        "**GPT-4.1 reference**",
    ]
    body.extend(f"- {entry}" for entry in item.get("gpt41_reference", []))
    body.extend(["", "**PRISM-Memory `sft4`**"])
    body.extend(f"- {entry}" for entry in item.get("prism_memory", []))
    return "\n".join(body)
239
+
240
+
241
  with gr.Blocks(title="PRISM-Memory Demo") as demo:
242
  gr.Markdown(release_markdown())
243
 
 
257
  picker.change(render_scenario, inputs=picker, outputs=[scenario_md, scenario_table])
258
  demo.load(fn=lambda: render_scenario(choices[0]), outputs=[scenario_md, scenario_table])
259
 
260
+ with gr.Tab("Extraction Examples"):
261
+ example_choices = readme_example_choices() or ["pending"]
262
+ example_picker = gr.Dropdown(choices=example_choices, value=example_choices[0], label="Held-Out Example")
263
+ example_md = gr.Markdown()
264
+ example_picker.change(render_readme_example, inputs=example_picker, outputs=example_md)
265
+ demo.load(fn=lambda: render_readme_example(example_choices[0]), outputs=example_md)
266
+
267
+ with gr.Tab("Data"):
268
+ gr.Markdown(_load_datasets())
269
+
270
  with gr.Tab("Skill"):
271
  gr.Markdown(_load_skill())
272
 
docs/release/datasets.md ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [Back to Repo](../../README.md) · [Release Docs](README.md) · [Release Results](release-results.md)
2
+
3
+ # PRISM-Memory Datasets
4
+
5
+ This file separates the data used by the public `PRISM-Memory` release from the
6
+ auxiliary datasets that were only useful for ablations.
7
+
8
+ ## Released Training Recipe
9
+
10
+ The released checkpoint is `exp15_sft_qwen7b_4ep`.
11
+
12
+ The core recipe was:
13
+
14
+ 1. Start from `Qwen/Qwen2.5-7B-Instruct`.
15
+ 2. Fine-tune with LoRA on a `20k` sample from `train_sft.jsonl`.
16
+ 3. Evaluate on held-out `LoCoMo` and held-out `LongMemEval`.
17
+
18
+ ## Source Conversations
19
+
20
+ The underlying synthetic conversation source lives in the upstream
21
+ `better_memory/data/output/` directory.
22
+
23
+ | File | Kind | Split | Notes |
24
+ |---|---|---|---|
25
+ | `train.jsonl` | raw conversations | train | `2,329` synthetic multi-session conversations |
26
+ | `eval.jsonl` | raw conversations | eval | `584` held-out synthetic multi-session conversations |
27
+ | `metadata.json` | split metadata | all | counts by tier, agent type, and update regime |
28
+
29
+ The source generator was built to create long-horizon memory stress cases with
30
+ inserts, updates, deletes, and multi-session recall.
31
+
32
+ ## Derived SFT Data
33
+
34
+ These are GPT-4.1-derived proposition labels built on top of the raw
35
+ conversations.
36
+
37
+ | File | Examples | Role | Release Status |
38
+ |---|---|---|---|
39
+ | `train_sft.jsonl` | `100,427` | primary SFT data | core release data |
40
+ | `train_sft_clean_merged.jsonl` | `20,000` | cleaned resume base matching `sft4` distribution | good follow-on base |
41
+ | `train_sft_temporal_resolved.jsonl` | `2,643` | temporal-fix add-on set | useful for targeted research, not the public base |
42
+ | `eval_sft.jsonl` | reference | GPT-4.1 PropMem extractions on eval conversations | evaluation reference only |
43
+
44
+ ## Evaluation Surfaces
45
+
46
+ The released model was evaluated on two held-out surfaces:
47
+
48
+ | Benchmark | Held-out Surface | Notes |
49
+ |---|---|---|
50
+ | `LoCoMo` | conversations `conv-49` and `conv-50` | five categories: factual, temporal, inferential, multi-hop, adversarial |
51
+ | `LongMemEval` | held-out items stratified by question type | six categories, including temporal reasoning and knowledge updates |
52
+
53
+ Both the GPT-4.1 extraction baseline and the released 7B extractor were scored
54
+ with the same GPT-4.1 QA evaluator and the same cache-backed answer surface.
55
+
56
+ ## Auxiliary LoCoMo Datasets
57
+
58
+ These files were used in ablations and targeted probes. They matter for the
59
+ research story, but they are not the main public training recipe.
60
+
61
+ | File | Examples | Intended Use | Outcome |
62
+ |---|---|---|---|
63
+ | `locomo_qa_supervised_factual.jsonl` | `512` | factual QA supervision | neutral to small benefit |
64
+ | `locomo_qa_supervised_multihop.jsonl` | `625` | multihop QA supervision | neutral to small benefit |
65
+ | `locomo_qa_supervised_temporal.jsonl` | `248` | temporal QA supervision with absolute dates | neutral to small benefit |
66
+ | `locomo_qa_supervised_inferential.jsonl` | `133` | inferential QA supervision | too small, hurt balance |
67
+ | `locomo_qa_supervised_temporal_relformat.jsonl` | `248` | temporal QA with benchmark-style relative dates | hurt |
68
+ | `locomo_sft_extra.jsonl` | `2,645` | LoCoMo-domain SFT add-on | hurt |
69
+ | `locomo_sft_extra_relformat.jsonl` | `3,178` | relative-date LoCoMo SFT add-on | hurt |
70
+
71
+ ## Practical Takeaways
72
+
73
+ 1. The best 7B model came from the stable `20k` `train_sft.jsonl` base, not
74
+ from aggressive benchmark-specific add-ons.
75
+ 2. Training on LoCoMo-domain conversations did not help generalization.
76
+ 3. Relative-date output hacks made the extractor worse.
77
+ 4. More original LME data was not automatically better because noisy temporal
78
+ labels compounded the anchor-loss problem.
79
+
80
+ Related docs:
81
+
82
+ - [extraction-skill.md](extraction-skill.md)
83
+ - [release-results.md](release-results.md)
84
+ - [technical-blog.md](technical-blog.md)
docs/release/extraction-examples.md ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [Back to Repo](../../README.md) · [Release Docs](README.md) · [Result Artifacts](../../results/README.md)
2
+
3
+ # PRISM-Memory Extraction Examples
4
+
5
+ Selected held-out examples from the original Exp15 `eval_sft.jsonl` corpus.
6
+ The `GPT-4.1 reference` rows come from the original SFT target propositions.
7
+ The `PRISM-Memory` rows were regenerated from `exp15_sft_qwen7b_4ep` with greedy decoding using the same extraction prompt family used during evaluation.
8
+
9
+ These examples are illustrations, not the benchmark itself. Use
10
+ [release-results.md](release-results.md) for the aggregate numbers.
11
+
12
+ ## Infrastructure bottlenecks stay structured
13
+
14
+ - Overlap score: `0.909`
15
+ - Session date: `2025-01-04 15:34:00`
16
+ - Note: Near-exact match on two operational facts from a single held-out turn.
17
+
18
+ **Turn**
19
+
20
+ > yeah, no real caching beyond basic Docker layer caching. Jenkins nodes have limited capacity, and we sometimes hit queue delays during peak commits.
21
+
22
+ **GPT-4.1 reference**
23
+
24
+ - No caching beyond basic Docker layer caching
25
+ - Jenkins nodes have limited capacity and experience queue delays during peak commits
26
+
27
+ **PRISM-Memory `sft4`**
28
+
29
+ - No Docker caching beyond basic layer caching
30
+ - Jenkins nodes have limited capacity; peak commits cause queue delays
31
+
32
+ ## Numeric constraints and preferences survive extraction
33
+
34
+ - Overlap score: `0.800`
35
+ - Session date: `2025-03-01 15:07:00`
36
+ - Note: The trained model keeps both the hard concurrency cap and the desired notification style.
37
+
38
+ **Turn**
39
+
40
+ > yeah, I think starting with incremental scans and parallel matrix jobs makes sense. We have 20 concurrent jobs max on GitHub Actions currently. Also want to keep Slack notifications from Snyk consistent with other pipeline alerts—aggregated and concise. Can you help draft the workflow?
41
+
42
+ **GPT-4.1 reference**
43
+
44
+ - GitHub Actions concurrency limit: 20 concurrent jobs
45
+ - Wants Snyk Slack notifications aggregated and concise, consistent with other pipeline alerts
46
+
47
+ **PRISM-Memory `sft4`**
48
+
49
+ - GitHub Actions concurrency limit: 20 concurrent jobs
50
+ - Snyk Slack notifications should be aggregated and concise
51
+
52
+ ## Current state and next-step plans stay grouped
53
+
54
+ - Overlap score: `0.799`
55
+ - Session date: `2025-09-09 15:26:00`
56
+ - Note: The extracted memory keeps the current configuration, the planned security step, and the rollout strategy in one compact bundle.
57
+
58
+ **Turn**
59
+
60
+ > yeah good point about resource overhead, we set CPU limits for all sidecars and monitor with Prometheus now. no mTLS yet, but it’s on the roadmap for phase two. as for routing, we want to start with canary deployments and traffic splitting, maybe some basic fault injection for testing.
61
+
62
+ **GPT-4.1 reference**
63
+
64
+ - Istio sidecar CPU limits set and monitored via Prometheus
65
+ - mTLS planned in phase two
66
+ - Plan to use canary deployments, traffic splitting, and basic fault injection
67
+
68
+ **PRISM-Memory `sft4`**
69
+
70
+ - Sidecar CPU limits set and monitored via Prometheus
71
+ - Istio mTLS planned for phase two
72
+ - Routing strategy: canary deployments and traffic splitting; basic fault injection planned
73
+
74
+ ## Regeneration
75
+
76
+ ```bash
77
+ conda run -n pytorch_p310 python scripts/release/generate_readme_examples.py
78
+ ```
79
+
docs/release/extraction-skill.md ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [Back to Repo](../../README.md) · [Release Docs](README.md) · [Result Artifacts](../../results/README.md)
2
+
3
+ # PRISM-Memory Extraction Skill
4
+
5
+ **Hook:** Turn conversations into durable, searchable memory.
6
+
7
+ This is the single extraction skill to keep from the `better_memory` work.
8
+ Public release should point to one checkpoint and one extraction behavior:
9
+
10
+ - **Model:** `exp15_sft_qwen7b_4ep`
11
+ - **Base model:** `Qwen/Qwen2.5-7B-Instruct`
12
+ - **Role:** proposition extraction for long-term conversational memory
13
+ - **Why this one:** best confirmed total profile, best adversarial behavior, and
14
+ best LongMemEval score
15
+
16
+ ## Skill Definition
17
+
18
+ The extractor operates turn by turn and emits `0-5` atomic propositions per
19
+ turn. Each proposition should be a standalone fact about a person, event,
20
+ preference, or property, with dates carried into the fact when available.
21
+
22
+ Canonical prompt:
23
+
24
+ ```text
25
+ You are a memory extraction assistant. Given a conversation turn, extract 0-5 atomic, standalone facts. Each fact must be a complete sentence about a specific person, event, preference, or property. Include dates/times when mentioned. Skip greetings, filler, and questions. Output ONLY a JSON array of strings, e.g. ["fact1", "fact2"] or [].
26
+ ```
27
+
28
+ This prompt comes from `experiment15_learned_extraction.py` in the upstream
29
+ `better_memory` workspace.
30
+
31
+ ## Inference Contract
32
+
33
+ 1. Format the turn with speaker and session date.
34
+ 2. Extract `0-5` propositions as a JSON array.
35
+ 3. Clean speaker references so generic labels become real names.
36
+ 4. Resolve relative temporal expressions against the session date.
37
+ 5. Prefix each proposition with the normalized session date before indexing.
38
+ 6. Retrieve with the PRISM hybrid stack, not with the extractor alone.
39
+
40
+ ## Retrieval Setup To Keep
41
+
42
+ - **Retriever:** `PRISMv3Rerank`
43
+ - **Sparse retrieval:** BM25
44
+ - **Dense retrieval:** `all-MiniLM-L6-v2`
45
+ - **Reranker:** `cross-encoder/ms-marco-MiniLM-L-6-v2`
46
+
47
+ Best confirmed retrieval settings:
48
+
49
+ - **LoCoMo:** adversarial `k=5`, multi-hop `k=10`, all other categories `k=8`
50
+ - **LongMemEval:** multi-session `k=20`, single-session-user `k=5`, all other
51
+ categories `k=8`
52
+
53
+ ## What Worked
54
+
55
+ 1. **The original 20k base mattered.**
56
+ `sft4` came from the exact `train_sft_clean_merged.jsonl` base distribution.
57
+ Runs that changed the base subset regressed.
58
+
59
+ 2. **Four epochs was the sweet spot.**
60
+ `sft4` is the local optimum the repo could actually reproduce.
61
+
62
+ 3. **Absolute date anchoring helped.**
63
+ Temporal repairs worked when the model saw explicit, normalized dates rather
64
+ than benchmark-specific relative phrasing.
65
+
66
+ 4. **Post-processing mattered.**
67
+ Speaker cleanup plus relative-date resolution was necessary to turn raw
68
+ outputs into stable memory records.
69
+
70
+ 5. **Hybrid retrieval beat simpler retrieval.**
71
+ BM25 + dense + reranking consistently outperformed BM25-only or dense-only
72
+ approaches.
73
+
74
+ 6. **Turn-local extraction was enough.**
75
+ The model performed better without feeding long recent-context windows into
76
+ the extractor.
77
+
78
+ 7. **Multihop supervision preserved inferential behavior.**
79
+ When temporal data was added, multihop QA was the only extra signal that
80
+ reliably helped preserve inferential performance.
81
+
82
+ ## What Did Not Work
83
+
84
+ 1. **Relative-date training.**
85
+ Training the extractor to emit benchmark-style relative dates hurt temporal
86
+ performance instead of helping it.
87
+
88
+ 2. **LoCoMo-domain SFT data.**
89
+ Adding LoCoMo training conversations consistently regressed the model.
90
+
91
+ 3. **More than 20k original LME examples.**
92
+ Scaling the original noisy temporal labels to 50k amplified anchor loss and
93
+ caused a major regression.
94
+
95
+ 4. **Small clean bases.**
96
+ 5k-base follow-on runs forgot too much and collapsed inferential behavior.
97
+
98
+ 5. **Heavy QA multipliers.**
99
+ High temporal or QA multipliers damaged adversarial precision and LongMemEval.
100
+
101
+ 6. **High learning rates on follow-on QA runs.**
102
+ Aggressive fine-tuning degraded the traits that made `sft4` good.
103
+
104
+ 7. **Trying to push past the local optimum.**
105
+ Most post-`sft4` training traded away adversarial performance for narrower
106
+ gains.
107
+
108
+ ## Release Rule
109
+
110
+ Release only this extraction skill and only this checkpoint publicly:
111
+
112
+ - `exp15_sft_qwen7b_4ep`
113
+
114
+ Treat all other checkpoints as internal ablations and learning artifacts, not as
115
+ parallel public releases.
116
+
117
+ Related docs:
118
+
119
+ - [datasets.md](datasets.md)
120
+ - [release-results.md](release-results.md)
121
+ - [technical-blog.md](technical-blog.md)
docs/release/release-results.md ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [Back to Repo](../../README.md) · [Release Docs](README.md) · [Result Artifacts](../../results/README.md)
2
+
3
+ # PRISM-Memory Release Results
4
+
5
+ This file summarizes the confirmed release metrics and the internal comparison
6
+ artifacts that informed the public checkpoint choice.
7
+
8
+ ## Released Checkpoint
9
+
10
+ - Checkpoint: `exp15_sft_qwen7b_4ep`
11
+ - Base model: `Qwen/Qwen2.5-7B-Instruct`
12
+ - Adapter type: LoRA
13
+ - Confirmed LoCoMo mean: `0.4981204463`
14
+ - Confirmed LongMemEval mean: `0.4767574431`
15
+ - QA cache hits during confirmation: `460`
16
+ - QA cache misses during confirmation: `0`
17
+
18
+ ## Baseline Context
19
+
20
+ `PRISM-Memory` fine-tunes `Qwen/Qwen2.5-7B-Instruct` for the proposition
21
+ extraction step that PropMem normally gets from GPT-4.1. On the confirmed run:
22
+
23
+ | Benchmark | PRISM-Memory `sft4` | GPT-4.1-based PropMem reference | Read |
24
+ |---|---:|---:|---|
25
+ | LongMemEval | `0.4768` | `0.4650` | PRISM wins |
26
+ | LoCoMo | `0.4981` | `0.5360` | PRISM trails, but stays close |
27
+
28
+ The QA layer is held constant. This is an extractor-vs-extractor comparison,
29
+ not an end-to-end GPT-4.1 replacement claim.
30
+
31
+ ## LoCoMo Breakdown
32
+
33
+ | Category | Score |
34
+ |---|---:|
35
+ | factual | `0.3339551926` |
36
+ | temporal | `0.4978785870` |
37
+ | inferential | `0.2605997475` |
38
+ | multi-hop | `0.5144477744` |
39
+ | adversarial | `0.8837209302` |
40
+
41
+ ## LongMemEval Breakdown
42
+
43
+ | Category | Score |
44
+ |---|---:|
45
+ | knowledge-update | `0.5588405797` |
46
+ | multi-session | `0.1390977444` |
47
+ | single-session-assistant | `0.7656395892` |
48
+ | single-session-preference | `0.0519667456` |
49
+ | single-session-user | `0.9133333333` |
50
+ | temporal-reasoning | `0.4316666667` |
51
+
52
+ ## Internal Comparison That Informed The Release
53
+
54
+ The closest runner-up was `inferential_from_temporal_heavy`.
55
+
56
+ - Confirmed LoCoMo mean: `0.4975893989`
57
+ - Confirmed LongMemEval mean: `0.4688992148`
58
+ - Pairwise LoCoMo disagreements vs `sft4`: `152 / 400`
59
+ - Question-level wins: `56` for `sft4`, `52` for the runner-up
60
+
61
+ The release decision stayed with `sft4` because it preserved the strongest
62
+ LongMemEval score and the strongest adversarial behavior.
63
+
64
+ ## Artifact Files
65
+
66
+ - [../../results/confirmed_exp15_summary.json](../../results/confirmed_exp15_summary.json)
67
+ - [../../results/scenario_comparisons.json](../../results/scenario_comparisons.json)
68
+ - [../../results/locomo_pairwise_question_diffs.json](../../results/locomo_pairwise_question_diffs.json)
69
+ - [../../results/sft4.json](../../results/sft4.json)
70
+
71
+ Related docs:
72
+
73
+ - [extraction-skill.md](extraction-skill.md)
74
+ - [extraction-examples.md](extraction-examples.md)
75
+ - [datasets.md](datasets.md)
76
+ - [model-card.md](model-card.md)
results/confirmed_exp15_summary.json CHANGED
@@ -2,7 +2,7 @@
2
  "results": [
3
  {
4
  "alias": "sft4",
5
- "checkpoint": "/home/ec2-user/SageMaker/better_memory/exp15_sft_qwen7b_4ep",
6
  "elapsed_min": 28.93,
7
  "args": {
8
  "n_lme": 10,
@@ -50,4 +50,4 @@
50
  }
51
  ],
52
  "failures": []
53
- }
 
2
  "results": [
3
  {
4
  "alias": "sft4",
5
+ "checkpoint": "exp15_sft_qwen7b_4ep",
6
  "elapsed_min": 28.93,
7
  "args": {
8
  "n_lme": 10,
 
50
  }
51
  ],
52
  "failures": []
53
+ }
results/readme_extraction_examples.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "source_dataset": "BETTER_MEMORY_ROOT/data/output/eval_sft.jsonl",
3
+ "model_path": "BETTER_MEMORY_ROOT/exp15_sft_qwen7b_4ep",
4
+ "output_examples": 3,
5
+ "examples": [
6
+ {
7
+ "id": "infra_queue_delays",
8
+ "title": "Infrastructure bottlenecks stay structured",
9
+ "note": "Near-exact match on two operational facts from a single held-out turn.",
10
+ "session_date": "2025-01-04 15:34:00",
11
+ "user_message": "yeah, no real caching beyond basic Docker layer caching. Jenkins nodes have limited capacity, and we sometimes hit queue delays during peak commits.",
12
+ "gpt41_reference": [
13
+ "No caching beyond basic Docker layer caching",
14
+ "Jenkins nodes have limited capacity and experience queue delays during peak commits"
15
+ ],
16
+ "prism_memory": [
17
+ "No Docker caching beyond basic layer caching",
18
+ "Jenkins nodes have limited capacity; peak commits cause queue delays"
19
+ ],
20
+ "overlap_score": 0.9090909090909092
21
+ },
22
+ {
23
+ "id": "github_actions_notifications",
24
+ "title": "Numeric constraints and preferences survive extraction",
25
+ "note": "The trained model keeps both the hard concurrency cap and the desired notification style.",
26
+ "session_date": "2025-03-01 15:07:00",
27
+ "user_message": "yeah, I think starting with incremental scans and parallel matrix jobs makes sense. We have 20 concurrent jobs max on GitHub Actions currently. Also want to keep Slack notifications from Snyk consistent with other pipeline alerts\u2014aggregated and concise. Can you help draft the workflow?",
28
+ "gpt41_reference": [
29
+ "GitHub Actions concurrency limit: 20 concurrent jobs",
30
+ "Wants Snyk Slack notifications aggregated and concise, consistent with other pipeline alerts"
31
+ ],
32
+ "prism_memory": [
33
+ "GitHub Actions concurrency limit: 20 concurrent jobs",
34
+ "Snyk Slack notifications should be aggregated and concise"
35
+ ],
36
+ "overlap_score": 0.8000000000000002
37
+ },
38
+ {
39
+ "id": "sidecar_limits_and_rollout",
40
+ "title": "Current state and next-step plans stay grouped",
41
+ "note": "The extracted memory keeps the current configuration, the planned security step, and the rollout strategy in one compact bundle.",
42
+ "session_date": "2025-09-09 15:26:00",
43
+ "user_message": "yeah good point about resource overhead, we set CPU limits for all sidecars and monitor with Prometheus now. no mTLS yet, but it\u2019s on the roadmap for phase two. as for routing, we want to start with canary deployments and traffic splitting, maybe some basic fault injection for testing.",
44
+ "gpt41_reference": [
45
+ "Istio sidecar CPU limits set and monitored via Prometheus",
46
+ "mTLS planned in phase two",
47
+ "Plan to use canary deployments, traffic splitting, and basic fault injection"
48
+ ],
49
+ "prism_memory": [
50
+ "Sidecar CPU limits set and monitored via Prometheus",
51
+ "Istio mTLS planned for phase two",
52
+ "Routing strategy: canary deployments and traffic splitting; basic fault injection planned"
53
+ ],
54
+ "overlap_score": 0.7985739750445632
55
+ }
56
+ ]
57
+ }