Spaces:

AsadIsmail
/

prism-memory

Running

App Files Files Community

AsadIsmail committed on Apr 17

Commit

8aaa576

verified ·

1 Parent(s): 8e6525d

Update PRISM-Memory Space bundle

Browse files

Files changed (8) hide show

README.md +39 -4
app.py +108 -11
docs/release/datasets.md +84 -0
docs/release/extraction-examples.md +79 -0
docs/release/extraction-skill.md +121 -0
docs/release/release-results.md +76 -0
results/confirmed_exp15_summary.json +2 -2
results/readme_extraction_examples.json +57 -0

README.md CHANGED Viewed

@@ -12,8 +12,20 @@ pinned: false
 **Hook:** Turn conversations into durable, searchable memory.
-This Space is the lightweight public demo for the single released
-`PRISM-Memory` extraction skill. It shows the best checkpoint only.
 ## Inputs
@@ -21,13 +33,21 @@ The app reads:
 - `results/confirmed_exp15_summary.json`
 - `results/scenario_comparisons.json`
-- `MEMORY_EXTRACTION_SKILL.md`
 ## What It Shows
 1. The confirmed metrics for the released checkpoint
 2. Selected benchmark cases showing strengths and failure modes
-3. The single canonical memory extraction skill to keep
 ## Local Run
@@ -35,3 +55,18 @@ The app reads:
 python -m pip install -r requirements.txt
 python app.py
 ```

 **Hook:** Turn conversations into durable, searchable memory.
+This Space is the public demo for the single released `PRISM-Memory`
+checkpoint: `exp15_sft_qwen7b_4ep`, a LoRA adapter on top of
+`Qwen/Qwen2.5-7B-Instruct`.
+It matches the root repo story:
+- one released checkpoint
+- one extraction behavior
+- one set of confirmed benchmark results
+- one compact explanation of the training data and held-out examples
+Within the GitHub repo, this directory is a first-class public surface. The
+helper script `scripts/deploy_space.sh` bundles this folder with the release
+artifacts needed for Hugging Face.
 ## Inputs
 - `results/confirmed_exp15_summary.json`
 - `results/scenario_comparisons.json`
+- `results/readme_extraction_examples.json`
+- `docs/release/extraction-skill.md`
+- `docs/release/datasets.md`
+When copied into a standalone Hugging Face Space repo, keep those files beside
+`app.py` and `requirements.txt`, preserving the `docs/release/` and `results/`
+subdirectories.
 ## What It Shows
 1. The confirmed metrics for the released checkpoint
 2. Selected benchmark cases showing strengths and failure modes
+3. Side-by-side held-out extraction examples against the GPT-4.1 reference
+4. A compact description of the synthetic conversation data and SFT labels
+5. The single canonical memory extraction skill to keep
 ## Local Run
 python -m pip install -r requirements.txt
 python app.py
 ```
+## From The Repo Root
+```bash
+bash scripts/deploy_space.sh
+```
+Useful bundled files:
+- `docs/release/extraction-skill.md`
+- `docs/release/datasets.md`
+- `docs/release/extraction-examples.md`
+- `docs/release/release-results.md`
+- `results/confirmed_exp15_summary.json`
+- `results/readme_extraction_examples.json`

app.py CHANGED Viewed

@@ -7,14 +7,32 @@ import gradio as gr
 import pandas as pd
 APP_DIR = Path(__file__).resolve().parent
-if (APP_DIR / "MEMORY_EXTRACTION_SKILL.md").exists() or (APP_DIR / "results").exists():
-    ROOT = APP_DIR
-else:
-    ROOT = APP_DIR.parent
 RESULTS_DIR = ROOT / "results"
 SUMMARY_PATH = RESULTS_DIR / "confirmed_exp15_summary.json"
 SCENARIO_PATH = RESULTS_DIR / "scenario_comparisons.json"
-SKILL_PATH = ROOT / "MEMORY_EXTRACTION_SKILL.md"
 LOCOMO_CATEGORY_NAMES = {
     "1": "factual",
     "2": "temporal",
@@ -38,6 +56,22 @@ def _load_json(path: Path, default):
     return json.loads(path.read_text())
 def _load_summary() -> dict:
     return _load_json(SUMMARY_PATH, {"results": [], "failures": []})
@@ -46,10 +80,16 @@ def _load_scenarios() -> dict:
     return _load_json(SCENARIO_PATH, {"scenarios": []})
 def _load_skill() -> str:
-    if not SKILL_PATH.exists():
-        return "Skill document not found."
-    return SKILL_PATH.read_text()
 def _best_result() -> dict | None:
@@ -62,13 +102,23 @@ def release_markdown() -> str:
     if not item:
         return "## No confirmed release result yet"
     checkpoint = Path(item["checkpoint"]).name
-    return "\n\n".join(
         [
             "# PRISM-Memory",
             "**Turn conversations into durable, searchable memory.**",
             f"Released checkpoint: `{checkpoint}`",
-            f"Confirmed LoCoMo: `{item['locomo']['mean']:.3f}`",
-            f"Confirmed LongMemEval: `{item['lme']['mean']:.3f}`",
         ]
     )
@@ -151,6 +201,43 @@ def render_scenario(choice: str):
     return "\n".join(header), table
 with gr.Blocks(title="PRISM-Memory Demo") as demo:
     gr.Markdown(release_markdown())
@@ -170,6 +257,16 @@ with gr.Blocks(title="PRISM-Memory Demo") as demo:
         picker.change(render_scenario, inputs=picker, outputs=[scenario_md, scenario_table])
         demo.load(fn=lambda: render_scenario(choices[0]), outputs=[scenario_md, scenario_table])
     with gr.Tab("Skill"):
         gr.Markdown(_load_skill())

 import pandas as pd
 APP_DIR = Path(__file__).resolve().parent
def _resolve_root() -> Path:
    """Locate the directory that holds the release artifacts.

    Checks ``APP_DIR`` first and then its parent. A directory qualifies as
    soon as any one of the known marker files exists inside it.

    Returns:
        The first qualifying directory, or ``APP_DIR.parent`` when neither
        candidate contains a marker file.
    """
    # Each tuple is a path, relative to the candidate, whose presence marks
    # that candidate as the root. Order matches the lookup priority.
    marker_files = (
        ("results", "confirmed_exp15_summary.json"),
        ("docs", "release", "extraction-skill.md"),
        ("MEMORY_EXTRACTION_SKILL.md",),
    )
    for base in (APP_DIR, APP_DIR.parent):
        if any(base.joinpath(*parts).exists() for parts in marker_files):
            return base
    return APP_DIR.parent
+ROOT = _resolve_root()
 RESULTS_DIR = ROOT / "results"
 SUMMARY_PATH = RESULTS_DIR / "confirmed_exp15_summary.json"
 SCENARIO_PATH = RESULTS_DIR / "scenario_comparisons.json"
+README_EXAMPLE_PATH = RESULTS_DIR / "readme_extraction_examples.json"
+SKILL_CANDIDATES = [
+    ROOT / "docs" / "release" / "extraction-skill.md",
+    ROOT / "MEMORY_EXTRACTION_SKILL.md",
+]
+DATASET_CANDIDATES = [
+    ROOT / "docs" / "release" / "datasets.md",
+    ROOT / "DATASETS.md",
+]
 LOCOMO_CATEGORY_NAMES = {
     "1": "factual",
     "2": "temporal",
     return json.loads(path.read_text())
+def _clean_markdown(text: str) -> str:
+    lines = text.splitlines()
+    if lines and lines[0].startswith("[Back to Repo]"):
+        lines = lines[1:]
+        while lines and not lines[0].strip():
+            lines = lines[1:]
+    return "\n".join(lines).strip()
def _load_markdown(candidates: list[Path], fallback: str) -> str:
    """Return the cleaned text of the first candidate file that exists.

    Args:
        candidates: Paths to try, in priority order.
        fallback: Text returned when none of the candidates is a file.

    Returns:
        The first existing candidate's contents passed through
        ``_clean_markdown``, or ``fallback``.
    """
    for path in candidates:
        # is_file() rather than exists(): a directory with a matching name
        # would otherwise make read_text() raise.
        if path.is_file():
            # The bundled docs contain non-ASCII characters, so decode as
            # UTF-8 explicitly instead of relying on the host locale.
            return _clean_markdown(path.read_text(encoding="utf-8"))
    return fallback
 def _load_summary() -> dict:
     return _load_json(SUMMARY_PATH, {"results": [], "failures": []})
     return _load_json(SCENARIO_PATH, {"scenarios": []})
def _load_readme_examples() -> dict:
    """Load the extraction-examples JSON at ``README_EXAMPLE_PATH``.

    Delegates to ``_load_json``, passing an empty ``examples`` payload as its
    default argument.
    """
    empty_payload = {"examples": []}
    return _load_json(README_EXAMPLE_PATH, empty_payload)
 def _load_skill() -> str:
+    return _load_markdown(SKILL_CANDIDATES, "Skill document not found.")
def _load_datasets() -> str:
    """Return the dataset summary Markdown shown in the "Data" tab.

    Tries each path in ``DATASET_CANDIDATES`` via ``_load_markdown`` and
    falls back to a short "not found" notice.
    """
    missing_notice = "Dataset summary not found."
    return _load_markdown(DATASET_CANDIDATES, missing_notice)
 def _best_result() -> dict | None:
     if not item:
         return "## No confirmed release result yet"
     checkpoint = Path(item["checkpoint"]).name
+    locomo = item["locomo"]["mean"]
+    lme = item["lme"]["mean"]
+    return "\n".join(
         [
             "# PRISM-Memory",
+            "",
             "**Turn conversations into durable, searchable memory.**",
+            "",
             f"Released checkpoint: `{checkpoint}`",
+            "Base model: `Qwen/Qwen2.5-7B-Instruct`",
+            "",
+            "| Benchmark | PRISM-Memory `sft4` | GPT-4.1-based PropMem reference |",
+            "|---|---:|---:|",
+            f"| LongMemEval | `{lme:.3f}` | `0.465` |",
+            f"| LoCoMo | `{locomo:.3f}` | `0.536` |",
+            "",
+            "This Space shows the public release only: confirmed metrics, held-out benchmark cases, side-by-side extraction examples, the training-data summary, and the canonical extraction skill.",
         ]
     )
     return "\n".join(header), table
+def _readme_example_label(item: dict) -> str:
+    return item["title"]
def readme_example_choices() -> list[str]:
    """List the dropdown labels for every loaded extraction example.

    Returns:
        One label per entry under the payload's ``examples`` key, in file
        order; an empty list when that key is absent or empty.
    """
    payload = _load_readme_examples()
    return list(map(_readme_example_label, payload.get("examples", [])))
def render_readme_example(choice: str) -> str:
    """Render one held-out extraction example as a Markdown comparison.

    Args:
        choice: Selected dropdown value. The first example whose title label
            or ``id`` equals it is rendered.

    Returns:
        Markdown with the example's metadata, the quoted turn, the GPT-4.1
        reference propositions and the PRISM-Memory propositions. Falls back
        to the first example when ``choice`` matches nothing, and to a
        placeholder sentence when no examples are loaded.
    """
    examples = _load_readme_examples().get("examples", [])
    if not examples:
        return "No extraction examples available yet."
    item = next(
        (
            example
            for example in examples
            # .get() so an entry without an "id" cannot raise mid-scan.
            if _readme_example_label(example) == choice or example.get("id") == choice
        ),
        examples[0],
    )
    # Markdown collapses single newlines into one paragraph, so every field
    # and heading is separated by a blank line to render on its own line.
    body = [
        f"### {item['title']}",
        "",
        f"**Session date:** `{item['session_date']}`",
        "",
        f"**Overlap score:** `{item['overlap_score']:.3f}`",
        "",
        f"**What this example shows:** {item['note']}",
        "",
        "**Turn**",
        "",
        f"> {item['user_message']}",
        "",
        "**GPT-4.1 reference**",
        "",
    ]
    body.extend(f"- {entry}" for entry in item.get("gpt41_reference", []))
    body.extend(["", "**PRISM-Memory `sft4`**", ""])
    body.extend(f"- {entry}" for entry in item.get("prism_memory", []))
    return "\n".join(body)
 with gr.Blocks(title="PRISM-Memory Demo") as demo:
     gr.Markdown(release_markdown())
         picker.change(render_scenario, inputs=picker, outputs=[scenario_md, scenario_table])
         demo.load(fn=lambda: render_scenario(choices[0]), outputs=[scenario_md, scenario_table])
+    with gr.Tab("Extraction Examples"):
+        example_choices = readme_example_choices() or ["pending"]
+        example_picker = gr.Dropdown(choices=example_choices, value=example_choices[0], label="Held-Out Example")
+        example_md = gr.Markdown()
+        example_picker.change(render_readme_example, inputs=example_picker, outputs=example_md)
+        demo.load(fn=lambda: render_readme_example(example_choices[0]), outputs=example_md)
+    with gr.Tab("Data"):
+        gr.Markdown(_load_datasets())
     with gr.Tab("Skill"):
         gr.Markdown(_load_skill())

docs/release/datasets.md ADDED Viewed

	@@ -0,0 +1,84 @@

+[Back to Repo](../../README.md) · [Release Docs](README.md) · [Release Results](release-results.md)
+# PRISM-Memory Datasets
+This file separates the data used by the public `PRISM-Memory` release from the
+auxiliary datasets that were only useful for ablations.
+## Released Training Recipe
+The released checkpoint is `exp15_sft_qwen7b_4ep`.
+The core recipe was:
+1. Start from `Qwen/Qwen2.5-7B-Instruct`.
+2. Fine-tune with LoRA on a `20k` sample from `train_sft.jsonl`.
+3. Evaluate on held-out `LoCoMo` and held-out `LongMemEval`.
+## Source Conversations
+The underlying synthetic conversation source lives in the upstream
+`better_memory/data/output/` directory.
+| File | Kind | Split | Notes |
+|---|---|---|---|
+| `train.jsonl` | raw conversations | train | `2,329` synthetic multi-session conversations |
+| `eval.jsonl` | raw conversations | eval | `584` held-out synthetic multi-session conversations |
+| `metadata.json` | split metadata | all | counts by tier, agent type, and update regime |
+The source generator was built to create long-horizon memory stress cases with
+inserts, updates, deletes, and multi-session recall.
+## Derived SFT Data
+These are GPT-4.1-derived proposition labels built on top of the raw
+conversations.
+| File | Examples | Role | Release Status |
+|---|---|---|---|
+| `train_sft.jsonl` | `100,427` | primary SFT data | core release data |
+| `train_sft_clean_merged.jsonl` | `20,000` | cleaned resume base matching `sft4` distribution | good follow-on base |
+| `train_sft_temporal_resolved.jsonl` | `2,643` | temporal-fix add-on set | useful for targeted research, not the public base |
+| `eval_sft.jsonl` | reference | GPT-4.1 PropMem extractions on eval conversations | evaluation reference only |
+## Evaluation Surfaces
+The released model was evaluated on two held-out surfaces:
+| Benchmark | Held-out Surface | Notes |
+|---|---|---|
+| `LoCoMo` | conversations `conv-49` and `conv-50` | five categories: factual, temporal, inferential, multi-hop, adversarial |
+| `LongMemEval` | held-out items stratified by question type | six categories, including temporal reasoning and knowledge updates |
+Both the GPT-4.1 extraction baseline and the released 7B extractor were scored
+with the same GPT-4.1 QA evaluator and the same cache-backed answer surface.
+## Auxiliary LoCoMo Datasets
+These files were used in ablations and targeted probes. They matter for the
+research story, but they are not the main public training recipe.
+| File | Examples | Intended Use | Outcome |
+|---|---|---|---|
+| `locomo_qa_supervised_factual.jsonl` | `512` | factual QA supervision | neutral to small benefit |
+| `locomo_qa_supervised_multihop.jsonl` | `625` | multihop QA supervision | neutral to small benefit |
+| `locomo_qa_supervised_temporal.jsonl` | `248` | temporal QA supervision with absolute dates | neutral to small benefit |
+| `locomo_qa_supervised_inferential.jsonl` | `133` | inferential QA supervision | too small, hurt balance |
+| `locomo_qa_supervised_temporal_relformat.jsonl` | `248` | temporal QA with benchmark-style relative dates | hurt |
+| `locomo_sft_extra.jsonl` | `2,645` | LoCoMo-domain SFT add-on | hurt |
+| `locomo_sft_extra_relformat.jsonl` | `3,178` | relative-date LoCoMo SFT add-on | hurt |
+## Practical Takeaways
+1. The best 7B model came from the stable `20k` `train_sft.jsonl` base, not
+   from aggressive benchmark-specific add-ons.
+2. Training on LoCoMo-domain conversations did not help generalization.
+3. Relative-date output hacks made the extractor worse.
+4. More original LME data was not automatically better because noisy temporal
+   labels compounded the anchor-loss problem.
+Related docs:
+- [extraction-skill.md](extraction-skill.md)
+- [release-results.md](release-results.md)
+- [technical-blog.md](technical-blog.md)

docs/release/extraction-examples.md ADDED Viewed

	@@ -0,0 +1,79 @@

+[Back to Repo](../../README.md) · [Release Docs](README.md) · [Result Artifacts](../../results/README.md)
+# PRISM-Memory Extraction Examples
+Selected held-out examples from the original Exp15 `eval_sft.jsonl` corpus.
+The `GPT-4.1 reference` rows come from the original SFT target propositions.
+The `PRISM-Memory` rows were regenerated from `exp15_sft_qwen7b_4ep` with greedy decoding using the same extraction prompt family used during evaluation.
+These examples are illustrations, not the benchmark itself. Use
+[release-results.md](release-results.md) for the aggregate numbers.
+## Infrastructure bottlenecks stay structured
+- Overlap score: `0.909`
+- Session date: `2025-01-04 15:34:00`
+- Note: Near-exact match on two operational facts from a single held-out turn.
+**Turn**
+> yeah, no real caching beyond basic Docker layer caching. Jenkins nodes have limited capacity, and we sometimes hit queue delays during peak commits.
+**GPT-4.1 reference**
+- No caching beyond basic Docker layer caching
+- Jenkins nodes have limited capacity and experience queue delays during peak commits
+**PRISM-Memory `sft4`**
+- No Docker caching beyond basic layer caching
+- Jenkins nodes have limited capacity; peak commits cause queue delays
+## Numeric constraints and preferences survive extraction
+- Overlap score: `0.800`
+- Session date: `2025-03-01 15:07:00`
+- Note: The trained model keeps both the hard concurrency cap and the desired notification style.
+**Turn**
+> yeah, I think starting with incremental scans and parallel matrix jobs makes sense. We have 20 concurrent jobs max on GitHub Actions currently. Also want to keep Slack notifications from Snyk consistent with other pipeline alerts—aggregated and concise. Can you help draft the workflow?
+**GPT-4.1 reference**
+- GitHub Actions concurrency limit: 20 concurrent jobs
+- Wants Snyk Slack notifications aggregated and concise, consistent with other pipeline alerts
+**PRISM-Memory `sft4`**
+- GitHub Actions concurrency limit: 20 concurrent jobs
+- Snyk Slack notifications should be aggregated and concise
+## Current state and next-step plans stay grouped
+- Overlap score: `0.799`
+- Session date: `2025-09-09 15:26:00`
+- Note: The extracted memory keeps the current configuration, the planned security step, and the rollout strategy in one compact bundle.
+**Turn**
+> yeah good point about resource overhead, we set CPU limits for all sidecars and monitor with Prometheus now. no mTLS yet, but it’s on the roadmap for phase two. as for routing, we want to start with canary deployments and traffic splitting, maybe some basic fault injection for testing.
+**GPT-4.1 reference**
+- Istio sidecar CPU limits set and monitored via Prometheus
+- mTLS planned in phase two
+- Plan to use canary deployments, traffic splitting, and basic fault injection
+**PRISM-Memory `sft4`**
+- Sidecar CPU limits set and monitored via Prometheus
+- Istio mTLS planned for phase two
+- Routing strategy: canary deployments and traffic splitting; basic fault injection planned
+## Regeneration
+```bash
+conda run -n pytorch_p310 python scripts/release/generate_readme_examples.py
+```

docs/release/extraction-skill.md ADDED Viewed

	@@ -0,0 +1,121 @@

+[Back to Repo](../../README.md) · [Release Docs](README.md) · [Result Artifacts](../../results/README.md)
+# PRISM-Memory Extraction Skill
+**Hook:** Turn conversations into durable, searchable memory.
+This is the single extraction skill to keep from the `better_memory` work.
+Public release should point to one checkpoint and one extraction behavior:
+- **Model:** `exp15_sft_qwen7b_4ep`
+- **Base model:** `Qwen/Qwen2.5-7B-Instruct`
+- **Role:** proposition extraction for long-term conversational memory
+- **Why this one:** best confirmed total profile, best adversarial behavior, and
+  best LongMemEval score
+## Skill Definition
+The extractor operates turn by turn and emits `0-5` atomic propositions per
+turn. Each proposition should be a standalone fact about a person, event,
+preference, or property, with dates carried into the fact when available.
+Canonical prompt:
+```text
+You are a memory extraction assistant. Given a conversation turn, extract 0-5 atomic, standalone facts. Each fact must be a complete sentence about a specific person, event, preference, or property. Include dates/times when mentioned. Skip greetings, filler, and questions. Output ONLY a JSON array of strings, e.g. ["fact1", "fact2"] or [].
+```
+This prompt comes from `experiment15_learned_extraction.py` in the upstream
+`better_memory` workspace.
+## Inference Contract
+1. Format the turn with speaker and session date.
+2. Extract `0-5` propositions as a JSON array.
+3. Clean speaker references so generic labels become real names.
+4. Resolve relative temporal expressions against the session date.
+5. Prefix each proposition with the normalized session date before indexing.
+6. Retrieve with the PRISM hybrid stack, not with the extractor alone.
+## Retrieval Setup To Keep
+- **Retriever:** `PRISMv3Rerank`
+- **Sparse retrieval:** BM25
+- **Dense retrieval:** `all-MiniLM-L6-v2`
+- **Reranker:** `cross-encoder/ms-marco-MiniLM-L-6-v2`
+Best confirmed retrieval settings:
+- **LoCoMo:** adversarial `k=5`, multi-hop `k=10`, all other categories `k=8`
+- **LongMemEval:** multi-session `k=20`, all other categories `k=8` except
+  single-session-user `k=5`
+## What Worked
+1. **The original 20k base mattered.**
+   `sft4` came from the exact `train_sft_clean_merged.jsonl` base distribution.
+   Runs that changed the base subset regressed.
+2. **Four epochs was the sweet spot.**
+   `sft4` is the local optimum the repo could actually reproduce.
+3. **Absolute date anchoring helped.**
+   Temporal repairs worked when the model saw explicit, normalized dates rather
+   than benchmark-specific relative phrasing.
+4. **Post-processing mattered.**
+   Speaker cleanup plus relative-date resolution was necessary to turn raw
+   outputs into stable memory records.
+5. **Hybrid retrieval beat simpler retrieval.**
+   BM25 + dense + reranking consistently outperformed BM25-only or dense-only
+   approaches.
+6. **Turn-local extraction was enough.**
+   The model performed better without feeding long recent-context windows into
+   the extractor.
+7. **Multihop supervision preserved inferential behavior.**
+   When temporal data was added, multihop QA was the only extra signal that
+   reliably helped preserve inferential performance.
+## What Did Not Work
+1. **Relative-date training.**
+   Training the extractor to emit benchmark-style relative dates hurt temporal
+   performance instead of helping it.
+2. **LoCoMo-domain SFT data.**
+   Adding LoCoMo training conversations consistently regressed the model.
+3. **More than 20k original LME examples.**
+   Scaling the original noisy temporal labels to 50k amplified anchor loss and
+   caused major regression.
+4. **Small clean bases.**
+   5k-base follow-on runs forgot too much and collapsed inferential behavior.
+5. **Heavy QA multipliers.**
+   High temporal or QA multipliers damaged adversarial precision and LongMemEval.
+6. **High learning rates on follow-on QA runs.**
+   Aggressive fine-tuning degraded the traits that made `sft4` good.
+7. **Trying to push past the local optimum.**
+   Most post-`sft4` training traded away adversarial performance for narrower
+   gains.
+## Release Rule
+Release only this extraction skill and only this checkpoint publicly:
+- `exp15_sft_qwen7b_4ep`
+Treat all other checkpoints as internal ablations and learning artifacts, not as
+parallel public releases.
+Related docs:
+- [datasets.md](datasets.md)
+- [release-results.md](release-results.md)
+- [technical-blog.md](technical-blog.md)

docs/release/release-results.md ADDED Viewed

	@@ -0,0 +1,76 @@

+[Back to Repo](../../README.md) · [Release Docs](README.md) · [Result Artifacts](../../results/README.md)
+# PRISM-Memory Release Results
+This file summarizes the confirmed release metrics and the internal comparison
+artifacts that informed the public checkpoint choice.
+## Released Checkpoint
+- Checkpoint: `exp15_sft_qwen7b_4ep`
+- Base model: `Qwen/Qwen2.5-7B-Instruct`
+- Adapter type: LoRA
+- Confirmed LoCoMo mean: `0.4981204463`
+- Confirmed LongMemEval mean: `0.4767574431`
+- QA cache hits during confirmation: `460`
+- QA cache misses during confirmation: `0`
+## Baseline Context
+`PRISM-Memory` fine-tunes `Qwen/Qwen2.5-7B-Instruct` for the proposition
+extraction step that PropMem normally gets from GPT-4.1. On the confirmed run:
+| Benchmark | PRISM-Memory `sft4` | GPT-4.1-based PropMem reference | Read |
+|---|---:|---:|---|
+| LongMemEval | `0.4768` | `0.4650` | PRISM wins |
+| LoCoMo | `0.4981` | `0.5360` | PRISM trails, but stays close |
+The QA layer is held constant. This is an extractor-vs-extractor comparison,
+not an end-to-end GPT-4.1 replacement claim.
+## LoCoMo Breakdown
+| Category | Score |
+|---|---:|
+| factual | `0.3339551926` |
+| temporal | `0.4978785870` |
+| inferential | `0.2605997475` |
+| multi-hop | `0.5144477744` |
+| adversarial | `0.8837209302` |
+## LongMemEval Breakdown
+| Category | Score |
+|---|---:|
+| knowledge-update | `0.5588405797` |
+| multi-session | `0.1390977444` |
+| single-session-assistant | `0.7656395892` |
+| single-session-preference | `0.0519667456` |
+| single-session-user | `0.9133333333` |
+| temporal-reasoning | `0.4316666667` |
+## Internal Comparison That Informed The Release
+The closest runner-up was `inferential_from_temporal_heavy`.
+- Confirmed LoCoMo mean: `0.4975893989`
+- Confirmed LongMemEval mean: `0.4688992148`
+- Pairwise LoCoMo disagreements vs `sft4`: `152 / 400`
+- Question-level wins: `56` for `sft4`, `52` for the runner-up
+The release decision stayed with `sft4` because it preserved the strongest
+LongMemEval score and the strongest adversarial behavior.
+## Artifact Files
+- [../../results/confirmed_exp15_summary.json](../../results/confirmed_exp15_summary.json)
+- [../../results/scenario_comparisons.json](../../results/scenario_comparisons.json)
+- [../../results/locomo_pairwise_question_diffs.json](../../results/locomo_pairwise_question_diffs.json)
+- [../../results/sft4.json](../../results/sft4.json)
+Related docs:
+- [extraction-skill.md](extraction-skill.md)
+- [extraction-examples.md](extraction-examples.md)
+- [datasets.md](datasets.md)
+- [model-card.md](model-card.md)

results/confirmed_exp15_summary.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "results": [
     {
       "alias": "sft4",
-      "checkpoint": "/home/ec2-user/SageMaker/better_memory/exp15_sft_qwen7b_4ep",
       "elapsed_min": 28.93,
       "args": {
         "n_lme": 10,
@@ -50,4 +50,4 @@
     }
   ],
   "failures": []
-}

   "results": [
     {
       "alias": "sft4",
+      "checkpoint": "exp15_sft_qwen7b_4ep",
       "elapsed_min": 28.93,
       "args": {
         "n_lme": 10,
     }
   ],
   "failures": []
+}

results/readme_extraction_examples.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "source_dataset": "BETTER_MEMORY_ROOT/data/output/eval_sft.jsonl",
+  "model_path": "BETTER_MEMORY_ROOT/exp15_sft_qwen7b_4ep",
+  "output_examples": 3,
+  "examples": [
+    {
+      "id": "infra_queue_delays",
+      "title": "Infrastructure bottlenecks stay structured",
+      "note": "Near-exact match on two operational facts from a single held-out turn.",
+      "session_date": "2025-01-04 15:34:00",
+      "user_message": "yeah, no real caching beyond basic Docker layer caching. Jenkins nodes have limited capacity, and we sometimes hit queue delays during peak commits.",
+      "gpt41_reference": [
+        "No caching beyond basic Docker layer caching",
+        "Jenkins nodes have limited capacity and experience queue delays during peak commits"
+      ],
+      "prism_memory": [
+        "No Docker caching beyond basic layer caching",
+        "Jenkins nodes have limited capacity; peak commits cause queue delays"
+      ],
+      "overlap_score": 0.9090909090909092
+    },
+    {
+      "id": "github_actions_notifications",
+      "title": "Numeric constraints and preferences survive extraction",
+      "note": "The trained model keeps both the hard concurrency cap and the desired notification style.",
+      "session_date": "2025-03-01 15:07:00",
+      "user_message": "yeah, I think starting with incremental scans and parallel matrix jobs makes sense. We have 20 concurrent jobs max on GitHub Actions currently. Also want to keep Slack notifications from Snyk consistent with other pipeline alerts\u2014aggregated and concise. Can you help draft the workflow?",
+      "gpt41_reference": [
+        "GitHub Actions concurrency limit: 20 concurrent jobs",
+        "Wants Snyk Slack notifications aggregated and concise, consistent with other pipeline alerts"
+      ],
+      "prism_memory": [
+        "GitHub Actions concurrency limit: 20 concurrent jobs",
+        "Snyk Slack notifications should be aggregated and concise"
+      ],
+      "overlap_score": 0.8000000000000002
+    },
+    {
+      "id": "sidecar_limits_and_rollout",
+      "title": "Current state and next-step plans stay grouped",
+      "note": "The extracted memory keeps the current configuration, the planned security step, and the rollout strategy in one compact bundle.",
+      "session_date": "2025-09-09 15:26:00",
+      "user_message": "yeah good point about resource overhead, we set CPU limits for all sidecars and monitor with Prometheus now. no mTLS yet, but it\u2019s on the roadmap for phase two. as for routing, we want to start with canary deployments and traffic splitting, maybe some basic fault injection for testing.",
+      "gpt41_reference": [
+        "Istio sidecar CPU limits set and monitored via Prometheus",
+        "mTLS planned in phase two",
+        "Plan to use canary deployments, traffic splitting, and basic fault injection"
+      ],
+      "prism_memory": [
+        "Sidecar CPU limits set and monitored via Prometheus",
+        "Istio mTLS planned for phase two",
+        "Routing strategy: canary deployments and traffic splitting; basic fault injection planned"
+      ],
+      "overlap_score": 0.7985739750445632
+    }
+  ]
+}