Spaces:
Sleeping
Sleeping
| """ | |
| STALE-inspired Memory Conflict Validator | |
| Tests LLM ability to detect implicitly invalidated memories | |
| """ | |
| import gradio as gr | |
| import re | |
| # Lazy loading - no model imports at module level | |
# Process-wide HTTP session; stays None until get_llm() is first called.
llm_client = None


def get_llm():
    """Return the shared ``requests.Session``, creating it on first use.

    ``requests`` is imported inside the function so that importing this
    module does not pull it in.
    """
    global llm_client
    if llm_client is not None:
        return llm_client

    from requests import Session

    llm_client = Session()
    return llm_client
# Built-in test scenarios inspired by the STALE benchmark.
# Each row lists its values in the order given by _SCENARIO_FIELDS.
_SCENARIO_FIELDS = ("name", "context", "query", "expected_behavior", "dimension")

_SCENARIO_ROWS = (
    (
        "Location Change (Implicit)",
        "User mentioned they moved from Berlin to Paris last month. They said they love the local cafes.",
        "What's your favorite Berlin cafe?",
        "Reject premise - user is in Paris now, not Berlin",
        "premise_resistance",
    ),
    (
        "Preference Update",
        "User used to be vegetarian but started eating meat again last week after a doctor's recommendation.",
        "What vegetarian restaurants do you recommend?",
        "Acknowledge changed preference - no longer vegetarian",
        "state_resolution",
    ),
    (
        "Project Completion",
        "User was working on Project Alpha. Yesterday they announced it shipped successfully.",
        "How is Project Alpha going? Still in development?",
        "Correct - project shipped, no longer in development",
        "implicit_policy_adaptation",
    ),
    (
        "Job Change",
        "User left their job at TechCorp in January. They started at StartupXYZ in February.",
        "Do you still have access to the TechCorp internal wiki?",
        "Reject - no longer employed at TechCorp",
        "premise_resistance",
    ),
    (
        "Living Situation",
        "User lived alone for years. Last month they got married and moved in with their partner.",
        "Any tips for living alone?",
        "Acknowledge change - no longer living alone",
        "state_resolution",
    ),
)

# One dict per scenario, keyed by the field names above (same key order).
TEST_SCENARIOS = [dict(zip(_SCENARIO_FIELDS, row)) for row in _SCENARIO_ROWS]
def _phrase_pattern(phrases):
    """Compile *phrases* into one case-insensitive, whole-word regex."""
    alternatives = (
        # Allow any run of whitespace (including newlines) between words.
        r"\s+".join(re.escape(word) for word in phrase.split())
        for phrase in phrases
    )
    return re.compile(r"\b(?:" + "|".join(alternatives) + r")\b", re.IGNORECASE)


# Wording that suggests the response acknowledged a change of state.
_STALE_INDICATORS = _phrase_pattern([
    "used to", "previously", "before", "no longer",
    "moved", "changed", "now", "recently", "updated",
])

# First-person endorsements that suggest the stale premise was accepted.
_PREMISE_FAILURES = _phrase_pattern([
    "my favorite", "i recommend", "i like", "i prefer",
])


def evaluate_response(context, query, response, expected):
    """Classify a model response with a simple keyword heuristic.

    Phrases are matched as whole words, so "know" does not count as "now"
    and "unchanged" does not count as "changed".

    Args:
        context: Scenario context. Accepted for interface symmetry; the
            heuristic does not read it.
        query: User query. Not read by the heuristic.
        response: The model's reply. ``None`` is treated as empty text.
        expected: Expected-behavior description. Not read by the heuristic.

    Returns:
        A verdict string: FAIL when any premise-acceptance phrase appears,
        otherwise PASS when any staleness indicator appears, otherwise
        UNCLEAR.
    """
    text = response or ""

    # A premise-acceptance phrase fails the response even when staleness
    # wording is also present.
    if _PREMISE_FAILURES.search(text):
        return "❌ FAIL - Accepted stale premise"
    if _STALE_INDICATORS.search(text):
        return "✅ PASS - Correctly identified stale memory"
    return "⚠️ UNCLEAR - Ambiguous handling"
def test_scenario(scenario_idx, custom_context, custom_query, model_response):
    """Run a memory conflict test and build a Markdown report.

    Args:
        scenario_idx: Index into ``TEST_SCENARIOS``. ``None`` or any value
            outside ``0 <= idx < len(TEST_SCENARIOS)`` selects the custom
            scenario instead.
        custom_context: Context used for a custom scenario.
        custom_query: Query used for a custom scenario.
        model_response: The model reply to evaluate. ``None`` is treated
            as empty.

    Returns:
        A ``(report_markdown, context, query)`` tuple. When the response is
        blank, the first element is a prompt to provide one and the other
        two are empty strings.
    """
    if not (model_response or "").strip():
        return "Please provide a model response to evaluate", "", ""

    # Reject None and negative values explicitly: a negative index would
    # otherwise silently select a scenario from the end of the list.
    if scenario_idx is not None and 0 <= scenario_idx < len(TEST_SCENARIOS):
        scenario = TEST_SCENARIOS[scenario_idx]
        context = scenario["context"]
        query = scenario["query"]
        expected = scenario["expected_behavior"]
    else:
        context = custom_context
        query = custom_query
        expected = "Custom scenario - evaluate manually"

    result = evaluate_response(context, query, model_response, expected)

    # Blank lines separate the Markdown blocks. Without them the "---"
    # directly under the response turns the preceding text into a heading.
    output = f"""## Test Result: {result}

**Context:** {context}

**Query:** {query}

**Expected Behavior:** {expected}

**Model Response:**

{model_response}

---

*This test is inspired by the STALE benchmark for detecting implicit memory conflicts in LLM agents.*
"""
    return output, context, query
def get_scenario_details(idx):
    """Return ``(context, query, expected_behavior)`` for a built-in scenario.

    Args:
        idx: Index into ``TEST_SCENARIOS``.

    Returns:
        The three scenario fields, or three empty strings when ``idx`` is
        ``None`` or outside ``0 <= idx < len(TEST_SCENARIOS)``.
    """
    # Guard None and negative values: None cannot be compared with an int,
    # and a negative index would silently read from the end of the list.
    if idx is None or not 0 <= idx < len(TEST_SCENARIOS):
        return "", "", ""
    scenario = TEST_SCENARIOS[idx]
    return scenario["context"], scenario["query"], scenario["expected_behavior"]
# Build UI
def _result_markdown(scenario_idx, context, query, response):
    """Return only the Markdown report from ``test_scenario``.

    ``test_scenario`` returns a ``(report, context, query)`` tuple, but each
    Evaluate button below is wired to a single Markdown output, so the
    handler must return one string rather than the whole tuple.
    """
    return test_scenario(scenario_idx, context, query, response)[0]


with gr.Blocks(title="Implicit Memory Conflict Validator") as demo:
    gr.Markdown("# 🧠 Implicit Memory Conflict Validator")
    gr.Markdown("Test if LLM agents correctly detect when memories become invalid without explicit negation.")

    with gr.Tab("Quick Test"):
        # Each choice is (label, value); the value is the scenario's index.
        scenario_select = gr.Dropdown(
            choices=[(s["name"], i) for i, s in enumerate(TEST_SCENARIOS)],
            value=0,
            label="Select Test Scenario",
        )
        with gr.Row():
            with gr.Column():
                ctx_display = gr.Textbox(label="Context", lines=3, interactive=False)
                query_display = gr.Textbox(label="User Query", lines=1, interactive=False)
                expected_display = gr.Textbox(label="Expected Behavior", lines=2, interactive=False)
            with gr.Column():
                model_response = gr.Textbox(
                    label="Paste Model Response Here",
                    lines=5,
                    placeholder="Paste the LLM's response to the query...",
                )
                evaluate_btn = gr.Button("Evaluate Response", variant="primary")
        result_output = gr.Markdown()

        # Refresh the read-only scenario fields when the selection changes.
        scenario_select.change(
            get_scenario_details,
            inputs=[scenario_select],
            outputs=[ctx_display, query_display, expected_display],
        )
        # Initialize first scenario
        demo.load(
            lambda: get_scenario_details(0),
            outputs=[ctx_display, query_display, expected_display],
        )
        evaluate_btn.click(
            _result_markdown,
            inputs=[scenario_select, ctx_display, query_display, model_response],
            outputs=[result_output],
        )

    with gr.Tab("Custom Test"):
        gr.Markdown("Create your own memory conflict scenario:")
        custom_ctx = gr.Textbox(label="Context (what the agent knows)", lines=4)
        custom_qry = gr.Textbox(label="User Query", lines=2)
        # NOTE(review): this field is shown but not passed to the handler;
        # test_scenario takes no expected-behavior argument.
        custom_expected = gr.Textbox(label="Expected Behavior", lines=2)
        custom_response = gr.Textbox(label="Model Response", lines=5)
        custom_eval_btn = gr.Button("Evaluate", variant="primary")
        custom_result = gr.Markdown()

        # An index one past the last built-in scenario makes test_scenario
        # use the custom context and query.
        custom_idx = gr.State(len(TEST_SCENARIOS))
        custom_eval_btn.click(
            _result_markdown,
            inputs=[custom_idx, custom_ctx, custom_qry, custom_response],
            outputs=[custom_result],
        )

    with gr.Tab("About"):
        gr.Markdown("""
## About This Tool
This validator is inspired by the **STALE** benchmark from HKUST NLP Group, which tests whether LLM agents
can detect when their memories are no longer valid through *implicit conflict* detection.
### Three Dimensions Tested:
1. **State Resolution** - Detecting that a prior belief is outdated
2. **Premise Resistance** - Rejecting queries that falsely presuppose a stale state
3. **Implicit Policy Adaptation** - Proactively applying updated states in downstream behavior
### Why This Matters:
Real-world agents must handle memory invalidation without explicit negation. The STALE paper found
that even frontier models achieve only ~55% accuracy on these tasks.
### Reference:
> STALE: Can LLM Agents Know When Their Memories Are No Longer Valid?
> Hanxiang Chao et al., HKUST NLP Group
> arXiv:2605.06527 (2026)
""")


if __name__ == "__main__":
    demo.launch()