""" STALE-inspired Memory Conflict Validator Tests LLM ability to detect implicitly invalidated memories """ import gradio as gr import re # Lazy loading - no model imports at module level llm_client = None def get_llm(): global llm_client if llm_client is None: import requests llm_client = requests.Session() return llm_client # Test scenarios inspired by STALE benchmark TEST_SCENARIOS = [ { "name": "Location Change (Implicit)", "context": "User mentioned they moved from Berlin to Paris last month. They said they love the local cafes.", "query": "What's your favorite Berlin cafe?", "expected_behavior": "Reject premise - user is in Paris now, not Berlin", "dimension": "premise_resistance" }, { "name": "Preference Update", "context": "User used to be vegetarian but started eating meat again last week after a doctor's recommendation.", "query": "What vegetarian restaurants do you recommend?", "expected_behavior": "Acknowledge changed preference - no longer vegetarian", "dimension": "state_resolution" }, { "name": "Project Completion", "context": "User was working on Project Alpha. Yesterday they announced it shipped successfully.", "query": "How is Project Alpha going? Still in development?", "expected_behavior": "Correct - project shipped, no longer in development", "dimension": "implicit_policy_adaptation" }, { "name": "Job Change", "context": "User left their job at TechCorp in January. They started at StartupXYZ in February.", "query": "Do you still have access to the TechCorp internal wiki?", "expected_behavior": "Reject - no longer employed at TechCorp", "dimension": "premise_resistance" }, { "name": "Living Situation", "context": "User lived alone for years. Last month they got married and moved in with their partner.", "query": "Any tips for living alone?", "expected_behavior": "Acknowledge change - no longer living alone", "dimension": "state_resolution" } ] def evaluate_response(context, query, response, expected): """Simple heuristic evaluation of response quality""" response_lower = response.lower() # Check for stale memory indicators stale_indicators = [ "used to", "previously", "before", "no longer", "moved", "changed", "now", "recently", "updated" ] # Check for premise acceptance (failure) premise_failures = [ "my favorite", "i recommend", "i like", "i prefer" ] stale_score = sum(1 for ind in stale_indicators if ind in response_lower) failure_score = sum(1 for fail in premise_failures if fail in response_lower) if stale_score > 0 and failure_score == 0: return "✅ PASS - Correctly identified stale memory" elif failure_score > 0: return "❌ FAIL - Accepted stale premise" else: return "⚠️ UNCLEAR - Ambiguous handling" def test_scenario(scenario_idx, custom_context, custom_query, model_response): """Run a memory conflict test""" if scenario_idx < len(TEST_SCENARIOS): scenario = TEST_SCENARIOS[scenario_idx] context = scenario["context"] query = scenario["query"] expected = scenario["expected_behavior"] else: context = custom_context query = custom_query expected = "Custom scenario - evaluate manually" if not model_response.strip(): return "Please provide a model response to evaluate", "", "" result = evaluate_response(context, query, model_response, expected) output = f"""## Test Result: {result} **Context:** {context} **Query:** {query} **Expected Behavior:** {expected} **Model Response:** {model_response} --- *This test is inspired by the STALE benchmark for detecting implicit memory conflicts in LLM agents.* """ return output, context, query def get_scenario_details(idx): """Return scenario details for the dropdown""" if idx < len(TEST_SCENARIOS): s = TEST_SCENARIOS[idx] return s["context"], s["query"], s["expected_behavior"] return "", "", "" # Build UI with gr.Blocks(title="Implicit Memory Conflict Validator") as demo: gr.Markdown("# 🧠 Implicit Memory Conflict Validator") gr.Markdown("Test if LLM agents correctly detect when memories become invalid without explicit negation.") with gr.Tab("Quick Test"): scenario_select = gr.Dropdown( choices=[(s["name"], i) for i, s in enumerate(TEST_SCENARIOS)], value=0, label="Select Test Scenario" ) with gr.Row(): with gr.Column(): ctx_display = gr.Textbox(label="Context", lines=3, interactive=False) query_display = gr.Textbox(label="User Query", lines=1, interactive=False) expected_display = gr.Textbox(label="Expected Behavior", lines=2, interactive=False) with gr.Column(): model_response = gr.Textbox( label="Paste Model Response Here", lines=5, placeholder="Paste the LLM's response to the query..." ) evaluate_btn = gr.Button("Evaluate Response", variant="primary") result_output = gr.Markdown() def update_scenario(idx): ctx, qry, exp = get_scenario_details(idx) return ctx, qry, exp scenario_select.change( update_scenario, inputs=[scenario_select], outputs=[ctx_display, query_display, expected_display] ) # Initialize first scenario demo.load( lambda: get_scenario_details(0), outputs=[ctx_display, query_display, expected_display] ) evaluate_btn.click( test_scenario, inputs=[scenario_select, ctx_display, query_display, model_response], outputs=[result_output] ) with gr.Tab("Custom Test"): gr.Markdown("Create your own memory conflict scenario:") custom_ctx = gr.Textbox(label="Context (what the agent knows)", lines=4) custom_qry = gr.Textbox(label="User Query", lines=2) custom_expected = gr.Textbox(label="Expected Behavior", lines=2) custom_response = gr.Textbox(label="Model Response", lines=5) custom_eval_btn = gr.Button("Evaluate", variant="primary") custom_result = gr.Markdown() custom_eval_btn.click( test_scenario, inputs=[gr.State(len(TEST_SCENARIOS)), custom_ctx, custom_qry, custom_response], outputs=[custom_result] ) with gr.Tab("About"): gr.Markdown(""" ## About This Tool This validator is inspired by the **STALE** benchmark from HKUST NLP Group, which tests whether LLM agents can detect when their memories are no longer valid through *implicit conflict* detection. ### Three Dimensions Tested: 1. **State Resolution** - Detecting that a prior belief is outdated 2. **Premise Resistance** - Rejecting queries that falsely presuppose a stale state 3. **Implicit Policy Adaptation** - Proactively applying updated states in downstream behavior ### Why This Matters: Real-world agents must handle memory invalidation without explicit negation. The STALE paper found that even frontier models achieve only ~55% accuracy on these tasks. ### Reference: > STALE: Can LLM Agents Know When Their Memories Are No Longer Valid? > Hanxiang Chao et al., HKUST NLP Group > arXiv:2605.06527 (2026) """) if __name__ == "__main__": demo.launch()