Spaces:

O96a
/

exp-006-stale-memory-validator

Sleeping

App Files Files Community

exp-006-stale-memory-validator / app.py

O96a

Upload folder using huggingface_hub

691e11f verified 24 days ago

raw

history blame contribute delete

7.99 kB

	"""
	STALE-inspired Memory Conflict Validator
	Tests LLM ability to detect implicitly invalidated memories
	"""

	import gradio as gr
	import re

	# Lazily created HTTP client: nothing is imported or constructed for it at
	# module import time; get_llm() fills this in on first use.
	llm_client = None


	def get_llm():
	    """Return the shared ``requests.Session``, creating it on the first call.

	    The session is stored in the module-level ``llm_client`` so later calls
	    hand back the same object.
	    """
	    global llm_client
	    if llm_client is not None:
	        return llm_client

	    # Deferred import: requests is only loaded once a client is requested.
	    import requests
	    llm_client = requests.Session()
	    return llm_client

	# Built-in test scenarios inspired by the STALE benchmark.
	# Each row below supplies its values in the order given by _SCENARIO_FIELDS.
	_SCENARIO_FIELDS = ("name", "context", "query", "expected_behavior", "dimension")

	_SCENARIO_ROWS = (
	    (
	        "Location Change (Implicit)",
	        "User mentioned they moved from Berlin to Paris last month. They said they love the local cafes.",
	        "What's your favorite Berlin cafe?",
	        "Reject premise - user is in Paris now, not Berlin",
	        "premise_resistance",
	    ),
	    (
	        "Preference Update",
	        "User used to be vegetarian but started eating meat again last week after a doctor's recommendation.",
	        "What vegetarian restaurants do you recommend?",
	        "Acknowledge changed preference - no longer vegetarian",
	        "state_resolution",
	    ),
	    (
	        "Project Completion",
	        "User was working on Project Alpha. Yesterday they announced it shipped successfully.",
	        "How is Project Alpha going? Still in development?",
	        "Correct - project shipped, no longer in development",
	        "implicit_policy_adaptation",
	    ),
	    (
	        "Job Change",
	        "User left their job at TechCorp in January. They started at StartupXYZ in February.",
	        "Do you still have access to the TechCorp internal wiki?",
	        "Reject - no longer employed at TechCorp",
	        "premise_resistance",
	    ),
	    (
	        "Living Situation",
	        "User lived alone for years. Last month they got married and moved in with their partner.",
	        "Any tips for living alone?",
	        "Acknowledge change - no longer living alone",
	        "state_resolution",
	    ),
	)

	# One dict per scenario, keyed by the field names above (same key order as rows).
	TEST_SCENARIOS = [dict(zip(_SCENARIO_FIELDS, row)) for row in _SCENARIO_ROWS]

	def _count_phrases(text, phrases):
	    """Count how many of ``phrases`` occur in ``text`` as whole words.

	    Each phrase is matched with word boundaries on both ends, and any run of
	    whitespace is accepted between the words of a multi-word phrase.
	    """
	    hits = 0
	    for phrase in phrases:
	        pattern = r"\b" + r"\s+".join(re.escape(word) for word in phrase.split()) + r"\b"
	        if re.search(pattern, text):
	            hits += 1
	    return hits


	def evaluate_response(context, query, response, expected):
	    """Classify a model response with a simple keyword heuristic.

	    Args:
	        context: Scenario context. Accepted for interface compatibility; the
	            heuristic does not read it.
	        query: User query. Accepted for interface compatibility; not read.
	        response: The model's answer to evaluate. ``None`` is treated as "".
	        expected: Expected-behavior text. Accepted for interface
	            compatibility; not read.

	    Returns:
	        One of three verdict strings: PASS when at least one stale-memory
	        indicator is present and no premise-acceptance phrase is, FAIL when
	        any premise-acceptance phrase is present, UNCLEAR otherwise.
	    """
	    response_lower = (response or "").lower()

	    # Phrases suggesting the response acknowledged that something changed.
	    stale_indicators = (
	        "used to", "previously", "before", "no longer",
	        "moved", "changed", "now", "recently", "updated",
	    )

	    # Phrases suggesting the response went along with the stale premise.
	    premise_failures = (
	        "my favorite", "i recommend", "i like", "i prefer",
	    )

	    # Whole-word matching: plain substring tests would count "now" inside
	    # "know" or "changed" inside "unchanged" as stale-memory indicators.
	    stale_score = _count_phrases(response_lower, stale_indicators)
	    failure_score = _count_phrases(response_lower, premise_failures)

	    if failure_score > 0:
	        return "❌ FAIL - Accepted stale premise"
	    if stale_score > 0:
	        return "✅ PASS - Correctly identified stale memory"
	    return "⚠️ UNCLEAR - Ambiguous handling"

	def test_scenario(scenario_idx, custom_context, custom_query, model_response):
	"""Run a memory conflict test"""
	if scenario_idx < len(TEST_SCENARIOS):
	scenario = TEST_SCENARIOS[scenario_idx]
	context = scenario["context"]
	query = scenario["query"]
	expected = scenario["expected_behavior"]
	else:
	context = custom_context
	query = custom_query
	expected = "Custom scenario - evaluate manually"

	if not model_response.strip():
	return "Please provide a model response to evaluate", "", ""

	result = evaluate_response(context, query, model_response, expected)

	output = f"""## Test Result: {result}

	Context: {context}

	Query: {query}

	Expected Behavior: {expected}

	Model Response:
	{model_response}

	---
	This test is inspired by the STALE benchmark for detecting implicit memory conflicts in LLM agents.
	"""

	return output, context, query

	def get_scenario_details(idx):
	    """Return ``(context, query, expected_behavior)`` for a built-in scenario.

	    Args:
	        idx: Position of the scenario in TEST_SCENARIOS.

	    Returns:
	        The three text fields of the selected scenario, or three empty
	        strings when ``idx`` is not a valid position (None, non-integer,
	        negative, or past the end of the list).
	    """
	    # Reject anything that is not a real in-range position: None would raise
	    # on comparison and a negative index would wrap around to the list's end.
	    if not isinstance(idx, int) or not 0 <= idx < len(TEST_SCENARIOS):
	        return "", "", ""

	    scenario = TEST_SCENARIOS[idx]
	    return scenario["context"], scenario["query"], scenario["expected_behavior"]

	# Build UI
	def _evaluate_to_markdown(scenario_idx, custom_context, custom_query, model_response):
	    """Run test_scenario and return only its Markdown report.

	    test_scenario returns a (report, context, query) tuple, while each
	    Evaluate button below targets a single Markdown component. Passing the
	    whole tuple to that one output would not render as a report, so only the
	    first item is forwarded. The context/query items are deliberately not
	    written back to the textboxes: on an empty response they are blank and
	    would wipe what the user sees or typed.
	    """
	    return test_scenario(scenario_idx, custom_context, custom_query, model_response)[0]


	with gr.Blocks(title="Implicit Memory Conflict Validator") as demo:
	    gr.Markdown("# 🧠 Implicit Memory Conflict Validator")
	    gr.Markdown("Test if LLM agents correctly detect when memories become invalid without explicit negation.")

	    with gr.Tab("Quick Test"):
	        # Dropdown values are positions in TEST_SCENARIOS; labels are the names.
	        scenario_select = gr.Dropdown(
	            choices=[(s["name"], i) for i, s in enumerate(TEST_SCENARIOS)],
	            value=0,
	            label="Select Test Scenario"
	        )

	        with gr.Row():
	            with gr.Column():
	                ctx_display = gr.Textbox(label="Context", lines=3, interactive=False)
	                query_display = gr.Textbox(label="User Query", lines=1, interactive=False)
	                expected_display = gr.Textbox(label="Expected Behavior", lines=2, interactive=False)

	            with gr.Column():
	                model_response = gr.Textbox(
	                    label="Paste Model Response Here",
	                    lines=5,
	                    placeholder="Paste the LLM's response to the query..."
	                )

	        evaluate_btn = gr.Button("Evaluate Response", variant="primary")
	        result_output = gr.Markdown()

	        def update_scenario(idx):
	            """Refresh the read-only scenario fields for the selected entry."""
	            return get_scenario_details(idx)

	        scenario_select.change(
	            update_scenario,
	            inputs=[scenario_select],
	            outputs=[ctx_display, query_display, expected_display]
	        )

	        # Initialize first scenario
	        demo.load(
	            lambda: get_scenario_details(0),
	            outputs=[ctx_display, query_display, expected_display]
	        )

	        evaluate_btn.click(
	            _evaluate_to_markdown,
	            inputs=[scenario_select, ctx_display, query_display, model_response],
	            outputs=[result_output]
	        )

	    with gr.Tab("Custom Test"):
	        gr.Markdown("Create your own memory conflict scenario:")
	        custom_ctx = gr.Textbox(label="Context (what the agent knows)", lines=4)
	        custom_qry = gr.Textbox(label="User Query", lines=2)
	        # NOTE(review): this field is displayed but never passed to a handler;
	        # the report always shows the fixed custom-scenario expectation text.
	        custom_expected = gr.Textbox(label="Expected Behavior", lines=2)
	        custom_response = gr.Textbox(label="Model Response", lines=5)
	        custom_eval_btn = gr.Button("Evaluate", variant="primary")
	        custom_result = gr.Markdown()

	        # An index equal to len(TEST_SCENARIOS) is past the built-in list, so
	        # test_scenario falls through to the custom context/query.
	        custom_eval_btn.click(
	            _evaluate_to_markdown,
	            inputs=[gr.State(len(TEST_SCENARIOS)), custom_ctx, custom_qry, custom_response],
	            outputs=[custom_result]
	        )

	    with gr.Tab("About"):
	        gr.Markdown("""
	## About This Tool

	This validator is inspired by the STALE benchmark from HKUST NLP Group, which tests whether LLM agents
	can detect when their memories are no longer valid through implicit conflict detection.

	### Three Dimensions Tested:

	1. State Resolution - Detecting that a prior belief is outdated
	2. Premise Resistance - Rejecting queries that falsely presuppose a stale state
	3. Implicit Policy Adaptation - Proactively applying updated states in downstream behavior

	### Why This Matters:

	Real-world agents must handle memory invalidation without explicit negation. The STALE paper found
	that even frontier models achieve only ~55% accuracy on these tasks.

	### Reference:

	> STALE: Can LLM Agents Know When Their Memories Are No Longer Valid?
	> Hanxiang Chao et al., HKUST NLP Group
	> arXiv:2605.06527 (2026)
	""")

	# Start the app only when this file is executed directly.
	if __name__ == "__main__":
	    demo.launch()