LocalAI / backend /python /mlx /test_mlx_cache.py

Upload folder using huggingface_hub

0f07ba7 verified 20 days ago

18.1 kB

	"""
	Comprehensive unit tests for ThreadSafeLRUPromptCache.

	Tests all cache operation modes:
	- Exact match
	- Shorter prefix match
	- Longer prefix match (with trimming)
	- No match
	- LRU eviction
	- Reference counting
	- Multi-model namespacing
	- Thread safety with data integrity verification
	"""
	import unittest
	import concurrent.futures
	import threading
	import copy
	from mlx_cache import ThreadSafeLRUPromptCache


	class TestCacheExactMatch(unittest.TestCase):
	"""Tests for exact match cache behavior."""

	def setUp(self):
	self.cache = ThreadSafeLRUPromptCache(max_size=10)

	def test_exact_match_returns_cache_and_empty_remaining(self):
	"""Exact match should return the cache with no remaining tokens."""
	tokens = [1, 2, 3, 4, 5]
	mock_cache = ["kv_cache_data"]

	self.cache.insert_cache("model1", tokens, mock_cache)
	result_cache, remaining = self.cache.fetch_nearest_cache("model1", tokens)

	self.assertEqual(result_cache, mock_cache)
	self.assertEqual(remaining, [])

	def test_exact_match_extracts_and_removes_from_cache(self):
	"""Fetching exact match with count=1 should remove entry from cache."""
	tokens = [1, 2, 3]
	self.cache.insert_cache("model1", tokens, ["cache"])

	self.assertEqual(len(self.cache), 1)

	# First fetch extracts the entry
	self.cache.fetch_nearest_cache("model1", tokens)

	# Cache should now be empty
	self.assertEqual(len(self.cache), 0)

	# Second fetch should return None (no match)
	result_cache, remaining = self.cache.fetch_nearest_cache("model1", tokens)
	self.assertIsNone(result_cache)
	self.assertEqual(remaining, tokens)


	class TestCacheShorterPrefix(unittest.TestCase):
	"""Tests for shorter prefix match behavior."""

	def setUp(self):
	self.cache = ThreadSafeLRUPromptCache(max_size=10)

	def test_shorter_prefix_returns_cache_with_remaining_tokens(self):
	"""When cached prefix is shorter, return cache and remaining suffix."""
	short_tokens = [1, 2, 3]
	long_tokens = [1, 2, 3, 4, 5, 6]
	mock_cache = ["prefix_cache"]

	self.cache.insert_cache("model1", short_tokens, mock_cache)
	result_cache, remaining = self.cache.fetch_nearest_cache("model1", long_tokens)

	self.assertEqual(result_cache, mock_cache)
	self.assertEqual(remaining, [4, 5, 6])

	def test_shorter_prefix_correct_remaining_calculation(self):
	"""Verify remaining tokens are calculated correctly for various prefix lengths."""
	# Note: Single-token prefixes ([1] -> [1,2,3]) are deliberately not matched
	# to allow longer cached sequences to be preferred for trimming.
	# This matches upstream mlx_lm/server.py behavior.
	test_cases = [
	# (cached_tokens, requested_tokens, expected_remaining)
	([1, 2], [1, 2, 3, 4, 5], [3, 4, 5]),
	([10, 20, 30, 40], [10, 20, 30, 40, 50], [50]),
	]

	for cached, requested, expected_remaining in test_cases:
	with self.subTest(cached=cached, requested=requested):
	cache = ThreadSafeLRUPromptCache(max_size=10)
	cache.insert_cache("model", cached, ["cache"])
	result_cache, remaining = cache.fetch_nearest_cache("model", requested)

	self.assertIsNotNone(result_cache)
	self.assertEqual(remaining, expected_remaining)

	def test_single_token_prefix_not_matched(self):
	"""Single-token prefixes are not matched (by design, matches upstream).

	This allows longer cached sequences to be preferred for trimming,
	which provides better KV cache reuse. Single-token caches are rare
	in practice since real prompts with chat templates are many tokens.
	"""
	cache = ThreadSafeLRUPromptCache(max_size=10)
	cache.insert_cache("model", [1], ["cache"])

	result_cache, remaining = cache.fetch_nearest_cache("model", [1, 2, 3])

	# Single-token prefix is NOT matched
	self.assertIsNone(result_cache)
	self.assertEqual(remaining, [1, 2, 3])


	class TestCacheLongerPrefix(unittest.TestCase):
	"""Tests for longer prefix match behavior (trimming)."""

	def setUp(self):
	# Track trim calls for verification
	self.trim_calls = []

	def mock_can_trim(cache):
	return True

	def mock_trim(cache, num_to_trim):
	self.trim_calls.append(num_to_trim)
	# Simulate trimming by modifying the cache
	cache.append(f"trimmed_{num_to_trim}")

	self.cache = ThreadSafeLRUPromptCache(
	max_size=10,
	can_trim_fn=mock_can_trim,
	trim_fn=mock_trim,
	)

	def test_longer_prefix_triggers_trim(self):
	"""When cached sequence is longer, should trim to match requested prefix."""
	long_tokens = [1, 2, 3, 4, 5]
	short_tokens = [1, 2, 3]

	self.cache.insert_cache("model1", long_tokens, ["original_cache"])
	result_cache, remaining = self.cache.fetch_nearest_cache("model1", short_tokens)

	# Should have called trim
	self.assertTrue(len(self.trim_calls) > 0, "trim_fn should have been called")
	# Result should be a trimmed copy, not the original
	self.assertIn("trimmed_", str(result_cache))

	def test_longer_prefix_without_trim_fn_returns_no_match(self):
	"""Without trim functions, longer prefix should not match."""
	cache_no_trim = ThreadSafeLRUPromptCache(max_size=10)

	long_tokens = [1, 2, 3, 4, 5]
	short_tokens = [1, 2, 3]

	cache_no_trim.insert_cache("model1", long_tokens, ["cache"])
	result_cache, remaining = cache_no_trim.fetch_nearest_cache("model1", short_tokens)

	# Without trim_fn, should return no match
	self.assertIsNone(result_cache)
	self.assertEqual(remaining, short_tokens)

	def test_longer_prefix_can_trim_false_returns_no_match(self):
	"""When can_trim_fn returns False, should not attempt trim."""
	cache = ThreadSafeLRUPromptCache(
	max_size=10,
	can_trim_fn=lambda c: False,
	trim_fn=lambda c, n: None,
	)

	cache.insert_cache("model1", [1, 2, 3, 4, 5], ["cache"])
	result_cache, remaining = cache.fetch_nearest_cache("model1", [1, 2, 3])

	self.assertIsNone(result_cache)
	self.assertEqual(remaining, [1, 2, 3])


	class TestCacheNoMatch(unittest.TestCase):
	"""Tests for no match behavior."""

	def setUp(self):
	self.cache = ThreadSafeLRUPromptCache(max_size=10)

	def test_empty_cache_returns_none(self):
	"""Empty cache should return None and all tokens as remaining."""
	tokens = [1, 2, 3]
	result_cache, remaining = self.cache.fetch_nearest_cache("model1", tokens)

	self.assertIsNone(result_cache)
	self.assertEqual(remaining, tokens)

	def test_different_prefix_returns_none(self):
	"""Tokens with different prefix should not match."""
	self.cache.insert_cache("model1", [1, 2, 3], ["cache"])

	# Completely different tokens
	result_cache, remaining = self.cache.fetch_nearest_cache("model1", [4, 5, 6])

	self.assertIsNone(result_cache)
	self.assertEqual(remaining, [4, 5, 6])

	def test_partial_prefix_mismatch_returns_none(self):
	"""Tokens that diverge mid-sequence should not match."""
	self.cache.insert_cache("model1", [1, 2, 3], ["cache"])

	# Same start but diverges
	result_cache, remaining = self.cache.fetch_nearest_cache("model1", [1, 2, 99])

	self.assertIsNone(result_cache)
	self.assertEqual(remaining, [1, 2, 99])

	def test_wrong_model_returns_none(self):
	"""Different model key should not match."""
	self.cache.insert_cache("model1", [1, 2, 3], ["cache"])

	result_cache, remaining = self.cache.fetch_nearest_cache("model2", [1, 2, 3])

	self.assertIsNone(result_cache)
	self.assertEqual(remaining, [1, 2, 3])


	class TestCacheLRUEviction(unittest.TestCase):
	"""Tests for LRU eviction behavior."""

	def setUp(self):
	self.cache = ThreadSafeLRUPromptCache(max_size=3)

	def test_evicts_oldest_when_full(self):
	"""Should evict least recently used entry when capacity exceeded."""
	self.cache.insert_cache("model", [1], ["cache1"])
	self.cache.insert_cache("model", [2], ["cache2"])
	self.cache.insert_cache("model", [3], ["cache3"])

	self.assertEqual(len(self.cache), 3)

	# Insert 4th entry - should evict [1]
	self.cache.insert_cache("model", [4], ["cache4"])

	self.assertEqual(len(self.cache), 3)

	# [1] should be evicted
	result, _ = self.cache.fetch_nearest_cache("model", [1])
	self.assertIsNone(result)

	# [2], [3], [4] should still exist
	for tokens in [[2], [3], [4]]:
	# Re-insert since fetch extracts
	self.cache.insert_cache("model", tokens, [f"cache{tokens[0]}"])

	result2, _ = self.cache.fetch_nearest_cache("model", [2])
	self.assertIsNotNone(result2)

	def test_access_updates_lru_order(self):
	"""Accessing an entry should move it to most recently used."""
	self.cache.insert_cache("model", [1], ["cache1"])
	self.cache.insert_cache("model", [2], ["cache2"])
	self.cache.insert_cache("model", [3], ["cache3"])

	# Access [1] to make it most recently used
	cache1, _ = self.cache.fetch_nearest_cache("model", [1])
	# Re-insert it (simulating normal usage pattern)
	self.cache.insert_cache("model", [1], cache1)

	# Now insert two more entries - should evict [2] then [3], not [1]
	self.cache.insert_cache("model", [4], ["cache4"])
	self.cache.insert_cache("model", [5], ["cache5"])

	# [1] should still exist (was accessed, so not evicted)
	result1, _ = self.cache.fetch_nearest_cache("model", [1])
	self.assertIsNotNone(result1)

	# [2] should be evicted (was oldest after [1] was accessed)
	result2, _ = self.cache.fetch_nearest_cache("model", [2])
	self.assertIsNone(result2)


	class TestCacheReferenceCount(unittest.TestCase):
	"""Tests for reference counting behavior."""

	def setUp(self):
	self.cache = ThreadSafeLRUPromptCache(max_size=10)

	def test_multiple_inserts_increment_count(self):
	"""Inserting same tokens multiple times should increment count."""
	tokens = [1, 2, 3]

	self.cache.insert_cache("model", tokens, ["cache"])
	self.cache.insert_cache("model", tokens, ["cache"])
	self.cache.insert_cache("model", tokens, ["cache"])

	# Should still be one entry (with count=3 internally)
	self.assertEqual(len(self.cache), 1)

	# First two fetches should return copies (count decremented)
	result1, _ = self.cache.fetch_nearest_cache("model", tokens)
	self.assertIsNotNone(result1)

	result2, _ = self.cache.fetch_nearest_cache("model", tokens)
	self.assertIsNotNone(result2)

	# Third fetch extracts the last reference
	result3, _ = self.cache.fetch_nearest_cache("model", tokens)
	self.assertIsNotNone(result3)

	# Fourth fetch should return None (entry fully extracted)
	result4, _ = self.cache.fetch_nearest_cache("model", tokens)
	self.assertIsNone(result4)

	def test_extract_with_high_count_returns_deep_copy(self):
	"""When count > 1, extract should return a deep copy."""
	tokens = [1, 2, 3]
	original_cache = [{"nested": "data"}]

	self.cache.insert_cache("model", tokens, original_cache)
	self.cache.insert_cache("model", tokens, original_cache) # count=2

	result1, _ = self.cache.fetch_nearest_cache("model", tokens)

	# Modify the returned cache
	result1[0]["nested"] = "modified"

	# Second fetch should get unmodified copy
	result2, _ = self.cache.fetch_nearest_cache("model", tokens)
	self.assertEqual(result2[0]["nested"], "data")


	class TestCacheMultiModel(unittest.TestCase):
	"""Tests for multi-model namespacing."""

	def setUp(self):
	self.cache = ThreadSafeLRUPromptCache(max_size=10)

	def test_same_tokens_different_models_are_separate(self):
	"""Same token sequence under different models should be independent."""
	tokens = [1, 2, 3]

	self.cache.insert_cache("model_a", tokens, ["cache_a"])
	self.cache.insert_cache("model_b", tokens, ["cache_b"])

	self.assertEqual(len(self.cache), 2)

	result_a, _ = self.cache.fetch_nearest_cache("model_a", tokens)
	result_b, _ = self.cache.fetch_nearest_cache("model_b", tokens)

	self.assertEqual(result_a, ["cache_a"])
	self.assertEqual(result_b, ["cache_b"])

	def test_eviction_across_models(self):
	"""LRU eviction should work across different models."""
	cache = ThreadSafeLRUPromptCache(max_size=3)

	cache.insert_cache("model_a", [1], ["a1"])
	cache.insert_cache("model_b", [1], ["b1"])
	cache.insert_cache("model_a", [2], ["a2"])

	self.assertEqual(len(cache), 3)

	# Insert 4th - should evict model_a:[1] (oldest)
	cache.insert_cache("model_b", [2], ["b2"])

	result, _ = cache.fetch_nearest_cache("model_a", [1])
	self.assertIsNone(result)


	class TestCacheThreadSafety(unittest.TestCase):
	"""Tests for thread safety with data integrity verification."""

	def test_concurrent_inserts_no_data_loss(self):
	"""Concurrent inserts should not lose data."""
	cache = ThreadSafeLRUPromptCache(max_size=100)
	num_threads = 10
	inserts_per_thread = 20

	def insert_entries(thread_id):
	for i in range(inserts_per_thread):
	tokens = [thread_id, i]
	cache.insert_cache("model", tokens, [f"cache_{thread_id}_{i}"])

	with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
	futures = [executor.submit(insert_entries, tid) for tid in range(num_threads)]
	concurrent.futures.wait(futures)

	# Verify expected number of entries (may be less due to LRU eviction with max_size=100)
	# But should be exactly 100 since we inserted exactly 200 and max_size is 100
	self.assertEqual(len(cache), 100)

	def test_concurrent_fetch_and_insert_no_corruption(self):
	"""Concurrent fetches and inserts should not corrupt data."""
	cache = ThreadSafeLRUPromptCache(max_size=50)
	errors = []
	lock = threading.Lock()

	# Pre-populate with known data
	for i in range(20):
	cache.insert_cache("model", [i], [f"original_{i}"])

	def fetch_and_verify(thread_id):
	try:
	for _ in range(50):
	token_id = thread_id % 20
	result, remaining = cache.fetch_nearest_cache("model", [token_id])

	if result is not None:
	# Verify data integrity
	expected_prefix = f"original_{token_id}"
	if not str(result[0]).startswith("original_"):
	with lock:
	errors.append(f"Corrupted data: {result}")

	# Re-insert to keep cache populated
	cache.insert_cache("model", [token_id], result)

	except Exception as e:
	with lock:
	errors.append(str(e))

	with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
	futures = [executor.submit(fetch_and_verify, tid) for tid in range(10)]
	concurrent.futures.wait(futures)

	self.assertEqual(errors, [], f"Thread safety errors: {errors}")

	def test_concurrent_operations_maintain_cache_bounds(self):
	"""Cache size should never exceed max_size under concurrent operations."""
	max_size = 10
	cache = ThreadSafeLRUPromptCache(max_size=max_size)
	size_violations = []
	lock = threading.Lock()

	def random_operations(thread_id):
	import random
	for i in range(100):
	tokens = [random.randint(0, 50)]
	if random.random() < 0.7:
	cache.insert_cache("model", tokens, [f"cache_{thread_id}_{i}"])
	else:
	cache.fetch_nearest_cache("model", tokens)

	current_size = len(cache)
	if current_size > max_size:
	with lock:
	size_violations.append(current_size)

	with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
	futures = [executor.submit(random_operations, tid) for tid in range(10)]
	concurrent.futures.wait(futures)

	self.assertEqual(size_violations, [], f"Size exceeded max: {size_violations}")
	self.assertLessEqual(len(cache), max_size)


	class TestCacheClear(unittest.TestCase):
	"""Tests for cache clear operation."""

	def setUp(self):
	self.cache = ThreadSafeLRUPromptCache(max_size=10)

	def test_clear_removes_all_entries(self):
	"""Clear should remove all entries."""
	self.cache.insert_cache("model1", [1, 2], ["cache1"])
	self.cache.insert_cache("model2", [3, 4], ["cache2"])
	self.cache.insert_cache("model1", [5, 6], ["cache3"])

	self.assertEqual(len(self.cache), 3)

	self.cache.clear()

	self.assertEqual(len(self.cache), 0)

	def test_clear_allows_new_inserts(self):
	"""After clear, new inserts should work normally."""
	self.cache.insert_cache("model", [1], ["cache1"])
	self.cache.clear()
	self.cache.insert_cache("model", [2], ["cache2"])

	self.assertEqual(len(self.cache), 1)

	result, _ = self.cache.fetch_nearest_cache("model", [2])
	self.assertEqual(result, ["cache2"])


	if __name__ == "__main__":
	unittest.main()