# Ryan2219's picture
# Upload 70 files
# e1ced8e verified
"""In-session crop cache β€” avoids redundant Gemini API calls for identical crops.
Stored in ``st.session_state`` so it persists across questions within a single
Streamlit session, but is discarded when the session ends.
Matching strategy:
- **Exact match** on ``(page_num, crop_instruction)`` is the primary lookup.
- **Fuzzy match** with a simple normalized overlap score handles cases where
the planner rephrases slightly (e.g., "Crop the gymnasium area" vs
"Crop gymnasium area showing diffusers"). Only matches above a high
threshold (``CropCache.FUZZY_THRESHOLD``, currently 0.70) are considered hits
to avoid false positives.
"""
from __future__ import annotations
import logging
import re
from dataclasses import dataclass, field
from state import ImageRef
logger = logging.getLogger(__name__)
@dataclass
class CachedCrop:
    """A cached crop entry with its original instruction and result."""
    # PDF page number the crop was taken from (part of the exact-match key).
    page_num: int
    # Verbatim planner instruction that produced the crop (exact-match key).
    crop_instruction: str
    # Human-readable label attached to the crop.
    label: str
    # Resulting cropped image reference (project type from ``state``;
    # accessed as a mapping with an "id" key elsewhere in this module).
    image_ref: ImageRef
    # Normalised token set for fuzzy matching (computed once at insert time)
    _tokens: frozenset[str] = field(default_factory=frozenset, repr=False)
def _normalise_tokens(text: str) -> frozenset[str]:
"""Lowercase, strip punctuation, split into a token set."""
cleaned = re.sub(r"[^a-z0-9\s]", "", text.lower())
return frozenset(cleaned.split())
def _token_overlap(a: frozenset[str], b: frozenset[str]) -> float:
"""Jaccard-style overlap: |intersection| / |union|."""
if not a or not b:
return 0.0
return len(a & b) / len(a | b)
class CropCache:
    """Session-scoped crop cache: (page, instruction) → ImageRef.

    Reads are safe under CPython's GIL (plain dict lookups); all writes
    happen on Streamlit's single main thread, so no extra locking is needed.
    """

    # Minimum token-overlap score for a fuzzy hit.  Chosen so that light
    # rephrasing (e.g. dropping "the"/"all", ~0.78 overlap) still matches,
    # while genuinely different instructions (~0.06-0.42) do not.
    FUZZY_THRESHOLD = 0.70

    def __init__(self) -> None:
        # Exact-match index: (page_num, instruction) → CachedCrop.
        self._exact: dict[tuple[int, str], CachedCrop] = {}
        # The same CachedCrop objects in insertion order, for fuzzy scans.
        self._entries: list[CachedCrop] = []
        self._hit_count = 0
        self._miss_count = 0

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def lookup(self, page_num: int, crop_instruction: str) -> ImageRef | None:
        """Return the cached ImageRef for a matching crop, else None.

        An exact (page, instruction) hit wins; failing that, the instruction
        is fuzzy-matched by token overlap against entries on the same page.
        """
        exact_key = (page_num, crop_instruction)

        # 1. Exact lookup.
        cached = self._exact.get(exact_key)
        if cached is not None:
            self._hit_count += 1
            logger.info(
                "CropCache HIT (exact) page=%d instruction='%s' → %s",
                page_num, crop_instruction[:60], cached.image_ref["id"],
            )
            return cached.image_ref

        # 2. Fuzzy lookup, restricted to entries recorded for the same page.
        query_tokens = _normalise_tokens(crop_instruction)
        same_page = [e for e in self._entries if e.page_num == page_num]
        if same_page:
            # max() returns the first maximal entry, matching the original
            # first-wins tie-breaking of a strict ">" comparison loop.
            best = max(
                same_page,
                key=lambda e: _token_overlap(query_tokens, e._tokens),
            )
            best_score = _token_overlap(query_tokens, best._tokens)
            if best_score >= self.FUZZY_THRESHOLD:
                self._hit_count += 1
                logger.info(
                    "CropCache HIT (fuzzy %.2f) page=%d instruction='%s' → %s",
                    best_score, page_num, crop_instruction[:60],
                    best.image_ref["id"],
                )
                return best.image_ref

        self._miss_count += 1
        return None

    def register(
        self,
        page_num: int,
        crop_instruction: str,
        label: str,
        image_ref: ImageRef,
        *,
        is_fallback: bool = False,
    ) -> None:
        """Record a successful crop so identical/similar requests reuse it.

        Parameters
        ----------
        is_fallback
            True when the crop is a full-page fallback (Gemini failed to
            crop).  Such results are deliberately NOT cached because they
            don't represent a useful targeted crop.
        """
        if is_fallback:
            logger.debug(
                "CropCache SKIP (fallback) page=%d instruction='%s'",
                page_num, crop_instruction[:60],
            )
            return

        exact_key = (page_num, crop_instruction)
        if exact_key in self._exact:
            # Duplicate registration — the first entry wins.
            return

        new_entry = CachedCrop(
            page_num=page_num,
            crop_instruction=crop_instruction,
            label=label,
            image_ref=image_ref,
            _tokens=_normalise_tokens(crop_instruction),
        )
        self._exact[exact_key] = new_entry
        self._entries.append(new_entry)
        logger.info(
            "CropCache REGISTER page=%d instruction='%s' → %s",
            page_num, crop_instruction[:60], image_ref["id"],
        )

    @property
    def size(self) -> int:
        """Number of cached crops."""
        return len(self._entries)

    @property
    def stats(self) -> str:
        """One-line human-readable hit/miss summary."""
        attempts = self._hit_count + self._miss_count
        rate = (self._hit_count / attempts * 100) if attempts > 0 else 0
        return (
            f"CropCache: {self.size} entries, "
            f"{self._hit_count} hits / {self._miss_count} misses "
            f"({rate:.0f}% hit rate)"
        )

    def clear(self) -> None:
        """Drop every entry and reset counters (e.g. when a new PDF is loaded)."""
        self._exact.clear()
        self._entries.clear()
        self._hit_count = 0
        self._miss_count = 0