| """ | |
| Simple in-memory cache with TTL for V3 web scraping API. | |
| """ | |
| import time | |
| from threading import Lock | |
| from typing import Any | |
| from app.core.logging import get_logger | |
| logger = get_logger(__name__) | |
| class SimpleCache: | |
| """Thread-safe in-memory cache with TTL-based expiration.""" | |
| def __init__(self, ttl_seconds: int = 3600, max_size: int = 1000): | |
| """ | |
| Initialize cache with TTL and max size. | |
| Args: | |
| ttl_seconds: Time-to-live for cache entries in seconds (default: 1 hour) | |
| max_size: Maximum number of entries to store (default: 1000) | |
| """ | |
| self._cache: dict[str, dict[str, Any]] = {} | |
| self._lock = Lock() | |
| self._ttl = ttl_seconds | |
| self._max_size = max_size | |
| self._hits = 0 | |
| self._misses = 0 | |
| logger.info(f"Cache initialized with TTL={ttl_seconds}s, max_size={max_size}") | |
| def get(self, key: str) -> dict[str, Any] | None: | |
| """ | |
| Get cached content for key. | |
| Args: | |
| key: Cache key (typically a URL) | |
| Returns: | |
| Cached data if found and not expired, None otherwise | |
| """ | |
| with self._lock: | |
| if key not in self._cache: | |
| self._misses += 1 | |
| return None | |
| entry = self._cache[key] | |
| expiry_time = entry["expiry"] | |
| # Check if expired | |
| if time.time() > expiry_time: | |
| del self._cache[key] | |
| self._misses += 1 | |
| logger.debug(f"Cache expired for key: {key[:50]}...") | |
| return None | |
| self._hits += 1 | |
| logger.debug(f"Cache hit for key: {key[:50]}...") | |
| return entry["data"] | |
    def set(self, key: str, data: dict[str, Any]) -> None:
        """
        Cache content with TTL.

        Args:
            key: Cache key (typically a URL)
            data: Data to cache
        """
        with self._lock:
            # Enforce max size by evicting the entry with the earliest expiry
            # (the oldest, since all entries share the same TTL). Skip eviction
            # when overwriting an existing key, as the cache does not grow then.
            if key not in self._cache and len(self._cache) >= self._max_size:
                oldest_key = min(
                    self._cache.keys(), key=lambda k: self._cache[k]["expiry"]
                )
                del self._cache[oldest_key]
                logger.debug(f"Cache full, removed oldest entry: {oldest_key[:50]}...")

            expiry_time = time.time() + self._ttl
            self._cache[key] = {
                "data": data,
                "expiry": expiry_time,
                "created": time.time(),
            }
            logger.debug(f"Cached key: {key[:50]}...")
    def clear_expired(self) -> int:
        """
        Remove all expired entries from cache.

        Returns:
            Number of entries removed
        """
        with self._lock:
            current_time = time.time()
            expired_keys = [
                key
                for key, entry in self._cache.items()
                if current_time > entry["expiry"]
            ]
            for key in expired_keys:
                del self._cache[key]
            if expired_keys:
                logger.info(f"Cleared {len(expired_keys)} expired cache entries")
            return len(expired_keys)

    def clear_all(self) -> None:
        """Clear all cache entries."""
        with self._lock:
            count = len(self._cache)
            self._cache.clear()
            self._hits = 0
            self._misses = 0
            logger.info(f"Cleared all {count} cache entries")
    def stats(self) -> dict[str, int | float]:
        """
        Get cache statistics.

        Returns:
            Dictionary with cache metrics (hit_rate is a percentage, so
            the value type is int | float rather than int)
        """
        with self._lock:
            total_requests = self._hits + self._misses
            hit_rate = (
                (self._hits / total_requests * 100) if total_requests > 0 else 0.0
            )
            return {
                "size": len(self._cache),
                "max_size": self._max_size,
                "hits": self._hits,
                "misses": self._misses,
                "hit_rate": round(hit_rate, 2),
                "ttl_seconds": self._ttl,
            }

# Global cache instance for scraped content
scraping_cache = SimpleCache(ttl_seconds=3600, max_size=1000)
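
# A minimal usage sketch (illustrative, not part of the original module):
# exercises get/set/stats against the module-level scraping_cache. The URL
# key and payload below are hypothetical placeholders.
if __name__ == "__main__":
    url = "https://example.com/page"
    assert scraping_cache.get(url) is None  # first lookup misses
    scraping_cache.set(url, {"html": "<html></html>", "status": 200})
    assert scraping_cache.get(url) is not None  # second lookup hits
    print(scraping_cache.stats())  # e.g. size=1, hits=1, misses=1, hit_rate=50.0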