# NOTE(review): the lines below were pasted commit metadata that broke the file's
# syntax; preserved here as a comment.
# Migrate to Ruff for linting/formatting and add comprehensive import tests (commit 29ed661)
"""
Simple in-memory cache with TTL for V3 web scraping API.
"""
import time
from threading import Lock
from typing import Any
from app.core.logging import get_logger
logger = get_logger(__name__)
class SimpleCache:
"""Thread-safe in-memory cache with TTL-based expiration."""
def __init__(self, ttl_seconds: int = 3600, max_size: int = 1000):
"""
Initialize cache with TTL and max size.
Args:
ttl_seconds: Time-to-live for cache entries in seconds (default: 1 hour)
max_size: Maximum number of entries to store (default: 1000)
"""
self._cache: dict[str, dict[str, Any]] = {}
self._lock = Lock()
self._ttl = ttl_seconds
self._max_size = max_size
self._hits = 0
self._misses = 0
logger.info(f"Cache initialized with TTL={ttl_seconds}s, max_size={max_size}")
def get(self, key: str) -> dict[str, Any] | None:
"""
Get cached content for key.
Args:
key: Cache key (typically a URL)
Returns:
Cached data if found and not expired, None otherwise
"""
with self._lock:
if key not in self._cache:
self._misses += 1
return None
entry = self._cache[key]
expiry_time = entry["expiry"]
# Check if expired
if time.time() > expiry_time:
del self._cache[key]
self._misses += 1
logger.debug(f"Cache expired for key: {key[:50]}...")
return None
self._hits += 1
logger.debug(f"Cache hit for key: {key[:50]}...")
return entry["data"]
def set(self, key: str, data: dict[str, Any]) -> None:
"""
Cache content with TTL.
Args:
key: Cache key (typically a URL)
data: Data to cache
"""
with self._lock:
# Enforce max size by removing oldest entry
if len(self._cache) >= self._max_size:
oldest_key = min(
self._cache.keys(), key=lambda k: self._cache[k]["expiry"]
)
del self._cache[oldest_key]
logger.debug(f"Cache full, removed oldest entry: {oldest_key[:50]}...")
expiry_time = time.time() + self._ttl
self._cache[key] = {
"data": data,
"expiry": expiry_time,
"created": time.time(),
}
logger.debug(f"Cached key: {key[:50]}...")
def clear_expired(self) -> int:
"""
Remove all expired entries from cache.
Returns:
Number of entries removed
"""
with self._lock:
current_time = time.time()
expired_keys = [
key
for key, entry in self._cache.items()
if current_time > entry["expiry"]
]
for key in expired_keys:
del self._cache[key]
if expired_keys:
logger.info(f"Cleared {len(expired_keys)} expired cache entries")
return len(expired_keys)
def clear_all(self) -> None:
"""Clear all cache entries."""
with self._lock:
count = len(self._cache)
self._cache.clear()
self._hits = 0
self._misses = 0
logger.info(f"Cleared all {count} cache entries")
def stats(self) -> dict[str, int]:
"""
Get cache statistics.
Returns:
Dictionary with cache metrics
"""
with self._lock:
total_requests = self._hits + self._misses
hit_rate = (
(self._hits / total_requests * 100) if total_requests > 0 else 0.0
)
return {
"size": len(self._cache),
"max_size": self._max_size,
"hits": self._hits,
"misses": self._misses,
"hit_rate": round(hit_rate, 2),
"ttl_seconds": self._ttl,
}
# Module-level singleton cache shared by the scraping endpoints
# (1-hour TTL, capped at 1000 entries).
scraping_cache = SimpleCache(max_size=1000, ttl_seconds=3600)