"""File-based caching for search results and generated data.""" import json import hashlib import os from pathlib import Path from typing import Any, Optional from datetime import datetime class FileCache: """JSON file-based cache for search results and generated data.""" def __init__(self, cache_dir: str = ".legitdata_cache"): """ Initialize file cache. Args: cache_dir: Directory to store cache files """ self.cache_dir = Path(cache_dir) self.cache_dir.mkdir(parents=True, exist_ok=True) # Separate subdirectories for different cache types self.context_dir = self.cache_dir / "context" self.search_dir = self.cache_dir / "search" self.classification_dir = self.cache_dir / "classification" self.generation_dir = self.cache_dir / "generation" for subdir in [self.context_dir, self.search_dir, self.classification_dir, self.generation_dir]: subdir.mkdir(exist_ok=True) def _hash_key(self, key: str) -> str: """Generate a filesystem-safe hash for a cache key.""" return hashlib.md5(key.encode()).hexdigest() def _get_path(self, cache_type: str, key: str) -> Path: """Get the file path for a cache entry.""" dir_map = { "context": self.context_dir, "search": self.search_dir, "classification": self.classification_dir, "generation": self.generation_dir, } directory = dir_map.get(cache_type, self.cache_dir) return directory / f"{self._hash_key(key)}.json" def get(self, cache_type: str, key: str) -> Optional[Any]: """ Get a value from the cache. Args: cache_type: Type of cache (context, search, classification, generation) key: Cache key Returns: Cached value or None if not found """ path = self._get_path(cache_type, key) if not path.exists(): return None try: with open(path, 'r') as f: data = json.load(f) return data.get("value") except (json.JSONDecodeError, IOError): return None def set(self, cache_type: str, key: str, value: Any) -> None: """ Store a value in the cache. Args: cache_type: Type of cache key: Cache key value: Value to store """ path = self._get_path(cache_type, key) data = { "key": key, "value": value, "cached_at": datetime.now().isoformat(), } try: with open(path, 'w') as f: json.dump(data, f, indent=2, default=str) except IOError as e: print(f"Warning: Could not write cache: {e}") def exists(self, cache_type: str, key: str) -> bool: """Check if a cache entry exists.""" return self._get_path(cache_type, key).exists() def delete(self, cache_type: str, key: str) -> bool: """ Delete a cache entry. Returns: True if deleted, False if not found """ path = self._get_path(cache_type, key) if path.exists(): path.unlink() return True return False def clear(self, cache_type: Optional[str] = None) -> int: """ Clear cache entries. Args: cache_type: Type to clear, or None for all Returns: Number of entries cleared """ count = 0 if cache_type: directories = [self._get_path(cache_type, "").parent] else: directories = [self.context_dir, self.search_dir, self.classification_dir, self.generation_dir] for directory in directories: if directory.exists(): for file in directory.glob("*.json"): file.unlink() count += 1 return count def get_stats(self) -> dict: """Get cache statistics.""" stats = { "total": 0, "by_type": {} } for cache_type, directory in [ ("context", self.context_dir), ("search", self.search_dir), ("classification", self.classification_dir), ("generation", self.generation_dir), ]: count = len(list(directory.glob("*.json"))) if directory.exists() else 0 stats["by_type"][cache_type] = count stats["total"] += count return stats # Convenience methods for specific cache types def get_context(self, url: str) -> Optional[dict]: """Get cached company context.""" return self.get("context", url) def set_context(self, url: str, context: dict) -> None: """Cache company context.""" self.set("context", url, context) def get_search_results(self, query: str) -> Optional[list]: """Get cached search results.""" return self.get("search", query) def set_search_results(self, query: str, results: list) -> None: """Cache search results.""" self.set("search", query, results) def get_classification(self, schema_hash: str) -> Optional[dict]: """Get cached column classifications.""" return self.get("classification", schema_hash) def set_classification(self, schema_hash: str, classification: dict) -> None: """Cache column classifications.""" self.set("classification", schema_hash, classification) def get_generated_values(self, key: str) -> Optional[list]: """Get cached generated values.""" return self.get("generation", key) def set_generated_values(self, key: str, values: list) -> None: """Cache generated values.""" self.set("generation", key, values)