Spaces:
Running
Running
| """File-based caching for search results and generated data.""" | |
| import json | |
| import hashlib | |
| import os | |
| from pathlib import Path | |
| from typing import Any, Optional | |
| from datetime import datetime | |
class FileCache:
    """JSON file-based cache for search results and generated data.

    Every entry is stored as its own ``<md5(key)>.json`` file containing a
    JSON object with the original ``key``, the cached ``value`` and a
    ``cached_at`` ISO timestamp. Entries are grouped into one subdirectory
    per cache type (``context``, ``search``, ``classification``,
    ``generation``); an unrecognised cache type falls back to the cache
    root directory itself.
    """

    def __init__(self, cache_dir: str = ".legitdata_cache"):
        """
        Initialize file cache.

        Creates the cache directory and one subdirectory per cache type if
        they do not exist yet.

        Args:
            cache_dir: Directory to store cache files
        """
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Separate subdirectories for different cache types
        self.context_dir = self.cache_dir / "context"
        self.search_dir = self.cache_dir / "search"
        self.classification_dir = self.cache_dir / "classification"
        self.generation_dir = self.cache_dir / "generation"

        for subdir in self._typed_dirs().values():
            subdir.mkdir(exist_ok=True)

    def _typed_dirs(self) -> dict:
        """Map each known cache type name to its directory.

        Built on demand from the instance attributes so that it always
        reflects their current values.
        """
        return {
            "context": self.context_dir,
            "search": self.search_dir,
            "classification": self.classification_dir,
            "generation": self.generation_dir,
        }

    def _get_dir(self, cache_type: str) -> Path:
        """Get the directory for a cache type (cache root if unknown)."""
        return self._typed_dirs().get(cache_type, self.cache_dir)

    def _hash_key(self, key: str) -> str:
        """Generate a filesystem-safe hash for a cache key."""
        # MD5 is only used to derive a file name here, not for security.
        return hashlib.md5(key.encode()).hexdigest()

    def _get_path(self, cache_type: str, key: str) -> Path:
        """Get the file path for a cache entry."""
        return self._get_dir(cache_type) / f"{self._hash_key(key)}.json"

    def get(self, cache_type: str, key: str) -> Optional[Any]:
        """
        Get a value from the cache.

        Args:
            cache_type: Type of cache (context, search, classification, generation)
            key: Cache key

        Returns:
            Cached value, or None if the entry is missing, unreadable, or
            not a valid cache record. A cached value of None is therefore
            indistinguishable from a miss.
        """
        path = self._get_path(cache_type, key)

        # Open directly instead of checking exists() first: the file may be
        # removed between the check and the open.
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            return None

        # A file holding valid JSON that is not an object (e.g. a list) is
        # not a cache record; treat it as a miss rather than failing.
        if not isinstance(data, dict):
            return None
        return data.get("value")

    def set(self, cache_type: str, key: str, value: Any) -> None:
        """
        Store a value in the cache.

        The record is serialised before any file is touched and then moved
        into place with ``os.replace``, so a failure never leaves a
        truncated entry behind or destroys the previous one. I/O errors are
        reported with a printed warning and otherwise ignored.

        Args:
            cache_type: Type of cache
            key: Cache key
            value: Value to store. Objects that JSON cannot encode natively
                are stored as ``str(obj)``.

        Raises:
            TypeError: If the value contains dict keys JSON cannot encode.
            ValueError: If the value contains a circular reference.
        """
        path = self._get_path(cache_type, key)
        record = {
            "key": key,
            "value": value,
            "cached_at": datetime.now().isoformat(),
        }

        # Serialise up front so encoding errors surface before writing.
        payload = json.dumps(record, indent=2, default=str)

        # The temp name does not end in ".json", so clear() and get_stats()
        # never mistake it for an entry.
        tmp_path = path.with_name(f"{path.name}.{os.getpid()}.tmp")
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                f.write(payload)
            os.replace(tmp_path, path)
        except OSError as e:
            print(f"Warning: Could not write cache: {e}")
            # Best-effort cleanup of a partially written temp file.
            try:
                tmp_path.unlink()
            except OSError:
                pass

    def exists(self, cache_type: str, key: str) -> bool:
        """Check if a cache entry exists."""
        return self._get_path(cache_type, key).exists()

    def delete(self, cache_type: str, key: str) -> bool:
        """
        Delete a cache entry.

        Returns:
            True if deleted, False if not found
        """
        try:
            self._get_path(cache_type, key).unlink()
        except FileNotFoundError:
            return False
        return True

    def clear(self, cache_type: Optional[str] = None) -> int:
        """
        Clear cache entries.

        Args:
            cache_type: Type to clear, or None for all known types. An
                unrecognised type clears the ``*.json`` files stored
                directly in the cache root. Clearing "all" only covers the
                four typed subdirectories, not the cache root.

        Returns:
            Number of entries cleared
        """
        if cache_type:
            directories = [self._get_dir(cache_type)]
        else:
            directories = list(self._typed_dirs().values())

        count = 0
        for directory in directories:
            if not directory.is_dir():
                continue
            for entry in directory.glob("*.json"):
                try:
                    entry.unlink()
                except FileNotFoundError:
                    # Already removed by someone else; not cleared by us.
                    continue
                count += 1
        return count

    def get_stats(self) -> dict:
        """Get cache statistics.

        Returns:
            A dict with the ``total`` number of entries and a ``by_type``
            dict giving the number of entries per known cache type.
        """
        by_type = {
            cache_type: (
                sum(1 for _ in directory.glob("*.json"))
                if directory.is_dir()
                else 0
            )
            for cache_type, directory in self._typed_dirs().items()
        }
        return {"total": sum(by_type.values()), "by_type": by_type}

    # Convenience methods for specific cache types

    def get_context(self, url: str) -> Optional[dict]:
        """Get cached company context."""
        return self.get("context", url)

    def set_context(self, url: str, context: dict) -> None:
        """Cache company context."""
        self.set("context", url, context)

    def get_search_results(self, query: str) -> Optional[list]:
        """Get cached search results."""
        return self.get("search", query)

    def set_search_results(self, query: str, results: list) -> None:
        """Cache search results."""
        self.set("search", query, results)

    def get_classification(self, schema_hash: str) -> Optional[dict]:
        """Get cached column classifications."""
        return self.get("classification", schema_hash)

    def set_classification(self, schema_hash: str, classification: dict) -> None:
        """Cache column classifications."""
        self.set("classification", schema_hash, classification)

    def get_generated_values(self, key: str) -> Optional[list]:
        """Get cached generated values."""
        return self.get("generation", key)

    def set_generated_values(self, key: str, values: list) -> None:
        """Cache generated values."""
        self.set("generation", key, values)