mikeboone's picture
feat: vendor legitdata source package into repository
1502291
"""File-based caching for search results and generated data."""
import json
import hashlib
import os
from pathlib import Path
from typing import Any, Optional
from datetime import datetime
class FileCache:
    """JSON file-based cache for search results and generated data.

    Each entry is stored as one JSON file named after the MD5 hex digest of
    its key, inside a per-type subdirectory of ``cache_dir`` (``context``,
    ``search``, ``classification`` or ``generation``). Unknown cache types
    fall back to ``cache_dir`` itself.

    The cache is best-effort: unreadable or malformed entries are reported
    as misses, and failed writes only print a warning.
    """

    def __init__(self, cache_dir: str = ".legitdata_cache"):
        """
        Initialize file cache.

        Args:
            cache_dir: Directory to store cache files
        """
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Separate subdirectories for different cache types
        self.context_dir = self.cache_dir / "context"
        self.search_dir = self.cache_dir / "search"
        self.classification_dir = self.cache_dir / "classification"
        self.generation_dir = self.cache_dir / "generation"

        for subdir in self._type_dirs().values():
            subdir.mkdir(exist_ok=True)

    def _type_dirs(self) -> dict:
        """Map each known cache type name to its directory.

        Built from the instance attributes on every call so that a
        reassigned ``*_dir`` attribute is honoured.
        """
        return {
            "context": self.context_dir,
            "search": self.search_dir,
            "classification": self.classification_dir,
            "generation": self.generation_dir,
        }

    def _hash_key(self, key: str) -> str:
        """Generate a filesystem-safe hash for a cache key."""
        # MD5 is used only to derive a file name, not for security. Changing
        # the algorithm would orphan every existing cache file.
        return hashlib.md5(key.encode()).hexdigest()

    def _get_path(self, cache_type: str, key: str) -> Path:
        """Get the file path for a cache entry.

        Unknown cache types are stored directly under ``cache_dir``.
        """
        directory = self._type_dirs().get(cache_type, self.cache_dir)
        return directory / f"{self._hash_key(key)}.json"

    def get(self, cache_type: str, key: str) -> Optional[Any]:
        """
        Get a value from the cache.

        Args:
            cache_type: Type of cache (context, search, classification, generation)
            key: Cache key

        Returns:
            Cached value, or None if the entry is missing, unreadable, or
            not a JSON object.
        """
        path = self._get_path(cache_type, key)
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            # OSError: missing or unreadable file. ValueError covers both
            # json.JSONDecodeError and UnicodeDecodeError (corrupt bytes).
            return None

        # A valid JSON file that is not an object was not written by set();
        # treat it as a miss instead of failing on .get().
        if not isinstance(data, dict):
            return None
        return data.get("value")

    def set(self, cache_type: str, key: str, value: Any) -> None:
        """
        Store a value in the cache.

        The entry is serialized first and then written through a temporary
        file that replaces the target, so a failed write never truncates an
        existing entry. Failures are reported with a printed warning.

        Args:
            cache_type: Type of cache
            key: Cache key
            value: Value to store (non-JSON values are stringified)
        """
        path = self._get_path(cache_type, key)
        data = {
            "key": key,
            "value": value,
            "cached_at": datetime.now().isoformat(),
        }

        # Serialize before touching the filesystem: default=str does not
        # cover unsupported dict keys or circular references.
        try:
            payload = json.dumps(data, indent=2, default=str)
        except (TypeError, ValueError) as e:
            print(f"Warning: Could not write cache: {e}")
            return

        # The ".tmp" suffix keeps in-flight files out of the "*.json" globs.
        tmp_path = path.with_name(f"{path.name}.{os.getpid()}.tmp")
        try:
            # Recreate the directory if it was removed after construction.
            path.parent.mkdir(parents=True, exist_ok=True)
            with open(tmp_path, "w", encoding="utf-8") as f:
                f.write(payload)
            os.replace(tmp_path, path)
        except OSError as e:
            print(f"Warning: Could not write cache: {e}")
            try:
                tmp_path.unlink()
            except OSError:
                pass  # nothing to clean up, or cleanup itself is impossible

    def exists(self, cache_type: str, key: str) -> bool:
        """Check if a cache entry exists."""
        return self._get_path(cache_type, key).exists()

    def delete(self, cache_type: str, key: str) -> bool:
        """
        Delete a cache entry.

        Returns:
            True if deleted, False if not found
        """
        try:
            self._get_path(cache_type, key).unlink()
        except FileNotFoundError:
            return False
        return True

    def clear(self, cache_type: Optional[str] = None) -> int:
        """
        Clear cache entries.

        Args:
            cache_type: Type to clear, or None for all known types

        Returns:
            Number of entries cleared
        """
        type_dirs = self._type_dirs()
        if cache_type:
            directories = [type_dirs.get(cache_type, self.cache_dir)]
        else:
            directories = list(type_dirs.values())

        count = 0
        for directory in directories:
            if not directory.exists():
                continue
            for file in directory.glob("*.json"):
                try:
                    file.unlink()
                except FileNotFoundError:
                    # Removed by someone else in the meantime; not ours to count.
                    continue
                count += 1
        return count

    def get_stats(self) -> dict:
        """Get cache statistics.

        Returns:
            Dict with ``"total"`` (entry count across the known types) and
            ``"by_type"`` (entry count per cache type).
        """
        by_type = {}
        for cache_type, directory in self._type_dirs().items():
            if directory.exists():
                by_type[cache_type] = sum(1 for _ in directory.glob("*.json"))
            else:
                by_type[cache_type] = 0
        return {
            "total": sum(by_type.values()),
            "by_type": by_type,
        }

    # Convenience methods for specific cache types

    def get_context(self, url: str) -> Optional[dict]:
        """Get cached company context."""
        return self.get("context", url)

    def set_context(self, url: str, context: dict) -> None:
        """Cache company context."""
        self.set("context", url, context)

    def get_search_results(self, query: str) -> Optional[list]:
        """Get cached search results."""
        return self.get("search", query)

    def set_search_results(self, query: str, results: list) -> None:
        """Cache search results."""
        self.set("search", query, results)

    def get_classification(self, schema_hash: str) -> Optional[dict]:
        """Get cached column classifications."""
        return self.get("classification", schema_hash)

    def set_classification(self, schema_hash: str, classification: dict) -> None:
        """Cache column classifications."""
        self.set("classification", schema_hash, classification)

    def get_generated_values(self, key: str) -> Optional[list]:
        """Get cached generated values."""
        return self.get("generation", key)

    def set_generated_values(self, key: str, values: list) -> None:
        """Cache generated values."""
        self.set("generation", key, values)