"""Persistent JSON storage backed by a Hugging Face dataset repository."""

import os
import json
import logging
from typing import Optional

try:
    from huggingface_hub import HfApi, hf_hub_download
    from huggingface_hub.utils import RepositoryNotFoundError, EntryNotFoundError
except ImportError:
    # The class already supports a local-only mode, so a missing optional
    # dependency should degrade to that mode instead of breaking the import.
    HfApi = None
    hf_hub_download = None
    _HF_NOT_FOUND_ERRORS = ()  # ``except ():`` is valid and matches nothing
else:
    _HF_NOT_FOUND_ERRORS = (RepositoryNotFoundError, EntryNotFoundError)

logger = logging.getLogger(__name__)


class HFStorageManager:
    """
    Manages persistent storage using Hugging Face Datasets.

    Stores JSON files in a HF dataset for persistence across Space restarts.
    When ``HF_TOKEN`` or ``HF_DATASET_REPO`` is not set (or ``huggingface_hub``
    is not installed), every operation falls back to the local filesystem,
    using ``filename`` as the path.

    None of the public methods raise on storage errors: failures are logged
    and reported through the return value (``False`` / ``None``).
    """

    def __init__(self):
        """Initialize HF Storage Manager with credentials from environment variables."""
        self.token = os.getenv("HF_TOKEN")
        self.repo_id = os.getenv("HF_DATASET_REPO")
        self._cache = {}  # In-memory cache to reduce API calls

        if not self.token:
            logger.warning("HF_TOKEN not found. Using local storage fallback.")
            self.use_hf = False
            return

        if not self.repo_id:
            logger.warning("HF_DATASET_REPO not found. Using local storage fallback.")
            self.use_hf = False
            return

        if HfApi is None:
            logger.warning("huggingface_hub not installed. Using local storage fallback.")
            self.use_hf = False
            return

        self.use_hf = True
        self.api = HfApi()
        logger.info(f"HF Storage initialized for dataset: {self.repo_id}")

    def save_file(self, filename: str, data: dict) -> bool:
        """
        Save a JSON file to HF dataset.

        Args:
            filename: Name of the file (e.g., 'chat_history.json')
            data: Dictionary to save as JSON

        Returns:
            bool: True if the data was stored (in the HF dataset, or locally
            when the upload failed), False otherwise
        """
        if not self.use_hf:
            return self._save_local(filename, data)

        try:
            # Upload the serialized bytes directly: no temp file to clean up,
            # and nested names such as "dir/file.json" work as repo paths.
            payload = json.dumps(data, indent=2).encode("utf-8")
            self.api.upload_file(
                path_or_fileobj=payload,
                path_in_repo=filename,
                repo_id=self.repo_id,
                repo_type="dataset",
                token=self.token,
            )
        except Exception as e:
            logger.error(f"Failed to save {filename} to HF dataset: {e}")
            # Fallback to local storage
            saved = self._save_local(filename, data)
            if saved:
                # Keep in-process reads coherent: without this, load_file
                # would keep serving the previously cached (stale) version.
                self._cache[filename] = data
            return saved

        # Cache the data
        self._cache[filename] = data
        logger.info(f"Successfully saved {filename} to HF dataset")
        return True

    def load_file(self, filename: str) -> Optional[dict]:
        """
        Load a JSON file from HF dataset.

        Args:
            filename: Name of the file to load

        Returns:
            dict or None: Loaded data or None if not found
        """
        if not self.use_hf:
            return self._load_local(filename)

        # Check cache first
        if filename in self._cache:
            logger.info(f"Loaded {filename} from cache")
            return self._cache[filename]

        try:
            # Download from HF dataset
            file_path = hf_hub_download(
                repo_id=self.repo_id,
                filename=filename,
                repo_type="dataset",
                token=self.token,
            )
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except _HF_NOT_FOUND_ERRORS:
            logger.info(f"{filename} not found in HF dataset, returning None")
            return None
        except Exception as e:
            logger.error(f"Failed to load {filename} from HF dataset: {e}")
            # Fallback to local storage
            return self._load_local(filename)

        # Cache the data
        self._cache[filename] = data
        logger.info(f"Successfully loaded {filename} from HF dataset")
        return data

    def file_exists(self, filename: str) -> bool:
        """
        Check if a file exists in HF dataset.

        Args:
            filename: Name of the file

        Returns:
            bool: True if file exists, False otherwise
        """
        if not self.use_hf:
            return os.path.exists(filename)

        try:
            hf_hub_download(
                repo_id=self.repo_id,
                filename=filename,
                repo_type="dataset",
                token=self.token,
            )
            return True
        except _HF_NOT_FOUND_ERRORS:
            return False
        except Exception as e:
            logger.error(f"Error checking if {filename} exists: {e}")
            return os.path.exists(filename)

    def _save_local(self, filename: str, data: dict) -> bool:
        """Fallback: Save to local filesystem.

        Creates the parent directory when ``filename`` contains one, so nested
        names behave the same locally as they do as repo paths.
        """
        try:
            parent = os.path.dirname(filename)
            if parent:
                os.makedirs(parent, exist_ok=True)
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
            logger.info(f"Saved {filename} locally (fallback)")
            return True
        except Exception as e:
            logger.error(f"Failed to save {filename} locally: {e}")
            return False

    def _load_local(self, filename: str) -> Optional[dict]:
        """Fallback: Load from local filesystem.

        Returns None when the file is missing (silently) or cannot be read or
        parsed (after logging the error).
        """
        try:
            with open(filename, "r", encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            # A missing file is an expected outcome, not an error.
            return None
        except Exception as e:
            logger.error(f"Failed to load {filename} locally: {e}")
            return None
        logger.info(f"Loaded {filename} locally (fallback)")
        return data