# Student_Analyzer / hf_storage.py
# joker7094's picture
# optimize HF storage with caching and switch to gemini-2.5-flash
# c133164
import copy
import json
import logging
import os
from typing import Optional

from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.utils import RepositoryNotFoundError, EntryNotFoundError
logger = logging.getLogger(__name__)


class HFStorageManager:
    """
    Manages persistent storage using Hugging Face Datasets.

    Stores JSON files in a HF dataset for persistence across Space restarts.
    When the ``HF_TOKEN`` or ``HF_DATASET_REPO`` environment variable is
    missing, or when a Hub call fails, the manager falls back to plain JSON
    files on the local filesystem (``filename`` is used as the local path).

    An in-memory cache keyed by filename avoids repeated Hub downloads. The
    cache always holds private deep copies, so callers may freely mutate the
    dictionaries they pass in or receive.
    """

    def __init__(self):
        """Initialize HF Storage Manager with credentials from environment variables."""
        self.token = os.getenv("HF_TOKEN")
        self.repo_id = os.getenv("HF_DATASET_REPO")
        self._cache = {}  # In-memory cache to reduce API calls
        # Defined up front so both attributes exist in local-fallback mode too.
        self.api = None
        self.use_hf = False

        if not self.token:
            logger.warning("HF_TOKEN not found. Using local storage fallback.")
            return
        if not self.repo_id:
            logger.warning("HF_DATASET_REPO not found. Using local storage fallback.")
            return

        self.use_hf = True
        self.api = HfApi()
        logger.info(f"HF Storage initialized for dataset: {self.repo_id}")

    def save_file(self, filename: str, data: dict) -> bool:
        """
        Save a JSON file to HF dataset.

        If the upload fails for any reason the data is written to the local
        filesystem instead. In both cases a successful save refreshes the
        in-memory cache so a later ``load_file`` returns what was just saved.

        Args:
            filename: Name of the file (e.g., 'chat_history.json')
            data: Dictionary to save as JSON

        Returns:
            bool: True if successful, False otherwise
        """
        if not self.use_hf:
            return self._save_local(filename, data)

        try:
            # Upload the serialized bytes directly: no temp file to clean up,
            # no dependence on /tmp, and nested filenames work.
            payload = json.dumps(data, indent=2).encode("utf-8")
            self.api.upload_file(
                path_or_fileobj=payload,
                path_in_repo=filename,
                repo_id=self.repo_id,
                repo_type="dataset",
                token=self.token,
            )
        except Exception as e:
            logger.error(f"Failed to save {filename} to HF dataset: {e}")
            # Fallback to local storage
            saved = self._save_local(filename, data)
            if saved:
                # Keep the cache in step with the latest successful write;
                # otherwise load_file would keep serving the previous value.
                self._cache[filename] = copy.deepcopy(data)
            return saved

        # Cache a private copy so later caller-side mutation cannot leak in.
        self._cache[filename] = copy.deepcopy(data)
        logger.info(f"Successfully saved {filename} to HF dataset")
        return True

    def load_file(self, filename: str) -> Optional[dict]:
        """
        Load a JSON file from HF dataset.

        Args:
            filename: Name of the file to load

        Returns:
            dict or None: Loaded data or None if not found
        """
        if not self.use_hf:
            return self._load_local(filename)

        # Check cache first
        if filename in self._cache:
            logger.info(f"Loaded {filename} from cache")
            # Hand out a copy so the caller cannot mutate cached state.
            return copy.deepcopy(self._cache[filename])

        try:
            # Download from HF dataset
            file_path = hf_hub_download(
                repo_id=self.repo_id,
                filename=filename,
                repo_type="dataset",
                token=self.token,
            )
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (RepositoryNotFoundError, EntryNotFoundError):
            logger.info(f"{filename} not found in HF dataset, returning None")
            return None
        except Exception as e:
            logger.error(f"Failed to load {filename} from HF dataset: {e}")
            # Fallback to local storage
            return self._load_local(filename)

        # Cache the data
        self._cache[filename] = copy.deepcopy(data)
        logger.info(f"Successfully loaded {filename} from HF dataset")
        return data

    def file_exists(self, filename: str) -> bool:
        """
        Check if a file exists in HF dataset.

        In local-fallback mode this checks the local filesystem instead.

        Args:
            filename: Name of the file

        Returns:
            bool: True if file exists, False otherwise
        """
        if not self.use_hf:
            return os.path.exists(filename)

        # Anything in the cache is loadable via load_file; skip the download.
        if filename in self._cache:
            return True

        try:
            hf_hub_download(
                repo_id=self.repo_id,
                filename=filename,
                repo_type="dataset",
                token=self.token,
            )
            return True
        except (RepositoryNotFoundError, EntryNotFoundError):
            return False
        except Exception as e:
            logger.error(f"Error checking if {filename} exists: {e}")
            return os.path.exists(filename)

    def _save_local(self, filename: str, data: dict) -> bool:
        """Fallback: Save to local filesystem.

        Args:
            filename: Local path to write.
            data: Dictionary to save as JSON.

        Returns:
            bool: True if the file was written, False on any error.
        """
        try:
            # Serialize before opening: opening in 'w' mode truncates, so a
            # non-serializable payload would otherwise wipe the existing file.
            text = json.dumps(data, indent=2)
            with open(filename, "w", encoding="utf-8") as f:
                f.write(text)
            logger.info(f"Saved {filename} locally (fallback)")
            return True
        except Exception as e:
            logger.error(f"Failed to save {filename} locally: {e}")
            return False

    def _load_local(self, filename: str) -> Optional[dict]:
        """Fallback: Load from local filesystem.

        Args:
            filename: Local path to read.

        Returns:
            dict or None: Parsed JSON, or None if the file is missing or
            cannot be read/parsed.
        """
        try:
            if not os.path.exists(filename):
                return None
            with open(filename, "r", encoding="utf-8") as f:
                data = json.load(f)
            logger.info(f"Loaded {filename} locally (fallback)")
            return data
        except Exception as e:
            logger.error(f"Failed to load {filename} locally: {e}")
            return None