# NotebookLM/persistence/storage_service.py
# internomega-terrablue
# Enabling logger with flush
# 3fb7184
"""HF Dataset repo storage for user notebook data."""
import json
import os
import tempfile
from huggingface_hub import HfApi, hf_hub_download
from state import UserData
class StorageService:
    """Persist per-user notebook data as JSON files in a Hugging Face dataset repo.

    All methods are static; the class acts as a namespace plus a process-wide
    flag recording that the repo has already been created/verified. Each user
    is stored at ``data/<user_id>.json`` inside ``REPO_ID``.
    """

    REPO_ID = "Group-1-5010/notebooklm-data"
    REPO_TYPE = "dataset"
    # Flipped to True after the first successful create_repo call so later
    # saves in the same process skip that call.
    _repo_ensured = False

    @staticmethod
    def _get_token() -> str:
        """Return the Hugging Face token from the ``HF_TOKEN`` environment variable.

        Raises:
            RuntimeError: If ``HF_TOKEN`` is unset or empty.
        """
        token = os.environ.get("HF_TOKEN")
        if not token:
            raise RuntimeError(
                "HF_TOKEN not found in environment. "
                "Add it as a Secret in your HF Space settings."
            )
        return token

    @staticmethod
    def _ensure_repo(api: HfApi) -> None:
        """Create the dataset repo if it doesn't already exist (once per process).

        Args:
            api: Authenticated client used to issue the ``create_repo`` call.
        """
        if StorageService._repo_ensured:
            return
        # exist_ok=True makes this a no-op when the repo is already there.
        api.create_repo(
            repo_id=StorageService.REPO_ID,
            repo_type=StorageService.REPO_TYPE,
            private=True,
            exist_ok=True,
        )
        StorageService._repo_ensured = True
        print(f"[StorageService] Ensured dataset repo exists: {StorageService.REPO_ID}", flush=True)

    @staticmethod
    def save_user_data(user_data: UserData) -> None:
        """Serialize ``user_data`` to JSON and upload it to the dataset repo.

        The JSON is staged in a temporary file, uploaded to
        ``data/<user_id>.json``, and the temporary file is removed afterwards
        whether or not the upload succeeded.

        Args:
            user_data: Object providing ``to_dict()`` and ``user_id``.

        Raises:
            RuntimeError: If ``HF_TOKEN`` is not configured.
            Exception: Any serialization, file, or upload error is propagated
                so the caller can show it to the user.
        """
        token = StorageService._get_token()
        data_json = json.dumps(user_data.to_dict(), ensure_ascii=False, indent=2)
        tmp_path = None
        try:
            # ensure_ascii=False leaves non-ASCII characters unescaped, so the
            # file must be written as UTF-8 explicitly instead of relying on
            # the platform's locale encoding.
            with tempfile.NamedTemporaryFile(
                mode="w", suffix=".json", delete=False, encoding="utf-8"
            ) as tmp:
                # Record the path before writing: with delete=False a failed
                # write would otherwise leave the file behind with no cleanup.
                tmp_path = tmp.name
                tmp.write(data_json)
            api = HfApi(token=token)
            StorageService._ensure_repo(api)
            api.upload_file(
                path_or_fileobj=tmp_path,
                path_in_repo=f"data/{user_data.user_id}.json",
                repo_id=StorageService.REPO_ID,
                repo_type=StorageService.REPO_TYPE,
            )
            print(f"[StorageService] Saved user data for '{user_data.user_id}'", flush=True)
        finally:
            if tmp_path:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    # Best-effort cleanup: a leftover temp file must not mask
                    # the outcome of the save itself.
                    pass

    @staticmethod
    def load_user_data(user_id: str, user_name: str) -> UserData | None:
        """Download and deserialize a user's data from the dataset repo.

        Args:
            user_id: Identifier used to build the file name ``data/<user_id>.json``.
            user_name: Accepted for interface compatibility; not used here.

        Returns:
            The ``UserData`` built by ``UserData.from_dict``, or ``None`` when
            ``HF_TOKEN`` is not set or when downloading, reading, or parsing
            the file fails for any reason (the failure is printed, not raised).
        """
        try:
            token = StorageService._get_token()
        except RuntimeError:
            print("[StorageService] HF_TOKEN not set — skipping load", flush=True)
            return None
        try:
            path = hf_hub_download(
                repo_id=StorageService.REPO_ID,
                filename=f"data/{user_id}.json",
                repo_type=StorageService.REPO_TYPE,
                token=token,
            )
            # Read as UTF-8 to match how save_user_data writes the file.
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            print(f"[StorageService] Loaded user data for '{user_id}'", flush=True)
            return UserData.from_dict(data)
        except Exception as e:
            # EntryNotFoundError or network errors — treat as new user.
            # NOTE(review): a transient failure is indistinguishable from a
            # missing file here, so a later save_user_data for this user_id
            # would overwrite whatever is stored remotely — confirm this
            # trade-off is acceptable to callers.
            print(f"[StorageService] No existing data for '{user_id}': {e}", flush=True)
            return None