chatpaper / src /storage /hf_storage.py
Shafagh99's picture
add chatpaper project
c003cc2
import json
import os
import tempfile
from pathlib import Path

from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.utils import EntryNotFoundError
# Hugging Face credentials, read from the environment once at import time.
# Each is None when the corresponding variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")
HF_USERNAME = os.environ.get("HF_USERNAME")
def get_repo_id() -> str:
    """Return the dataset repo id, ``<HF_USERNAME>/chatpaper-data``.

    No validation is done here: if ``HF_USERNAME`` is unset the result is
    the literal string ``"None/chatpaper-data"``.
    """
    return "{}/chatpaper-data".format(HF_USERNAME)
def is_hf_configured() -> bool:
    """Report whether both ``HF_TOKEN`` and ``HF_USERNAME`` are set and non-empty."""
    if not HF_TOKEN:
        return False
    return bool(HF_USERNAME)
def ensure_dataset_repo():
    """Make sure the private dataset repo from ``get_repo_id()`` exists.

    Does nothing when HF credentials are not configured. Otherwise the repo
    is looked up first; if the lookup fails for any reason, creation of a
    private dataset repo is attempted. Every outcome is reported with
    ``print`` and no exception from the Hub calls is propagated.
    """
    if not is_hf_configured():
        return

    client = HfApi(token=HF_TOKEN)
    target = get_repo_id()

    try:
        client.repo_info(repo_id=target, repo_type="dataset")
        print(f"HF dataset repo exists: {target}")
        return
    except Exception:
        # Any lookup failure is treated as "not there yet": fall through
        # and try to create the repo.
        pass

    try:
        client.create_repo(repo_id=target, repo_type="dataset", private=True)
        print(f"Created HF dataset repo: {target}")
    except Exception as err:
        reason = str(err)
        # A conflict means the repo is already present, which is fine.
        if "already exists" in reason or "409" in reason:
            print(f"HF dataset repo already exists: {target}")
        else:
            print(f"Warning: Could not create HF repo: {err}")
def upload_file(local_path: str, path_in_repo: str) -> bool:
    """Upload a local file to the dataset repo.

    Args:
        local_path: Path of the file on the local filesystem.
        path_in_repo: Destination path inside the dataset repo.

    Returns:
        True when the upload call completed; False if anything raised, in
        which case the error is printed.
    """
    try:
        HfApi(token=HF_TOKEN).upload_file(
            path_or_fileobj=local_path,
            path_in_repo=path_in_repo,
            repo_id=get_repo_id(),
            repo_type="dataset",
        )
    except Exception as err:
        print(f"HF upload error: {err}")
        return False
    return True
def download_file(path_in_repo: str, local_path: str) -> bool:
    """Fetch one file from the dataset repo and copy it to ``local_path``.

    The file is downloaded by ``hf_hub_download`` into ``/tmp/hf_downloads``
    and then copied to ``local_path``; missing parent directories of
    ``local_path`` are created.

    Args:
        path_in_repo: Path of the file inside the dataset repo.
        local_path: Destination path on the local filesystem.

    Returns:
        True once the copy succeeded. False when ``EntryNotFoundError`` is
        raised (silently) or on any other error (after printing it).
    """
    import shutil

    try:
        fetched = hf_hub_download(
            repo_id=get_repo_id(),
            filename=path_in_repo,
            repo_type="dataset",
            token=HF_TOKEN,
            local_dir="/tmp/hf_downloads",
        )
        destination = Path(local_path)
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(fetched, local_path)
    except EntryNotFoundError:
        return False
    except Exception as err:
        print(f"HF download error: {err}")
        return False
    return True
def save_json_to_hf(data, path_in_repo: str) -> bool:
    """Serialize ``data`` as JSON and upload it to ``path_in_repo``.

    The JSON is written (UTF-8, 2-space indent, non-ASCII kept as-is) to a
    file inside a private temporary directory, which is removed again when
    the function exits -- including when serialization or the upload
    raises. A unique directory per call also keeps concurrent saves and
    loads of the same repo path from touching each other's scratch file.

    Args:
        data: Any JSON-serializable object.
        path_in_repo: Destination path inside the dataset repo.

    Returns:
        The result of ``upload_file``: True on success, False on failure.

    Raises:
        TypeError, ValueError: Propagated from ``json.dump`` when ``data``
            cannot be serialized.
    """
    with tempfile.TemporaryDirectory(prefix="hf_") as scratch_dir:
        tmp_path = Path(scratch_dir) / "payload.json"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        return upload_file(str(tmp_path), path_in_repo)
def load_json_from_hf(path_in_repo: str, default=None):
    """Download a JSON file from the dataset repo and return its parsed content.

    The file is copied into a private temporary directory that is removed
    when the function exits. Using a unique directory per call (rather than
    a fixed ``/tmp/hf_<name>`` path) means concurrent loads or saves of the
    same repo path cannot overwrite or delete each other's scratch file.

    Args:
        path_in_repo: Path of the JSON file inside the dataset repo.
        default: Value returned when the file cannot be downloaded, read,
            or parsed.

    Returns:
        The decoded JSON value, or ``default`` on any failure.
    """
    with tempfile.TemporaryDirectory(prefix="hf_") as scratch_dir:
        local_copy = Path(scratch_dir) / "payload.json"
        if not download_file(path_in_repo, str(local_copy)):
            return default
        try:
            with open(local_copy, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception:
            # Deliberately best-effort: an unreadable or malformed file is
            # reported to the caller as "no data" rather than as an error.
            return default
def save_chat(chat_data: dict) -> bool:
    """Store a chat as ``chats/<session_id>.json`` in the dataset repo.

    Args:
        chat_data: Chat payload; its ``"session_id"`` entry names the file.

    Returns:
        False when HF is not configured, otherwise the result of
        ``save_json_to_hf``.

    Raises:
        KeyError: If HF is configured and ``chat_data`` has no
            ``"session_id"`` key.
    """
    if is_hf_configured():
        session_id = chat_data['session_id']
        return save_json_to_hf(chat_data, f"chats/{session_id}.json")
    return False
def load_all_chats() -> list:
    """Load every chat stored under ``chats/*.json`` in the dataset repo.

    Returns:
        A list of chat dicts sorted by their ``"timestamp"`` entry (missing
        timestamps sort as ``""``), newest first. An empty list is returned
        when HF is not configured or the repo file listing fails.
    """
    if not is_hf_configured():
        return []
    try:
        api = HfApi(token=HF_TOKEN)
        repo_files = api.list_repo_files(repo_id=get_repo_id(), repo_type="dataset")
        chat_files = [
            name
            for name in repo_files
            if name.startswith("chats/") and name.endswith(".json")
        ]
    except Exception as e:
        # Report the failure like the other helpers do instead of hiding it.
        print(f"Load chats error: {e}")
        return []
    chats = []
    for file_path in chat_files:
        chat = load_json_from_hf(file_path, default=None)
        # Keep only non-empty JSON objects. A file that failed to load, or
        # one holding some other JSON value (list, string, number), has no
        # .get() and would otherwise make the sort below raise, taking
        # every other chat down with it.
        if chat and isinstance(chat, dict):
            chats.append(chat)
    chats.sort(key=lambda c: c.get("timestamp", ""), reverse=True)
    return chats
def delete_chat(session_id: str) -> bool:
    """Delete ``chats/<session_id>.json`` from the dataset repo.

    Args:
        session_id: Identifier of the chat whose file should be removed.

    Returns:
        True when the delete call completed. False when HF is not
        configured, or when the call raised (the error is printed).
    """
    if not is_hf_configured():
        return False
    try:
        client = HfApi(token=HF_TOKEN)
        client.delete_file(
            path_in_repo=f"chats/{session_id}.json",
            repo_id=get_repo_id(),
            repo_type="dataset",
        )
    except Exception as err:
        print(f"Delete chat error: {err}")
        return False
    return True
def save_related_papers(data: dict) -> bool:
    """Upload ``data`` as ``related_papers.json`` in the dataset repo.

    Returns:
        False when HF is not configured, otherwise the result of
        ``save_json_to_hf``.
    """
    if is_hf_configured():
        return save_json_to_hf(data, "related_papers.json")
    return False
def load_related_papers() -> dict:
    """Load ``related_papers.json`` from the dataset repo.

    Returns:
        The parsed file content, or an empty dict when HF is not configured
        or the file could not be loaded.
    """
    if is_hf_configured():
        return load_json_from_hf("related_papers.json", default={})
    return {}