#!/usr/bin/env bash
# Backup/sync entrypoint: mirrors $BACKUP_DIR to a Hugging Face dataset
# branch ("Chat-Share"), restores the newest archive on boot, then starts
# the main application.
# Required env: HF_TOKEN (or a pre-set DATASET_ID).
# Optional env: BACKUP_DIR, DATASET_NUM, SYNC_INTERVAL (see main()).
set -euo pipefail
IFS=$'\n\t'  # safer word-splitting: newline/tab only

# Auto-export every variable assigned from here on, so the embedded
# python3 heredocs can read DATASET_ID, BACKUP_DIR, DATASET_NUM, ...
# via os.environ / os.getenv.
set -a
|
# Print a timestamped log line to stdout.
# Arguments: $* - message text.
log() {
  # printf instead of echo: safe if the message starts with '-' or
  # contains backslash sequences.
  printf '[%s] %s\n' "$(date +'%F %T')" "$*"
}
|
# Resolve DATASET_ID (exported via `set -a` for the python heredocs).
# Globals:  DATASET_ID (read/written), HF_TOKEN (read), USER_ID (written)
# Returns:  0 when backup/sync should run, 1 to skip it.
init_backup() {
  if [[ -n "${DATASET_ID:-}" ]]; then
    log "📁 使用外部定义的 DATASET_ID=$DATASET_ID"
    return 0
  fi
  if [[ -z "${HF_TOKEN:-}" ]]; then
    log "⚠️ HF_TOKEN 未设置,跳过备份"
    return 1
  fi

  # Derive the default dataset owner from the token's account name.
  # On any API/auth failure the heredoc exits 1 and USER_ID stays empty.
  USER_ID=$(python3 - <<'PY'
import os, sys
from huggingface_hub import HfApi
try:
    name = HfApi(token=os.getenv("HF_TOKEN")).whoami().get("name", "")
except Exception:  # network/auth failure -> caller skips backup
    sys.exit(1)
if not name:
    sys.exit(1)
print(name)
PY
  ) || true  # tolerate failure; checked via emptiness below
  if [[ -z "${USER_ID:-}" ]]; then
    log "⚠️ 获取 USER_ID 失败,跳过备份"
    return 1
  fi

  DATASET_ID="${USER_ID}/data"
  log "✅ 设置默认 DATASET_ID=$DATASET_ID"
  return 0
}
|
# Ensure the private dataset repo and its "Chat-Share" branch exist.
# Idempotent: creates repo/branch only when missing.
# Globals: HF_TOKEN, DATASET_ID (read, via exported environment).
prep_repo() {
  python3 <<'PY'
import os
from huggingface_hub import HfApi

api = HfApi(token=os.getenv("HF_TOKEN"))
repo = os.environ["DATASET_ID"]
author = repo.split("/")[0]

# Create the dataset repo if it is not already listed under the author.
if not any(d.id == repo for d in api.list_datasets(author=author)):
    api.create_repo(repo_id=repo, repo_type="dataset", private=True)

# Create the backup branch if it does not exist yet.
branch = "Chat-Share"
refs = api.list_repo_refs(repo_id=repo, repo_type="dataset").branches
if branch not in [b.name for b in refs]:
    api.create_branch(repo_id=repo, repo_type="dataset", branch=branch)
PY
  log "✅ 数据集 & 分支就绪"
}
|
# Download the newest *.tar.gz from the backup branch (if any) and unpack
# it into $BACKUP_DIR. No-op when the branch holds no archives.
# Globals: HF_TOKEN, DATASET_ID, BACKUP_DIR (read, via exported environment).
restore_latest() {
  python3 <<'PY'
import os, shutil, sys, tarfile, tempfile
from huggingface_hub import HfApi

api = HfApi(token=os.getenv("HF_TOKEN"))
repo, branch = os.getenv("DATASET_ID"), "Chat-Share"
files = api.list_repo_files(repo_id=repo, repo_type="dataset", revision=branch)
# Timestamped names (Chat-Share_YYYYmmdd_HHMMSS) sort lexicographically,
# so the last entry is the newest backup.
backs = sorted(f for f in files if f.endswith(".tar.gz"))
if not backs:
    sys.exit(0)

td = tempfile.mkdtemp()
try:
    path = api.hf_hub_download(repo_id=repo, repo_type="dataset",
                               revision=branch, filename=backs[-1],
                               local_dir=td)
    # NOTE(review): the archive comes from our own private repo, but
    # extractall() without a filter is still open to path traversal if
    # the repo is ever tampered with — consider filter="data" (3.12+).
    with tarfile.open(path) as t:
        t.extractall(os.getenv("BACKUP_DIR"))
finally:
    # fix: the temp download directory used to be leaked on every run
    shutil.rmtree(td, ignore_errors=True)
PY
  log "✅ 恢复最新备份(如果有)"
}
|
# Archive $BACKUP_DIR, upload it to the "Chat-Share" branch, prune old
# archives beyond $DATASET_NUM, and squash branch history.
# Globals: HF_TOKEN, DATASET_ID, DATASET_NUM, BACKUP_DIR (read, via env).
do_backup() {
  local ts fname tmp
  ts=$(date +%Y%m%d_%H%M%S)
  fname="Chat-Share_${ts}.tar.gz"
  tmp=$(mktemp -d) || { log "⚠️ mktemp 失败"; return 1; }
  tar -czf "$tmp/$fname" -C "$BACKUP_DIR" .

  # fix: pass the paths through the environment instead of interpolating
  # shell variables into the (previously unquoted) Python heredoc.
  BACKUP_PATH="$tmp/$fname" BACKUP_NAME="$fname" python3 <<'PY'
import os
from huggingface_hub import HfApi

api = HfApi(token=os.getenv("HF_TOKEN"))
repo, branch = os.getenv("DATASET_ID"), "Chat-Share"
api.upload_file(path_or_fileobj=os.environ["BACKUP_PATH"],
                path_in_repo=os.environ["BACKUP_NAME"],
                repo_id=repo, repo_type="dataset",
                revision=branch)

# Keep only the newest DATASET_NUM archives (lexicographic == chronological
# because of the timestamped file names).
keep = int(os.getenv("DATASET_NUM", "10"))
files = api.list_repo_files(repo_id=repo, repo_type="dataset", revision=branch)
backs = sorted(f for f in files if f.endswith(".tar.gz"))
for old in backs[:-keep]:
    api.delete_file(path_in_repo=old,
                    repo_id=repo, repo_type="dataset", revision=branch)

# Collapse branch history so deleted archives stop occupying repo storage.
api.super_squash_history(repo_id=repo, repo_type="dataset", branch=branch)
PY

  rm -rf -- "$tmp"
  log "✅ 上传备份并清理临时文件"
}
|
# Endless backup loop; intended to run in the background (see main).
# Globals: SYNC_INTERVAL (read).
sync_loop() {
  while true; do
    # fix: under `set -e` a single transient upload failure used to kill
    # this background loop permanently; log and retry next interval.
    do_backup || log "⚠️ 本次备份失败,将在下个周期重试"
    log "⏳ 下次同步在 ${SYNC_INTERVAL}s 后"
    sleep "${SYNC_INTERVAL}"
  done
}
|
| |
|
# Entry point: apply env defaults, optionally start the backup pipeline,
# then replace this process with the application.
# Globals (defaulted here, exported via `set -a`):
#   BACKUP_DIR    - directory to back up / restore into
#   DATASET_NUM   - number of archives to keep on the branch
#   SYNC_INTERVAL - seconds between backups
main() {
  BACKUP_DIR="${BACKUP_DIR:-$HOME/app/data}"
  DATASET_NUM="${DATASET_NUM:-10}"
  SYNC_INTERVAL="${SYNC_INTERVAL:-36000}"

  if init_backup; then
    log "🚀 启动备份/同步流程,使用数据集:$DATASET_ID"
    prep_repo
    restore_latest
    sync_loop &  # background; the app keeps the foreground
  else
    log "🚀 直接启动主应用,无备份/同步"
  fi

  # NOTE(review): heredocs above invoke `python3` but the app uses
  # `python` — confirm both resolve to the same interpreter on the host.
  exec python app.py
}
|
main "$@"  # forward script arguments (backward-compatible: main ignores them today)