#!/usr/bin/env bash
# Backup/sync bootstrap: restore the newest Hugging Face dataset backup,
# periodically upload fresh ones, then launch the main application.
set -euo pipefail
# Safer word-splitting default: split only on newlines and tabs.
IFS=$'\n\t'

# Auto-export every variable assigned from here on so the embedded python3
# heredocs below (which read os.getenv / os.environ) can see DATASET_ID,
# BACKUP_DIR, DATASET_NUM, etc. without explicit 'export' statements.
set -a
|
# Emit a timestamped log line ("[YYYY-MM-DD HH:MM:SS] message") on stdout.
log() {
  printf '[%s] %s\n' "$(date +'%F %T')" "$*"
}
|
|
| |
# Decide which HF dataset repo receives backups.
# Globals:  DATASET_ID (read/written), HF_TOKEN (read), USER_ID (written)
# Returns:  0 with DATASET_ID set, 1 when backups must be skipped.
init_backup() {
  # Honour an externally supplied repo id as-is.
  if [[ -n "${DATASET_ID:-}" ]]; then
    log "📁 使用外部定义的 DATASET_ID=$DATASET_ID"
    return 0
  fi
  if [[ -z "${HF_TOKEN:-}" ]]; then
    log "⚠️ HF_TOKEN 未设置,跳过备份"
    return 1
  fi

  # Resolve the token owner's username. The '|| USER_ID=""' keeps the script
  # alive under 'set -e' even when this function is NOT called inside an
  # 'if' condition (the original silently relied on -e being suspended).
  USER_ID=$(python3 - <<'PY'
import os
import sys
from huggingface_hub import HfApi
try:
    name = HfApi(token=os.getenv("HF_TOKEN")).whoami().get("name", "")
except Exception:  # network/auth failure -> signal via exit code
    sys.exit(1)
if not name:
    sys.exit(1)
print(name)
PY
  ) || USER_ID=""
  if [[ -z "$USER_ID" ]]; then
    log "⚠️ 获取 USER_ID 失败,跳过备份"
    return 1
  fi

  DATASET_ID="${USER_ID}/data"
  log "✅ 设置默认 DATASET_ID=$DATASET_ID"
  return 0
}
|
|
| |
# Ensure the backup dataset repo and its "Chat-Share" branch exist.
# Reads HF_TOKEN and DATASET_ID from the environment (exported via 'set -a').
prep_repo() {
  python3 <<'PY'
import os
from huggingface_hub import HfApi

api = HfApi(token=os.getenv("HF_TOKEN"))
repo = os.environ["DATASET_ID"]
# exist_ok=True makes both calls idempotent, avoiding the list-then-create
# race and the cost of enumerating every dataset owned by the author.
api.create_repo(repo_id=repo, repo_type="dataset", private=True, exist_ok=True)
api.create_branch(repo_id=repo, repo_type="dataset", branch="Chat-Share",
                  exist_ok=True)
PY
  log "✅ 数据集 & 分支就绪"
}
|
|
| |
# Download the newest *.tar.gz backup from the Chat-Share branch (if any
# exists) and extract it into $BACKUP_DIR. No-op when there is no backup.
restore_latest() {
  python3 <<'PY'
import os
import sys
import tarfile
import tempfile
from huggingface_hub import HfApi

api = HfApi(token=os.getenv("HF_TOKEN"))
repo, branch = os.getenv("DATASET_ID"), "Chat-Share"
files = api.list_repo_files(repo_id=repo, repo_type="dataset", revision=branch)
backups = sorted(f for f in files if f.endswith(".tar.gz"))
if not backups:
    sys.exit(0)  # nothing to restore yet
tmp = tempfile.mkdtemp()
path = api.hf_hub_download(repo_id=repo, repo_type="dataset",
                           revision=branch, filename=backups[-1], local_dir=tmp)
with tarfile.open(path) as tar:
    dest = os.getenv("BACKUP_DIR")
    # The 'data' filter (Python 3.12+) rejects path-traversal members
    # (tar-slip) and dangerous file types; plain extractall is unsafe on
    # attacker-influenced archives.
    if hasattr(tarfile, "data_filter"):
        tar.extractall(dest, filter="data")
    else:
        tar.extractall(dest)
PY
  log "✅ 恢复最新备份(如果有)"
}
|
|
| |
# Archive $BACKUP_DIR, upload it to the Chat-Share branch, prune old archives
# down to $DATASET_NUM, and squash branch history to bound repo growth.
do_backup() {
  local ts fname tmp rc
  ts=$(date +%Y%m%d_%H%M%S)
  fname="Chat-Share_${ts}.tar.gz"
  tmp=$(mktemp -d)
  tar -czf "$tmp/$fname" -C "$BACKUP_DIR" .

  # Pass paths through the environment instead of interpolating shell
  # variables into python source: the heredoc is now quoted ('PY'), so no
  # shell expansion/quoting pitfalls inside the script.
  rc=0
  BACKUP_TMP="$tmp" BACKUP_NAME="$fname" python3 <<'PY' || rc=$?
import os
from huggingface_hub import HfApi

api = HfApi(token=os.getenv("HF_TOKEN"))
repo, branch = os.getenv("DATASET_ID"), "Chat-Share"
tmp, fname = os.environ["BACKUP_TMP"], os.environ["BACKUP_NAME"]
api.upload_file(path_or_fileobj=os.path.join(tmp, fname),
                path_in_repo=fname,
                repo_id=repo, repo_type="dataset",
                revision=branch)
keep = int(os.getenv("DATASET_NUM", "10"))
files = api.list_repo_files(repo_id=repo, repo_type="dataset", revision=branch)
backups = sorted(f for f in files if f.endswith(".tar.gz"))
# Keep only the newest 'keep' archives. NB: keep == 0 prunes nothing
# (backups[:-0] == []), matching the original behaviour.
for old in backups[:-keep]:
    api.delete_file(path_in_repo=old,
                    repo_id=repo, repo_type="dataset", revision=branch)
# Collapse branch history so repeated uploads don't bloat the repo.
api.super_squash_history(repo_id=repo, repo_type="dataset", branch=branch)
PY

  # Remove the temp dir even when the upload failed (the original leaked it
  # because 'set -e' aborted before reaching the rm).
  rm -rf -- "$tmp"
  (( rc == 0 )) || return "$rc"
  log "✅ 上传备份并清理临时文件"
}
|
|
| |
# Run do_backup forever, sleeping $SYNC_INTERVAL seconds between rounds.
# Intended to be launched in the background (see main).
sync_loop() {
  for (( ; ; )); do
    do_backup
    log "⏳ 下次同步在 ${SYNC_INTERVAL}s 后"
    sleep "${SYNC_INTERVAL}"
  done
}
|
|
| |
# Entry point: set defaults, optionally start the backup pipeline in the
# background, then hand the process image over to the application.
main() {
  # Defaults only apply when the variable is unset or empty; 'set -a'
  # exports whatever := assigns.
  : "${BACKUP_DIR:=$HOME/app/data}"
  : "${DATASET_NUM:=10}"
  : "${SYNC_INTERVAL:=36000}"

  if ! init_backup; then
    log "🚀 直接启动主应用,无备份/同步"
  else
    log "🚀 启动备份/同步流程,使用数据集:$DATASET_ID"
    prep_repo
    restore_latest
    sync_loop &
  fi

  exec python app.py
}

main
|