File size: 3,683 Bytes
1966c0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env bash
# Backup/sync entrypoint: restores the newest archive from a Hugging Face
# dataset branch into BACKUP_DIR, periodically uploads new archives, then
# launches app.py. Needs HF_TOKEN unless DATASET_ID is preset; optional
# overrides: DATASET_ID, BACKUP_DIR, DATASET_NUM, SYNC_INTERVAL.
set -euo pipefail
IFS=$'\n\t'

# Auto-export every assignment that follows — the embedded python3 heredocs
# below read DATASET_ID / BACKUP_DIR / DATASET_NUM via os.getenv.
set -a
# Timestamped log line to stdout: "[YYYY-MM-DD HH:MM:SS] message".
log() {
  printf '[%s] %s\n' "$(date +'%F %T')" "$*"
}

# 1. init_backup — resolve DATASET_ID or report that backup is disabled.
# Reads:   DATASET_ID (optional override), HF_TOKEN (required otherwise)
# Writes:  DATASET_ID, USER_ID (exported by `set -a` for the heredocs below)
# Returns: 0 when DATASET_ID is usable, 1 to skip backup/sync entirely.
init_backup(){
  if [[ -n "${DATASET_ID:-}" ]]; then
    log "📁 使用外部定义的 DATASET_ID=$DATASET_ID"
    return 0
  fi
  if [[ -z "${HF_TOKEN:-}" ]]; then
    log "⚠️ HF_TOKEN 未设置,跳过备份"
    return 1
  fi

  # `|| true`: without it, a python failure makes this assignment non-zero
  # and `set -e` would kill the whole script whenever init_backup is called
  # outside an `if` condition. The empty-string check below covers failure.
  USER_ID=$(python3 - <<'PY'
import os, sys
from huggingface_hub import HfApi
try:
    name = HfApi(token=os.getenv("HF_TOKEN")).whoami().get("name", "")
except Exception:
    sys.exit(1)
if not name:
    sys.exit(1)
print(name)
PY
  ) || true
  if [[ -z "$USER_ID" ]]; then
    log "⚠️ 获取 USER_ID 失败,跳过备份"
    return 1
  fi

  DATASET_ID="${USER_ID}/data"
  log "✅ 设置默认 DATASET_ID=$DATASET_ID"
  return 0
}

# 2. prep_repo — ensure the dataset repo and the "Chat-Share" branch exist.
# Expects DATASET_ID and HF_TOKEN in the environment (exported via `set -a`).
prep_repo(){
  python3 <<'PY'
import os
from huggingface_hub import HfApi

api = HfApi(token=os.getenv("HF_TOKEN"))
dataset = os.environ["DATASET_ID"]
owner = dataset.split("/")[0]

# Create the private dataset repo only when the owner does not have it yet.
if not any(d.id == dataset for d in api.list_datasets(author=owner)):
    api.create_repo(repo_id=dataset, repo_type="dataset", private=True)

# Create the working branch only when it is missing.
target = "Chat-Share"
refs = api.list_repo_refs(repo_id=dataset, repo_type="dataset").branches
if all(b.name != target for b in refs):
    api.create_branch(repo_id=dataset, repo_type="dataset", branch=target)
PY
  log "✅ 数据集 & 分支就绪"
}

# 3. restore_latest — download the newest *.tar.gz backup from the
# Chat-Share branch (if any) and unpack it into BACKUP_DIR.
# No-op (exit 0) when the branch holds no backups yet.
restore_latest(){
  python3 <<'PY'
import os
import sys
import tarfile
import tempfile

from huggingface_hub import HfApi

api = HfApi(token=os.getenv("HF_TOKEN"))
repo, branch = os.getenv("DATASET_ID"), "Chat-Share"

files = api.list_repo_files(repo_id=repo, repo_type="dataset", revision=branch)
# Lexicographic sort is chronological because names embed a
# zero-padded %Y%m%d_%H%M%S timestamp (see do_backup).
backs = sorted(f for f in files if f.endswith(".tar.gz"))
if not backs:
    sys.exit(0)  # nothing to restore — not an error

dest = os.environ.get("BACKUP_DIR")
if not dest:
    # Previously extractall(None) would silently unpack into the CWD.
    sys.exit("BACKUP_DIR is not set")
os.makedirs(dest, exist_ok=True)

td = tempfile.mkdtemp()
path = api.hf_hub_download(repo_id=repo, repo_type="dataset",
                           revision=branch, filename=backs[-1], local_dir=td)
with tarfile.open(path) as t:
    try:
        # "data" filter rejects path-traversal members (Python 3.12+).
        t.extractall(dest, filter="data")
    except TypeError:
        t.extractall(dest)  # older Python: no filter keyword
PY
  log "✅ 恢复最新备份(如果有)"
}

# 4. do_backup — tar up BACKUP_DIR, upload it to the Chat-Share branch,
# prune old archives beyond DATASET_NUM, and squash branch history.
# Returns non-zero (after cleaning its temp dir) when the upload fails.
do_backup(){
  local ts fname tmp
  ts=$(date +%Y%m%d_%H%M%S)
  fname="Chat-Share_${ts}.tar.gz"
  tmp=$(mktemp -d)
  tar -czf "$tmp/$fname" -C "$BACKUP_DIR" .

  # Quoted heredoc + env vars instead of interpolating "$tmp/$fname" into
  # python source — paths with quotes/backslashes can no longer break or
  # inject into the embedded script.
  if ! BACKUP_TMP="$tmp" BACKUP_NAME="$fname" python3 <<'PY'
import os
from huggingface_hub import HfApi

api = HfApi(token=os.getenv("HF_TOKEN"))
repo, branch = os.getenv("DATASET_ID"), "Chat-Share"

name = os.environ["BACKUP_NAME"]
api.upload_file(path_or_fileobj=os.path.join(os.environ["BACKUP_TMP"], name),
                path_in_repo=name,
                repo_id=repo, repo_type="dataset",
                revision=branch)

# Keep only the newest DATASET_NUM archives (sort is chronological
# because names embed a zero-padded timestamp).
keep = int(os.getenv("DATASET_NUM", "10"))
files = api.list_repo_files(repo_id=repo, repo_type="dataset", revision=branch)
backs = sorted(f for f in files if f.endswith(".tar.gz"))
for old in backs[:-keep]:
    api.delete_file(path_in_repo=old,
                    repo_id=repo, repo_type="dataset", revision=branch)

# Collapse branch history so deleted archives stop occupying storage.
api.super_squash_history(repo_id=repo, repo_type="dataset", branch=branch)
PY
  then
    # Previously a failed upload killed the shell via `set -e` and leaked $tmp.
    rm -rf -- "$tmp"
    return 1
  fi

  rm -rf -- "$tmp"
  log "✅ 上传备份并清理临时文件"
}

# 5. sync_loop — run do_backup forever, pausing SYNC_INTERVAL seconds
# between rounds. Intended to run in the background (see main).
sync_loop(){
  local pause="${SYNC_INTERVAL}"
  while :; do
    do_backup
    log "⏳ 下次同步在 ${SYNC_INTERVAL}s 后"
    sleep "$pause"
  done
}

# Main flow: apply defaults, optionally start the backup/sync pipeline in
# the background, then replace this shell with the application process.
main(){
  # Defaults; `set -a` exports them for the python3 heredocs.
  BACKUP_DIR="${BACKUP_DIR:-$HOME/app/data}"
  DATASET_NUM="${DATASET_NUM:-10}"
  SYNC_INTERVAL="${SYNC_INTERVAL:-36000}"

  if ! init_backup; then
    log "🚀 直接启动主应用,无备份/同步"
  else
    log "🚀 启动备份/同步流程,使用数据集:$DATASET_ID"
    prep_repo
    restore_latest
    sync_loop &   # background child; keeps running after the exec below
  fi

  # NOTE(review): everything else in this script calls `python3` — confirm
  # that plain `python` resolves to the same interpreter in the target image.
  exec python app.py
}

main