Update sync_to_dataset.py
Browse files- sync_to_dataset.py +36 -18
sync_to_dataset.py
CHANGED
|
@@ -4,43 +4,61 @@ import time
|
|
| 4 |
import logging
|
| 5 |
from huggingface_hub import HfApi, upload_folder
|
| 6 |
|
|
|
|
| 7 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 8 |
logger = logging.getLogger(__name__)
|
| 9 |
|
|
|
|
| 10 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 11 |
-
DATASET_REPO = os.environ.get("OPENCODE_DATASET_REPO")
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
if not HF_TOKEN:
|
| 15 |
-
logger.error("HF_TOKEN environment variable not set.
|
| 16 |
exit(1)
|
| 17 |
if not DATASET_REPO:
|
| 18 |
-
logger.error("OPENCODE_DATASET_REPO environment variable not set.
|
| 19 |
exit(1)
|
| 20 |
|
| 21 |
api = HfApi(token=HF_TOKEN)
|
| 22 |
|
| 23 |
-
def
|
|
|
|
| 24 |
try:
|
| 25 |
-
if not os.path.isdir(
|
| 26 |
-
logger.warning("
|
| 27 |
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
upload_folder(
|
| 29 |
repo_id=DATASET_REPO,
|
| 30 |
-
folder_path=
|
| 31 |
-
path_in_repo=
|
| 32 |
repo_type="dataset",
|
| 33 |
-
commit_message=f"Auto
|
| 34 |
)
|
| 35 |
-
logger.info(f"Successfully
|
| 36 |
except Exception as e:
|
| 37 |
-
logger.error(f"
|
| 38 |
|
| 39 |
-
|
| 40 |
-
logger.info(f"Starting
|
|
|
|
|
|
|
| 41 |
while True:
|
| 42 |
-
try:
|
| 43 |
-
sync_data()
|
| 44 |
-
except Exception as e:
|
| 45 |
-
logger.error(f"Unexpected error in sync loop: {e}")
|
| 46 |
time.sleep(SYNC_INTERVAL)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import logging
|
| 5 |
from huggingface_hub import HfApi, upload_folder
|
| 6 |
|
| 7 |
+
# Configure logging for the whole process.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Read configuration from environment variables.
HF_TOKEN = os.environ.get("HF_TOKEN")
DATASET_REPO = os.environ.get("OPENCODE_DATASET_REPO")  # expected to be "jamesw853/opencode-data"
LOCAL_DIR = "/data"
SYNC_INTERVAL = 180  # seconds

# Target subdirectory inside the dataset repo.
# NOTE: the name contains a space. The original author's note says the
# Hugging Face Hub accepts this, but recommends quoting the path.
TARGET_SUBDIR = "opencode backup"  # uploads land under this subdirectory of the dataset

# Fail fast when required configuration is missing. SystemExit is raised
# directly because the bare exit() helper is injected by the `site` module
# for interactive use and is not guaranteed to exist in every runtime.
if not HF_TOKEN:
    logger.error("HF_TOKEN environment variable not set. Cannot upload.")
    raise SystemExit(1)
if not DATASET_REPO:
    logger.error("OPENCODE_DATASET_REPO environment variable not set.")
    raise SystemExit(1)

api = HfApi(token=HF_TOKEN)
|
| 28 |
|
| 29 |
+
def upload_data():
    """Upload the contents of LOCAL_DIR to TARGET_SUBDIR in the dataset repo.

    The upload is skipped (with a log message) when LOCAL_DIR does not exist
    or contains no files. Any exception raised along the way is logged with
    its traceback and swallowed, so a failed upload never propagates to the
    caller.
    """
    try:
        if not os.path.isdir(LOCAL_DIR):
            logger.warning(f"{LOCAL_DIR} does not exist, skipping upload.")
            return

        # Collect paths relative to LOCAL_DIR; used for the log line and to
        # detect an empty directory.
        files = [
            os.path.relpath(os.path.join(root, name), LOCAL_DIR)
            for root, _, filenames in os.walk(LOCAL_DIR)
            for name in filenames
        ]
        if not files:
            # Nothing to send, so avoid a pointless network round trip.
            logger.info(f"No files found in {LOCAL_DIR}, skipping upload.")
            return
        logger.info(f"Found {len(files)} file(s) to upload: {files[:5]}{'...' if len(files)>5 else ''}")

        # Upload the whole folder into TARGET_SUBDIR inside the dataset repo.
        # The token is passed explicitly so authentication does not depend on
        # the library discovering it on its own.
        upload_folder(
            repo_id=DATASET_REPO,
            folder_path=LOCAL_DIR,
            path_in_repo=TARGET_SUBDIR,  # files appear under "<repo>/opencode backup/"
            repo_type="dataset",
            commit_message=f"Auto backup to {TARGET_SUBDIR} at {time.strftime('%Y-%m-%d %H:%M:%S')}",
            token=HF_TOKEN,
        )
        logger.info(f"Successfully uploaded {LOCAL_DIR} to {DATASET_REPO}/{TARGET_SUBDIR}")
    except Exception as e:
        # Best-effort: log the message and the traceback, then return.
        logger.exception(f"Upload failed: {e}")
|
| 54 |
|
| 55 |
+
def main():
    """Run the backup daemon.

    Logs the active configuration, then repeats forever: upload, sleep for
    SYNC_INTERVAL seconds. The first upload therefore happens immediately
    at startup. This function never returns.
    """
    logger.info(f"Starting backup daemon: interval={SYNC_INTERVAL}s, dataset={DATASET_REPO}, target_dir={TARGET_SUBDIR}")
    # Upload-then-sleep gives the same sequence as "upload once, then
    # sleep/upload in a loop": upload, sleep, upload, sleep, ...
    while True:
        upload_data()
        time.sleep(SYNC_INTERVAL)
|
| 62 |
+
|
| 63 |
+
# Start the daemon only when this file is executed as a script.
if __name__ == "__main__":
    main()
|