| |
| from huggingface_hub import snapshot_download, HfApi |
| import os |
| import sys |
| import time |
| import json |
| import hashlib |
| from datetime import datetime |
|
|
def get_dataset_info(repo_id, token):
    """Fetch the dataset's current revision metadata, used to detect updates.

    Queries the Hub through ``HfApi.repo_info`` with ``repo_type="dataset"``.

    Args:
        repo_id: Identifier of the dataset repository to inspect.
        token: Access token handed to ``HfApi``.

    Returns:
        A dict with the keys ``"sha"`` and ``"last_modified"`` (the latter an
        ISO-8601 string, or ``None`` when the Hub reports no modification
        time). ``None`` is returned if anything goes wrong; the error is
        printed rather than raised so a periodic caller can simply retry.
    """
    try:
        repo = HfApi(token=token).repo_info(repo_id=repo_id, repo_type="dataset")
        snapshot = {"sha": repo.sha, "last_modified": None}
        modified = repo.last_modified
        if modified:
            snapshot["last_modified"] = modified.isoformat()
    except Exception as e:
        print(f"获取数据集信息出错: {str(e)}")
        return None
    return snapshot
|
|
def save_dataset_info(info, music_dir):
    """Persist dataset metadata as JSON inside the music directory.

    The metadata is written to ``<music_dir>/.dataset_info.json``, replacing
    any previous content.

    Args:
        info: JSON-serialisable metadata to store.
        music_dir: Directory that receives the ``.dataset_info.json`` file.

    Returns:
        None. Failures while opening or writing the file are printed and
        swallowed, so saving is best-effort.
    """
    target = os.path.join(music_dir, ".dataset_info.json")
    try:
        handle = open(target, mode="w")
        with handle:
            json.dump(info, fp=handle)
    except Exception as e:
        print(f"保存数据集信息出错: {str(e)}")
|
|
def load_dataset_info(music_dir):
    """Load previously saved dataset metadata from the music directory.

    Reads ``<music_dir>/.dataset_info.json``.

    Args:
        music_dir: Directory expected to contain ``.dataset_info.json``.

    Returns:
        The stored metadata as a dict, or ``None`` when the file is missing,
        cannot be read or parsed, or does not contain a JSON object.
    """
    info_file = os.path.join(music_dir, ".dataset_info.json")

    # Open directly instead of checking os.path.exists() first: the file could
    # disappear between the check and the open.
    try:
        with open(info_file, "r", encoding="utf-8") as f:
            info = json.load(f)
    except FileNotFoundError:
        # Nothing has been saved yet; this is the normal first-run case.
        return None
    except Exception as e:
        print(f"加载数据集信息出错: {str(e)}")
        return None

    # Valid JSON that is not an object (e.g. a list or a bare string) would
    # break callers that use ``.get(...)`` on the result, so treat it as
    # "no usable cached info".
    if not isinstance(info, dict):
        print("加载数据集信息出错: 文件内容不是 JSON 对象")
        return None
    return info
|
|
def update_music(dataset_name, token, music_dir, force=False):
    """Download the music dataset, but only when it changed (or when forced).

    The remote commit sha is compared with the sha recorded by the previous
    successful run; the download is skipped when they match and ``force`` is
    false.

    Args:
        dataset_name: Repository id of the dataset to mirror.
        token: Access token used for the Hub calls.
        music_dir: Local directory that receives the dataset files and the
            ``.dataset_info.json`` bookkeeping file.
        force: When true, download even if the recorded sha matches.

    Returns:
        True if a download completed and the new metadata was saved; False if
        the remote info was unavailable, nothing changed, or the download
        failed (the error is printed, not raised).
    """
    print(f"[{datetime.now()}] 检查音乐数据集更新...")

    remote_info = get_dataset_info(dataset_name, token)
    if not remote_info:
        print("无法获取远程数据集信息,跳过更新")
        return False
    remote_sha = remote_info.get("sha")

    local_info = load_dataset_info(music_dir)
    # Only trust the cached metadata if it really is a non-empty mapping; a
    # malformed cache must trigger a re-download rather than crash on .get().
    up_to_date = (
        isinstance(local_info, dict)
        and bool(local_info)
        and local_info.get("sha") == remote_sha
    )

    if not force and up_to_date:
        print("音乐数据集没有变化,无需更新")
        return False

    print("检测到音乐数据集有更新,开始下载...")

    try:
        # Pin the download to the commit that was just inspected, so the sha
        # recorded below describes exactly the files on disk even if the
        # repository receives a new commit between the two Hub calls.
        # A missing sha (None) falls back to the default revision.
        snapshot_download(
            repo_id=dataset_name,
            repo_type="dataset",
            revision=remote_sha,
            local_dir=music_dir,
            token=token,
        )

        # Record the new revision only after the download succeeded, so a
        # failed attempt is retried on the next check.
        save_dataset_info(remote_info, music_dir)

        print(f"[{datetime.now()}] 音乐数据集更新成功!")
        return True
    except Exception as e:
        print(f"更新音乐数据集出错: {str(e)}")
        return False
|
|
if __name__ == "__main__":
    # Command line:
    #   <dataset_name> <token> <music_dir> [interval_seconds] [force]
    # interval_seconds defaults to 3600; force is enabled only by the literal
    # string "true" (case-insensitive).
    if len(sys.argv) < 4:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit(
            f"用法: python {sys.argv[0]} "
            "<dataset_name> <token> <music_dir> [interval_seconds] [force]"
        )
    dataset_name, token, music_dir = sys.argv[1:4]

    try:
        interval = int(sys.argv[4]) if len(sys.argv) > 4 else 3600
    except ValueError:
        sys.exit(f"无效的更新间隔: {sys.argv[4]!r} (需要整数秒)")
    if interval < 0:
        # time.sleep() rejects negative values; fail before doing any work
        # rather than after the first full download.
        sys.exit(f"无效的更新间隔: {interval} (不能为负数)")
    force = sys.argv[5].lower() == "true" if len(sys.argv) > 5 else False

    # The first run always forces a download, regardless of the force flag.
    update_music(dataset_name, token, music_dir, force=True)

    # Afterwards, check periodically; downloads happen only on change unless
    # the force flag was given.
    while True:
        time.sleep(interval)
        update_music(dataset_name, token, music_dir, force=force)