Jaimodiji committed on
Commit
7a709ef
·
1 Parent(s): 8794715

Upload hf_sync.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. hf_sync.py +23 -31
hf_sync.py CHANGED
@@ -1,39 +1,30 @@
1
  import os
2
- import subprocess
3
  import sys
4
  import shutil
 
5
 
6
  # Configuration
7
  REPO_ID = os.environ.get("DATASET_REPO_ID")
8
  HF_TOKEN = os.environ.get("HF_TOKEN")
9
 
10
- def run_command(command):
11
- print(f"Running: {' '.join(command)}")
12
- # Ensure HF_TOKEN is in environment for the command
13
- env = os.environ.copy()
14
- if HF_TOKEN:
15
- env["HF_TOKEN"] = HF_TOKEN
16
-
17
- result = subprocess.run(command, capture_output=True, text=True, env=env)
18
- if result.returncode != 0:
19
- print(f"Error: {result.stderr}")
20
- else:
21
- print(f"Output: {result.stdout}")
22
- return result.returncode == 0
23
-
24
  def download():
25
  if not REPO_ID:
26
  print("DATASET_REPO_ID not set, skipping download.")
27
  return
28
 
29
  print(f"Downloading data from {REPO_ID}...")
30
- # hf download REPO_ID --repo-type dataset --local-dir data_repo
31
- # Using --local-dir-use-symlinks False to avoid issues in some environments
32
- success = run_command(["hf", "download", REPO_ID, "--repo-type", "dataset", "--local-dir", "data_repo", ])
33
- if success:
 
 
 
 
 
34
  print("Download successful.")
35
- else:
36
- print("Download failed or repository is empty.")
37
 
38
  def upload():
39
  if not REPO_ID:
@@ -44,22 +35,23 @@ def upload():
44
  return
45
 
46
  print(f"Uploading data to {REPO_ID}...")
47
- # hf upload REPO_ID data_repo / --repo-type dataset
48
- # We upload the contents of data_repo to the root of the dataset
49
- success = run_command(["hf", "upload", REPO_ID, "data_repo", ".", "--repo-type", "dataset"])
50
- if success:
 
 
 
 
51
  print("Upload successful.")
52
- else:
53
- print("Upload failed.")
54
 
55
  def init_local():
56
  """Ensure data_repo has the necessary structure if download failed or it's new."""
57
  os.makedirs("data_repo/output", exist_ok=True)
58
  os.makedirs("data_repo/processed", exist_ok=True)
59
  os.makedirs("data_repo/uploads", exist_ok=True)
60
- # database.db will be created by the app if it doesn't exist,
61
- # but we should ensure it's in data_repo.
62
- # We'll handle this in entrypoint.sh by symlinking.
63
 
64
  if __name__ == "__main__":
65
  if len(sys.argv) < 2:
@@ -74,4 +66,4 @@ if __name__ == "__main__":
74
  elif action == "init":
75
  init_local()
76
  else:
77
- print(f"Unknown action: {action}")
 
1
  import os
 
2
  import sys
3
  import shutil
4
+ from huggingface_hub import snapshot_download, HfApi
5
 
6
  # Configuration
7
  REPO_ID = os.environ.get("DATASET_REPO_ID")
8
  HF_TOKEN = os.environ.get("HF_TOKEN")
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  def download():
11
  if not REPO_ID:
12
  print("DATASET_REPO_ID not set, skipping download.")
13
  return
14
 
15
  print(f"Downloading data from {REPO_ID}...")
16
+ try:
17
+ # Call snapshot_download directly instead of shelling out to the `hf` CLI; max_workers sets how many files download in parallel
18
+ snapshot_download(
19
+ repo_id=REPO_ID,
20
+ repo_type="dataset",
21
+ local_dir="data_repo",
22
+ token=HF_TOKEN,
23
+ max_workers=8
24
+ )
25
  print("Download successful.")
26
+ except Exception as e:
27
+ print(f"Download failed: {e}")
28
 
29
  def upload():
30
  if not REPO_ID:
 
35
  return
36
 
37
  print(f"Uploading data to {REPO_ID}...")
38
+ try:
39
+ api = HfApi(token=HF_TOKEN)
40
+ api.upload_folder(
41
+ folder_path="data_repo",
42
+ repo_id=REPO_ID,
43
+ repo_type="dataset",
44
+ # NOTE: upload_folder pushes the whole folder as a single commit; for very large folders consider HfApi.upload_large_folder
45
+ )
46
  print("Upload successful.")
47
+ except Exception as e:
48
+ print(f"Upload failed: {e}")
49
 
50
  def init_local():
51
  """Ensure data_repo has the necessary structure if download failed or it's new."""
52
  os.makedirs("data_repo/output", exist_ok=True)
53
  os.makedirs("data_repo/processed", exist_ok=True)
54
  os.makedirs("data_repo/uploads", exist_ok=True)
 
 
 
55
 
56
  if __name__ == "__main__":
57
  if len(sys.argv) < 2:
 
66
  elif action == "init":
67
  init_local()
68
  else:
69
+ print(f"Unknown action: {action}")