tao-shen Claude Opus 4.6 committed on
Commit
980afb9
·
1 Parent(s): 7f87cb0

fix: auto-split large dirs + exclude huggingrun.env from upload

Browse files

- Dirs with >2000 files are recursively split into sub-dirs
(avoids 504 timeout on usr/ which has 10K+ files)
- Exclude huggingrun.env (contains HF_TOKEN, rejected by secret scanner)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. entrypoint.py +80 -38
entrypoint.py CHANGED
@@ -51,6 +51,7 @@ UPLOAD_IGNORE = [
51
  "*.sock", "*.lock",
52
  ".huggingface",
53
  ".cache",
 
54
  ]
55
 
56
 
@@ -249,55 +250,96 @@ def save_and_upload():
249
  shutil.rmtree(full, ignore_errors=True)
250
  dirnames.remove(d)
251
 
252
- # Upload per top-level directory (avoids 504 timeout on huge single commit)
253
  api = HfApi(token=HF_TOKEN)
254
  ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
255
-
256
- # First, upload loose files in /data/ root (user-packages.list, etc.)
257
  t0_all = time.time()
258
- try:
259
- root_files = [f for f in os.listdir(PERSIST_PATH)
260
- if os.path.isfile(os.path.join(PERSIST_PATH, f))
261
- and not f.startswith('.')]
262
- if root_files:
263
- for rf in root_files:
264
- fpath = os.path.join(PERSIST_PATH, rf)
265
- api.upload_file(
266
- path_or_fileobj=fpath,
267
- path_in_repo=rf,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  repo_id=HF_DATASET_REPO,
269
  repo_type="dataset",
270
- commit_message=f"sync {ts}: {rf}",
 
 
271
  )
272
- log(f" uploaded {len(root_files)} root files")
273
- except Exception as e:
274
- log(f" root files upload failed: {e}")
275
-
276
- # Then upload each top-level directory as a separate commit
277
- top_dirs = sorted([d for d in os.listdir(PERSIST_PATH)
278
- if os.path.isdir(os.path.join(PERSIST_PATH, d))
279
- and not d.startswith('.')])
280
- uploaded = 0
281
- failed = 0
282
- for d in top_dirs:
283
- dir_path = os.path.join(PERSIST_PATH, d)
284
- t0 = time.time()
 
285
  try:
286
- api.upload_folder(
287
- folder_path=dir_path,
 
288
  repo_id=HF_DATASET_REPO,
289
  repo_type="dataset",
290
- path_in_repo=d,
291
- commit_message=f"sync {ts}: {d}/",
292
- ignore_patterns=UPLOAD_IGNORE,
293
  )
294
- elapsed = time.time() - t0
295
- log(f" {d}/ uploaded ({elapsed:.1f}s)")
296
- uploaded += 1
297
  except Exception as e:
298
- elapsed = time.time() - t0
299
- log(f" {d}/ failed ({elapsed:.1f}s): {e}")
300
- failed += 1
 
 
 
 
 
 
 
301
 
302
  elapsed_all = time.time() - t0_all
303
  log(f"══ SYNC: done ({elapsed_all:.1f}s) β€” {uploaded} ok, {failed} failed ══")
 
51
  "*.sock", "*.lock",
52
  ".huggingface",
53
  ".cache",
54
+ "huggingrun.env", # contains HF_TOKEN, rejected by secret scanner
55
  ]
56
 
57
 
 
250
  shutil.rmtree(full, ignore_errors=True)
251
  dirnames.remove(d)
252
 
253
+ # Upload directory tree β€” split large dirs into sub-dirs to avoid 504 timeout
254
  api = HfApi(token=HF_TOKEN)
255
  ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
 
 
256
  t0_all = time.time()
257
+ uploaded = 0
258
+ failed = 0
259
+
260
+ def upload_dir(local_path, repo_path):
261
+ """Upload a directory. If it's too large (>2000 files), split into sub-dirs."""
262
+ nonlocal uploaded, failed
263
+ # Count files (excluding ignored patterns)
264
+ try:
265
+ file_count = int(subprocess.check_output(
266
+ f"find '{local_path}' -type f -not -name '*.pyc' -not -path '*/__pycache__/*' "
267
+ f"-not -path '*/.cache/*' -not -path '*/.git/*' | wc -l",
268
+ shell=True, text=True).strip())
269
+ except Exception:
270
+ file_count = 0
271
+
272
+ if file_count > 2000:
273
+ # Too large β€” recurse into sub-directories
274
+ log(f" {repo_path}/ ({file_count} files) β†’ splitting")
275
+ sub_items = sorted(os.listdir(local_path))
276
+ # Upload files in this dir first (non-recursive)
277
+ dir_files = [f for f in sub_items
278
+ if os.path.isfile(os.path.join(local_path, f))
279
+ and not f.startswith('.')]
280
+ if dir_files:
281
+ for f in dir_files:
282
+ try:
283
+ api.upload_file(
284
+ path_or_fileobj=os.path.join(local_path, f),
285
+ path_in_repo=f"{repo_path}/{f}" if repo_path else f,
286
+ repo_id=HF_DATASET_REPO,
287
+ repo_type="dataset",
288
+ commit_message=f"sync {ts}: {repo_path}/{f}",
289
+ )
290
+ except Exception:
291
+ pass
292
+ # Recurse into sub-dirs
293
+ for d in sub_items:
294
+ dp = os.path.join(local_path, d)
295
+ if os.path.isdir(dp) and not d.startswith('.'):
296
+ rp = f"{repo_path}/{d}" if repo_path else d
297
+ upload_dir(dp, rp)
298
+ else:
299
+ # Small enough β€” upload as single commit
300
+ t0 = time.time()
301
+ try:
302
+ api.upload_folder(
303
+ folder_path=local_path,
304
  repo_id=HF_DATASET_REPO,
305
  repo_type="dataset",
306
+ path_in_repo=repo_path,
307
+ commit_message=f"sync {ts}: {repo_path}/",
308
+ ignore_patterns=UPLOAD_IGNORE,
309
  )
310
+ elapsed = time.time() - t0
311
+ log(f" {repo_path}/ uploaded ({elapsed:.1f}s, {file_count} files)")
312
+ uploaded += 1
313
+ except Exception as e:
314
+ elapsed = time.time() - t0
315
+ err_short = str(e).split('\n')[0][:100]
316
+ log(f" {repo_path}/ failed ({elapsed:.1f}s): {err_short}")
317
+ failed += 1
318
+
319
+ # Upload loose files at root
320
+ root_files = [f for f in os.listdir(PERSIST_PATH)
321
+ if os.path.isfile(os.path.join(PERSIST_PATH, f))
322
+ and not f.startswith('.')]
323
+ for rf in root_files:
324
  try:
325
+ api.upload_file(
326
+ path_or_fileobj=os.path.join(PERSIST_PATH, rf),
327
+ path_in_repo=rf,
328
  repo_id=HF_DATASET_REPO,
329
  repo_type="dataset",
330
+ commit_message=f"sync {ts}: {rf}",
 
 
331
  )
 
 
 
332
  except Exception as e:
333
+ log(f" {rf} failed: {str(e)[:80]}")
334
+ if root_files:
335
+ log(f" uploaded {len(root_files)} root files")
336
+
337
+ # Upload each top-level directory (auto-splits if >2000 files)
338
+ top_dirs = sorted([d for d in os.listdir(PERSIST_PATH)
339
+ if os.path.isdir(os.path.join(PERSIST_PATH, d))
340
+ and not d.startswith('.')])
341
+ for d in top_dirs:
342
+ upload_dir(os.path.join(PERSIST_PATH, d), d)
343
 
344
  elapsed_all = time.time() - t0_all
345
  log(f"══ SYNC: done ({elapsed_all:.1f}s) β€” {uploaded} ok, {failed} failed ══")