samwaugh committed on
Commit
1a419bd
·
1 Parent(s): 33b499e

Change vars to be lighter on HF servers

Browse files
Files changed (1) hide show
  1. backend/runner/config.py +8 -5
backend/runner/config.py CHANGED
@@ -338,7 +338,7 @@ def _download_markdown_files_parallel(works_dir: Path, work_dirs: set, files: li
338
  completed = 0
339
  failed = 0
340
 
341
- with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
342
  future_to_work = {executor.submit(download_markdown_file, work_id): work_id for work_id in work_list}
343
 
344
  for future in concurrent.futures.as_completed(future_to_work):
@@ -352,6 +352,8 @@ def _download_markdown_files_parallel(works_dir: Path, work_dirs: set, files: li
352
 
353
  if (completed + failed) % 500 == 0:
354
  print(f"📄 Downloaded {completed}/{len(work_list)} markdown files (failed: {failed})")
 
 
355
 
356
  except Exception as e:
357
  print(f"❌ Error processing {work_id}: {e}")
@@ -404,7 +406,7 @@ def _download_images_batch(works_dir: Path, work_dirs: set, files: list) -> None
404
 
405
  # Process works in batches to avoid overwhelming the server
406
  work_list = list(work_dirs)
407
- batch_size = 50 # Process 50 works at a time
408
  total_downloaded = 0
409
  total_failed = 0
410
 
@@ -412,7 +414,7 @@ def _download_images_batch(works_dir: Path, work_dirs: set, files: list) -> None
412
  batch = work_list[i:i + batch_size]
413
  print(f"🖼️ Processing image batch {i//batch_size + 1}/{(len(work_list) + batch_size - 1)//batch_size} ({len(batch)} works)")
414
 
415
- with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
416
  future_to_work = {executor.submit(download_work_images, work_id): work_id for work_id in batch}
417
 
418
  for future in concurrent.futures.as_completed(future_to_work):
@@ -425,8 +427,9 @@ def _download_images_batch(works_dir: Path, work_dirs: set, files: list) -> None
425
  print(f"❌ Error processing {work_id}: {e}")
426
  total_failed += 1
427
 
428
- # Small delay between batches to be nice to the server
429
- time.sleep(1)
 
430
 
431
  print(f"✅ Phase 2 complete: {total_downloaded} images downloaded, {total_failed} failed")
432
 
 
338
  completed = 0
339
  failed = 0
340
 
341
+ with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: # Reduced from 10 to 5
342
  future_to_work = {executor.submit(download_markdown_file, work_id): work_id for work_id in work_list}
343
 
344
  for future in concurrent.futures.as_completed(future_to_work):
 
352
 
353
  if (completed + failed) % 500 == 0:
354
  print(f"📄 Downloaded {completed}/{len(work_list)} markdown files (failed: {failed})")
355
+ # Small delay every 500 files to be gentle on HF servers
356
+ time.sleep(2)
357
 
358
  except Exception as e:
359
  print(f"❌ Error processing {work_id}: {e}")
 
406
 
407
  # Process works in batches to avoid overwhelming the server
408
  work_list = list(work_dirs)
409
+ batch_size = 20 # Process 20 works at a time (reduced from 50)
410
  total_downloaded = 0
411
  total_failed = 0
412
 
 
414
  batch = work_list[i:i + batch_size]
415
  print(f"🖼️ Processing image batch {i//batch_size + 1}/{(len(work_list) + batch_size - 1)//batch_size} ({len(batch)} works)")
416
 
417
+ with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: # Reduced from 5 to 3
418
  future_to_work = {executor.submit(download_work_images, work_id): work_id for work_id in batch}
419
 
420
  for future in concurrent.futures.as_completed(future_to_work):
 
427
  print(f"❌ Error processing {work_id}: {e}")
428
  total_failed += 1
429
 
430
+ # Longer delay between batches to be more gentle on HF servers
431
+ print(f"⏳ Waiting 5 seconds before next batch...")
432
+ time.sleep(5)
433
 
434
  print(f"✅ Phase 2 complete: {total_downloaded} images downloaded, {total_failed} failed")
435