Change vars to be lighter on HF servers
Browse files- backend/runner/config.py +8 -5
backend/runner/config.py
CHANGED
|
@@ -338,7 +338,7 @@ def _download_markdown_files_parallel(works_dir: Path, work_dirs: set, files: li
|
|
| 338 |
completed = 0
|
| 339 |
failed = 0
|
| 340 |
|
| 341 |
-
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
|
| 342 |
future_to_work = {executor.submit(download_markdown_file, work_id): work_id for work_id in work_list}
|
| 343 |
|
| 344 |
for future in concurrent.futures.as_completed(future_to_work):
|
|
@@ -352,6 +352,8 @@ def _download_markdown_files_parallel(works_dir: Path, work_dirs: set, files: li
|
|
| 352 |
|
| 353 |
if (completed + failed) % 500 == 0:
|
| 354 |
print(f"📄 Downloaded {completed}/{len(work_list)} markdown files (failed: {failed})")
|
|
|
|
|
|
|
| 355 |
|
| 356 |
except Exception as e:
|
| 357 |
print(f"❌ Error processing {work_id}: {e}")
|
|
@@ -404,7 +406,7 @@ def _download_images_batch(works_dir: Path, work_dirs: set, files: list) -> None
|
|
| 404 |
|
| 405 |
# Process works in batches to avoid overwhelming the server
|
| 406 |
work_list = list(work_dirs)
|
| 407 |
-
batch_size = 50  # Process 50 works at a time
|
| 408 |
total_downloaded = 0
|
| 409 |
total_failed = 0
|
| 410 |
|
|
@@ -412,7 +414,7 @@ def _download_images_batch(works_dir: Path, work_dirs: set, files: list) -> None
|
|
| 412 |
batch = work_list[i:i + batch_size]
|
| 413 |
print(f"🖼️ Processing image batch {i//batch_size + 1}/{(len(work_list) + batch_size - 1)//batch_size} ({len(batch)} works)")
|
| 414 |
|
| 415 |
-
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
|
| 416 |
future_to_work = {executor.submit(download_work_images, work_id): work_id for work_id in batch}
|
| 417 |
|
| 418 |
for future in concurrent.futures.as_completed(future_to_work):
|
|
@@ -425,8 +427,9 @@ def _download_images_batch(works_dir: Path, work_dirs: set, files: list) -> None
|
|
| 425 |
print(f"❌ Error processing {work_id}: {e}")
|
| 426 |
total_failed += 1
|
| 427 |
|
| 428 |
-
#
|
| 429 |
-
|
|
|
|
| 430 |
|
| 431 |
print(f"✅ Phase 2 complete: {total_downloaded} images downloaded, {total_failed} failed")
|
| 432 |
|
|
|
|
| 338 |
completed = 0
|
| 339 |
failed = 0
|
| 340 |
|
| 341 |
+
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: # Reduced from 10 to 5
|
| 342 |
future_to_work = {executor.submit(download_markdown_file, work_id): work_id for work_id in work_list}
|
| 343 |
|
| 344 |
for future in concurrent.futures.as_completed(future_to_work):
|
|
|
|
| 352 |
|
| 353 |
if (completed + failed) % 500 == 0:
|
| 354 |
print(f"📄 Downloaded {completed}/{len(work_list)} markdown files (failed: {failed})")
|
| 355 |
+
# Small delay every 500 files to be gentle on HF servers
|
| 356 |
+
time.sleep(2)
|
| 357 |
|
| 358 |
except Exception as e:
|
| 359 |
print(f"❌ Error processing {work_id}: {e}")
|
|
|
|
| 406 |
|
| 407 |
# Process works in batches to avoid overwhelming the server
|
| 408 |
work_list = list(work_dirs)
|
| 409 |
+
batch_size = 20 # Process 20 works at a time (reduced from 50)
|
| 410 |
total_downloaded = 0
|
| 411 |
total_failed = 0
|
| 412 |
|
|
|
|
| 414 |
batch = work_list[i:i + batch_size]
|
| 415 |
print(f"🖼️ Processing image batch {i//batch_size + 1}/{(len(work_list) + batch_size - 1)//batch_size} ({len(batch)} works)")
|
| 416 |
|
| 417 |
+
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: # Reduced from 5 to 3
|
| 418 |
future_to_work = {executor.submit(download_work_images, work_id): work_id for work_id in batch}
|
| 419 |
|
| 420 |
for future in concurrent.futures.as_completed(future_to_work):
|
|
|
|
| 427 |
print(f"❌ Error processing {work_id}: {e}")
|
| 428 |
total_failed += 1
|
| 429 |
|
| 430 |
+
# Longer delay between batches to be more gentle on HF servers
|
| 431 |
+
print(f"⏳ Waiting 5 seconds before next batch...")
|
| 432 |
+
time.sleep(5)
|
| 433 |
|
| 434 |
print(f"✅ Phase 2 complete: {total_downloaded} images downloaded, {total_failed} failed")
|
| 435 |
|