Spaces:

samwaugh
/

ArteFact

Paused

App Files Files Community

samwaugh committed on Sep 29, 2025

Commit

1a419bd

1 Parent(s): 33b499e

Change vars to be lighter on HF servers

Browse files

Files changed (1) hide show

backend/runner/config.py +8 -5

backend/runner/config.py CHANGED Viewed

@@ -338,7 +338,7 @@ def _download_markdown_files_parallel(works_dir: Path, work_dirs: set, files: li
     completed = 0
     failed = 0
-    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
         future_to_work = {executor.submit(download_markdown_file, work_id): work_id for work_id in work_list}
         for future in concurrent.futures.as_completed(future_to_work):
@@ -352,6 +352,8 @@ def _download_markdown_files_parallel(works_dir: Path, work_dirs: set, files: li
                 if (completed + failed) % 500 == 0:
                     print(f"📄 Downloaded {completed}/{len(work_list)} markdown files (failed: {failed})")
             except Exception as e:
                 print(f"❌ Error processing {work_id}: {e}")
@@ -404,7 +406,7 @@ def _download_images_batch(works_dir: Path, work_dirs: set, files: list) -> None
     # Process works in batches to avoid overwhelming the server
     work_list = list(work_dirs)
-    batch_size = 50  # Process 50 works at a time
     total_downloaded = 0
     total_failed = 0
@@ -412,7 +414,7 @@ def _download_images_batch(works_dir: Path, work_dirs: set, files: list) -> None
         batch = work_list[i:i + batch_size]
         print(f"🖼️  Processing image batch {i//batch_size + 1}/{(len(work_list) + batch_size - 1)//batch_size} ({len(batch)} works)")
-        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
             future_to_work = {executor.submit(download_work_images, work_id): work_id for work_id in batch}
             for future in concurrent.futures.as_completed(future_to_work):
@@ -425,8 +427,9 @@ def _download_images_batch(works_dir: Path, work_dirs: set, files: list) -> None
                     print(f"❌ Error processing {work_id}: {e}")
                     total_failed += 1
-        # Small delay between batches to be nice to the server
-        time.sleep(1)
     print(f"✅ Phase 2 complete: {total_downloaded} images downloaded, {total_failed} failed")

     completed = 0
     failed = 0
+    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:  # Reduced from 10 to 5
         future_to_work = {executor.submit(download_markdown_file, work_id): work_id for work_id in work_list}
         for future in concurrent.futures.as_completed(future_to_work):
                 if (completed + failed) % 500 == 0:
                     print(f"📄 Downloaded {completed}/{len(work_list)} markdown files (failed: {failed})")
+                    # Small delay every 500 files to be gentle on HF servers
+                    time.sleep(2)
             except Exception as e:
                 print(f"❌ Error processing {work_id}: {e}")
     # Process works in batches to avoid overwhelming the server
     work_list = list(work_dirs)
+    batch_size = 20  # Process 20 works at a time (reduced from 50)
     total_downloaded = 0
     total_failed = 0
         batch = work_list[i:i + batch_size]
         print(f"🖼️  Processing image batch {i//batch_size + 1}/{(len(work_list) + batch_size - 1)//batch_size} ({len(batch)} works)")
+        with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:  # Reduced from 5 to 3
             future_to_work = {executor.submit(download_work_images, work_id): work_id for work_id in batch}
             for future in concurrent.futures.as_completed(future_to_work):
                     print(f"❌ Error processing {work_id}: {e}")
                     total_failed += 1
+        # Longer delay between batches to be more gentle on HF servers
+        print(f"⏳ Waiting 5 seconds before next batch...")
+        time.sleep(5)
     print(f"✅ Phase 2 complete: {total_downloaded} images downloaded, {total_failed} failed")