Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -183,6 +183,27 @@ class CaptionServer:
|
|
| 183 |
servers = [CaptionServer(url) for url in CAPTION_SERVERS]
|
| 184 |
server_index = 0
|
| 185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
# --- Progress and State Management Functions ---
|
| 187 |
|
| 188 |
def load_progress() -> Dict:
|
|
@@ -236,12 +257,14 @@ def save_json_state(file_path: str, data: Dict[str, Any]):
|
|
| 236 |
|
| 237 |
async def download_hf_state() -> Dict[str, Any]:
|
| 238 |
"""Downloads the state file from Hugging Face or returns a default state."""
|
|
|
|
| 239 |
local_path = LOCAL_STATE_FOLDER / HF_STATE_FILE
|
| 240 |
default_state = {"next_download_index": 0, "file_states": {}}
|
| 241 |
|
| 242 |
try:
|
| 243 |
# Check if the file exists in the helium repo
|
| 244 |
-
|
|
|
|
| 245 |
repo_id=HF_OUTPUT_DATASET_ID,
|
| 246 |
repo_type="dataset"
|
| 247 |
)
|
|
@@ -264,11 +287,15 @@ async def download_hf_state() -> Dict[str, Any]:
|
|
| 264 |
return load_json_state(str(local_path), default_state)
|
| 265 |
|
| 266 |
except Exception as e:
|
|
|
|
|
|
|
|
|
|
| 267 |
print(f"[{FLOW_ID}] Failed to download state file: {str(e)}. Starting fresh.")
|
| 268 |
return default_state
|
| 269 |
|
| 270 |
async def upload_hf_state(state: Dict[str, Any]) -> bool:
|
| 271 |
"""Uploads the state file to Hugging Face."""
|
|
|
|
| 272 |
local_path = LOCAL_STATE_FOLDER / HF_STATE_FILE
|
| 273 |
|
| 274 |
try:
|
|
@@ -286,7 +313,13 @@ async def upload_hf_state(state: Dict[str, Any]) -> bool:
|
|
| 286 |
print(f"[{FLOW_ID}] Successfully uploaded state file.")
|
| 287 |
return True
|
| 288 |
except Exception as e:
|
| 289 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
return False
|
| 291 |
|
| 292 |
async def lock_file_for_processing(zip_filename: str, state: Dict[str, Any]) -> bool:
|
|
@@ -334,6 +367,7 @@ async def get_zip_file_list(progress_data: Dict) -> List[str]:
|
|
| 334 |
print(f"[{FLOW_ID}] Using cached file list with {len(progress_data['file_list'])} files.")
|
| 335 |
return progress_data['file_list']
|
| 336 |
|
|
|
|
| 337 |
print(f"[{FLOW_ID}] Fetching full list of zip files from {HF_DATASET_ID}...")
|
| 338 |
try:
|
| 339 |
api = HfApi(token=HF_TOKEN)
|
|
@@ -360,11 +394,15 @@ async def get_zip_file_list(progress_data: Dict) -> List[str]:
|
|
| 360 |
return zip_files
|
| 361 |
|
| 362 |
except Exception as e:
|
|
|
|
|
|
|
|
|
|
| 363 |
print(f"[{FLOW_ID}] Error fetching file list from Hugging Face: {e}")
|
| 364 |
return []
|
| 365 |
|
| 366 |
async def download_and_extract_zip_by_index(file_index: int, repo_file_full_path: str) -> Optional[Path]:
|
| 367 |
"""Downloads the zip file for the given index and extracts its contents."""
|
|
|
|
| 368 |
|
| 369 |
# Extract the base name for the extraction directory
|
| 370 |
zip_full_name = Path(repo_file_full_path).name
|
|
@@ -401,11 +439,15 @@ async def download_and_extract_zip_by_index(file_index: int, repo_file_full_path
|
|
| 401 |
return extract_dir
|
| 402 |
|
| 403 |
except Exception as e:
|
|
|
|
|
|
|
|
|
|
| 404 |
print(f"[{FLOW_ID}] Error downloading or extracting zip for {repo_file_full_path}: {e}")
|
| 405 |
return None
|
| 406 |
|
| 407 |
async def upload_captions_to_hf(zip_full_name: str, captions: List[Dict]) -> bool:
|
| 408 |
"""Uploads the final captions JSON file to the output dataset."""
|
|
|
|
| 409 |
# Use the full zip name, replacing the extension with .json
|
| 410 |
caption_filename = Path(zip_full_name).with_suffix('.json').name
|
| 411 |
|
|
@@ -428,6 +470,11 @@ async def upload_captions_to_hf(zip_full_name: str, captions: List[Dict]) -> boo
|
|
| 428 |
return True
|
| 429 |
|
| 430 |
except Exception as e:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
print(f"[{FLOW_ID}] Error uploading captions for {zip_full_name}: {e}")
|
| 432 |
return False
|
| 433 |
|
|
@@ -454,18 +501,10 @@ async def get_available_server(timeout: float = 300.0) -> CaptionServer:
|
|
| 454 |
|
| 455 |
async def send_image_for_captioning(image_path: Path, course_name: str, progress_tracker: Dict) -> Optional[Dict]:
|
| 456 |
"""Sends a single image to a caption server for processing."""
|
| 457 |
-
global RATE_LIMIT_PAUSE_UNTIL
|
| 458 |
-
|
| 459 |
# This function now handles server selection and retries internally
|
| 460 |
MAX_RETRIES = 3
|
| 461 |
for attempt in range(MAX_RETRIES):
|
| 462 |
-
|
| 463 |
-
current_time = time.time()
|
| 464 |
-
if current_time < RATE_LIMIT_PAUSE_UNTIL:
|
| 465 |
-
wait_time = RATE_LIMIT_PAUSE_UNTIL - current_time
|
| 466 |
-
print(f"[{FLOW_ID}] ⏳ Rate limit pause active. Waiting {int(wait_time)} more seconds...")
|
| 467 |
-
await asyncio.sleep(min(wait_time, 60)) # Wait in chunks
|
| 468 |
-
continue
|
| 469 |
|
| 470 |
server = None
|
| 471 |
try:
|
|
@@ -512,14 +551,17 @@ async def send_image_for_captioning(image_path: Path, course_name: str, progress
|
|
| 512 |
else:
|
| 513 |
print(f"[{FLOW_ID}] Warning: Server returned 200 but no caption for {image_path.name}.")
|
| 514 |
elif resp.status == 429:
|
| 515 |
-
print(f"[{FLOW_ID}] 🛑 HIT RATE LIMIT (429) on {server.url}.
|
| 516 |
-
|
| 517 |
-
#
|
| 518 |
else:
|
| 519 |
print(f"[{FLOW_ID}] Error: Server {server.url} returned status {resp.status} for {image_path.name}.")
|
| 520 |
|
| 521 |
except Exception as e:
|
| 522 |
-
|
|
|
|
|
|
|
|
|
|
| 523 |
finally:
|
| 524 |
if server:
|
| 525 |
server.busy = False
|
|
@@ -535,7 +577,6 @@ async def process_dataset_task(start_index: int):
|
|
| 535 |
"""
|
| 536 |
Main background task that processes zip files sequentially.
|
| 537 |
"""
|
| 538 |
-
global RATE_LIMIT_PAUSE_UNTIL
|
| 539 |
print(f"[{FLOW_ID}] Starting dataset processing task from index {start_index}...")
|
| 540 |
|
| 541 |
# 1. Load progress and get file list
|
|
@@ -552,13 +593,21 @@ async def process_dataset_task(start_index: int):
|
|
| 552 |
# --- NEW: Check helium_tg for existing JSON files to avoid redundant processing ---
|
| 553 |
print(f"[{FLOW_ID}] Checking {HF_OUTPUT_DATASET_ID} for existing JSON outputs...")
|
| 554 |
try:
|
|
|
|
| 555 |
api = HfApi(token=HF_TOKEN)
|
| 556 |
existing_files = api.list_repo_files(repo_id=HF_OUTPUT_DATASET_ID, repo_type="dataset")
|
| 557 |
existing_json_files = {f for f in existing_files if f.endswith('.json')}
|
| 558 |
print(f"[{FLOW_ID}] Found {len(existing_json_files)} existing JSON files in {HF_OUTPUT_DATASET_ID}.")
|
| 559 |
except Exception as e:
|
| 560 |
-
|
| 561 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 562 |
# ----------------------------------------------------------------------------------
|
| 563 |
|
| 564 |
global_success = True
|
|
@@ -568,11 +617,7 @@ async def process_dataset_task(start_index: int):
|
|
| 568 |
start_idx = max(0, start_index - 1)
|
| 569 |
|
| 570 |
for i in range(start_idx, len(file_list)):
|
| 571 |
-
|
| 572 |
-
while time.time() < RATE_LIMIT_PAUSE_UNTIL:
|
| 573 |
-
wait_time = int(RATE_LIMIT_PAUSE_UNTIL - time.time())
|
| 574 |
-
print(f"[{FLOW_ID}] ⏳ Global rate limit pause. Waiting {wait_time}s before next file...")
|
| 575 |
-
await asyncio.sleep(min(wait_time, 60))
|
| 576 |
|
| 577 |
file_index = i + 1
|
| 578 |
repo_file_full_path = file_list[i]
|
|
|
|
| 183 |
servers = [CaptionServer(url) for url in CAPTION_SERVERS]
|
| 184 |
server_index = 0
|
| 185 |
|
| 186 |
+
# --- Centralized Rate Limit Handling ---
|
| 187 |
+
|
| 188 |
+
async def check_rate_limit_pause():
    """Checks if a rate limit pause is active and waits if necessary."""
    global RATE_LIMIT_PAUSE_UNTIL
    # Sleep in bounded chunks (<= 60s) so a shutdown or an updated
    # deadline is noticed within a minute rather than after the full wait.
    while True:
        now = time.time()
        if now >= RATE_LIMIT_PAUSE_UNTIL:
            break
        remaining = int(RATE_LIMIT_PAUSE_UNTIL - now)
        print(f"[{FLOW_ID}] ⏳ Global rate limit pause active. Waiting {remaining}s...")
        await asyncio.sleep(min(remaining, 60))
|
| 200 |
+
|
| 201 |
+
def trigger_rate_limit_pause(pause_seconds: float = 3600):
    """Sets the global rate limit pause deadline.

    Generalized from a hard-coded 1-hour pause: callers may now choose the
    pause length; the default preserves the original behavior.

    Args:
        pause_seconds: How long to pause all HF/captioning traffic, in
            seconds. Defaults to 3600 (1 hour).
    """
    global RATE_LIMIT_PAUSE_UNTIL
    # Message reports the actual duration instead of a hard-coded "1-HOUR",
    # which would be wrong for non-default pause lengths.
    print(f"[{FLOW_ID}] 🛑 TRIGGERING {int(pause_seconds)}s PAUSE due to 429 Rate Limit error.")
    RATE_LIMIT_PAUSE_UNTIL = time.time() + pause_seconds
|
| 206 |
+
|
| 207 |
# --- Progress and State Management Functions ---
|
| 208 |
|
| 209 |
def load_progress() -> Dict:
|
|
|
|
| 257 |
|
| 258 |
async def download_hf_state() -> Dict[str, Any]:
|
| 259 |
"""Downloads the state file from Hugging Face or returns a default state."""
|
| 260 |
+
await check_rate_limit_pause()
|
| 261 |
local_path = LOCAL_STATE_FOLDER / HF_STATE_FILE
|
| 262 |
default_state = {"next_download_index": 0, "file_states": {}}
|
| 263 |
|
| 264 |
try:
|
| 265 |
# Check if the file exists in the helium repo
|
| 266 |
+
api = HfApi(token=HF_TOKEN)
|
| 267 |
+
files = api.list_repo_files(
|
| 268 |
repo_id=HF_OUTPUT_DATASET_ID,
|
| 269 |
repo_type="dataset"
|
| 270 |
)
|
|
|
|
| 287 |
return load_json_state(str(local_path), default_state)
|
| 288 |
|
| 289 |
except Exception as e:
|
| 290 |
+
if "429" in str(e):
|
| 291 |
+
trigger_rate_limit_pause()
|
| 292 |
+
return await download_hf_state() # Retry after pause
|
| 293 |
print(f"[{FLOW_ID}] Failed to download state file: {str(e)}. Starting fresh.")
|
| 294 |
return default_state
|
| 295 |
|
| 296 |
async def upload_hf_state(state: Dict[str, Any]) -> bool:
|
| 297 |
"""Uploads the state file to Hugging Face."""
|
| 298 |
+
await check_rate_limit_pause()
|
| 299 |
local_path = LOCAL_STATE_FOLDER / HF_STATE_FILE
|
| 300 |
|
| 301 |
try:
|
|
|
|
| 313 |
print(f"[{FLOW_ID}] Successfully uploaded state file.")
|
| 314 |
return True
|
| 315 |
except Exception as e:
|
| 316 |
+
if "429" in str(e):
|
| 317 |
+
trigger_rate_limit_pause()
|
| 318 |
+
# We don't retry here to avoid infinite loops in state management,
|
| 319 |
+
# but the next call will wait.
|
| 320 |
+
print(f"[{FLOW_ID}] Failed to upload state file due to 429. Pause triggered.")
|
| 321 |
+
else:
|
| 322 |
+
print(f"[{FLOW_ID}] Failed to upload state file: {str(e)}")
|
| 323 |
return False
|
| 324 |
|
| 325 |
async def lock_file_for_processing(zip_filename: str, state: Dict[str, Any]) -> bool:
|
|
|
|
| 367 |
print(f"[{FLOW_ID}] Using cached file list with {len(progress_data['file_list'])} files.")
|
| 368 |
return progress_data['file_list']
|
| 369 |
|
| 370 |
+
await check_rate_limit_pause()
|
| 371 |
print(f"[{FLOW_ID}] Fetching full list of zip files from {HF_DATASET_ID}...")
|
| 372 |
try:
|
| 373 |
api = HfApi(token=HF_TOKEN)
|
|
|
|
| 394 |
return zip_files
|
| 395 |
|
| 396 |
except Exception as e:
|
| 397 |
+
if "429" in str(e):
|
| 398 |
+
trigger_rate_limit_pause()
|
| 399 |
+
return await get_zip_file_list(progress_data)
|
| 400 |
print(f"[{FLOW_ID}] Error fetching file list from Hugging Face: {e}")
|
| 401 |
return []
|
| 402 |
|
| 403 |
async def download_and_extract_zip_by_index(file_index: int, repo_file_full_path: str) -> Optional[Path]:
|
| 404 |
"""Downloads the zip file for the given index and extracts its contents."""
|
| 405 |
+
await check_rate_limit_pause()
|
| 406 |
|
| 407 |
# Extract the base name for the extraction directory
|
| 408 |
zip_full_name = Path(repo_file_full_path).name
|
|
|
|
| 439 |
return extract_dir
|
| 440 |
|
| 441 |
except Exception as e:
|
| 442 |
+
if "429" in str(e):
|
| 443 |
+
trigger_rate_limit_pause()
|
| 444 |
+
return await download_and_extract_zip_by_index(file_index, repo_file_full_path)
|
| 445 |
print(f"[{FLOW_ID}] Error downloading or extracting zip for {repo_file_full_path}: {e}")
|
| 446 |
return None
|
| 447 |
|
| 448 |
async def upload_captions_to_hf(zip_full_name: str, captions: List[Dict]) -> bool:
|
| 449 |
"""Uploads the final captions JSON file to the output dataset."""
|
| 450 |
+
await check_rate_limit_pause()
|
| 451 |
# Use the full zip name, replacing the extension with .json
|
| 452 |
caption_filename = Path(zip_full_name).with_suffix('.json').name
|
| 453 |
|
|
|
|
| 470 |
return True
|
| 471 |
|
| 472 |
except Exception as e:
|
| 473 |
+
if "429" in str(e):
|
| 474 |
+
trigger_rate_limit_pause()
|
| 475 |
+
# Retry once after pause
|
| 476 |
+
await check_rate_limit_pause()
|
| 477 |
+
return await upload_captions_to_hf(zip_full_name, captions)
|
| 478 |
print(f"[{FLOW_ID}] Error uploading captions for {zip_full_name}: {e}")
|
| 479 |
return False
|
| 480 |
|
|
|
|
| 501 |
|
| 502 |
async def send_image_for_captioning(image_path: Path, course_name: str, progress_tracker: Dict) -> Optional[Dict]:
|
| 503 |
"""Sends a single image to a caption server for processing."""
|
|
|
|
|
|
|
| 504 |
# This function now handles server selection and retries internally
|
| 505 |
MAX_RETRIES = 3
|
| 506 |
for attempt in range(MAX_RETRIES):
|
| 507 |
+
await check_rate_limit_pause()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 508 |
|
| 509 |
server = None
|
| 510 |
try:
|
|
|
|
| 551 |
else:
|
| 552 |
print(f"[{FLOW_ID}] Warning: Server returned 200 but no caption for {image_path.name}.")
|
| 553 |
elif resp.status == 429:
|
| 554 |
+
print(f"[{FLOW_ID}] 🛑 HIT RATE LIMIT (429) on {server.url}.")
|
| 555 |
+
trigger_rate_limit_pause()
|
| 556 |
+
# Loop will retry after check_rate_limit_pause()
|
| 557 |
else:
|
| 558 |
print(f"[{FLOW_ID}] Error: Server {server.url} returned status {resp.status} for {image_path.name}.")
|
| 559 |
|
| 560 |
except Exception as e:
|
| 561 |
+
if "429" in str(e):
|
| 562 |
+
trigger_rate_limit_pause()
|
| 563 |
+
else:
|
| 564 |
+
print(f"[{FLOW_ID}] Exception during captioning for {image_path.name} on {server.url if server else 'unknown server'}: {e}")
|
| 565 |
finally:
|
| 566 |
if server:
|
| 567 |
server.busy = False
|
|
|
|
| 577 |
"""
|
| 578 |
Main background task that processes zip files sequentially.
|
| 579 |
"""
|
|
|
|
| 580 |
print(f"[{FLOW_ID}] Starting dataset processing task from index {start_index}...")
|
| 581 |
|
| 582 |
# 1. Load progress and get file list
|
|
|
|
| 593 |
# --- NEW: Check helium_tg for existing JSON files to avoid redundant processing ---
|
| 594 |
print(f"[{FLOW_ID}] Checking {HF_OUTPUT_DATASET_ID} for existing JSON outputs...")
|
| 595 |
try:
|
| 596 |
+
await check_rate_limit_pause()
|
| 597 |
api = HfApi(token=HF_TOKEN)
|
| 598 |
existing_files = api.list_repo_files(repo_id=HF_OUTPUT_DATASET_ID, repo_type="dataset")
|
| 599 |
existing_json_files = {f for f in existing_files if f.endswith('.json')}
|
| 600 |
print(f"[{FLOW_ID}] Found {len(existing_json_files)} existing JSON files in {HF_OUTPUT_DATASET_ID}.")
|
| 601 |
except Exception as e:
|
| 602 |
+
if "429" in str(e):
|
| 603 |
+
trigger_rate_limit_pause()
|
| 604 |
+
await check_rate_limit_pause()
|
| 605 |
+
# We'll just continue with an empty set if it fails again,
|
| 606 |
+
# the per-file check will catch it later.
|
| 607 |
+
existing_json_files = set()
|
| 608 |
+
else:
|
| 609 |
+
print(f"[{FLOW_ID}] Warning: Could not fetch existing files from {HF_OUTPUT_DATASET_ID}: {e}")
|
| 610 |
+
existing_json_files = set()
|
| 611 |
# ----------------------------------------------------------------------------------
|
| 612 |
|
| 613 |
global_success = True
|
|
|
|
| 617 |
start_idx = max(0, start_index - 1)
|
| 618 |
|
| 619 |
for i in range(start_idx, len(file_list)):
|
| 620 |
+
await check_rate_limit_pause()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
|
| 622 |
file_index = i + 1
|
| 623 |
repo_file_full_path = file_list[i]
|