Update app.py
app.py
CHANGED
@@ -30,13 +30,8 @@ LOCAL_STATE_FOLDER = Path(".state")
 LOCAL_STATE_FOLDER.mkdir(exist_ok=True)
 
 # Processing configuration
-MAX_UPLOADS_BEFORE_PAUSE = 120 #
+MAX_UPLOADS_BEFORE_PAUSE = 120  # Pause uploading after 120 files
 UPLOAD_PAUSE_ENABLED = True
-UPLOAD_WINDOW_SECONDS = 3600  # 1 hour in seconds
-
-# Upload rate limiting (per-hour tracking)
-upload_timestamps: List[float] = []  # Timestamps of recent uploads (for rate limiting)
-upload_lock = asyncio.Lock()  # Lock for upload_timestamps access
 
 # Directory within the HF dataset where the audio files are located
 AUDIO_FILE_PREFIX = "audio/"
@@ -310,33 +305,6 @@ async def upload_transcription_to_hf(wav_filename: str, transcription_data: Dict
     # Use the WAV filename, replacing the extension with .json
     json_filename = Path(wav_filename).with_suffix('.json').name
 
-    # --- Rate limiting: check uploads per hour
-    global upload_timestamps
-    if UPLOAD_PAUSE_ENABLED:
-        async with upload_lock:
-            now = time.time()
-            # Remove uploads older than 1 hour
-            upload_timestamps = [ts for ts in upload_timestamps if now - ts < UPLOAD_WINDOW_SECONDS]
-
-            # Check if we've hit the limit
-            if len(upload_timestamps) >= MAX_UPLOADS_BEFORE_PAUSE:
-                oldest_upload = upload_timestamps[0]
-                time_until_available = UPLOAD_WINDOW_SECONDS - (now - oldest_upload)
-                print(f"[{FLOW_ID}] ⏸️ Upload limit reached ({len(upload_timestamps)}/{MAX_UPLOADS_BEFORE_PAUSE} in last hour). Pausing for {time_until_available:.0f}s...")
-
-        # Release lock and wait
-
-        # Wait until the oldest upload falls outside the window
-        while True:
-            async with upload_lock:
-                now = time.time()
-                upload_timestamps = [ts for ts in upload_timestamps if now - ts < UPLOAD_WINDOW_SECONDS]
-                if len(upload_timestamps) < MAX_UPLOADS_BEFORE_PAUSE:
-                    print(f"[{FLOW_ID}] ✅ Upload limit lifted. Resuming uploads...")
-                    break
-
-            await asyncio.sleep(5)  # Check every 5 seconds
-
     try:
         print(f"[{FLOW_ID}] Uploading transcription for {wav_filename} as {json_filename} to {HF_OUTPUT_DATASET_ID}...")
 
@@ -352,11 +320,6 @@ async def upload_transcription_to_hf(wav_filename: str, transcription_data: Dict
             commit_message=f"[{FLOW_ID}] Transcription for {wav_filename}"
         )
 
-        # Record this upload timestamp
-        if UPLOAD_PAUSE_ENABLED:
-            async with upload_lock:
-                upload_timestamps.append(time.time())
-
         print(f"[{FLOW_ID}] Successfully uploaded transcription for {wav_filename}.")
         return True
 
@@ -577,9 +540,13 @@ async def process_dataset_task(start_index: int):
     global_success = True
     current_batch_index = start_list_index
     batch_size = len(servers)  # Batch size = number of servers (20 files per batch)
+    batch_interval_seconds = 600  # 600 seconds = 10 minutes (enforces max 6 batches per hour)
 
     try:
+        batch_count = 0
         while current_batch_index < len(file_list):
+            batch_start_time = time.time()
+
             # Process a batch dynamically
             next_index, uploaded_count = await process_batch_dynamic(
                 file_list,
@@ -589,6 +556,9 @@ async def process_dataset_task(start_index: int):
                 progress
             )
 
+            batch_end_time = time.time()
+            batch_elapsed = batch_end_time - batch_start_time
+
             # Update progress
             progress['last_processed_index'] = next_index
             progress['uploaded_count'] = uploaded_count
@@ -596,6 +566,7 @@ async def process_dataset_task(start_index: int):
 
             # Update current batch index
             current_batch_index = next_index
+            batch_count += 1
 
             # Log statistics
             print(f"[{FLOW_ID}] Batch complete. Progress: {current_batch_index}/{len(file_list)}, Uploaded: {uploaded_count}")
@@ -604,8 +575,17 @@ async def process_dataset_task(start_index: int):
             print(f"[{FLOW_ID}] Server Statistics:")
             for i, server in enumerate(servers):
                 print(f"  Server {i+1}: {server.total_processed} files, {server.total_time:.2f}s total, {server.fps:.2f} files/sec")
+
+            # Rate limiting: enforce minimum 10 minutes between batch starts (max 6 batches per hour)
+            if current_batch_index < len(file_list):  # Don't wait after the last batch
+                wait_time = batch_interval_seconds - batch_elapsed
+                if wait_time > 0:
+                    print(f"[{FLOW_ID}] Rate limit: batch took {batch_elapsed:.1f}s. Waiting {wait_time:.1f}s before next batch (min 10 min interval)...")
+                    await asyncio.sleep(wait_time)
+                else:
+                    print(f"[{FLOW_ID}] Batch took {batch_elapsed:.1f}s (exceeded 10 min interval). Proceeding immediately to next batch.")
 
-        print(f"[{FLOW_ID}] All files processed successfully!")
+        print(f"[{FLOW_ID}] All files processed successfully! Total batches: {batch_count}")
         return True
 
     except Exception as e:
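The process_dataset_task hunks above replace the per-upload sliding window with per-batch pacing: the loop records when each batch starts and, if the batch finished in under batch_interval_seconds (600 s), sleeps for the remainder before starting the next one. At one batch of len(servers) files (about 20, per the comment) every 10 minutes, that caps uploads at roughly 6 x 20 = 120 per hour, consistent with MAX_UPLOADS_BEFORE_PAUSE. Below is a minimal standalone sketch of that pacing pattern; do_batch, run_batches, and the short demo interval are illustrative stand-ins, not code from this Space.

import asyncio
import time


async def do_batch(index: int) -> None:
    # Placeholder for real batch work (e.g. transcribing and uploading ~20 files).
    await asyncio.sleep(1.0)


async def run_batches(total_batches: int, interval_seconds: float = 600.0) -> None:
    for i in range(total_batches):
        batch_start = time.time()
        await do_batch(i)
        elapsed = time.time() - batch_start

        # Don't wait after the last batch; otherwise pad out to the minimum interval.
        if i < total_batches - 1:
            wait_time = interval_seconds - elapsed
            if wait_time > 0:
                print(f"Batch {i} took {elapsed:.1f}s; sleeping {wait_time:.1f}s before the next one.")
                await asyncio.sleep(wait_time)
            else:
                print(f"Batch {i} took {elapsed:.1f}s; already past the interval, continuing immediately.")


if __name__ == "__main__":
    # A short interval keeps the demo fast; the Space uses 600 seconds.
    asyncio.run(run_batches(3, interval_seconds=2.0))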
@@ -657,24 +637,17 @@ async def root():
     total_time = sum(s.total_time for s in servers)
     avg_fps = total_processed / total_time if total_time > 0 else 0
 
-    # Get current uploads in the last hour
-    now = time.time()
-    uploads_in_last_hour = sum(1 for ts in upload_timestamps if now - ts < UPLOAD_WINDOW_SECONDS)
-    is_paused = uploads_in_last_hour >= MAX_UPLOADS_BEFORE_PAUSE
-
     return {
         "flow_id": FLOW_ID,
         "status": "ready",
         "last_processed_index": progress.get('last_processed_index', 0),
         "total_files_in_list": len(progress['file_list']),
         "uploaded_count": progress.get('uploaded_count', 0),
-        "uploads_in_last_hour": uploads_in_last_hour,
-        "upload_limit": MAX_UPLOADS_BEFORE_PAUSE,
-        "upload_paused": is_paused,
         "total_servers": len(servers),
         "processing_servers": sum(1 for s in servers if s.is_processing),
         "total_files_processed_by_servers": total_processed,
-        "avg_files_per_second": avg_fps
+        "avg_files_per_second": avg_fps,
+        "upload_limit_paused": progress.get('uploaded_count', 0) >= MAX_UPLOADS_BEFORE_PAUSE
     }
 
 @app.post("/start_processing")
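In the root() status hunk, the rolling uploads_in_last_hour, upload_limit, and upload_paused fields are dropped, and a single upload_limit_paused flag is derived from the cumulative uploaded_count compared against MAX_UPLOADS_BEFORE_PAUSE. A small illustrative client-side poll of the new payload follows; the base URL is a placeholder, and the keys are the ones the updated handler returns.

import json
from urllib.request import urlopen

# Base URL is a placeholder for wherever the app is reachable.
with urlopen("http://localhost:7860/") as resp:
    status = json.load(resp)

print(f"uploaded {status['uploaded_count']} of {status['total_files_in_list']} files")
print(f"average throughput: {status['avg_files_per_second']:.2f} files/sec")
if status["upload_limit_paused"]:
    print("uploaded_count has reached MAX_UPLOADS_BEFORE_PAUSE")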