Spaces:

Samfredoly
/

switches

Paused

App Files Files Community

Samfredoly committed on Nov 23, 2025

Commit

963cadd

verified ·

1 Parent(s): 1b81a5c

Update app.py

Browse files

Files changed (1) hide show

app.py +167 -332

app.py CHANGED Viewed

@@ -3,12 +3,10 @@ import json
 import time
 import asyncio
 import aiohttp
-# import zipfile # Removed as zipping is no longer required
 import shutil
-from collections import deque
-from collections import deque
 import threading
-from typing import Dict, List, Set, Optional, Tuple, Any, Deque
 from urllib.parse import quote
 from datetime import datetime
 from pathlib import Path
@@ -63,8 +61,7 @@ WHISPER_SERVERS = [
 ]
 MODEL_TYPE = "whisper-small"
-# ZIP_UPLOAD_THRESHOLD = 100  # Upload and zip after this many transcriptions - REMOVED per user request
-MAX_UPLOADS_PER_HOUR = 120 # User requested rate limit
 # Temporary storage for audio files
 TEMP_DIR = Path(f"temp_audio_{FLOW_ID}")
@@ -119,38 +116,6 @@ def save_progress(progress_data: Dict):
     except Exception as e:
         print(f"[{FLOW_ID}] CRITICAL ERROR: Could not save progress to {PROGRESS_FILE}: {e}")
-# Rate Limiting State
-upload_timestamps: Deque[float] = deque()
-upload_lock = asyncio.Lock()
-def check_rate_limit() -> bool:
-    """Checks if the upload rate limit has been reached."""
-    global upload_timestamps
-    # Remove timestamps older than 1 hour (3600 seconds)
-    one_hour_ago = time.time() - 3600
-    while upload_timestamps and upload_timestamps[0] < one_hour_ago:
-        upload_timestamps.popleft()
-    return len(upload_timestamps) < MAX_UPLOADS_PER_HOUR
-async def record_upload():
-    """Records a successful upload and enforces the rate limit wait."""
-    global upload_timestamps
-    async with upload_lock:
-        upload_timestamps.append(time.time())
-        # Wait for the next available slot if the limit was just hit
-        if len(upload_timestamps) > MAX_UPLOADS_PER_HOUR:
-            time_to_wait = upload_timestamps[0] + 3600 - time.time()
-            if time_to_wait > 0:
-                print(f"[{FLOW_ID}] Rate limit hit. Waiting for {time_to_wait:.2f} seconds.")
-                await asyncio.sleep(time_to_wait)
-                # After waiting, the oldest timestamp should be outside the window
-                one_hour_ago = time.time() - 3600
-                while upload_timestamps and upload_timestamps[0] < one_hour_ago:
-                    upload_timestamps.popleft()
-        print(f"[{FLOW_ID}] Upload recorded. Current count in last hour: {len(upload_timestamps)}/{MAX_UPLOADS_PER_HOUR}")
 def load_json_state(file_path: str, default_value: Dict[str, Any]) -> Dict[str, Any]:
     """Load state from JSON file with migration logic for new structure."""
     if os.path.exists(file_path):
@@ -369,42 +334,43 @@ async def download_audio_file(file_index: int, repo_file_full_path: str) -> Opti
         print(f"[{FLOW_ID}] Error downloading audio file {repo_file_full_path}: {e}")
         return None
-async def upload_single_transcription(file_path: Path) -> bool:
-    """Uploads a single transcription JSON file to the dataset."""
-    if not file_path.exists():
-        print(f"[{FLOW_ID}] File not found for upload: {file_path.name}")
         return False
-    # 1. Check Rate Limit
-    if not check_rate_limit():
-        print(f"[{FLOW_ID}] 🛑 Rate limit of {MAX_UPLOADS_PER_HOUR} uploads/hour reached. Waiting for next hour slot.")
-        # The main loop will handle the wait by checking the limit again
-        return False
     try:
-        print(f"[{FLOW_ID}] 📤 Uploading transcription file: {file_path.name} to {HF_OUTPUT_DATASET_ID}...")
         api = HfApi(token=HF_TOKEN)
         api.upload_file(
-            path_or_fileobj=str(file_path),
-            path_in_repo=file_path.name,
             repo_id=HF_OUTPUT_DATASET_ID,
             repo_type="dataset",
-            commit_message=f"[{FLOW_ID}] Upload transcription: {file_path.name}"
         )
-        print(f"[{FLOW_ID}] ✅ Successfully uploaded: {file_path.name}")
-        # 2. Record successful upload
-        await record_upload()
-        # 3. Cleanup local file
-        os.remove(file_path)
         return True
     except Exception as e:
-        print(f"[{FLOW_ID}] Error uploading transcription: {e}")
         return False
 # --- Core Processing Functions ---
@@ -538,9 +504,7 @@ async def process_audio_files(background_tasks: BackgroundTasks):
     }
 async def process_audio_files_background():
-    """Background task that processes audio files with reference mapping using parallel batch distribution and atomic state updates."""
-    # 1. Load initial state and data
     progress_data = load_progress()
     reference_map = progress_data.get('reference_map', {})
@@ -555,104 +519,49 @@ async def process_audio_files_background():
     if not audio_files:
         print(f"[{FLOW_ID}] No audio files found. Exiting.")
         return
     # Dynamic batch size: one file per server
     BATCH_SIZE = len(servers)
     print(f"[{FLOW_ID}] 📊 Configuration: {len(servers)} Whisper server(s) → Batch size: {BATCH_SIZE} (1 file per server)")
-    # --- Core Processing Loop ---
-    while True:
-        # Download the latest state from Hugging Face at the start of each batch attempt
-        hf_state = await download_hf_state()
-        current_index = hf_state['next_download_index']
-        if current_index >= len(audio_files):
-            print(f"[{FLOW_ID}] All {len(audio_files)} files processed. Waiting for new files...")
-            await asyncio.sleep(300) # Wait 5 minutes before checking again
-            # Re-fetch file list in case new files were added
-            audio_files = await get_audio_file_list(progress_data)
-            continue
-        batch_end = min(current_index + BATCH_SIZE, len(audio_files))
-        batch_files_full_path = audio_files[current_index:batch_end]
-        if not batch_files_full_path:
-            await asyncio.sleep(10)
-            continue
-        print(f"\n[{FLOW_ID}] 📦 BATCH: Attempting to process files #{current_index}-#{batch_end-1} ({len(batch_files_full_path)} files)")
-        # 1. ATOMIC LOCK: Attempt to lock all files in the batch
-        # We will lock the files sequentially, and if any fail (already locked by another server),
-        # we will unlock all files locked in this attempt and restart the loop.
-        locked_files = []
-        lock_succeeded = True
-        # Re-download state to ensure we have the latest before locking
-        hf_state = await download_hf_state()
-        for audio_file_full_path in batch_files_full_path:
-            audio_filename = Path(audio_file_full_path).name
-            # Check if already processed or processing
-            file_state = hf_state['file_states'].get(audio_filename)
-            if file_state == "processed":
-                print(f"[{FLOW_ID}] ⏭️  File already processed: {audio_filename}. Aborting batch and moving index.")
-                # Abort batch, unlock any files we locked, and move index past this file
-                lock_succeeded = False
-                break
-            elif file_state == "processing":
-                print(f"[{FLOW_ID}] ⏭️  File currently processing: {audio_filename}. Aborting batch and moving index.")
-                # Abort batch, unlock any files we locked, and move index past this file
-                lock_succeeded = False
-                break
-            # Attempt to lock
-            if await lock_file_for_processing(audio_filename, hf_state):
-                locked_files.append(audio_filename)
-            else:
-                print(f"[{FLOW_ID}] ❌ Failed to lock file: {audio_filename}. Aborting batch.")
-                lock_succeeded = False
-                break
-        # If lock failed for any reason (already processed/processing or lock upload failed)
-        if not lock_succeeded:
-            # Unlock all files we successfully locked in this attempt
-            for filename in locked_files:
-                # We don't need to upload state for each unlock, we'll do it once at the end
-                if filename in hf_state['file_states']:
-                    del hf_state['file_states'][filename]
-            # If the failure was due to a file being processed/processed, we need to advance the index
-            if current_index < len(audio_files) and hf_state['file_states'].get(Path(audio_files[current_index]).name) in ["processed", "processing"]:
-                hf_state['next_download_index'] += 1
-                await upload_hf_state(hf_state)
-            # Wait and restart the loop
-            print(f"[{FLOW_ID}] Batch aborted. Waiting 10s before retrying...")
-            await asyncio.sleep(10)
-            continue
-        # 2. Download all files in batch in parallel
-        print(f"[{FLOW_ID}] ⬇️  Downloading batch ({len(batch_files_full_path)} files)...")
-        download_tasks = [download_audio_file(current_index + idx, path) for idx, path in enumerate(batch_files_full_path)]
         downloaded_paths = await asyncio.gather(*download_tasks, return_exceptions=True)
-        # 3. Send all downloaded files to Whisper servers in parallel
-        print(f"[{FLOW_ID}] 🎤 Distributing to {len(servers)} Whisper server(s) ({len(batch_files_full_path)} files)...")
         transcription_tasks = []
-        file_metadata = []
-        for idx, (repo_file_path, audio_path) in enumerate(zip(batch_files_full_path, downloaded_paths)):
             audio_filename = Path(repo_file_path).name
-            if isinstance(audio_path, Exception) or not audio_path or not audio_path.exists():
-                print(f"[{FLOW_ID}] ❌ Skipping {audio_filename} (download failed or path invalid)")
-                # Mark as processed immediately so we don't get stuck on a bad file
-                hf_state['file_states'][audio_filename] = "processed"
                 continue
             reference_filename = find_matching_filename(audio_filename, reference_map)
@@ -660,211 +569,137 @@ async def process_audio_files_background():
                 'audio_filename': audio_filename,
                 'audio_path': audio_path,
                 'reference_filename': reference_filename,
-                'file_index': current_index + idx
             })
-            # Create transcription task
-            # We need a wrapper function for parallel execution that uses the single-file logic
-            # The original code had a send_audio_for_transcription_task, but we removed it.
-            # We will use a lambda or a simple wrapper to call the existing send_audio_for_transcription
-            # The original send_audio_for_transcription is already async and handles server selection.
-            transcription_tasks.append(send_audio_for_transcription(audio_path, {'completed': 0, 'total': 1}))
         if transcription_tasks:
             print(f"[{FLOW_ID}] ⏳ Waiting for {len(transcription_tasks)} transcriptions (parallel)...")
             transcription_results = await asyncio.gather(*transcription_tasks, return_exceptions=True)
-            # 4. Process results, save locally, and upload individually
-            successful_uploads = 0
-            next_index_to_set = current_index
             for metadata, result in zip(file_metadata, transcription_results):
-                audio_filename = metadata['audio_filename']
-                audio_path = metadata['audio_path']
-                # Cleanup downloaded audio file
-                if audio_path.exists():
-                    os.remove(audio_path)
-                if isinstance(result, Exception) or not result:
-                    print(f"[{FLOW_ID}] ❌ Transcription failed for {audio_filename}. Marking as processed (failed).")
-                    hf_state['file_states'][audio_filename] = "processed"
-                    next_index_to_set = metadata['file_index'] + 1
                     continue
-                # Save JSON locally
-                json_filename = Path(metadata['reference_filename']).stem if metadata['reference_filename'] else Path(audio_filename).stem
-                json_path = RESULTS_DIR / f"{json_filename}.json"
-                # Add reference file mapping to the result
-                if metadata['reference_filename']:
-                    result['reference_file'] = metadata['reference_filename']
-                save_json_state(str(json_path), result)
-                # Upload the single JSON file
-                if await upload_single_transcription(json_path):
-                    successful_uploads += 1
-                    hf_state['file_states'][audio_filename] = "processed"
-                    next_index_to_set = metadata['file_index'] + 1
-                else:
-                    # Upload failed (likely rate limit). Keep the file locked and local for next attempt.
-                    # We must break the inner loop and restart the outer loop to re-check the rate limit.
-                    print(f"[{FLOW_ID}] 🛑 Upload failed (likely rate limit). Aborting batch processing.")
-                    # Keep the file locked (status is still 'processing') and local (json_path not removed)
-                    # We will break and restart the main loop.
-                    break
-            # 5. ATOMIC UNLOCK/STATE UPDATE: Update state on HF
-            # If the inner loop was broken due to rate limit, we skip the state update and restart the main loop.
-            if successful_uploads == len(transcription_tasks):
-                # All files in the batch were successfully processed and uploaded.
-                hf_state['next_download_index'] = next_index_to_set
-                await upload_hf_state(hf_state)
-                print(f"[{FLOW_ID}] ✅ Batch completed. Next index set to {next_index_to_set}.")
-            elif successful_uploads > 0:
-                # Some files failed transcription or download, but were marked as 'processed' (failed).
-                # The index was advanced past them.
-                hf_state['next_download_index'] = next_index_to_set
-                await upload_hf_state(hf_state)
-                print(f"[{FLOW_ID}] ✅ Batch partially completed. Next index set to {next_index_to_set}.")
-            elif successful_uploads == 0 and len(transcription_tasks) > 0:
-                # If the break was due to rate limit, the outer loop will handle the wait/retry.
-                # If all transcriptions failed, the index was advanced past them.
-                if next_index_to_set > current_index:
-                    hf_state['next_download_index'] = next_index_to_set
-                    await upload_hf_state(hf_state)
-                    print(f"[{FLOW_ID}] ⚠️ Batch failed. Next index set to {next_index_to_set}.")
-        # Wait a short period to avoid hammering the HF API
-        await asyncio.sleep(1)
-                await upload_hf_state(hf_state)
-                print(f"[{FLOW_ID}] ✅ Batch partially completed. Next index set to {next_index_to_set}.")
-            elif successful_uploads == 0 and len(transcription_tasks) > 0:
-                # If the break was due to rate limit, the outer loop will handle the wait/retry.
-                # If all transcriptions failed, the index was advanced past them.
-                if next_index_to_set > current_index:
-                    hf_state['next_download_index'] = next_index_to_set
-                    await upload_hf_state(hf_state)
-                    print(f"[{FLOW_ID}] ⚠️ Batch failed. Next index set to {next_index_to_set}.")
-        # Wait a short period to avoid hammering the HF API
-        await asyncio.sleep(1)
-            continue
-        # 2. Download all files in batch in parallel
-        print(f"[{FLOW_ID}] ⬇️  Downloading batch ({len(batch_files_full_path)} files)...")
-        download_tasks = [download_audio_file(current_index + idx, path) for idx, path in enumerate(batch_files_full_path)]
-        downloaded_paths = await asyncio.gather(*download_tasks, return_exceptions=True)
-        # 3. Send all downloaded files to Whisper servers in parallel
-        print(f"[{FLOW_ID}] 🎤 Distributing to {len(servers)} Whisper server(s) ({len(batch_files_full_path)} files)...")
-        transcription_tasks = []
-        file_metadata = []
-        for idx, (repo_file_path, audio_path) in enumerate(zip(batch_files_full_path, downloaded_paths)):
-            audio_filename = Path(repo_file_path).name
-            if isinstance(audio_path, Exception) or not audio_path or not audio_path.exists():
-                print(f"[{FLOW_ID}] ❌ Skipping {audio_filename} (download failed or path invalid)")
-                # Mark as processed immediately so we don't get stuck on a bad file
-                hf_state['file_states'][audio_filename] = "processed"
-                continue
-            reference_filename = find_matching_filename(audio_filename, reference_map)
-            file_metadata.append({
-                'audio_filename': audio_filename,
-                'audio_path': audio_path,
-                'reference_filename': reference_filename,
-                'file_index': current_index + idx
-            })
-            # Create transcription task
-            transcription_tasks.append(send_audio_for_transcription(audio_path, {'completed': 0, 'total': 1}))
-        if transcription_tasks:
-            print(f"[{FLOW_ID}] ⏳ Waiting for {len(transcription_tasks)} transcriptions (parallel)...")
-            transcription_results = await asyncio.gather(*transcription_tasks, return_exceptions=True)
-            # 4. Process results, save locally, and upload individually
-            successful_uploads = 0
-            next_index_to_set = current_index
-            for metadata, result in zip(file_metadata, transcription_results):
-                audio_filename = metadata['audio_filename']
-                audio_path = metadata['audio_path']
-                # Cleanup downloaded audio file
-                if audio_path.exists():
-                    os.remove(audio_path)
-                if isinstance(result, Exception) or not result:
-                    print(f"[{FLOW_ID}] ❌ Transcription failed for {audio_filename}. Marking as processed (failed).")
-                    hf_state['file_states'][audio_filename] = "processed"
-                    next_index_to_set = metadata['file_index'] + 1
-                    continue
-                # Save JSON locally
-                json_filename = Path(metadata['reference_filename']).stem if metadata['reference_filename'] else Path(audio_filename).stem
-                json_path = RESULTS_DIR / f"{json_filename}.json"
-                # Add reference file mapping to the result
-                if metadata['reference_filename']:
-                    result['reference_file'] = metadata['reference_filename']
-                save_json_state(str(json_path), result)
-                # Upload the single JSON file
-                if await upload_single_transcription(json_path):
-                    successful_uploads += 1
-                    hf_state['file_states'][audio_filename] = "processed"
-                    next_index_to_set = metadata['file_index'] + 1
-                else:
-                    # Upload failed (likely rate limit). Keep the file locked and local for next attempt.
-                    # We must break the inner loop and restart the outer loop to re-check the rate limit.
-                    print(f"[{FLOW_ID}] 🛑 Upload failed (likely rate limit). Aborting batch processing.")
-                    # Keep the file locked (status is still 'processing') and local (json_path not removed)
-                    # We will break and restart the main loop.
-                    break
-            # 5. ATOMIC UNLOCK/STATE UPDATE: Update state on HF
-            # If the inner loop was broken due to rate limit, we skip the state update and restart the main loop.
-            if successful_uploads == len(transcription_tasks):
-                # All files in the batch were successfully processed and uploaded.
-                hf_state['next_download_index'] = next_index_to_set
-                await upload_hf_state(hf_state)
-                print(f"[{FLOW_ID}] ✅ Batch completed. Next index set to {next_index_to_set}.")
-            elif successful_uploads > 0:
-                # Some files failed transcription or download, but were marked as 'processed' (failed).
-                # The index was advanced past them.
-                hf_state['next_download_index'] = next_index_to_set
-                await upload_hf_state(hf_state)
-                print(f"[{FLOW_ID}] ✅ Batch partially completed. Next index set to {next_index_to_set}.")
-            elif successful_uploads == 0 and len(transcription_tasks) > 0:
-                # If the break was due to rate limit, the outer loop will handle the wait/retry.
-                # If all transcriptions failed, the index was advanced past them.
-                if next_index_to_set > current_index:
-                    hf_state['next_download_index'] = next_index_to_set
-                    await upload_hf_state(hf_state)
-                    print(f"[{FLOW_ID}] ⚠️ Batch failed. Next index set to {next_index_to_set}.")
-        # Wait a short period to avoid hammering the HF API
-        await asyncio.sleep(1)
-                await upload_hf_state(hf_state)
-                print(f"[{FLOW_ID}] ✅ Batch partially completed. Next index set to {next_index_to_set}.")
-            elif successful_uploads == 0 and len(transcription_tasks) > 0:
-                # If the break was due to rate limit, the outer loop will handle the wait/retry.
-                # If all transcriptions failed, the index was advanced past them.
-                if next_index_to_set > current_index:
-                    hf_state['next_download_index'] = next_index_to_set
-                    await upload_hf_state(hf_state)
-                    print(f"[{FLOW_ID}] ⚠️ Batch failed. Next index set to {next_index_to_set}.")
-        # Wait a short period to avoid hammering the HF API
-        await asyncio.sleep(1)
 @app.get("/")
 async def root():

 import time
 import asyncio
 import aiohttp
+import zipfile
 import shutil
 import threading
+from typing import Dict, List, Set, Optional, Tuple, Any
 from urllib.parse import quote
 from datetime import datetime
 from pathlib import Path
 ]
 MODEL_TYPE = "whisper-small"
+ZIP_UPLOAD_THRESHOLD = 100  # Upload and zip after this many transcriptions
 # Temporary storage for audio files
 TEMP_DIR = Path(f"temp_audio_{FLOW_ID}")
     except Exception as e:
         print(f"[{FLOW_ID}] CRITICAL ERROR: Could not save progress to {PROGRESS_FILE}: {e}")
 def load_json_state(file_path: str, default_value: Dict[str, Any]) -> Dict[str, Any]:
     """Load state from JSON file with migration logic for new structure."""
     if os.path.exists(file_path):
         print(f"[{FLOW_ID}] Error downloading audio file {repo_file_full_path}: {e}")
         return None
+async def zip_and_upload_transcriptions(transcription_files: List[Path], batch_number: int) -> bool:
+    """Zips transcription JSON files and uploads to dataset with batch numbering."""
+    if not transcription_files:
+        print(f"[{FLOW_ID}] No transcription files to zip.")
         return False
     try:
+        zip_filename = f"audio_json_batch_{batch_number}.zip"
+        zip_path = RESULTS_DIR / zip_filename
+        print(f"[{FLOW_ID}] 📦 Creating zip file: {zip_filename} with {len(transcription_files)} files...")
+        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+            for file_path in transcription_files:
+                if file_path.exists():
+                    zipf.write(file_path, arcname=file_path.name)
+        print(f"[{FLOW_ID}] 📤 Uploading zip file to {HF_OUTPUT_DATASET_ID}...")
         api = HfApi(token=HF_TOKEN)
         api.upload_file(
+            path_or_fileobj=str(zip_path),
+            path_in_repo=zip_filename,
             repo_id=HF_OUTPUT_DATASET_ID,
             repo_type="dataset",
+            commit_message=f"[{FLOW_ID}] Batch {batch_number}: {len(transcription_files)} transcriptions"
         )
+        print(f"[{FLOW_ID}] ✅ Successfully uploaded: {zip_filename}")
+        # Cleanup
+        os.remove(zip_path)
         return True
     except Exception as e:
+        print(f"[{FLOW_ID}] Error zipping and uploading transcriptions: {e}")
         return False
 # --- Core Processing Functions ---
     }
 async def process_audio_files_background():
+    """Background task that processes audio files with reference mapping using batch distribution."""
     progress_data = load_progress()
     reference_map = progress_data.get('reference_map', {})
     if not audio_files:
         print(f"[{FLOW_ID}] No audio files found. Exiting.")
         return
     # Dynamic batch size: one file per server
     BATCH_SIZE = len(servers)
     print(f"[{FLOW_ID}] 📊 Configuration: {len(servers)} Whisper server(s) → Batch size: {BATCH_SIZE} (1 file per server)")
+    start_index = progress_data['last_processed_index']
+    transcription_files = []
+    batch_number = 1
+    print(f"[{FLOW_ID}] Starting batch processing from file #{start_index} (out of {len(audio_files)})...")
+    # Process in batches
+    for batch_start in range(start_index, len(audio_files), BATCH_SIZE):
+        batch_end = min(batch_start + BATCH_SIZE, len(audio_files))
+        batch_files = audio_files[batch_start:batch_end]
+        print(f"\n[{FLOW_ID}] 📦 BATCH: Processing files #{batch_start}-#{batch_end-1} ({len(batch_files)} files)")
+        # Step 1: Download all files in batch in parallel
+        print(f"[{FLOW_ID}] ⬇️  Downloading batch ({len(batch_files)} files)...")
+        download_tasks = []
+        for idx, repo_file_path in enumerate(batch_files):
+            file_index = batch_start + idx
+            download_tasks.append(download_audio_file(file_index, repo_file_path))
         downloaded_paths = await asyncio.gather(*download_tasks, return_exceptions=True)
+        # Step 2: Send all downloaded files to Whisper servers in parallel
+        print(f"[{FLOW_ID}] 🎤 Distributing to {len(servers)} Whisper server(s) ({len(batch_files)} files)...")
         transcription_tasks = []
+        file_metadata = []  # Track file info for results
+        for idx, (repo_file_path, audio_path) in enumerate(zip(batch_files, downloaded_paths)):
+            file_index = batch_start + idx
             audio_filename = Path(repo_file_path).name
+            # Skip if download failed
+            if isinstance(audio_path, Exception):
+                print(f"[{FLOW_ID}] ⏭️  Skipping {audio_filename} (download failed)")
+                continue
+            if not audio_path or not audio_path.exists():
                 continue
             reference_filename = find_matching_filename(audio_filename, reference_map)
                 'audio_filename': audio_filename,
                 'audio_path': audio_path,
                 'reference_filename': reference_filename,
+                'file_index': file_index
             })
+            # Create transcription task (will be awaited in parallel)
+            transcription_tasks.append(send_audio_for_transcription_task(audio_path, audio_filename))
         if transcription_tasks:
             print(f"[{FLOW_ID}] ⏳ Waiting for {len(transcription_tasks)} transcriptions (parallel)...")
             transcription_results = await asyncio.gather(*transcription_tasks, return_exceptions=True)
+            # Step 3: Save transcriptions locally (don't upload individually)
+            successful = len([r for r in transcription_results if not isinstance(r, Exception) and r])
+            print(f"[{FLOW_ID}] 💾 Saving {successful}/{len(transcription_results)} transcriptions locally...")
             for metadata, result in zip(file_metadata, transcription_results):
+                if isinstance(result, Exception):
+                    print(f"[{FLOW_ID}] ❌ Transcription failed for {metadata['audio_filename']}: {result}")
                     continue
+                if result:
+                    # Save JSON locally
+                    json_filename = Path(metadata['reference_filename']).stem if metadata['reference_filename'] else Path(metadata['audio_filename']).stem
+                    json_file_path = Path(RESULTS_DIR) / f"{json_filename}.json"
+                    # Write JSON to file
+                    with open(json_file_path, 'w', encoding='utf-8') as f:
+                        json.dump(result, f, indent=2, ensure_ascii=False)
+                    transcription_files.append(json_file_path)
+                    progress_data['transcription_count'] += 1
+                    # Mark as processed
+                    state = await download_hf_state()
+                    await unlock_file_as_processed(
+                        metadata['audio_filename'],
+                        state,
+                        metadata['file_index'] + 1
+                    )
+            # Step 4: Cleanup downloaded audio files
+            for metadata in file_metadata:
+                if metadata['audio_path'].exists():
+                    os.remove(metadata['audio_path'])
+        # Save progress after batch
+        progress_data['last_processed_index'] = batch_end
+        save_progress(progress_data)
+        # Step 5: Check if we've reached the batch threshold for zipping (100 files)
+        if len(transcription_files) >= ZIP_UPLOAD_THRESHOLD:
+            print(f"\n[{FLOW_ID}] 📦 Reached ZIP threshold ({ZIP_UPLOAD_THRESHOLD}). Creating and uploading batch {batch_number}...")
+            files_to_zip = transcription_files[:ZIP_UPLOAD_THRESHOLD]
+            await zip_and_upload_transcriptions(files_to_zip, batch_number)
+            # Remove zipped files locally and update list
+            for file_path in files_to_zip:
+                if file_path.exists():
+                    os.remove(file_path)
+            transcription_files = transcription_files[ZIP_UPLOAD_THRESHOLD:]
+            batch_number += 1
+    # Upload remaining transcriptions as final batch
+    if transcription_files:
+        print(f"\n[{FLOW_ID}] 📦 Uploading final batch {batch_number} with {len(transcription_files)} transcriptions...")
+        await zip_and_upload_transcriptions(transcription_files, batch_number)
+        # Cleanup
+        for file_path in transcription_files:
+            if file_path.exists():
+                os.remove(file_path)
+    print(f"\n[{FLOW_ID}] ✅ ALL DONE! Total transcriptions: {progress_data['transcription_count']}")
+async def send_audio_for_transcription_task(audio_path: Path, audio_filename: str) -> Optional[Dict]:
+    """Wrapper for transcription that can be used in asyncio.gather."""
+    MAX_RETRIES = 3
+    for attempt in range(MAX_RETRIES):
+        server = None
+        try:
+            server = await get_available_server()
+            server.busy = True
+            start_time = time.time()
+            # Read file content once
+            with audio_path.open('rb') as f:
+                file_content = f.read()
+            form_data = aiohttp.FormData()
+            form_data.add_field('file',
+                                io.BytesIO(file_content),
+                                filename=audio_filename,
+                                content_type='audio/mpeg')
+            async with aiohttp.ClientSession() as session:
+                async with session.post(server.url, data=form_data, timeout=aiohttp.ClientTimeout(total=600)) as resp:
+                    if resp.status == 200:
+                        result = await resp.json()
+                        if result.get('text') or result.get('transcription'):
+                            print(f"[{FLOW_ID}] ✅ {audio_filename}")
+                            return {
+                                "audio_file": audio_filename,
+                                "text": result.get('text', result.get('transcription', '')),
+                                "language": result.get('language', 'unknown'),
+                                "confidence": result.get('confidence'),
+                                "duration": result.get('duration'),
+                            }
+                        else:
+                            print(f"[{FLOW_ID}] ⚠️  Invalid response for {audio_filename}")
+                            continue
+                    else:
+                        error_text = await resp.text()
+                        print(f"[{FLOW_ID}] ❌ Server error {resp.status}: {audio_filename}")
+                        continue
+        except (aiohttp.ClientError, asyncio.TimeoutError, TimeoutError) as e:
+            print(f"[{FLOW_ID}] ⏱️  Timeout/Connection error for {audio_filename}")
+            continue
+        except Exception as e:
+            print(f"[{FLOW_ID}] ❌ Error for {audio_filename}: {str(e)[:50]}")
+            continue
+        finally:
+            if server:
+                end_time = time.time()
+                server.busy = False
+                server.total_processed += 1
+                server.total_time += (end_time - start_time)
+    return None
 @app.get("/")
 async def root():