Spaces:

Samfredoly
/

switches

Paused

App Files Files Community

Samfredoly committed on Nov 26, 2025

Commit

34e34a0

verified ·

1 Parent(s): 99dce0a

Update app.py

Browse files

Files changed (1) hide show

app.py +78 -46

app.py CHANGED Viewed

@@ -283,17 +283,19 @@ async def download_wav_file_by_index(file_index: int, repo_file_full_path: str)
     print(f"[{FLOW_ID}] Downloading file #{file_index}: {repo_file_full_path}")
     try:
-        # Use hf_hub_download to get the file path
         wav_path = hf_hub_download(
             repo_id=HF_AUDIO_DATASET_ID,
             filename=repo_file_full_path,
             repo_type="dataset",
             token=HF_TOKEN,
         )
         print(f"[{FLOW_ID}] Downloaded WAV file to {wav_path}")
         return Path(wav_path)
     except Exception as e:
         print(f"[{FLOW_ID}] Error downloading WAV file {repo_file_full_path}: {e}")
         return None
@@ -336,34 +338,33 @@ async def send_audio_to_whisper(wav_path: Path, server: WhisperServer) -> Option
         # Prepare multipart form data
         form_data = aiohttp.FormData()
-        form_data.add_field('file',
-                            wav_path.open('rb'),
-                            filename=wav_path.name,
-                            content_type='audio/wav')
-        async with aiohttp.ClientSession() as session:
-            # 10 minute timeout for transcription
-            async with session.post(server.url, data=form_data, timeout=600) as resp:
-                if resp.status == 200:
-                    result = await resp.json()
-                    end_time = time.time()
-                    # Update server stats
-                    server.total_processed += 1
-                    server.total_time += (end_time - start_time)
-                    print(f"[{FLOW_ID}] ✓ {wav_path.name} transcribed successfully by {server.url}")
-                    return {
-                        "file": wav_path.name,
-                        "transcription": result,
-                        "timestamp": datetime.now().isoformat(),
-                        "processing_time_seconds": end_time - start_time
-                    }
-                else:
-                    error_text = await resp.text()
-                    print(f"[{FLOW_ID}] ✗ Error from {server.url}: {resp.status} - {error_text}")
-                    return None
     except asyncio.TimeoutError:
         print(f"[{FLOW_ID}] ✗ Timeout from {server.url} for {wav_path.name}")
@@ -404,6 +405,27 @@ async def process_batch_dynamic(wav_files: List[str], start_batch_index: int, ba
     pending_tasks: Dict[asyncio.Task, Tuple[int, Path, WhisperServer]] = {}
     print(f"[{FLOW_ID}] Processing batch from index {start_batch_index} to {batch_end}")
     try:
         while current_index < batch_end or pending_tasks:
@@ -421,13 +443,12 @@ async def process_batch_dynamic(wav_files: List[str], start_batch_index: int, ba
                 wav_file = wav_files[file_index]
                 wav_filename = Path(wav_file).name
-                # Mark file as processing in state
-                state["file_states"][wav_filename] = "processing"
                 # Download the WAV file
                 wav_path = await download_wav_file_by_index(file_index + 1, wav_file)
                 if not wav_path:
                     state["file_states"][wav_filename] = "failed"
                     current_index += 1
                     continue
@@ -454,22 +475,20 @@ async def process_batch_dynamic(wav_files: List[str], start_batch_index: int, ba
                         transcription_result = task.result()
                         if transcription_result:
-                            # Check if we should pause uploading
-                            if UPLOAD_PAUSE_ENABLED and uploaded_count >= MAX_UPLOADS_BEFORE_PAUSE:
-                                print(f"[{FLOW_ID}] ⏸️  Upload limit reached ({uploaded_count}/{MAX_UPLOADS_BEFORE_PAUSE}). Pausing uploads but continuing processing...")
-                                # Mark as processed but don't upload
                                 state["file_states"][wav_filename] = "processed"
                             else:
-                                # Upload transcription
-                                if await upload_transcription_to_hf(wav_filename, transcription_result):
-                                    state["file_states"][wav_filename] = "processed"
-                                    uploaded_count += 1
-                                    progress['uploaded_count'] = uploaded_count
-                                    save_progress(progress)
-                                else:
-                                    state["file_states"][wav_filename] = "failed"
                         else:
                             state["file_states"][wav_filename] = "failed"
                     except Exception as e:
                         print(f"[{FLOW_ID}] Error processing result for {wav_filename}: {e}")
@@ -520,6 +539,19 @@ async def process_dataset_task(start_index: int):
     if 'uploaded_count' not in progress:
         progress['uploaded_count'] = 0
     global_success = True
     current_batch_index = start_list_index
     batch_size = len(servers) * 2  # Process 2 batches per server at a time

     print(f"[{FLOW_ID}] Downloading file #{file_index}: {repo_file_full_path}")
     try:
+        # Download the file into our TEMP_DIR (so we can safely delete it later)
         wav_path = hf_hub_download(
             repo_id=HF_AUDIO_DATASET_ID,
             filename=repo_file_full_path,
             repo_type="dataset",
             token=HF_TOKEN,
+            local_dir=str(TEMP_DIR),
+            local_dir_use_symlinks=False,
         )
         print(f"[{FLOW_ID}] Downloaded WAV file to {wav_path}")
         return Path(wav_path)
     except Exception as e:
         print(f"[{FLOW_ID}] Error downloading WAV file {repo_file_full_path}: {e}")
         return None
         # Prepare multipart form data
         form_data = aiohttp.FormData()
+        # Open the file in a context manager so the descriptor is closed after the request
+        with wav_path.open('rb') as f:
+            form_data.add_field('file', f, filename=wav_path.name, content_type='audio/wav')
+            async with aiohttp.ClientSession() as session:
+                # 10 minute timeout for transcription
+                async with session.post(server.url, data=form_data, timeout=600) as resp:
+                    if resp.status == 200:
+                        result = await resp.json()
+                        end_time = time.time()
+                        # Update server stats
+                        server.total_processed += 1
+                        server.total_time += (end_time - start_time)
+                        print(f"[{FLOW_ID}] ✓ {wav_path.name} transcribed successfully by {server.url}")
+                        return {
+                            "file": wav_path.name,
+                            "transcription": result,
+                            "timestamp": datetime.now().isoformat(),
+                            "processing_time_seconds": end_time - start_time
+                        }
+                    else:
+                        error_text = await resp.text()
+                        print(f"[{FLOW_ID}] ✗ Error from {server.url}: {resp.status} - {error_text}")
+                        return None
     except asyncio.TimeoutError:
         print(f"[{FLOW_ID}] ✗ Timeout from {server.url} for {wav_path.name}")
     pending_tasks: Dict[asyncio.Task, Tuple[int, Path, WhisperServer]] = {}
     print(f"[{FLOW_ID}] Processing batch from index {start_batch_index} to {batch_end}")
+    # --- Batch-level locking: mark all files in this batch as 'processing' and upload state
+    try:
+        for idx in range(start_batch_index, batch_end):
+            wav_file = wav_files[idx]
+            wav_name = Path(wav_file).name
+            state.setdefault("file_states", {})
+            # Only set to processing if it's not already processed/processing
+            if state["file_states"].get(wav_name) not in ("processing", "processed"):
+                state["file_states"][wav_name] = "processing"
+        # Advance the next_download_index to the end of this batch (1-based index)
+        state["next_download_index"] = batch_end
+        # Upload HF state to establish locks for this batch
+        if await upload_hf_state(state):
+            print(f"[{FLOW_ID}] ✅ Batch lock uploaded for indices {start_batch_index}..{batch_end - 1}")
+        else:
+            print(f"[{FLOW_ID}] ❌ Failed to upload batch lock for indices {start_batch_index}..{batch_end - 1}")
+    except Exception as e:
+        print(f"[{FLOW_ID}] Error while setting up batch locks: {e}")
     try:
         while current_index < batch_end or pending_tasks:
                 wav_file = wav_files[file_index]
                 wav_filename = Path(wav_file).name
                 # Download the WAV file
                 wav_path = await download_wav_file_by_index(file_index + 1, wav_file)
                 if not wav_path:
                     state["file_states"][wav_filename] = "failed"
+                    # Persist failure to HF
+                    await upload_hf_state(state)
                     current_index += 1
                     continue
                         transcription_result = task.result()
                         if transcription_result:
+                            # Upload transcription (immediately)
+                            uploaded_ok = await upload_transcription_to_hf(wav_filename, transcription_result)
+                            if uploaded_ok:
                                 state["file_states"][wav_filename] = "processed"
+                                uploaded_count += 1
+                                progress['uploaded_count'] = uploaded_count
+                                save_progress(progress)
                             else:
+                                state["file_states"][wav_filename] = "failed"
+                            # Persist state change for this file immediately
+                            await upload_hf_state(state)
                         else:
                             state["file_states"][wav_filename] = "failed"
+                            await upload_hf_state(state)
                     except Exception as e:
                         print(f"[{FLOW_ID}] Error processing result for {wav_filename}: {e}")
     if 'uploaded_count' not in progress:
         progress['uploaded_count'] = 0
+    # If there was no HF state in the repo, upload a fresh initial state file
+    try:
+        if not current_state.get("file_states") and current_state.get("next_download_index", 0) == 0:
+            print(f"[{FLOW_ID}] No HF state detected; uploading initial state file to {HF_OUTPUT_DATASET_ID}...")
+            # Ensure structure
+            current_state.setdefault("file_states", {})
+            current_state.setdefault("next_download_index", 0)
+            if await upload_hf_state(current_state):
+                print(f"[{FLOW_ID}] ✅ Initial HF state uploaded.")
+            else:
+                print(f"[{FLOW_ID}] ❌ Failed to upload initial HF state.")
+    except Exception as e:
+        print(f"[{FLOW_ID}] Error while uploading initial HF state: {e}")
     global_success = True
     current_batch_index = start_list_index
     batch_size = len(servers) * 2  # Process 2 batches per server at a time