Spaces:

Samfredoly
/

switches

Paused

App Files Files Community

Samfredoly committed on Nov 20, 2025

Commit

f58b066

verified ·

1 Parent(s): 8adf0c7

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -11

app.py CHANGED Viewed

@@ -421,18 +421,25 @@ async def send_audio_for_transcription(audio_path: Path, progress_tracker: Dict)
             if attempt == 0:
                 print(f"[{FLOW_ID}] Starting transcription attempt on {audio_path.name}...")
-            # 2. Prepare request data
             form_data = aiohttp.FormData()
             form_data.add_field('file',
-                                audio_path.open('rb'),
                                 filename=audio_path.name,
                                 content_type='audio/mpeg')
             # 3. Send request
             async with aiohttp.ClientSession() as session:
                 async with session.post(server.url, data=form_data, timeout=aiohttp.ClientTimeout(total=600)) as resp:
                     if resp.status == 200:
                         result = await resp.json()
                         # Check if response contains transcription data
                         if result.get('text') or result.get('transcription'):
@@ -441,7 +448,7 @@ async def send_audio_for_transcription(audio_path: Path, progress_tracker: Dict)
                             if progress_tracker['completed'] % 10 == 0:
                                 print(f"[{FLOW_ID}] PROGRESS: {progress_tracker['completed']}/{progress_tracker['total']} transcriptions completed.")
-                            print(f"[{FLOW_ID}] Success: {audio_path.name} transcribed by {server.url}")
                             # Store the full transcription result
                             return {
@@ -452,18 +459,20 @@ async def send_audio_for_transcription(audio_path: Path, progress_tracker: Dict)
                                 "duration": result.get('duration'),
                             }
                         else:
-                            print(f"[{FLOW_ID}] Server {server.url} returned invalid response format for {audio_path.name}. Response: {result}")
                             continue
                     else:
                         error_text = await resp.text()
-                        print(f"[{FLOW_ID}] Error from server {server.url} for {audio_path.name}: {resp.status} - {error_text}. Retrying...")
                         continue
         except (aiohttp.ClientError, asyncio.TimeoutError, TimeoutError) as e:
-            print(f"[{FLOW_ID}] Connection/Timeout error for {audio_path.name} on {server.url if server else 'unknown server'}: {e}. Retrying...")
             continue
         except Exception as e:
-            print(f"[{FLOW_ID}] Unexpected error during transcription for {audio_path.name}: {e}. Retrying...")
             continue
         finally:
             if server:
@@ -472,7 +481,7 @@ async def send_audio_for_transcription(audio_path: Path, progress_tracker: Dict)
                 server.total_processed += 1
                 server.total_time += (end_time - start_time)
-    print(f"[{FLOW_ID}] FAILED after {MAX_RETRIES} attempts for {audio_path.name}.")
     return None
 # --- FastAPI App and Endpoints ---
@@ -494,6 +503,7 @@ async def process_audio_files(background_tasks: BackgroundTasks):
     Fetches audio from HF dataset, sends to Whisper servers, and uploads results.
     Uses reference file mapping for output filename renaming.
     """
     background_tasks.add_task(process_audio_files_background)
     return {
         "status": "processing_started",
@@ -501,6 +511,8 @@ async def process_audio_files(background_tasks: BackgroundTasks):
         "message": "Background processing task started. Check /status for progress."
     }
 async def process_audio_files_background():
     """Background task that processes audio files with reference mapping."""
     progress_data = load_progress()
@@ -534,24 +546,29 @@ async def process_audio_files_background():
         repo_file_path = audio_files[file_index]
         audio_filename = Path(repo_file_path).name
         # Check if already processed
         state = await download_hf_state()
         if audio_filename in state.get('file_states', {}) and state['file_states'][audio_filename] == 'processed':
-            print(f"[{FLOW_ID}] Skipping already processed: {audio_filename}")
             continue
         # Lock the file for processing
         if not await lock_file_for_processing(audio_filename, state):
-            print(f"[{FLOW_ID}] Could not lock file {audio_filename}, skipping.")
             continue
         try:
             # Download audio file
             audio_path = await download_audio_file(file_index, repo_file_path)
             if not audio_path:
-                print(f"[{FLOW_ID}] Failed to download {audio_filename}")
                 continue
             # Get matching reference filename
             reference_filename = find_matching_filename(audio_filename, reference_map)
             if reference_filename:
@@ -560,6 +577,7 @@ async def process_audio_files_background():
                 print(f"[{FLOW_ID}] No reference match for {audio_filename}, will use audio filename")
             # Send for transcription
             transcription_result = await send_audio_for_transcription(audio_path, progress_tracker)
             if transcription_result:

             if attempt == 0:
                 print(f"[{FLOW_ID}] Starting transcription attempt on {audio_path.name}...")
+            # 2. Prepare request data - read the file fully into memory so the handle is closed before the request is sent
+            with audio_path.open('rb') as f:
+                file_content = f.read()
             form_data = aiohttp.FormData()
             form_data.add_field('file',
+                                io.BytesIO(file_content),
                                 filename=audio_path.name,
                                 content_type='audio/mpeg')
             # 3. Send request
             async with aiohttp.ClientSession() as session:
+                print(f"[{FLOW_ID}] Sending audio file to {server.url}...")
                 async with session.post(server.url, data=form_data, timeout=aiohttp.ClientTimeout(total=600)) as resp:
+                    print(f"[{FLOW_ID}] Received response status: {resp.status}")
                     if resp.status == 200:
                         result = await resp.json()
+                        print(f"[{FLOW_ID}] Response data: {result}")
                         # Check if response contains transcription data
                         if result.get('text') or result.get('transcription'):
                             if progress_tracker['completed'] % 10 == 0:
                                 print(f"[{FLOW_ID}] PROGRESS: {progress_tracker['completed']}/{progress_tracker['total']} transcriptions completed.")
+                            print(f"[{FLOW_ID}] ✅ Success: {audio_path.name} transcribed by {server.url}")
                             # Store the full transcription result
                             return {
                                 "duration": result.get('duration'),
                             }
                         else:
+                            print(f"[{FLOW_ID}] ⚠️ Server {server.url} returned invalid response format for {audio_path.name}. Response: {result}")
                             continue
                     else:
                         error_text = await resp.text()
+                        print(f"[{FLOW_ID}] ❌ Error from server {server.url} for {audio_path.name}: {resp.status} - {error_text}. Retrying...")
                         continue
         except (aiohttp.ClientError, asyncio.TimeoutError, TimeoutError) as e:
+            print(f"[{FLOW_ID}] ❌ Connection/Timeout error for {audio_path.name} on {server.url if server else 'unknown server'}: {e}. Retrying...")
             continue
         except Exception as e:
+            print(f"[{FLOW_ID}] ❌ Unexpected error during transcription for {audio_path.name}: {e}. Retrying...")
+            import traceback
+            traceback.print_exc()
             continue
         finally:
             if server:
                 server.total_processed += 1
                 server.total_time += (end_time - start_time)
+    print(f"[{FLOW_ID}] ❌ FAILED after {MAX_RETRIES} attempts for {audio_path.name}.")
     return None
 # --- FastAPI App and Endpoints ---
     Fetches audio from HF dataset, sends to Whisper servers, and uploads results.
     Uses reference file mapping for output filename renaming.
     """
+    print(f"[{FLOW_ID}] /process endpoint called, starting background task...")
     background_tasks.add_task(process_audio_files_background)
     return {
         "status": "processing_started",
         "message": "Background processing task started. Check /status for progress."
     }
+# No wrapper needed - BackgroundTasks.add_task accepts coroutine functions directly and awaits them
 async def process_audio_files_background():
     """Background task that processes audio files with reference mapping."""
     progress_data = load_progress()
         repo_file_path = audio_files[file_index]
         audio_filename = Path(repo_file_path).name
+        print(f"[{FLOW_ID}] 📝 Processing file #{file_index}: {audio_filename}")
         # Check if already processed
         state = await download_hf_state()
         if audio_filename in state.get('file_states', {}) and state['file_states'][audio_filename] == 'processed':
+            print(f"[{FLOW_ID}] ⏭️  Skipping already processed: {audio_filename}")
             continue
         # Lock the file for processing
         if not await lock_file_for_processing(audio_filename, state):
+            print(f"[{FLOW_ID}] ❌ Could not lock file {audio_filename}, skipping.")
             continue
         try:
             # Download audio file
+            print(f"[{FLOW_ID}] ⬇️  Downloading audio file...")
             audio_path = await download_audio_file(file_index, repo_file_path)
             if not audio_path:
+                print(f"[{FLOW_ID}] ❌ Failed to download {audio_filename}")
                 continue
+            print(f"[{FLOW_ID}] ✅ Audio downloaded to {audio_path}")
             # Get matching reference filename
             reference_filename = find_matching_filename(audio_filename, reference_map)
             if reference_filename:
                 print(f"[{FLOW_ID}] No reference match for {audio_filename}, will use audio filename")
             # Send for transcription
+            print(f"[{FLOW_ID}] 🎤 Sending to Whisper server...")
             transcription_result = await send_audio_for_transcription(audio_path, progress_tracker)
             if transcription_result: