Samfredoly commited on
Commit
f58b066
·
verified ·
1 Parent(s): 8adf0c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -11
app.py CHANGED
@@ -421,18 +421,25 @@ async def send_audio_for_transcription(audio_path: Path, progress_tracker: Dict)
421
  if attempt == 0:
422
  print(f"[{FLOW_ID}] Starting transcription attempt on {audio_path.name}...")
423
 
424
- # 2. Prepare request data
 
 
 
425
  form_data = aiohttp.FormData()
426
  form_data.add_field('file',
427
- audio_path.open('rb'),
428
  filename=audio_path.name,
429
  content_type='audio/mpeg')
430
 
431
  # 3. Send request
432
  async with aiohttp.ClientSession() as session:
 
433
  async with session.post(server.url, data=form_data, timeout=aiohttp.ClientTimeout(total=600)) as resp:
 
 
434
  if resp.status == 200:
435
  result = await resp.json()
 
436
 
437
  # Check if response contains transcription data
438
  if result.get('text') or result.get('transcription'):
@@ -441,7 +448,7 @@ async def send_audio_for_transcription(audio_path: Path, progress_tracker: Dict)
441
  if progress_tracker['completed'] % 10 == 0:
442
  print(f"[{FLOW_ID}] PROGRESS: {progress_tracker['completed']}/{progress_tracker['total']} transcriptions completed.")
443
 
444
- print(f"[{FLOW_ID}] Success: {audio_path.name} transcribed by {server.url}")
445
 
446
  # Store the full transcription result
447
  return {
@@ -452,18 +459,20 @@ async def send_audio_for_transcription(audio_path: Path, progress_tracker: Dict)
452
  "duration": result.get('duration'),
453
  }
454
  else:
455
- print(f"[{FLOW_ID}] Server {server.url} returned invalid response format for {audio_path.name}. Response: {result}")
456
  continue
457
  else:
458
  error_text = await resp.text()
459
- print(f"[{FLOW_ID}] Error from server {server.url} for {audio_path.name}: {resp.status} - {error_text}. Retrying...")
460
  continue
461
 
462
  except (aiohttp.ClientError, asyncio.TimeoutError, TimeoutError) as e:
463
- print(f"[{FLOW_ID}] Connection/Timeout error for {audio_path.name} on {server.url if server else 'unknown server'}: {e}. Retrying...")
464
  continue
465
  except Exception as e:
466
- print(f"[{FLOW_ID}] Unexpected error during transcription for {audio_path.name}: {e}. Retrying...")
 
 
467
  continue
468
  finally:
469
  if server:
@@ -472,7 +481,7 @@ async def send_audio_for_transcription(audio_path: Path, progress_tracker: Dict)
472
  server.total_processed += 1
473
  server.total_time += (end_time - start_time)
474
 
475
- print(f"[{FLOW_ID}] FAILED after {MAX_RETRIES} attempts for {audio_path.name}.")
476
  return None
477
 
478
  # --- FastAPI App and Endpoints ---
@@ -494,6 +503,7 @@ async def process_audio_files(background_tasks: BackgroundTasks):
494
  Fetches audio from HF dataset, sends to Whisper servers, and uploads results.
495
  Uses reference file mapping for output filename renaming.
496
  """
 
497
  background_tasks.add_task(process_audio_files_background)
498
  return {
499
  "status": "processing_started",
@@ -501,6 +511,8 @@ async def process_audio_files(background_tasks: BackgroundTasks):
501
  "message": "Background processing task started. Check /status for progress."
502
  }
503
 
 
 
504
  async def process_audio_files_background():
505
  """Background task that processes audio files with reference mapping."""
506
  progress_data = load_progress()
@@ -534,24 +546,29 @@ async def process_audio_files_background():
534
  repo_file_path = audio_files[file_index]
535
  audio_filename = Path(repo_file_path).name
536
 
 
 
537
  # Check if already processed
538
  state = await download_hf_state()
539
  if audio_filename in state.get('file_states', {}) and state['file_states'][audio_filename] == 'processed':
540
- print(f"[{FLOW_ID}] Skipping already processed: {audio_filename}")
541
  continue
542
 
543
  # Lock the file for processing
544
  if not await lock_file_for_processing(audio_filename, state):
545
- print(f"[{FLOW_ID}] Could not lock file {audio_filename}, skipping.")
546
  continue
547
 
548
  try:
549
  # Download audio file
 
550
  audio_path = await download_audio_file(file_index, repo_file_path)
551
  if not audio_path:
552
- print(f"[{FLOW_ID}] Failed to download {audio_filename}")
553
  continue
554
 
 
 
555
  # Get matching reference filename
556
  reference_filename = find_matching_filename(audio_filename, reference_map)
557
  if reference_filename:
@@ -560,6 +577,7 @@ async def process_audio_files_background():
560
  print(f"[{FLOW_ID}] No reference match for {audio_filename}, will use audio filename")
561
 
562
  # Send for transcription
 
563
  transcription_result = await send_audio_for_transcription(audio_path, progress_tracker)
564
 
565
  if transcription_result:
 
421
  if attempt == 0:
422
  print(f"[{FLOW_ID}] Starting transcription attempt on {audio_path.name}...")
423
 
424
+ # 2. Prepare request data - keep file open until request is done
425
+ with audio_path.open('rb') as f:
426
+ file_content = f.read()
427
+
428
  form_data = aiohttp.FormData()
429
  form_data.add_field('file',
430
+ io.BytesIO(file_content),
431
  filename=audio_path.name,
432
  content_type='audio/mpeg')
433
 
434
  # 3. Send request
435
  async with aiohttp.ClientSession() as session:
436
+ print(f"[{FLOW_ID}] Sending audio file to {server.url}...")
437
  async with session.post(server.url, data=form_data, timeout=aiohttp.ClientTimeout(total=600)) as resp:
438
+ print(f"[{FLOW_ID}] Received response status: {resp.status}")
439
+
440
  if resp.status == 200:
441
  result = await resp.json()
442
+ print(f"[{FLOW_ID}] Response data: {result}")
443
 
444
  # Check if response contains transcription data
445
  if result.get('text') or result.get('transcription'):
 
448
  if progress_tracker['completed'] % 10 == 0:
449
  print(f"[{FLOW_ID}] PROGRESS: {progress_tracker['completed']}/{progress_tracker['total']} transcriptions completed.")
450
 
451
+ print(f"[{FLOW_ID}] Success: {audio_path.name} transcribed by {server.url}")
452
 
453
  # Store the full transcription result
454
  return {
 
459
  "duration": result.get('duration'),
460
  }
461
  else:
462
+ print(f"[{FLOW_ID}] ⚠️ Server {server.url} returned invalid response format for {audio_path.name}. Response: {result}")
463
  continue
464
  else:
465
  error_text = await resp.text()
466
+ print(f"[{FLOW_ID}] Error from server {server.url} for {audio_path.name}: {resp.status} - {error_text}. Retrying...")
467
  continue
468
 
469
  except (aiohttp.ClientError, asyncio.TimeoutError, TimeoutError) as e:
470
+ print(f"[{FLOW_ID}] Connection/Timeout error for {audio_path.name} on {server.url if server else 'unknown server'}: {e}. Retrying...")
471
  continue
472
  except Exception as e:
473
+ print(f"[{FLOW_ID}] Unexpected error during transcription for {audio_path.name}: {e}. Retrying...")
474
+ import traceback
475
+ traceback.print_exc()
476
  continue
477
  finally:
478
  if server:
 
481
  server.total_processed += 1
482
  server.total_time += (end_time - start_time)
483
 
484
+ print(f"[{FLOW_ID}] FAILED after {MAX_RETRIES} attempts for {audio_path.name}.")
485
  return None
486
 
487
  # --- FastAPI App and Endpoints ---
 
503
  Fetches audio from HF dataset, sends to Whisper servers, and uploads results.
504
  Uses reference file mapping for output filename renaming.
505
  """
506
+ print(f"[{FLOW_ID}] /process endpoint called, starting background task...")
507
  background_tasks.add_task(process_audio_files_background)
508
  return {
509
  "status": "processing_started",
 
511
  "message": "Background processing task started. Check /status for progress."
512
  }
513
 
514
+ # No wrapper needed - add_task can handle coroutine functions directly in newer FastAPI
515
+
516
  async def process_audio_files_background():
517
  """Background task that processes audio files with reference mapping."""
518
  progress_data = load_progress()
 
546
  repo_file_path = audio_files[file_index]
547
  audio_filename = Path(repo_file_path).name
548
 
549
+ print(f"[{FLOW_ID}] 📝 Processing file #{file_index}: {audio_filename}")
550
+
551
  # Check if already processed
552
  state = await download_hf_state()
553
  if audio_filename in state.get('file_states', {}) and state['file_states'][audio_filename] == 'processed':
554
+ print(f"[{FLOW_ID}] ⏭️ Skipping already processed: {audio_filename}")
555
  continue
556
 
557
  # Lock the file for processing
558
  if not await lock_file_for_processing(audio_filename, state):
559
+ print(f"[{FLOW_ID}] Could not lock file {audio_filename}, skipping.")
560
  continue
561
 
562
  try:
563
  # Download audio file
564
+ print(f"[{FLOW_ID}] ⬇️ Downloading audio file...")
565
  audio_path = await download_audio_file(file_index, repo_file_path)
566
  if not audio_path:
567
+ print(f"[{FLOW_ID}] Failed to download {audio_filename}")
568
  continue
569
 
570
+ print(f"[{FLOW_ID}] ✅ Audio downloaded to {audio_path}")
571
+
572
  # Get matching reference filename
573
  reference_filename = find_matching_filename(audio_filename, reference_map)
574
  if reference_filename:
 
577
  print(f"[{FLOW_ID}] No reference match for {audio_filename}, will use audio filename")
578
 
579
  # Send for transcription
580
+ print(f"[{FLOW_ID}] 🎤 Sending to Whisper server...")
581
  transcription_result = await send_audio_for_transcription(audio_path, progress_tracker)
582
 
583
  if transcription_result: