Samfredoly committed on
Commit
8020386
·
verified ·
1 Parent(s): 6d01df4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -6
app.py CHANGED
@@ -300,10 +300,12 @@ def upload_all_files(job: ProcessingJob, all_courses: List[Dict], combined_file_
300
 
301
 
302
  # ============================================================================
303
- # Main Processing Logic (Modified)
304
  # ============================================================================
305
 
306
- async def process_single_course(
 
 
307
  course_file: str,
308
  job: ProcessingJob,
309
  ato_files: List[str]
@@ -330,6 +332,8 @@ async def process_single_course(
330
  ato_path = download_file(DATASET_ATO, expected_ato_file)
331
  if ato_path:
332
  transcription_data = load_json_file(ato_path)
 
 
333
  if transcription_data:
334
  job.matched_transcriptions += 1
335
 
@@ -363,9 +367,11 @@ async def process_all_courses_background(job_id: str):
363
  print(f"{'='*70}\n")
364
 
365
  # Fetch file lists from all datasets
 
 
366
  print("[INIT] Fetching file lists from datasets...")
367
- helium_files = list_dataset_files(DATASET_HELIUM)
368
- ato_files = list_dataset_files(DATASET_ATO)
369
 
370
  # Filter to only _frames.json files from Helium
371
  course_files = [f for f in helium_files if f.endswith("_frames.json")]
@@ -380,7 +386,7 @@ async def process_all_courses_background(job_id: str):
380
 
381
  for idx, course_file in enumerate(course_files):
382
  try:
383
- # Use asyncio.to_thread for blocking operations like hf_hub_download
384
  course_data = await asyncio.to_thread(
385
  process_single_course,
386
  course_file,
@@ -415,7 +421,7 @@ async def process_all_courses_background(job_id: str):
415
 
416
  job.output_file = str(output_file)
417
 
418
- # NEW: Upload all files with intelligent fallback
419
  job.status = JobStatus.UPLOADING
420
  await asyncio.to_thread(upload_all_files, job, all_courses, output_file)
421
 
 
300
 
301
 
302
  # ============================================================================
303
+ # Main Processing Logic (Modified - FIX APPLIED HERE)
304
  # ============================================================================
305
 
306
+ # FIX: Changed from 'async def' to 'def' because this function contains blocking I/O
307
+ # and is intended to be run in a separate thread via asyncio.to_thread.
308
+ def process_single_course(
309
  course_file: str,
310
  job: ProcessingJob,
311
  ato_files: List[str]
 
332
  ato_path = download_file(DATASET_ATO, expected_ato_file)
333
  if ato_path:
334
  transcription_data = load_json_file(ato_path)
335
+ # NOTE: job.matched_transcriptions is a mutable attribute of the job object
336
+ # which is safe to modify here as it's running in a single thread per job.
337
  if transcription_data:
338
  job.matched_transcriptions += 1
339
 
 
367
  print(f"{'='*70}\n")
368
 
369
  # Fetch file lists from all datasets
370
+ # NOTE: list_dataset_files contains blocking I/O, so it should be run in a thread.
371
+ # However, since it's only called once at the start, we can use asyncio.to_thread.
372
  print("[INIT] Fetching file lists from datasets...")
373
+ helium_files = await asyncio.to_thread(list_dataset_files, DATASET_HELIUM)
374
+ ato_files = await asyncio.to_thread(list_dataset_files, DATASET_ATO)
375
 
376
  # Filter to only _frames.json files from Helium
377
  course_files = [f for f in helium_files if f.endswith("_frames.json")]
 
386
 
387
  for idx, course_file in enumerate(course_files):
388
  try:
389
+ # process_single_course is now synchronous and correctly run in a thread
390
  course_data = await asyncio.to_thread(
391
  process_single_course,
392
  course_file,
 
421
 
422
  job.output_file = str(output_file)
423
 
424
+ # Upload all files with intelligent fallback
425
  job.status = JobStatus.UPLOADING
426
  await asyncio.to_thread(upload_all_files, job, all_courses, output_file)
427