KSvend Claude Happy commited on
Commit
cb6d0e1
·
1 Parent(s): 7946db4

debug: log batch job IDs and poll statuses to diagnose CDSE timeouts

Browse files

Generated with [Claude Code](https://claude.ai/code)
via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>

Files changed (1) hide show
  1. app/worker.py +5 -1
app/worker.py CHANGED
@@ -79,6 +79,8 @@ async def process_job(job_id: str, db: Database, registry: IndicatorRegistry) ->
79
  season_months=job.request.season_months(),
80
  )
81
  batch_submissions[indicator_id] = jobs
 
 
82
  await db.update_job_progress(job_id, indicator_id, "processing on CDSE")
83
  except Exception as exc:
84
  logger.warning("Batch submit failed for %s, will use fallback: %s", indicator_id, exc)
@@ -90,9 +92,12 @@ async def process_job(job_id: str, db: Database, registry: IndicatorRegistry) ->
90
 
91
  while pending:
92
  # Check current statuses before sleeping
 
93
  for indicator_id in list(pending.keys()):
94
  jobs = pending[indicator_id]
95
  statuses = [j.status() for j in jobs]
 
 
96
  if all(s == "finished" for s in statuses):
97
  logger.info("Batch jobs finished for %s", indicator_id)
98
  del pending[indicator_id]
@@ -103,7 +108,6 @@ async def process_job(job_id: str, db: Database, registry: IndicatorRegistry) ->
103
  if not pending:
104
  break
105
 
106
- elapsed = time.monotonic() - poll_start
107
  if elapsed >= BATCH_TIMEOUT:
108
  logger.warning("Batch poll timeout after %.0fs, remaining: %s", elapsed, list(pending.keys()))
109
  fallback_ids.update(pending.keys())
 
79
  season_months=job.request.season_months(),
80
  )
81
  batch_submissions[indicator_id] = jobs
82
+ job_ids = [getattr(j, 'job_id', '?') for j in jobs]
83
+ print(f"[Aperture] Submitted {indicator_id} batch jobs: {job_ids}")
84
  await db.update_job_progress(job_id, indicator_id, "processing on CDSE")
85
  except Exception as exc:
86
  logger.warning("Batch submit failed for %s, will use fallback: %s", indicator_id, exc)
 
92
 
93
  while pending:
94
  # Check current statuses before sleeping
95
+ elapsed = time.monotonic() - poll_start
96
  for indicator_id in list(pending.keys()):
97
  jobs = pending[indicator_id]
98
  statuses = [j.status() for j in jobs]
99
+ job_ids = [getattr(j, 'job_id', '?') for j in jobs]
100
+ print(f"[Aperture] Poll {indicator_id} ({elapsed:.0f}s): {list(zip(job_ids, statuses))}")
101
  if all(s == "finished" for s in statuses):
102
  logger.info("Batch jobs finished for %s", indicator_id)
103
  del pending[indicator_id]
 
108
  if not pending:
109
  break
110
 
 
111
  if elapsed >= BATCH_TIMEOUT:
112
  logger.warning("Batch poll timeout after %.0fs, remaining: %s", elapsed, list(pending.keys()))
113
  fallback_ids.update(pending.keys())