Spaces:

jebin2
/

apigateway

Sleeping

App Files Community

jebin2 committed 19 days ago

Commit

0164d71

1 Parent(s): c94217d

stuck or duplicate process

Browse files

Files changed (1) hide show

services/priority_worker_pool.py +56 -8

services/priority_worker_pool.py CHANGED Viewed

@@ -69,9 +69,9 @@ class WorkerConfig:
     fast_workers: int = 5
     medium_workers: int = 5
     slow_workers: int = 5
-    fast_interval: int = 5      # seconds
-    medium_interval: int = 30   # seconds
-    slow_interval: int = 60     # seconds
     max_retries: int = 60       # Max retry attempts before failing
     @classmethod
@@ -221,16 +221,64 @@ class PriorityWorker(Generic[JobType]):
         """Process a single job."""
         logger.info(f"Worker {self.worker_id}: Processing job {job.job_id} (status: {job.status})")
         if job.status == "queued":
-            # New job - start processing
-            job.status = "processing"
-            job.started_at = datetime.utcnow()
             await session.commit()
-            # Process the job
             job = await self.job_processor.process(job, session)
         else:
-            # Already processing - check status
             job = await self.job_processor.check_status(job, session)
         # Handle retry limit

     fast_workers: int = 5
     medium_workers: int = 5
     slow_workers: int = 5
+    fast_interval: int = 2      # seconds
+    medium_interval: int = 10   # seconds
+    slow_interval: int = 15     # seconds
     max_retries: int = 60       # Max retry attempts before failing
     @classmethod
         """Process a single job."""
         logger.info(f"Worker {self.worker_id}: Processing job {job.job_id} (status: {job.status})")
+        from sqlalchemy import update
         if job.status == "queued":
+            # New job - try to claim it atomically
+            # Set next_process_at to future to prevent others from picking it up while we process
+            next_check = datetime.utcnow() + timedelta(seconds=self.poll_interval * 2)
+            stmt = (
+                update(self.job_model)
+                .where(
+                    self.job_model.job_id == job.job_id,
+                    self.job_model.status == "queued"
+                )
+                .values(
+                    status="processing",
+                    started_at=datetime.utcnow(),
+                    next_process_at=next_check
+                )
+            )
+            result = await session.execute(stmt)
             await session.commit()
+            if result.rowcount == 0:
+                logger.info(f"Worker {self.worker_id}: Failed to claim job {job.job_id} (already taken)")
+                return
+            # We claimed it. Refresh and process.
+            await session.refresh(job)
             job = await self.job_processor.process(job, session)
         else:
+            # Already processing - try to claim for status check
+            # The SELECT already filtered for jobs whose next_process_at is null or in the past,
+            # but another worker may have grabbed this job between that SELECT and this UPDATE.
+            # Re-check the same condition in the UPDATE's WHERE clause and push next_process_at
+            # into the future, so the job is locked for the duration of this check.
+            next_check = datetime.utcnow() + timedelta(seconds=self.poll_interval * 2)
+            stmt = (
+                update(self.job_model)
+                .where(
+                    self.job_model.job_id == job.job_id,
+                    or_(
+                        self.job_model.next_process_at.is_(None),
+                        self.job_model.next_process_at <= datetime.utcnow()
+                    )
+                )
+                .values(next_process_at=next_check)
+            )
+            result = await session.execute(stmt)
+            await session.commit()
+            if result.rowcount == 0:
+                logger.info(f"Worker {self.worker_id}: Failed to claim job {job.job_id} for check (already taken)")
+                return
+            await session.refresh(job)
             job = await self.job_processor.check_status(job, session)
         # Handle retry limit