Spaces:

jebin2
/

apigateway

Sleeping

App Files Files Community

jebin2 committed 13 days ago

Commit

ec0e527

1 Parent(s): 74b89f0

retry logic

Browse files

Files changed (5) hide show

check_exceptions.py +15 -0
core/models.py +1 -0
routers/gemini.py +16 -14
services/gemini_job_worker.py +84 -14
services/gemini_service.py +4 -3

check_exceptions.py ADDED Viewed

	@@ -0,0 +1,15 @@

+# Diagnostic script: report which Google client modules can be imported and
+# show the exception classes that google.api_core exposes.
+try:
+    from google.api_core import exceptions
+    print("google.api_core.exceptions found")
+    # Print each exception class under its own name, in a fixed order.
+    for _exc_name in ("ResourceExhausted", "Unauthenticated", "PermissionDenied"):
+        print(f"{_exc_name}: {getattr(exceptions, _exc_name)}")
+except ImportError:
+    print("google.api_core.exceptions NOT found")
+# Probe the google.genai SDK and report the outcome once.
+try:
+    from google import genai
+except ImportError:
+    _genai_status = "google.genai NOT found"
+else:
+    _genai_status = "google.genai found"
+print(_genai_status)

core/models.py CHANGED Viewed

@@ -165,6 +165,7 @@ class GeminiJob(Base):
     input_data = Column(JSON, nullable=True)  # Request details (prompt, settings, etc.)
     output_data = Column(JSON, nullable=True)  # Result (filename, text, etc.)
     error_message = Column(Text, nullable=True)
     created_at = Column(DateTime(timezone=True), server_default=func.now())
     started_at = Column(DateTime(timezone=True), nullable=True)

     input_data = Column(JSON, nullable=True)  # Request details (prompt, settings, etc.)
     output_data = Column(JSON, nullable=True)  # Result (filename, text, etc.)
+    api_response = Column(JSON, nullable=True)  # Raw response from third-party API (success or error)
     error_message = Column(Text, nullable=True)
     created_at = Column(DateTime(timezone=True), server_default=func.now())
     started_at = Column(DateTime(timezone=True), nullable=True)

routers/gemini.py CHANGED Viewed

@@ -535,23 +535,25 @@ async def delete_job(
     refund_amount = 0
     message = "Job deleted"
-    if job.status == "queued":
-        # Refund logic: Restore 8 credits (10 - 2)
-        # Only if it was a video job (cost 10). For others (cost 1), maybe no refund or full?
-        # Requirement says "restore 8", implying video job context.
-        # Let's check credits_reserved. If 10, refund 8. If 1, refund 0? Or 1?
-        # Assuming this logic is specific to the high-cost video jobs.
-        if job.credits_reserved >= 10:
             refund_amount = 8
             user.credits += refund_amount
             message = f"Job deleted. {refund_amount} credits refunded."
-        elif job.credits_reserved > 0:
-             # For lower cost jobs, maybe full refund if queued? Or partial?
-             # User specifically mentioned "restore 8" for the queued state.
-             # I'll stick to the specific requirement for now, but maybe refund full for 1-credit jobs?
-             # Let's assume strict "restore 8" applies to the 10-credit video jobs.
-             pass
     await db.delete(job)
     await db.commit()

     refund_amount = 0
     message = "Job deleted"
+    if not job.third_party_id:
+        # Job never successfully started on Gemini (Dev error / Pre-execution failure)
+        # Refund FULL credits
+        if job.credits_reserved > 0 and not job.credits_refunded:
+            refund_amount = job.credits_reserved
+            user.credits += refund_amount
+            job.credits_refunded = True
+            message = f"Job deleted. Full {refund_amount} credits refunded (job not started)."
+    elif job.status == "queued":
+        # Job has third_party_id but is queued? (Unlikely for video, but maybe for others?)
+        # Or maybe it was reset to queued?
+        # Use existing logic: Refund 8 credits (10 - 2) for video
+        if job.credits_reserved >= 10 and not job.credits_refunded:
             refund_amount = 8
             user.credits += refund_amount
+            # Don't mark as fully refunded, as it's partial?
+            # Actually credits_refunded is boolean. Maybe we shouldn't set it if partial?
+            # But we gave back credits. Let's just update user credits.
             message = f"Job deleted. {refund_amount} credits refunded."
     await db.delete(job)
     await db.commit()

services/gemini_job_worker.py CHANGED Viewed

@@ -51,6 +51,71 @@ class GeminiJobProcessor(JobProcessor[GeminiJob]):
         from services.api_key_manager import record_usage
         await record_usage(session, key_index, success, error_message)
     async def process(self, job: GeminiJob, session: AsyncSession) -> GeminiJob:
         """Start processing a new job with round-robin API key."""
         key_index, service = await self._get_service_with_key(session)
@@ -80,12 +145,9 @@ class GeminiJobProcessor(JobProcessor[GeminiJob]):
                 job.completed_at = datetime.utcnow()
                 error_msg = job.error_message
         except Exception as e:
-            logger.error(f"Error processing job {job.job_id}: {e}")
-            job.status = "failed"
-            job.error_message = str(e)
-            job.completed_at = datetime.utcnow()
-            success = False
-            error_msg = str(e)
         # Record usage
         await self._record_usage(session, key_index, success, error_msg)
@@ -107,6 +169,8 @@ class GeminiJobProcessor(JobProcessor[GeminiJob]):
         try:
             status_result = await service.check_video_status(job.third_party_id)
             if status_result.get("done"):
                 if status_result.get("status") == "completed":
@@ -137,14 +201,14 @@ class GeminiJobProcessor(JobProcessor[GeminiJob]):
                 success = True  # Status check succeeded even if video not ready
         except Exception as e:
-            logger.error(f"Error checking video status for {job.job_id}: {e}")
-            job.retry_count += 1
-            job.error_message = f"Status check failed: {e}"
-            config = WorkerConfig.from_env()
-            interval = get_interval_for_priority(job.priority, config)
-            job.next_process_at = datetime.utcnow() + timedelta(seconds=interval)
-            success = False
-            error_msg = str(e)
         # Record usage
         await self._record_usage(session, key_index, success, error_msg)
@@ -198,6 +262,7 @@ class GeminiJobProcessor(JobProcessor[GeminiJob]):
             number_of_videos=input_data.get("number_of_videos", 1)
         )
         job.third_party_id = result.get("gemini_operation_name")
         # Schedule first status check
         config = WorkerConfig.from_env()
@@ -215,6 +280,8 @@ class GeminiJobProcessor(JobProcessor[GeminiJob]):
         )
         job.status = "completed"
         job.output_data = {"image": result}
         job.completed_at = datetime.utcnow()
         return job
@@ -226,6 +293,7 @@ class GeminiJobProcessor(JobProcessor[GeminiJob]):
         )
         job.status = "completed"
         job.output_data = {"text": result}
         job.completed_at = datetime.utcnow()
         return job
@@ -238,6 +306,7 @@ class GeminiJobProcessor(JobProcessor[GeminiJob]):
         )
         job.status = "completed"
         job.output_data = {"analysis": result}
         job.completed_at = datetime.utcnow()
         return job
@@ -250,6 +319,7 @@ class GeminiJobProcessor(JobProcessor[GeminiJob]):
         )
         job.status = "completed"
         job.output_data = {"prompt": result}
         job.completed_at = datetime.utcnow()
         return job

         from services.api_key_manager import record_usage
         await record_usage(session, key_index, success, error_message)
+    def _handle_error(self, job: GeminiJob, error: Exception, reset_to_queued: bool = False, *, retry_delay_seconds: int = 30) -> tuple[bool, str]:
+        """
+        Classify a job error, record it on the job, and reschedule or fail the job.
+        The error text is stored in ``job.api_response`` (as parsed JSON when the
+        message embeds a JSON object, otherwise as ``{"error": <text>}``).
+        Rate-limit, auth/billing and server errors are treated as retryable: the
+        retry counter is incremented and ``next_process_at`` is pushed back.
+        Anything else marks the job as failed.
+        Args:
+            job: The job object (mutated in place)
+            error: The exception raised
+            reset_to_queued: Whether to reset status to 'queued' on retry (for process())
+            retry_delay_seconds: Seconds to wait before a retryable job is processed again
+        Returns:
+            Tuple of (success, error_message)
+            success is always False (since it's an error)
+            error_message is the formatted error string
+        """
+        import json
+        import re
+        error_str = str(error)
+        def mentions_code(*codes: str) -> bool:
+            # Match the status code as a standalone number so that longer digit
+            # runs (IDs, timestamps, token counts such as "15000") do not
+            # accidentally classify a permanent failure as retryable.
+            return any(re.search(rf"(?<!\d){code}(?!\d)", error_str) for code in codes)
+        def mentions_text(*markers: str) -> bool:
+            return any(marker in error_str for marker in markers)
+        is_retryable = False
+        log_msg = ""
+        # Check for Rate Limit (429)
+        if mentions_code("429") or mentions_text("ResourceExhausted"):
+            is_retryable = True
+            log_msg = f"Rate limit hit for job {job.job_id}"
+        # Check for Auth/Billing errors (401, 403, API key not found, API key not valid, FAILED_PRECONDITION)
+        elif mentions_code("401", "403") or mentions_text("Unauthenticated", "PermissionDenied", "API key not found", "API key not valid", "FAILED_PRECONDITION"):
+            is_retryable = True
+            log_msg = f"Auth/Billing error for job {job.job_id}: {error_str}. Rescheduling to try different key."
+        # Check for Server errors (500, 503, 504)
+        elif mentions_code("500", "503", "504") or mentions_text("INTERNAL", "UNAVAILABLE", "DEADLINE_EXCEEDED"):
+            is_retryable = True
+            log_msg = f"Server error for job {job.job_id}: {error_str}"
+        # Try to parse JSON error details if present. DOTALL lets the pattern
+        # span newlines so pretty-printed (multi-line) JSON bodies are captured.
+        api_response = {"error": error_str}
+        json_match = re.search(r"(\{.*\})", error_str, re.DOTALL)
+        if json_match:
+            try:
+                api_response = json.loads(json_match.group(1))
+            except (ValueError, RecursionError):
+                # Not valid JSON (e.g. a Python-repr dict): keep the raw-text fallback.
+                api_response = {"error": error_str}
+        job.api_response = api_response
+        if is_retryable:
+            logger.warning(f"{log_msg}. Rescheduling.")
+            job.retry_count += 1
+            # Use a longer delay for these errors (30s by default)
+            job.next_process_at = datetime.utcnow() + timedelta(seconds=retry_delay_seconds)
+            if reset_to_queued:
+                job.status = "queued"
+            return False, f"Retryable error: {error_str}"
+        # Non-retryable: mark the job as terminally failed.
+        logger.error(f"Error processing job {job.job_id}: {error}")
+        job.status = "failed"
+        job.error_message = error_str
+        job.completed_at = datetime.utcnow()
+        return False, error_str
     async def process(self, job: GeminiJob, session: AsyncSession) -> GeminiJob:
         """Start processing a new job with round-robin API key."""
         key_index, service = await self._get_service_with_key(session)
                 job.completed_at = datetime.utcnow()
                 error_msg = job.error_message
         except Exception as e:
+            # Use helper for error handling
+            # reset_to_queued=True because if we fail to start, we want to try starting again from scratch
+            success, error_msg = self._handle_error(job, e, reset_to_queued=True)
         # Record usage
         await self._record_usage(session, key_index, success, error_msg)
         try:
             status_result = await service.check_video_status(job.third_party_id)
+            # Save raw response
+            job.api_response = status_result
             if status_result.get("done"):
                 if status_result.get("status") == "completed":
                 success = True  # Status check succeeded even if video not ready
         except Exception as e:
+            # Use helper for error handling
+            # reset_to_queued=False because we want to continue checking status, not restart
+            success, error_msg = self._handle_error(job, e, reset_to_queued=False)
+        # Record usage
+        await self._record_usage(session, key_index, success, error_msg)
+        return job
         # Record usage
         await self._record_usage(session, key_index, success, error_msg)
             number_of_videos=input_data.get("number_of_videos", 1)
         )
         job.third_party_id = result.get("gemini_operation_name")
+        job.api_response = result
         # Schedule first status check
         config = WorkerConfig.from_env()
         )
         job.status = "completed"
         job.output_data = {"image": result}
+        # Don't save full base64 image to api_response
+        job.api_response = {"status": "success", "type": "image_edit"}
         job.completed_at = datetime.utcnow()
         return job
         )
         job.status = "completed"
         job.output_data = {"text": result}
+        job.api_response = {"result": result}
         job.completed_at = datetime.utcnow()
         return job
         )
         job.status = "completed"
         job.output_data = {"analysis": result}
+        job.api_response = {"result": result}
         job.completed_at = datetime.utcnow()
         return job
         )
         job.status = "completed"
         job.output_data = {"prompt": result}
+        job.api_response = {"result": result}
         job.completed_at = datetime.utcnow()
         return job

services/gemini_service.py CHANGED Viewed

@@ -33,6 +33,7 @@ os.makedirs(DOWNLOADS_DIR, exist_ok=True)
 # Mock mode for local testing (set GEMINI_MOCK_MODE=true to skip real API calls)
 MOCK_MODE = os.getenv("GEMINI_MOCK_MODE", "false").lower() == "true"
 # Sample video URL for mock mode (a public test video)
 MOCK_VIDEO_URL = "https://video.twimg.com/amplify_video/1994083297756848128/vid/avc1/576x576/ue31qU0xts8L9tXD.mp4?tag=21"
@@ -116,7 +117,7 @@ class GeminiService:
         # Mock mode for testing
         if MOCK_MODE:
             logger.info("[MOCK MODE] Generating animation prompt")
-            await asyncio.sleep(0.5)  # Simulate API delay
             return "A gentle breeze rustles through the scene as soft light dances across the surface. The camera slowly zooms in with a subtle parallax effect, creating depth and movement."
         default_prompt = custom_prompt or "Describe how this image could be subtly animated with cinematic movement."
@@ -348,7 +349,7 @@ class GeminiService:
         # Mock mode for testing
         if MOCK_MODE:
             logger.info(f"[MOCK MODE] Generating text for prompt: {prompt[:50]}...")
-            await asyncio.sleep(0.5)  # Simulate API delay
             return f"This is a mock response for your prompt: '{prompt[:100]}...'. In production, this would be generated by Gemini AI."
         model_name = model or MODELS["text_generation"]
@@ -377,7 +378,7 @@ class GeminiService:
         # Mock mode for testing
         if MOCK_MODE:
             logger.info(f"[MOCK MODE] Analyzing image with prompt: {prompt[:50]}...")
-            await asyncio.sleep(0.5)  # Simulate API delay
             return f"Mock analysis result: The image appears to show a scene that matches your query '{prompt[:50]}...'. This is placeholder content for testing."
         async with get_text_semaphore():

 # Mock mode for local testing (set GEMINI_MOCK_MODE=true to skip real API calls)
 MOCK_MODE = os.getenv("GEMINI_MOCK_MODE", "false").lower() == "true"
+# Simulated API delay (seconds) used by the mock paths. os.getenv() returns a
+# string, but asyncio.sleep() requires a number, so convert here and fall back
+# to the 0.5s default when the variable is empty or not a valid float.
+try:
+    MOCK_MODE_SLEEP_TIME = float(os.getenv("GEMINI_MOCK_MODE_SLEEP_TIME", "0.5"))
+except ValueError:
+    MOCK_MODE_SLEEP_TIME = 0.5
+# Alias: the animation-prompt mock path awaits
+# asyncio.sleep(GEMINI_MOCK_MODE_SLEEP_TIME); without this name at module level
+# that path raises NameError.
+GEMINI_MOCK_MODE_SLEEP_TIME = MOCK_MODE_SLEEP_TIME
 # Sample video URL for mock mode (a public test video)
 MOCK_VIDEO_URL = "https://video.twimg.com/amplify_video/1994083297756848128/vid/avc1/576x576/ue31qU0xts8L9tXD.mp4?tag=21"
         # Mock mode for testing
         if MOCK_MODE:
             logger.info("[MOCK MODE] Generating animation prompt")
+            await asyncio.sleep(GEMINI_MOCK_MODE_SLEEP_TIME)  # Simulate API delay
             return "A gentle breeze rustles through the scene as soft light dances across the surface. The camera slowly zooms in with a subtle parallax effect, creating depth and movement."
         default_prompt = custom_prompt or "Describe how this image could be subtly animated with cinematic movement."
         # Mock mode for testing
         if MOCK_MODE:
             logger.info(f"[MOCK MODE] Generating text for prompt: {prompt[:50]}...")
+            await asyncio.sleep(MOCK_MODE_SLEEP_TIME)  # Simulate API delay
             return f"This is a mock response for your prompt: '{prompt[:100]}...'. In production, this would be generated by Gemini AI."
         model_name = model or MODELS["text_generation"]
         # Mock mode for testing
         if MOCK_MODE:
             logger.info(f"[MOCK MODE] Analyzing image with prompt: {prompt[:50]}...")
+            await asyncio.sleep(MOCK_MODE_SLEEP_TIME)  # Simulate API delay
             return f"Mock analysis result: The image appears to show a scene that matches your query '{prompt[:50]}...'. This is placeholder content for testing."
         async with get_text_semaphore():