"""Rate-limited, retry-enabled factory for Google Gemini chat models."""

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.runnables import RunnableConfig
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
)
from google.api_core import exceptions
import time

# Rate limiting settings: cap outbound requests at REQUESTS_PER_MINUTE by
# enforcing a minimum gap between consecutive calls.
REQUESTS_PER_MINUTE = 15
MINIMUM_PAUSE = 60.0 / REQUESTS_PER_MINUTE  # Minimum time between requests

# Monotonic timestamp of the most recent request. -inf guarantees the very
# first call never sleeps, regardless of the monotonic clock's epoch.
last_request_time = float("-inf")


def rate_limit():
    """Sleep just long enough to keep at most REQUESTS_PER_MINUTE calls/min.

    State is a module-level timestamp, so the limit is shared by every
    caller in this process.

    NOTE(review): not thread-safe — the read/update of ``last_request_time``
    is unguarded; confirm this module is only used from a single thread.
    """
    global last_request_time
    # time.monotonic() is immune to wall-clock adjustments (NTP, DST),
    # which would otherwise corrupt the computed gap between requests.
    current_time = time.monotonic()
    time_since_last_request = current_time - last_request_time
    if time_since_last_request < MINIMUM_PAUSE:
        time.sleep(MINIMUM_PAUSE - time_since_last_request)
    last_request_time = time.monotonic()


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=2, min=4, max=60),
    retry=retry_if_exception_type(exceptions.ResourceExhausted),
)
def _call_llm_with_retry(*args, **kwargs):
    """Construct a ``ChatGoogleGenerativeAI`` client with backoff on quota errors.

    Applies the module rate limiter before each attempt, then retries up to
    3 times with exponential backoff (4s–60s) when Google reports
    ``ResourceExhausted``.

    NOTE(review): this wraps only client *construction* — the API calls
    where quota errors typically surface happen later through the returned
    client (which carries its own ``max_retries``). Confirm that
    ``ResourceExhausted`` can actually be raised at construction time;
    otherwise this retry layer is inert.
    """
    rate_limit()  # Apply rate limiting before each attempt
    return ChatGoogleGenerativeAI(*args, **kwargs)


def get_llm(
    llm_provider_api_key: str,
    model_name: str = "gemini-2.0-flash-lite",  # Default model aligned with AGENT_MODEL_NAME
):
    """Return a rate-limited and retry-enabled LLM instance.

    Args:
        llm_provider_api_key: Google API key used to authenticate requests.
        model_name: Gemini model identifier to instantiate.

    Returns:
        A configured ``ChatGoogleGenerativeAI`` instance.
    """
    return _call_llm_with_retry(
        google_api_key=llm_provider_api_key,
        temperature=0.7,
        max_retries=3,  # Client-level retries for the actual API calls
        model=model_name,
        convert_system_message_to_human=True,
    )