from langchain_google_genai import ChatGoogleGenerativeAI
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
)
from google.api_core import exceptions
import time

# Pace outgoing requests so we never exceed the configured quota.
REQUESTS_PER_MINUTE = 15
MINIMUM_PAUSE = 60.0 / REQUESTS_PER_MINUTE
last_request_time = 0.0


def rate_limit():
    """Sleep just long enough to stay under REQUESTS_PER_MINUTE."""
    global last_request_time
    current_time = time.time()
    time_since_last_request = current_time - last_request_time
    if time_since_last_request < MINIMUM_PAUSE:
        time.sleep(MINIMUM_PAUSE - time_since_last_request)
    last_request_time = time.time()


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=2, min=4, max=60),
    retry=retry_if_exception_type(exceptions.ResourceExhausted),
)
def _invoke_with_retry(llm, *args, **kwargs):
    """Invoke the LLM with rate limiting and exponential-backoff retries.

    The retry wraps invocation rather than construction: instantiating
    ChatGoogleGenerativeAI makes no network call, so quota errors
    (ResourceExhausted) can only surface when the model is invoked.
    """
    rate_limit()
    return llm.invoke(*args, **kwargs)


def get_llm(
    llm_provider_api_key: str,
    model_name: str = "gemini-2.0-flash-lite",
) -> ChatGoogleGenerativeAI:
    """Build an LLM instance; route calls through _invoke_with_retry."""
    return ChatGoogleGenerativeAI(
        google_api_key=llm_provider_api_key,
        temperature=0.7,
        max_retries=3,  # client-level retries, layered under the tenacity wrapper
        model=model_name,
        # Fold system messages into the first human message, for Gemini
        # models that lack a native system role.
        convert_system_message_to_human=True,
    )
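

# --- Usage sketch (illustrative; not part of the original source) ---
# Assumes a valid Google API key; the prompt string is arbitrary. Each
# call passes through rate_limit() and is retried with exponential
# backoff if the Gemini API raises ResourceExhausted (HTTP 429).
if __name__ == "__main__":
    llm = get_llm(llm_provider_api_key="YOUR_GOOGLE_API_KEY")  # placeholder key
    response = _invoke_with_retry(llm, "Explain rate limiting in one sentence.")
    print(response.content)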