| from langchain_google_genai import ChatGoogleGenerativeAI |
| from langchain_core.runnables import RunnableConfig |
| from tenacity import ( |
| retry, |
| stop_after_attempt, |
| wait_exponential, |
| retry_if_exception_type |
| ) |
| from google.api_core import exceptions |
| import time |
|
|
| |
# Simple client-side throttle: at most REQUESTS_PER_MINUTE requests per
# minute, enforced by spacing consecutive calls MINIMUM_PAUSE seconds apart.
REQUESTS_PER_MINUTE = 10
MINIMUM_PAUSE = 60.0 / REQUESTS_PER_MINUTE  # seconds between consecutive requests
# Monotonic timestamp of the most recent request; -inf means "no request
# yet", so the very first call never sleeps.
last_request_time = float("-inf")


def rate_limit():
    """Sleep just long enough to keep calls under REQUESTS_PER_MINUTE per minute.

    Measures intervals with time.monotonic() instead of time.time(), so the
    throttle is immune to wall-clock jumps (NTP corrections, DST, manual
    clock changes) that could otherwise skip or massively extend the pause.

    NOTE(review): the module-level state is not lock-protected, so the limit
    is only reliable for single-threaded callers — confirm intended usage.
    """
    global last_request_time
    now = time.monotonic()
    elapsed = now - last_request_time
    if elapsed < MINIMUM_PAUSE:
        time.sleep(MINIMUM_PAUSE - elapsed)
    last_request_time = time.monotonic()
|
|
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=2, min=4, max=60),
    retry=retry_if_exception_type(exceptions.ResourceExhausted)
)
def _call_llm_with_retry(*args, **kwargs):
    """Wrapper function to implement retry logic.

    Rate-limits, then forwards all arguments to the ChatGoogleGenerativeAI
    constructor; tenacity retries up to 3 times (exponential backoff,
    4s-60s) when google.api_core ResourceExhausted is raised.
    """
    # Throttle so successive invocations of this wrapper are spaced out.
    rate_limit()
    # NOTE(review): this retries *instantiation* of ChatGoogleGenerativeAI,
    # not the API calls made later via invoke()/generate(). The constructor
    # is unlikely to raise ResourceExhausted, so the retry (and the rate
    # limit above) probably never guard real requests — consider wrapping
    # the model's invocation path instead. Confirm the original intent.
    return ChatGoogleGenerativeAI(*args, **kwargs)
|
|
def get_llm(
    llm_provider_api_key: str,
    model_name: str = "gemini-2.0-flash",
):
    """Build and return a rate-limited, retry-enabled Gemini chat model.

    Args:
        llm_provider_api_key: Google API key passed through to the client.
        model_name: Gemini model identifier to instantiate.
    """
    llm_kwargs = {
        "google_api_key": llm_provider_api_key,
        "temperature": 0.7,
        "max_retries": 3,
        "model": model_name,
        "convert_system_message_to_human": True,
    }
    return _call_llm_with_retry(**llm_kwargs)
|
|