File size: 1,739 Bytes
f4cd1fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import time
from abc import ABC, abstractmethod
from collections import deque
from typing import Any

# Length, in seconds, of the sliding window used by InputTokenRateLimiter:
# usage entries older than this are dropped from its token window.
SECONDS_IN_MINUTE = 60

class BaseAgent(ABC):
    """Abstract base class defining the interface of an agent."""

    @abstractmethod
    def run(self, question: str, file_name: str = "", file_content: str = "") -> Any:
        """Handle ``question``; concrete agents must override this method.

        Because the method is abstract, ``BaseAgent`` itself cannot be
        instantiated. A subclass may still invoke this body through
        ``super().run(...)``: it prints a preview made of the first 50
        characters of ``question`` and returns a fixed placeholder string.
        ``file_name`` and ``file_content`` are not used by this body.
        """
        preview = question[:50]
        print(f"Agent received question (first 50 chars): {preview}...")
        return "BaseAgent default answer"

class InputTokenRateLimiter:
    """Sliding-window limiter for LLM input tokens per minute.

    The class is a singleton: every construction returns the same object, so
    all callers share one usage window. ``token_window`` is a deque of
    ``(timestamp, tokens_used)`` pairs, oldest first; entries older than
    ``SECONDS_IN_MINUTE`` seconds are discarded whenever the window is read
    or written.
    """

    _instance = None

    def __new__(cls, *args, **kwargs):
        """Return the shared instance, creating it on first use.

        Constructor arguments are accepted and ignored here so that
        ``InputTokenRateLimiter(max_tpm=...)`` reaches ``__init__`` instead
        of failing with ``TypeError`` in ``__new__``.
        """
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, max_tpm=50000):
        """Set the token budget and, on first construction, the usage window.

        Python calls ``__init__`` on every ``InputTokenRateLimiter(...)``
        expression, so ``max_tpm`` is re-applied to the shared instance each
        time, while the window is created only once and never reset here.

        Args:
            max_tpm: Maximum number of tokens allowed per sliding minute.
        """
        self.max_tpm = max_tpm
        if not hasattr(self, "_initialized"):
            self.token_window = deque()  # stores (timestamp, tokens_used)
            self._initialized = True

    def _update_queue(self, time_now):
        """Drop entries more than ``SECONDS_IN_MINUTE`` older than ``time_now``."""
        while self.token_window and time_now - self.token_window[0][0] > SECONDS_IN_MINUTE:
            self.token_window.popleft()

    def tokens_used_last_minute(self):
        """Return the total tokens recorded within the current window."""
        now = time.time()
        self._update_queue(now)
        return sum(tokens for _, tokens in self.token_window)

    def maybe_wait(self, tokens_expected_to_use):
        """Block until ``tokens_expected_to_use`` fits within the budget.

        Polls every 0.5 seconds, printing a notice on every tenth poll. If
        the request alone exceeds ``max_tpm``, waiting can never make it fit,
        so the call returns as soon as no usage remains in the window rather
        than looping forever.

        Args:
            tokens_expected_to_use: Tokens the caller is about to consume.
        """
        polls = 0
        while True:
            used = self.tokens_used_last_minute()
            if used + tokens_expected_to_use <= self.max_tpm:
                return
            if used == 0:
                # Nothing left to expire: the request by itself is over the
                # limit, so further sleeping would never terminate.
                print("Request exceeds the per-minute token limit on its own; proceeding without waiting.")
                return
            if polls % 10 == 0:
                print("Sleeping 0.5s to respect token limits...")
            time.sleep(0.5)
            polls += 1

    def add_tokens(self, tokens):
        """Record ``tokens`` as used now and prune expired entries."""
        now = time.time()
        self.token_window.append((now, tokens))
        self._update_queue(now)