Spaces:
Sleeping
Sleeping
| """ | |
| Improved token counting using tiktoken. | |
| Provides accurate token estimates for different models. | |
| """ | |
import json
from typing import Any, Dict, List, Optional
class TokenCounter:
    """Count tokens accurately for different models.

    Uses ``tiktoken`` when it is installed; otherwise falls back to a
    simple characters-per-token heuristic (see :meth:`_fallback_count`).
    """

    # Map our model names to tiktoken encodings.  Hoisted to a class-level
    # constant so the dict is not rebuilt on every count_tokens() call.
    ENCODING_MAP: Dict[str, str] = {
        'claude-sonnet-4': 'cl100k_base',  # Claude uses similar to GPT-4
        'claude-opus-4': 'cl100k_base',
        'claude-haiku-4': 'cl100k_base',
        'gpt-4-turbo': 'cl100k_base',
        'gpt-4o': 'o200k_base',
        'gpt-3.5-turbo': 'cl100k_base',
        'gemini-pro': 'cl100k_base',  # Approximation
        'gemini-flash': 'cl100k_base',
    }

    # Encoding used when a model name is not present in ENCODING_MAP.
    DEFAULT_ENCODING = 'cl100k_base'

    def __init__(self) -> None:
        """Detect tiktoken availability and prepare the encoder cache."""
        self._tiktoken_available = False
        # Lazily-populated cache of tiktoken encoders, keyed by encoding name,
        # so each encoding is loaded at most once per TokenCounter instance.
        self._encoders: Dict[str, Any] = {}
        try:
            import tiktoken
            self._tiktoken_available = True
            self.tiktoken = tiktoken
        except ImportError:
            # Best-effort: keep working with the heuristic fallback.
            print("Warning: tiktoken not installed. Using fallback token counting.")
            print("Install with: pip install tiktoken")

    def count_tokens(self, text: str, model: str = 'claude-sonnet-4') -> int:
        """
        Count tokens for given text and model.

        Args:
            text: Text to count tokens for
            model: Model name (affects tokenization)

        Returns:
            Number of tokens
        """
        if not self._tiktoken_available:
            return self._fallback_count(text)

        encoding_name = self.ENCODING_MAP.get(model, self.DEFAULT_ENCODING)

        # Get or create the cached encoder for this encoding.
        if encoding_name not in self._encoders:
            try:
                self._encoders[encoding_name] = self.tiktoken.get_encoding(encoding_name)
            except Exception as e:
                print(f"Warning: Could not load encoding {encoding_name}: {e}")
                return self._fallback_count(text)
        encoder = self._encoders[encoding_name]

        try:
            return len(encoder.encode(text))
        except Exception as e:
            print(f"Warning: Token counting failed: {e}")
            return self._fallback_count(text)

    def count_tokens_for_dict(self, data: Dict[str, Any], model: str = 'claude-sonnet-4') -> int:
        """
        Count tokens for a dictionary (API response).

        Args:
            data: Dictionary to count tokens for (must be JSON-serializable;
                  ``json.dumps`` raises TypeError otherwise)
            model: Model name

        Returns:
            Number of tokens
        """
        # Serialize compactly (no indentation) so whitespace does not
        # inflate the token estimate.
        text = json.dumps(data, indent=None)
        return self.count_tokens(text, model)

    def _fallback_count(self, text: str) -> int:
        """
        Fallback token counting when tiktoken is unavailable.

        Simple heuristic: ~4 characters per token for English text.
        Any non-empty text counts as at least one token (plain integer
        division reported 0 tokens for texts shorter than 4 characters).
        """
        if not text:
            return 0
        return max(1, len(text) // 4)

    def count_tokens_detailed(self, text: str, model: str = 'claude-sonnet-4') -> Dict:
        """
        Get detailed token count information.

        Returns:
            Dictionary with total token count, character count, average
            characters per token, the model name, and the counting method
            ('tiktoken' when available, else 'fallback').
        """
        total_tokens = self.count_tokens(text, model)
        return {
            'total_tokens': total_tokens,
            'character_count': len(text),
            # Guard against division by zero for empty input.
            'avg_chars_per_token': len(text) / total_tokens if total_tokens > 0 else 0,
            'model': model,
            # NOTE(review): reports 'tiktoken' whenever the library is
            # importable, even if count_tokens fell back because the
            # encoding failed to load — a rare, best-effort inaccuracy.
            'method': 'tiktoken' if self._tiktoken_available else 'fallback'
        }

    def compare_models(self, text: str, models: Optional[List[str]] = None) -> Dict[str, int]:
        """
        Compare token counts across different models.

        Args:
            text: Text to count
            models: List of model names to compare (defaults to a
                    representative cross-vendor selection)

        Returns:
            Dictionary mapping model names to token counts
        """
        if models is None:
            models = ['claude-sonnet-4', 'gpt-4-turbo', 'gpt-4o', 'gemini-pro']
        return {model: self.count_tokens(text, model) for model in models}
# Process-wide singleton, created lazily on first access.
_token_counter = None


def get_token_counter() -> TokenCounter:
    """Return the shared TokenCounter, constructing it on first use."""
    global _token_counter
    if _token_counter is None:
        # First caller pays the one-time tiktoken detection cost.
        _token_counter = TokenCounter()
    return _token_counter