Spaces:
Paused
Paused
Commit ·
bccd1a0
1
Parent(s): 28fe5c6
🔧 Fix tokenizer crash with unknown custom models
Browse files
- Add model-specific encoder fallback for unknown models
- Graceful error handling when tiktoken fails on custom models
- Use character fallback when tokenization fails
- Fix crash when using custom models like o3-mini
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
ai/tokenizers/openai_tokenizer.py
CHANGED
|
@@ -24,6 +24,18 @@ class OpenAITokenizer:
|
|
| 24 |
except:
|
| 25 |
pass
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
def count_tokens(self, prompt: Union[str, List[Dict[str, Any]]], model: str = "gpt-3.5-turbo") -> Dict[str, Any]:
|
| 28 |
"""Count tokens for OpenAI request with proper formatting."""
|
| 29 |
start_time = time.time()
|
|
@@ -58,7 +70,10 @@ class OpenAITokenizer:
|
|
| 58 |
|
| 59 |
def _count_chat_tokens(self, messages: List[Dict[str, Any]], model: str) -> Dict[str, Any]:
|
| 60 |
"""Count tokens for chat messages with proper OpenAI formatting."""
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
| 62 |
# Fallback for chat messages
|
| 63 |
total_chars = sum(len(str(msg.get("content", ""))) for msg in messages)
|
| 64 |
return {
|
|
@@ -99,7 +114,11 @@ class OpenAITokenizer:
|
|
| 99 |
if len(text_content) > 800000 or num_tokens > 200000:
|
| 100 |
raise ValueError("Content is too large to tokenize.")
|
| 101 |
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
if key == "name":
|
| 105 |
num_tokens += tokens_per_name
|
|
|
|
| 24 |
except:
|
| 25 |
pass
|
| 26 |
|
| 27 |
+
def _get_encoder_for_model(self, model: str):
    """Return the encoder to use for ``model``, falling back for unknown models.

    Args:
        model: Model name passed to ``tiktoken.encoding_for_model``.

    Returns:
        ``None`` when ``TIKTOKEN_AVAILABLE`` is falsy; otherwise the encoder
        returned by ``tiktoken.encoding_for_model(model)``, or ``self.encoder``
        when that lookup raises.
    """
    if not TIKTOKEN_AVAILABLE:
        return None

    try:
        # Try to get the model-specific encoder.
        return tiktoken.encoding_for_model(model)
    except Exception:
        # Fall back to the default encoder if the model is unknown or the
        # lookup fails for any other reason. ``Exception`` (rather than a
        # bare ``except``) keeps the best-effort fallback while letting
        # KeyboardInterrupt / SystemExit propagate.
        return self.encoder
|
| 38 |
+
|
| 39 |
def count_tokens(self, prompt: Union[str, List[Dict[str, Any]]], model: str = "gpt-3.5-turbo") -> Dict[str, Any]:
|
| 40 |
"""Count tokens for OpenAI request with proper formatting."""
|
| 41 |
start_time = time.time()
|
|
|
|
| 70 |
|
| 71 |
def _count_chat_tokens(self, messages: List[Dict[str, Any]], model: str) -> Dict[str, Any]:
|
| 72 |
"""Count tokens for chat messages with proper OpenAI formatting."""
|
| 73 |
+
# Get model-specific encoder with fallback
|
| 74 |
+
encoder = self._get_encoder_for_model(model)
|
| 75 |
+
|
| 76 |
+
if not encoder:
|
| 77 |
# Fallback for chat messages
|
| 78 |
total_chars = sum(len(str(msg.get("content", ""))) for msg in messages)
|
| 79 |
return {
|
|
|
|
| 114 |
if len(text_content) > 800000 or num_tokens > 200000:
|
| 115 |
raise ValueError("Content is too large to tokenize.")
|
| 116 |
|
| 117 |
+
try:
|
| 118 |
+
num_tokens += len(encoder.encode(text_content))
|
| 119 |
+
except Exception as e:
|
| 120 |
+
# Fallback to character count if tokenization fails
|
| 121 |
+
num_tokens += len(text_content) // 4
|
| 122 |
|
| 123 |
if key == "name":
|
| 124 |
num_tokens += tokens_per_name
|