WasabiDrop Claude committed
Commit bccd1a0 · 1 Parent(s): 28fe5c6

🔧 Fix tokenizer crash with unknown custom models


- Add a model-specific encoder fallback for unknown models (sketched after this message)
- Handle errors gracefully when tiktoken fails on custom models
- Fall back to a character-based estimate when tokenization fails
- Fix the crash when using custom models such as o3-mini

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
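For context, `tiktoken.encoding_for_model()` raises a `KeyError` when it does not recognize a model name, which is the crash this commit works around. A minimal standalone sketch of the same fallback pattern (outside the repository's `OpenAITokenizer` class; names here are illustrative):

```python
import tiktoken

def resolve_encoder(model: str, default: str = "cl100k_base"):
    """Return a tiktoken encoder for `model`, falling back to a default encoding."""
    try:
        # Known model names (e.g. "gpt-3.5-turbo") resolve directly.
        return tiktoken.encoding_for_model(model)
    except KeyError:
        # Model names unknown to the installed tiktoken version raise KeyError;
        # fall back to a default encoding instead of crashing.
        return tiktoken.get_encoding(default)

enc = resolve_encoder("o3-mini")
print(len(enc.encode("Hello, world!")))  # token count from the fallback encoder
```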

Files changed (1)
  1. ai/tokenizers/openai_tokenizer.py +21 -2
ai/tokenizers/openai_tokenizer.py CHANGED
```diff
@@ -24,6 +24,18 @@ class OpenAITokenizer:
         except:
             pass
 
+    def _get_encoder_for_model(self, model: str):
+        """Get the appropriate encoder for a model, with fallback for unknown models."""
+        if not TIKTOKEN_AVAILABLE:
+            return None
+
+        try:
+            # Try to get model-specific encoder
+            return tiktoken.encoding_for_model(model)
+        except:
+            # Fallback to default encoder if model is unknown
+            return self.encoder
+
     def count_tokens(self, prompt: Union[str, List[Dict[str, Any]]], model: str = "gpt-3.5-turbo") -> Dict[str, Any]:
         """Count tokens for OpenAI request with proper formatting."""
         start_time = time.time()
@@ -58,7 +70,10 @@ class OpenAITokenizer:
 
     def _count_chat_tokens(self, messages: List[Dict[str, Any]], model: str) -> Dict[str, Any]:
         """Count tokens for chat messages with proper OpenAI formatting."""
-        if not self.encoder:
+        # Get model-specific encoder with fallback
+        encoder = self._get_encoder_for_model(model)
+
+        if not encoder:
             # Fallback for chat messages
             total_chars = sum(len(str(msg.get("content", ""))) for msg in messages)
             return {
@@ -99,7 +114,11 @@ class OpenAITokenizer:
                 if len(text_content) > 800000 or num_tokens > 200000:
                     raise ValueError("Content is too large to tokenize.")
 
-                num_tokens += len(self.encoder.encode(text_content))
+                try:
+                    num_tokens += len(encoder.encode(text_content))
+                except Exception as e:
+                    # Fallback to character count if tokenization fails
+                    num_tokens += len(text_content) // 4
 
                 if key == "name":
                     num_tokens += tokens_per_name
```
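The last hunk estimates tokens from character count when encoding fails, using the common rule of thumb of roughly four characters per token for English text. A hypothetical usage of the patched class follows; the constructor arguments and the exact keys of the returned dict are not visible in this diff, so they are assumptions:

```python
# Hypothetical call against the patched tokenizer; constructor details and the
# return shape are not shown in the diff above.
tokenizer = OpenAITokenizer()
result = tokenizer.count_tokens(
    [{"role": "user", "content": "Summarize this document."}],
    model="o3-mini",  # previously crashed for models unknown to tiktoken
)
print(result)  # token-count details, now computed via the fallback encoder
```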