opus / internal /tokenizer /tokenizer.go
llzai's picture
Upload 28 files
80ffd2e verified
raw
history blame contribute delete
783 Bytes
package tokenizer
import (
"log"
"github.com/pkoukk/tiktoken-go"
)
// encoding is the package-wide tiktoken encoder. It is nil until Init assigns
// it; CountTokens falls back to a length-based estimate while it is nil.
var encoding *tiktoken.Tiktoken
// Init loads the cl100k_base encoding via tiktoken.GetEncoding and stores the
// result in the package-level encoding variable.
//
// It is intended to be called once at startup so the encoding data is
// preloaded before CountTokens is used. On failure a warning is logged and the
// error from tiktoken.GetEncoding is returned unchanged; on success an info
// line is logged and nil is returned.
func Init() error {
	enc, err := tiktoken.GetEncoding("cl100k_base")
	// Store the result unconditionally, including on the error path, so the
	// package variable always reflects the most recent load attempt.
	encoding = enc

	if err == nil {
		log.Printf("[INFO] Tiktoken initialized with cl100k_base encoding")
		return nil
	}

	// NOTE(review): "using fallback" refers to CountTokens' estimate, which is
	// used only while encoding is nil — presumably GetEncoding returns nil on
	// error; confirm against the library.
	log.Printf("[WARN] Failed to initialize tiktoken: %v, using fallback", err)
	return err
}
// CountTokens returns the number of tokens in text.
//
// When the encoder has been loaded (encoding is non-nil), text is encoded and
// the length of the resulting token slice is returned. Otherwise the count is
// estimated at roughly one token per 4 bytes of text, rounded up so that any
// non-empty string counts as at least one token. An empty string yields 0 in
// the fallback path.
func CountTokens(text string) int {
	if encoding == nil {
		// Fallback estimate: ~4 bytes per token. len(text) is the UTF-8 byte
		// length, not the rune count. Ceiling division avoids reporting zero
		// tokens for inputs of 1-3 bytes.
		const bytesPerToken = 4
		return (len(text) + bytesPerToken - 1) / bytesPerToken
	}
	return len(encoding.Encode(text, nil, nil))
}