Spaces:
Sleeping
Sleeping
Commit
·
dd15743
1
Parent(s):
04b4160
token based chunking 2
Browse files — documents_prep.py (+9, −5)
documents_prep.py
CHANGED
|
_HF_TOKENIZER = None  # lazily-loaded, shared HF tokenizer (from_pretrained is expensive)


def _get_hf_tokenizer():
    """Load the Hugging Face tokenizer once and reuse it across calls."""
    global _HF_TOKENIZER
    if _HF_TOKENIZER is None:
        _HF_TOKENIZER = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    return _HF_TOKENIZER


def count_tokens(text, model="gpt-3.5-turbo"):
    """Count tokens in *text*, preferring an HF tokenizer for consistency.

    Long inputs are processed in character chunks (~4 chars per token
    heuristic) so arbitrarily large documents can be counted without hitting
    tokenizer sequence limits. Falls back to tiktoken's encoding for *model*
    if the HF tokenizer is unavailable.

    Args:
        text: String to count tokens for.
        model: Model name, used only for the tiktoken fallback encoding.

    Returns:
        int: Approximate token count; 0 for empty input.
    """
    if not text:
        return 0

    max_chunk_size = 2048
    chunk_chars = max_chunk_size * 4  # approximate 4 characters per token

    try:
        tokenizer = _get_hf_tokenizer()
        total_tokens = 0
        for start in range(0, len(text), chunk_chars):
            chunk = text[start:start + chunk_chars]
            # BUG FIX: the previous version passed truncation=True with
            # max_length=1024 while feeding ~8192-char chunks, silently
            # discarding every token past 1024 per chunk and undercounting.
            # Count the whole chunk instead.
            total_tokens += len(tokenizer.encode(chunk, add_special_tokens=False))
        return total_tokens
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed. Fall back to tiktoken.
        try:
            encoding = tiktoken.encoding_for_model(model)
            return len(encoding.encode(text))
        except Exception:
            # NOTE(review): the original final fallback was cut off in the
            # visible diff; a chars/4 estimate is a conservative
            # reconstruction — confirm against the full file.
            return max(1, len(text) // 4)