MrSimple07 committed on
Commit
dd15743
·
1 Parent(s): 04b4160

token based chunking 2

Browse files
Files changed (1) hide show
  1. documents_prep.py +9 -5
documents_prep.py CHANGED
@@ -12,14 +12,18 @@ import tiktoken
12
  from transformers import AutoTokenizer
13
 
14
  def count_tokens(text, model="gpt-3.5-turbo"):
15
- """Count tokens in text using HF tokenizer for better accuracy"""
16
  try:
17
- # Use a simple HF tokenizer for more consistent results
18
  tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
19
- tokens = tokenizer.encode(text, add_special_tokens=False)
20
- return len(tokens)
 
 
 
 
 
 
 
21
  except:
22
- # Fallback to tiktoken
23
  try:
24
  encoding = tiktoken.encoding_for_model(model)
25
  return len(encoding.encode(text))
 
12
  from transformers import AutoTokenizer
13
 
14
  def count_tokens(text, model="gpt-3.5-turbo"):
 
15
  try:
 
16
  tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
17
+ max_chunk_size = 2048
18
+ total_tokens = 0
19
+
20
+ for i in range(0, len(text), max_chunk_size * 4): # Approximate 4 chars per token
21
+ chunk = text[i:i + max_chunk_size * 4]
22
+ tokens = tokenizer.encode(chunk, add_special_tokens=False, truncation=True, max_length=1024)
23
+ total_tokens += len(tokens)
24
+
25
+ return total_tokens
26
  except:
 
27
  try:
28
  encoding = tiktoken.encoding_for_model(model)
29
  return len(encoding.encode(text))