MrSimple07 committed on
Commit
04b4160
·
1 Parent(s): 79a7114

token based chunking

Browse files
Files changed (1) hide show
  1. documents_prep.py +15 -7
documents_prep.py CHANGED
@@ -9,15 +9,23 @@ from config import CHUNK_SIZE, CHUNK_OVERLAP
9
 
10
 
11
  import tiktoken
 
12
 
13
  def count_tokens(text, model="gpt-3.5-turbo"):
14
- """Count tokens in text using tiktoken"""
15
  try:
16
- encoding = tiktoken.encoding_for_model(model)
17
- return len(encoding.encode(text))
 
 
18
  except:
19
- # Fallback: approximate 1 token = 4 characters for Russian/English text
20
- return len(text) // 4
 
 
 
 
 
21
 
22
  def chunk_document(doc, chunk_size=None, chunk_overlap=None):
23
  """Chunk document based on tokens instead of characters"""
@@ -76,7 +84,7 @@ def process_documents_with_chunking(documents):
76
  table_count += 1
77
  if doc_tokens > CHUNK_SIZE:
78
  large_tables_count += 1
79
- log_message(f"Large table found: {doc.metadata.get('table_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_tokens} tokens ({doc_chars} characters)")
80
 
81
  # Chunk large tables
82
  chunked_docs = chunk_document(doc)
@@ -111,7 +119,7 @@ def process_documents_with_chunking(documents):
111
  image_count += 1
112
  if doc_tokens > CHUNK_SIZE:
113
  large_images_count += 1
114
- log_message(f"Large image description found: {doc.metadata.get('image_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_tokens} tokens ({doc_chars} characters)")
115
 
116
  # Chunk large images
117
  chunked_docs = chunk_document(doc)
 
9
 
10
 
11
  import tiktoken
12
+ from transformers import AutoTokenizer
13
 
14
# Module-level tokenizer cache: AutoTokenizer.from_pretrained() reads from
# disk (and may hit the network on first use), which is far too slow to
# repeat on every count_tokens() call inside the document-processing loop.
_HF_TOKENIZER = None


def count_tokens(text, model="gpt-3.5-turbo"):
    """Count tokens in *text*, preferring a HF tokenizer for consistency.

    Resolution order:
      1. Cached Hugging Face tokenizer (``microsoft/DialoGPT-medium``).
      2. ``tiktoken`` encoding for *model*.
      3. Rough approximation of 1 token per 4 characters
         (reasonable for Russian/English text).

    Args:
        text: The string to tokenize. Empty/None-ish text counts as 0.
        model: OpenAI model name; used only for the tiktoken fallback.

    Returns:
        int: Estimated number of tokens in *text*.
    """
    global _HF_TOKENIZER
    if not text:
        # Avoid loading a tokenizer just to count an empty string.
        return 0
    try:
        if _HF_TOKENIZER is None:
            _HF_TOKENIZER = AutoTokenizer.from_pretrained(
                "microsoft/DialoGPT-medium"
            )
        return len(_HF_TOKENIZER.encode(text, add_special_tokens=False))
    except Exception:
        # Tokenizer unavailable (offline, missing package, bad cache) —
        # fall through to tiktoken. Narrowed from a bare ``except:`` so
        # KeyboardInterrupt/SystemExit still propagate.
        pass
    try:
        encoding = tiktoken.encoding_for_model(model)
        return len(encoding.encode(text))
    except Exception:
        # Final fallback: approximate 1 token = 4 characters.
        return len(text) // 4
29
 
30
  def chunk_document(doc, chunk_size=None, chunk_overlap=None):
31
  """Chunk document based on tokens instead of characters"""
 
84
  table_count += 1
85
  if doc_tokens > CHUNK_SIZE:
86
  large_tables_count += 1
87
+ log_message(f"Large table found: {doc.metadata.get('table_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_tokens} tokens")
88
 
89
  # Chunk large tables
90
  chunked_docs = chunk_document(doc)
 
119
  image_count += 1
120
  if doc_tokens > CHUNK_SIZE:
121
  large_images_count += 1
122
+ log_message(f"Large image description found: {doc.metadata.get('image_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_tokens} tokens")
123
 
124
  # Chunk large images
125
  chunked_docs = chunk_document(doc)