Spaces:
Sleeping
Sleeping
Commit
·
04b4160
1
Parent(s):
79a7114
token based chunking
Browse files- documents_prep.py +15 -7
documents_prep.py
CHANGED
|
@@ -9,15 +9,23 @@ from config import CHUNK_SIZE, CHUNK_OVERLAP
|
|
| 9 |
|
| 10 |
|
| 11 |
import tiktoken
|
|
|
|
| 12 |
|
| 13 |
def count_tokens(text, model="gpt-3.5-turbo"):
|
| 14 |
-
"""Count tokens in text using
|
| 15 |
try:
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
| 18 |
except:
|
| 19 |
-
# Fallback
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
def chunk_document(doc, chunk_size=None, chunk_overlap=None):
|
| 23 |
"""Chunk document based on tokens instead of characters"""
|
|
@@ -76,7 +84,7 @@ def process_documents_with_chunking(documents):
|
|
| 76 |
table_count += 1
|
| 77 |
if doc_tokens > CHUNK_SIZE:
|
| 78 |
large_tables_count += 1
|
| 79 |
-
log_message(f"Large table found: {doc.metadata.get('table_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_tokens} tokens
|
| 80 |
|
| 81 |
# Chunk large tables
|
| 82 |
chunked_docs = chunk_document(doc)
|
|
@@ -111,7 +119,7 @@ def process_documents_with_chunking(documents):
|
|
| 111 |
image_count += 1
|
| 112 |
if doc_tokens > CHUNK_SIZE:
|
| 113 |
large_images_count += 1
|
| 114 |
-
log_message(f"Large image description found: {doc.metadata.get('image_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_tokens} tokens
|
| 115 |
|
| 116 |
# Chunk large images
|
| 117 |
chunked_docs = chunk_document(doc)
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
import tiktoken
|
| 12 |
+
from transformers import AutoTokenizer
|
| 13 |
|
| 14 |
_HF_TOKENIZER = None  # cached HF tokenizer; from_pretrained() is expensive to run per call


def _get_hf_tokenizer():
    """Load the HF tokenizer once and reuse it on every subsequent call."""
    global _HF_TOKENIZER
    if _HF_TOKENIZER is None:
        _HF_TOKENIZER = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    return _HF_TOKENIZER


def count_tokens(text, model="gpt-3.5-turbo"):
    """Count tokens in text using HF tokenizer for better accuracy.

    Falls back to tiktoken's model-specific encoding when the HF tokenizer
    is unavailable, and finally to a rough 4-characters-per-token estimate,
    so a count is always returned.

    Args:
        text: String to measure.
        model: Model name used for tiktoken's encoding lookup (fallback path only).

    Returns:
        int: Number of tokens (exact or approximate).
    """
    try:
        # Use a simple HF tokenizer for more consistent results; cached so
        # repeated calls do not reload the pretrained tokenizer each time.
        tokenizer = _get_hf_tokenizer()
        return len(tokenizer.encode(text, add_special_tokens=False))
    except Exception:
        # Fallback to tiktoken (e.g. transformers missing or model download
        # failed). `except Exception:` instead of a bare `except:` so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        try:
            encoding = tiktoken.encoding_for_model(model)
            return len(encoding.encode(text))
        except Exception:
            # Final fallback: approximate 1 token = 4 characters
            return len(text) // 4
|
| 29 |
|
| 30 |
def chunk_document(doc, chunk_size=None, chunk_overlap=None):
|
| 31 |
"""Chunk document based on tokens instead of characters"""
|
|
|
|
| 84 |
table_count += 1
|
| 85 |
if doc_tokens > CHUNK_SIZE:
|
| 86 |
large_tables_count += 1
|
| 87 |
+
log_message(f"Large table found: {doc.metadata.get('table_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_tokens} tokens")
|
| 88 |
|
| 89 |
# Chunk large tables
|
| 90 |
chunked_docs = chunk_document(doc)
|
|
|
|
| 119 |
image_count += 1
|
| 120 |
if doc_tokens > CHUNK_SIZE:
|
| 121 |
large_images_count += 1
|
| 122 |
+
log_message(f"Large image description found: {doc.metadata.get('image_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_tokens} tokens")
|
| 123 |
|
| 124 |
# Chunk large images
|
| 125 |
chunked_docs = chunk_document(doc)
|