MrSimple07 committed on
Commit
04b4160
·
1 Parent(s): 79a7114

token based chunking

Browse files
Files changed (1) hide show
  1. documents_prep.py +15 -7
documents_prep.py CHANGED
@@ -9,15 +9,23 @@ from config import CHUNK_SIZE, CHUNK_OVERLAP
9
 
10
 
11
  import tiktoken
 
12
 
13
  def count_tokens(text, model="gpt-3.5-turbo"):
14
- """Count tokens in text using tiktoken"""
15
  try:
16
- encoding = tiktoken.encoding_for_model(model)
17
- return len(encoding.encode(text))
 
 
18
  except:
19
- # Fallback: approximate 1 token = 4 characters for Russian/English text
20
- return len(text) // 4
 
 
 
 
 
21
 
22
  def chunk_document(doc, chunk_size=None, chunk_overlap=None):
23
  """Chunk document based on tokens instead of characters"""
@@ -76,7 +84,7 @@ def process_documents_with_chunking(documents):
76
  table_count += 1
77
  if doc_tokens > CHUNK_SIZE:
78
  large_tables_count += 1
79
- log_message(f"Large table found: {doc.metadata.get('table_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_tokens} tokens ({doc_chars} characters)")
80
 
81
  # Chunk large tables
82
  chunked_docs = chunk_document(doc)
@@ -111,7 +119,7 @@ def process_documents_with_chunking(documents):
111
  image_count += 1
112
  if doc_tokens > CHUNK_SIZE:
113
  large_images_count += 1
114
- log_message(f"Large image description found: {doc.metadata.get('image_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_tokens} tokens ({doc_chars} characters)")
115
 
116
  # Chunk large images
117
  chunked_docs = chunk_document(doc)
 
9
 
10
 
11
  import tiktoken
12
+ from transformers import AutoTokenizer
13
 
14
# Module-level tokenizer cache: AutoTokenizer.from_pretrained() reads from
# disk (and may hit the network on first use), which is far too slow to
# repeat on every count_tokens() call inside the document-processing loop.
_HF_TOKENIZER = None


def count_tokens(text, model="gpt-3.5-turbo"):
    """Count tokens in *text*, preferring a HF tokenizer for consistency.

    Resolution order:
      1. Cached Hugging Face tokenizer (``microsoft/DialoGPT-medium``).
      2. ``tiktoken`` encoding for *model*.
      3. Rough approximation of 1 token per 4 characters
         (reasonable for Russian/English text).

    Args:
        text: The string to tokenize. Empty/None-ish text counts as 0.
        model: OpenAI model name; used only for the tiktoken fallback.

    Returns:
        int: Estimated number of tokens in *text*.
    """
    global _HF_TOKENIZER
    if not text:
        # Avoid loading a tokenizer just to count an empty string.
        return 0
    try:
        if _HF_TOKENIZER is None:
            _HF_TOKENIZER = AutoTokenizer.from_pretrained(
                "microsoft/DialoGPT-medium"
            )
        return len(_HF_TOKENIZER.encode(text, add_special_tokens=False))
    except Exception:
        # Tokenizer unavailable (offline, missing package, bad cache) —
        # fall through to tiktoken. Narrowed from a bare ``except:`` so
        # KeyboardInterrupt/SystemExit still propagate.
        pass
    try:
        encoding = tiktoken.encoding_for_model(model)
        return len(encoding.encode(text))
    except Exception:
        # Final fallback: approximate 1 token = 4 characters.
        return len(text) // 4
29
 
30
  def chunk_document(doc, chunk_size=None, chunk_overlap=None):
31
  """Chunk document based on tokens instead of characters"""
 
84
  table_count += 1
85
  if doc_tokens > CHUNK_SIZE:
86
  large_tables_count += 1
87
+ log_message(f"Large table found: {doc.metadata.get('table_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_tokens} tokens")
88
 
89
  # Chunk large tables
90
  chunked_docs = chunk_document(doc)
 
119
  image_count += 1
120
  if doc_tokens > CHUNK_SIZE:
121
  large_images_count += 1
122
+ log_message(f"Large image description found: {doc.metadata.get('image_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_tokens} tokens")
123
 
124
  # Chunk large images
125
  chunked_docs = chunk_document(doc)