# zeta/src/utils/text_utils.py
# rodrigo-moonray
# Deploy zeta-only embeddings (NV-Embed-v2 + E5-small)
# 9b457ed
"""
Text processing utilities for chunking and token counting.
This module provides utilities for token counting using tiktoken and text preprocessing.
"""
import tiktoken
import re
from typing import List
# Global encoder instance (cached for performance)
_encoder = None
def get_encoder():
    """
    Get or create the cached tiktoken encoder.

    NOTE: cl100k_base is the encoding used by OpenAI's GPT-3.5/GPT-4
    family, not Anthropic's Claude models; it is used here as a proxy
    for approximate token counting.

    Returns:
        tiktoken.Encoding: The cl100k_base encoding instance (cached
        in the module-level ``_encoder`` so it is built only once).
    """
    global _encoder
    # Lazy initialization: encoder construction is relatively expensive,
    # so build it on first use and reuse it afterwards.
    if _encoder is None:
        _encoder = tiktoken.get_encoding("cl100k_base")
    return _encoder
def count_tokens(text: str) -> int:
    """
    Count the number of tokens in *text* using tiktoken.

    Args:
        text: Input text whose tokens should be counted.

    Returns:
        int: Token count; 0 for empty or falsy input.
    """
    if not text:
        return 0
    return len(get_encoder().encode(text))
def clean_text(text: str) -> str:
    """
    Clean raw text extracted from a PDF.

    Strips null bytes, collapses every run of whitespace (including
    newlines and tabs) into a single space, and trims the result.

    Args:
        text: Raw text from PDF extraction.

    Returns:
        str: Cleaned text, or "" for empty/falsy input.
    """
    if not text:
        return ""
    # Drop null bytes first, then normalize whitespace in a single pass.
    without_nulls = text.replace('\x00', '')
    collapsed = re.sub(r'\s+', ' ', without_nulls)
    return collapsed.strip()
def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences.

    Uses nltk's sentence tokenizer when the package is importable
    (downloading the 'punkt_tab' model on demand); otherwise falls back
    to a simple regex split on sentence-ending punctuation.

    Args:
        text: Input text to split.

    Returns:
        List[str]: Non-empty, stripped sentences ([] for empty input).
    """
    if not text:
        return []
    # Prefer nltk when available; it handles abbreviations etc. better
    # than the regex fallback.
    try:
        import nltk
        try:
            return nltk.sent_tokenize(text)
        except LookupError:
            # Punkt model data not present; try to fetch it once.
            try:
                nltk.download('punkt_tab', quiet=True)
                return nltk.sent_tokenize(text)
            except Exception:
                # Narrowed from a bare `except:` so SystemExit and
                # KeyboardInterrupt still propagate. Download or
                # re-tokenization failed (e.g. no network); fall through
                # to the regex-based splitter.
                pass
    except ImportError:
        # nltk not installed; use the regex fallback.
        pass
    # Fallback: split after '.', '!' or '?' followed by whitespace.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    # Drop empty fragments and strip surrounding whitespace.
    sentences = [s.strip() for s in sentences if s.strip()]
    return sentences
def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
    """
    Truncate text to at most *max_length* characters.

    Args:
        text: Text to truncate.
        max_length: Maximum length in characters.
        suffix: Suffix appended when the text is truncated.

    Returns:
        str: The original text when it already fits, otherwise a
        truncated version ending in *suffix*. When ``max_length`` is
        smaller than ``len(suffix)`` the result is just *suffix* (the
        minimum meaningful output).
    """
    if len(text) <= max_length:
        return text
    # Reserve room for the suffix. Clamp at 0: without the clamp a
    # suffix longer than max_length yields a NEGATIVE slice index, which
    # keeps most of the text and returns a string LONGER than max_length
    # (e.g. max_length=2 -> text[:-1] + "...").
    keep = max(0, max_length - len(suffix))
    return text[:keep] + suffix
def estimate_pages_from_text(text: str, chars_per_page: int = 2000) -> int:
    """
    Estimate the page count of *text* from its character length.

    Args:
        text: Input text.
        chars_per_page: Average characters per page (default: 2000).

    Returns:
        int: Estimated page count — 0 for empty input, at least 1
        otherwise.
    """
    if not text:
        return 0
    # Floor-divide, but never report fewer than one page for non-empty text.
    estimated = len(text) // chars_per_page
    return estimated if estimated >= 1 else 1