| | """ |
| | Text processing utilities for chunking and token counting. |
| | |
| | This module provides utilities for token counting using tiktoken and text preprocessing. |
| | """ |
| |
|
| | import tiktoken |
| | import re |
| | from typing import List |
| |
|
| |
|
| | |
_encoder = None  # process-wide cached tiktoken.Encoding; built lazily by get_encoder()


def get_encoder():
    """
    Lazily create and cache the module-level tiktoken encoder (Claude-compatible).

    The encoder is built once on first call and reused afterwards.

    Returns:
        tiktoken.Encoding: The cl100k_base encoding used by Claude
    """
    global _encoder
    if _encoder is not None:
        return _encoder
    _encoder = tiktoken.get_encoding("cl100k_base")
    return _encoder
| |
|
| |
|
def count_tokens(text: str) -> int:
    """
    Count tokens in text using tiktoken.

    Args:
        text: Input text to count tokens

    Returns:
        int: Number of tokens in the text (0 for empty/falsy input)
    """
    if not text:
        return 0
    return len(get_encoder().encode(text))
| |
|
| |
|
def clean_text(text: str) -> str:
    """
    Clean extracted PDF text.

    Strips null bytes and collapses every run of whitespace (spaces, tabs,
    newlines) into a single space, then trims the ends.

    Args:
        text: Raw text from PDF extraction

    Returns:
        str: Cleaned text ("" for empty/falsy input)
    """
    if not text:
        return ""

    # Null bytes sometimes survive PDF extraction; drop them first.
    without_nulls = text.replace('\x00', '')

    # Collapse all whitespace runs (including line breaks) to single spaces.
    collapsed = re.sub(r'\s+', ' ', without_nulls)

    return collapsed.strip()
| |
|
| |
|
def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences using simple heuristics.

    Prefers nltk's sentence tokenizer when it is installed (quietly
    downloading the 'punkt_tab' model on first use if it is missing);
    otherwise falls back to a regex split on sentence-ending punctuation.

    Args:
        text: Input text to split

    Returns:
        List[str]: List of non-empty, whitespace-stripped sentences
    """
    if not text:
        return []

    try:
        import nltk

        try:
            return nltk.sent_tokenize(text)
        except LookupError:
            # Tokenizer model not present locally -- attempt a one-time
            # quiet download, then retry.
            try:
                nltk.download('punkt_tab', quiet=True)
                return nltk.sent_tokenize(text)
            except Exception:
                # Fix: was a bare `except:`, which also swallowed SystemExit
                # and KeyboardInterrupt. Download/tokenization failed (e.g.
                # offline) -- fall through to the regex fallback below.
                pass
    except ImportError:
        # nltk not installed; use the regex fallback.
        pass

    # Fallback: split after '.', '!' or '?' followed by whitespace.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Drop empty fragments and trim surrounding whitespace.
    return [s.strip() for s in sentences if s.strip()]
| |
|
| |
|
def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
    """
    Truncate text to a maximum length, appending a suffix when cut.

    Args:
        text: Text to truncate
        max_length: Maximum length in characters of the returned string
        suffix: Suffix to add if text is truncated

    Returns:
        str: Text of at most max_length characters
    """
    if len(text) <= max_length:
        return text

    # Fix: when max_length <= len(suffix), the original computed a
    # non-positive slice bound (e.g. text[:-1] + "...") and could return a
    # string LONGER than max_length. Hard-cut instead of appending.
    if max_length <= len(suffix):
        return text[:max_length]

    return text[:max_length - len(suffix)] + suffix
| |
|
| |
|
def estimate_pages_from_text(text: str, chars_per_page: int = 2000) -> int:
    """
    Estimate number of pages from text length.

    Args:
        text: Input text
        chars_per_page: Average characters per page (default: 2000)

    Returns:
        int: Estimated page count (0 for empty text, otherwise at least 1)
    """
    if not text:
        return 0

    # Floor-divide the character count, but never report less than one page
    # for non-empty text.
    full_pages = len(text) // chars_per_page
    return full_pages if full_pages >= 1 else 1
| |
|