File size: 3,318 Bytes
9b457ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""
Text processing utilities for chunking and token counting.

This module provides utilities for token counting using tiktoken and text preprocessing.
"""

import tiktoken
import re
from typing import List


# Global encoder instance (cached for performance).
# Lazily initialized by get_encoder(); stays None until first use so that
# importing this module never triggers a tiktoken vocabulary load.
_encoder = None


def get_encoder():
    """
    Return the shared tiktoken encoder, creating it on first use.

    The encoder is cached in the module-level ``_encoder`` so the
    (relatively expensive) vocabulary load happens at most once.

    Returns:
        tiktoken.Encoding: The cl100k_base encoding used by Claude
    """
    global _encoder
    if _encoder is not None:
        return _encoder
    _encoder = tiktoken.get_encoding("cl100k_base")
    return _encoder


def count_tokens(text: str) -> int:
    """
    Count tokens in text using tiktoken.

    Args:
        text: Input text to count tokens

    Returns:
        int: Number of tokens in the text (0 for empty/falsy input)
    """
    if text:
        return len(get_encoder().encode(text))
    return 0


def clean_text(text: str) -> str:
    """
    Normalize raw PDF-extracted text.

    Strips null bytes, collapses every run of whitespace (including
    newlines/tabs) into a single space, and trims both ends.

    Args:
        text: Raw text from PDF extraction

    Returns:
        str: Cleaned text; empty string for falsy input
    """
    if not text:
        return ""

    # Drop null bytes that PDF extractors sometimes emit.
    without_nulls = text.replace('\x00', '')

    # Collapse all whitespace runs to single spaces, then trim.
    collapsed = re.sub(r'\s+', ' ', without_nulls)
    return collapsed.strip()


def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences using simple heuristics.

    Prefers nltk's sentence tokenizer when nltk is importable (downloading
    the punkt_tab data on demand); otherwise falls back to a regex that
    splits after '.', '!' or '?' followed by whitespace.

    Args:
        text: Input text to split

    Returns:
        List[str]: Non-empty, whitespace-stripped sentences
    """
    if not text:
        return []

    # Try using nltk if available
    try:
        import nltk
        try:
            return nltk.sent_tokenize(text)
        except LookupError:
            # Punkt data not available; try to fetch it once.
            # Was a bare `except:` here, which would also swallow
            # SystemExit/KeyboardInterrupt — narrowed to Exception.
            try:
                nltk.download('punkt_tab', quiet=True)
                return nltk.sent_tokenize(text)
            except Exception:
                # Download or re-tokenize failed; fall through to regex.
                pass
    except ImportError:
        pass

    # Fallback: split after terminal punctuation followed by whitespace.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Filter out empty sentences and strip whitespace
    return [s.strip() for s in sentences if s.strip()]


def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
    """
    Truncate text to a maximum length.

    If truncation occurs, the suffix is appended and the result still fits
    within max_length. When max_length is too small to hold the suffix,
    the text is hard-cut without a suffix.

    Args:
        text: Text to truncate
        max_length: Maximum length in characters
        suffix: Suffix to add if text is truncated

    Returns:
        str: Text of length at most max_length
    """
    if len(text) <= max_length:
        return text

    if max_length <= len(suffix):
        # No room for the suffix. The original computed a negative slice
        # index here (text[:max_length - len(suffix)] + suffix), producing
        # output LONGER than max_length — hard-cut instead.
        return text[:max_length]

    return text[:max_length - len(suffix)] + suffix


def estimate_pages_from_text(text: str, chars_per_page: int = 2000) -> int:
    """
    Roughly estimate how many pages *text* would span.

    Uses floor division by the average characters-per-page, but never
    reports fewer than one page for non-empty text.

    Args:
        text: Input text
        chars_per_page: Average characters per page (default: 2000)

    Returns:
        int: Estimated number of pages (0 for empty input)
    """
    if not text:
        return 0

    pages = len(text) // chars_per_page
    return pages if pages >= 1 else 1