File size: 2,981 Bytes
80a56d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
"""
Text processing utilities for ELAN-Bot application.
"""

import tiktoken
from typing import List, Tuple
from config.settings import DEFAULT_TOKENIZER_MODEL, CHUNK_SIZE


class TextProcessor:
    """Utility class for text processing operations on EAF file content."""

    def __init__(self, model: str = DEFAULT_TOKENIZER_MODEL):
        """
        Initialize the text processor.

        Args:
            model: The tokenizer model name handed to tiktoken.
        """
        self.model = model
        # Created lazily on first use (see _get_tokenizer) — tiktoken may
        # fetch encoding data, so we avoid that cost at construction time.
        self.tokenizer = None

    def _get_tokenizer(self):
        """Return the cached tiktoken encoding, creating it on first call."""
        if self.tokenizer is None:
            self.tokenizer = tiktoken.encoding_for_model(self.model)
        return self.tokenizer

    def split_eaf_content(
        self, 
        eaf_file: str, 
        chunk_size: int = CHUNK_SIZE
    ) -> Tuple[str, List[str]]:
        """
        Split EAF file content into smaller chunks based on token count.

        Args:
            eaf_file: The complete EAF file content, optionally preceded by
                free-text instructions before the "<?xml" declaration.
            chunk_size: Maximum number of tokens per chunk (must be > 0).

        Returns:
            Tuple containing (instructions, text_chunks) where:
            - instructions: Text before the XML content ("" if none)
            - text_chunks: List of XML chunks split by token count
        """
        # Separate initial instructions from XML content. When "<?xml" is
        # absent (find returns -1) or sits at position 0, the whole input
        # is treated as XML content and instructions stay empty.
        instructions = ""
        xml_start = eaf_file.find("<?xml")

        if xml_start > 0:
            instructions = eaf_file[:xml_start].strip()
            eaf_content = eaf_file[xml_start:]
        else:
            eaf_content = eaf_file

        # Tokenize once, then slice the token list into fixed-size windows
        # and decode each window back to text in a single pass. Note that
        # chunk boundaries fall on token edges, not XML element edges.
        tokenizer = self._get_tokenizer()
        tokens = tokenizer.encode(eaf_content)
        text_chunks = [
            tokenizer.decode(tokens[i:i + chunk_size])
            for i in range(0, len(tokens), chunk_size)
        ]

        return instructions, text_chunks

    @staticmethod
    def combine_chunks(processed_chunks: List[str]) -> str:
        """
        Combine processed chunks into a single string.

        Args:
            processed_chunks: List of processed chunk strings

        Returns:
            str: Combined content ("" for an empty list)
        """
        return "".join(processed_chunks)

    @staticmethod
    def is_xml_content(message: str) -> bool:
        """
        Check if the message contains XML/EAF content.

        The check is a plain case-sensitive substring scan for any of the
        markers "<?xml", "<eaf", or "<ANNOTATION".

        Args:
            message: The message to check

        Returns:
            bool: True if message contains XML content
        """
        xml_indicators = ("<?xml", "<eaf", "<ANNOTATION")
        return any(indicator in message for indicator in xml_indicators)