"""
Text processing utilities for ELAN-Bot application.
"""
import tiktoken
from typing import List, Tuple
from config.settings import DEFAULT_TOKENIZER_MODEL, CHUNK_SIZE
class TextProcessor:
    """Utility class for text processing operations."""

    def __init__(self, model: str = DEFAULT_TOKENIZER_MODEL):
        """
        Initialize the text processor.

        Args:
            model: The tokenizer model to use
        """
        self.model = model
        # Lazily created on first use so construction never touches tiktoken.
        self.tokenizer = None

    def _get_tokenizer(self):
        """Get (creating and caching on first call) the tiktoken encoder for self.model."""
        if self.tokenizer is None:
            self.tokenizer = tiktoken.encoding_for_model(self.model)
        return self.tokenizer

    def split_eaf_content(
        self,
        eaf_file: str,
        chunk_size: int = CHUNK_SIZE
    ) -> Tuple[str, List[str]]:
        """
        Split EAF file content into smaller chunks based on token count.

        Args:
            eaf_file: The complete EAF file content
            chunk_size: Maximum number of tokens per chunk; must be positive

        Returns:
            Tuple containing (instructions, text_chunks) where:
            - instructions: Text before the XML content ("" if the file starts
              with the XML declaration or contains none)
            - text_chunks: List of decoded chunks of at most chunk_size tokens
              each (empty list when the content tokenizes to nothing)

        Raises:
            ValueError: If chunk_size is not positive.
        """
        if chunk_size <= 0:
            # A zero value would previously surface as an opaque range() error
            # and a negative one silently produced no chunks; fail loudly.
            raise ValueError("chunk_size must be a positive integer")

        # Separate initial instructions from XML content. When the XML
        # declaration is at position 0 or absent, everything is content.
        instructions = ""
        xml_start = eaf_file.find("<?xml")
        if xml_start > 0:
            instructions = eaf_file[:xml_start].strip()
            eaf_content = eaf_file[xml_start:]
        else:
            eaf_content = eaf_file

        # Tokenize once, then decode each fixed-size token slice straight back
        # to text — no intermediate list of token chunks.
        tokenizer = self._get_tokenizer()
        tokens = tokenizer.encode(eaf_content)
        text_chunks = [
            tokenizer.decode(tokens[i:i + chunk_size])
            for i in range(0, len(tokens), chunk_size)
        ]
        return instructions, text_chunks

    @staticmethod
    def combine_chunks(processed_chunks: List[str]) -> str:
        """
        Combine processed chunks into a single string.

        Args:
            processed_chunks: List of processed chunk strings

        Returns:
            str: Combined content ("" for an empty list)
        """
        return "".join(processed_chunks)

    @staticmethod
    def is_xml_content(message: str) -> bool:
        """
        Check if the message contains XML/EAF content.

        Args:
            message: The message to check

        Returns:
            bool: True if message contains any known XML/EAF indicator
        """
        # Case-sensitive by design: these are the literal markers emitted by
        # ELAN ("<ANNOTATION..." elements are uppercase in EAF files).
        xml_indicators = ("<?xml", "<eaf", "<ANNOTATION")
        return any(indicator in message for indicator in xml_indicators)