HipFil98 commited on
Commit
80a56d9
·
verified ·
1 Parent(s): 626b31f

Create text_processing.py

Browse files
Files changed (1) hide show
  1. utils/text_processing.py +99 -0
utils/text_processing.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text processing utilities for ELAN-Bot application.
3
+ """
4
+
5
+ import tiktoken
6
+ from typing import List, Tuple
7
+ from config.settings import DEFAULT_TOKENIZER_MODEL, CHUNK_SIZE
8
+
9
+
10
class TextProcessor:
    """Utility class for text processing operations."""

    def __init__(self, model: str = DEFAULT_TOKENIZER_MODEL):
        """
        Initialize the text processor.

        Args:
            model: The tokenizer model to use
        """
        self.model = model
        # Tokenizer is built lazily on first use (see _get_tokenizer).
        self.tokenizer = None

    def _get_tokenizer(self):
        """Return the cached tokenizer, creating it on the first call."""
        if self.tokenizer is None:
            self.tokenizer = tiktoken.encoding_for_model(self.model)
        return self.tokenizer

    def split_eaf_content(
        self,
        eaf_file: str,
        chunk_size: int = CHUNK_SIZE
    ) -> Tuple[str, List[str]]:
        """
        Split EAF file content into smaller chunks based on token count.

        Args:
            eaf_file: The complete EAF file content
            chunk_size: Maximum number of tokens per chunk

        Returns:
            Tuple containing (instructions, text_chunks) where:
                - instructions: Text before the XML content
                - text_chunks: List of XML chunks split by token count
        """
        # Anything preceding the XML declaration is treated as free-text
        # instructions; the remainder is the XML payload to be chunked.
        marker = eaf_file.find("<?xml")
        if marker > 0:
            instructions = eaf_file[:marker].strip()
            xml_body = eaf_file[marker:]
        else:
            # Declaration at position 0 or absent: no instruction preamble.
            instructions = ""
            xml_body = eaf_file

        # Tokenize once, then slice the token list into fixed-size windows
        # and decode each window back to text.
        encoder = self._get_tokenizer()
        token_ids = encoder.encode(xml_body)
        text_chunks = [
            encoder.decode(token_ids[start:start + chunk_size])
            for start in range(0, len(token_ids), chunk_size)
        ]

        return instructions, text_chunks

    @staticmethod
    def combine_chunks(processed_chunks: List[str]) -> str:
        """
        Combine processed chunks into a single string.

        Args:
            processed_chunks: List of processed chunk strings

        Returns:
            str: Combined content
        """
        return "".join(processed_chunks)

    @staticmethod
    def is_xml_content(message: str) -> bool:
        """
        Check if the message contains XML/EAF content.

        Args:
            message: The message to check

        Returns:
            bool: True if message contains XML content
        """
        # Cheap substring probes for the XML declaration or EAF/annotation tags.
        xml_indicators = ["<?xml", "<eaf", "<ANNOTATION"]
        return any(indicator in message for indicator in xml_indicators)