| """Text processing utilities for NER annotation.""" | |
| import re | |
| from typing import List, Dict, Union, Tuple | |


def tokenize_text(text: str) -> List[str]:
    """Tokenize the input text into a list of tokens.

    Args:
        text: The input text to tokenize

    Returns:
        List of tokens
    """
    return re.findall(r'\w+(?:[-_]\w+)*|\S', text)
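

# A minimal usage sketch (the sample sentence below is an assumption for
# illustration, not part of the original module): hyphenated or underscored
# words stay whole, while punctuation is split into single-character tokens.
def _tokenize_example() -> List[str]:
    # Returns ['Dr', '.', 'Smith', 'works', 'at', 'a', 'New', 'York-based', 'lab', '.']
    return tokenize_text("Dr. Smith works at a New York-based lab.")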


def join_tokens(tokens: List[str]) -> str:
    """Join tokens with proper spacing.

    Args:
        tokens: List of tokens to join

    Returns:
        Joined text string
    """
    text = ""
    for token in tokens:
        if token in {",", ".", "!", "?", ":", ";", "..."}:
            text = text.rstrip() + token
        else:
            text += " " + token
    return text.strip()
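

# A minimal round-trip sketch (the example string is assumed, not from the
# original module): join_tokens reattaches the punctuation tokens produced by
# tokenize_text, so tokenize -> join approximately reconstructs the input.
def _join_example() -> str:
    tokens = tokenize_text("Hello, world!")  # ['Hello', ',', 'world', '!']
    return join_tokens(tokens)               # 'Hello, world!'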


def process_text_for_gliner(
    text: str,
    max_tokens: int = 256,
    overlap: int = 32
) -> List[str]:
    """Process text for GLiNER by splitting long texts into overlapping chunks.

    Preserves sentence boundaries and context when possible.

    Args:
        text: The input text to process
        max_tokens: Maximum number of tokens per chunk
        overlap: Number of tokens to overlap between chunks

    Returns:
        List of text chunks suitable for GLiNER
    """
    # First split into sentences to preserve natural boundaries
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        # Tokenize the sentence
        sentence_tokens = tokenize_text(sentence)
        sentence_length = len(sentence_tokens)

        # If a single sentence is too long, split it
        if sentence_length > max_tokens:
            # If we have accumulated tokens, add them as a chunk
            if current_chunk:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_length = 0

            # Split the long sentence into smaller overlapping chunks
            start = 0
            while start < sentence_length:
                end = min(start + max_tokens, sentence_length)
                chunk_tokens = sentence_tokens[start:end]
                chunks.append(" ".join(chunk_tokens))
                start = end - overlap if end < sentence_length else end

        # If adding this sentence would exceed max_tokens, start a new chunk
        elif current_length + sentence_length > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = sentence_tokens
            current_length = sentence_length
        else:
            current_chunk.extend(sentence_tokens)
            current_length += sentence_length

    # Add any remaining tokens as the final chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
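

# A minimal chunking sketch (the text and the small limits are assumptions
# chosen to make the behaviour visible; real GLiNER inputs would use the
# defaults). Whole sentences are packed into chunks of at most max_tokens
# tokens, and only an over-long sentence is split into overlapping windows.
def _chunking_example() -> List[str]:
    text = "First sentence here. Second sentence follows. Third one ends it."
    return process_text_for_gliner(text, max_tokens=10, overlap=2)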


def extract_tokens_and_labels(
    data: List[Dict[str, Union[str, None]]]
) -> Tuple[List[str], List[Tuple[int, int, str]]]:
    """Extract tokens and NER labels from annotation data.

    Args:
        data: List of token-label pairs

    Returns:
        Tuple of (tokens, ner_spans)
    """
    tokens = []
    ner = []
    token_start_idx = 0

    for entry in data:
        text_chunk = entry['token']
        label = entry['class_or_confidence']

        # Tokenize the current text chunk
        token_list = tokenize_text(text_chunk)

        # Append tokens to the main tokens list
        tokens.extend(token_list)

        # Record an inclusive (start, end, label) span for labelled chunks
        if label:
            token_end_idx = token_start_idx + len(token_list) - 1
            ner.append((token_start_idx, token_end_idx, label))

        token_start_idx += len(token_list)

    return tokens, ner
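

# A minimal end-to-end sketch (the annotation records are hypothetical, but the
# 'token' and 'class_or_confidence' keys match the function above). Spans are
# inclusive (start, end) token-index pairs over the flattened token list.
if __name__ == "__main__":
    annotated = [
        {"token": "Barack Obama", "class_or_confidence": "person"},
        {"token": " visited ", "class_or_confidence": None},
        {"token": "Paris", "class_or_confidence": "location"},
        {"token": ".", "class_or_confidence": None},
    ]
    tokens, spans = extract_tokens_and_labels(annotated)
    print(tokens)  # ['Barack', 'Obama', 'visited', 'Paris', '.']
    print(spans)   # [(0, 1, 'person'), (3, 3, 'location')]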