| """Text processing utilities for NER annotation.""" | |
| import re | |
| from typing import List, Dict, Union, Tuple | |


def tokenize_text(text: str) -> List[str]:
    """Tokenize the input text into a list of tokens.

    Args:
        text: The input text to tokenize

    Returns:
        List of tokens
    """
    return re.findall(r'\w+(?:[-_]\w+)*|\S', text)
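

# A minimal usage sketch (the sample sentence below is an assumption for
# illustration, not part of the original module): hyphenated or underscored
# words stay whole, while punctuation is split into single-character tokens.
def _tokenize_example() -> List[str]:
    # Returns ['Dr', '.', 'Smith', 'works', 'at', 'a', 'New', 'York-based', 'lab', '.']
    return tokenize_text("Dr. Smith works at a New York-based lab.")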


def join_tokens(tokens: List[str]) -> str:
    """Join tokens with proper spacing.

    Args:
        tokens: List of tokens to join

    Returns:
        Joined text string
    """
    text = ""
    for token in tokens:
        if token in {",", ".", "!", "?", ":", ";", "..."}:
            text = text.rstrip() + token
        else:
            text += " " + token
    return text.strip()
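

# A minimal round-trip sketch (the example string is assumed, not from the
# original module): join_tokens reattaches the punctuation tokens produced by
# tokenize_text, so tokenize -> join approximately reconstructs the input.
def _join_example() -> str:
    tokens = tokenize_text("Hello, world!")  # ['Hello', ',', 'world', '!']
    return join_tokens(tokens)               # 'Hello, world!'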


def process_text_for_gliner(
    text: str,
    max_tokens: int = 256,
    overlap: int = 32
) -> List[str]:
    """Process text for GLiNER by splitting long texts into overlapping chunks.

    Preserves sentence boundaries and context when possible.

    Args:
        text: The input text to process
        max_tokens: Maximum number of tokens per chunk
        overlap: Number of tokens to overlap between chunks

    Returns:
        List of text chunks suitable for GLiNER
    """
    # First split into sentences to preserve natural boundaries
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        # Tokenize the sentence
        sentence_tokens = tokenize_text(sentence)
        sentence_length = len(sentence_tokens)

        # If a single sentence is too long, split it
        if sentence_length > max_tokens:
            # If we have accumulated tokens, add them as a chunk
            if current_chunk:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_length = 0

            # Split the long sentence into smaller overlapping chunks
            start = 0
            while start < sentence_length:
                end = min(start + max_tokens, sentence_length)
                chunk_tokens = sentence_tokens[start:end]
                chunks.append(" ".join(chunk_tokens))
                start = end - overlap if end < sentence_length else end

        # If adding this sentence would exceed max_tokens, start a new chunk
        elif current_length + sentence_length > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = sentence_tokens
            current_length = sentence_length
        else:
            current_chunk.extend(sentence_tokens)
            current_length += sentence_length

    # Add any remaining tokens as the final chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
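

# A minimal chunking sketch (the text and the small limits are assumptions
# chosen to make the behaviour visible; real GLiNER inputs would use the
# defaults). Whole sentences are packed into chunks of at most max_tokens
# tokens, and only an over-long sentence is split into overlapping windows.
def _chunking_example() -> List[str]:
    text = "First sentence here. Second sentence follows. Third one ends it."
    return process_text_for_gliner(text, max_tokens=10, overlap=2)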


def extract_tokens_and_labels(
    data: List[Dict[str, Union[str, None]]]
) -> Tuple[List[str], List[Tuple[int, int, str]]]:
    """Extract tokens and NER labels from annotation data.

    Args:
        data: List of token-label pairs

    Returns:
        Tuple of (tokens, ner_spans)
    """
    tokens = []
    ner = []
    token_start_idx = 0

    for entry in data:
        text_chunk = entry['token']
        label = entry['class_or_confidence']

        # Tokenize the current text chunk
        token_list = tokenize_text(text_chunk)

        # Append tokens to the main tokens list
        tokens.extend(token_list)

        # Record an inclusive (start, end, label) span for labelled chunks
        if label:
            token_end_idx = token_start_idx + len(token_list) - 1
            ner.append((token_start_idx, token_end_idx, label))

        token_start_idx += len(token_list)

    return tokens, ner
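

# A minimal end-to-end sketch (the annotation records are hypothetical, but the
# 'token' and 'class_or_confidence' keys match the function above). Spans are
# inclusive (start, end) token-index pairs over the flattened token list.
if __name__ == "__main__":
    annotated = [
        {"token": "Barack Obama", "class_or_confidence": "person"},
        {"token": " visited ", "class_or_confidence": None},
        {"token": "Paris", "class_or_confidence": "location"},
        {"token": ".", "class_or_confidence": None},
    ]
    tokens, spans = extract_tokens_and_labels(annotated)
    print(tokens)  # ['Barack', 'Obama', 'visited', 'Paris', '.']
    print(spans)   # [(0, 1, 'person'), (3, 3, 'location')]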