Spaces:

Blablablab
/

codebook

Paused

App Files Files Community

codebook / potato /export /nlp_utils.py

davidjurgens

Deploy: Potato — Codebook Annotation

aceb1b2 verified 9 days ago

Raw

History Blame Contribute Delete

5.29 kB

	"""
	NLP Export Utilities

	Shared helpers for NLP export formats (CoNLL-2003, CoNLL-U).
	Provides tokenization, BIO/BIOES tag alignment, and sentence grouping.
	"""

	from typing import List, Dict, Tuple, Optional
	import logging
	import re

	logger = logging.getLogger(__name__)


	def tokenize_text(text: str, method: str = "whitespace") -> List[Dict]:
	    """
	    Tokenize text into tokens with character offsets.

	    Args:
	        text: Input text string. An empty (or otherwise falsy) value
	            yields an empty list.
	        method: Tokenization method. Options:
	            - "whitespace": each maximal run of non-whitespace characters
	              is one token (default)
	            - "word_punct": each run of word characters is one token and
	              every other non-whitespace character (punctuation, symbols)
	              is its own single-character token
	            Any other value falls back to whitespace tokenization.

	    Returns:
	        List of dicts with keys: token, start, end. Offsets index into
	        ``text`` and ``end`` is exclusive, so
	        ``text[start:end] == token``.
	    """
	    if not text:
	        return []

	    if method == "word_punct":
	        # The alternation must be an unescaped "|": an escaped pipe would
	        # only match a literal "|" character and drop ordinary text.
	        # Neither alternative can match whitespace, so scanning the whole
	        # text directly yields the same tokens as first splitting on
	        # whitespace and then splitting each chunk.
	        pattern = r'\w+|[^\w\s]'
	    else:
	        # Default: whitespace tokenization
	        pattern = r'\S+'

	    return [
	        {
	            "token": match.group(),
	            "start": match.start(),
	            "end": match.end(),
	        }
	        for match in re.finditer(pattern, text)
	    ]


	def char_spans_to_bio_tags(
	    tokens: List[Dict],
	    spans: List[Dict],
	    scheme: str = "BIO"
	) -> List[str]:
	    """
	    Convert character-level spans to token-level BIO tags.

	    Handles:
	    - Multi-token entities
	    - Tokens partially inside spans (included if the span covers at
	      least half of the token's characters)
	    - Overlapping spans (longest span is processed first; a token already
	      claimed by an earlier span is never re-tagged)

	    Args:
	        tokens: List of token dicts with keys: token, start, end
	        spans: List of span dicts with keys: start, end, label (or name).
	            Missing start/end default to 0; spans with start >= end are
	            skipped. When neither label nor name holds a non-empty value,
	            the label "ENTITY" is used.
	        scheme: Tagging scheme - "BIO" (default) or "BIOES". Any other
	            value is treated as "BIO".

	    Returns:
	        List of tag strings, one per token (e.g., ["O", "B-PER", "I-PER"])
	    """
	    if not tokens:
	        return []

	    tags = ["O"] * len(tokens)

	    if not spans:
	        return tags

	    use_bioes = scheme == "BIOES"

	    # Sort spans by character length (longest first) so the longest match
	    # wins on overlap; ties keep their input order (sorted() is stable).
	    spans_by_length = sorted(
	        spans,
	        key=lambda s: (s.get("end", 0) - s.get("start", 0)),
	        reverse=True,
	    )

	    # Track which tokens are already claimed by an earlier (longer) span
	    assigned = [False] * len(tokens)

	    for span in spans_by_length:
	        span_start = span.get("start", 0)
	        span_end = span.get("end", 0)

	        if span_start >= span_end:
	            continue

	        # Chain the fallbacks with "or" so that an explicit None or empty
	        # "name" still resolves to "ENTITY" rather than producing tags
	        # such as "B-None" or "B-".
	        label = span.get("label") or span.get("name") or "ENTITY"

	        # Collect unclaimed tokens that this span covers by at least half
	        covered = []
	        for idx, tok in enumerate(tokens):
	            if assigned[idx]:
	                continue
	            tok_len = tok["end"] - tok["start"]
	            overlap = min(tok["end"], span_end) - max(tok["start"], span_start)
	            if tok_len > 0 and overlap > 0 and overlap >= tok_len / 2:
	                covered.append(idx)

	        if not covered:
	            continue

	        # Tag everything as inside, then promote the first token to B-
	        for idx in covered:
	            tags[idx] = f"I-{label}"
	            assigned[idx] = True
	        tags[covered[0]] = f"B-{label}"

	        # BIOES: single-token entities become S-, otherwise the last
	        # token of the entity becomes E-
	        if use_bioes:
	            if len(covered) == 1:
	                tags[covered[0]] = f"S-{label}"
	            else:
	                tags[covered[-1]] = f"E-{label}"

	    return tags


	def group_sentences(tokens: List[Dict], text: str) -> List[List[int]]:
	    """
	    Partition token indices into sentences using a punctuation heuristic.

	    A sentence is closed after a token that ends with ".", "!" or "?"
	    (or is exactly the ideographic full stop "。") when that token is the
	    last one, or when the following token begins with an uppercase
	    character. Tokens left over at the end form a final sentence.

	    Args:
	        tokens: List of token dicts; only the "token" key is read
	        text: Original text (accepted for interface compatibility; not
	            consulted by the heuristic)

	    Returns:
	        List of lists of token indices, one list per sentence
	    """
	    if not tokens:
	        return []

	    terminators = (".", "!", "?")
	    last_index = len(tokens) - 1

	    grouped = []
	    pending = []

	    for idx, entry in enumerate(tokens):
	        pending.append(idx)
	        surface = entry["token"]

	        # Only tokens carrying sentence-final punctuation can close a sentence
	        if not (surface.endswith(terminators) or surface == "。"):
	            continue

	        if idx == last_index:
	            closes_sentence = True
	        else:
	            # A boundary needs the following token to start with uppercase
	            following = tokens[idx + 1]["token"]
	            closes_sentence = bool(following) and following[0].isupper()

	        if closes_sentence:
	            grouped.append(pending)
	            pending = []

	    # Flush whatever did not end in a recognised boundary
	    if pending:
	        grouped.append(pending)

	    return grouped