codebook / potato /export /nlp_utils.py
davidjurgens's picture
Deploy: Potato — Codebook Annotation
aceb1b2 verified
Raw
History Blame Contribute Delete
5.29 kB
"""
NLP Export Utilities
Shared helpers for NLP export formats (CoNLL-2003, CoNLL-U).
Provides tokenization and BIO tag alignment.
"""
from typing import List, Dict, Tuple, Optional
import logging
import re
logger = logging.getLogger(__name__)
def tokenize_text(text: str, method: str = "whitespace") -> List[Dict]:
    """
    Split text into tokens, recording each token's character offsets.

    Args:
        text: Input text string. A falsy value yields an empty list.
        method: Tokenization method. Options:
            - "whitespace": each maximal run of non-whitespace is a token
              (default; any unrecognized value also takes this path)
            - "word_punct": each whitespace-delimited chunk is further split
              into runs of word characters and single punctuation characters

    Returns:
        List of dicts with keys: token, start, end. Offsets index into
        ``text``; ``end`` is exclusive.
    """
    if not text:
        return []

    # Both methods start from the same whitespace-delimited chunks.
    chunks = re.finditer(r'\S+', text)

    if method != "word_punct":
        return [
            {
                "token": chunk.group(),
                "start": chunk.start(),
                "end": chunk.end(),
            }
            for chunk in chunks
        ]

    pieces = []
    for chunk in chunks:
        base = chunk.start()
        # Sub-match offsets are relative to the chunk, so shift them by the
        # chunk's position to get offsets into the full text.
        for part in re.finditer(r'[\w]+|[^\w\s]', chunk.group()):
            pieces.append({
                "token": part.group(),
                "start": base + part.start(),
                "end": base + part.end(),
            })
    return pieces
def char_spans_to_bio_tags(
    tokens: List[Dict],
    spans: List[Dict],
    scheme: str = "BIO"
) -> List[str]:
    """
    Convert character-level spans to token-level BIO tags.

    Handles:
    - Multi-token entities
    - Tokens partially inside spans (included when at least half of the
      token's characters fall inside the span)
    - Overlapping spans (longer spans are processed first; a token claimed
      by one span is never re-tagged by a later one)

    Args:
        tokens: List of token dicts with keys: token, start, end
        spans: List of span dicts with keys: start, end, label (or name).
            Missing start/end default to 0; spans with start >= end are
            skipped. The label falls back to ``name`` and then to
            "ENTITY" when the preceding value is missing or empty.
        scheme: Tagging scheme - "BIO" (default) or "BIOES". Any value
            other than "BIOES" produces plain BIO tags.

    Returns:
        List of tag strings, one per token (e.g., ["O", "B-PER", "I-PER"])
    """
    if not tokens:
        return []

    tags = ["O"] * len(tokens)
    if not spans:
        return tags

    # Longest spans first so the longest match wins on overlap. sorted() is
    # stable, so equal-length spans keep their input order.
    sorted_spans = sorted(
        spans,
        key=lambda s: (s.get("end", 0) - s.get("start", 0)),
        reverse=True,
    )

    # Track which tokens have already been claimed by an earlier span.
    assigned = [False] * len(tokens)

    for span in sorted_spans:
        span_start = span.get("start", 0)
        span_end = span.get("end", 0)
        if span_start >= span_end:
            continue

        # Chain the fallbacks with `or` so that a present-but-empty value
        # (None or "") for either key still ends up as "ENTITY" instead of
        # producing tags such as "B-None" or "B-".
        label = span.get("label") or span.get("name") or "ENTITY"

        # Collect unclaimed tokens that the span covers by at least half.
        span_tokens = []
        for i, tok in enumerate(tokens):
            if assigned[i]:
                continue
            tok_len = tok["end"] - tok["start"]
            if tok_len <= 0:
                continue
            overlap = min(tok["end"], span_end) - max(tok["start"], span_start)
            if overlap > 0 and overlap >= tok_len / 2:
                span_tokens.append(i)

        if not span_tokens:
            continue

        # First covered token opens the entity, the rest continue it.
        for position, tok_idx in enumerate(span_tokens):
            prefix = "B" if position == 0 else "I"
            tags[tok_idx] = f"{prefix}-{label}"
            assigned[tok_idx] = True

        # BIOES: a lone token becomes S-, otherwise the last token becomes E-.
        if scheme == "BIOES":
            if len(span_tokens) == 1:
                tags[span_tokens[0]] = f"S-{label}"
            else:
                tags[span_tokens[-1]] = f"E-{label}"

    return tags
def group_sentences(tokens: List[Dict], text: str) -> List[List[int]]:
    """
    Group token indices into sentences based on sentence-ending punctuation.

    A sentence ends at a token whose text ends with ".", "!", "?" or "。"
    when that token is the last one, or when the next token starts with an
    uppercase character. Any trailing tokens after the last detected
    boundary form a final sentence.

    Args:
        tokens: List of token dicts; only the "token" key is read
        text: Original text. Kept for interface compatibility; the current
            heuristic does not read it.

    Returns:
        List of lists of token indices, one list per sentence
    """
    if not tokens:
        return []

    # "。" is matched as a suffix like the other marks, so a token such as
    # "終わり。" is recognised and not only a standalone "。" token.
    sentence_final = (".", "!", "?", "。")

    sentences = []
    current = []
    last_index = len(tokens) - 1

    for i, tok in enumerate(tokens):
        current.append(i)

        if not tok["token"].endswith(sentence_final):
            continue

        if i == last_index:
            is_boundary = True
        else:
            # Require an uppercase start on the next token so abbreviations
            # followed by lowercase text (e.g. "e.g. apples") do not split.
            next_tok = tokens[i + 1]["token"]
            is_boundary = bool(next_tok) and next_tok[0].isupper()

        if is_boundary:
            sentences.append(current)
            current = []

    if current:
        sentences.append(current)
    return sentences