Spaces:
Paused
Paused
| """ | |
| NLP Export Utilities | |
| Shared helpers for NLP export formats (CoNLL-2003, CoNLL-U). | |
| Provides tokenization and BIO tag alignment. | |
| """ | |
| from typing import List, Dict, Tuple, Optional | |
| import logging | |
| import re | |
| logger = logging.getLogger(__name__) | |
def tokenize_text(text: str, method: str = "whitespace") -> List[Dict]:
    """
    Split text into tokens, recording each token's character offsets.

    Args:
        text: Input text string. Empty (falsy) input yields an empty list.
        method: Tokenization method. Options:
            - "whitespace": every run of non-whitespace is one token (default)
            - "word_punct": each whitespace-delimited chunk is further split
              into runs of word characters and single punctuation characters
            Any other value behaves like "whitespace".

    Returns:
        List of dicts with keys: token, start, end. Offsets index into
        ``text``; ``end`` is exclusive.
    """
    if not text:
        return []

    # Both methods start from the same whitespace-delimited chunks.
    chunks = re.finditer(r'\S+', text)

    if method != "word_punct":
        return [
            {"token": chunk.group(), "start": chunk.start(), "end": chunk.end()}
            for chunk in chunks
        ]

    pieces = []
    for chunk in chunks:
        base = chunk.start()
        # Sub-match offsets are relative to the chunk, so shift them by the
        # chunk's position to get offsets into the full text.
        for part in re.finditer(r'[\w]+|[^\w\s]', chunk.group()):
            pieces.append({
                "token": part.group(),
                "start": base + part.start(),
                "end": base + part.end(),
            })
    return pieces
def char_spans_to_bio_tags(
    tokens: List[Dict],
    spans: List[Dict],
    scheme: str = "BIO"
) -> List[str]:
    """
    Convert character-level spans to token-level BIO tags.

    Handles:
    - Multi-token entities
    - Tokens partially inside spans (included if the span covers at least
      half of the token)
    - Overlapping spans (longest span is processed first and keeps its tokens)

    Args:
        tokens: List of token dicts with keys: token, start, end
        spans: List of span dicts with keys: start, end, label (or name).
            The label is taken from "label", then "name"; if both are
            missing or empty it falls back to "ENTITY". Missing or None
            start/end values are treated as 0, so such spans are skipped
            as empty.
        scheme: Tagging scheme - "BIO" (default) or "BIOES"

    Returns:
        List of tag strings, one per token (e.g., ["O", "B-PER", "I-PER"])
    """
    if not tokens:
        return []

    tags = ["O"] * len(tokens)
    if not spans:
        return tags

    # Sort spans by length (longest first) so longest match wins on overlap.
    # `or 0` guards against keys that are present but None.
    sorted_spans = sorted(
        spans,
        key=lambda s: (s.get("end") or 0) - (s.get("start") or 0),
        reverse=True,
    )

    # Track which tokens are already claimed by an earlier (longer) span
    assigned = [False] * len(tokens)

    for span in sorted_spans:
        span_start = span.get("start") or 0
        span_end = span.get("end") or 0
        if span_start >= span_end:
            continue

        # Chain the fallbacks so an empty/None "name" cannot leak into the
        # tag as "B-None" or "B-".
        label = span.get("label") or span.get("name") or "ENTITY"

        # Collect unassigned tokens that this span covers by at least half
        span_tokens = []
        for i, tok in enumerate(tokens):
            if assigned[i]:
                continue
            tok_len = tok["end"] - tok["start"]
            overlap = min(tok["end"], span_end) - max(tok["start"], span_start)
            if tok_len > 0 and overlap > 0 and overlap >= tok_len / 2:
                span_tokens.append(i)

        if not span_tokens:
            continue

        # First token opens the entity, the rest continue it
        for j, tok_idx in enumerate(span_tokens):
            prefix = "B" if j == 0 else "I"
            tags[tok_idx] = f"{prefix}-{label}"
            assigned[tok_idx] = True

        # BIOES: single-token entities become S-, the last token becomes E-
        if scheme == "BIOES":
            if len(span_tokens) == 1:
                tags[span_tokens[0]] = f"S-{label}"
            else:
                tags[span_tokens[-1]] = f"E-{label}"

    return tags
def group_sentences(tokens: List[Dict], text: str) -> List[List[int]]:
    """
    Partition token indices into sentences using a punctuation heuristic.

    A sentence closes after a token that ends with ".", "!" or "?" (or is
    exactly "。") when that token is the last one, or when the following
    token begins with an uppercase character. Any trailing tokens form a
    final sentence.

    Args:
        tokens: List of token dicts; only the "token" key is read
        text: Original text (accepted for interface compatibility; not
            consulted by the current heuristic)

    Returns:
        List of lists of token indices, one list per sentence
    """
    if not tokens:
        return []

    grouped = []
    pending = []
    last_index = len(tokens) - 1

    for idx, entry in enumerate(tokens):
        pending.append(idx)

        surface = entry["token"]
        closes_sentence = surface == "。" or surface.endswith((".", "!", "?"))
        if not closes_sentence:
            continue

        # Mid-text, only break when the next token looks like a sentence
        # start (leading uppercase); the final token always closes.
        if idx < last_index:
            upcoming = tokens[idx + 1]["token"]
            if not (upcoming and upcoming[0].isupper()):
                continue

        grouped.append(pending)
        pending = []

    if pending:
        grouped.append(pending)
    return grouped