"""
NLP Export Utilities

Shared helpers for NLP export formats (CoNLL-2003, CoNLL-U).
Provides tokenization and BIO tag alignment.
"""

from typing import List, Dict, Tuple, Optional
import logging
import re

logger = logging.getLogger(__name__)


def tokenize_text(text: str, method: str = "whitespace") -> List[Dict]:
    """
    Split text into tokens, recording each token's character offsets.

    Args:
        text: Input text string. An empty (falsy) value yields an empty list.
        method: Tokenization method. Options:
            - "whitespace": every maximal run of non-whitespace characters
              is one token (default)
            - "word_punct": each whitespace-delimited chunk is further split
              into runs of word characters and single punctuation characters
            Any other value behaves like "whitespace".

    Returns:
        List of dicts with keys: token, start, end. Offsets index into
        ``text`` and ``end`` is exclusive.
    """
    if not text:
        return []

    split_punctuation = method == "word_punct"
    result = []

    # Both modes start from the same whitespace-delimited chunks.
    for chunk in re.finditer(r'\S+', text):
        if not split_punctuation:
            result.append({
                "token": chunk.group(),
                "start": chunk.start(),
                "end": chunk.end(),
            })
            continue

        # Sub-matches are relative to the chunk, so shift them by the
        # chunk's own position to get offsets into the full text.
        base = chunk.start()
        result.extend(
            {
                "token": piece.group(),
                "start": base + piece.start(),
                "end": base + piece.end(),
            }
            for piece in re.finditer(r'[\w]+|[^\w\s]', chunk.group())
        )

    return result


def char_spans_to_bio_tags(
    tokens: List[Dict],
    spans: List[Dict],
    scheme: str = "BIO"
) -> List[str]:
    """
    Convert character-level spans to token-level BIO tags.

    Handles:
    - Multi-token entities
    - Tokens partially inside spans (included if the span covers at least
      half of the token's characters)
    - Overlapping spans (spans are processed longest first; a token claimed
      by a longer span is never re-tagged, but a shorter span still tags any
      unclaimed tokens it covers)

    Args:
        tokens: List of token dicts with keys: token, start, end
        spans: List of span dicts with keys: start, end, label (or name).
            Missing start/end default to 0. When neither label nor name
            holds a non-empty value, the label "ENTITY" is used.
        scheme: Tagging scheme - "BIO" (default) or "BIOES". Any value other
            than "BIOES" produces plain BIO tags.

    Returns:
        List of tag strings, one per token (e.g., ["O", "B-PER", "I-PER"]).
        With "BIOES", single-token entities are tagged "S-" and the last
        token of a multi-token entity is tagged "E-".
    """
    if not tokens:
        return []

    tags = ["O"] * len(tokens)

    if not spans:
        return tags

    # Sort spans by length (longest first) so longest match wins on overlap
    sorted_spans = sorted(
        spans,
        key=lambda s: (s.get("end", 0) - s.get("start", 0)),
        reverse=True,
    )

    # Track which tokens are already assigned
    assigned = [False] * len(tokens)

    for span in sorted_spans:
        span_start = span.get("start", 0)
        span_end = span.get("end", 0)
        # Chain the fallbacks so that a present-but-empty "name" (None or "")
        # still resolves to "ENTITY" instead of producing "B-None" / "B-".
        label = span.get("label") or span.get("name") or "ENTITY"

        # Empty or inverted spans cannot cover any token
        if span_start >= span_end:
            continue

        # Find unassigned tokens that overlap with this span
        span_tokens = []
        for i, tok in enumerate(tokens):
            if assigned[i]:
                continue
            # Length of the intersection of [tok.start, tok.end) and the span
            overlap_start = max(tok["start"], span_start)
            overlap_end = min(tok["end"], span_end)
            overlap = max(0, overlap_end - overlap_start)
            tok_len = tok["end"] - tok["start"]
            # Zero-length tokens are never tagged; others need >= half overlap
            if tok_len > 0 and overlap > 0 and overlap >= tok_len / 2:
                span_tokens.append(i)

        if not span_tokens:
            continue

        # Assign BIO tags: first covered token opens the entity, rest continue
        for j, tok_idx in enumerate(span_tokens):
            prefix = "B" if j == 0 else "I"
            tags[tok_idx] = f"{prefix}-{label}"
            assigned[tok_idx] = True

        # Apply BIOES if requested: rewrite the single/last token's prefix
        if scheme == "BIOES":
            if len(span_tokens) == 1:
                tags[span_tokens[0]] = f"S-{label}"
            else:
                tags[span_tokens[-1]] = f"E-{label}"

    return tags


def group_sentences(tokens: List[Dict], text: str) -> List[List[int]]:
    """
    Partition token indices into sentences using a punctuation heuristic.

    A sentence is closed after a token that is exactly "。" or that ends
    with ".", "!" or "?", provided that token is the last one or the next
    token begins with an uppercase character. Tokens left over after the
    final boundary form a trailing sentence.

    Args:
        tokens: List of token dicts; only the "token" key is read
        text: Original text (accepted for interface compatibility; the
            function body does not use it)

    Returns:
        List of lists of token indices, one list per sentence
    """
    if not tokens:
        return []

    grouped = []
    pending = []
    last_index = len(tokens) - 1

    for idx, entry in enumerate(tokens):
        pending.append(idx)
        surface = entry["token"]

        # Not sentence-final punctuation: keep accumulating.
        if not (surface == "。" or surface.endswith((".", "!", "?"))):
            continue

        # Mid-text boundary candidates are confirmed only when the next
        # token starts with an uppercase character.
        if idx < last_index:
            following = tokens[idx + 1]["token"]
            if not following or not following[0].isupper():
                continue

        grouped.append(pending)
        pending = []

    # Flush tokens that never reached a confirmed boundary.
    if pending:
        grouped.append(pending)

    return grouped