Spaces:

cmboulanger
/

tei-annotator

Sleeping

File size: 2,569 Bytes

37eaffd

from __future__ import annotations

from ..models.spans import ResolvedSpan, SpanDescriptor

try:
    from rapidfuzz import fuzz as _fuzz

    _HAS_RAPIDFUZZ = True
except ImportError:
    _HAS_RAPIDFUZZ = False


def _find_context(
    source: str,
    context: str,
    threshold: float,
) -> tuple[int, bool] | None:
    """
    Locate *context* in *source*.

    Returns (start_pos, is_fuzzy):
    - (pos, False) on exact match
    - (pos, True)  on fuzzy match with score >= threshold
    - None         if not found or below threshold
    """
    pos = source.find(context)
    if pos != -1:
        return pos, False

    if not _HAS_RAPIDFUZZ or not context:
        return None

    win = len(context)
    if win > len(source):
        return None

    best_score = 0.0
    best_pos = -1
    for i in range(len(source) - win + 1):
        score = _fuzz.ratio(context, source[i : i + win]) / 100.0
        if score > best_score:
            best_score = score
            best_pos = i

    if best_score >= threshold:
        return best_pos, True
    return None


def resolve_spans(
    source: str,
    spans: list[SpanDescriptor],
    fuzzy_threshold: float = 0.92,
) -> list[ResolvedSpan]:
    """
    Turn context-anchored SpanDescriptors into char-offset ResolvedSpans.

    For each descriptor the context is located in *source* via
    ``_find_context``; the descriptor's text is then searched for inside the
    stretch of *source* that the context occupies. Descriptors whose context
    or text cannot be located are silently dropped. Results keep the input
    order, and ``fuzzy_match`` is True when the context was only found by
    fuzzy matching.
    """
    accepted: list[ResolvedSpan] = []

    for descriptor in spans:
        located = _find_context(source, descriptor.context, fuzzy_threshold)
        if located is None:
            # No usable anchor for this descriptor → drop it.
            continue
        anchor, used_fuzzy = located

        # Only look for the text inside the region covered by the context.
        region = source[anchor : anchor + len(descriptor.context)]
        offset = region.find(descriptor.text)
        if offset < 0:
            # Text does not occur inside the located region → drop it.
            continue

        begin = anchor + offset
        finish = begin + len(descriptor.text)

        # Keep the span only if the computed offsets reproduce the text
        # verbatim (a safety net, mainly relevant after a fuzzy anchor).
        if source[begin:finish] == descriptor.text:
            accepted.append(
                ResolvedSpan(
                    element=descriptor.element,
                    start=begin,
                    end=finish,
                    attrs=descriptor.attrs.copy(),
                    children=[],
                    fuzzy_match=used_fuzzy,
                )
            )

    return accepted