cmboulanger's picture
full implementation
37eaffd
from __future__ import annotations
from ..models.spans import ResolvedSpan, SpanDescriptor
try:
from rapidfuzz import fuzz as _fuzz
_HAS_RAPIDFUZZ = True
except ImportError:
_HAS_RAPIDFUZZ = False
def _find_context(
source: str,
context: str,
threshold: float,
) -> tuple[int, bool] | None:
"""
Locate *context* in *source*.
Returns (start_pos, is_fuzzy):
- (pos, False) on exact match
- (pos, True) on fuzzy match with score >= threshold
- None if not found or below threshold
"""
pos = source.find(context)
if pos != -1:
return pos, False
if not _HAS_RAPIDFUZZ or not context:
return None
win = len(context)
if win > len(source):
return None
best_score = 0.0
best_pos = -1
for i in range(len(source) - win + 1):
score = _fuzz.ratio(context, source[i : i + win]) / 100.0
if score > best_score:
best_score = score
best_pos = i
if best_score >= threshold:
return best_pos, True
return None
def resolve_spans(
source: str,
spans: list[SpanDescriptor],
fuzzy_threshold: float = 0.92,
) -> list[ResolvedSpan]:
"""
Convert context-anchored SpanDescriptors to char-offset ResolvedSpans.
Rejects spans whose text cannot be reliably located in *source*.
Spans that required fuzzy context matching are flagged with fuzzy_match=True.
"""
resolved: list[ResolvedSpan] = []
for span in spans:
result = _find_context(source, span.context, fuzzy_threshold)
if result is None:
continue # context not found → reject
ctx_start, context_is_fuzzy = result
# Find span.text within the located context window
window = source[ctx_start : ctx_start + len(span.context)]
text_pos = window.find(span.text)
if text_pos == -1:
continue # text not in context window → reject
abs_start = ctx_start + text_pos
abs_end = abs_start + len(span.text)
# Verify verbatim match (should always hold after exact context find,
# but important guard after fuzzy context find)
if source[abs_start:abs_end] != span.text:
continue
resolved.append(
ResolvedSpan(
element=span.element,
start=abs_start,
end=abs_end,
attrs=span.attrs.copy(),
children=[],
fuzzy_match=context_is_fuzzy,
)
)
return resolved