Spaces:
Sleeping
Sleeping
File size: 2,569 Bytes
from __future__ import annotations
from ..models.spans import ResolvedSpan, SpanDescriptor
try:
from rapidfuzz import fuzz as _fuzz
_HAS_RAPIDFUZZ = True
except ImportError:
_HAS_RAPIDFUZZ = False
def _find_context(
source: str,
context: str,
threshold: float,
) -> tuple[int, bool] | None:
"""
Locate *context* in *source*.
Returns (start_pos, is_fuzzy):
- (pos, False) on exact match
- (pos, True) on fuzzy match with score >= threshold
- None if not found or below threshold
"""
pos = source.find(context)
if pos != -1:
return pos, False
if not _HAS_RAPIDFUZZ or not context:
return None
win = len(context)
if win > len(source):
return None
best_score = 0.0
best_pos = -1
for i in range(len(source) - win + 1):
score = _fuzz.ratio(context, source[i : i + win]) / 100.0
if score > best_score:
best_score = score
best_pos = i
if best_score >= threshold:
return best_pos, True
return None
def resolve_spans(
    source: str,
    spans: list[SpanDescriptor],
    fuzzy_threshold: float = 0.92,
) -> list[ResolvedSpan]:
    """
    Map context-anchored SpanDescriptors onto character offsets in *source*.

    Each descriptor's context is located via ``_find_context``; the span
    text is then searched for inside the located context window. A
    descriptor is dropped when its context cannot be found or its text does
    not occur verbatim in that window. Results keep the input order, and
    ``fuzzy_match`` is True for spans whose context needed fuzzy matching.
    """

    def _resolve(descriptor: SpanDescriptor) -> ResolvedSpan | None:
        # Returns the resolved span, or None when the descriptor is rejected.
        located = _find_context(source, descriptor.context, fuzzy_threshold)
        if located is None:
            return None  # context not found

        anchor, via_fuzzy = located

        # Search for the text only inside the matched context window.
        snippet = source[anchor : anchor + len(descriptor.context)]
        offset = snippet.find(descriptor.text)
        if offset == -1:
            return None  # text absent from the context window

        begin = anchor + offset
        finish = begin + len(descriptor.text)

        # Verbatim check against source; a safeguard after a fuzzy
        # context match.
        if source[begin:finish] != descriptor.text:
            return None

        return ResolvedSpan(
            element=descriptor.element,
            start=begin,
            end=finish,
            attrs=descriptor.attrs.copy(),
            children=[],
            fuzzy_match=via_fuzzy,
        )

    outcomes = (_resolve(descriptor) for descriptor in spans)
    return [outcome for outcome in outcomes if outcome is not None]
|