Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from ..models.spans import ResolvedSpan, SpanDescriptor | |
| try: | |
| from rapidfuzz import fuzz as _fuzz | |
| _HAS_RAPIDFUZZ = True | |
| except ImportError: | |
| _HAS_RAPIDFUZZ = False | |
| def _find_context( | |
| source: str, | |
| context: str, | |
| threshold: float, | |
| ) -> tuple[int, bool] | None: | |
| """ | |
| Locate *context* in *source*. | |
| Returns (start_pos, is_fuzzy): | |
| - (pos, False) on exact match | |
| - (pos, True) on fuzzy match with score >= threshold | |
| - None if not found or below threshold | |
| """ | |
| pos = source.find(context) | |
| if pos != -1: | |
| return pos, False | |
| if not _HAS_RAPIDFUZZ or not context: | |
| return None | |
| win = len(context) | |
| if win > len(source): | |
| return None | |
| best_score = 0.0 | |
| best_pos = -1 | |
| for i in range(len(source) - win + 1): | |
| score = _fuzz.ratio(context, source[i : i + win]) / 100.0 | |
| if score > best_score: | |
| best_score = score | |
| best_pos = i | |
| if best_score >= threshold: | |
| return best_pos, True | |
| return None | |
| def resolve_spans( | |
| source: str, | |
| spans: list[SpanDescriptor], | |
| fuzzy_threshold: float = 0.92, | |
| ) -> list[ResolvedSpan]: | |
| """ | |
| Convert context-anchored SpanDescriptors to char-offset ResolvedSpans. | |
| Rejects spans whose text cannot be reliably located in *source*. | |
| Spans that required fuzzy context matching are flagged with fuzzy_match=True. | |
| """ | |
| resolved: list[ResolvedSpan] = [] | |
| for span in spans: | |
| result = _find_context(source, span.context, fuzzy_threshold) | |
| if result is None: | |
| continue # context not found → reject | |
| ctx_start, context_is_fuzzy = result | |
| # Find span.text within the located context window | |
| window = source[ctx_start : ctx_start + len(span.context)] | |
| text_pos = window.find(span.text) | |
| if text_pos == -1: | |
| continue # text not in context window → reject | |
| abs_start = ctx_start + text_pos | |
| abs_end = abs_start + len(span.text) | |
| # Verify verbatim match (should always hold after exact context find, | |
| # but important guard after fuzzy context find) | |
| if source[abs_start:abs_end] != span.text: | |
| continue | |
| resolved.append( | |
| ResolvedSpan( | |
| element=span.element, | |
| start=abs_start, | |
| end=abs_end, | |
| attrs=span.attrs.copy(), | |
| children=[], | |
| fuzzy_match=context_is_fuzzy, | |
| ) | |
| ) | |
| return resolved | |