cmboulanger's picture
full implementation
37eaffd
from __future__ import annotations
import warnings
from ..models.spans import ResolvedSpan
def _build_nesting_tree(flat_spans: list[ResolvedSpan]) -> list[ResolvedSpan]:
"""
Populate ResolvedSpan.children based on offset containment and return root spans.
Spans are sorted so that outer (longer) spans are processed before inner ones.
Overlapping (non-nesting) spans are skipped with a warning.
"""
# Sort: start asc, then end desc so outer spans come before inner at same start
spans = sorted(flat_spans, key=lambda s: (s.start, -(s.end - s.start)))
# Clear any children left from a previous call
for s in spans:
s.children = []
roots: list[ResolvedSpan] = []
stack: list[ResolvedSpan] = []
for span in spans:
rejected = False
# Pop stack entries that are fully before (or incompatibly overlap) this span
while stack:
top = stack[-1]
if top.start <= span.start and span.end <= top.end:
break # top properly contains span → it's the parent
elif span.start >= top.end:
stack.pop() # span comes after top → pop and continue
else:
# Partial overlap (neither contained nor after) → reject span
warnings.warn(
f"Overlapping spans [{top.start},{top.end}] and "
f"[{span.start},{span.end}] cannot be nested; "
f"skipping <{span.element}> span.",
stacklevel=3,
)
rejected = True
break
if rejected:
continue
if stack:
stack[-1].children.append(span)
else:
roots.append(span)
stack.append(span)
return roots
def _inject_recursive(
text: str,
spans: list[ResolvedSpan],
offset: int,
) -> str:
"""
Insert XML open/close tags for *spans* into *text*.
*offset* is the absolute position of text[0] in the original source, used
to translate span.start/end (absolute) to positions within *text*.
"""
if not spans:
return text
result: list[str] = []
cursor = 0 # relative position within text
for span in sorted(spans, key=lambda s: s.start):
rel_start = span.start - offset
rel_end = span.end - offset
# Text before this span
result.append(text[cursor:rel_start])
# Build tag strings
attrs_str = " ".join(f'{k}="{v}"' for k, v in span.attrs.items())
open_tag = f"<{span.element}" + (f" {attrs_str}" if attrs_str else "") + ">"
close_tag = f"</{span.element}>"
# Recursively inject children inside this span's content
inner = text[rel_start:rel_end]
if span.children:
inner = _inject_recursive(inner, span.children, offset=span.start)
result.append(open_tag)
result.append(inner)
result.append(close_tag)
cursor = rel_end
result.append(text[cursor:])
return "".join(result)
def inject_xml(source: str, spans: list[ResolvedSpan]) -> str:
"""
Insert XML tags into *source* at the positions defined by *spans*.
Nesting is inferred from offset containment via _build_nesting_tree.
"""
if not spans:
return source
root_spans = _build_nesting_tree(spans)
return _inject_recursive(source, root_spans, offset=0)