File size: 1,435 Bytes
37eaffd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from __future__ import annotations

from ..models.schema import TEISchema
from ..models.spans import ResolvedSpan


def validate_spans(
    spans: list[ResolvedSpan],
    schema: TEISchema,
    source: str,
) -> list[ResolvedSpan]:
    """
    Filter out spans that fail schema validation.

    A span is rejected when:
    - its bounds are out of range (``start`` negative, ``end`` past the end
      of ``source``) or describe an empty/inverted range (``start >= end``)
    - its element is not in the schema
    - an attribute name is not listed for that element
    - an attribute value is not in the attribute's allowed_values
      (only checked when allowed_values is not None)

    Args:
        spans: Candidate spans to check.
        schema: Schema queried via ``schema.get(element)``; a ``None``
            result means the element is unknown.
        source: Text the spans refer to; only its length is used, for the
            bounds check.

    Returns:
        A new list holding the spans that passed every check, in their
        original order. The input list is not modified.
    """
    source_len = len(source)
    valid: list[ResolvedSpan] = []

    for span in spans:
        # Bounds sanity check
        if span.start < 0 or span.end > source_len or span.start >= span.end:
            continue

        elem = schema.get(span.element)
        if elem is None:
            continue  # element not in schema

        # Index the attribute definitions by name once, so each attribute on
        # the span costs a single lookup instead of a scan of elem.attributes.
        # setdefault keeps the first definition when a name is repeated.
        attr_defs = {}
        for attr_def in elem.attributes:
            attr_defs.setdefault(attr_def.name, attr_def)

        for attr_name, attr_value in span.attrs.items():
            attr_def = attr_defs.get(attr_name)
            if attr_def is None:
                break  # attribute name not listed for this element
            allowed_values = attr_def.allowed_values
            if allowed_values is not None and attr_value not in allowed_values:
                break  # value outside the constrained set
        else:
            # Loop finished without a break: every attribute is acceptable.
            valid.append(span)

    return valid