Spaces:
Sleeping
Sleeping
| """ | |
| Unit tests for _aggregate_spans() and the extract() empty-input guard in src/ner/model.py. | |
| Uses EvidenceNER.__new__() to bypass checkpoint loading — no model required. | |
| Run: pytest tests/ner/test_model.py | |
| """ | |
| from src.ner.model import EvidenceNER, LABEL2ID | |
def _ner():
    """Build a bare EvidenceNER without loading a checkpoint.

    The object is allocated directly through ``__new__``, so ``__init__``
    is never invoked on it.
    """
    bare_instance = EvidenceNER.__new__(EvidenceNER)
    return bare_instance
| # --------------------------------------------------------------------------- | |
| # Happy path | |
| # --------------------------------------------------------------------------- | |
def test_aggregate_spans_single_entity():
    """A B-AMOUNT token followed by an I-AMOUNT token yields one AMOUNT span."""
    text = "₹4,299 overcharged"
    # One row per token: (character offsets, tag, confidence).
    # The (0, 0) rows at both ends mimic special tokens.
    # NOTE(review): the offsets are synthetic — (6, 7) is the space after
    # the amount and (7, 14) stops short of the end of "overcharged"
    # (7-18). Confirm that end == 7 (trailing space included) is intended.
    token_rows = [
        ((0, 0), "O", 0.99),
        ((0, 6), "B-AMOUNT", 0.91),
        ((6, 7), "I-AMOUNT", 0.88),
        ((7, 14), "O", 0.95),
        ((0, 0), "O", 0.99),
    ]
    offset_mapping = [offsets for offsets, _, _ in token_rows]
    pred_ids = [LABEL2ID[tag] for _, tag, _ in token_rows]
    confs = [score for _, _, score in token_rows]

    entities = _ner()._aggregate_spans(text, offset_mapping, pred_ids, confs)

    assert len(entities) == 1
    amount = entities[0]
    assert amount.label == "AMOUNT"
    assert amount.start == 0
    assert amount.end == 7
def test_aggregate_spans_two_entities():
    """Two B- tags separated by an O token yield two distinct entities."""
    text = "₹4,299 for OD-123"
    # One row per token: (character offsets, tag, confidence).
    token_rows = [
        ((0, 0), "O", 0.99),
        ((0, 6), "B-AMOUNT", 0.91),    # ₹4,299
        ((7, 10), "O", 0.95),          # for → O
        ((11, 17), "B-REF_ID", 0.82),  # OD-123
        ((0, 0), "O", 0.99),
    ]
    # Transpose the table into the three parallel lists the method takes.
    offset_mapping, tags, confs = (list(column) for column in zip(*token_rows))
    pred_ids = [LABEL2ID[tag] for tag in tags]

    entities = _ner()._aggregate_spans(text, offset_mapping, pred_ids, confs)

    found_labels = [entity.label for entity in entities]
    assert "AMOUNT" in found_labels
    assert "REF_ID" in found_labels
    assert len(entities) == 2
| # --------------------------------------------------------------------------- | |
| # Edge cases — broken sequences | |
| # --------------------------------------------------------------------------- | |
def test_aggregate_spans_orphan_i_tag_dropped():
    """An I-AMOUNT tag with no B-AMOUNT before it must yield no entity."""
    text = "some text here"
    offset_mapping = [(0, 0), (0, 4), (5, 9), (10, 14), (0, 0)]
    # The second tag is the orphan: an I- tag that no B- tag opened.
    tags = ["O", "I-AMOUNT", "O", "O", "O"]
    pred_ids = [LABEL2ID[tag] for tag in tags]
    confs = [0.99, 0.85, 0.95, 0.95, 0.99]

    spans = _ner()._aggregate_spans(text, offset_mapping, pred_ids, confs)

    assert spans == []
def test_aggregate_spans_mismatched_i_closes_span():
    """B-AMOUNT followed by I-DATE should flush AMOUNT and drop I-DATE.

    The span bounds are asserted as well as the label: checking only the
    count and label would still pass if the I-DATE token were merged into
    the AMOUNT span instead of being dropped.
    """
    ner = _ner()
    text = "₹4,299 today"
    offset_mapping = [(0, 0), (0, 6), (7, 12), (0, 0)]
    pred_ids = [
        LABEL2ID["O"],
        LABEL2ID["B-AMOUNT"],
        LABEL2ID["I-DATE"],  # mismatch — different type
        LABEL2ID["O"],
    ]
    confs = [0.99, 0.91, 0.80, 0.99]
    entities = ner._aggregate_spans(text, offset_mapping, pred_ids, confs)
    assert len(entities) == 1
    assert entities[0].label == "AMOUNT"
    # The span must cover only the B-AMOUNT token (0, 6); an end of 12
    # would mean the mismatched I-DATE token "today" was absorbed.
    assert entities[0].start == 0
    assert entities[0].end == 6
| # --------------------------------------------------------------------------- | |
| # Special tokens | |
| # --------------------------------------------------------------------------- | |
def test_aggregate_spans_special_tokens_skipped():
    """Tokens whose offsets are (0, 0) (CLS/SEP) must never feed an entity."""
    text = "Flipkart"
    # One row per token: (character offsets, tag, confidence). The first
    # and last rows are the CLS/SEP stand-ins; they carry entity tags on
    # purpose so that a failure to skip them would change the result.
    token_rows = [
        ((0, 0), "B-ORG", 0.99),  # CLS — should be skipped
        ((0, 8), "B-ORG", 0.91),  # real token
        ((0, 0), "I-ORG", 0.99),  # SEP — should be skipped
    ]
    offset_mapping = [offsets for offsets, _, _ in token_rows]
    pred_ids = [LABEL2ID[tag] for _, tag, _ in token_rows]
    confs = [score for _, _, score in token_rows]

    entities = _ner()._aggregate_spans(text, offset_mapping, pred_ids, confs)

    assert len(entities) == 1
    only_span = entities[0]
    assert only_span.start == 0
    assert only_span.end == 8
| # --------------------------------------------------------------------------- | |
| # Empty input guard | |
| # --------------------------------------------------------------------------- | |
def test_extract_empty_string_returns_empty_list():
    """extract("") returns [] on an instance built without a checkpoint.

    The instance comes from ``_ner()``, so the call only succeeds if the
    empty-input guard returns before anything needing a loaded model runs.
    """
    extracted = _ner().extract("")
    assert extracted == []
def test_extract_whitespace_only_returns_empty_list():
    """extract() on whitespace-only input returns [] via the same guard."""
    blank_input = " "
    extracted = _ner().extract(blank_input)
    assert extracted == []