File size: 5,930 Bytes
2edd0d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
"""Regression tests for postcode blocker / duplicate-column handling."""

from types import SimpleNamespace

import pandas as pd

from fuzzy_address_matcher.fuzzy_match import add_fuzzy_block_sequence_col
from fuzzy_address_matcher.matcher_funcs import (
    _column_has_usable_values,
    _normalize_join_key_strings,
    _postcode_batch_covered_search_keys_normalized,
    _resolve_column_series,
    _slice_frame_by_normalized_keys,
    _street_overflow_unbatched_search_enabled,
    _strip_runtime_fuzzy_cols_from_stand_cache,
    _uncovered_search_key_values_for_street_overflow,
    create_batch_ranges,
)


def test_column_has_usable_values_duplicate_label_first_column_nonempty():
    """A duplicated label is usable when its first column holds a value."""
    populated = pd.DataFrame({"postcode_search": ["nw16hr", ""]})
    blank = pd.DataFrame({"postcode_search": ["", ""]})
    frame = pd.concat([populated, blank], axis=1)

    # Sanity check: the concat really produced two columns sharing one label.
    assert list(frame.columns).count("postcode_search") == 2

    assert _column_has_usable_values(frame, "postcode_search") is True

    resolved = _resolve_column_series(frame, "postcode_search")
    assert resolved is not None
    assert resolved.iloc[0] == "nw16hr"


def test_column_has_usable_values_duplicate_label_all_empty():
    """A duplicated label whose columns hold only empty strings is not usable."""
    empty_values = {"postcode_search": ["", ""]}
    frame = pd.concat(
        [pd.DataFrame(empty_values), pd.DataFrame(empty_values)],
        axis=1,
    )
    assert _column_has_usable_values(frame, "postcode_search") is False


def test_add_fuzzy_block_sequence_col_duplicate_postcode_search_labels():
    """Block sequence numbers are assigned even when the block label is duplicated."""
    postcodes = ["p1", "p1", "p2"]
    with_idx = pd.DataFrame({"postcode_search": postcodes, "idx": [0, 1, 2]})
    duplicate = pd.DataFrame({"postcode_search": postcodes})
    frame = pd.concat([with_idx, duplicate], axis=1)

    result = add_fuzzy_block_sequence_col(frame, "postcode_search")

    # Read the sequence back in original row order, whatever order was returned.
    ordered = result.sort_index()
    sequence = [int(ordered.loc[row, "_fuzzy_block_seq"]) for row in range(3)]
    assert sequence == [0, 1, 0]


def test_slice_frame_by_normalized_keys_matches_key_column_not_positional_index():
    """Rows are selected by key-column value, not by positional index.

    Mirrors a frame reloaded from Parquet (RangeIndex) whose batch keys are
    the original join labels.
    """
    frame = pd.DataFrame(
        {
            "index": ["5000", "5001", "99999"],
            "postcode_search": ["a", "b", "c"],
        }
    )
    # Integer keys against string labels: matching relies on normalization.
    wanted_keys = [5000, 99999]

    sliced = _slice_frame_by_normalized_keys(frame, "index", wanted_keys)

    assert len(sliced) == 2
    assert set(sliced["index"].tolist()) == {"5000", "99999"}


def test_create_batch_ranges_uses_join_column_not_dataframe_index():
    """Batch ranges list join-column ids rather than DataFrame index labels.

    The ids must come from the ``search_df_key_field`` / ``ref_key_field``
    columns (``index`` and ``ref_index`` here).
    """
    search_frame = pd.DataFrame(
        {"postcode": ["AB1 2CD", "AB1 2CD"], "index": ["rowA", "rowB"]},
        index=[0, 1],
    )
    reference_frame = pd.DataFrame(
        {"Postcode": ["AB1 2CD"], "ref_index": [99]},
        index=[0],
    )

    ranges = create_batch_ranges(
        search_frame,
        reference_frame,
        5000,
        5000,
        "postcode",
        "Postcode",
        search_df_key_field="index",
        ref_key_field="ref_index",
    )

    search_ids = ranges["search_range"].iloc[0]
    assert search_ids == ["rowA", "rowB"]
    ref_ids = ranges["ref_range"].iloc[0]
    assert ref_ids == [99]


def test_slice_frame_by_normalized_keys_large_labels_not_in_rangeindex():
    """Keys outside the frame's index range are still found via the key column."""
    frame = pd.DataFrame({"index": ["35000", "35001"], "x": [1, 2]})
    batch_keys = [35000]

    # Filtering on the DataFrame index finds nothing: the index here is 0..1.
    index_based = frame.loc[frame.index.isin(batch_keys)]
    assert len(index_based) == 0

    key_based = _slice_frame_by_normalized_keys(frame, "index", batch_keys)
    assert len(key_based) == 1
    assert key_based.iloc[0]["index"] == "35000"


def test_normalize_join_key_strings_int_float_string_align():
    """Int, float and float-like string keys all normalize to the same string."""
    mixed_keys = pd.Series([6199, 6199.0, "6199.0"], dtype=object)
    nullable_int_keys = pd.Series([6199], dtype="Int64")

    normalized_mixed = _normalize_join_key_strings(mixed_keys)
    normalized_int = _normalize_join_key_strings(nullable_int_keys)

    assert set(normalized_mixed.tolist()) == {"6199"}
    assert set(normalized_int.tolist()) == {"6199"}


def test_postcode_batch_covered_keys_and_uncovered_search_values():
    """Covered keys come from ``search_range``; remaining search keys are uncovered."""
    batches = pd.DataFrame({"search_range": [["k1"], ["k3"]]})
    covered = _postcode_batch_covered_search_keys_normalized(batches)
    assert covered == {"k1", "k3"}

    # Minimal stand-in for the matcher, carrying only the two attributes set here.
    stub_matcher = SimpleNamespace(
        search_df_key_field="index",
        search_df_cleaned=pd.DataFrame({"index": ["k1", "k2", "k3"]}),
    )
    uncovered = _uncovered_search_key_values_for_street_overflow(
        stub_matcher, covered
    )
    assert set(uncovered) == {"k2"}


def test_create_batch_ranges_omits_search_only_postcode_from_covered_keys():
    """A search row whose postcode is absent from ref is left out of every batch.

    Such a row must not appear among the covered keys and must be reported
    as uncovered.
    """
    search_frame = pd.DataFrame(
        {
            "postcode": ["AB1 2CD", "ZZ9 9ZZ"],
            "index": ["in_ref_pc", "search_only_pc"],
        }
    )
    reference_frame = pd.DataFrame({"Postcode": ["AB1 2CD"], "ref_index": [0]})

    # Copies are passed so the frame given to the matcher below is the
    # untouched original.
    ranges = create_batch_ranges(
        search_frame.copy(),
        reference_frame.copy(),
        batch_size=1,
        ref_batch_size=1,
        search_postcode_col="postcode",
        ref_postcode_col="Postcode",
        search_df_key_field="index",
        ref_key_field="ref_index",
    )

    covered = _postcode_batch_covered_search_keys_normalized(ranges)
    assert "search_only_pc" not in covered
    assert "in_ref_pc" in covered

    stub_matcher = SimpleNamespace(
        search_df_key_field="index",
        search_df_cleaned=search_frame,
    )
    uncovered = _uncovered_search_key_values_for_street_overflow(
        stub_matcher, covered
    )
    assert uncovered == ["search_only_pc"]


def test_street_overflow_unbatched_search_env_toggle(monkeypatch):
    """The flag is on when the env var is unset and off when it is set to ``"0"``."""
    env_var = "STREET_OVERFLOW_UNBATCHED_SEARCH"

    monkeypatch.delenv(env_var, raising=False)
    assert _street_overflow_unbatched_search_enabled() is True

    monkeypatch.setenv(env_var, "0")
    assert _street_overflow_unbatched_search_enabled() is False


def test_strip_runtime_fuzzy_cols_from_stand_cache():
    """``_fuzzy_block_seq`` is removed while an unrelated column is kept."""
    cached = pd.DataFrame({"a": [1], "_fuzzy_block_seq": [3]})
    stripped = _strip_runtime_fuzzy_cols_from_stand_cache(cached)
    remaining = stripped.columns
    assert "_fuzzy_block_seq" not in remaining
    assert "a" in remaining