# File size: 5,930 Bytes (2edd0d5)
"""Regression tests for postcode blocker / duplicate-column handling."""
from types import SimpleNamespace
import pandas as pd
from fuzzy_address_matcher.fuzzy_match import add_fuzzy_block_sequence_col
from fuzzy_address_matcher.matcher_funcs import (
_column_has_usable_values,
_normalize_join_key_strings,
_postcode_batch_covered_search_keys_normalized,
_resolve_column_series,
_slice_frame_by_normalized_keys,
_street_overflow_unbatched_search_enabled,
_strip_runtime_fuzzy_cols_from_stand_cache,
_uncovered_search_key_values_for_street_overflow,
create_batch_ranges,
)
def test_column_has_usable_values_duplicate_label_first_column_nonempty():
    """Duplicate ``postcode_search`` labels: a populated first column is usable."""
    populated = pd.DataFrame({"postcode_search": ["nw16hr", ""]})
    blank = pd.DataFrame({"postcode_search": ["", ""]})
    frame = pd.concat([populated, blank], axis=1)

    # Side-by-side concat keeps both columns under the same label.
    labels = frame.columns.tolist()
    assert labels.count("postcode_search") == 2

    assert _column_has_usable_values(frame, "postcode_search") is True

    resolved = _resolve_column_series(frame, "postcode_search")
    assert resolved is not None
    assert resolved.iloc[0] == "nw16hr"
def test_column_has_usable_values_duplicate_label_all_empty():
    """Duplicate labels whose columns hold only empty strings are not usable."""
    frame = pd.concat(
        [
            pd.DataFrame({"postcode_search": ["", ""]}),
            pd.DataFrame({"postcode_search": ["", ""]}),
        ],
        axis=1,
    )
    usable = _column_has_usable_values(frame, "postcode_search")
    assert usable is False
def test_add_fuzzy_block_sequence_col_duplicate_postcode_search_labels():
    """A duplicated blocker label still yields per-postcode sequence numbers."""
    primary = pd.DataFrame(
        {"postcode_search": ["p1", "p1", "p2"], "idx": [0, 1, 2]}
    )
    duplicate = pd.DataFrame({"postcode_search": ["p1", "p1", "p2"]})
    frame = pd.concat([primary, duplicate], axis=1)

    result = add_fuzzy_block_sequence_col(frame, "postcode_search")

    # Compare in row-label order so the check does not depend on output order.
    ordered = result.sort_index()
    sequence = [int(ordered.loc[row, "_fuzzy_block_seq"]) for row in (0, 1, 2)]
    assert sequence == [0, 1, 0]
def test_slice_frame_by_normalized_keys_matches_key_column_not_positional_index():
    """Slice on the ``index`` key column, not on row position.

    After a Parquet reload the frame carries a RangeIndex, while the batch
    keys are the original join labels.
    """
    frame = pd.DataFrame(
        {
            "index": ["5000", "5001", "99999"],
            "postcode_search": ["a", "b", "c"],
        }
    )
    wanted_keys = [5000, 99999]

    sliced = _slice_frame_by_normalized_keys(frame, "index", wanted_keys)

    assert len(sliced) == 2
    matched_labels = set(sliced["index"].tolist())
    assert matched_labels == {"5000", "99999"}
def test_create_batch_ranges_uses_join_column_not_dataframe_index():
    """Batches list the ``search_df_key_field`` / ``ref_key_field`` ids, not the frame index."""
    search_frame = pd.DataFrame(
        {"postcode": ["AB1 2CD", "AB1 2CD"], "index": ["rowA", "rowB"]},
        index=[0, 1],
    )
    reference_frame = pd.DataFrame(
        {"Postcode": ["AB1 2CD"], "ref_index": [99]},
        index=[0],
    )

    ranges = create_batch_ranges(
        search_frame,
        reference_frame,
        5000,
        5000,
        "postcode",
        "Postcode",
        search_df_key_field="index",
        ref_key_field="ref_index",
    )

    # The first batch must carry the key-column values, not the 0/1 row labels.
    search_batches = ranges["search_range"]
    ref_batches = ranges["ref_range"]
    assert search_batches.iloc[0] == ["rowA", "rowB"]
    assert ref_batches.iloc[0] == [99]
def test_slice_frame_by_normalized_keys_large_labels_not_in_rangeindex():
    """Key labels larger than the frame length are found via the key column."""
    frame = pd.DataFrame({"index": ["35000", "35001"], "x": [1, 2]})
    search_range = [35000]

    # Filtering on the frame's own default index finds nothing: 35000 is not a row label.
    index_mask = frame.index.isin(search_range)
    by_frame_index = frame.loc[index_mask]
    assert len(by_frame_index) == 0

    by_key_column = _slice_frame_by_normalized_keys(frame, "index", search_range)
    assert len(by_key_column) == 1
    first_row = by_key_column.iloc[0]
    assert first_row["index"] == "35000"
def test_normalize_join_key_strings_int_float_string_align():
    """Int, float, float-like string and nullable-int keys all normalize to ``"6199"``."""
    mixed_keys = pd.Series([6199, 6199.0, "6199.0"], dtype=object)
    nullable_int_keys = pd.Series([6199], dtype="Int64")

    normalized_mixed = _normalize_join_key_strings(mixed_keys).tolist()
    normalized_int = _normalize_join_key_strings(nullable_int_keys).tolist()

    expected = {"6199"}
    assert set(normalized_mixed) == expected
    assert set(normalized_int) == expected
def test_postcode_batch_covered_keys_and_uncovered_search_values():
    """Keys listed in any ``search_range`` are covered; the rest are reported uncovered."""
    batch_ranges = pd.DataFrame({"search_range": [["k1"], ["k3"]]})

    covered = _postcode_batch_covered_search_keys_normalized(batch_ranges)
    assert covered == {"k1", "k3"}

    # A lightweight stand-in exposing only the two attributes set here.
    cleaned_search = pd.DataFrame({"index": ["k1", "k2", "k3"]})
    matcher_stub = SimpleNamespace(
        search_df_key_field="index",
        search_df_cleaned=cleaned_search,
    )

    uncovered = _uncovered_search_key_values_for_street_overflow(matcher_stub, covered)
    assert set(uncovered) == {"k2"}
def test_create_batch_ranges_omits_search_only_postcode_from_covered_keys():
    """A search row whose postcode has no reference match stays out of every batch.

    That row must then be the only value reported as uncovered.
    """
    search_frame = pd.DataFrame(
        {
            "postcode": ["AB1 2CD", "ZZ9 9ZZ"],
            "index": ["in_ref_pc", "search_only_pc"],
        }
    )
    reference_frame = pd.DataFrame(
        {
            "Postcode": ["AB1 2CD"],
            "ref_index": [0],
        }
    )

    # Copies are passed so the originals stay as built for the later check.
    batch_ranges = create_batch_ranges(
        search_frame.copy(),
        reference_frame.copy(),
        batch_size=1,
        ref_batch_size=1,
        search_postcode_col="postcode",
        ref_postcode_col="Postcode",
        search_df_key_field="index",
        ref_key_field="ref_index",
    )

    covered = _postcode_batch_covered_search_keys_normalized(batch_ranges)
    assert "search_only_pc" not in covered
    assert "in_ref_pc" in covered

    matcher_stub = SimpleNamespace(
        search_df_key_field="index",
        search_df_cleaned=search_frame,
    )
    uncovered = _uncovered_search_key_values_for_street_overflow(matcher_stub, covered)
    assert uncovered == ["search_only_pc"]
def test_street_overflow_unbatched_search_env_toggle(monkeypatch):
    """The flag is on when the env var is unset and off when it is ``"0"``."""
    env_var = "STREET_OVERFLOW_UNBATCHED_SEARCH"

    # Unset: the helper reports the feature as enabled.
    monkeypatch.delenv(env_var, raising=False)
    enabled_when_unset = _street_overflow_unbatched_search_enabled()
    assert enabled_when_unset is True

    # Explicit "0": the helper reports the feature as disabled.
    monkeypatch.setenv(env_var, "0")
    enabled_when_zero = _street_overflow_unbatched_search_enabled()
    assert enabled_when_zero is False
def test_strip_runtime_fuzzy_cols_from_stand_cache():
    """``_fuzzy_block_seq`` is removed from the frame while column ``a`` is kept."""
    cached = pd.DataFrame({"a": [1], "_fuzzy_block_seq": [3]})

    stripped = _strip_runtime_fuzzy_cols_from_stand_cache(cached)

    remaining = stripped.columns
    assert "_fuzzy_block_seq" not in remaining
    assert "a" in remaining
|