fuzzy_address_matcher / test /test_postcode_blocker_regression.py
seanpedrickcase's picture
Updated joins to use more reliable index matching where possible. Results output should now show the original search addresses. Increased the number of addresses used for the street matcher step when postcode blocking is used.
2edd0d5
"""Regression tests for postcode blocker / duplicate-column handling."""
from types import SimpleNamespace
import pandas as pd
from fuzzy_address_matcher.fuzzy_match import add_fuzzy_block_sequence_col
from fuzzy_address_matcher.matcher_funcs import (
_column_has_usable_values,
_normalize_join_key_strings,
_postcode_batch_covered_search_keys_normalized,
_resolve_column_series,
_slice_frame_by_normalized_keys,
_street_overflow_unbatched_search_enabled,
_strip_runtime_fuzzy_cols_from_stand_cache,
_uncovered_search_key_values_for_street_overflow,
create_batch_ranges,
)
def test_column_has_usable_values_duplicate_label_first_column_nonempty():
    """Duplicate ``postcode_search`` labels: a populated first column is detected.

    The frame holds two columns with the same label; only the first has a
    non-empty value. The helper must report usable values, and the resolved
    series must expose that value in its first position.
    """
    label = "postcode_search"
    populated = pd.DataFrame({label: ["nw16hr", ""]})
    blank = pd.DataFrame({label: ["", ""]})
    frame = pd.concat([populated, blank], axis=1)

    # Sanity check: the concat really produced a duplicated column label.
    assert frame.columns.tolist().count(label) == 2

    assert _column_has_usable_values(frame, label) is True

    resolved = _resolve_column_series(frame, label)
    assert resolved is not None
    assert resolved.iloc[0] == "nw16hr"
def test_column_has_usable_values_duplicate_label_all_empty():
    """Duplicate ``postcode_search`` labels with only empty strings are not usable."""
    label = "postcode_search"
    # Two identical all-blank frames side by side give a duplicated label.
    blank_frames = [pd.DataFrame({label: ["", ""]}) for _ in range(2)]
    frame = pd.concat(blank_frames, axis=1)

    assert _column_has_usable_values(frame, label) is False
def test_add_fuzzy_block_sequence_col_duplicate_postcode_search_labels():
    """Block sequence numbering still works when ``postcode_search`` is duplicated.

    Rows 0 and 1 share postcode ``p1`` and row 2 has ``p2``, so the expected
    ``_fuzzy_block_seq`` values by row label are ``[0, 1, 0]``.
    """
    with_idx = pd.DataFrame(
        {"postcode_search": ["p1", "p1", "p2"], "idx": [0, 1, 2]}
    )
    duplicate = pd.DataFrame({"postcode_search": ["p1", "p1", "p2"]})
    frame = pd.concat([with_idx, duplicate], axis=1)

    result = add_fuzzy_block_sequence_col(frame, "postcode_search")
    ordered = result.sort_index()

    observed = [int(ordered.loc[row, "_fuzzy_block_seq"]) for row in (0, 1, 2)]
    assert observed == [0, 1, 0]
def test_slice_frame_by_normalized_keys_matches_key_column_not_positional_index():
    """Slicing matches values in the key column, not the frame's default index.

    The frame has a default index (0..2) while the ``index`` column holds the
    string labels; integer keys 5000 and 99999 must select the rows whose key
    column is ``"5000"`` and ``"99999"``.
    """
    frame = pd.DataFrame(
        {
            "index": ["5000", "5001", "99999"],
            "postcode_search": ["a", "b", "c"],
        }
    )
    wanted_keys = [5000, 99999]

    selected = _slice_frame_by_normalized_keys(frame, "index", wanted_keys)

    assert len(selected) == 2
    assert set(selected["index"].tolist()) == {"5000", "99999"}
def test_create_batch_ranges_uses_join_column_not_dataframe_index():
    """Batch ranges list key-column values rather than DataFrame index labels.

    Both frames carry a plain integer index that differs from their key
    columns (``index`` / ``ref_index``); the returned ranges must contain
    the key-column values.
    """
    search_frame = pd.DataFrame(
        {
            "postcode": ["AB1 2CD", "AB1 2CD"],
            "index": ["rowA", "rowB"],
        },
        index=[0, 1],
    )
    reference_frame = pd.DataFrame(
        {
            "Postcode": ["AB1 2CD"],
            "ref_index": [99],
        },
        index=[0],
    )

    ranges = create_batch_ranges(
        search_frame,
        reference_frame,
        5000,
        5000,
        "postcode",
        "Postcode",
        search_df_key_field="index",
        ref_key_field="ref_index",
    )

    first_batch = ranges.iloc[0]
    assert first_batch["search_range"] == ["rowA", "rowB"]
    assert first_batch["ref_range"] == [99]
def test_slice_frame_by_normalized_keys_large_labels_not_in_rangeindex():
    """Key-column slicing finds a label that index-based filtering misses.

    The key 35000 is absent from the frame's default index, so filtering on
    ``df.index`` returns nothing, whereas slicing on the ``index`` column
    returns the row labelled ``"35000"``.
    """
    frame = pd.DataFrame(
        {"index": ["35000", "35001"], "x": [1, 2]},
    )
    keys = [35000]

    # Index-based selection: demonstrates the approach that finds no rows.
    by_frame_index = frame.loc[frame.index.isin(keys)]
    assert len(by_frame_index) == 0

    by_key_column = _slice_frame_by_normalized_keys(frame, "index", keys)
    assert len(by_key_column) == 1
    assert by_key_column.iloc[0]["index"] == "35000"
def test_normalize_join_key_strings_int_float_string_align():
    """Int, float, and float-like string keys all normalise to ``"6199"``."""
    mixed_objects = pd.Series([6199, 6199.0, "6199.0"], dtype=object)
    nullable_ints = pd.Series([6199], dtype="Int64")

    # Normalise both inputs up front, then compare the distinct results.
    from_mixed, from_nullable = (
        set(_normalize_join_key_strings(series).tolist())
        for series in (mixed_objects, nullable_ints)
    )

    assert from_mixed == {"6199"}
    assert from_nullable == {"6199"}
def test_postcode_batch_covered_keys_and_uncovered_search_values():
    """Covered keys come from ``search_range``; the rest are reported uncovered.

    Batches cover ``k1`` and ``k3``; with search keys ``k1``..``k3`` the only
    uncovered key is ``k2``.
    """
    batches = pd.DataFrame({"search_range": [["k1"], ["k3"]]})
    covered = _postcode_batch_covered_search_keys_normalized(batches)
    assert covered == {"k1", "k3"}

    # Minimal stand-in exposing only the two attributes set here.
    search_frame = pd.DataFrame({"index": ["k1", "k2", "k3"]})
    fake_matcher = SimpleNamespace(
        search_df_key_field="index",
        search_df_cleaned=search_frame,
    )

    uncovered = _uncovered_search_key_values_for_street_overflow(
        fake_matcher, covered
    )
    assert set(uncovered) == {"k2"}
def test_create_batch_ranges_omits_search_only_postcode_from_covered_keys():
    """A search row whose postcode is absent from the reference is not batched.

    ``ZZ9 9ZZ`` exists only in the search data, so its key must be missing
    from the covered set and must be the single uncovered key returned.
    """
    search_frame = pd.DataFrame(
        {
            "postcode": ["AB1 2CD", "ZZ9 9ZZ"],
            "index": ["in_ref_pc", "search_only_pc"],
        }
    )
    reference_frame = pd.DataFrame(
        {
            "Postcode": ["AB1 2CD"],
            "ref_index": [0],
        }
    )

    # Copies are passed in; the original search frame is reused below.
    batches = create_batch_ranges(
        search_frame.copy(),
        reference_frame.copy(),
        batch_size=1,
        ref_batch_size=1,
        search_postcode_col="postcode",
        ref_postcode_col="Postcode",
        search_df_key_field="index",
        ref_key_field="ref_index",
    )

    covered = _postcode_batch_covered_search_keys_normalized(batches)
    assert "search_only_pc" not in covered
    assert "in_ref_pc" in covered

    fake_matcher = SimpleNamespace(
        search_df_key_field="index",
        search_df_cleaned=search_frame,
    )
    uncovered = _uncovered_search_key_values_for_street_overflow(
        fake_matcher, covered
    )
    assert uncovered == ["search_only_pc"]
def test_street_overflow_unbatched_search_env_toggle(monkeypatch):
    """``STREET_OVERFLOW_UNBATCHED_SEARCH`` unset means on; ``"0"`` means off."""
    env_name = "STREET_OVERFLOW_UNBATCHED_SEARCH"

    # Variable absent: the feature is reported as enabled.
    monkeypatch.delenv(env_name, raising=False)
    assert _street_overflow_unbatched_search_enabled() is True

    # Variable set to "0": the feature is reported as disabled.
    monkeypatch.setenv(env_name, "0")
    assert _street_overflow_unbatched_search_enabled() is False
def test_strip_runtime_fuzzy_cols_from_stand_cache():
    """Stripping drops ``_fuzzy_block_seq`` and keeps unrelated columns."""
    cached = pd.DataFrame({"a": [1], "_fuzzy_block_seq": [3]})

    stripped = _strip_runtime_fuzzy_cols_from_stand_cache(cached)
    remaining = stripped.columns

    assert "_fuzzy_block_seq" not in remaining
    assert "a" in remaining