"""Edge-case tests for ``clean_data`` and ``_read_any``.

Covers zero-byte uploads, header-only files, colliding column headers, and
a regression check against the bundled sample export.
"""

import pandas as pd

from server import _read_any, clean_data
from scrubdata import apply_plan, mock_plan


def _assert_no_rows(outcome):
    """Assert that *outcome* reports empty previews and a zero row count."""
    assert outcome["before"] == []
    assert outcome["after"] == []
    assert outcome["total_rows_before"] == 0


def test_clean_data_handles_zero_byte_upload(tmp_path):
    """A zero-byte file yields empty previews and a read-failure summary."""
    upload = tmp_path / "empty.csv"
    upload.write_bytes(b"")

    outcome = clean_data(str(upload), use_model=False)

    _assert_no_rows(outcome)
    assert "Couldn't read this file" in outcome["summary"]


def test_clean_data_handles_headers_without_rows(tmp_path):
    """A header-only file yields empty previews and the 'looks empty' summary."""
    upload = tmp_path / "headers.csv"
    upload.write_text("name,email\n")

    outcome = clean_data(str(upload), use_model=False)

    _assert_no_rows(outcome)
    expected_summary = "That file looks empty — no rows or columns to clean."
    assert outcome["summary"] == expected_summary


def test_sanitized_header_suffixes_remain_unique(tmp_path):
    """Headers ``a``, ``a.1`` and `` a`` must load as three distinct columns.

    The third header is expected to come back as ``a.2`` instead of
    duplicating either of the first two, and cleaning the file must not
    report a generic failure.
    """
    unique_headers = ["a", "a.1", "a.2"]
    upload = tmp_path / "colliding_headers.csv"
    upload.write_text("a,a.1, a\nx,y,z\n")

    # Read first, then clean, so both code paths see the same file on disk.
    frame = _read_any(str(upload))
    outcome = clean_data(str(upload), use_model=False)

    assert list(frame.columns) == unique_headers
    assert frame.columns.is_unique
    assert outcome["columns_before"] == unique_headers
    assert outcome["total_rows_before"] == 1
    assert "Something went wrong while cleaning" not in outcome["summary"]


def test_maria_sample_cleaning_is_preserved():
    """Loading the sample via ``_read_any`` cleans the same as plain pandas.

    The sample is read once with ``pd.read_csv`` (all strings, no NA
    inference) and once through ``_read_any``; applying the mock plan to
    each must give equal frames and equal logs.
    """
    sample = "samples/maria_crm_export.csv"

    baseline = pd.read_csv(sample, dtype=str, keep_default_na=False)
    loaded = _read_any(sample)

    expected_frame, expected_log = apply_plan(baseline, mock_plan(baseline))
    actual_frame, actual_log = apply_plan(loaded, mock_plan(loaded))

    pd.testing.assert_frame_equal(actual_frame, expected_frame)
    assert actual_log == expected_log