| from pathlib import Path | |
| from scripts.validate_normalization import validate_file | |
def _write(path: Path, content: str) -> Path:
    """Store *content* at *path* as UTF-8 text and hand the same path back.

    Returning the path lets a test create a fixture file and bind it to a
    name in a single expression.
    """
    with path.open("w", encoding="utf-8") as handle:
        handle.write(content)
    return path
def test_validate_file_passes_with_valid_tsv(tmp_path: Path) -> None:
    """A file with the four-column header and one data row reports no errors."""
    header = "id\traw_text\tnormalized_text\tnote\n"
    row = "n001\tfoo\tfoo\tok\n"
    tsv_path = _write(tmp_path / "valid.tsv", header + row)

    # The validator is expected to return exactly an empty list here.
    assert validate_file(tsv_path) == []
def test_validate_file_fails_on_duplicate_id(tmp_path: Path) -> None:
    """Two rows sharing the id ``n001`` must yield a "duplicate id" error."""
    lines = [
        "id\traw_text\tnormalized_text\tnote\n",
        "n001\tfoo\tfoo\tok\n",
        "n001\tbar\tbar\tdup\n",  # same id as the previous row
    ]
    tsv_path = _write(tmp_path / "dup.tsv", "".join(lines))

    reported = validate_file(tsv_path)

    # At least one reported message has to mention the duplicate id.
    duplicate_messages = [message for message in reported if "duplicate id" in message]
    assert duplicate_messages
def test_validate_file_fails_on_missing_columns(tmp_path: Path) -> None:
    """A header lacking ``normalized_text`` is reported in the first error."""
    tsv_path = tmp_path / "missing.tsv"
    header = "id\traw_text\tnote\n"  # no normalized_text column
    row = "n001\tfoo\tmissing normalized\n"
    _write(tsv_path, header + row)

    reported = validate_file(tsv_path)

    # Check non-emptiness first so an empty result fails as an assertion
    # instead of raising IndexError on the lookup below.
    assert reported
    first_message = reported[0]
    assert "Missing required columns: normalized_text" in first_message