Spaces:
Running
Running
| """Comprehensive tests for the public DiffCompressor API. | |
| Tests cover: | |
| 1. Context line reduction | |
| 2. Hunk selection and limiting | |
| 3. Compression ratios | |
| 4. Edge cases | |
| 5. Bug-fix regressions and routing-gap fixtures | |
| Stage 3b note (2026-04-25): the Python `DiffCompressor` implementation | |
| was retired in favor of the Rust-backed shim (`headroom._core` via PyO3). | |
| Tests that probed Python-only internals — `_parse_diff`, `_score_hunks`, | |
| the `DiffHunk` / `DiffFile` parser dataclasses — were removed because | |
| the Rust crate has its own parallel coverage in | |
| `crates/headroom-core/tests`. Public-API tests (anything calling | |
| `compressor.compress(...)`) are preserved unchanged: they exercise the | |
| Rust backend through the same import path and assert the same outputs. | |
| """ | |
| from headroom.transforms.diff_compressor import ( | |
| DiffCompressionResult, | |
| DiffCompressor, | |
| DiffCompressorConfig, | |
| ) | |
class TestContextReduction:
    """Context trimming: surplus context goes, every +/- line stays."""

    def test_reduce_context_lines(self):
        """Context around a change is cut down to the configured maximum."""
        diff_text = (
            "diff --git a/file.py b/file.py\n"
            "--- a/file.py\n"
            "+++ b/file.py\n"
            "@@ -1,10 +1,11 @@\n"
            " context1\n"
            " context2\n"
            " context3\n"
            " context4\n"
            "+added\n"
            " context5\n"
            " context6\n"
            " context7\n"
            " context8\n"
        )
        # max_context_lines is pinned to 2 here so the bound below is explicit.
        settings = DiffCompressorConfig(
            max_context_lines=2,
            min_lines_for_ccr=5,
            enable_ccr=False,
        )
        result = DiffCompressor(config=settings).compress(diff_text)

        # One change line, so at most 2 context lines before it and 2 after.
        kept_context = sum(
            row.startswith(" ") for row in result.compressed.split("\n")
        )
        assert kept_context <= 4

    def test_preserve_all_changes(self):
        """Every addition and deletion line is still present after compression."""
        diff_text = (
            "diff --git a/file.py b/file.py\n"
            "--- a/file.py\n"
            "+++ b/file.py\n"
            "@@ -1,10 +1,10 @@\n"
            " ctx1\n"
            " ctx2\n"
            "-removed1\n"
            "+added1\n"
            " ctx3\n"
            " ctx4\n"
            "-removed2\n"
            "+added2\n"
            " ctx5\n"
            " ctx6\n"
        )
        settings = DiffCompressorConfig(min_lines_for_ccr=5, enable_ccr=False)
        result = DiffCompressor(config=settings).compress(diff_text)

        for change_line in ("-removed1", "-removed2", "+added1", "+added2"):
            assert change_line in result.compressed
class TestHunkSelection:
    """Tests for hunk selection when limiting."""

    @staticmethod
    def _single_file_diff(path, hunk_total):
        """Build a one-file unified diff made of ``hunk_total`` small hunks.

        Hunk ``i`` adds exactly one line, ``+added_{i}``, between two context
        lines, so tests can tell which hunks survived compression.
        """
        header = f"diff --git a/{path} b/{path}\n--- a/{path}\n+++ b/{path}\n"
        hunks = [
            f"@@ -{i * 10},3 +{i * 10},4 @@\n context\n+added_{i}\n more\n"
            for i in range(hunk_total)
        ]
        return header + "".join(hunks)

    def test_max_hunks_per_file(self):
        """Hunks are limited to max_hunks_per_file."""
        content = self._single_file_diff("bigfile.py", 20)
        compressor = DiffCompressor(
            config=DiffCompressorConfig(
                max_hunks_per_file=5,
                min_lines_for_ccr=10,
                enable_ccr=False,
            )
        )
        result = compressor.compress(content)

        # Count hunk header *lines*, not "@@" substrings: a header contains
        # the marker twice, so a substring count only matches the limit by
        # doubling it and would also pick up any "@@" elsewhere in the output.
        hunk_headers = [
            line for line in result.compressed.splitlines() if line.startswith("@@")
        ]
        assert len(hunk_headers) <= 5

    def test_keeps_first_and_last_hunk(self):
        """First and last hunks are preserved when limiting."""
        content = self._single_file_diff("file.py", 10)
        compressor = DiffCompressor(
            config=DiffCompressorConfig(
                max_hunks_per_file=3,
                min_lines_for_ccr=10,
                enable_ccr=False,
            )
        )
        result = compressor.compress(content)

        # First hunk (added_0) should be present
        assert "+added_0" in result.compressed
        # Last hunk (added_9) should be present
        assert "+added_9" in result.compressed
class TestFileSelection:
    """Checks the cap on how many files survive compression."""

    def test_max_files(self):
        """No more than max_files file sections remain in the output."""
        # Thirty single-hunk files, three times the configured cap.
        sections = [
            f"diff --git a/file{i}.py b/file{i}.py\n"
            f"--- a/file{i}.py\n"
            f"+++ b/file{i}.py\n"
            "@@ -1,2 +1,3 @@\n"
            " ctx\n"
            "+added\n"
            " ctx2\n"
            for i in range(30)
        ]
        diff_text = "\n".join(sections)

        settings = DiffCompressorConfig(
            max_files=10,
            min_lines_for_ccr=20,
            enable_ccr=False,
        )
        result = DiffCompressor(config=settings).compress(diff_text)

        # Each surviving file contributes one "diff --git" header.
        surviving_files = result.compressed.count("diff --git")
        assert surviving_files <= 10
class TestCompressionResult:
    """Checks the derived properties exposed by DiffCompressionResult."""

    def test_compression_ratio_calculation(self):
        """compression_ratio is 0.1 for 10 compressed lines out of 100."""
        fields = {
            "compressed": "a\nb\nc",
            "original_line_count": 100,
            "compressed_line_count": 10,
            "files_affected": 2,
            "additions": 5,
            "deletions": 3,
            "hunks_kept": 2,
            "hunks_removed": 5,
        }
        outcome = DiffCompressionResult(**fields)
        assert outcome.compression_ratio == 0.1

    def test_tokens_saved_estimate(self):
        """tokens_saved_estimate is 900 when 90 of 100 lines are dropped."""
        fields = {
            "compressed": "short",
            "original_line_count": 100,
            "compressed_line_count": 10,
            "files_affected": 1,
            "additions": 10,
            "deletions": 5,
            "hunks_kept": 1,
            "hunks_removed": 0,
        }
        outcome = DiffCompressionResult(**fields)
        # 90 lines saved * 40 chars/line / 4 chars/token = 900 tokens
        assert outcome.tokens_saved_estimate == 900
class TestSmallDiffPassthrough:
    """Checks that diffs under the size threshold are returned untouched."""

    def test_small_diff_unchanged(self):
        """A diff shorter than min_lines_for_ccr comes back byte-for-byte."""
        diff_text = (
            "diff --git a/small.py b/small.py\n"
            "--- a/small.py\n"
            "+++ b/small.py\n"
            "@@ -1,2 +1,3 @@\n"
            " line1\n"
            "+added\n"
            " line2\n"
        )
        # Threshold far above the seven-line input.
        settings = DiffCompressorConfig(min_lines_for_ccr=100)
        result = DiffCompressor(config=settings).compress(diff_text)

        assert result.compressed == diff_text
        assert result.compression_ratio == 1.0
class TestOutputFormatting:
    """Checks the shape of the text the compressor emits."""

    def test_summary_line_added(self):
        """The compressed diff carries a summary mentioning files and omitted hunks."""
        # Fifteen hunks against a cap of five, so some hunks must be omitted.
        hunk_texts = [
            f"@@ -{i * 10},5 +{i * 10},6 @@\n"
            " ctx1\n"
            " ctx2\n"
            f"+added_{i}\n"
            " ctx3\n"
            " ctx4\n"
            for i in range(15)
        ]
        diff_text = (
            "diff --git a/file.py b/file.py\n"
            "--- a/file.py\n"
            "+++ b/file.py\n"
        ) + "".join(hunk_texts)

        settings = DiffCompressorConfig(
            max_hunks_per_file=5,
            min_lines_for_ccr=10,
            enable_ccr=False,
        )
        result = DiffCompressor(config=settings).compress(diff_text)

        for phrase in ("files changed", "hunks omitted"):
            assert phrase in result.compressed

    def test_preserves_diff_format(self):
        """The standard unified-diff markers are all still present in the output."""
        diff_text = (
            "diff --git a/test.py b/test.py\n"
            "--- a/test.py\n"
            "+++ b/test.py\n"
            "@@ -1,3 +1,4 @@\n"
            " def test():\n"
            "+ # new comment\n"
            " pass\n"
            " return True\n"
        )
        settings = DiffCompressorConfig(min_lines_for_ccr=5, enable_ccr=False)
        result = DiffCompressor(config=settings).compress(diff_text)

        for marker in ("diff --git", "---", "+++", "@@"):
            assert marker in result.compressed
class TestEdgeCases:
    """Boundary inputs: empty, non-diff, unicode, EOF marker, header-only."""

    def test_empty_input(self):
        """An empty string comes back empty with a ratio of 1.0."""
        result = DiffCompressor().compress("")
        assert result.compressed == ""
        assert result.compression_ratio == 1.0

    def test_non_diff_input(self):
        """Text containing no diff header is returned as-is."""
        plain_text = "This is not a diff\nJust regular text"
        result = DiffCompressor().compress(plain_text)
        # No "diff --git" in the input, so nothing should be rewritten.
        assert result.compressed == plain_text

    def test_unicode_content(self):
        """Non-ASCII characters on an added line survive compression."""
        diff_text = (
            "diff --git a/i18n.py b/i18n.py\n"
            "--- a/i18n.py\n"
            "+++ b/i18n.py\n"
            "@@ -1,2 +1,3 @@\n"
            ' msg = "hello"\n'
            '+msg_ja = "こんにちは"\n'
            " return msg\n"
        )
        result = DiffCompressor().compress(diff_text)
        assert "こんにちは" in result.compressed

    def test_no_newline_at_eof(self):
        """A diff carrying the 'No newline at end of file' marker is accepted."""
        diff_text = (
            "diff --git a/file.py b/file.py\n"
            "--- a/file.py\n"
            "+++ b/file.py\n"
            "@@ -1,2 +1,2 @@\n"
            " line1\n"
            "-line2\n"
            "\\ No newline at end of file\n"
            "+line2_modified\n"
            "\\ No newline at end of file\n"
        )
        result = DiffCompressor().compress(diff_text)
        # Passes when either the marker or the deletion line is in the output.
        output = result.compressed
        assert "No newline" in output or "-line2" in output

    def test_empty_hunks(self):
        """A file section with headers but no hunks does not break compress()."""
        diff_text = (
            "diff --git a/file.py b/file.py\n"
            "--- a/file.py\n"
            "+++ b/file.py\n"
        )
        result = DiffCompressor().compress(diff_text)
        # Reaching this line without an exception is the main check.
        assert result.compressed is not None
class TestConfigOptions:
    """Checks that individual DiffCompressorConfig fields take effect."""

    def test_max_context_lines_config(self):
        """max_context_lines=1 leaves at most one context line on each side."""
        diff_text = (
            "diff --git a/file.py b/file.py\n"
            "--- a/file.py\n"
            "+++ b/file.py\n"
            "@@ -1,10 +1,11 @@\n"
            " c1\n"
            " c2\n"
            " c3\n"
            " c4\n"
            " c5\n"
            "+added\n"
            " c6\n"
            " c7\n"
            " c8\n"
            " c9\n"
            " c10\n"
        )
        settings = DiffCompressorConfig(
            max_context_lines=1,
            min_lines_for_ccr=5,
            enable_ccr=False,
        )
        result = DiffCompressor(config=settings).compress(diff_text)

        # Context lines are the ones starting with a space; 1 before + 1 after.
        kept_context = sum(
            row.startswith(" ") for row in result.compressed.split("\n")
        )
        assert kept_context <= 2

    def test_always_keep_additions_default(self):
        """With always_keep_additions=True both added lines are in the output."""
        diff_text = (
            "diff --git a/file.py b/file.py\n"
            "--- a/file.py\n"
            "+++ b/file.py\n"
            "@@ -1,3 +1,5 @@\n"
            " ctx\n"
            "+add1\n"
            "+add2\n"
            " ctx\n"
        )
        settings = DiffCompressorConfig(
            always_keep_additions=True,
            min_lines_for_ccr=2,
            enable_ccr=False,
        )
        result = DiffCompressor(config=settings).compress(diff_text)

        for added_line in ("+add1", "+add2"):
            assert added_line in result.compressed

    def test_always_keep_deletions_default(self):
        """With always_keep_deletions=True both removed lines are in the output."""
        diff_text = (
            "diff --git a/file.py b/file.py\n"
            "--- a/file.py\n"
            "+++ b/file.py\n"
            "@@ -1,5 +1,3 @@\n"
            " ctx\n"
            "-del1\n"
            "-del2\n"
            " ctx\n"
        )
        settings = DiffCompressorConfig(
            always_keep_deletions=True,
            min_lines_for_ccr=2,
            enable_ccr=False,
        )
        result = DiffCompressor(config=settings).compress(diff_text)

        for removed_line in ("-del1", "-del2"):
            assert removed_line in result.compressed
| # ─── Bug-fix tests (2026-04-25): four silent information-loss paths ───────── | |
| # | |
| # Before the fix, the parser captured these patterns but the emitter dropped | |
| # them, or the regex didn't match them at all. Each test exercises one of | |
| # the four paths the same way the Rust unit tests do. | |
def _cfg_below_threshold():
    """Small config so the parser+emitter actually run on test inputs.

    Returns:
        A ``DiffCompressorConfig`` with ``min_lines_for_ccr=5`` and every
        other field left at its default.
    """
    # DiffCompressorConfig comes from the module-level import at the top of
    # this file; no function-local re-import is needed.
    return DiffCompressorConfig(min_lines_for_ccr=5)
class TestBugfixRenamePreservation:
    """rename/similarity/dissimilarity/copy markers were captured into
    is_renamed=True and then dropped by the emitter. Output looked like a
    plain modification of the old path."""

    # DiffCompressor is taken from the module-level import at the top of this
    # file rather than being re-imported inside every test.

    def test_rename_with_similarity_index_preserved(self):
        """`similarity index` and `rename from`/`rename to` lines reach the output."""
        diff = (
            "diff --git a/old.py b/new.py\n"
            "similarity index 92%\n"
            "rename from old.py\n"
            "rename to new.py\n"
            "--- a/old.py\n"
            "+++ b/new.py\n"
            "@@ -1,3 +1,3 @@\n"
            " ctx_a\n"
            "-old\n"
            "+new\n"
            " ctx_b\n"
        )
        result = DiffCompressor(_cfg_below_threshold()).compress(diff)
        assert "similarity index 92%" in result.compressed
        assert "rename from old.py" in result.compressed
        assert "rename to new.py" in result.compressed

    def test_dissimilarity_index_preserved(self):
        """A `dissimilarity index` line reaches the output."""
        diff = (
            "diff --git a/x.py b/y.py\n"
            "dissimilarity index 60%\n"
            "rename from x.py\n"
            "rename to y.py\n"
            "--- a/x.py\n"
            "+++ b/y.py\n"
            "@@ -1 +1 @@\n"
            "-a\n"
            "+b\n"
        )
        result = DiffCompressor(_cfg_below_threshold()).compress(diff)
        assert "dissimilarity index 60%" in result.compressed

    def test_copy_markers_preserved(self):
        """`copy from`/`copy to` lines reach the output."""
        diff = (
            "diff --git a/orig.py b/dup.py\n"
            "similarity index 100%\n"
            "copy from orig.py\n"
            "copy to dup.py\n"
            "--- a/orig.py\n"
            "+++ b/dup.py\n"
            "@@ -1 +1 @@\n"
            "-old\n"
            "+new\n"
        )
        result = DiffCompressor(_cfg_below_threshold()).compress(diff)
        assert "copy from orig.py" in result.compressed
        assert "copy to dup.py" in result.compressed
class TestBugfixCombinedDiff:
    """Combined-diff `@@@` hunks from merge commits had ALL content silently
    dropped because the regex hardcoded `@@`."""

    def test_3way_combined_diff_content_preserved(self):
        """A `@@@` hunk header and its `++` line reach the output."""
        # DiffCompressor comes from the module-level import at the top of
        # this file; the per-test re-import was redundant.
        diff = (
            "diff --git a/merge.py b/merge.py\n"
            "--- a/merge.py\n"
            "+++ b/merge.py\n"
            "@@@ -1,3 -1,3 +1,4 @@@\n"
            " unchanged_a\n"
            "- old_branch_1\n"
            " -old_branch_2\n"
            "++new_in_merge\n"
            " +new_added\n"
            " unchanged_b\n"
        )
        result = DiffCompressor(_cfg_below_threshold()).compress(diff)
        assert "@@@ -1,3 -1,3 +1,4 @@@" in result.compressed
        assert "++new_in_merge" in result.compressed
        # The file section must have been recognised as a file at all.
        assert result.files_affected > 0
class TestBugfixNoNewlineMarker:
    r"""`\ No newline at end of file` got dropped by context trim whenever it
    was further than max_context_lines from a +/- change."""

    def test_no_newline_marker_survives_distance(self):
        """The marker is kept even with six context lines between it and the change."""
        # DiffCompressor comes from the module-level import at the top of
        # this file; the per-test re-import was redundant.
        diff = (
            "diff --git a/last.txt b/last.txt\n"
            "--- a/last.txt\n"
            "+++ b/last.txt\n"
            "@@ -1,8 +1,8 @@\n"
            "-old_first\n"
            "+new_first\n"
            " ctx_a\n"
            " ctx_b\n"
            " ctx_c\n"
            " ctx_d\n"
            " ctx_e\n"
            " ctx_f\n"
            "\\ No newline at end of file\n"
        )
        result = DiffCompressor(_cfg_below_threshold()).compress(diff)
        assert "\\ No newline at end of file" in result.compressed
class TestBugfixPreDiffContent:
    """Anything before the first `diff --git` (commit headers, email-style
    metadata) was silently dropped."""

    # DiffCompressor is taken from the module-level import at the top of this
    # file rather than being re-imported inside every test.

    def test_commit_header_preserved(self):
        """Commit metadata ahead of the first file header stays at the top of the output."""
        diff = (
            "commit abc1234567890abcdef\n"
            "Author: Tester <t@example.com>\n"
            "Date: Mon Apr 25 12:00:00 2026\n"
            "\n"
            " Refactor: rename and modify\n"
            "\n"
            "diff --git a/x.py b/x.py\n"
            "--- a/x.py\n"
            "+++ b/x.py\n"
            "@@ -1 +1 @@\n"
            "-a\n"
            "+b\n"
        )
        result = DiffCompressor(_cfg_below_threshold()).compress(diff)
        # The preamble must lead the output, not merely appear somewhere in it.
        assert result.compressed.startswith("commit abc1234567890abcdef")
        assert "Author: Tester" in result.compressed
        assert "Refactor: rename and modify" in result.compressed
        # The diff itself must still follow the preamble.
        assert "diff --git a/x.py b/x.py" in result.compressed
        assert "-a" in result.compressed
        assert "+b" in result.compressed

    def test_no_pre_diff_content_does_not_add_blank_line(self):
        """Edge case: when there's no pre-diff content, output must NOT
        gain a leading blank line from a stray empty-list prepend."""
        diff = "diff --git a/x.py b/x.py\n--- a/x.py\n+++ b/x.py\n@@ -1 +1 @@\n-a\n+b\n"
        result = DiffCompressor(_cfg_below_threshold()).compress(diff)
        assert result.compressed.startswith("diff --git a/x.py b/x.py")
class TestRoutingGapMergeDiffs:
    """Routing gap (2026-04-25 follow-up): ContentRouter detects diff inputs
    and routes them to DiffCompressor, but the parser previously only knew
    the `diff --git` shape. Merge-commit diffs from `git log -p` use
    `diff --combined <path>` or `diff --cc <path>` and were treated as
    non-diff blobs and passed through unchanged.
    """

    # DiffCompressor is taken from the module-level import at the top of this
    # file rather than being re-imported inside every test.

    def test_diff_combined_header_starts_a_file_section(self):
        """A `diff --combined` header is counted as one file and kept in the output."""
        diff = (
            "diff --combined merge_target.py\n"
            "index abc..def..ghi 100644\n"
            "--- a/merge_target.py\n"
            "+++ b/merge_target.py\n"
            "@@@ -1,3 -1,3 +1,4 @@@\n"
            " unchanged_a\n"
            "- old_p1\n"
            " -old_p2\n"
            "++new_in_merge\n"
            " unchanged_b\n"
        )
        result = DiffCompressor(_cfg_below_threshold()).compress(diff)
        assert result.files_affected == 1
        assert "diff --combined merge_target.py" in result.compressed
        assert "@@@ -1,3 -1,3 +1,4 @@@" in result.compressed
        assert "++new_in_merge" in result.compressed

    def test_diff_cc_header_starts_a_file_section(self):
        """A `diff --cc` header is counted as one file and kept in the output."""
        diff = (
            "diff --cc cc_target.py\n"
            "index abc..def..ghi\n"
            "--- a/cc_target.py\n"
            "+++ b/cc_target.py\n"
            "@@@ -1,3 -1,3 +1,4 @@@\n"
            " ctx\n"
            "- removed_p1\n"
            " -removed_p2\n"
            "++added_in_merge\n"
            " more_ctx\n"
        )
        result = DiffCompressor(_cfg_below_threshold()).compress(diff)
        assert result.files_affected == 1
        assert "diff --cc cc_target.py" in result.compressed
        assert "++added_in_merge" in result.compressed
class TestRoutingGapDetectorScanWindow:
    """Routing gap (2026-04-25 follow-up): `_try_detect_diff` only scanned
    the first 50 lines, so `git log -p` outputs with long commit messages
    pushed the diff past the detection window — input was misrouted away
    from DiffCompressor entirely. Window widened to 500 lines.
    """

    def test_detect_picks_up_diff_after_long_commit_message(self):
        """A diff that starts after 60+ lines of commit text is still detected."""
        from headroom.transforms.content_detector import (
            ContentType,
            detect_content_type,
        )

        # Four header lines plus 60 message lines push `diff --git` beyond
        # line 50, where the old scan window stopped.
        preamble = [
            "commit abc123",
            "Author: Tester <t@example.com>",
            "Date: Mon Apr 25 12:00:00 2026",
            "",
        ]
        preamble.extend(f" msg line {i}" for i in range(60))
        file_diff = (
            "diff --git a/x.py b/x.py\n--- a/x.py\n+++ b/x.py\n@@ -1 +1 @@\n-old\n+new\n"
        )
        payload = "\n".join(preamble) + "\n\n" + file_diff

        detection = detect_content_type(payload)
        assert detection.content_type == ContentType.GIT_DIFF
        assert detection.confidence >= 0.7

    def test_detect_recognizes_combined_diff_headers(self):
        """The detector also gained recognition for combined-diff hunk
        headers (`@@@`+) — useful when the only signal in a snippet is
        the merge-style hunk."""
        from headroom.transforms.content_detector import (
            ContentType,
            detect_content_type,
        )

        # A complete merge diff: the `--- a/` line it shares with regular
        # diffs acts as a second, belt-and-suspenders signal.
        payload = (
            "diff --combined m.py\n--- a/m.py\n+++ b/m.py\n@@@ -1,2 -1,2 +1,3 @@@\n ctx\n++added\n"
        )
        detection = detect_content_type(payload)
        assert detection.content_type == ContentType.GIT_DIFF