# headroom/tests/test_transforms/test_diff_compressor.py
# chopratejas: feat(rust): retire python diff_compressor, ship rust-only via pyo3
# 73b5b6c
"""Comprehensive tests for the public DiffCompressor API.
Tests cover:
1. Context line reduction
2. Hunk selection and limiting
3. Compression ratios
4. Edge cases
5. Bug-fix regressions and routing-gap fixtures
Stage 3b note (2026-04-25): the Python `DiffCompressor` implementation
was retired in favor of the Rust-backed shim (`headroom._core` via PyO3).
Tests that probed Python-only internals — `_parse_diff`, `_score_hunks`,
the `DiffHunk` / `DiffFile` parser dataclasses — were removed because
the Rust crate has its own parallel coverage in
`crates/headroom-core/tests`. Public-API tests (anything calling
`compressor.compress(...)`) are preserved unchanged: they exercise the
Rust backend through the same import path and assert the same outputs.
"""
from headroom.transforms.diff_compressor import (
DiffCompressionResult,
DiffCompressor,
DiffCompressorConfig,
)
class TestContextReduction:
    """Tests for context line reduction."""

    def test_reduce_context_lines(self):
        """Context lines are reduced to the configured maximum."""
        # The fixture is spelled out line by line so the one-character
        # markers stay visible: in unified diff format a context line starts
        # with a space. Without that marker the startswith(" ") count below
        # is always zero and the assertion could never fail.
        content = (
            "diff --git a/file.py b/file.py\n"
            "--- a/file.py\n"
            "+++ b/file.py\n"
            "@@ -1,10 +1,11 @@\n"
            " context1\n"
            " context2\n"
            " context3\n"
            " context4\n"
            "+added\n"
            " context5\n"
            " context6\n"
            " context7\n"
            " context8\n"
        )
        # max_context_lines is set explicitly to 2 for this test.
        compressor = DiffCompressor(
            config=DiffCompressorConfig(
                max_context_lines=2,
                min_lines_for_ccr=5,
                enable_ccr=False,
            )
        )

        result = compressor.compress(content)

        lines = result.compressed.split("\n")
        context_count = sum(1 for line in lines if line.startswith(" "))
        # At most 4 context lines: 2 before and 2 after the +added line.
        assert context_count <= 4
        # Guard against a vacuous pass on an output that lost the change.
        assert "+added" in result.compressed

    def test_preserve_all_changes(self):
        """All addition and deletion lines are preserved."""
        content = (
            "diff --git a/file.py b/file.py\n"
            "--- a/file.py\n"
            "+++ b/file.py\n"
            "@@ -1,10 +1,10 @@\n"
            " ctx1\n"
            " ctx2\n"
            "-removed1\n"
            "+added1\n"
            " ctx3\n"
            " ctx4\n"
            "-removed2\n"
            "+added2\n"
            " ctx5\n"
            " ctx6\n"
        )
        compressor = DiffCompressor(
            config=DiffCompressorConfig(
                min_lines_for_ccr=5,
                enable_ccr=False,
            )
        )

        result = compressor.compress(content)

        assert "-removed1" in result.compressed
        assert "-removed2" in result.compressed
        assert "+added1" in result.compressed
        assert "+added2" in result.compressed
class TestHunkSelection:
    """Tests for hunk selection when limiting."""

    @staticmethod
    def _build_diff(path, hunk_count):
        """Return a single-file diff for ``path`` containing ``hunk_count`` hunks.

        Hunk ``i`` adds the line ``added_{i}`` between two context lines.
        Context lines carry the leading-space marker required by the
        unified diff format.
        """
        hunks = [
            f"@@ -{i * 10},3 +{i * 10},4 @@\n context\n+added_{i}\n more\n"
            for i in range(hunk_count)
        ]
        header = f"diff --git a/{path} b/{path}\n--- a/{path}\n+++ b/{path}\n"
        return header + "".join(hunks)

    def test_max_hunks_per_file(self):
        """Hunks are limited to max_hunks_per_file."""
        max_hunks = 5
        content = self._build_diff("bigfile.py", 20)
        compressor = DiffCompressor(
            config=DiffCompressorConfig(
                max_hunks_per_file=max_hunks,
                min_lines_for_ccr=10,
                enable_ccr=False,
            )
        )

        result = compressor.compress(content)

        # Count hunk header lines directly instead of counting "@@"
        # substrings and doubling the bound: one header line per kept hunk.
        header_lines = [
            line for line in result.compressed.split("\n") if line.startswith("@@")
        ]
        assert len(header_lines) <= max_hunks

    def test_keeps_first_and_last_hunk(self):
        """First and last hunks are preserved when limiting."""
        content = self._build_diff("file.py", 10)
        compressor = DiffCompressor(
            config=DiffCompressorConfig(
                max_hunks_per_file=3,
                min_lines_for_ccr=10,
                enable_ccr=False,
            )
        )

        result = compressor.compress(content)

        # First hunk (added_0) should be present.
        assert "+added_0" in result.compressed
        # Last hunk (added_9) should be present.
        assert "+added_9" in result.compressed
class TestFileSelection:
    """Tests for file selection when limiting."""

    def test_max_files(self):
        """Files are limited to max_files."""
        # 30 single-hunk file sections. Context lines carry the leading
        # space that the unified diff format requires.
        files = [
            f"diff --git a/file{i}.py b/file{i}.py\n"
            f"--- a/file{i}.py\n"
            f"+++ b/file{i}.py\n"
            "@@ -1,2 +1,3 @@\n"
            " ctx\n"
            "+added\n"
            " ctx2\n"
            for i in range(30)
        ]
        # Each section already ends in a newline, so joining with "\n"
        # leaves one blank line between consecutive file sections.
        content = "\n".join(files)
        compressor = DiffCompressor(
            config=DiffCompressorConfig(
                max_files=10,
                min_lines_for_ccr=20,
                enable_ccr=False,
            )
        )

        result = compressor.compress(content)

        # One "diff --git" header per file section kept in the output.
        file_count = result.compressed.count("diff --git")
        assert file_count <= 10
class TestCompressionResult:
    """Checks the derived properties exposed by DiffCompressionResult."""

    def test_compression_ratio_calculation(self):
        """compression_ratio is 0.1 for 10 compressed lines out of 100."""
        fields = {
            "compressed": "a\nb\nc",
            "original_line_count": 100,
            "compressed_line_count": 10,
            "files_affected": 2,
            "additions": 5,
            "deletions": 3,
            "hunks_kept": 2,
            "hunks_removed": 5,
        }
        outcome = DiffCompressionResult(**fields)

        assert outcome.compression_ratio == 0.1

    def test_tokens_saved_estimate(self):
        """tokens_saved_estimate is 900 when 90 of 100 lines are saved."""
        fields = {
            "compressed": "short",
            "original_line_count": 100,
            "compressed_line_count": 10,
            "files_affected": 1,
            "additions": 10,
            "deletions": 5,
            "hunks_kept": 1,
            "hunks_removed": 0,
        }
        outcome = DiffCompressionResult(**fields)

        # Expected arithmetic: 90 lines saved * 40 chars/line / 4 chars/token.
        assert outcome.tokens_saved_estimate == 900
class TestSmallDiffPassthrough:
    """Tests for small diff passthrough behavior."""

    def test_small_diff_unchanged(self):
        """Diffs smaller than the threshold pass through unchanged."""
        # Context lines carry the leading-space marker so the fixture is a
        # well-formed unified diff, matching the other fixtures in this file.
        content = (
            "diff --git a/small.py b/small.py\n"
            "--- a/small.py\n"
            "+++ b/small.py\n"
            "@@ -1,2 +1,3 @@\n"
            " line1\n"
            "+added\n"
            " line2\n"
        )
        compressor = DiffCompressor(
            config=DiffCompressorConfig(
                min_lines_for_ccr=100,  # High threshold: input is far below it.
            )
        )

        result = compressor.compress(content)

        # The output must be the input, byte for byte.
        assert result.compressed == content
        assert result.compression_ratio == 1.0
class TestOutputFormatting:
    """Tests for output formatting."""

    def test_summary_line_added(self):
        """A summary is added to a diff that had hunks omitted."""
        # 15 hunks against a limit of 5, so some hunks must be omitted.
        # Context lines carry the unified-diff leading-space marker.
        hunks = [
            f"@@ -{i * 10},5 +{i * 10},6 @@\n"
            " ctx1\n"
            " ctx2\n"
            f"+added_{i}\n"
            " ctx3\n"
            " ctx4\n"
            for i in range(15)
        ]
        content = (
            "diff --git a/file.py b/file.py\n"
            "--- a/file.py\n"
            "+++ b/file.py\n"
        ) + "".join(hunks)
        compressor = DiffCompressor(
            config=DiffCompressorConfig(
                max_hunks_per_file=5,
                min_lines_for_ccr=10,
                enable_ccr=False,
            )
        )

        result = compressor.compress(content)

        # The summary text mentions both counts.
        assert "files changed" in result.compressed
        assert "hunks omitted" in result.compressed

    def test_preserves_diff_format(self):
        """Output keeps the standard unified diff markers."""
        content = (
            "diff --git a/test.py b/test.py\n"
            "--- a/test.py\n"
            "+++ b/test.py\n"
            "@@ -1,3 +1,4 @@\n"
            " def test():\n"
            "+ # new comment\n"
            " pass\n"
            " return True\n"
        )
        compressor = DiffCompressor(
            config=DiffCompressorConfig(
                min_lines_for_ccr=5,
                enable_ccr=False,
            )
        )

        result = compressor.compress(content)

        # All standard diff markers must still be present.
        assert "diff --git" in result.compressed
        assert "---" in result.compressed
        assert "+++" in result.compressed
        assert "@@" in result.compressed
class TestEdgeCases:
    """Tests for edge cases and boundary conditions."""

    def test_empty_input(self):
        """Empty input is handled gracefully."""
        compressor = DiffCompressor()

        result = compressor.compress("")

        assert result.compressed == ""
        assert result.compression_ratio == 1.0

    def test_non_diff_input(self):
        """Non-diff input passes through unchanged."""
        content = "This is not a diff\nJust regular text"
        compressor = DiffCompressor()

        result = compressor.compress(content)

        # No diff header in the input, so it must come back as-is.
        assert result.compressed == content

    def test_unicode_content(self):
        """Unicode characters in a diff are handled."""
        # Context lines carry the unified-diff leading-space marker.
        content = (
            "diff --git a/i18n.py b/i18n.py\n"
            "--- a/i18n.py\n"
            "+++ b/i18n.py\n"
            "@@ -1,2 +1,3 @@\n"
            ' msg = "hello"\n'
            '+msg_ja = "こんにちは"\n'
            " return msg\n"
        )
        compressor = DiffCompressor()

        result = compressor.compress(content)

        assert "こんにちは" in result.compressed

    def test_no_newline_at_eof(self):
        """Handles the 'No newline at end of file' indicator."""
        content = (
            "diff --git a/file.py b/file.py\n"
            "--- a/file.py\n"
            "+++ b/file.py\n"
            "@@ -1,2 +1,2 @@\n"
            " line1\n"
            "-line2\n"
            "\\ No newline at end of file\n"
            "+line2_modified\n"
            "\\ No newline at end of file\n"
        )
        compressor = DiffCompressor()

        result = compressor.compress(content)

        # Must not crash, and must keep the indicator or the deletion.
        assert "No newline" in result.compressed or "-line2" in result.compressed

    def test_empty_hunks(self):
        """Files with no actual hunks are handled."""
        content = (
            "diff --git a/file.py b/file.py\n"
            "--- a/file.py\n"
            "+++ b/file.py\n"
        )
        compressor = DiffCompressor()

        result = compressor.compress(content)

        # Only checks that compression completes and yields a value.
        assert result.compressed is not None
class TestConfigOptions:
    """Tests for configuration options."""

    def test_max_context_lines_config(self):
        """max_context_lines controls how much context is kept."""
        # Context lines carry the unified-diff leading-space marker; without
        # it the startswith(" ") count below is always zero and the
        # assertion could never fail.
        content = (
            "diff --git a/file.py b/file.py\n"
            "--- a/file.py\n"
            "+++ b/file.py\n"
            "@@ -1,10 +1,11 @@\n"
            " c1\n"
            " c2\n"
            " c3\n"
            " c4\n"
            " c5\n"
            "+added\n"
            " c6\n"
            " c7\n"
            " c8\n"
            " c9\n"
            " c10\n"
        )
        compressor = DiffCompressor(
            config=DiffCompressorConfig(
                max_context_lines=1,
                min_lines_for_ccr=5,
                enable_ccr=False,
            )
        )

        result = compressor.compress(content)

        # Context lines are the ones that start with a space.
        context_count = sum(
            1 for line in result.compressed.split("\n") if line.startswith(" ")
        )
        # At most 2 context lines: 1 before and 1 after the change.
        assert context_count <= 2
        # Guard against a vacuous pass on an output that lost the change.
        assert "+added" in result.compressed

    def test_always_keep_additions_default(self):
        """Additions are kept when always_keep_additions is enabled."""
        content = (
            "diff --git a/file.py b/file.py\n"
            "--- a/file.py\n"
            "+++ b/file.py\n"
            "@@ -1,3 +1,5 @@\n"
            " ctx\n"
            "+add1\n"
            "+add2\n"
            " ctx\n"
        )
        # NOTE(review): the test name says "default", but the option is
        # passed explicitly here; the default itself is not exercised.
        compressor = DiffCompressor(
            config=DiffCompressorConfig(
                always_keep_additions=True,
                min_lines_for_ccr=2,
                enable_ccr=False,
            )
        )

        result = compressor.compress(content)

        assert "+add1" in result.compressed
        assert "+add2" in result.compressed

    def test_always_keep_deletions_default(self):
        """Deletions are kept when always_keep_deletions is enabled."""
        content = (
            "diff --git a/file.py b/file.py\n"
            "--- a/file.py\n"
            "+++ b/file.py\n"
            "@@ -1,5 +1,3 @@\n"
            " ctx\n"
            "-del1\n"
            "-del2\n"
            " ctx\n"
        )
        # NOTE(review): as above, the option is passed explicitly rather
        # than left at its default.
        compressor = DiffCompressor(
            config=DiffCompressorConfig(
                always_keep_deletions=True,
                min_lines_for_ccr=2,
                enable_ccr=False,
            )
        )

        result = compressor.compress(content)

        assert "-del1" in result.compressed
        assert "-del2" in result.compressed
# ─── Bug-fix tests (2026-04-25): four silent information-loss paths ─────────
#
# Before the fix, the parser captured these patterns but the emitter dropped
# them, or the regex didn't match them at all. Each test exercises one of
# the four paths the same way the Rust unit tests do.
def _cfg_below_threshold():
    """Build a DiffCompressorConfig with ``min_lines_for_ccr`` lowered to 5.

    The bug-fix fixtures are only a handful of lines long; the low threshold
    is intended to let them go through the parser and emitter.
    """
    from headroom.transforms.diff_compressor import DiffCompressorConfig

    small_threshold_config = DiffCompressorConfig(min_lines_for_ccr=5)
    return small_threshold_config
class TestBugfixRenamePreservation:
    """Regression tests for rename/similarity/dissimilarity/copy header lines.

    These markers used to be captured into is_renamed=True and then dropped
    by the emitter, so the output looked like a plain modification of the
    old path.
    """

    def test_rename_with_similarity_index_preserved(self):
        """Similarity index and both rename lines appear in the output."""
        from headroom.transforms.diff_compressor import DiffCompressor

        patch_lines = [
            "diff --git a/old.py b/new.py",
            "similarity index 92%",
            "rename from old.py",
            "rename to new.py",
            "--- a/old.py",
            "+++ b/new.py",
            "@@ -1,3 +1,3 @@",
            " ctx_a",
            "-old",
            "+new",
            " ctx_b",
        ]
        patch = "\n".join(patch_lines) + "\n"

        compressor = DiffCompressor(_cfg_below_threshold())
        outcome = compressor.compress(patch)

        for marker in (
            "similarity index 92%",
            "rename from old.py",
            "rename to new.py",
        ):
            assert marker in outcome.compressed

    def test_dissimilarity_index_preserved(self):
        """The dissimilarity index line appears in the output."""
        from headroom.transforms.diff_compressor import DiffCompressor

        patch_lines = [
            "diff --git a/x.py b/y.py",
            "dissimilarity index 60%",
            "rename from x.py",
            "rename to y.py",
            "--- a/x.py",
            "+++ b/y.py",
            "@@ -1 +1 @@",
            "-a",
            "+b",
        ]
        patch = "\n".join(patch_lines) + "\n"

        compressor = DiffCompressor(_cfg_below_threshold())
        outcome = compressor.compress(patch)

        assert "dissimilarity index 60%" in outcome.compressed

    def test_copy_markers_preserved(self):
        """Both copy-from and copy-to lines appear in the output."""
        from headroom.transforms.diff_compressor import DiffCompressor

        patch_lines = [
            "diff --git a/orig.py b/dup.py",
            "similarity index 100%",
            "copy from orig.py",
            "copy to dup.py",
            "--- a/orig.py",
            "+++ b/dup.py",
            "@@ -1 +1 @@",
            "-old",
            "+new",
        ]
        patch = "\n".join(patch_lines) + "\n"

        compressor = DiffCompressor(_cfg_below_threshold())
        outcome = compressor.compress(patch)

        for marker in ("copy from orig.py", "copy to dup.py"):
            assert marker in outcome.compressed
class TestBugfixCombinedDiff:
    """Regression test for combined-diff `@@@` hunks from merge commits.

    All of their content used to be silently dropped because the regex
    hardcoded `@@`.
    """

    def test_3way_combined_diff_content_preserved(self):
        """The `@@@` header and a `++` line survive compression."""
        from headroom.transforms.diff_compressor import DiffCompressor

        patch_lines = [
            "diff --git a/merge.py b/merge.py",
            "--- a/merge.py",
            "+++ b/merge.py",
            "@@@ -1,3 -1,3 +1,4 @@@",
            " unchanged_a",
            "- old_branch_1",
            " -old_branch_2",
            "++new_in_merge",
            " +new_added",
            " unchanged_b",
        ]
        patch = "\n".join(patch_lines) + "\n"

        compressor = DiffCompressor(_cfg_below_threshold())
        outcome = compressor.compress(patch)

        assert "@@@ -1,3 -1,3 +1,4 @@@" in outcome.compressed
        assert "++new_in_merge" in outcome.compressed
        assert outcome.files_affected > 0
class TestBugfixNoNewlineMarker:
    r"""Regression test for the `\ No newline at end of file` marker.

    The marker used to be dropped by context trimming whenever it was
    further than max_context_lines from a +/- change.
    """

    def test_no_newline_marker_survives_distance(self):
        """The marker is kept even with six context lines after the change."""
        from headroom.transforms.diff_compressor import DiffCompressor

        marker = "\\ No newline at end of file"
        patch_lines = [
            "diff --git a/last.txt b/last.txt",
            "--- a/last.txt",
            "+++ b/last.txt",
            "@@ -1,8 +1,8 @@",
            "-old_first",
            "+new_first",
            " ctx_a",
            " ctx_b",
            " ctx_c",
            " ctx_d",
            " ctx_e",
            " ctx_f",
            marker,
        ]
        patch = "\n".join(patch_lines) + "\n"

        compressor = DiffCompressor(_cfg_below_threshold())
        outcome = compressor.compress(patch)

        assert marker in outcome.compressed
class TestBugfixPreDiffContent:
    """Regression tests for content that precedes the first `diff --git`.

    Commit headers and email-style metadata used to be silently dropped.
    """

    def test_commit_header_preserved(self):
        """Commit metadata before the diff is kept, followed by the diff."""
        from headroom.transforms.diff_compressor import DiffCompressor

        patch_lines = [
            "commit abc1234567890abcdef",
            "Author: Tester <t@example.com>",
            "Date: Mon Apr 25 12:00:00 2026",
            "",
            " Refactor: rename and modify",
            "",
            "diff --git a/x.py b/x.py",
            "--- a/x.py",
            "+++ b/x.py",
            "@@ -1 +1 @@",
            "-a",
            "+b",
        ]
        patch = "\n".join(patch_lines) + "\n"

        compressor = DiffCompressor(_cfg_below_threshold())
        outcome = compressor.compress(patch)

        # The commit line must come first in the output.
        assert outcome.compressed.startswith("commit abc1234567890abcdef")
        for fragment in (
            "Author: Tester",
            "Refactor: rename and modify",
            "diff --git a/x.py b/x.py",
            "-a",
            "+b",
        ):
            assert fragment in outcome.compressed

    def test_no_pre_diff_content_does_not_add_blank_line(self):
        """With no pre-diff content the output must not gain a leading
        blank line from a stray empty-list prepend."""
        from headroom.transforms.diff_compressor import DiffCompressor

        patch = "diff --git a/x.py b/x.py\n--- a/x.py\n+++ b/x.py\n@@ -1 +1 @@\n-a\n+b\n"

        compressor = DiffCompressor(_cfg_below_threshold())
        outcome = compressor.compress(patch)

        assert outcome.compressed.startswith("diff --git a/x.py b/x.py")
class TestRoutingGapMergeDiffs:
    """Routing-gap tests (2026-04-25 follow-up) for merge-commit headers.

    ContentRouter detects diff inputs and routes them to DiffCompressor,
    but the parser previously only knew the `diff --git` shape. Merge-commit
    diffs from `git log -p` use `diff --combined <path>` or `diff --cc <path>`
    and were treated as non-diff blobs and passed through unchanged.
    """

    def test_diff_combined_header_starts_a_file_section(self):
        """A `diff --combined` header is counted as one affected file."""
        from headroom.transforms.diff_compressor import DiffCompressor

        patch_lines = [
            "diff --combined merge_target.py",
            "index abc..def..ghi 100644",
            "--- a/merge_target.py",
            "+++ b/merge_target.py",
            "@@@ -1,3 -1,3 +1,4 @@@",
            " unchanged_a",
            "- old_p1",
            " -old_p2",
            "++new_in_merge",
            " unchanged_b",
        ]
        patch = "\n".join(patch_lines) + "\n"

        compressor = DiffCompressor(_cfg_below_threshold())
        outcome = compressor.compress(patch)

        assert outcome.files_affected == 1
        for fragment in (
            "diff --combined merge_target.py",
            "@@@ -1,3 -1,3 +1,4 @@@",
            "++new_in_merge",
        ):
            assert fragment in outcome.compressed

    def test_diff_cc_header_starts_a_file_section(self):
        """A `diff --cc` header is counted as one affected file."""
        from headroom.transforms.diff_compressor import DiffCompressor

        patch_lines = [
            "diff --cc cc_target.py",
            "index abc..def..ghi",
            "--- a/cc_target.py",
            "+++ b/cc_target.py",
            "@@@ -1,3 -1,3 +1,4 @@@",
            " ctx",
            "- removed_p1",
            " -removed_p2",
            "++added_in_merge",
            " more_ctx",
        ]
        patch = "\n".join(patch_lines) + "\n"

        compressor = DiffCompressor(_cfg_below_threshold())
        outcome = compressor.compress(patch)

        assert outcome.files_affected == 1
        for fragment in ("diff --cc cc_target.py", "++added_in_merge"):
            assert fragment in outcome.compressed
class TestRoutingGapDetectorScanWindow:
    """Routing-gap tests (2026-04-25 follow-up) for the detector scan window.

    `_try_detect_diff` only scanned the first 50 lines, so `git log -p`
    outputs with long commit messages pushed the diff past the detection
    window and the input was misrouted away from DiffCompressor entirely.
    The window was widened to 500 lines.
    """

    def test_detect_picks_up_diff_after_long_commit_message(self):
        """A diff that starts after 60+ message lines is still detected."""
        from headroom.transforms.content_detector import (
            ContentType,
            detect_content_type,
        )

        # Four header lines plus 60 message lines come before the diff,
        # which puts the `diff --git` header past the old 50-line cap.
        commit_header = [
            "commit abc123",
            "Author: Tester <t@example.com>",
            "Date: Mon Apr 25 12:00:00 2026",
            "",
        ]
        message_body = [f" msg line {i}" for i in range(60)]
        preamble = "\n".join(commit_header + message_body)
        patch = "diff --git a/x.py b/x.py\n--- a/x.py\n+++ b/x.py\n@@ -1 +1 @@\n-old\n+new\n"

        detection = detect_content_type(preamble + "\n\n" + patch)

        assert detection.content_type == ContentType.GIT_DIFF
        assert detection.confidence >= 0.7

    def test_detect_recognizes_combined_diff_headers(self):
        """The detector also recognizes combined-diff hunk headers
        (`@@@`+), useful when the only signal in a snippet is the
        merge-style hunk."""
        from headroom.transforms.content_detector import (
            ContentType,
            detect_content_type,
        )

        # A full merge diff; the `--- a/` line it shares with regular diffs
        # acts as a second, belt-and-suspenders signal.
        patch = "diff --combined m.py\n--- a/m.py\n+++ b/m.py\n@@@ -1,2 -1,2 +1,3 @@@\n ctx\n++added\n"

        detection = detect_content_type(patch)

        assert detection.content_type == ContentType.GIT_DIFF