Lean4-helper / tests /test_app_fuzz.py
p4r5kpftnp-cmd
Add CI/CD: GitHub Actions workflow + ruff config
6cf7e7c
Raw
History Blame Contribute Delete
15.4 kB
"""
Fuzz tests for `app.solve_proof`.
These tests exercise a fixed corpus of edge-case inputs against
`app.solve_proof` and assert:
* every call returns a (status_html, final_code, logs) 3-tuple,
* no call crashes (uncaught exceptions),
* the right status messages are produced for each scenario,
* the temp file `solve_proof` creates is always cleaned up,
*even* when the underlying agent raises.
We mock `langgraph_agent.LangGraphAgent` BEFORE importing `app` so no Lean,
LLM, or FAISS infrastructure is needed.
"""
import os
import sys
import tempfile
import threading
import unittest
from unittest import mock
# Make `app.py` and `src/` importable.
sys.path.insert(0, ".")
sys.path.insert(0, "src")
# ----- Mock LangGraphAgent BEFORE importing app -----------------------------
_fake_lg_module = mock.MagicMock()
class _DummyAgent:
"""Stand-in for LangGraphAgent. Configured per-test via class attributes."""
# Per-test knobs.
# If set, __init__ records (model_name, max_retries) here.
init_log: list = []
# If set, solve_file_detailed raises this instance instead of returning.
raise_exc: BaseException | None = None
# If set, this dict is returned from solve_file_detailed.
result: dict = {
"success": True,
"solved_at_attempt": 1,
"total_attempts": 1,
}
# Record temp file path the agent was passed, plus the bytes on disk.
last_path: str | None = None
last_bytes: bytes | None = None
def __init__(self, model_name="x", max_retries=5, **kwargs):
self.model_name = model_name
self.max_retries = max_retries
_DummyAgent.init_log.append((model_name, max_retries))
def solve_file_detailed(self, file_path: str) -> dict:
_DummyAgent.last_path = file_path
try:
with open(file_path, "rb") as f:
_DummyAgent.last_bytes = f.read()
except OSError:
_DummyAgent.last_bytes = None
if _DummyAgent.raise_exc is not None:
raise _DummyAgent.raise_exc
return _DummyAgent.result
_fake_lg_module.LangGraphAgent = _DummyAgent
# Default to having a key set; individual tests override this.
os.environ.setdefault("GROQ_API_KEY", "test-key")
# Mock langgraph_agent only for the duration of importing `app`, then restore
# sys.modules so this file doesn't poison the import for other test files.
# app.py captures its own `LangGraphAgent` reference at import time; tests
# patch `app.LangGraphAgent` directly afterward.
_orig_langgraph_agent = sys.modules.get("langgraph_agent")
sys.modules["langgraph_agent"] = _fake_lg_module
import app # noqa: E402 (must come after the mock above)
if _orig_langgraph_agent is not None:
sys.modules["langgraph_agent"] = _orig_langgraph_agent
else:
del sys.modules["langgraph_agent"]
app.LangGraphAgent = _DummyAgent # keep app bound to the dummy after restore
# Sentinels for the fuzz corpus -----------------------------------------------
EMPTY = ""
WHITESPACE = " \n\t "
HUGE = "a" * 100_000
UNICODE = "theorem foo : ∀ n : ℕ, ∃ m : ℕ, n + m = m + n := by sorry"
NO_SORRY = "theorem foo : True := trivial"
def _wrap_namedtemp(captured: list[str], lock: threading.Lock):
"""Return a side_effect for `tempfile.NamedTemporaryFile` that records the
name of every actual tempfile created so callers can assert cleanup."""
real_ntf = tempfile.NamedTemporaryFile
def factory(*args, **kwargs):
f = real_ntf(*args, **kwargs)
with lock:
captured.append(f.name)
return f
return factory
class SolveProofFuzzTests(unittest.TestCase):
"""Each scenario from the fuzz corpus."""
def setUp(self):
# Reset the dummy agent's class-level knobs.
_DummyAgent.init_log = []
_DummyAgent.raise_exc = None
_DummyAgent.result = {
"success": True,
"solved_at_attempt": 1,
"total_attempts": 1,
}
_DummyAgent.last_path = None
_DummyAgent.last_bytes = None
# Make sure GROQ_API_KEY is set unless a test clears it.
os.environ["GROQ_API_KEY"] = "test-key"
self._captured_paths: list[str] = []
self._capture_lock = threading.Lock()
self._ntf_patch = mock.patch.object(
app.tempfile,
"NamedTemporaryFile",
side_effect=_wrap_namedtemp(self._captured_paths, self._capture_lock),
)
self._ntf_patch.start()
# Don't touch the real Lean REPL / FAISS retriever just because we're
# exercising app.solve_proof in tests — return harmless mocks.
self._components_patch = mock.patch.object(
app, "_get_components", return_value=(mock.MagicMock(), mock.MagicMock())
)
self._components_patch.start()
def tearDown(self):
self._components_patch.stop()
self._ntf_patch.stop()
# Best-effort: clean up anything the test leaked so /tmp doesn't fill.
for path in self._captured_paths:
if os.path.exists(path):
try:
os.unlink(path)
except OSError:
pass
# ------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------
def _assert_triple(self, ret):
self.assertIsInstance(ret, tuple, f"return is not tuple: {ret!r}")
self.assertEqual(len(ret), 3, f"return is not length-3 tuple: {ret!r}")
status, code, logs = ret
self.assertIsInstance(status, str)
self.assertIsInstance(code, str)
self.assertIsInstance(logs, str)
return status, code, logs
def _assert_temp_cleaned(self):
"""Every tempfile we observed must be gone now."""
for path in self._captured_paths:
self.assertFalse(
os.path.exists(path),
f"tempfile leaked: {path} still exists",
)
# ------------------------------------------------------------------
# Empty string -> "No input"
# ------------------------------------------------------------------
def test_empty_string_returns_no_input(self):
ret = app.solve_proof(EMPTY, "llama-3.3-70b-versatile", 1)
status, code, logs = self._assert_triple(ret)
self.assertIn("No input", status)
# No tempfile should have been created at all for empty input.
self.assertEqual(self._captured_paths, [])
self.assertEqual(code, "")
self.assertEqual(logs, "")
# ------------------------------------------------------------------
# Whitespace-only -> "No input"
# ------------------------------------------------------------------
def test_whitespace_only_returns_no_input(self):
ret = app.solve_proof(WHITESPACE, "llama-3.3-70b-versatile", 1)
status, code, logs = self._assert_triple(ret)
self.assertIn("No input", status)
self.assertEqual(self._captured_paths, [])
# ------------------------------------------------------------------
# 100KB string must not OOM/crash.
# ------------------------------------------------------------------
def test_huge_input_does_not_crash(self):
ret = app.solve_proof(HUGE, "llama-3.3-70b-versatile", 1)
status, code, logs = self._assert_triple(ret)
# Should pass through to the agent (which returns success).
self.assertIn("Verified", status)
# The agent must have seen the full 100k bytes.
self.assertIsNotNone(_DummyAgent.last_bytes)
self.assertEqual(len(_DummyAgent.last_bytes), 100_000)
self._assert_temp_cleaned()
# ------------------------------------------------------------------
# Unicode-heavy: must encode UTF-8 to temp file successfully.
# ------------------------------------------------------------------
def test_unicode_heavy_encodes_utf8(self):
ret = app.solve_proof(UNICODE, "llama-3.3-70b-versatile", 1)
status, code, logs = self._assert_triple(ret)
self.assertIn("Verified", status)
# The bytes the agent saw must be the UTF-8 encoding of UNICODE.
self.assertIsNotNone(_DummyAgent.last_bytes)
self.assertEqual(_DummyAgent.last_bytes, UNICODE.encode("utf-8"))
# The `code` returned (read back from disk) must round-trip.
self.assertEqual(code, UNICODE)
self._assert_temp_cleaned()
# ------------------------------------------------------------------
# Unicode-heavy under simulated POSIX/ASCII locale.
#
# On systems where `locale.getpreferredencoding()` is ASCII (e.g. some
# Docker containers, HuggingFace Spaces with PYTHONUTF8=0 + LC_ALL=C),
# `open(path, "w")` without an explicit encoding will reject characters
# like ∀, ℕ, ∃. The fix is to pass `encoding="utf-8"` explicitly when
# writing the temp file and reading it back.
#
# We force ASCII mode by patching the builtins/tempfile callables that
# `app.solve_proof` uses, defaulting `encoding="ascii"` whenever the
# caller didn't specify one. If the implementation forgot to pass
# encoding="utf-8", the unicode write/read will UnicodeEncode/DecodeError.
# ------------------------------------------------------------------
def test_unicode_heavy_under_ascii_locale(self):
real_open = open
real_ntf = tempfile.NamedTemporaryFile
def ascii_open(*args, **kwargs):
# In text mode (no "b"), default to ASCII if no encoding given.
mode = kwargs.get("mode")
if mode is None and len(args) >= 2:
mode = args[1]
if mode is None:
mode = "r"
if "b" not in mode and "encoding" not in kwargs:
kwargs["encoding"] = "ascii"
return real_open(*args, **kwargs)
def ascii_ntf(*args, **kwargs):
mode = kwargs.get("mode", "w+b")
if "b" not in mode and "encoding" not in kwargs:
kwargs["encoding"] = "ascii"
return real_ntf(*args, **kwargs)
with mock.patch.object(app.tempfile, "NamedTemporaryFile",
side_effect=ascii_ntf), \
mock.patch("builtins.open", side_effect=ascii_open):
ret = app.solve_proof(UNICODE, "llama-3.3-70b-versatile", 1)
status, code, logs = self._assert_triple(ret)
# Must NOT crash with a UnicodeError leaking through as "Error:".
self.assertNotIn(
"UnicodeEncodeError", status,
f"solve_proof leaked UnicodeEncodeError under ASCII locale: {status!r}",
)
self.assertNotIn(
"UnicodeDecodeError", status,
f"solve_proof leaked UnicodeDecodeError under ASCII locale: {status!r}",
)
# Must produce a Verified status: the agent didn't fail, the encoding
# plumbing must transparently use utf-8.
self.assertIn(
"Verified", status,
f"solve_proof returned non-Verified status under ASCII locale: {status!r}",
)
self.assertEqual(code, UNICODE)
self._assert_temp_cleaned()
# ------------------------------------------------------------------
# GROQ_API_KEY unset -> "GROQ_API_KEY missing"
# ------------------------------------------------------------------
def test_missing_groq_api_key(self):
with mock.patch.dict(os.environ, {}, clear=True):
ret = app.solve_proof(NO_SORRY, "llama-3.3-70b-versatile", 1)
status, code, logs = self._assert_triple(ret)
self.assertIn("GROQ_API_KEY missing", status)
# No tempfile created -- function short-circuits.
self.assertEqual(self._captured_paths, [])
self.assertEqual(code, "")
self.assertEqual(logs, "")
# ------------------------------------------------------------------
# Agent raises -> "Error:" status; temp file still cleaned up.
# ------------------------------------------------------------------
def test_agent_raises_runtime_error(self):
_DummyAgent.raise_exc = RuntimeError("synthetic blow-up")
ret = app.solve_proof(NO_SORRY, "llama-3.3-70b-versatile", 1)
status, code, logs = self._assert_triple(ret)
self.assertIn("Error", status)
self.assertIn("synthetic blow-up", status)
self.assertEqual(code, "")
# Most important: temp file must be cleaned up even on exception.
self.assertEqual(len(self._captured_paths), 1)
self._assert_temp_cleaned()
# ------------------------------------------------------------------
# Agent reports success -> "Verified" status.
# ------------------------------------------------------------------
def test_agent_success(self):
_DummyAgent.result = {
"success": True,
"solved_at_attempt": 2,
"total_attempts": 3,
}
ret = app.solve_proof(NO_SORRY, "llama-3.3-70b-versatile", 3)
status, code, logs = self._assert_triple(ret)
self.assertIn("Verified", status)
self.assertIn("2", status)
self.assertIn("3", status)
self._assert_temp_cleaned()
# ------------------------------------------------------------------
# Agent reports failure -> "No proof found" status.
# ------------------------------------------------------------------
def test_agent_failure(self):
_DummyAgent.result = {
"success": False,
"solved_at_attempt": None,
"total_attempts": 5,
}
ret = app.solve_proof(NO_SORRY, "llama-3.3-70b-versatile", 5)
status, code, logs = self._assert_triple(ret)
self.assertIn("No proof found", status)
self.assertIn("5", status)
self._assert_temp_cleaned()
# ------------------------------------------------------------------
# Code without `sorry` should still pass through unchanged.
# ------------------------------------------------------------------
def test_no_sorry_keyword_passes_through(self):
ret = app.solve_proof(NO_SORRY, "llama-3.3-70b-versatile", 1)
status, code, logs = self._assert_triple(ret)
self.assertIn("Verified", status)
self.assertEqual(code, NO_SORRY)
# Bytes seen by agent must equal the input encoded as utf-8.
self.assertEqual(_DummyAgent.last_bytes, NO_SORRY.encode("utf-8"))
self._assert_temp_cleaned()
# ------------------------------------------------------------------
# Sanity: tempfile cleanup verified for every scenario in one place.
# ------------------------------------------------------------------
def test_temp_file_always_cleaned_up(self):
"""Run every non-short-circuit scenario and assert no tempfile leaks."""
scenarios = [
("ok", NO_SORRY, None),
("huge", HUGE, None),
("unicode", UNICODE, None),
("raise", NO_SORRY, RuntimeError("boom")),
]
for label, code_in, exc in scenarios:
_DummyAgent.raise_exc = exc
with self.subTest(label):
app.solve_proof(code_in, "llama-3.3-70b-versatile", 1)
_DummyAgent.raise_exc = None
self._assert_temp_cleaned()
if __name__ == "__main__":
unittest.main()