""" Fuzz tests for `app.solve_proof`. These tests exercise a fixed corpus of edge-case inputs against `app.solve_proof` and assert: * every call returns a (status_html, final_code, logs) 3-tuple, * no call crashes (uncaught exceptions), * the right status messages are produced for each scenario, * the temp file `solve_proof` creates is always cleaned up, *even* when the underlying agent raises. We mock `langgraph_agent.LangGraphAgent` BEFORE importing `app` so no Lean, LLM, or FAISS infrastructure is needed. """ import os import sys import tempfile import threading import unittest from unittest import mock # Make `app.py` and `src/` importable. sys.path.insert(0, ".") sys.path.insert(0, "src") # ----- Mock LangGraphAgent BEFORE importing app ----------------------------- _fake_lg_module = mock.MagicMock() class _DummyAgent: """Stand-in for LangGraphAgent. Configured per-test via class attributes.""" # Per-test knobs. # If set, __init__ records (model_name, max_retries) here. init_log: list = [] # If set, solve_file_detailed raises this instance instead of returning. raise_exc: BaseException | None = None # If set, this dict is returned from solve_file_detailed. result: dict = { "success": True, "solved_at_attempt": 1, "total_attempts": 1, } # Record temp file path the agent was passed, plus the bytes on disk. last_path: str | None = None last_bytes: bytes | None = None def __init__(self, model_name="x", max_retries=5, **kwargs): self.model_name = model_name self.max_retries = max_retries _DummyAgent.init_log.append((model_name, max_retries)) def solve_file_detailed(self, file_path: str) -> dict: _DummyAgent.last_path = file_path try: with open(file_path, "rb") as f: _DummyAgent.last_bytes = f.read() except OSError: _DummyAgent.last_bytes = None if _DummyAgent.raise_exc is not None: raise _DummyAgent.raise_exc return _DummyAgent.result _fake_lg_module.LangGraphAgent = _DummyAgent # Default to having a key set; individual tests override this. os.environ.setdefault("GROQ_API_KEY", "test-key") # Mock langgraph_agent only for the duration of importing `app`, then restore # sys.modules so this file doesn't poison the import for other test files. # app.py captures its own `LangGraphAgent` reference at import time; tests # patch `app.LangGraphAgent` directly afterward. _orig_langgraph_agent = sys.modules.get("langgraph_agent") sys.modules["langgraph_agent"] = _fake_lg_module import app # noqa: E402 (must come after the mock above) if _orig_langgraph_agent is not None: sys.modules["langgraph_agent"] = _orig_langgraph_agent else: del sys.modules["langgraph_agent"] app.LangGraphAgent = _DummyAgent # keep app bound to the dummy after restore # Sentinels for the fuzz corpus ----------------------------------------------- EMPTY = "" WHITESPACE = " \n\t " HUGE = "a" * 100_000 UNICODE = "theorem foo : ∀ n : ℕ, ∃ m : ℕ, n + m = m + n := by sorry" NO_SORRY = "theorem foo : True := trivial" def _wrap_namedtemp(captured: list[str], lock: threading.Lock): """Return a side_effect for `tempfile.NamedTemporaryFile` that records the name of every actual tempfile created so callers can assert cleanup.""" real_ntf = tempfile.NamedTemporaryFile def factory(*args, **kwargs): f = real_ntf(*args, **kwargs) with lock: captured.append(f.name) return f return factory class SolveProofFuzzTests(unittest.TestCase): """Each scenario from the fuzz corpus.""" def setUp(self): # Reset the dummy agent's class-level knobs. _DummyAgent.init_log = [] _DummyAgent.raise_exc = None _DummyAgent.result = { "success": True, "solved_at_attempt": 1, "total_attempts": 1, } _DummyAgent.last_path = None _DummyAgent.last_bytes = None # Make sure GROQ_API_KEY is set unless a test clears it. os.environ["GROQ_API_KEY"] = "test-key" self._captured_paths: list[str] = [] self._capture_lock = threading.Lock() self._ntf_patch = mock.patch.object( app.tempfile, "NamedTemporaryFile", side_effect=_wrap_namedtemp(self._captured_paths, self._capture_lock), ) self._ntf_patch.start() # Don't touch the real Lean REPL / FAISS retriever just because we're # exercising app.solve_proof in tests — return harmless mocks. self._components_patch = mock.patch.object( app, "_get_components", return_value=(mock.MagicMock(), mock.MagicMock()) ) self._components_patch.start() def tearDown(self): self._components_patch.stop() self._ntf_patch.stop() # Best-effort: clean up anything the test leaked so /tmp doesn't fill. for path in self._captured_paths: if os.path.exists(path): try: os.unlink(path) except OSError: pass # ------------------------------------------------------------------ # Helpers # ------------------------------------------------------------------ def _assert_triple(self, ret): self.assertIsInstance(ret, tuple, f"return is not tuple: {ret!r}") self.assertEqual(len(ret), 3, f"return is not length-3 tuple: {ret!r}") status, code, logs = ret self.assertIsInstance(status, str) self.assertIsInstance(code, str) self.assertIsInstance(logs, str) return status, code, logs def _assert_temp_cleaned(self): """Every tempfile we observed must be gone now.""" for path in self._captured_paths: self.assertFalse( os.path.exists(path), f"tempfile leaked: {path} still exists", ) # ------------------------------------------------------------------ # Empty string -> "No input" # ------------------------------------------------------------------ def test_empty_string_returns_no_input(self): ret = app.solve_proof(EMPTY, "llama-3.3-70b-versatile", 1) status, code, logs = self._assert_triple(ret) self.assertIn("No input", status) # No tempfile should have been created at all for empty input. self.assertEqual(self._captured_paths, []) self.assertEqual(code, "") self.assertEqual(logs, "") # ------------------------------------------------------------------ # Whitespace-only -> "No input" # ------------------------------------------------------------------ def test_whitespace_only_returns_no_input(self): ret = app.solve_proof(WHITESPACE, "llama-3.3-70b-versatile", 1) status, code, logs = self._assert_triple(ret) self.assertIn("No input", status) self.assertEqual(self._captured_paths, []) # ------------------------------------------------------------------ # 100KB string must not OOM/crash. # ------------------------------------------------------------------ def test_huge_input_does_not_crash(self): ret = app.solve_proof(HUGE, "llama-3.3-70b-versatile", 1) status, code, logs = self._assert_triple(ret) # Should pass through to the agent (which returns success). self.assertIn("Verified", status) # The agent must have seen the full 100k bytes. self.assertIsNotNone(_DummyAgent.last_bytes) self.assertEqual(len(_DummyAgent.last_bytes), 100_000) self._assert_temp_cleaned() # ------------------------------------------------------------------ # Unicode-heavy: must encode UTF-8 to temp file successfully. # ------------------------------------------------------------------ def test_unicode_heavy_encodes_utf8(self): ret = app.solve_proof(UNICODE, "llama-3.3-70b-versatile", 1) status, code, logs = self._assert_triple(ret) self.assertIn("Verified", status) # The bytes the agent saw must be the UTF-8 encoding of UNICODE. self.assertIsNotNone(_DummyAgent.last_bytes) self.assertEqual(_DummyAgent.last_bytes, UNICODE.encode("utf-8")) # The `code` returned (read back from disk) must round-trip. self.assertEqual(code, UNICODE) self._assert_temp_cleaned() # ------------------------------------------------------------------ # Unicode-heavy under simulated POSIX/ASCII locale. # # On systems where `locale.getpreferredencoding()` is ASCII (e.g. some # Docker containers, HuggingFace Spaces with PYTHONUTF8=0 + LC_ALL=C), # `open(path, "w")` without an explicit encoding will reject characters # like ∀, ℕ, ∃. The fix is to pass `encoding="utf-8"` explicitly when # writing the temp file and reading it back. # # We force ASCII mode by patching the builtins/tempfile callables that # `app.solve_proof` uses, defaulting `encoding="ascii"` whenever the # caller didn't specify one. If the implementation forgot to pass # encoding="utf-8", the unicode write/read will UnicodeEncode/DecodeError. # ------------------------------------------------------------------ def test_unicode_heavy_under_ascii_locale(self): real_open = open real_ntf = tempfile.NamedTemporaryFile def ascii_open(*args, **kwargs): # In text mode (no "b"), default to ASCII if no encoding given. mode = kwargs.get("mode") if mode is None and len(args) >= 2: mode = args[1] if mode is None: mode = "r" if "b" not in mode and "encoding" not in kwargs: kwargs["encoding"] = "ascii" return real_open(*args, **kwargs) def ascii_ntf(*args, **kwargs): mode = kwargs.get("mode", "w+b") if "b" not in mode and "encoding" not in kwargs: kwargs["encoding"] = "ascii" return real_ntf(*args, **kwargs) with mock.patch.object(app.tempfile, "NamedTemporaryFile", side_effect=ascii_ntf), \ mock.patch("builtins.open", side_effect=ascii_open): ret = app.solve_proof(UNICODE, "llama-3.3-70b-versatile", 1) status, code, logs = self._assert_triple(ret) # Must NOT crash with a UnicodeError leaking through as "Error:". self.assertNotIn( "UnicodeEncodeError", status, f"solve_proof leaked UnicodeEncodeError under ASCII locale: {status!r}", ) self.assertNotIn( "UnicodeDecodeError", status, f"solve_proof leaked UnicodeDecodeError under ASCII locale: {status!r}", ) # Must produce a Verified status: the agent didn't fail, the encoding # plumbing must transparently use utf-8. self.assertIn( "Verified", status, f"solve_proof returned non-Verified status under ASCII locale: {status!r}", ) self.assertEqual(code, UNICODE) self._assert_temp_cleaned() # ------------------------------------------------------------------ # GROQ_API_KEY unset -> "GROQ_API_KEY missing" # ------------------------------------------------------------------ def test_missing_groq_api_key(self): with mock.patch.dict(os.environ, {}, clear=True): ret = app.solve_proof(NO_SORRY, "llama-3.3-70b-versatile", 1) status, code, logs = self._assert_triple(ret) self.assertIn("GROQ_API_KEY missing", status) # No tempfile created -- function short-circuits. self.assertEqual(self._captured_paths, []) self.assertEqual(code, "") self.assertEqual(logs, "") # ------------------------------------------------------------------ # Agent raises -> "Error:" status; temp file still cleaned up. # ------------------------------------------------------------------ def test_agent_raises_runtime_error(self): _DummyAgent.raise_exc = RuntimeError("synthetic blow-up") ret = app.solve_proof(NO_SORRY, "llama-3.3-70b-versatile", 1) status, code, logs = self._assert_triple(ret) self.assertIn("Error", status) self.assertIn("synthetic blow-up", status) self.assertEqual(code, "") # Most important: temp file must be cleaned up even on exception. self.assertEqual(len(self._captured_paths), 1) self._assert_temp_cleaned() # ------------------------------------------------------------------ # Agent reports success -> "Verified" status. # ------------------------------------------------------------------ def test_agent_success(self): _DummyAgent.result = { "success": True, "solved_at_attempt": 2, "total_attempts": 3, } ret = app.solve_proof(NO_SORRY, "llama-3.3-70b-versatile", 3) status, code, logs = self._assert_triple(ret) self.assertIn("Verified", status) self.assertIn("2", status) self.assertIn("3", status) self._assert_temp_cleaned() # ------------------------------------------------------------------ # Agent reports failure -> "No proof found" status. # ------------------------------------------------------------------ def test_agent_failure(self): _DummyAgent.result = { "success": False, "solved_at_attempt": None, "total_attempts": 5, } ret = app.solve_proof(NO_SORRY, "llama-3.3-70b-versatile", 5) status, code, logs = self._assert_triple(ret) self.assertIn("No proof found", status) self.assertIn("5", status) self._assert_temp_cleaned() # ------------------------------------------------------------------ # Code without `sorry` should still pass through unchanged. # ------------------------------------------------------------------ def test_no_sorry_keyword_passes_through(self): ret = app.solve_proof(NO_SORRY, "llama-3.3-70b-versatile", 1) status, code, logs = self._assert_triple(ret) self.assertIn("Verified", status) self.assertEqual(code, NO_SORRY) # Bytes seen by agent must equal the input encoded as utf-8. self.assertEqual(_DummyAgent.last_bytes, NO_SORRY.encode("utf-8")) self._assert_temp_cleaned() # ------------------------------------------------------------------ # Sanity: tempfile cleanup verified for every scenario in one place. # ------------------------------------------------------------------ def test_temp_file_always_cleaned_up(self): """Run every non-short-circuit scenario and assert no tempfile leaks.""" scenarios = [ ("ok", NO_SORRY, None), ("huge", HUGE, None), ("unicode", UNICODE, None), ("raise", NO_SORRY, RuntimeError("boom")), ] for label, code_in, exc in scenarios: _DummyAgent.raise_exc = exc with self.subTest(label): app.solve_proof(code_in, "llama-3.3-70b-versatile", 1) _DummyAgent.raise_exc = None self._assert_temp_cleaned() if __name__ == "__main__": unittest.main()