Spaces:

Ray5th
/

Lean4-helper

Running

p4r5kpftnp-cmd

Add CI/CD: GitHub Actions workflow + ruff config

6cf7e7c about 1 month ago

15.4 kB

	"""
	Fuzz tests for `app.solve_proof`.

	These tests exercise a fixed corpus of edge-case inputs against
	`app.solve_proof` and assert:
	* every call returns a (status_html, final_code, logs) 3-tuple,
	* no call crashes (uncaught exceptions),
	* the right status messages are produced for each scenario,
	* the temp file `solve_proof` creates is always cleaned up,
	even when the underlying agent raises.

	We mock `langgraph_agent.LangGraphAgent` BEFORE importing `app` so no Lean,
	LLM, or FAISS infrastructure is needed.
	"""

	import os
	import sys
	import tempfile
	import threading
	import unittest
	from unittest import mock

	# Make `app.py` and `src/` importable.
	sys.path.insert(0, ".")
	sys.path.insert(0, "src")


	# ----- Mock LangGraphAgent BEFORE importing app -----------------------------
	_fake_lg_module = mock.MagicMock()


	class _DummyAgent:
	"""Stand-in for LangGraphAgent. Configured per-test via class attributes."""

	# Per-test knobs.
	# If set, __init__ records (model_name, max_retries) here.
	init_log: list = []
	# If set, solve_file_detailed raises this instance instead of returning.
	raise_exc: BaseException \| None = None
	# If set, this dict is returned from solve_file_detailed.
	result: dict = {
	"success": True,
	"solved_at_attempt": 1,
	"total_attempts": 1,
	}
	# Record temp file path the agent was passed, plus the bytes on disk.
	last_path: str \| None = None
	last_bytes: bytes \| None = None

	def __init__(self, model_name="x", max_retries=5, **kwargs):
	self.model_name = model_name
	self.max_retries = max_retries
	_DummyAgent.init_log.append((model_name, max_retries))

	def solve_file_detailed(self, file_path: str) -> dict:
	_DummyAgent.last_path = file_path
	try:
	with open(file_path, "rb") as f:
	_DummyAgent.last_bytes = f.read()
	except OSError:
	_DummyAgent.last_bytes = None

	if _DummyAgent.raise_exc is not None:
	raise _DummyAgent.raise_exc
	return _DummyAgent.result


	_fake_lg_module.LangGraphAgent = _DummyAgent

	# Default to having a key set; individual tests override this.
	os.environ.setdefault("GROQ_API_KEY", "test-key")

	# Mock langgraph_agent only for the duration of importing `app`, then restore
	# sys.modules so this file doesn't poison the import for other test files.
	# app.py captures its own `LangGraphAgent` reference at import time; tests
	# patch `app.LangGraphAgent` directly afterward.
	_orig_langgraph_agent = sys.modules.get("langgraph_agent")
	sys.modules["langgraph_agent"] = _fake_lg_module
	import app # noqa: E402 (must come after the mock above)

	if _orig_langgraph_agent is not None:
	sys.modules["langgraph_agent"] = _orig_langgraph_agent
	else:
	del sys.modules["langgraph_agent"]
	app.LangGraphAgent = _DummyAgent # keep app bound to the dummy after restore


	# Sentinels for the fuzz corpus -----------------------------------------------
	EMPTY = ""
	WHITESPACE = " \n\t "
	HUGE = "a" * 100_000
	UNICODE = "theorem foo : ∀ n : ℕ, ∃ m : ℕ, n + m = m + n := by sorry"
	NO_SORRY = "theorem foo : True := trivial"


	def _wrap_namedtemp(captured: list[str], lock: threading.Lock):
	"""Return a side_effect for `tempfile.NamedTemporaryFile` that records the
	name of every actual tempfile created so callers can assert cleanup."""
	real_ntf = tempfile.NamedTemporaryFile

	def factory(args, *kwargs):
	f = real_ntf(args, *kwargs)
	with lock:
	captured.append(f.name)
	return f

	return factory


	class SolveProofFuzzTests(unittest.TestCase):
	"""Each scenario from the fuzz corpus."""

	def setUp(self):
	# Reset the dummy agent's class-level knobs.
	_DummyAgent.init_log = []
	_DummyAgent.raise_exc = None
	_DummyAgent.result = {
	"success": True,
	"solved_at_attempt": 1,
	"total_attempts": 1,
	}
	_DummyAgent.last_path = None
	_DummyAgent.last_bytes = None

	# Make sure GROQ_API_KEY is set unless a test clears it.
	os.environ["GROQ_API_KEY"] = "test-key"

	self._captured_paths: list[str] = []
	self._capture_lock = threading.Lock()
	self._ntf_patch = mock.patch.object(
	app.tempfile,
	"NamedTemporaryFile",
	side_effect=_wrap_namedtemp(self._captured_paths, self._capture_lock),
	)
	self._ntf_patch.start()

	# Don't touch the real Lean REPL / FAISS retriever just because we're
	# exercising app.solve_proof in tests — return harmless mocks.
	self._components_patch = mock.patch.object(
	app, "_get_components", return_value=(mock.MagicMock(), mock.MagicMock())
	)
	self._components_patch.start()

	def tearDown(self):
	self._components_patch.stop()
	self._ntf_patch.stop()
	# Best-effort: clean up anything the test leaked so /tmp doesn't fill.
	for path in self._captured_paths:
	if os.path.exists(path):
	try:
	os.unlink(path)
	except OSError:
	pass

	# ------------------------------------------------------------------
	# Helpers
	# ------------------------------------------------------------------
	def _assert_triple(self, ret):
	self.assertIsInstance(ret, tuple, f"return is not tuple: {ret!r}")
	self.assertEqual(len(ret), 3, f"return is not length-3 tuple: {ret!r}")
	status, code, logs = ret
	self.assertIsInstance(status, str)
	self.assertIsInstance(code, str)
	self.assertIsInstance(logs, str)
	return status, code, logs

	def _assert_temp_cleaned(self):
	"""Every tempfile we observed must be gone now."""
	for path in self._captured_paths:
	self.assertFalse(
	os.path.exists(path),
	f"tempfile leaked: {path} still exists",
	)

	# ------------------------------------------------------------------
	# Empty string -> "No input"
	# ------------------------------------------------------------------
	def test_empty_string_returns_no_input(self):
	ret = app.solve_proof(EMPTY, "llama-3.3-70b-versatile", 1)
	status, code, logs = self._assert_triple(ret)
	self.assertIn("No input", status)
	# No tempfile should have been created at all for empty input.
	self.assertEqual(self._captured_paths, [])
	self.assertEqual(code, "")
	self.assertEqual(logs, "")

	# ------------------------------------------------------------------
	# Whitespace-only -> "No input"
	# ------------------------------------------------------------------
	def test_whitespace_only_returns_no_input(self):
	ret = app.solve_proof(WHITESPACE, "llama-3.3-70b-versatile", 1)
	status, code, logs = self._assert_triple(ret)
	self.assertIn("No input", status)
	self.assertEqual(self._captured_paths, [])

	# ------------------------------------------------------------------
	# 100KB string must not OOM/crash.
	# ------------------------------------------------------------------
	def test_huge_input_does_not_crash(self):
	ret = app.solve_proof(HUGE, "llama-3.3-70b-versatile", 1)
	status, code, logs = self._assert_triple(ret)
	# Should pass through to the agent (which returns success).
	self.assertIn("Verified", status)
	# The agent must have seen the full 100k bytes.
	self.assertIsNotNone(_DummyAgent.last_bytes)
	self.assertEqual(len(_DummyAgent.last_bytes), 100_000)
	self._assert_temp_cleaned()

	# ------------------------------------------------------------------
	# Unicode-heavy: must encode UTF-8 to temp file successfully.
	# ------------------------------------------------------------------
	def test_unicode_heavy_encodes_utf8(self):
	ret = app.solve_proof(UNICODE, "llama-3.3-70b-versatile", 1)
	status, code, logs = self._assert_triple(ret)
	self.assertIn("Verified", status)

	# The bytes the agent saw must be the UTF-8 encoding of UNICODE.
	self.assertIsNotNone(_DummyAgent.last_bytes)
	self.assertEqual(_DummyAgent.last_bytes, UNICODE.encode("utf-8"))

	# The `code` returned (read back from disk) must round-trip.
	self.assertEqual(code, UNICODE)

	self._assert_temp_cleaned()

	# ------------------------------------------------------------------
	# Unicode-heavy under simulated POSIX/ASCII locale.
	#
	# On systems where `locale.getpreferredencoding()` is ASCII (e.g. some
	# Docker containers, HuggingFace Spaces with PYTHONUTF8=0 + LC_ALL=C),
	# `open(path, "w")` without an explicit encoding will reject characters
	# like ∀, ℕ, ∃. The fix is to pass `encoding="utf-8"` explicitly when
	# writing the temp file and reading it back.
	#
	# We force ASCII mode by patching the builtins/tempfile callables that
	# `app.solve_proof` uses, defaulting `encoding="ascii"` whenever the
	# caller didn't specify one. If the implementation forgot to pass
	# encoding="utf-8", the unicode write/read will UnicodeEncode/DecodeError.
	# ------------------------------------------------------------------
	def test_unicode_heavy_under_ascii_locale(self):
	real_open = open
	real_ntf = tempfile.NamedTemporaryFile

	def ascii_open(args, *kwargs):
	# In text mode (no "b"), default to ASCII if no encoding given.
	mode = kwargs.get("mode")
	if mode is None and len(args) >= 2:
	mode = args[1]
	if mode is None:
	mode = "r"
	if "b" not in mode and "encoding" not in kwargs:
	kwargs["encoding"] = "ascii"
	return real_open(args, *kwargs)

	def ascii_ntf(args, *kwargs):
	mode = kwargs.get("mode", "w+b")
	if "b" not in mode and "encoding" not in kwargs:
	kwargs["encoding"] = "ascii"
	return real_ntf(args, *kwargs)

	with mock.patch.object(app.tempfile, "NamedTemporaryFile",
	side_effect=ascii_ntf), \
	mock.patch("builtins.open", side_effect=ascii_open):
	ret = app.solve_proof(UNICODE, "llama-3.3-70b-versatile", 1)

	status, code, logs = self._assert_triple(ret)
	# Must NOT crash with a UnicodeError leaking through as "Error:".
	self.assertNotIn(
	"UnicodeEncodeError", status,
	f"solve_proof leaked UnicodeEncodeError under ASCII locale: {status!r}",
	)
	self.assertNotIn(
	"UnicodeDecodeError", status,
	f"solve_proof leaked UnicodeDecodeError under ASCII locale: {status!r}",
	)
	# Must produce a Verified status: the agent didn't fail, the encoding
	# plumbing must transparently use utf-8.
	self.assertIn(
	"Verified", status,
	f"solve_proof returned non-Verified status under ASCII locale: {status!r}",
	)
	self.assertEqual(code, UNICODE)
	self._assert_temp_cleaned()

	# ------------------------------------------------------------------
	# GROQ_API_KEY unset -> "GROQ_API_KEY missing"
	# ------------------------------------------------------------------
	def test_missing_groq_api_key(self):
	with mock.patch.dict(os.environ, {}, clear=True):
	ret = app.solve_proof(NO_SORRY, "llama-3.3-70b-versatile", 1)
	status, code, logs = self._assert_triple(ret)
	self.assertIn("GROQ_API_KEY missing", status)
	# No tempfile created -- function short-circuits.
	self.assertEqual(self._captured_paths, [])
	self.assertEqual(code, "")
	self.assertEqual(logs, "")

	# ------------------------------------------------------------------
	# Agent raises -> "Error:" status; temp file still cleaned up.
	# ------------------------------------------------------------------
	def test_agent_raises_runtime_error(self):
	_DummyAgent.raise_exc = RuntimeError("synthetic blow-up")
	ret = app.solve_proof(NO_SORRY, "llama-3.3-70b-versatile", 1)
	status, code, logs = self._assert_triple(ret)
	self.assertIn("Error", status)
	self.assertIn("synthetic blow-up", status)
	self.assertEqual(code, "")
	# Most important: temp file must be cleaned up even on exception.
	self.assertEqual(len(self._captured_paths), 1)
	self._assert_temp_cleaned()

	# ------------------------------------------------------------------
	# Agent reports success -> "Verified" status.
	# ------------------------------------------------------------------
	def test_agent_success(self):
	_DummyAgent.result = {
	"success": True,
	"solved_at_attempt": 2,
	"total_attempts": 3,
	}
	ret = app.solve_proof(NO_SORRY, "llama-3.3-70b-versatile", 3)
	status, code, logs = self._assert_triple(ret)
	self.assertIn("Verified", status)
	self.assertIn("2", status)
	self.assertIn("3", status)
	self._assert_temp_cleaned()

	# ------------------------------------------------------------------
	# Agent reports failure -> "No proof found" status.
	# ------------------------------------------------------------------
	def test_agent_failure(self):
	_DummyAgent.result = {
	"success": False,
	"solved_at_attempt": None,
	"total_attempts": 5,
	}
	ret = app.solve_proof(NO_SORRY, "llama-3.3-70b-versatile", 5)
	status, code, logs = self._assert_triple(ret)
	self.assertIn("No proof found", status)
	self.assertIn("5", status)
	self._assert_temp_cleaned()

	# ------------------------------------------------------------------
	# Code without `sorry` should still pass through unchanged.
	# ------------------------------------------------------------------
	def test_no_sorry_keyword_passes_through(self):
	ret = app.solve_proof(NO_SORRY, "llama-3.3-70b-versatile", 1)
	status, code, logs = self._assert_triple(ret)
	self.assertIn("Verified", status)
	self.assertEqual(code, NO_SORRY)
	# Bytes seen by agent must equal the input encoded as utf-8.
	self.assertEqual(_DummyAgent.last_bytes, NO_SORRY.encode("utf-8"))
	self._assert_temp_cleaned()

	# ------------------------------------------------------------------
	# Sanity: tempfile cleanup verified for every scenario in one place.
	# ------------------------------------------------------------------
	def test_temp_file_always_cleaned_up(self):
	"""Run every non-short-circuit scenario and assert no tempfile leaks."""
	scenarios = [
	("ok", NO_SORRY, None),
	("huge", HUGE, None),
	("unicode", UNICODE, None),
	("raise", NO_SORRY, RuntimeError("boom")),
	]
	for label, code_in, exc in scenarios:
	_DummyAgent.raise_exc = exc
	with self.subTest(label):
	app.solve_proof(code_in, "llama-3.3-70b-versatile", 1)
	_DummyAgent.raise_exc = None
	self._assert_temp_cleaned()


	if __name__ == "__main__":
	unittest.main()