"""Tests for the LLM Artifact Checker.""" from pathlib import Path from qalmsw.checkers import ArtifactChecker, Severity from qalmsw.document import Document def _doc(source: str) -> Document: return Document(path=Path("paper.tex"), source=source, paragraphs=[]) # ---- Meta-comment patterns ---- def test_here_is_summary(): source = r"""\begin{document} Here is a 200-word summary of the results. \end{document}""" findings = ArtifactChecker().check(_doc(source)) matched = [f for f in findings if "meta-comment" in f.message and "summary" in f.message] assert len(matched) >= 1 assert matched[0].severity == Severity.error assert matched[0].excerpt is not None assert "200-word" in matched[0].excerpt def test_would_you_like_me_to(): source = r"""\begin{document} Here is the analysis. Would you like me to make any changes? \end{document}""" findings = ArtifactChecker().check(_doc(source)) matched = [f for f in findings if "meta-comment" in f.message] assert len(matched) >= 1 assert matched[0].severity == Severity.error def test_id_be_happy_to(): source = r"""\begin{document} I'd be happy to help you with this section. \end{document}""" findings = ArtifactChecker().check(_doc(source)) matched = [f for f in findings if "meta-comment" in f.message] assert len(matched) >= 1 # ---- Placeholder patterns ---- def test_fill_in_with_real_data(): source = r"""\begin{document} Table~\ref{tab:results} shows the performance. Fill in with the real numbers. \end{document}""" findings = ArtifactChecker().check(_doc(source)) matched = [f for f in findings if "placeholder" in f.message] assert len(matched) >= 1 assert matched[0].severity == Severity.error def test_illustrative_data(): source = r"""\begin{document} The data in this table is illustrative only. \end{document}""" findings = ArtifactChecker().check(_doc(source)) matched = [f for f in findings if "illustrative" in f.message] assert len(matched) >= 1 def test_insert_your_content_here(): source = r"""\begin{document} Insert your content here. \end{document}""" findings = ArtifactChecker().check(_doc(source)) matched = [f for f in findings if "insert your content" in f.message] assert len(matched) >= 1 def test_placeholder_caption(): source = r"""\begin{document} \begin{figure} \caption{Placeholder caption text} \end{figure} \end{document}""" findings = ArtifactChecker().check(_doc(source)) matched = [f for f in findings if "Phantom caption" in f.message] assert len(matched) >= 1 assert matched[0].severity == Severity.error def test_placeholder_label(): source = r"""\begin{document} \label{fig:placeholder} \end{document}""" findings = ArtifactChecker().check(_doc(source)) matched = [f for f in findings if "Phantom label" in f.message] assert len(matched) >= 1 # ---- Self-awareness patterns ---- def test_as_an_ai_language_model(): source = r"""\begin{document} As an AI language model, I cannot browse the internet. \end{document}""" findings = ArtifactChecker().check(_doc(source)) matched = [f for f in findings if "self-reference" in f.message] assert len(matched) >= 1 def test_i_cannot_provide(): source = r"""\begin{document} I can't provide the specific implementation details. \end{document}""" findings = ArtifactChecker().check(_doc(source)) matched = [f for f in findings if "refusal" in f.message] assert len(matched) >= 1 def test_im_sorry(): source = r"""\begin{document} I'm sorry, but I cannot access real-time data. \end{document}""" findings = ArtifactChecker().check(_doc(source)) matched = [f for f in findings if "sorry" in f.message] assert len(matched) >= 1 # ---- LaTeX artifact patterns ---- def test_lipsum(): source = r"""\begin{document} \lipsum[1-4] \end{document}""" findings = ArtifactChecker().check(_doc(source)) matched = [f for f in findings if "\\lipsum" in f.message] assert len(matched) >= 1 assert matched[0].severity == Severity.warning def test_todo_marker(): source = r"""\begin{document} TODO: add experimental results here. \end{document}""" findings = ArtifactChecker().check(_doc(source)) matched = [f for f in findings if "TODO" in f.message] assert len(matched) >= 1 # ---- Clean document produces no findings ---- def test_clean_document_no_findings(): source = r"""\begin{document} We propose a novel method for geometric algebra flow matching. Our approach outperforms existing methods across three benchmarks. \bibliography{refs} \end{document}""" findings = ArtifactChecker().check(_doc(source)) assert findings == [] # ---- Preamble scanning ---- def test_preamble_artifact(): source = r"""\label{sec:placeholder} \begin{document} Clean body text here. \end{document}""" findings = ArtifactChecker().check(_doc(source)) matched = [f for f in findings if "Phantom label" in f.message] assert len(matched) >= 1