qalmsw / tests /test_parse.py
pebaryan
Add math formula consistency checker
0ae9d9f
Raw
History Blame Contribute Delete
2.97 kB
from qalmsw.parse import has_math, has_prose, parse_paragraphs
def test_simple_paragraphs():
    """A blank line splits the body into two paragraphs with 1-based spans."""
    source = (
        "\\begin{document}\n"
        "First paragraph here.\n"
        "Still first.\n"
        "\n"
        "Second paragraph.\n"
        "\\end{document}\n"
    )

    paragraphs = parse_paragraphs(source)
    assert len(paragraphs) == 2

    first, second = paragraphs
    # Consecutive source lines stay joined by a newline inside one paragraph.
    assert first.text == "First paragraph here.\nStill first."
    assert (first.start_line, first.end_line) == (2, 3)
    assert second.text == "Second paragraph."
    assert (second.start_line, second.end_line) == (5, 5)
def test_comments_stripped_but_lines_preserved():
    """An unescaped ``%`` comment is dropped; the start line is unaffected."""
    source = (
        "\\begin{document}\n"
        "Line one. % a comment\n"
        "Line two.\n"
        "\\end{document}\n"
    )

    paragraphs = parse_paragraphs(source)
    assert len(paragraphs) == 1

    (only,) = paragraphs
    assert "comment" not in only.text
    assert only.start_line == 2
def test_escaped_percent_not_a_comment():
    """An escaped percent sign is kept verbatim in the paragraph text."""
    source = "\\begin{document}\n50\\% efficiency.\n\\end{document}\n"
    paras = parse_paragraphs(source)
    # Check the count before indexing so a parser regression shows up as a
    # clear assertion failure instead of an IndexError.
    assert len(paras) == 1
    assert paras[0].text == "50\\% efficiency."
def test_no_begin_document():
    """Sources lacking a document environment are parsed from line 1."""
    source = "First line.\n\nSecond line."

    paragraphs = parse_paragraphs(source)
    assert len(paragraphs) == 2

    # The blank line at line 2 separates the two paragraphs.
    start_lines = [paragraph.start_line for paragraph in paragraphs]
    assert start_lines == [1, 3]
def test_content_after_preamble_tracks_lines():
    """Preamble lines still count toward the reported line numbers."""
    source = (
        "\\documentclass{article}\n"
        "\\usepackage{amsmath}\n"
        "\\begin{document}\n"
        "Real content.\n"
        "\\end{document}\n"
    )

    paragraphs = parse_paragraphs(source)
    assert len(paragraphs) == 1

    # Three lines precede the body text, so it sits on source line 4.
    body = paragraphs[0]
    assert body.start_line == 4
def test_has_prose_rejects_bare_commands():
    """Snippets made only of LaTeX commands are not treated as prose."""
    command_only_snippets = (
        "\\maketitle",
        "\\section{Introduction}",
        "\\begin{equation}\\end{equation}",
    )
    for snippet in command_only_snippets:
        assert not has_prose(snippet)
def test_has_prose_accepts_real_sentences():
    """A plain English sentence counts as prose."""
    sentence = "This paper investigates attention mechanisms."
    assert has_prose(sentence)
def test_has_prose_accepts_prose_with_latex_commands():
    """A sentence with an embedded citation command still counts as prose."""
    sentence = "Recent work~\\cite{foo} shows that self-attention scales well."
    assert has_prose(sentence)
def test_has_math_detects_inline_and_display_math():
    """Each math delimiter style exercised here is detected as math."""
    samples = (
        "Let $x=1$ for the rest of the proof.",  # inline $...$
        "Let $$x=1$$ for the rest of the proof.",  # display $$...$$
        "\\begin{equation}x=1\\end{equation}",  # equation environment
        "We define \\(x\\) as the latent variable.",  # \( ... \) delimiters
    )
    for sample in samples:
        assert has_math(sample)
def test_has_math_rejects_plain_prose():
    """Text without any formula is not flagged as math."""
    plain_text = "This paragraph has no formulas."
    assert not has_math(plain_text)
def test_inline_thebibliography_is_excluded_from_body():
    """An inline thebibliography environment yields no body paragraph."""
    source = (
        "\\begin{document}\n"
        "Real body content with enough words here.\n"
        "\n"
        "\\begin{thebibliography}{99}\n"
        "\\bibitem{foo} A. Author, Title, Venue, 2020.\n"
        "\\end{thebibliography}\n"
        "\\end{document}\n"
    )

    paragraphs = parse_paragraphs(source)
    assert len(paragraphs) == 1

    # Neither the environment name nor its entries may leak into the text.
    body_text = paragraphs[0].text
    for marker in ("bibitem", "thebibliography"):
        assert marker not in body_text