import re
import unittest

from pipeline_v10 import (
    _citation_repair_needed,
    _inject_real_references,
    _reindex_final_answer_by_body_order,
)
from tests.helpers import make_source

# Matches a single bracketed numeric citation such as "[12]" and captures the
# number. Compiled once at module level instead of re-importing ``re`` inside
# individual tests.
_CITATION_RE = re.compile(r"\[(\d+)\]")


class CitationFinalizerTests(unittest.TestCase):
    """Tests for the citation finalisation helpers imported from ``pipeline_v10``.

    Each test feeds a markdown-style answer (body paragraphs, optionally
    followed by a references section) through one of the helpers and checks
    the inline ``[N]`` markers and/or the rebuilt reference list.
    """

    def test_reindex_final_answer_reorders_body_and_references_together(self):
        """Body citations [3], [1], [2] become [1], [2], [3]; references follow."""
        text = (
            "Pregnancy warning for isotretinoin [3].\n\n"
            "ACE inhibitors are also avoided [1].\n\n"
            "Warfarin is generally contraindicated [2].\n\n"
            "### References\n\n"
            "[1] ACE inhibitors source\n\n"
            "[2] Warfarin source\n\n"
            "[3] Isotretinoin source"
        )

        result = _reindex_final_answer_by_body_order(text)

        self.assertIn("Pregnancy warning for isotretinoin [1].", result)
        self.assertIn("ACE inhibitors are also avoided [2].", result)
        self.assertIn("Warfarin is generally contraindicated [3].", result)
        self.assertIn("[1] Isotretinoin source", result)
        self.assertIn("[2] ACE inhibitors source", result)
        self.assertIn("[3] Warfarin source", result)

    def test_first_visible_citation_is_renumbered_to_one(self):
        """The first citation in the body is [1] and stale references are replaced."""
        text = (
            "Medicines such as isotretinoin are not recommended during pregnancy [3].\n\n"
            "Later-pregnancy NSAID exposure is also avoided [1].\n\n"
            "ACE inhibitors are generally not recommended either [2].\n\n"
            "### References\n\n"
            "[1] stale one\n\n"
            "[2] stale two\n\n"
            "[3] stale three"
        )
        sources = [
            make_source("NSAIDs in pregnancy", 2021),
            make_source("ACE inhibitors in pregnancy", 2022),
            make_source("Isotretinoin teratogenicity", 2023),
        ]

        result = _inject_real_references(text, sources)

        self.assertIn(
            "Medicines such as isotretinoin are not recommended during pregnancy [1].",
            result,
        )
        self.assertIn("Later-pregnancy NSAID exposure is also avoided [2].", result)
        self.assertIn("ACE inhibitors are generally not recommended either [3].", result)
        self.assertIn(
            "[1] Doe et al. *Isotretinoin teratogenicity* (2023). *Journal*. DOI: 10.1000/test",
            result,
        )
        self.assertIn(
            "[2] Doe et al. *NSAIDs in pregnancy* (2021). *Journal*. DOI: 10.1000/test",
            result,
        )
        self.assertIn(
            "[3] Doe et al. *ACE inhibitors in pregnancy* (2022). *Journal*. DOI: 10.1000/test",
            result,
        )
        self.assertNotIn("stale one", result)

    def test_grouped_and_phantom_citations_are_normalized(self):
        """Grouped/ranged markers are split into single markers; [9] is dropped."""
        text = (
            "First paragraph uses grouped citations [3, 1, 9].\n\n"
            "Second paragraph uses a range [1-2].\n\n"
            "### References\n\n"
            "[9] bogus"
        )
        sources = [
            make_source("Source A", 2021),
            make_source("Source B", 2022),
            make_source("Source C", 2023),
        ]

        result = _inject_real_references(text, sources)

        self.assertIn("First paragraph uses grouped citations [1][2].", result)
        self.assertIn("Second paragraph uses a range [2][3].", result)
        # Only three sources were supplied, so [9] has nothing to point at.
        self.assertNotIn("[9]", result)
        self.assertIn("[1] Doe et al. *Source C* (2023). *Journal*. DOI: 10.1000/test", result)
        self.assertIn("[2] Doe et al. *Source A* (2021). *Journal*. DOI: 10.1000/test", result)
        self.assertIn("[3] Doe et al. *Source B* (2022). *Journal*. DOI: 10.1000/test", result)

    def test_uncited_substantive_paragraph_is_backfilled_from_neighbor(self):
        """The uncited middle paragraph gains a trailing `` [2]`` marker."""
        text = (
            "Opening paragraph with support [1].\n\n"
            "This middle paragraph has enough words to count as substantive but lacks an inline citation entirely.\n\n"
            "Closing paragraph also carries support from a second source [2]."
        )
        sources = [
            make_source("Source A", 2021),
            make_source("Source B", 2022),
        ]

        result = _inject_real_references(text, sources)

        self.assertIn(
            "This middle paragraph has enough words to count as substantive but lacks an inline citation entirely. [2]",
            result,
        )
        # The input had no references section; exactly one must be present now.
        self.assertEqual(result.count("### References"), 1)

    def test_references_are_rebuilt_from_used_sources_only(self):
        """Only the cited source is listed, renumbered to [1]; uncited ones vanish."""
        text = (
            "Only one source is cited here [2].\n\n"
            "### References\n\n"
            "[1] wrong\n\n"
            "[2] wrong"
        )
        sources = [
            make_source("Source A", 2021),
            make_source("Source B", 2022),
        ]

        result = _inject_real_references(text, sources)

        self.assertIn("Only one source is cited here [1].", result)
        self.assertIn("[1] Doe et al. *Source B* (2022). *Journal*. DOI: 10.1000/test", result)
        self.assertNotIn("*Source A*", result)
        self.assertNotIn("[2] wrong", result)

    def test_plain_references_heading_is_replaced_not_appended(self):
        """A bare ``References`` heading is replaced by a single ``### References``."""
        text = (
            "A paragraph with grouped citations [3, 1, 9].\n\n"
            "References\n"
            "[1] stale ref one\n\n"
            "[2] stale ref two\n\n"
            "[3] stale ref three"
        )
        sources = [
            make_source("Source A", 2021),
            make_source("Source B", 2022),
            make_source("Source C", 2023),
        ]

        result = _inject_real_references(text, sources)

        self.assertNotIn("stale ref", result)
        self.assertEqual(result.count("### References"), 1)
        self.assertNotIn("\nReferences\n", result)
        self.assertIn("A paragraph with grouped citations [1][2].", result)

    def test_repair_needed_flags_grouped_citations(self):
        """A grouped marker such as ``[1, 2]`` is reported as needing repair."""
        text = "Paragraph with grouped citations [1, 2]."
        self.assertTrue(_citation_repair_needed(text, 3))

    def test_repair_needed_flags_uncited_substantive_paragraph(self):
        """A substantive paragraph with no citation is reported as needing repair."""
        text = (
            "A cited paragraph with enough words to count as substantive [1].\n\n"
            "This second paragraph is also substantive, but it currently has no citation at all."
        )
        self.assertTrue(_citation_repair_needed(text, 2))

    def test_repair_not_needed_for_clean_answer(self):
        """An answer whose paragraphs each carry one plain marker needs no repair."""
        text = (
            "Opening paragraph has enough words to count as substantive and includes a proper citation [1].\n\n"
            "Second substantive paragraph also has enough words and includes support from another source [2].\n\n"
            "### References\n\n"
            "[1] old\n\n"
            "[2] old"
        )
        self.assertFalse(_citation_repair_needed(text, 2))

    def test_escaped_brackets_are_unescaped_and_normalized(self):
        """LLMs sometimes emit \\[1\\]\\[2\\] which markdown renders as [1][2].
        These must be parsed and finalised the same as plain brackets."""
        text = (
            "Time crystals break time-translation symmetry \\[1\\]\\[2\\].\n\n"
            "First proposed by Wilczek \\[3\\].\n\n"
            "Discrete time crystals show period doubling \\[4\\]\\[6\\].\n\n"
            "### References\n\n"
            "\\[1\\] stale\n\n"
            "\\[2\\] stale\n\n"
            "\\[3\\] stale\n\n"
            "\\[4\\] stale"
        )
        sources = [
            make_source("Wilczek paper", 2012),
            make_source("Khemani paper", 2016),
            make_source("Else paper", 2016),
            make_source("Zhang observation", 2017),
        ]

        result = _inject_real_references(text, sources)
        body = result.split("### References")[0]

        # No phantom citations should remain.
        self.assertNotIn("[5]", body)
        self.assertNotIn("[6]", body)
        # No escaped bracket may survive anywhere in the output.
        self.assertNotIn("\\[", result)
        self.assertNotIn("\\]", result)

        # The body must cite exactly the contiguous set 1..4 (only the body
        # is inspected here; reference ordering is covered by other tests).
        body_cites = [int(number) for number in _CITATION_RE.findall(body)]
        self.assertTrue(body_cites, "expected at least one citation in the body")
        self.assertEqual(set(body_cites), {1, 2, 3, 4})

    def test_citations_appear_in_sequential_order(self):
        """First-cited source must be [1], second new source must be [2], etc."""
        text = (
            "Paragraph one cites the third source [3].\n\n"
            "Paragraph two cites the first source [1].\n\n"
            "Paragraph three cites the second source [2]."
        )
        sources = [
            make_source("First in list", 2021),  # body cites this 2nd
            make_source("Second in list", 2022),  # body cites this 3rd
            make_source("Third in list", 2023),  # body cites this 1st
        ]

        result = _inject_real_references(text, sources)
        body = result.split("### References")[0]

        # First [N] in body should be [1]. Guard against a missing match so a
        # citation-free body reports as a test failure, not an AttributeError.
        first_match = _CITATION_RE.search(body)
        self.assertIsNotNone(first_match, "expected at least one citation in the body")
        self.assertEqual(first_match.group(1), "1")

        # Sequential 1, 2, 3 in order of first appearance.
        self.assertIn("Paragraph one cites the third source [1].", body)
        self.assertIn("Paragraph two cites the first source [2].", body)
        self.assertIn("Paragraph three cites the second source [3].", body)

        # References [1] = "Third in list" (the first thing cited in body).
        self.assertIn("[1] Doe et al. *Third in list*", result)
        self.assertIn("[2] Doe et al. *First in list*", result)
        self.assertIn("[3] Doe et al. *Second in list*", result)