import re
import unittest

from pipeline_v10 import _citation_repair_needed, _inject_real_references, _reindex_final_answer_by_body_order
from tests.helpers import make_source


class CitationFinalizerTests(unittest.TestCase):
    """Expectations for the citation finalisation helpers from ``pipeline_v10``.

    Exercises ``_reindex_final_answer_by_body_order``,
    ``_inject_real_references`` and ``_citation_repair_needed``.  Sources are
    built with ``make_source(title, year)``; the expected reference lines in
    the assertions show the rendering these tests rely on.
    """

    def test_reindex_final_answer_reorders_body_and_references_together(self):
        """Body citations and reference entries are renumbered with one mapping."""
        text = (
            "Pregnancy warning for isotretinoin [3].\n\n"
            "ACE inhibitors are also avoided [1].\n\n"
            "Warfarin is generally contraindicated [2].\n\n"
            "### References\n\n"
            "[1] ACE inhibitors source\n\n"
            "[2] Warfarin source\n\n"
            "[3] Isotretinoin source"
        )

        result = _reindex_final_answer_by_body_order(text)

        # Body numbers follow order of first appearance ...
        self.assertIn("Pregnancy warning for isotretinoin [1].", result)
        self.assertIn("ACE inhibitors are also avoided [2].", result)
        self.assertIn("Warfarin is generally contraindicated [3].", result)
        # ... and each reference entry moves with its citation.
        self.assertIn("[1] Isotretinoin source", result)
        self.assertIn("[2] ACE inhibitors source", result)
        self.assertIn("[3] Warfarin source", result)

    def test_first_visible_citation_is_renumbered_to_one(self):
        """The first citation in the body becomes ``[1]`` and stale refs are replaced."""
        text = (
            "Medicines such as isotretinoin are not recommended during pregnancy [3].\n\n"
            "Later-pregnancy NSAID exposure is also avoided [1].\n\n"
            "ACE inhibitors are generally not recommended either [2].\n\n"
            "### References\n\n"
            "[1] stale one\n\n"
            "[2] stale two\n\n"
            "[3] stale three"
        )
        sources = [
            make_source("NSAIDs in pregnancy", 2021),
            make_source("ACE inhibitors in pregnancy", 2022),
            make_source("Isotretinoin teratogenicity", 2023),
        ]

        result = _inject_real_references(text, sources)

        self.assertIn("Medicines such as isotretinoin are not recommended during pregnancy [1].", result)
        self.assertIn("Later-pregnancy NSAID exposure is also avoided [2].", result)
        self.assertIn("ACE inhibitors are generally not recommended either [3].", result)
        self.assertIn("[1] Doe et al. *Isotretinoin teratogenicity* (2023). *Journal*. DOI: 10.1000/test", result)
        self.assertIn("[2] Doe et al. *NSAIDs in pregnancy* (2021). *Journal*. DOI: 10.1000/test", result)
        self.assertIn("[3] Doe et al. *ACE inhibitors in pregnancy* (2022). *Journal*. DOI: 10.1000/test", result)
        self.assertNotIn("stale one", result)

    def test_grouped_and_phantom_citations_are_normalized(self):
        """Grouped/range citations are split into single brackets; ``[9]`` has no source."""
        text = (
            "First paragraph uses grouped citations [3, 1, 9].\n\n"
            "Second paragraph uses a range [1-2].\n\n"
            "### References\n\n"
            "[9] bogus"
        )
        sources = [
            make_source("Source A", 2021),
            make_source("Source B", 2022),
            make_source("Source C", 2023),
        ]

        result = _inject_real_references(text, sources)

        self.assertIn("First paragraph uses grouped citations [1][2].", result)
        self.assertIn("Second paragraph uses a range [2][3].", result)
        # Only three sources are supplied, so the phantom [9] must disappear.
        self.assertNotIn("[9]", result)
        self.assertIn("[1] Doe et al. *Source C* (2023). *Journal*. DOI: 10.1000/test", result)
        self.assertIn("[2] Doe et al. *Source A* (2021). *Journal*. DOI: 10.1000/test", result)
        self.assertIn("[3] Doe et al. *Source B* (2022). *Journal*. DOI: 10.1000/test", result)

    def test_uncited_substantive_paragraph_is_backfilled_from_neighbor(self):
        """An uncited substantive paragraph is expected to gain a trailing ``[2]``."""
        text = (
            "Opening paragraph with support [1].\n\n"
            "This middle paragraph has enough words to count as substantive but lacks an inline citation entirely.\n\n"
            "Closing paragraph also carries support from a second source [2]."
        )
        sources = [
            make_source("Source A", 2021),
            make_source("Source B", 2022),
        ]

        result = _inject_real_references(text, sources)

        self.assertIn(
            "This middle paragraph has enough words to count as substantive but lacks an inline citation entirely. [2]",
            result,
        )
        # The input had no references section; exactly one must be emitted.
        self.assertEqual(result.count("### References"), 1)

    def test_references_are_rebuilt_from_used_sources_only(self):
        """Sources that are never cited in the body are left out of the references."""
        text = (
            "Only one source is cited here [2].\n\n"
            "### References\n\n"
            "[1] wrong\n\n"
            "[2] wrong"
        )
        sources = [
            make_source("Source A", 2021),
            make_source("Source B", 2022),
        ]

        result = _inject_real_references(text, sources)

        self.assertIn("Only one source is cited here [1].", result)
        self.assertIn("[1] Doe et al. *Source B* (2022). *Journal*. DOI: 10.1000/test", result)
        self.assertNotIn("*Source A*", result)
        self.assertNotIn("[2] wrong", result)

    def test_plain_references_heading_is_replaced_not_appended(self):
        """A bare ``References`` heading is replaced by a single ``### References``."""
        text = (
            "A paragraph with grouped citations [3, 1, 9].\n\n"
            "References\n"
            "[1] stale ref one\n\n"
            "[2] stale ref two\n\n"
            "[3] stale ref three"
        )
        sources = [
            make_source("Source A", 2021),
            make_source("Source B", 2022),
            make_source("Source C", 2023),
        ]

        result = _inject_real_references(text, sources)

        self.assertNotIn("stale ref", result)
        self.assertEqual(result.count("### References"), 1)
        self.assertNotIn("\nReferences\n", result)
        self.assertIn("A paragraph with grouped citations [1][2].", result)

    def test_repair_needed_flags_grouped_citations(self):
        """A grouped citation such as ``[1, 2]`` is reported as needing repair."""
        text = "Paragraph with grouped citations [1, 2]."
        self.assertTrue(_citation_repair_needed(text, 3))

    def test_repair_needed_flags_uncited_substantive_paragraph(self):
        """A substantive paragraph without any citation is reported as needing repair."""
        text = (
            "A cited paragraph with enough words to count as substantive [1].\n\n"
            "This second paragraph is also substantive, but it currently has no citation at all."
        )
        self.assertTrue(_citation_repair_needed(text, 2))

    def test_repair_not_needed_for_clean_answer(self):
        """An answer whose substantive paragraphs each carry one citation needs no repair."""
        text = (
            "Opening paragraph has enough words to count as substantive and includes a proper citation [1].\n\n"
            "Second substantive paragraph also has enough words and includes support from another source [2].\n\n"
            "### References\n\n"
            "[1] old\n\n"
            "[2] old"
        )
        self.assertFalse(_citation_repair_needed(text, 2))

    def test_escaped_brackets_are_unescaped_and_normalized(self):
        """LLMs sometimes emit \\[1\\]\\[2\\] which markdown renders as [1][2].

        These must be parsed and finalised the same as plain brackets.
        """
        text = (
            "Time crystals break time-translation symmetry \\[1\\]\\[2\\].\n\n"
            "First proposed by Wilczek \\[3\\].\n\n"
            "Discrete time crystals show period doubling \\[4\\]\\[6\\].\n\n"
            "### References\n\n"
            "\\[1\\] stale\n\n"
            "\\[2\\] stale\n\n"
            "\\[3\\] stale\n\n"
            "\\[4\\] stale"
        )
        sources = [
            make_source("Wilczek paper", 2012),
            make_source("Khemani paper", 2016),
            make_source("Else paper", 2016),
            make_source("Zhang observation", 2017),
        ]

        result = _inject_real_references(text, sources)
        body, _, _ = result.partition("### References")

        # Only four sources are supplied, so no body citation above 4 may
        # survive, and no backslash-escaped bracket may remain anywhere.
        self.assertNotIn("[5]", body)
        self.assertNotIn("[6]", body)
        self.assertNotIn("\\[", result)
        self.assertNotIn("\\]", result)

        # The citations left in the body must be exactly the four real sources.
        body_cites = [int(x) for x in re.findall(r"\[(\d+)\]", body)]
        self.assertTrue(body_cites, "expected at least one inline citation in the body")
        self.assertEqual(set(body_cites), {1, 2, 3, 4})

    def test_citations_appear_in_sequential_order(self):
        """First-cited source must be [1], second new source must be [2], etc."""
        text = (
            "Paragraph one cites the third source [3].\n\n"
            "Paragraph two cites the first source [1].\n\n"
            "Paragraph three cites the second source [2]."
        )
        sources = [
            make_source("First in list", 2021),
            make_source("Second in list", 2022),
            make_source("Third in list", 2023),
        ]

        result = _inject_real_references(text, sources)
        body, _, _ = result.partition("### References")

        # The very first citation in the body must be [1].  Guard the match so
        # a citation-free body fails as an assertion, not an AttributeError.
        first_match = re.search(r"\[(\d+)\]", body)
        self.assertIsNotNone(first_match, "expected at least one inline citation in the body")
        self.assertEqual(first_match.group(1), "1")

        # Each paragraph is renumbered by order of appearance.
        self.assertIn("Paragraph one cites the third source [1].", body)
        self.assertIn("Paragraph two cites the first source [2].", body)
        self.assertIn("Paragraph three cites the second source [3].", body)

        # The reference list follows the same renumbering.
        self.assertIn("[1] Doe et al. *Third in list*", result)
        self.assertIn("[2] Doe et al. *First in list*", result)
        self.assertIn("[3] Doe et al. *Second in list*", result)