pdf.tocgen.split / utils /test_toc_processor.py
adelevett's picture
Upload 76 files
046e3b8 verified
raw
history blame
1.67 kB
import unittest
from toc_processor import merge_same_page_headers, clean_text, parse_raw_toc_output
class TestTOCProcessor(unittest.TestCase):
    """Unit tests for the TOC clean-up helpers imported from ``toc_processor``.

    TOC entries are written in these tests as ``[level, title, page]`` lists.
    """

    def test_merge_same_page_headers(self):
        """A title split across two same-level, same-page entries is re-joined."""
        # Scenario: "American Government..." (Page 31) followed by
        # "Divided World" (Page 31) -- one heading broken over two entries.
        input_toc = [
            [1, "Chapter 1 Intro", 5],
            [1, "American Government and Politics in a Racially", 31],
            [1, "Divided World", 31],
            [1, "Chapter 2", 57],
        ]
        expected_toc = [
            [1, "Chapter 1 Intro", 5],
            [1, "American Government and Politics in a Racially Divided World", 31],
            [1, "Chapter 2", 57],
        ]

        result = merge_same_page_headers(input_toc)

        # Targeted checks first so a failure points straight at the merged entry.
        self.assertEqual(len(result), 3)
        self.assertEqual(
            result[1][1],
            "American Government and Politics in a Racially Divided World",
        )
        self.assertEqual(result[1][2], 31)
        # Then compare the whole outline so the neighbouring entries are verified
        # too. Entries are normalised to lists so the check does not depend on
        # whether the helper hands back lists or tuples.
        self.assertEqual([list(entry) for entry in result], expected_toc)

    def test_merge_same_page_headers_mixed_levels(self):
        """Entries on the same page but at different levels are left separate."""
        # Scenario: Level 1 followed by Level 2 on same page (Should NOT merge)
        input_toc = [
            [1, "Chapter 1", 10],
            [2, "Section 1.1", 10],
        ]

        result = merge_same_page_headers(input_toc)

        self.assertEqual(len(result), 2)
        # Both titles must survive as distinct entries, in their original order.
        self.assertEqual(
            [entry[1] for entry in result],
            ["Chapter 1", "Section 1.1"],
        )

    def test_clean_text(self):
        """``clean_text`` yields a plain space for U+00A0 and drops U+00AD."""
        # \xa0 is a non-breaking space, \xad is a soft hyphen.
        dirty = "Hello\xa0World\xad"
        clean = clean_text(dirty)
        self.assertEqual(clean, "Hello World")
# Run the suite when this file is executed directly rather than imported.
if __name__ == "__main__":
    unittest.main()