InitialMarkups

Runtime error

App Files Files Community

Marthee committed on Nov 17, 2025

Commit

7013b67

verified ·

1 Parent(s): e3090a8

Update InitialMarkups.py

Browse files

Files changed (1) hide show

InitialMarkups.py +50 -16

InitialMarkups.py CHANGED Viewed

@@ -64,6 +64,56 @@ def changepdflinks(json_data, pdf_path):
     return updated_json
 def get_regular_font_size_and_color(doc):
     font_sizes = []
     colors = []
@@ -673,23 +723,7 @@ def extract_section_under_header(multiplePDF_Paths):
         dot_pattern = re.compile(r'\.{3,}')
         url_pattern = re.compile(r'https?://\S+|www\.\S+')
-        def get_toc_page_numbers(doc, max_pages_to_check=15):
-            toc_pages = []
-            for page_num in range(min(len(doc), max_pages_to_check)):
-                page = doc.load_page(page_num)
-                blocks = page.get_text("dict")["blocks"]
-                dot_line_count = 0
-                for block in blocks:
-                    for line in block.get("lines", []):
-                        line_text = get_spaced_text_from_spans(line["spans"]).strip()
-                        if dot_pattern.search(line_text):
-                            dot_line_count += 1
-                if dot_line_count >= 1:
-                    toc_pages.append(page_num)
-            return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
         toc_pages = get_toc_page_numbers(doc)

     return updated_json
# Two or more consecutive dots, e.g. the leader in "Introduction ..... 3".
_TOC_DOT_LEADER = re.compile(r"\.{2,}")
# A line that consists solely of a TOC-style heading, in any letter case.
# Anchoring with ^ and $ keeps sentences such as "The contents of the bag"
# from matching.
_TOC_TITLE = re.compile(
    r"^\s*(table of contents|contents|index)\s*$", re.IGNORECASE
)


def _toc_line_text(line):
    """Return the text of one line dict with whitespace runs collapsed."""
    joined = " ".join(span.get("text", "") for span in line.get("spans", []))
    # Spans often carry their own leading/trailing blanks; joining them with
    # " " would otherwise yield "Table of  Contents" and defeat the title
    # pattern, which expects single spaces between words.
    return " ".join(joined.split())


def _is_toc_page(page):
    """Return True if any line on *page* is a TOC heading or has dot leaders."""
    blocks = page.get_text("dict").get("blocks", [])
    for block in blocks:
        # Blocks without a "lines" entry contribute nothing.
        for line in block.get("lines", []):
            text = _toc_line_text(line)
            if _TOC_DOT_LEADER.search(text) or _TOC_TITLE.match(text):
                return True
    return False


def get_toc_page_numbers(doc, max_pages_to_check=15):
    """Return the leading page indices that make up the front matter / TOC.

    A page counts as a table-of-contents page when at least one of its lines
    either contains a run of two or more dots or consists solely of one of
    the headings "table of contents", "contents" or "index"
    (case-insensitive).

    Args:
        doc: Document object supporting ``len(doc)`` and
            ``doc.load_page(i).get_text("dict")`` (presumably a PyMuPDF
            document -- confirm against the caller).
        max_pages_to_check: Only the first this-many pages are inspected.

    Returns:
        ``[0, 1, ..., last]`` where ``last`` is the highest inspected page
        index that looks like a TOC page, so that every page up to and
        including the TOC is covered; an empty list if no inspected page
        qualifies.
    """
    pages_to_scan = min(len(doc), max_pages_to_check)
    # Only the *last* TOC-looking page determines the result, so scan from
    # the back and stop at the first hit instead of classifying every page.
    for page_num in reversed(range(pages_to_scan)):
        if _is_toc_page(doc.load_page(page_num)):
            return list(range(page_num + 1))
    return []
 def get_regular_font_size_and_color(doc):
     font_sizes = []
     colors = []
         dot_pattern = re.compile(r'\.{3,}')
         url_pattern = re.compile(r'https?://\S+|www\.\S+')
         toc_pages = get_toc_page_numbers(doc)