Marthee committed on
Commit
7013b67
·
verified ·
1 Parent(s): e3090a8

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +50 -16
InitialMarkups.py CHANGED
@@ -64,6 +64,56 @@ def changepdflinks(json_data, pdf_path):
64
  return updated_json
65
 
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  def get_regular_font_size_and_color(doc):
68
  font_sizes = []
69
  colors = []
@@ -673,23 +723,7 @@ def extract_section_under_header(multiplePDF_Paths):
673
  dot_pattern = re.compile(r'\.{3,}')
674
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
675
 
676
- def get_toc_page_numbers(doc, max_pages_to_check=15):
677
- toc_pages = []
678
- for page_num in range(min(len(doc), max_pages_to_check)):
679
- page = doc.load_page(page_num)
680
- blocks = page.get_text("dict")["blocks"]
681
-
682
- dot_line_count = 0
683
- for block in blocks:
684
- for line in block.get("lines", []):
685
- line_text = get_spaced_text_from_spans(line["spans"]).strip()
686
- if dot_pattern.search(line_text):
687
- dot_line_count += 1
688
-
689
- if dot_line_count >= 1:
690
- toc_pages.append(page_num)
691
 
692
- return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
693
 
694
  toc_pages = get_toc_page_numbers(doc)
695
 
 
64
  return updated_json
65
 
66
 
67
+
68
def get_toc_page_numbers(doc, max_pages_to_check=15):
    """Return the leading page indices that cover the front matter and the TOC.

    Only the first ``max_pages_to_check`` pages of ``doc`` are inspected. A page
    is treated as a table-of-contents page when at least one of its text lines
    either

    * contains a run of two or more consecutive dots (dot leaders), or
    * consists solely of the heading "Table of Contents", "Contents" or
      "Index" (case-insensitive, surrounding whitespace ignored).

    Args:
        doc: Document object supporting ``len()`` and ``load_page(index)``;
            each page must provide ``get_text("dict")`` returning a mapping
            with a ``"blocks"`` list whose text blocks hold ``"lines"`` made
            of ``"spans"`` with a ``"text"`` entry.
            NOTE(review): presumably a PyMuPDF document - confirm with caller.
        max_pages_to_check: Upper bound on how many leading pages are scanned.

    Returns:
        ``[0, 1, ..., last]`` where ``last`` is the highest scanned page index
        that matched, so pages preceding the TOC (cover, inside cover, ...)
        are included as well. An empty list when no scanned page matched.
    """
    # Two or more dots in a row are taken as a dot leader ("Intro .... 3").
    dot_pattern = re.compile(r"\.{2,}")

    # The heading must be the whole line, so running text such as
    # "The contents of the bag..." is not mistaken for a TOC title.
    # Words are separated by \s+ (not a literal single space) because span
    # texts are joined with " " below: a span that already ends in whitespace
    # ("Table of " + "Contents") would otherwise yield a double space and
    # the heading would be missed.
    title_pattern = re.compile(
        r"^\s*(table\s+of\s+contents|contents|index)\s*$", re.IGNORECASE
    )

    last_toc_page = None
    for page_num in range(min(len(doc), max_pages_to_check)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]

        # Blocks without a "lines" entry contribute no text lines.
        line_texts = (
            " ".join(span["text"] for span in line["spans"]).strip()
            for block in blocks
            for line in block.get("lines", [])
        )

        # A single matching line (dot leader or heading) is enough to flag
        # the page; this keeps detection sensitive to one-entry lists.
        if any(
            dot_pattern.search(text) or title_pattern.match(text)
            for text in line_texts
        ):
            last_toc_page = page_num

    if last_toc_page is None:
        return []

    # Everything up to and including the last TOC page is reported.
    return list(range(last_toc_page + 1))
115
+
116
+
117
  def get_regular_font_size_and_color(doc):
118
  font_sizes = []
119
  colors = []
 
723
  dot_pattern = re.compile(r'\.{3,}')
724
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
725
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
726
 
 
727
 
728
  toc_pages = get_toc_page_numbers(doc)
729