Spaces:
Runtime error
Runtime error
Update InitialMarkups.py
Browse files- InitialMarkups.py +50 -16
InitialMarkups.py
CHANGED
|
@@ -64,6 +64,56 @@ def changepdflinks(json_data, pdf_path):
|
|
| 64 |
return updated_json
|
| 65 |
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
def get_regular_font_size_and_color(doc):
|
| 68 |
font_sizes = []
|
| 69 |
colors = []
|
|
@@ -673,23 +723,7 @@ def extract_section_under_header(multiplePDF_Paths):
|
|
| 673 |
dot_pattern = re.compile(r'\.{3,}')
|
| 674 |
url_pattern = re.compile(r'https?://\S+|www\.\S+')
|
| 675 |
|
| 676 |
-
def get_toc_page_numbers(doc, max_pages_to_check=15):
|
| 677 |
-
toc_pages = []
|
| 678 |
-
for page_num in range(min(len(doc), max_pages_to_check)):
|
| 679 |
-
page = doc.load_page(page_num)
|
| 680 |
-
blocks = page.get_text("dict")["blocks"]
|
| 681 |
-
|
| 682 |
-
dot_line_count = 0
|
| 683 |
-
for block in blocks:
|
| 684 |
-
for line in block.get("lines", []):
|
| 685 |
-
line_text = get_spaced_text_from_spans(line["spans"]).strip()
|
| 686 |
-
if dot_pattern.search(line_text):
|
| 687 |
-
dot_line_count += 1
|
| 688 |
-
|
| 689 |
-
if dot_line_count >= 1:
|
| 690 |
-
toc_pages.append(page_num)
|
| 691 |
|
| 692 |
-
return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
|
| 693 |
|
| 694 |
toc_pages = get_toc_page_numbers(doc)
|
| 695 |
|
|
|
|
| 64 |
return updated_json
|
| 65 |
|
| 66 |
|
| 67 |
+
|
| 68 |
+
def get_toc_page_numbers(doc, max_pages_to_check=15):
    """Return the leading page indices up to and including the last TOC page.

    Only the first ``max_pages_to_check`` pages of ``doc`` are scanned. A page
    counts as a table-of-contents page when at least one of its text lines
    either contains a run of two or more dots (a dot leader such as
    ``"Intro .... 3"``) or consists solely of the heading "Table of Contents",
    "Contents" or "Index" (case-insensitive).

    Args:
        doc: Object supporting ``len(doc)`` and ``doc.load_page(i)``, where the
            returned page offers ``get_text("dict")`` yielding a mapping with a
            ``"blocks"`` list (presumably a PyMuPDF document -- confirm
            against the caller).
        max_pages_to_check: Upper bound on how many leading pages are scanned.

    Returns:
        ``list(range(0, last + 1))`` where ``last`` is the highest scanned page
        index identified as a TOC page (so cover pages before the TOC are
        included), or ``[]`` when no scanned page qualifies.
    """
    # Two or more consecutive dots are treated as a dot leader.
    dot_pattern = re.compile(r"\.{2,}")
    # Whole-line headings only, so prose such as "The contents of the bag"
    # is ignored. ``\s+`` between the words tolerates any amount of
    # whitespace inside "Table of Contents".
    title_pattern = re.compile(
        r"^\s*(table\s+of\s+contents|contents|index)\s*$", re.IGNORECASE
    )

    def _line_text(line):
        # Spans frequently carry their own leading/trailing spaces; joining
        # them with " " would then yield double spaces, so collapse every
        # whitespace run to a single space (this also strips both ends).
        joined = " ".join(span["text"] for span in line["spans"])
        return " ".join(joined.split())

    def _is_toc_line(text):
        return bool(dot_pattern.search(text) or title_pattern.match(text))

    last_toc_page = None
    for page_num in range(min(len(doc), max_pages_to_check)):
        blocks = doc.load_page(page_num).get_text("dict")["blocks"]
        # Blocks without a "lines" entry contribute no lines.
        # A single qualifying line is enough to flag the page.
        if any(
            _is_toc_line(_line_text(line))
            for block in blocks
            for line in block.get("lines", [])
        ):
            last_toc_page = page_num

    if last_toc_page is None:
        return []
    # Everything from the first page through the last TOC page.
    return list(range(last_toc_page + 1))
|
| 115 |
+
|
| 116 |
+
|
| 117 |
def get_regular_font_size_and_color(doc):
|
| 118 |
font_sizes = []
|
| 119 |
colors = []
|
|
|
|
| 723 |
dot_pattern = re.compile(r'\.{3,}')
|
| 724 |
url_pattern = re.compile(r'https?://\S+|www\.\S+')
|
| 725 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 726 |
|
|
|
|
| 727 |
|
| 728 |
toc_pages = get_toc_page_numbers(doc)
|
| 729 |
|