Spaces:
Sleeping
Sleeping
Update findInitialMarkups.py
Browse files
- findInitialMarkups.py +5 -0
findInitialMarkups.py
CHANGED
|
@@ -531,6 +531,10 @@ def print_tree_with_numbers(headers, listofheaders, indent=0):
|
|
| 531 |
return listofheaders
|
| 532 |
|
| 533 |
def get_toc_page_numbers(doc, max_pages_to_check=15):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 534 |
toc_pages = []
|
| 535 |
for page_num in range(min(len(doc), max_pages_to_check)):
|
| 536 |
page = doc.load_page(page_num)
|
|
@@ -561,6 +565,7 @@ def headersfrompdf(filePath):
|
|
| 561 |
|
| 562 |
doc = fitz.open(stream=pdf_content, filetype="pdf")
|
| 563 |
most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
|
|
|
|
| 564 |
toc_pages = get_toc_page_numbers(doc)
|
| 565 |
hierarchy = build_header_hierarchy(doc,toc_pages, most_common_font_size, most_common_color, most_common_font)
|
| 566 |
assign_numbers_to_headers(hierarchy)
|
|
|
|
| 531 |
return listofheaders
|
| 532 |
|
| 533 |
def get_toc_page_numbers(doc, max_pages_to_check=15):
|
| 534 |
+
# Precompute regex patterns
|
| 535 |
+
dot_pattern = re.compile(r'\.{3,}')
|
| 536 |
+
url_pattern = re.compile(r'https?://\S+|www\.\S+')
|
| 537 |
+
|
| 538 |
toc_pages = []
|
| 539 |
for page_num in range(min(len(doc), max_pages_to_check)):
|
| 540 |
page = doc.load_page(page_num)
|
|
|
|
| 565 |
|
| 566 |
doc = fitz.open(stream=pdf_content, filetype="pdf")
|
| 567 |
most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
|
| 568 |
+
|
| 569 |
toc_pages = get_toc_page_numbers(doc)
|
| 570 |
hierarchy = build_header_hierarchy(doc,toc_pages, most_common_font_size, most_common_color, most_common_font)
|
| 571 |
assign_numbers_to_headers(hierarchy)
|