Marthee committed on
Commit
1e42a9f
·
verified ·
1 Parent(s): 9f1156f

Update findInitialMarkups.py

Browse files
Files changed (1) hide show
  1. findInitialMarkups.py +5 -0
findInitialMarkups.py CHANGED
@@ -531,6 +531,10 @@ def print_tree_with_numbers(headers, listofheaders, indent=0):
531
  return listofheaders
532
 
533
  def get_toc_page_numbers(doc, max_pages_to_check=15):
 
 
 
 
534
  toc_pages = []
535
  for page_num in range(min(len(doc), max_pages_to_check)):
536
  page = doc.load_page(page_num)
@@ -561,6 +565,7 @@ def headersfrompdf(filePath):
561
 
562
  doc = fitz.open(stream=pdf_content, filetype="pdf")
563
  most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
 
564
  toc_pages = get_toc_page_numbers(doc)
565
  hierarchy = build_header_hierarchy(doc,toc_pages, most_common_font_size, most_common_color, most_common_font)
566
  assign_numbers_to_headers(hierarchy)
 
531
  return listofheaders
532
 
533
  def get_toc_page_numbers(doc, max_pages_to_check=15):
534
+ # Precompute regex patterns
535
+ dot_pattern = re.compile(r'\.{3,}')
536
+ url_pattern = re.compile(r'https?://\S+|www\.\S+')
537
+
538
  toc_pages = []
539
  for page_num in range(min(len(doc), max_pages_to_check)):
540
  page = doc.load_page(page_num)
 
565
 
566
  doc = fitz.open(stream=pdf_content, filetype="pdf")
567
  most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
568
+
569
  toc_pages = get_toc_page_numbers(doc)
570
  hierarchy = build_header_hierarchy(doc,toc_pages, most_common_font_size, most_common_color, most_common_font)
571
  assign_numbers_to_headers(hierarchy)