Marthee committed on
Commit
fcf7255
·
verified ·
1 Parent(s): 2aa8c4b

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +17 -11
InitialMarkups.py CHANGED
@@ -1044,7 +1044,7 @@ def extract_section_under_header(pdf_path):
1044
 
1045
 
1046
  def extract_section_under_header_tobebilledOnly(pdf_path):
1047
- Alltext_Tobebilled=''
1048
  top_margin = 70
1049
  bottom_margin = 50
1050
  headertoContinue1 = False
@@ -1098,6 +1098,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1098
 
1099
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
1100
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
 
1101
  # Precompute all children headers once
1102
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
1103
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
@@ -1113,6 +1114,8 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1113
  subHeaderFontSize= top_3_font_sizes[1]
1114
  subsubheaderFontSize= top_3_font_sizes[1]
1115
 
 
 
1116
  # Preload all pages to avoid repeated loading
1117
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
1118
 
@@ -1120,7 +1123,6 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1120
  heading_to_search = heading_to_searchDict['text']
1121
  heading_to_searchPageNum = heading_to_searchDict['page']
1122
 
1123
-
1124
  # Initialize variables
1125
  headertoContinue1 = False
1126
  headertoContinue2 = False
@@ -1181,8 +1183,10 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1181
 
1182
  # Check if we should continue processing
1183
  if combined_line_norm and combined_line_norm in paths[0]:
 
1184
  headertoContinue1 = combined_line_norm
1185
  if combined_line_norm and combined_line_norm in paths[-2]:
 
1186
  headertoContinue2 = combined_line_norm
1187
  if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
1188
  stringtowrite='Not to be billed'
@@ -1220,13 +1224,12 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1220
  and span['size'] < mainHeaderFontSize)
1221
  ]
1222
  if header_spans and stringtowrite.startswith('To'):
1223
- Alltext_Tobebilled+=combined_line_norm
1224
  collecting = True
1225
  matched_header_font_size = max(span["size"] for span in header_spans)
1226
-
1227
  collected_lines.append(line_text)
1228
  valid_spans = [span for span in spans if span.get("bbox")]
1229
-
1230
  if valid_spans:
1231
  x0s = [span["bbox"][0] for span in valid_spans]
1232
  x1s = [span["bbox"][2] for span in valid_spans]
@@ -1267,7 +1270,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1267
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1268
 
1269
  # Correctly construct the final URL with page and zoom
1270
- final_url = f"{tobebilledonlyLink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
1271
 
1272
  # Get current date and time
1273
  now = datetime.now()
@@ -1314,11 +1317,10 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1314
  and span['size'] < mainHeaderFontSize)
1315
  ]
1316
 
1317
- if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'):
1318
- Alltext_Tobebilled+=combined_line_norm
1319
  collecting = True
1320
  matched_header_font_size = max(span["size"] for span in header_spans)
1321
-
1322
  collected_lines.append(line_text)
1323
  valid_spans = [span for span in spans if span.get("bbox")]
1324
 
@@ -1362,7 +1364,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1362
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1363
 
1364
  # Correctly construct the final URL with page and zoom
1365
- final_url = f"{tobebilledonlyLink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
1366
 
1367
  # Get current date and time
1368
  now = datetime.now()
@@ -1389,6 +1391,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1389
  # Convert list to JSON
1390
  json_output = json.dumps(data_list_JSON, indent=4)
1391
 
 
1392
  i += 2
1393
  continue
1394
  if collecting:
@@ -1465,7 +1468,10 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1465
 
1466
  pdf_bytes = BytesIO()
1467
  docHighlights.save(pdf_bytes)
1468
- return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_Tobebilled
 
 
 
1469
 
1470
 
1471
 
 
1044
 
1045
 
1046
  def extract_section_under_header_tobebilledOnly(pdf_path):
1047
+ Alltexttobebilled=''
1048
  top_margin = 70
1049
  bottom_margin = 50
1050
  headertoContinue1 = False
 
1098
 
1099
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
1100
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
1101
+
1102
  # Precompute all children headers once
1103
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
1104
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
 
1114
  subHeaderFontSize= top_3_font_sizes[1]
1115
  subsubheaderFontSize= top_3_font_sizes[1]
1116
 
1117
+
1118
+
1119
  # Preload all pages to avoid repeated loading
1120
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
1121
 
 
1123
  heading_to_search = heading_to_searchDict['text']
1124
  heading_to_searchPageNum = heading_to_searchDict['page']
1125
 
 
1126
  # Initialize variables
1127
  headertoContinue1 = False
1128
  headertoContinue2 = False
 
1183
 
1184
  # Check if we should continue processing
1185
  if combined_line_norm and combined_line_norm in paths[0]:
1186
+
1187
  headertoContinue1 = combined_line_norm
1188
  if combined_line_norm and combined_line_norm in paths[-2]:
1189
+
1190
  headertoContinue2 = combined_line_norm
1191
  if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
1192
  stringtowrite='Not to be billed'
 
1224
  and span['size'] < mainHeaderFontSize)
1225
  ]
1226
  if header_spans and stringtowrite.startswith('To'):
 
1227
  collecting = True
1228
  matched_header_font_size = max(span["size"] for span in header_spans)
1229
+ Alltexttobebilled+= ' '+ combined_line_norm
1230
  collected_lines.append(line_text)
1231
  valid_spans = [span for span in spans if span.get("bbox")]
1232
+
1233
  if valid_spans:
1234
  x0s = [span["bbox"][0] for span in valid_spans]
1235
  x1s = [span["bbox"][2] for span in valid_spans]
 
1270
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1271
 
1272
  # Correctly construct the final URL with page and zoom
1273
+ final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
1274
 
1275
  # Get current date and time
1276
  now = datetime.now()
 
1317
  and span['size'] < mainHeaderFontSize)
1318
  ]
1319
 
1320
+ if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'):
 
1321
  collecting = True
1322
  matched_header_font_size = max(span["size"] for span in header_spans)
1323
+ Alltexttobebilled+= ' '+ combined_line_norm
1324
  collected_lines.append(line_text)
1325
  valid_spans = [span for span in spans if span.get("bbox")]
1326
 
 
1364
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1365
 
1366
  # Correctly construct the final URL with page and zoom
1367
+ final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
1368
 
1369
  # Get current date and time
1370
  now = datetime.now()
 
1391
  # Convert list to JSON
1392
  json_output = json.dumps(data_list_JSON, indent=4)
1393
 
1394
+
1395
  i += 2
1396
  continue
1397
  if collecting:
 
1468
 
1469
  pdf_bytes = BytesIO()
1470
  docHighlights.save(pdf_bytes)
1471
+ return pdf_bytes.getvalue(), docHighlights , json_output
1472
+
1473
+
1474
+
1475
 
1476
 
1477