Spaces:
Sleeping
Sleeping
Update InitialMarkups.py
Browse files- InitialMarkups.py +17 -11
InitialMarkups.py
CHANGED
|
@@ -1044,7 +1044,7 @@ def extract_section_under_header(pdf_path):
|
|
| 1044 |
|
| 1045 |
|
| 1046 |
def extract_section_under_header_tobebilledOnly(pdf_path):
|
| 1047 |
-
|
| 1048 |
top_margin = 70
|
| 1049 |
bottom_margin = 50
|
| 1050 |
headertoContinue1 = False
|
|
@@ -1098,6 +1098,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1098 |
|
| 1099 |
hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
|
| 1100 |
listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
|
|
|
|
| 1101 |
# Precompute all children headers once
|
| 1102 |
allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
|
| 1103 |
allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
|
|
@@ -1113,6 +1114,8 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1113 |
subHeaderFontSize= top_3_font_sizes[1]
|
| 1114 |
subsubheaderFontSize= top_3_font_sizes[1]
|
| 1115 |
|
|
|
|
|
|
|
| 1116 |
# Preload all pages to avoid repeated loading
|
| 1117 |
# pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
|
| 1118 |
|
|
@@ -1120,7 +1123,6 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1120 |
heading_to_search = heading_to_searchDict['text']
|
| 1121 |
heading_to_searchPageNum = heading_to_searchDict['page']
|
| 1122 |
|
| 1123 |
-
|
| 1124 |
# Initialize variables
|
| 1125 |
headertoContinue1 = False
|
| 1126 |
headertoContinue2 = False
|
|
@@ -1181,8 +1183,10 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1181 |
|
| 1182 |
# Check if we should continue processing
|
| 1183 |
if combined_line_norm and combined_line_norm in paths[0]:
|
|
|
|
| 1184 |
headertoContinue1 = combined_line_norm
|
| 1185 |
if combined_line_norm and combined_line_norm in paths[-2]:
|
|
|
|
| 1186 |
headertoContinue2 = combined_line_norm
|
| 1187 |
if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
|
| 1188 |
stringtowrite='Not to be billed'
|
|
@@ -1220,13 +1224,12 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1220 |
and span['size'] < mainHeaderFontSize)
|
| 1221 |
]
|
| 1222 |
if header_spans and stringtowrite.startswith('To'):
|
| 1223 |
-
Alltext_Tobebilled+=combined_line_norm
|
| 1224 |
collecting = True
|
| 1225 |
matched_header_font_size = max(span["size"] for span in header_spans)
|
| 1226 |
-
|
| 1227 |
collected_lines.append(line_text)
|
| 1228 |
valid_spans = [span for span in spans if span.get("bbox")]
|
| 1229 |
-
|
| 1230 |
if valid_spans:
|
| 1231 |
x0s = [span["bbox"][0] for span in valid_spans]
|
| 1232 |
x1s = [span["bbox"][2] for span in valid_spans]
|
|
@@ -1267,7 +1270,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1267 |
encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
|
| 1268 |
|
| 1269 |
# Correctly construct the final URL with page and zoom
|
| 1270 |
-
final_url = f"{
|
| 1271 |
|
| 1272 |
# Get current date and time
|
| 1273 |
now = datetime.now()
|
|
@@ -1314,11 +1317,10 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1314 |
and span['size'] < mainHeaderFontSize)
|
| 1315 |
]
|
| 1316 |
|
| 1317 |
-
if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'):
|
| 1318 |
-
Alltext_Tobebilled+=combined_line_norm
|
| 1319 |
collecting = True
|
| 1320 |
matched_header_font_size = max(span["size"] for span in header_spans)
|
| 1321 |
-
|
| 1322 |
collected_lines.append(line_text)
|
| 1323 |
valid_spans = [span for span in spans if span.get("bbox")]
|
| 1324 |
|
|
@@ -1362,7 +1364,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1362 |
encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
|
| 1363 |
|
| 1364 |
# Correctly construct the final URL with page and zoom
|
| 1365 |
-
final_url = f"{
|
| 1366 |
|
| 1367 |
# Get current date and time
|
| 1368 |
now = datetime.now()
|
|
@@ -1389,6 +1391,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1389 |
# Convert list to JSON
|
| 1390 |
json_output = json.dumps(data_list_JSON, indent=4)
|
| 1391 |
|
|
|
|
| 1392 |
i += 2
|
| 1393 |
continue
|
| 1394 |
if collecting:
|
|
@@ -1465,7 +1468,10 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1465 |
|
| 1466 |
pdf_bytes = BytesIO()
|
| 1467 |
docHighlights.save(pdf_bytes)
|
| 1468 |
-
return pdf_bytes.getvalue(), docHighlights , json_output
|
|
|
|
|
|
|
|
|
|
| 1469 |
|
| 1470 |
|
| 1471 |
|
|
|
|
| 1044 |
|
| 1045 |
|
| 1046 |
def extract_section_under_header_tobebilledOnly(pdf_path):
|
| 1047 |
+
Alltexttobebilled=''
|
| 1048 |
top_margin = 70
|
| 1049 |
bottom_margin = 50
|
| 1050 |
headertoContinue1 = False
|
|
|
|
| 1098 |
|
| 1099 |
hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
|
| 1100 |
listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
|
| 1101 |
+
|
| 1102 |
# Precompute all children headers once
|
| 1103 |
allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
|
| 1104 |
allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
|
|
|
|
| 1114 |
subHeaderFontSize= top_3_font_sizes[1]
|
| 1115 |
subsubheaderFontSize= top_3_font_sizes[1]
|
| 1116 |
|
| 1117 |
+
|
| 1118 |
+
|
| 1119 |
# Preload all pages to avoid repeated loading
|
| 1120 |
# pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
|
| 1121 |
|
|
|
|
| 1123 |
heading_to_search = heading_to_searchDict['text']
|
| 1124 |
heading_to_searchPageNum = heading_to_searchDict['page']
|
| 1125 |
|
|
|
|
| 1126 |
# Initialize variables
|
| 1127 |
headertoContinue1 = False
|
| 1128 |
headertoContinue2 = False
|
|
|
|
| 1183 |
|
| 1184 |
# Check if we should continue processing
|
| 1185 |
if combined_line_norm and combined_line_norm in paths[0]:
|
| 1186 |
+
|
| 1187 |
headertoContinue1 = combined_line_norm
|
| 1188 |
if combined_line_norm and combined_line_norm in paths[-2]:
|
| 1189 |
+
|
| 1190 |
headertoContinue2 = combined_line_norm
|
| 1191 |
if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
|
| 1192 |
stringtowrite='Not to be billed'
|
|
|
|
| 1224 |
and span['size'] < mainHeaderFontSize)
|
| 1225 |
]
|
| 1226 |
if header_spans and stringtowrite.startswith('To'):
|
|
|
|
| 1227 |
collecting = True
|
| 1228 |
matched_header_font_size = max(span["size"] for span in header_spans)
|
| 1229 |
+
Alltexttobebilled+= ' '+ combined_line_norm
|
| 1230 |
collected_lines.append(line_text)
|
| 1231 |
valid_spans = [span for span in spans if span.get("bbox")]
|
| 1232 |
+
|
| 1233 |
if valid_spans:
|
| 1234 |
x0s = [span["bbox"][0] for span in valid_spans]
|
| 1235 |
x1s = [span["bbox"][2] for span in valid_spans]
|
|
|
|
| 1270 |
encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
|
| 1271 |
|
| 1272 |
# Correctly construct the final URL with page and zoom
|
| 1273 |
+
final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
|
| 1274 |
|
| 1275 |
# Get current date and time
|
| 1276 |
now = datetime.now()
|
|
|
|
| 1317 |
and span['size'] < mainHeaderFontSize)
|
| 1318 |
]
|
| 1319 |
|
| 1320 |
+
if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'):
|
|
|
|
| 1321 |
collecting = True
|
| 1322 |
matched_header_font_size = max(span["size"] for span in header_spans)
|
| 1323 |
+
Alltexttobebilled+= ' '+ combined_line_norm
|
| 1324 |
collected_lines.append(line_text)
|
| 1325 |
valid_spans = [span for span in spans if span.get("bbox")]
|
| 1326 |
|
|
|
|
| 1364 |
encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
|
| 1365 |
|
| 1366 |
# Correctly construct the final URL with page and zoom
|
| 1367 |
+
final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
|
| 1368 |
|
| 1369 |
# Get current date and time
|
| 1370 |
now = datetime.now()
|
|
|
|
| 1391 |
# Convert list to JSON
|
| 1392 |
json_output = json.dumps(data_list_JSON, indent=4)
|
| 1393 |
|
| 1394 |
+
|
| 1395 |
i += 2
|
| 1396 |
continue
|
| 1397 |
if collecting:
|
|
|
|
| 1468 |
|
| 1469 |
pdf_bytes = BytesIO()
|
| 1470 |
docHighlights.save(pdf_bytes)
|
| 1471 |
+
return pdf_bytes.getvalue(), docHighlights , json_output
|
| 1472 |
+
|
| 1473 |
+
|
| 1474 |
+
|
| 1475 |
|
| 1476 |
|
| 1477 |
|