Spaces:
Sleeping
Sleeping
Update InitialMarkups.py
Browse files- InitialMarkups.py +120 -0
InitialMarkups.py
CHANGED
|
@@ -1044,6 +1044,126 @@ def extract_section_under_header(pdf_path):
|
|
| 1044 |
|
| 1045 |
|
| 1046 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1047 |
def extract_section_under_header_tobebilledOnly(pdf_path):
|
| 1048 |
Alltexttobebilled=''
|
| 1049 |
alltextWithoutNotbilled=''
|
|
|
|
| 1044 |
|
| 1045 |
|
| 1046 |
|
def extract_section_under_header_withoutNot(pdf_path):
    """Download a PDF and collect normalized text of lines that fall under
    headers classified as 'Not to be billed'.

    A leaf header's path is classified 'Not to be billed' when its parent
    header (``paths[-2]``) mentions 'installation', 'execution' or
    'miscellaneous items'; only line text under such headers is accumulated.

    NOTE(review): despite the name ("withoutNot"), the visible code
    accumulates the NOT-to-be-billed text — confirm this is the intended
    contract before renaming or inverting the condition.

    Fixes vs. the original:
      * the inner ``while`` loop never advanced ``i`` on the normal path
        (infinite loop on the first in-bounds line) — ``i += 1`` added;
      * ``if not pdf_content:`` tested a BytesIO object (always truthy), so
        the empty-download guard never fired — now tests ``response.content``;
      * ``requests.get`` gained a timeout and an HTTP status check;
      * dead scaffolding removed: unused ``filename``/``url_pattern``/
        ``docHighlights``/``Alltexttobebilled``, write-only
        ``headertoContinue1/2``, and ``break_collecting`` (never set True,
        so every break it guarded was unreachable);
      * the document is closed before returning.

    Args:
        pdf_path: URL (http / Dropbox share link) of the PDF to process.

    Returns:
        str: concatenation of the normalized texts of the collected lines.

    Raises:
        ValueError: if the HTTP response body is empty.
        requests.HTTPError: if the download fails with an HTTP error status.
    """
    alltext_not_billed = ''
    top_margin = 70      # ignore text above this y-offset (running page header)
    bottom_margin = 50   # ignore text within this distance of the page bottom (footer)

    # Dropbox share links need dl=1 to return the raw file, not a preview page.
    if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
        pdf_path = pdf_path.replace('dl=0', 'dl=1')

    response = requests.get(pdf_path, timeout=60)
    response.raise_for_status()
    # BUGFIX: the original checked `if not pdf_content:` on a BytesIO, which
    # is always truthy — test the raw bytes instead.
    if not response.content:
        raise ValueError("No valid PDF content found.")
    pdf_content = BytesIO(response.content)

    doc = fitz.open(stream=pdf_content, filetype="pdf")
    most_common_font_size, most_common_color, most_common_font = \
        get_regular_font_size_and_color(doc)

    # Runs of 3+ dots are the leader dots of a table-of-contents entry.
    dot_pattern = re.compile(r'\.{3,}')

    def get_toc_page_numbers(doc, max_pages_to_check=15):
        """Return [0..last TOC page]; a page with >=3 dot-leader lines is TOC."""
        toc_pages = []
        for page_num in range(min(len(doc), max_pages_to_check)):
            page = doc.load_page(page_num)
            blocks = page.get_text("dict")["blocks"]

            dot_line_count = 0
            for block in blocks:
                for line in block.get("lines", []):
                    line_text = get_spaced_text_from_spans(line["spans"]).strip()
                    if dot_pattern.search(line_text):
                        dot_line_count += 1

            if dot_line_count >= 3:
                toc_pages.append(page_num)

        # Treat everything up to and including the last detected TOC page as TOC.
        return list(range(0, toc_pages[-1] + 1)) if toc_pages else toc_pages

    toc_pages = get_toc_page_numbers(doc)

    hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size,
                                       most_common_color, most_common_font)
    listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)

    for heading_to_searchDict, paths in listofHeaderstoMarkup:
        heading_to_searchPageNum = heading_to_searchDict['page']
        for page_num in range(heading_to_searchPageNum, len(doc)):
            if page_num in toc_pages:
                continue
            page = doc[page_num]
            page_height = page.rect.height
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                lines = block.get("lines", [])
                i = 0
                while i < len(lines):
                    spans = lines[i].get("spans", [])
                    if not spans:
                        i += 1
                        continue

                    # Skip lines inside the page's header/footer margin bands.
                    y0 = spans[0]["bbox"][1]
                    y1 = spans[0]["bbox"][3]
                    if y0 < top_margin or y1 > (page_height - bottom_margin):
                        i += 1
                        continue

                    line_text = get_spaced_text_from_spans(spans).lower()
                    line_text_norm = normalize_text(line_text)

                    # Headers can wrap: also consider this line joined with
                    # the next one, when a next line exists.
                    if i + 1 < len(lines):
                        next_spans = lines[i + 1].get("spans", [])
                        next_line_text = get_spaced_text_from_spans(next_spans).lower()
                        combined_line_norm = normalize_text(line_text + " " + next_line_text)
                    else:
                        combined_line_norm = line_text_norm

                    # Classify by the parent header (paths[-2]): installation /
                    # execution / miscellaneous sections are 'Not to be billed'.
                    parent_header = paths[-2].lower()
                    if ('installation' in parent_header
                            or 'execution' in parent_header
                            or 'miscellaneous items' in parent_header):
                        stringtowrite = 'Not to be billed'
                    else:
                        stringtowrite = 'To be billed'
                    if stringtowrite != 'To be billed':
                        alltext_not_billed += combined_line_norm

                    # BUGFIX: the original never advanced `i` on this path,
                    # making the while-loop spin forever on the first
                    # in-bounds line.
                    i += 1

    doc.close()
    return alltext_not_billed
|
| 1167 |
def extract_section_under_header_tobebilledOnly(pdf_path):
|
| 1168 |
Alltexttobebilled=''
|
| 1169 |
alltextWithoutNotbilled=''
|