Marthee committed on
Commit
06527d8
·
verified ·
1 Parent(s): 166e454

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +120 -0
InitialMarkups.py CHANGED
@@ -1044,6 +1044,126 @@ def extract_section_under_header(pdf_path):
1044
 
1045
 
1046
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1047
  def extract_section_under_header_tobebilledOnly(pdf_path):
1048
  Alltexttobebilled=''
1049
  alltextWithoutNotbilled=''
 
1044
 
1045
 
1046
 
1047
def extract_section_under_header_withoutNot(pdf_path):
    """Collect normalized body text from a PDF, keeping only lines whose leaf
    header path falls under a "Not to be billed" section.

    NOTE(review): despite the variable name, text is appended when the section
    is classified as 'Not to be billed' (path contains 'installation',
    'execution', or 'miscellaneous items') — confirm this inversion is intended.

    Parameters:
        pdf_path: URL (http/Dropbox links supported) of the PDF to download
            and parse.

    Returns:
        str: concatenation of the normalized line texts collected.

    Raises:
        ValueError: kept from the original code, though the `BytesIO` check
            can never trigger (a BytesIO instance is always truthy).

    Depends on module-level helpers defined elsewhere in this file:
    get_regular_font_size_and_color, get_spaced_text_from_spans,
    normalize_text, build_header_hierarchy, get_leaf_headers_with_paths.
    """
    Alltexttobebilled = ''            # NOTE(review): never used — kept for parity with sibling function
    alltextWithoutNotbilled = ''
    top_margin = 70                   # ignore text above this y-coordinate (header zone)
    bottom_margin = 50                # ignore text below page_height - bottom_margin (footer zone)
    headertoContinue1 = False         # write-only in this function; mirrors sibling implementations
    headertoContinue2 = False

    parsed_url = urlparse(pdf_path)
    filename = os.path.basename(parsed_url.path)
    filename = unquote(filename)  # decode URL-encoded characters

    # Dropbox share links need dl=1 to return the raw file instead of a preview page.
    if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
        pdf_path = pdf_path.replace('dl=0', 'dl=1')

    response = requests.get(pdf_path)
    pdf_content = BytesIO(response.content)
    if not pdf_content:  # NOTE(review): dead check — BytesIO is always truthy
        raise ValueError("No valid PDF content found.")

    doc = fitz.open(stream=pdf_content, filetype="pdf")
    docHighlights = fitz.open(stream=pdf_content, filetype="pdf")  # NOTE(review): opened but unused here
    most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)

    # Precompiled patterns: dot leaders identify table-of-contents lines.
    dot_pattern = re.compile(r'\.{3,}')
    url_pattern = re.compile(r'https?://\S+|www\.\S+')  # NOTE(review): unused in this function

    def get_toc_page_numbers(doc, max_pages_to_check=15):
        """Return page indices 0..last page that looks like a TOC (>=3 dot-leader lines)."""
        toc_pages = []
        for page_num in range(min(len(doc), max_pages_to_check)):
            page = doc.load_page(page_num)
            blocks = page.get_text("dict")["blocks"]

            dot_line_count = 0
            for block in blocks:
                for line in block.get("lines", []):
                    line_text = get_spaced_text_from_spans(line["spans"]).strip()
                    if dot_pattern.search(line_text):
                        dot_line_count += 1

            if dot_line_count >= 3:
                toc_pages.append(page_num)

        # Treat every page up to (and including) the last TOC-like page as TOC.
        return list(range(0, toc_pages[-1] + 1)) if toc_pages else toc_pages

    toc_pages = get_toc_page_numbers(doc)

    hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
    listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)

    for heading_to_searchDict, paths in listofHeaderstoMarkup:
        heading_to_search = heading_to_searchDict['text']
        heading_to_searchPageNum = heading_to_searchDict['page']
        break_collecting = False  # NOTE(review): never set True — the page/block early-breaks are dead code
        for page_num in range(heading_to_searchPageNum, len(doc)):
            if page_num in toc_pages:
                continue
            if break_collecting:
                break
            page = doc[page_num]
            page_height = page.rect.height
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                if break_collecting:
                    break

                lines = block.get("lines", [])
                i = 0
                while i < len(lines):
                    if break_collecting:
                        break

                    spans = lines[i].get("spans", [])
                    if not spans:
                        i += 1
                        continue

                    # Skip lines inside the running header/footer margins.
                    y0 = spans[0]["bbox"][1]
                    y1 = spans[0]["bbox"][3]
                    if y0 < top_margin or y1 > (page_height - bottom_margin):
                        i += 1
                        continue

                    line_text = get_spaced_text_from_spans(spans).lower()
                    line_text_norm = normalize_text(line_text)

                    # Combine with next line if available (headers can wrap across lines).
                    if i + 1 < len(lines):
                        next_spans = lines[i + 1].get("spans", [])
                        next_line_text = get_spaced_text_from_spans(next_spans).lower()
                        combined_line_norm = normalize_text(line_text + " " + next_line_text)
                    else:
                        combined_line_norm = line_text_norm

                    # Track which part of the header path this line matched (write-only).
                    if combined_line_norm and combined_line_norm in paths[0]:
                        headertoContinue1 = combined_line_norm
                    if combined_line_norm and combined_line_norm in paths[-2]:
                        headertoContinue2 = combined_line_norm

                    # Classify the section by its parent header in the path.
                    if ('installation' in paths[-2].lower()
                            or 'execution' in paths[-2].lower()
                            or 'miscellaneous items' in paths[-2].lower()):
                        stringtowrite = 'Not to be billed'
                    else:
                        stringtowrite = 'To be billed'
                    if stringtowrite != 'To be billed':
                        alltextWithoutNotbilled += combined_line_norm

                    # BUG FIX: the original never advanced `i` after processing a
                    # line that passed the margin checks, so the while-loop spun
                    # forever on the first content line (break_collecting is never
                    # set True, so nothing else could exit the loop).
                    i += 1

    return alltextWithoutNotbilled


##############################################################3
1167
  def extract_section_under_header_tobebilledOnly(pdf_path):
1168
  Alltexttobebilled=''
1169
  alltextWithoutNotbilled=''