Spaces:
Paused
Paused
Update InitialMarkups.py
Browse files- InitialMarkups.py +408 -3
InitialMarkups.py
CHANGED
|
@@ -8,8 +8,11 @@ Original file is located at
|
|
| 8 |
"""
|
| 9 |
baselink='https://findconsole-initialmarkups.hf.space/view-pdf?'
|
| 10 |
|
| 11 |
-
|
| 12 |
|
|
|
|
|
|
|
|
|
|
| 13 |
from io import BytesIO
|
| 14 |
import re
|
| 15 |
import requests
|
|
@@ -26,6 +29,13 @@ from collections import defaultdict, Counter
|
|
| 26 |
import difflib
|
| 27 |
from fuzzywuzzy import fuzz
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
def get_regular_font_size_and_color(doc):
|
| 30 |
font_sizes = []
|
| 31 |
colors = []
|
|
@@ -203,6 +213,7 @@ def extract_headers(doc, toc_pages, most_common_font_size, most_common_color, mo
|
|
| 203 |
|
| 204 |
# Get the smallest font size among valid ones
|
| 205 |
smallest_font_size = min(valid_font_sizes) if valid_font_sizes else None
|
|
|
|
| 206 |
return headers, top_3_font_sizes, smallest_font_size, spans
|
| 207 |
|
| 208 |
def is_numbered(text):
|
|
@@ -606,6 +617,10 @@ def extract_section_under_header(pdf_path):
|
|
| 606 |
bottom_margin = 50
|
| 607 |
headertoContinue1 = False
|
| 608 |
headertoContinue2=False
|
|
|
|
|
|
|
|
|
|
|
|
|
| 609 |
|
| 610 |
# Optimized URL handling
|
| 611 |
if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
|
|
@@ -845,7 +860,8 @@ def extract_section_under_header(pdf_path):
|
|
| 845 |
"Layer": "Initial",
|
| 846 |
"Code": stringtowrite,
|
| 847 |
"head above 1": paths[-2],
|
| 848 |
-
"head above 2": paths[0]
|
|
|
|
| 849 |
}
|
| 850 |
data_list_JSON.append(data_entry)
|
| 851 |
|
|
@@ -941,7 +957,8 @@ def extract_section_under_header(pdf_path):
|
|
| 941 |
"Layer": "Initial",
|
| 942 |
"Code": stringtowrite,
|
| 943 |
"head above 1": paths[-2],
|
| 944 |
-
"head above 2": paths[0]
|
|
|
|
| 945 |
}
|
| 946 |
data_list_JSON.append(data_entry)
|
| 947 |
|
|
@@ -1028,3 +1045,391 @@ def extract_section_under_header(pdf_path):
|
|
| 1028 |
docHighlights.save(pdf_bytes)
|
| 1029 |
print('JSONN',json_output)
|
| 1030 |
return pdf_bytes.getvalue(), docHighlights , json_output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
"""
|
| 9 |
baselink='https://findconsole-initialmarkups.hf.space/view-pdf?'
|
| 10 |
|
| 11 |
+
newlink='https://findconsole-initialmarkups.hf.space/view-highlight?'
|
| 12 |
|
| 13 |
+
|
| 14 |
+
from urllib.parse import urlparse, unquote
|
| 15 |
+
import os
|
| 16 |
from io import BytesIO
|
| 17 |
import re
|
| 18 |
import requests
|
|
|
|
| 29 |
import difflib
|
| 30 |
from fuzzywuzzy import fuzz
|
| 31 |
|
| 32 |
+
def filteredJsons(pdf_path, filteredjsonsfromrawan):
    """Run the Rawan-list section extractor over a pre-filtered heading payload.

    Args:
        pdf_path: URL of the PDF to process (Dropbox share links supported).
        filteredjsonsfromrawan: a single heading string, or a list of heading
            dicts (each with at least 'Subject', 'Page', 'head above 1') as
            consumed by extract_section_under_headerRawan.

    Returns:
        The (pdf_bytes, highlight_doc, enriched_json_list) tuple produced by
        extract_section_under_headerRawan.
    """
    # for heading in subjects:
    # BUG FIX: the extractor's parameter is named `headingjson`, not
    # `listofheadingsfromrawan` — the old keyword raised a TypeError on every
    # call. Also propagate the extractor's result instead of discarding it
    # (backward-compatible: callers that ignored the old None still work).
    return extract_section_under_headerRawan(pdf_path=pdf_path,
                                             headingjson=filteredjsonsfromrawan)
|
| 39 |
def get_regular_font_size_and_color(doc):
|
| 40 |
font_sizes = []
|
| 41 |
colors = []
|
|
|
|
| 213 |
|
| 214 |
# Get the smallest font size among valid ones
|
| 215 |
smallest_font_size = min(valid_font_sizes) if valid_font_sizes else None
|
| 216 |
+
|
| 217 |
return headers, top_3_font_sizes, smallest_font_size, spans
|
| 218 |
|
| 219 |
def is_numbered(text):
|
|
|
|
| 617 |
bottom_margin = 50
|
| 618 |
headertoContinue1 = False
|
| 619 |
headertoContinue2=False
|
| 620 |
+
|
| 621 |
+
parsed_url = urlparse(pdf_path)
|
| 622 |
+
filename = os.path.basename(parsed_url.path)
|
| 623 |
+
filename = unquote(filename) # decode URL-encoded characters
|
| 624 |
|
| 625 |
# Optimized URL handling
|
| 626 |
if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
|
|
|
|
| 860 |
"Layer": "Initial",
|
| 861 |
"Code": stringtowrite,
|
| 862 |
"head above 1": paths[-2],
|
| 863 |
+
"head above 2": paths[0],
|
| 864 |
+
"MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
|
| 865 |
}
|
| 866 |
data_list_JSON.append(data_entry)
|
| 867 |
|
|
|
|
| 957 |
"Layer": "Initial",
|
| 958 |
"Code": stringtowrite,
|
| 959 |
"head above 1": paths[-2],
|
| 960 |
+
"head above 2": paths[0],
|
| 961 |
+
"MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
|
| 962 |
}
|
| 963 |
data_list_JSON.append(data_entry)
|
| 964 |
|
|
|
|
| 1045 |
docHighlights.save(pdf_bytes)
|
| 1046 |
print('JSONN',json_output)
|
| 1047 |
return pdf_bytes.getvalue(), docHighlights , json_output
|
| 1048 |
+
|
| 1049 |
+
|
| 1050 |
+
|
| 1051 |
+
|
| 1052 |
+
def extract_section_under_headerRawan(pdf_path, headingjson, pagenum=0, incomingheader=0):
    """Locate supplied headings in a PDF, highlight their sections, build deep links.

    Variant of extract_section_under_header that is driven by an externally
    supplied heading list ("Rawan" JSON) instead of headings discovered in the
    document itself.

    Args:
        pdf_path: URL of the PDF; Dropbox 'dl=0' links are rewritten to 'dl=1'.
        headingjson: either a single heading string, or a list of dicts each
            carrying 'Subject', 'Page' (1-based) and 'head above 1'.
        pagenum: 0-based start page, used only when headingjson is a string.
        incomingheader: parent-header text driving the billed/not-billed label;
            overwritten per entry when dicts are supplied.
            NOTE(review): the default 0 would make `'installation' in
            incomingheader` raise TypeError — confirm string-heading callers
            always pass a string here.

    Returns:
        Tuple of (highlighted PDF bytes, the fitz highlight document, list of
        heading dicts enriched with an 'NBSLink' viewer deep link).
    """
    top_margin = 70
    bottom_margin = 50
    # Optimized URL handling: force direct download for Dropbox share links.
    if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
        pdf_path = pdf_path.replace('dl=0', 'dl=1')

    # Cache frequently used values: download once, open two copies of the
    # document (one for analysis, one that receives highlight annotations).
    response = requests.get(pdf_path)
    pdf_content = BytesIO(response.content)
    if not pdf_content:
        raise ValueError("No valid PDF content found.")

    doc = fitz.open(stream=pdf_content, filetype="pdf")
    docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
    most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)

    # Precompute regex patterns
    dot_pattern = re.compile(r'\.{3,}')            # dotted TOC leader lines
    url_pattern = re.compile(r'https?://\S+|www\.\S+')

    def get_toc_page_numbers(doc, max_pages_to_check=15):
        # A page containing >= 3 dotted-leader lines is treated as a TOC page;
        # every page up to and including the last such page is skipped later.
        toc_pages = []
        for page_num in range(min(len(doc), max_pages_to_check)):
            page = doc.load_page(page_num)
            blocks = page.get_text("dict")["blocks"]

            dot_line_count = 0
            for block in blocks:
                for line in block.get("lines", []):
                    line_text = get_spaced_text_from_spans(line["spans"]).strip()
                    if dot_pattern.search(line_text):
                        dot_line_count += 1

            if dot_line_count >= 3:
                toc_pages.append(page_num)

        return list(range(0, toc_pages[-1] + 1)) if toc_pages else toc_pages

    toc_pages = get_toc_page_numbers(doc)

    headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
        doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
    )

    # Normalize the incoming payload: accept a bare string or a list of dicts.
    listofheadingsfromrawan = []
    if type(headingjson) == str:
        listofheadingsfromrawan.append(headingjson)
        headingjson = [headingjson]
    else:
        for item in headingjson:
            listofheadingsfromrawan.append(normalize_text(item['Subject']))
    print('hereeeeeeeeeeeeeee0', listofheadingsfromrawan)
    # Precompute all children headers once
    allchildrenheaders = listofheadingsfromrawan
    print('hereeeeeeeeeeeeeee00', allchildrenheaders)
    allchildrenheaders_set = set(allchildrenheaders)  # For faster lookups

    # NOTE(review): df and data_list_JSON are created but never used below.
    df = pd.DataFrame(columns=["NBSLink", "Subject", "Page", "Author", "Creation Date", "Layer", 'Code', 'head above 1', "head above 2"])
    data_list_JSON = []

    # Derive the header font hierarchy; with only two detected sizes the sub
    # and sub-sub levels share the smaller one.
    if len(top_3_font_sizes) == 3:
        mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes
    elif len(top_3_font_sizes) == 2:
        mainHeaderFontSize = top_3_font_sizes[0]
        subHeaderFontSize = top_3_font_sizes[1]
        subsubheaderFontSize = top_3_font_sizes[1]

    print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)

    # Preload all pages to avoid repeated loading
    # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
    newjsonList = []
    for heading_to_searchDict in headingjson:
        # Each entry is either a raw heading string (search starts at
        # `pagenum`) or a dict providing its own 1-based page + parent header.
        if type(heading_to_searchDict) == str:
            heading_to_search = heading_to_searchDict
            heading_to_searchPageNum = pagenum
        else:
            heading_to_search = heading_to_searchDict['Subject']
            heading_to_searchPageNum = int(heading_to_searchDict['Page']) - 1
            incomingheader = heading_to_searchDict['head above 1']

        print('hereeeeeeeeeeeeeee0', heading_to_searchPageNum)
        done = False
        collecting = False            # True while accumulating the section body
        collected_lines = []
        page_highlights = {}          # page -> final bbox passed to highlighter
        current_bbox = {}             # page -> running union bbox
        last_y1s = {}                 # page -> bottom edge of last kept line
        mainHeader = ''
        subHeader = ''
        matched_header_line_norm = heading_to_search
        break_collecting = False
        heading_norm = normalize_text(heading_to_search)

        for page_num in range(heading_to_searchPageNum, len(doc)):
            print('hereeeeeeeeeeeeeee1')
            if page_num in toc_pages:
                continue
            if break_collecting:
                break
            page = doc[page_num]
            page_height = page.rect.height
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                if break_collecting:
                    break

                lines = block.get("lines", [])
                i = 0
                while i < len(lines):
                    if break_collecting:
                        break

                    spans = lines[i].get("spans", [])
                    if not spans:
                        i += 1
                        continue

                    # Skip running header/footer margin bands.
                    y0 = spans[0]["bbox"][1]
                    y1 = spans[0]["bbox"][3]
                    if y0 < top_margin or y1 > (page_height - bottom_margin):
                        i += 1
                        continue

                    line_text = get_spaced_text_from_spans(spans).lower()
                    line_text_norm = normalize_text(line_text)

                    # Combine with next line if available (headings may wrap).
                    if i + 1 < len(lines):
                        next_spans = lines[i + 1].get("spans", [])
                        next_line_text = get_spaced_text_from_spans(next_spans).lower()
                        combined_line_norm = normalize_text(line_text + " " + next_line_text)
                    else:
                        combined_line_norm = line_text_norm
                    # Optimized header matching: exact membership in the
                    # supplied heading list plus raw containment.
                    existsfull = (
                        (combined_line_norm in allchildrenheaders_set or
                         combined_line_norm in allchildrenheaders) and heading_to_search in combined_line_norm
                    )

                    # New word-based matching
                    current_line_words = set(combined_line_norm.split())
                    heading_words = set(heading_norm.split())
                    all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0

                    substring_match = (
                        heading_norm in combined_line_norm or
                        combined_line_norm in heading_norm or
                        all_words_match  # Include the new word-based matching
                    )

                    if (substring_match and existsfull and not collecting and
                            len(combined_line_norm) > 0):  # and (headertoContinue1 or headertoContinue2) ):

                        # Check header conditions more efficiently
                        header_spans = [
                            span for span in spans
                            if (is_header(span, most_common_font_size, most_common_color, most_common_font)
                                # and span['size'] >= subsubheaderFontSize
                                and span['size'] < mainHeaderFontSize)
                        ]
                        if header_spans:
                            collecting = True
                            matched_header_font_size = max(span["size"] for span in header_spans)
                            print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")

                            collected_lines.append(line_text)
                            valid_spans = [span for span in spans if span.get("bbox")]

                            if valid_spans:
                                x0s = [span["bbox"][0] for span in valid_spans]
                                x1s = [span["bbox"][2] for span in valid_spans]
                                y0s = [span["bbox"][1] for span in valid_spans]
                                y1s = [span["bbox"][3] for span in valid_spans]

                                header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]

                                # Grow the running per-page bbox to include
                                # the matched header line.
                                if page_num in current_bbox:
                                    cb = current_bbox[page_num]
                                    current_bbox[page_num] = [
                                        min(cb[0], header_bbox[0]),
                                        min(cb[1], header_bbox[1]),
                                        max(cb[2], header_bbox[2]),
                                        max(cb[3], header_bbox[3])
                                    ]
                                else:
                                    current_bbox[page_num] = header_bbox
                                last_y1s[page_num] = header_bbox[3]
                                x0, y0, x1, y1 = header_bbox

                                zoom = 200
                                left = int(x0)
                                top = int(y0)
                                zoom_str = f"{zoom},{left},{top}"
                                pageNumberFound = page_num + 1

                                # Build the query parameters
                                params = {
                                    'pdfLink': pdf_path,  # Your PDF link
                                    'keyword': heading_to_search,  # Your keyword (could be a string or list)
                                }

                                # URL encode each parameter
                                encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}

                                # Construct the final encoded link
                                encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])

                                # Correctly construct the final URL with page and zoom
                                final_url = f"{newlink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"

                                # Get current date and time
                                now = datetime.now()

                                # Format the output
                                formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
                                # Optionally, add the URL to a DataFrame
                                new_url = final_url
                                if type(heading_to_searchDict) != str:
                                    heading_to_searchDict['NBSLink'] = new_url
                                    newjsonList.append(heading_to_searchDict)
                                print("Final URL:", final_url)
                            i += 2  # skip the merged next line as well
                            continue
                    else:
                        # Fuzzier fallback: no exact list membership required —
                        # accept a full word-ratio match or same-start-word match.
                        if (substring_match and not collecting and
                                len(combined_line_norm) > 0):  # and (headertoContinue1 or headertoContinue2) ):

                            # Calculate word match percentage
                            word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100

                            # Check if at least 70% of header words exist in this line
                            # NOTE(review): the comparison below is 100, not 70.
                            meets_word_threshold = word_match_percent >= 100

                            # Check header conditions (including word threshold)
                            header_spans = [
                                span for span in spans
                                if (is_header(span, most_common_font_size, most_common_color, most_common_font)
                                    # and span['size'] >= subsubheaderFontSize
                                    and span['size'] < mainHeaderFontSize)
                            ]

                            if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm)):
                                collecting = True
                                matched_header_font_size = max(span["size"] for span in header_spans)
                                print(f"📥 Start collecting after header: {combined_line_norm} "
                                      f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")

                                collected_lines.append(line_text)
                                valid_spans = [span for span in spans if span.get("bbox")]

                                if valid_spans:
                                    x0s = [span["bbox"][0] for span in valid_spans]
                                    x1s = [span["bbox"][2] for span in valid_spans]
                                    y0s = [span["bbox"][1] for span in valid_spans]
                                    y1s = [span["bbox"][3] for span in valid_spans]

                                    header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]

                                    if page_num in current_bbox:
                                        cb = current_bbox[page_num]
                                        current_bbox[page_num] = [
                                            min(cb[0], header_bbox[0]),
                                            min(cb[1], header_bbox[1]),
                                            max(cb[2], header_bbox[2]),
                                            max(cb[3], header_bbox[3])
                                        ]
                                    else:
                                        current_bbox[page_num] = header_bbox

                                    last_y1s[page_num] = header_bbox[3]
                                    x0, y0, x1, y1 = header_bbox
                                    zoom = 200
                                    left = int(x0)
                                    top = int(y0)
                                    zoom_str = f"{zoom},{left},{top}"
                                    pageNumberFound = page_num + 1

                                    # Build the query parameters
                                    params = {
                                        'pdfLink': pdf_path,  # Your PDF link
                                        'keyword': heading_to_search,  # Your keyword (could be a string or list)
                                    }

                                    # URL encode each parameter
                                    encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}

                                    # Construct the final encoded link
                                    encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])

                                    # Correctly construct the final URL with page and zoom
                                    final_url = f"{newlink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
                                    new_url = final_url
                                    if type(heading_to_searchDict) != str:
                                        heading_to_searchDict['NBSLink'] = new_url
                                        newjsonList.append(heading_to_searchDict)
                                    print("Final URL:", final_url)
                                i += 2
                                continue
                    if collecting:
                        norm_line = normalize_text(line_text)

                        # Optimized URL check: a URL line never counts as a header.
                        if url_pattern.match(norm_line):
                            line_is_header = False
                        else:
                            line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans)

                        if line_is_header:
                            header_font_size = max(span["size"] for span in spans)
                            is_probably_real_header = (
                                header_font_size >= matched_header_font_size and
                                is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
                                len(line_text.strip()) > 2
                            )

                            # A different header of equal-or-larger font ends
                            # the current section.
                            if (norm_line != matched_header_line_norm and
                                    norm_line != heading_norm and
                                    is_probably_real_header):
                                if line_text not in heading_norm:
                                    print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
                                    collecting = False
                                    done = True
                                    headertoContinue1 = False
                                    headertoContinue2 = False
                                    # NOTE(review): this loop variable shadows
                                    # the outer page_num loop counter.
                                    for page_num, bbox in current_bbox.items():
                                        bbox[3] = last_y1s.get(page_num, bbox[3])
                                        page_highlights[page_num] = bbox

                                    # Billing label is decided by the parent
                                    # ('head above 1') header text.
                                    if 'installation' in incomingheader or 'execution' in incomingheader or 'miscellaneous items' in incomingheader:
                                        stringtowrite = 'Not to be billed'
                                    else:
                                        stringtowrite = 'To be billed'
                                    highlight_boxes(docHighlights, page_highlights, stringtowrite)

                                    break_collecting = True
                                    break

                        if break_collecting:
                            break

                        # Regular body line: record its text and extend the
                        # per-page highlight bbox.
                        collected_lines.append(line_text)
                        valid_spans = [span for span in spans if span.get("bbox")]
                        if valid_spans:
                            x0s = [span["bbox"][0] for span in valid_spans]
                            x1s = [span["bbox"][2] for span in valid_spans]
                            y0s = [span["bbox"][1] for span in valid_spans]
                            y1s = [span["bbox"][3] for span in valid_spans]

                            line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]

                            if page_num in current_bbox:
                                cb = current_bbox[page_num]
                                current_bbox[page_num] = [
                                    min(cb[0], line_bbox[0]),
                                    min(cb[1], line_bbox[1]),
                                    max(cb[2], line_bbox[2]),
                                    max(cb[3], line_bbox[3])
                                ]
                            else:
                                current_bbox[page_num] = line_bbox

                            last_y1s[page_num] = line_bbox[3]
                    i += 1

        # Section ran to the end of the document without a terminating header:
        # flush whatever was collected for this heading.
        if not done:
            for page_num, bbox in current_bbox.items():
                bbox[3] = last_y1s.get(page_num, bbox[3])
                page_highlights[page_num] = bbox
            if 'installation' in incomingheader or 'execution' in incomingheader or 'miscellaneous items' in incomingheader:
                stringtowrite = 'Not to be billed'
            else:
                stringtowrite = 'To be billed'
            highlight_boxes(docHighlights, page_highlights, stringtowrite)

    # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)

    pdf_bytes = BytesIO()
    docHighlights.save(pdf_bytes)
    return pdf_bytes.getvalue(), docHighlights, newjsonList