Marthee committed on
Commit 30e5400 · verified · 1 Parent(s): 47356e3

Upload InitialMarkups.py

Files changed (1): InitialMarkups.py (+978, -0)

InitialMarkups.py ADDED
@@ -0,0 +1,978 @@
# -*- coding: utf-8 -*-
"""Copy of FindSpecsTrial(Retrieving+boundingBoxes)-InitialMarkups(ALL)_CleanedUp.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/12XfVkmKmN3oVjHhLVE0_GgkftgArFEK2
"""

# These were notebook cells; as plain Python they are shell commands, so run
# them from a terminal (or prefix with "!" inside Colab):
#   pip install pymupdf
#   pip install fuzzywuzzy

from io import BytesIO
import re
import requests
import pandas as pd
import fitz  # PyMuPDF
import urllib.parse
import math
import random
import json
from datetime import datetime
from collections import defaultdict, Counter
import difflib
from fuzzywuzzy import fuzz

def get_regular_font_size_and_color(doc):
    font_sizes = []
    colors = []
    fonts = []

    # Loop through all pages and collect every span's styling
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        for block in page.get_text("dict")["blocks"]:
            if "lines" in block:
                for line in block["lines"]:
                    for span in line["spans"]:
                        font_sizes.append(span['size'])
                        colors.append(span['color'])
                        fonts.append(span['font'])

    # The most common font size, color, and font define the body-text baseline
    most_common_font_size = Counter(font_sizes).most_common(1)[0][0] if font_sizes else None
    most_common_color = Counter(colors).most_common(1)[0][0] if colors else None
    most_common_font = Counter(fonts).most_common(1)[0][0] if fonts else None

    return most_common_font_size, most_common_color, most_common_font

def normalize_text(text):
    if text is None:
        return ""
    return re.sub(r'\s+', ' ', text.strip().lower())

def get_spaced_text_from_spans(spans):
    return normalize_text(" ".join(span["text"].strip() for span in spans))

def is_header(span, most_common_font_size, most_common_color, most_common_font):
    fontname = span.get("font", "").lower()
    # is_italic = "italic" in fontname or "oblique" in fontname
    is_bold = "bold" in fontname or span.get("bold", False)
    return (
        span["size"] > most_common_font_size or
        span["font"].lower() != most_common_font.lower() or
        is_bold
    )

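# Quick sanity check for is_header() (illustrative values, not from a real PDF;
# the keys mirror PyMuPDF's get_text("dict") span fields):
#   >>> is_header({"size": 14.0, "font": "Arial-BoldMT"}, 10.0, 0, "ArialMT")
#   True   # larger than the body text, a different font, and bold
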
def add_span_to_nearest_group(span_y, grouped_dict, pageNum=None, threshold=0.5):
    for (p, y) in grouped_dict:
        if pageNum is not None and p != pageNum:
            continue
        if abs(y - span_y) <= threshold:
            return (p, y)
    return (pageNum, span_y)

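# Note: this helper is not called elsewhere in this script. It snaps a span's
# y-coordinate onto an existing (page, y) group key when within `threshold`,
# e.g. (illustrative): add_span_to_nearest_group(100.3, {(0, 100.0): []}, 0)
# returns (0, 100.0); with no close key it returns (0, 100.3).
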
def extract_headers(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin):
    print("Font baseline:", most_common_font_size, most_common_color, most_common_font)

    grouped_headers = defaultdict(list)
    spans = []
    line_merge_threshold = 1.5  # Max vertical distance between lines of the same header

    for pageNum in range(len(doc)):
        if pageNum in toc_pages:
            continue
        page = doc.load_page(pageNum)
        page_height = page.rect.height
        text_instances = page.get_text("dict")

        # First pass: collect all potential header spans on this page
        potential_header_spans = []
        for block in text_instances['blocks']:
            if block['type'] != 0:
                continue

            for line in block['lines']:
                for span in line['spans']:
                    span_y0 = span['bbox'][1]
                    span_y1 = span['bbox'][3]

                    if span_y0 < top_margin or span_y1 > (page_height - bottom_margin):
                        continue

                    span_text = normalize_text(span.get('text', ''))
                    if not span_text:
                        continue
                    if span_text.startswith('http://www') or span_text.startswith('www'):
                        continue
                    if any((
                        'page' in span_text,
                        not re.search(r'[a-z0-9]', span_text),
                        'end of section' in span_text,
                        re.search(r'page\s+\d+\s+of\s+\d+', span_text),
                        re.search(r'\b(?:\d{1,2}[/-])?\d{1,2}[/-]\d{2,4}\b', span_text),
                        # re.search(r'\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)', span_text),
                        'specification:' in span_text
                    )):
                        continue

                    cleaned_text = re.sub(r'[.\-]{4,}.*$', '', span_text).strip()
                    cleaned_text = normalize_text(cleaned_text)

                    if is_header(span, most_common_font_size, most_common_color, most_common_font):
                        potential_header_spans.append({
                            'text': cleaned_text,
                            'size': span['size'],
                            'pageNum': pageNum,
                            'y0': span_y0,
                            'y1': span_y1,
                            'x0': span['bbox'][0],
                            'x1': span['bbox'][2],
                            'span': span
                        })

        # Sort spans by vertical position (top to bottom)
        potential_header_spans.sort(key=lambda s: (s['pageNum'], s['y0']))

        # Second pass: merge spans that are vertically close into one header
        i = 0
        while i < len(potential_header_spans):
            current = potential_header_spans[i]
            header_text = current['text']
            header_size = current['size']
            header_page = current['pageNum']
            min_y = current['y0']
            max_y = current['y1']
            spans_group = [current['span']]

            # Look ahead for adjacent lines that are likely part of the same header
            j = i + 1
            while j < len(potential_header_spans):
                next_span = potential_header_spans[j]
                # Same page, vertically close, and similar styling
                if (next_span['pageNum'] == header_page and
                        next_span['y0'] - max_y < line_merge_threshold and
                        abs(next_span['size'] - header_size) < 0.5):
                    header_text += " " + next_span['text']
                    max_y = next_span['y1']
                    spans_group.append(next_span['span'])
                    j += 1
                else:
                    break

            # Add the merged header
            grouped_headers[(header_page, min_y)].append({
                "text": header_text.strip(),
                "size": header_size,
                "pageNum": header_page,
                "spans": spans_group
            })
            spans.extend(spans_group)
            i = j  # Skip the spans we have already merged

    # Prepare the final headers list
    headers = []
    for (pageNum, y), header_groups in sorted(grouped_headers.items()):
        for group in header_groups:
            headers.append([
                group['text'],
                group['size'],
                group['pageNum'],
                y
            ])

    font_sizes = [size for _, size, _, _ in headers]
    font_size_counts = Counter(font_sizes)

    # Keep font sizes that appear at least 3 times
    valid_font_sizes = [size for size, count in font_size_counts.items() if count >= 3]

    # Sort in descending order
    valid_font_sizes_sorted = sorted(valid_font_sizes, reverse=True)

    # If only 2 sizes survive, repeat the second one
    if len(valid_font_sizes_sorted) == 2:
        top_3_font_sizes = [valid_font_sizes_sorted[0], valid_font_sizes_sorted[1], valid_font_sizes_sorted[1]]
    else:
        top_3_font_sizes = valid_font_sizes_sorted[:3]

    # Smallest font size among the valid ones
    smallest_font_size = min(valid_font_sizes) if valid_font_sizes else None
    return headers, top_3_font_sizes, smallest_font_size, spans

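# For reference, extract_headers() returns: a list of [text, size, pageNum, y]
# entries, the (up to) three most frequent header font sizes in descending
# order, the smallest recurring header size, and the raw spans that were
# grouped into headers.
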
def is_numbered(text):
    return bool(re.match(r'^\d', text.strip()))

def is_similar(a, b, threshold=0.85):
    return difflib.SequenceMatcher(None, a, b).ratio() > threshold

def normalize(text):
    text = text.lower()
    text = re.sub(r'\.{2,}', '', text)  # remove long dot leaders
    text = re.sub(r'\s+', ' ', text)    # collapse runs of whitespace
    return text.strip()

def clean_toc_entry(toc_text):
    """Remove page numbers and formatting from TOC entries."""
    # Remove everything after the last run of dots/whitespace followed by digits
    return re.sub(r'[\.\s]+\d+.*$', '', toc_text).strip('. ')

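# Doctest-style examples for the two TOC cleaners (illustrative input):
#   >>> clean_toc_entry("Preliminaries ............ 12")
#   'Preliminaries'
#   >>> normalize("Scope   OF  Works....")
#   'scope of works'
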
def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin=70, bottom_margin=70):
    # Extract headers with margin handling
    headers_list, top_3_font_sizes, smallest_font_size, spans = extract_headers(
        doc,
        toc_pages=toc_pages,
        most_common_font_size=most_common_font_size,
        most_common_color=most_common_color,
        most_common_font=most_common_font,
        top_margin=top_margin,
        bottom_margin=bottom_margin
    )

    # Step 1: Collect and filter potential headers
    headers = []

    # First extract TOC entries to get exact level 0 header texts
    toc_entries = {}
    for pno in toc_pages:
        page = doc.load_page(pno)
        toc_text = page.get_text()
        for line in toc_text.split('\n'):
            clean_line = line.strip()
            if clean_line:
                norm_line = normalize(clean_line)
                toc_entries[norm_line] = clean_line  # Store original text

    for h in headers_list:
        text, size, pageNum, y = h[:4]
        page = doc.load_page(pageNum)
        page_height = page.rect.height

        # Skip margin areas
        if y < top_margin or y > (page_height - bottom_margin):
            continue

        norm_text = normalize(text)
        if len(norm_text) > 2 and size >= most_common_font_size:
            headers.append({
                "text": text,
                "page": pageNum,
                "y": y,
                "size": size,
                "bold": h[4] if len(h) > 4 else False,
                "color": h[6] if len(h) > 6 else None,
                "font": h[7] if len(h) > 7 else None,
                "children": [],
                "is_numbered": is_numbered(text),
                "original_size": size,
                "norm_text": norm_text,
                "level": -1  # Unassigned
            })

    # Sort by page and vertical position
    headers.sort(key=lambda h: (h['page'], h['y']))

    # Step 2: Detect consecutive headers and assign provisional levels
    i = 0
    while i < len(headers) - 1:
        current = headers[i]
        next_header = headers[i + 1]

        # Same page and very close vertically (likely consecutive lines)
        if (current['page'] == next_header['page'] and
                abs(current['y'] - next_header['y']) < 20):  # 20pt threshold for "same line"

            # Case 1: Both unassigned - make current level 1 and next level 2
            if current['level'] == -1 and next_header['level'] == -1:
                current['level'] = 1
                next_header['level'] = 2
                i += 1  # Skip next header since we processed it

            # Case 2: Current unassigned, next assigned - current goes one level above
            elif current['level'] == -1 and next_header['level'] != -1:
                current['level'] = max(1, next_header['level'] - 1)

            # Case 3: Current assigned, next unassigned - next goes one level below
            elif current['level'] != -1 and next_header['level'] == -1:
                next_header['level'] = current['level'] + 1
                i += 1  # Skip next header since we processed it
        i += 1

    # Step 3: Identify level 0 headers (largest and present in the TOC)
    max_size, subheaderSize, nbsheadersize = top_3_font_sizes
    print("Main header font size:", max_size)
    toc_text_match = []
    toc_matches = []
    # Iterate over a copy: headers may be removed inside the loop
    for h in list(headers):
        norm_text = h['norm_text']
        matching_toc_texts = []

        # Check exact matches and substring matches in both directions
        for toc_norm, toc_text in toc_entries.items():
            # Exact match case
            if norm_text == toc_norm and len(toc_text) > 4 and h['size'] == max_size:
                matching_toc_texts.append(toc_text)
            # Header is a substring of the TOC entry
            elif norm_text in toc_norm and len(toc_text) > 4 and h['size'] == max_size:
                matching_toc_texts.append(toc_text)
            # TOC entry is a substring of the header
            elif toc_norm in norm_text and len(toc_text) > 4 and h['size'] == max_size:
                matching_toc_texts.append(toc_text)

        if matching_toc_texts and h['size'] >= max_size * 0.9:
            best_match = max(matching_toc_texts,
                             key=lambda x: (len(x), -len(x.replace(norm_text, ''))))
            h['text'] = normalize_text(clean_toc_entry(best_match))
            h['level'] = 0
            if h['text'] not in toc_text_match:
                toc_matches.append(h)
                toc_text_match.append(h['text'])
        elif matching_toc_texts and h['size'] < max_size * 0.9 and h['size'] > nbsheadersize:
            print(h['text'], matching_toc_texts)
            headers.remove(h)
            continue

    # Remove duplicates - keep only the first occurrence of each level 0 header
    unique_level0 = []
    seen_level0 = set()
    for h in toc_matches:
        # Use the cleaned text for duplicate checking
        cleaned_text = clean_toc_entry(h['text'])
        norm_cleaned_text = normalize(cleaned_text)

        if norm_cleaned_text not in seen_level0:
            seen_level0.add(norm_cleaned_text)
            # Update the header text with the cleaned version
            h['text'] = cleaned_text
            unique_level0.append(h)
            print(f"Added unique header: {cleaned_text} (normalized: {norm_cleaned_text})")

    # Step 4: Group headers under each level 0 parent and identify the level 1 format
    level0_headers = [h for h in headers if h['level'] == 0]
    header_groups = []

    for i, level0 in enumerate(level0_headers):
        start_idx = headers.index(level0)
        end_idx = headers.index(level0_headers[i + 1]) if i + 1 < len(level0_headers) else len(headers)
        header_groups.append(headers[start_idx:end_idx])

    # Now process each group to identify the level 1 format
    for group in header_groups:
        level1_candidates = [h for h in group[1:] if h['level'] == -1]

        if not level1_candidates:
            continue

        # The first candidate is our reference level 1
        first_level1 = level1_candidates[0]
        level1_format = {
            'font': first_level1['font'],
            'color': first_level1['color'],
            'starts_with_number': is_numbered(first_level1['text']),
            'size': first_level1['size'],
            'bold': first_level1['bold']
        }

        # Assign levels based on the reference format
        for h in level1_candidates:
            current_format = {
                'font': h['font'],
                'color': h['color'],
                'starts_with_number': is_numbered(h['text']),
                'size': h['size'],
                'bold': h['bold']
            }

            # Compare with the level 1 reference format
            if (current_format['font'] == level1_format['font'] and
                    current_format['color'] == level1_format['color'] and
                    current_format['starts_with_number'] == level1_format['starts_with_number'] and
                    abs(current_format['size'] - level1_format['size']) <= 0.1 and
                    current_format['bold'] == level1_format['bold']):
                h['level'] = 1
            else:
                h['level'] = 2

    # Step 5: Assign levels to the remaining unassigned headers
    unassigned = [h for h in headers if h['level'] == -1]
    if unassigned:
        # Cluster by size with 10% tolerance
        sizes = sorted({h['size'] for h in unassigned}, reverse=True)
        clusters = []

        for size in sizes:
            found_cluster = False
            for cluster in clusters:
                if abs(size - cluster['size']) <= max(size, cluster['size']) * 0.1:
                    cluster['headers'].extend([h for h in unassigned if abs(h['size'] - size) <= size * 0.1])
                    found_cluster = True
                    break
            if not found_cluster:
                clusters.append({
                    'size': size,
                    'headers': [h for h in unassigned if abs(h['size'] - size) <= size * 0.1]
                })

        # Assign levels starting from 1, largest size first
        clusters.sort(key=lambda x: -x['size'])
        for i, cluster in enumerate(clusters):
            for h in cluster['headers']:
                base_level = i + 1
                if h['bold']:
                    base_level = max(1, base_level - 1)
                h['level'] = base_level

    # Step 6: Build the hierarchy
    root = []
    stack = []

    # Normalized texts of the unique level 0 headers
    unique_level0_texts = {h['norm_text'] for h in unique_level0}

    # Promote in-body duplicates of the unique level 0 headers to level 0
    filtered_headers = []
    for h in headers:
        if h['norm_text'] in unique_level0_texts and h not in unique_level0:
            h['level'] = 0
        filtered_headers.append(h)

    # Combine all headers - unique_level0 first, then the rest
    all_headers = unique_level0 + filtered_headers
    all_headers.sort(key=lambda h: (h['page'], h['y']))

    # Track which level 0 headers have already been added
    added_level0 = set()

    for header in all_headers:
        if header['level'] < 0:
            continue

        if header['level'] == 0:
            norm_text = header['norm_text']
            if norm_text in added_level0:
                continue
            added_level0.add(norm_text)

        # Pop the stack until we find a parent
        while stack and stack[-1]['level'] >= header['level']:
            stack.pop()

        current_parent = stack[-1] if stack else None

        if current_parent:
            current_parent['children'].append(header)
        else:
            root.append(header)

        stack.append(header)

    # Step 7: Enforce proper nesting
    def enforce_nesting(node_list, parent_level=-1):
        for node in node_list:
            if node['level'] <= parent_level:
                node['level'] = parent_level + 1
            enforce_nesting(node['children'], node['level'])

    enforce_nesting(root)
    root = [h for h in root if not (h['level'] == 0 and not h['children'])]
    return root

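# The returned hierarchy is a list of nested header dicts, e.g. (illustrative):
#   [{'text': 'preliminaries', 'level': 0, 'children': [
#       {'text': 'scope of works', 'level': 1, 'children': []}]}]
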
def adjust_levels_if_level0_not_in_toc(doc, toc_pages, root):
    def normalize(text):
        return re.sub(r'\s+', ' ', text.strip().lower())

    toc_text = ""
    for pno in toc_pages:
        page = doc.load_page(pno)
        toc_text += page.get_text()
    toc_text_normalized = normalize(toc_text)

    def is_level0_in_toc_text(header):
        return header['level'] == 0 and normalize(header['text']) in toc_text_normalized

    if any(is_level0_in_toc_text(h) for h in root):
        return  # No change needed

    def increase_levels(node_list):
        for node in node_list:
            node['level'] += 1
            increase_levels(node['children'])

    # No level 0 header appears in the TOC, so demote everything by one level
    increase_levels(root)

def assign_numbers_to_headers(headers, prefix=None):
    for idx, header in enumerate(headers, 1):
        current_number = f"{prefix}.{idx}" if prefix else str(idx)
        header["number"] = current_number
        assign_numbers_to_headers(header["children"], current_number)

def print_tree_with_numbers(headers, indent=0):
    for header in headers:
        size_info = f"size:{header['original_size']:.1f}" if 'original_size' in header else ""
        print(" " * indent +
              f"{header.get('number', '?')} {header['text']} " +
              f"(Level {header['level']}, p:{header['page'] + 1}, {size_info})")
        print_tree_with_numbers(header["children"], indent + 1)

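# Typical wiring for the two helpers above (sketch; assumes `hierarchy` came
# from build_header_hierarchy):
#   assign_numbers_to_headers(hierarchy)
#   print_tree_with_numbers(hierarchy)
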
def highlight_boxes(doc, highlights):
    for page_num, bbox in highlights.items():
        page = doc.load_page(page_num)
        page_width = page.rect.width
        rect = fitz.Rect(bbox)
        # Keep the original bounding box for measurements
        orig_rect = fitz.Rect(bbox)
        rect_width = orig_rect.width
        rect_height = orig_rect.height
        if rect_width > 10:
            annot = page.add_rect_annot(rect)
            annot.set_colors(stroke=(1, 1, 0), fill=(1, 1, 0))
            annot.set_opacity(0.3)
            annot.update()

            # New x coordinates so the rect is centered on the page width
            center_x = page_width / 2
            new_x0 = center_x - rect_width / 2
            new_x1 = center_x + rect_width / 2

            # Rect centered on the page's x-center, preserving y-coordinates
            new_rect = fitz.Rect(new_x0, orig_rect.y0, new_x1, orig_rect.y1)

            # Add the centered text annotation
            text = "[To be billed]"
            annot1 = page.add_freetext_annot(
                new_rect,
                text,
                fontsize=15,
                fontname='helv',
                text_color=(1, 0, 0),
                rotate=page.rotation,
                align=1  # centered alignment
            )
            annot1.update()

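# Minimal, self-contained sketch of highlight_boxes() on a throwaway document
# (the page content and bbox are made up for the example):
#   demo = fitz.open()
#   demo_page = demo.new_page()
#   demo_page.insert_text((72, 120), "Sample section body")
#   highlight_boxes(demo, {0: [70, 100, 300, 130]})
#   demo.save("demo_highlight.pdf")
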
def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
    if path is None:
        path = []
    if output is None:
        output = []
    for header in listtoloop:
        current_path = path + [header['text']]
        if not header['children']:
            if header['level'] != 0 and header['level'] != 1:
                output.append((header, current_path))
        else:
            get_leaf_headers_with_paths(header['children'], current_path, output)
    return output

def words_match_ratio(text1, text2):
    words1 = set(text1.split())
    words2 = set(text2.split())
    if not words1 or not words2:
        return 0.0
    common_words = words1 & words2
    return len(common_words) / len(words1)

def same_start_word(s1, s2):
    # Split both strings into words
    words1 = s1.strip().split()
    words2 = s2.strip().split()

    # Both must have at least one word; compare the first ones
    if words1 and words2:
        return words1[0].lower() == words2[0].lower()
    return False

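# Worked examples (illustrative):
#   >>> words_match_ratio("fire protection systems", "fire systems on level 2")
#   0.6666...   # 2 of the 3 words in text1 also occur in text2
#   >>> same_start_word("Pump House Specification", "pump schedule")
#   True
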
baselink = 'https://marthee-nbslink.hf.space/view-pdf?'

def extract_section_under_header(pdf_path):
    top_margin = 70
    bottom_margin = 50
    headertoContinue1 = False
    headertoContinue2 = False

    # Dropbox share links need dl=1 to return the raw file
    if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
        pdf_path = pdf_path.replace('dl=0', 'dl=1')

    response = requests.get(pdf_path)
    if not response.content:
        raise ValueError("No valid PDF content found.")
    pdf_content = BytesIO(response.content)

    doc = fitz.open(stream=pdf_content, filetype="pdf")
    docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
    most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)

    # Precompute regex patterns
    dot_pattern = re.compile(r'\.{3,}')
    url_pattern = re.compile(r'https?://\S+|www\.\S+')

    def get_toc_page_numbers(doc, max_pages_to_check=15):
        toc_pages = []
        for page_num in range(min(len(doc), max_pages_to_check)):
            page = doc.load_page(page_num)
            blocks = page.get_text("dict")["blocks"]

            dot_line_count = 0
            for block in blocks:
                for line in block.get("lines", []):
                    line_text = get_spaced_text_from_spans(line["spans"]).strip()
                    if dot_pattern.search(line_text):
                        dot_line_count += 1

            if dot_line_count >= 3:
                toc_pages.append(page_num)

        return list(range(0, toc_pages[-1] + 1)) if toc_pages else toc_pages

    toc_pages = get_toc_page_numbers(doc)

    headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
        doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
    )

    hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
    listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
    print('listofHeaderstoMarkup', listofHeaderstoMarkup)

    # Precompute all children headers once (set for fast membership tests)
    allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
    allchildrenheaders_set = set(allchildrenheaders)

    # Currently unused; kept for the intended export layout
    df = pd.DataFrame(columns=["NBSLink", "Subject", "Page", "Author", "Creation Date", "Layer", "Code", "head above 1", "head above 2"])
    dictionaryNBS = {}
    data_list_JSON = []

    if len(top_3_font_sizes) == 3:
        mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes
    elif len(top_3_font_sizes) == 2:
        mainHeaderFontSize = top_3_font_sizes[0]
        subHeaderFontSize = top_3_font_sizes[1]
        subsubheaderFontSize = top_3_font_sizes[1]

    print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)

    for heading_to_searchDict, paths in listofHeaderstoMarkup:
        heading_to_search = heading_to_searchDict['text']
        heading_to_searchPageNum = heading_to_searchDict['page']

        print('headertosearch', heading_to_search)

        # Per-heading state
        headertoContinue1 = False
        headertoContinue2 = False
        matched_header_line = None
        done = False
        collecting = False
        collected_lines = []
        page_highlights = {}
        current_bbox = {}
        last_y1s = {}
        mainHeader = ''
        subHeader = ''
        matched_header_line_norm = heading_to_search
        break_collecting = False
        heading_norm = normalize_text(heading_to_search)
        paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []

        for page_num in range(heading_to_searchPageNum, len(doc)):
            if page_num in toc_pages:
                continue
            if break_collecting:
                break
            page = doc[page_num]
            page_height = page.rect.height
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                if break_collecting:
                    break

                lines = block.get("lines", [])
                i = 0
                while i < len(lines):
                    if break_collecting:
                        break

                    spans = lines[i].get("spans", [])
                    if not spans:
                        i += 1
                        continue

                    y0 = spans[0]["bbox"][1]
                    y1 = spans[0]["bbox"][3]
                    if y0 < top_margin or y1 > (page_height - bottom_margin):
                        i += 1
                        continue

                    line_text = get_spaced_text_from_spans(spans).lower()
                    line_text_norm = normalize_text(line_text)

                    # Combine with the next line if available
                    if i + 1 < len(lines):
                        next_spans = lines[i + 1].get("spans", [])
                        next_line_text = get_spaced_text_from_spans(next_spans).lower()
                        combined_line_norm = normalize_text(line_text + " " + next_line_text)
                    else:
                        combined_line_norm = line_text_norm

                    # Track which ancestor headers we have passed
                    if combined_line_norm and combined_line_norm in paths[0]:
                        print(combined_line_norm)
                        headertoContinue1 = combined_line_norm
                    if combined_line_norm and combined_line_norm in paths[-2]:
                        print(combined_line_norm)
                        headertoContinue2 = combined_line_norm

                    # Exact match against a known leaf header
                    existsfull = (
                        combined_line_norm in allchildrenheaders_set and
                        heading_to_search in combined_line_norm
                    )

                    # Word-based matching
                    current_line_words = set(combined_line_norm.split())
                    heading_words = set(heading_norm.split())
                    all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0

                    substring_match = (
                        heading_norm in combined_line_norm or
                        combined_line_norm in heading_norm or
                        all_words_match
                    )

                    if (substring_match and existsfull and not collecting and
                            len(combined_line_norm) > 0):

                        # Header spans smaller than the main header size
                        header_spans = [
                            span for span in spans
                            if (is_header(span, most_common_font_size, most_common_color, most_common_font)
                                and span['size'] < mainHeaderFontSize)
                        ]
                        if header_spans:
                            collecting = True
                            matched_header_font_size = max(span["size"] for span in header_spans)
                            print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")

                            collected_lines.append(line_text)
                            valid_spans = [span for span in spans if span.get("bbox")]

                            if valid_spans:
                                x0s = [span["bbox"][0] for span in valid_spans]
                                x1s = [span["bbox"][2] for span in valid_spans]
                                y0s = [span["bbox"][1] for span in valid_spans]
                                y1s = [span["bbox"][3] for span in valid_spans]

                                header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]

                                if page_num in current_bbox:
                                    cb = current_bbox[page_num]
                                    current_bbox[page_num] = [
                                        min(cb[0], header_bbox[0]),
                                        min(cb[1], header_bbox[1]),
                                        max(cb[2], header_bbox[2]),
                                        max(cb[3], header_bbox[3])
                                    ]
                                else:
                                    current_bbox[page_num] = header_bbox
                                last_y1s[page_num] = header_bbox[3]
                                x0, y0, x1, y1 = header_bbox

                                zoom = 200
                                left = int(x0)
                                top = int(y0)
                                zoom_str = f"{zoom},{left},{top}"
                                pageNumberFound = page_num + 1

                                # Build and URL-encode the query parameters
                                params = {
                                    'pdfLink': pdf_path,
                                    'keyword': heading_to_search,
                                }
                                encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
                                encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])

                                # Final URL with page and zoom fragment
                                final_url = f"{baselink}{encoded_link}#page={pageNumberFound}&zoom={zoom_str}"

                                # Timestamp for the markup record
                                now = datetime.now()
                                formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")

                                data_entry = {
                                    "NBSLink": final_url,
                                    "Subject": heading_to_search,
                                    "Page": str(pageNumberFound),
                                    "Author": "ADR",
                                    "Creation Date": formatted_time,
                                    "Layer": "Initial",
                                    "Code": "to be added",
                                    "head above 1": paths[-2],
                                    "head above 2": paths[0]
                                }
                                data_list_JSON.append(data_entry)

                                print("Final URL:", final_url)
                            i += 2
                            continue
                    else:
                        if (substring_match and not collecting and
                                len(combined_line_norm) > 0):

                            # Share of the heading's words present in this line
                            word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100

                            # Require every heading word to be present in this line
                            meets_word_threshold = word_match_percent >= 100

                            header_spans = [
                                span for span in spans
                                if (is_header(span, most_common_font_size, most_common_color, most_common_font)
                                    and span['size'] < mainHeaderFontSize)
                            ]

                            if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm)):
                                collecting = True
                                matched_header_font_size = max(span["size"] for span in header_spans)
                                print(f"📥 Start collecting after header: {combined_line_norm} "
                                      f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")

                                collected_lines.append(line_text)
                                valid_spans = [span for span in spans if span.get("bbox")]

                                if valid_spans:
                                    x0s = [span["bbox"][0] for span in valid_spans]
                                    x1s = [span["bbox"][2] for span in valid_spans]
                                    y0s = [span["bbox"][1] for span in valid_spans]
                                    y1s = [span["bbox"][3] for span in valid_spans]

                                    header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]

                                    if page_num in current_bbox:
                                        cb = current_bbox[page_num]
                                        current_bbox[page_num] = [
                                            min(cb[0], header_bbox[0]),
                                            min(cb[1], header_bbox[1]),
                                            max(cb[2], header_bbox[2]),
                                            max(cb[3], header_bbox[3])
                                        ]
                                    else:
                                        current_bbox[page_num] = header_bbox

                                    last_y1s[page_num] = header_bbox[3]
                                i += 2
                                continue

                    if collecting:
                        norm_line = normalize_text(line_text)

                        # URLs are never treated as headers
                        if url_pattern.match(norm_line):
                            line_is_header = False
                        else:
                            line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans)

                        if line_is_header:
                            header_font_size = max(span["size"] for span in spans)
                            is_probably_real_header = (
                                header_font_size >= matched_header_font_size and
                                is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
                                len(line_text.strip()) > 2
                            )

                            if (norm_line != matched_header_line_norm and
                                    norm_line != heading_norm and
                                    is_probably_real_header):
                                if line_text not in heading_norm:
                                    print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
                                    collecting = False
                                    done = True
                                    headertoContinue1 = False
                                    headertoContinue2 = False
                                    for p_num, bbox in current_bbox.items():
                                        bbox[3] = last_y1s.get(p_num, bbox[3])
                                        page_highlights[p_num] = bbox
                                    highlight_boxes(docHighlights, page_highlights)

                                    break_collecting = True
                                    break

                        if break_collecting:
                            break

                        collected_lines.append(line_text)
                        valid_spans = [span for span in spans if span.get("bbox")]
                        if valid_spans:
                            x0s = [span["bbox"][0] for span in valid_spans]
                            x1s = [span["bbox"][2] for span in valid_spans]
                            y0s = [span["bbox"][1] for span in valid_spans]
                            y1s = [span["bbox"][3] for span in valid_spans]

                            line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]

                            if page_num in current_bbox:
                                cb = current_bbox[page_num]
                                current_bbox[page_num] = [
                                    min(cb[0], line_bbox[0]),
                                    min(cb[1], line_bbox[1]),
                                    max(cb[2], line_bbox[2]),
                                    max(cb[3], line_bbox[3])
                                ]
                            else:
                                current_bbox[page_num] = line_bbox

                            last_y1s[page_num] = line_bbox[3]
                    i += 1

        if not done:
            for p_num, bbox in current_bbox.items():
                bbox[3] = last_y1s.get(p_num, bbox[3])
                page_highlights[p_num] = bbox
            highlight_boxes(docHighlights, page_highlights)

    docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)
    # Serialize the collected markups once, after all headings are processed
    json_output = json.dumps(data_list_JSON, indent=4)
    return json_output

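# Example of the link format produced above (illustrative values):
#   https://marthee-nbslink.hf.space/view-pdf?pdfLink=<encoded-pdf-url>&keyword=<encoded-heading>#page=12&zoom=200,56,340
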
pdflink = 'https://www.dropbox.com/scl/fi/jtffcxszwpcnc6wdo61p6/WH007-JAC-RP-XX-SP-AA-8501-Redoak-Pump-House-Specification.pdf?rlkey=unq4ag9eajezv2j6y6ewkkk5u&e=29&st=wu3vsd70&dl=0'

jsonOutput = extract_section_under_header(pdflink)
print(jsonOutput)
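# The returned JSON is a list of record dicts, so it can be loaded back into
# the DataFrame layout declared above, e.g. (sketch):
#   pd.DataFrame(json.loads(jsonOutput))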