Marthee committed on
Commit
ba3dba7
·
verified ·
1 Parent(s): e11dcfc

Upload InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +1019 -0
InitialMarkups.py ADDED
@@ -0,0 +1,1019 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Copy of FindSpecsTrial(Retrieving+boundingBoxes)-InitialMarkups(ALL)_CleanedUp.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/12XfVkmKmN3oVjHhLVE0_GgkftgArFEK2
8
+ """
9
# Base URL of the PDF-viewer endpoint; query parameters (pdfLink, keyword,
# page, zoom) are appended to this when building markup links.
baselink='https://findconsole-initialmarkups.hf.space/view-pdf?'
10
+
11
+
12
+
13
+ from io import BytesIO
14
+ import re
15
+ import requests
16
+ import pandas as pd
17
+ import fitz # PyMuPDF
18
+ import re
19
+ import urllib.parse
20
+ import pandas as pd
21
+ import math
22
+ import random
23
+ import json
24
+ from datetime import datetime
25
+ from collections import defaultdict, Counter
26
+ import difflib
27
+ from fuzzywuzzy import fuzz
28
+
29
def get_regular_font_size_and_color(doc):
    """Determine the dominant body-text style of a PyMuPDF document.

    Walks every text span on every page and tallies font size, color and
    font name. The most frequent value of each is taken to be the "regular"
    body style, which later serves as the baseline for header detection.

    Returns:
        (size, color, font) — each is None when the document has no spans.
    """
    size_counts = Counter()
    color_counts = Counter()
    font_counts = Counter()

    for page_index in range(len(doc)):
        page = doc.load_page(page_index)
        for block in page.get_text("dict")["blocks"]:
            # Image blocks carry no "lines" key; skip them.
            if "lines" not in block:
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    size_counts[span['size']] += 1
                    color_counts[span['color']] += 1
                    font_counts[span['font']] += 1

    def _dominant(counter):
        # most_common(1) -> [(value, count)]; empty counter -> None
        return counter.most_common(1)[0][0] if counter else None

    return _dominant(size_counts), _dominant(color_counts), _dominant(font_counts)
51
+
52
def normalize_text(text):
    """Lower-case *text* and collapse every whitespace run to one space.

    None is treated as an empty string.
    """
    if text is None:
        return ""
    # str.split() with no args splits on any whitespace run and drops
    # leading/trailing whitespace, so join+lower matches the old
    # strip + re.sub(r'\s+', ' ') behaviour.
    return " ".join(text.split()).lower()
56
+
57
def get_spaced_text_from_spans(spans):
    """Join the stripped text of each span with single spaces, normalized."""
    pieces = [span["text"].strip() for span in spans]
    return normalize_text(" ".join(pieces))
59
+
60
def is_header(span, most_common_font_size, most_common_color, most_common_font):
    """Decide whether a text span looks like a heading.

    A span counts as a header when it is larger than the body-text baseline,
    uses a different font face, or is bold (by font name or span flag).
    """
    face = span.get("font", "").lower()
    bold = "bold" in face or span.get("bold", False)
    if span["size"] > most_common_font_size:
        return True
    if span["font"].lower() != most_common_font.lower():
        return True
    return bold
71
+
72
def add_span_to_nearest_group(span_y, grouped_dict, pageNum=None, threshold=0.5):
    """Return the existing (page, y) group key nearest to *span_y*.

    Only keys on *pageNum* are considered (all pages when pageNum is None).
    If no key lies within *threshold* of span_y, a fresh (pageNum, span_y)
    key is returned instead.
    """
    same_page_keys = (
        key for key in grouped_dict
        if pageNum is None or key[0] == pageNum
    )
    for page, group_y in same_page_keys:
        if abs(group_y - span_y) <= threshold:
            return (page, group_y)
    return (pageNum, span_y)
79
+
80
def extract_headers(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin):
    """Find header-styled spans in *doc* and merge adjacent lines into headers.

    Args:
        doc: open PyMuPDF document.
        toc_pages: page indices to skip (table-of-contents pages).
        most_common_font_size, most_common_color, most_common_font:
            body-text baseline from get_regular_font_size_and_color();
            is_header() flags anything larger/bolder/other-font than this.
        top_margin, bottom_margin: vertical bands (points) excluded from the
            scan so running headers/footers are ignored.

    Returns:
        Tuple (headers, top_3_font_sizes, smallest_font_size, spans):
        headers — list of [text, size, page_index, y0] entries;
        top_3_font_sizes — up to three largest sizes seen on >= 3 headers
            (second size duplicated when exactly two qualify);
        smallest_font_size — smallest qualifying size, or None;
        spans — flat list of every raw span dict used in a header.
    """
    print("Font baseline:", most_common_font_size, most_common_color, most_common_font)

    grouped_headers = defaultdict(list)  # (page, y0) -> merged header entries
    spans = []                           # every raw span that became header text
    line_merge_threshold = 1.5 # Maximum vertical distance between lines to consider as part of same header

    for pageNum in range(len(doc)):
        if pageNum in toc_pages:
            continue
        page = doc.load_page(pageNum)
        page_height = page.rect.height
        text_instances = page.get_text("dict")

        # First pass: collect all potential header spans
        potential_header_spans = []
        for block in text_instances['blocks']:
            # type 0 == text block; skip images etc.
            if block['type'] != 0:
                continue

            for line in block['lines']:
                for span in line['spans']:
                    span_y0 = span['bbox'][1]
                    span_y1 = span['bbox'][3]

                    # Skip anything inside the header/footer margin bands.
                    if span_y0 < top_margin or span_y1 > (page_height - bottom_margin):
                        continue

                    span_text = normalize_text(span.get('text', ''))
                    if not span_text:
                        continue
                    if span_text.startswith('http://www') or span_text.startswith('www'):
                        continue
                    # Reject boilerplate: page markers, punctuation-only
                    # lines, section footers, dates, spec labels.
                    if any((
                        'page' in span_text,
                        not re.search(r'[a-z0-9]', span_text),
                        'end of section' in span_text,
                        re.search(r'page\s+\d+\s+of\s+\d+', span_text),
                        re.search(r'\b(?:\d{1,2}[/-])?\d{1,2}[/-]\d{2,4}\b', span_text),
                        # re.search(r'\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)', span_text),
                        'specification:' in span_text
                    )):
                        continue

                    # Strip TOC-style dot/dash leaders and anything after them.
                    cleaned_text = re.sub(r'[.\-]{4,}.*$', '', span_text).strip()
                    cleaned_text = normalize_text(cleaned_text)

                    if is_header(span, most_common_font_size, most_common_color, most_common_font):
                        potential_header_spans.append({
                            'text': cleaned_text,
                            'size': span['size'],
                            'pageNum': pageNum,
                            'y0': span_y0,
                            'y1': span_y1,
                            'x0': span['bbox'][0],
                            'x1': span['bbox'][2],
                            'span': span
                        })

        # Sort spans by vertical position (top to bottom)
        potential_header_spans.sort(key=lambda s: (s['pageNum'], s['y0']))

        # Second pass: group spans that are vertically close and likely part of same header
        i = 0
        while i < len(potential_header_spans):
            current = potential_header_spans[i]
            header_text = current['text']
            header_size = current['size']
            header_page = current['pageNum']
            min_y = current['y0']
            max_y = current['y1']
            spans_group = [current['span']]

            # Look ahead to find adjacent lines that might be part of same header
            j = i + 1
            while j < len(potential_header_spans):
                next_span = potential_header_spans[j]
                # Check if on same page and vertically close with similar styling
                if (next_span['pageNum'] == header_page and
                    next_span['y0'] - max_y < line_merge_threshold and
                    abs(next_span['size'] - header_size) < 0.5):
                    header_text += " " + next_span['text']
                    max_y = next_span['y1']
                    spans_group.append(next_span['span'])
                    j += 1
                else:
                    break

            # Add the merged header, keyed by its first line's position.
            grouped_headers[(header_page, min_y)].append({
                "text": header_text.strip(),
                "size": header_size,
                "pageNum": header_page,
                "spans": spans_group
            })
            spans.extend(spans_group)
            i = j  # Skip the spans we've already processed

    # Prepare final headers list. NOTE(review): each entry has only 4 fields
    # [text, size, page, y]; downstream code probes h[4]/h[6]/h[7] with
    # length guards, so those always fall back to defaults.
    headers = []
    for (pageNum, y), header_groups in sorted(grouped_headers.items()):
        for group in header_groups:
            headers.append([
                group['text'],
                group['size'],
                group['pageNum'],
                y
            ])

    font_sizes = [size for _, size, _, _ in headers]
    font_size_counts = Counter(font_sizes)

    # Filter font sizes that appear at least 3 times
    valid_font_sizes = [size for size, count in font_size_counts.items() if count >= 3]

    # Sort in descending order
    valid_font_sizes_sorted = sorted(valid_font_sizes, reverse=True)

    # If only 2 sizes, repeat the second one
    if len(valid_font_sizes_sorted) == 2:
        top_3_font_sizes = [valid_font_sizes_sorted[0], valid_font_sizes_sorted[1], valid_font_sizes_sorted[1]]
    else:
        top_3_font_sizes = valid_font_sizes_sorted[:3]

    # Get the smallest font size among valid ones
    smallest_font_size = min(valid_font_sizes) if valid_font_sizes else None
    return headers, top_3_font_sizes, smallest_font_size, spans
207
+
208
def is_numbered(text):
    """True when the stripped text begins with a digit (e.g. '3.1 Scope')."""
    return re.match(r'\d', text.strip()) is not None
210
+
211
def is_similar(a, b, threshold=0.85):
    """True when the SequenceMatcher ratio of *a* and *b* exceeds *threshold*."""
    ratio = difflib.SequenceMatcher(None, a, b).ratio()
    return ratio > threshold
213
+
214
def normalize(text):
    """Lower-case, drop dot leaders (runs of 2+ dots), collapse whitespace."""
    lowered = text.lower()
    without_leaders = re.sub(r'\.{2,}', '', lowered)   # remove long dots
    collapsed = re.sub(r'\s+', ' ', without_leaders)   # one space per run
    return collapsed.strip()
219
+
220
def clean_toc_entry(toc_text):
    """Strip trailing dot leaders and the page number from a TOC line.

    Everything from the first run of dots/whitespace followed by digits to
    the end of the line is discarded, then residual dots/spaces are trimmed.
    """
    without_page_number = re.sub(r'[\.\s]+\d+.*$', '', toc_text)
    return without_page_number.strip('. ')
224
+
225
def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin=70, bottom_margin=70):
    """Build a nested header tree for *doc* from extracted header spans.

    Pipeline: extract headers -> filter by margins/size -> assign levels
    (TOC-matched headers become level 0, consecutive/size-clustered headers
    get 1+) -> link parents and children via a stack -> enforce strictly
    increasing nesting. Each node is a dict with keys including 'text',
    'page', 'y', 'size', 'level' and 'children'.

    Returns the list of root nodes (level-0 headers that have children).
    """
    # Extract headers with margin handling
    headers_list, top_3_font_sizes, smallest_font_size, spans = extract_headers(
        doc,
        toc_pages=toc_pages,
        most_common_font_size=most_common_font_size,
        most_common_color=most_common_color,
        most_common_font=most_common_font,
        top_margin=top_margin,
        bottom_margin=bottom_margin
    )

    # Step 1: Collect and filter potential headers
    headers = []
    seen_headers = set()  # NOTE(review): never used below

    # First extract TOC entries to get exact level 0 header texts
    toc_entries = {}  # normalized TOC line -> original TOC line
    for pno in toc_pages:
        page = doc.load_page(pno)
        toc_text = page.get_text()
        for line in toc_text.split('\n'):
            clean_line = line.strip()
            if clean_line:
                norm_line = normalize(clean_line)
                toc_entries[norm_line] = clean_line # Store original text

    for h in headers_list:
        text, size, pageNum, y = h[:4]
        page = doc.load_page(pageNum)
        page_height = page.rect.height

        # Skip margin areas
        if y < top_margin or y > (page_height - bottom_margin):
            continue

        norm_text = normalize(text)
        # Keep only non-trivial headers at least as large as body text.
        if len(norm_text) > 2 and size >= most_common_font_size:
            headers.append({
                "text": text,
                "page": pageNum,
                "y": y,
                "size": size,
                # NOTE(review): extract_headers emits 4-element rows, so the
                # h[4]/h[6]/h[7] probes below always take their defaults.
                "bold": h[4] if len(h) > 4 else False,
                # "italic": h[5] if len(h) > 5 else False,
                "color": h[6] if len(h) > 6 else None,
                "font": h[7] if len(h) > 7 else None,
                "children": [],
                "is_numbered": is_numbered(text),
                "original_size": size,
                "norm_text": norm_text,
                "level": -1 # Initialize as unassigned
            })

    # Sort by page and vertical position
    headers.sort(key=lambda h: (h['page'], h['y']))
    # Step 2: Detect consecutive headers and assign levels
    i = 0
    while i < len(headers) - 1:
        current = headers[i]
        next_header = headers[i+1]

        # Check if they are on the same page and very close vertically (likely consecutive lines)
        if (current['page'] == next_header['page'] and
            abs(current['y'] - next_header['y']) < 20): # 20pt threshold for "same line"

            # Case 1: Both unassigned - make current level 1 and next level 2
            if current['level'] == -1 and next_header['level'] == -1:
                current['level'] = 1
                next_header['level'] = 2
                i += 1 # Skip next header since we processed it

            # Case 2: Current unassigned, next assigned - make current one level above
            elif current['level'] == -1 and next_header['level'] != -1:
                current['level'] = max(1, next_header['level'] - 1)

            # Case 3: Current assigned, next unassigned - make next one level below
            elif current['level'] != -1 and next_header['level'] == -1:
                next_header['level'] = current['level'] + 1
                i += 1 # Skip next header since we processed it
        i += 1
    # Step 2: Identify level 0 headers (largest and in TOC)
    # max_size = max(h['size'] for h in headers) if headers else 0
    max_size,subheaderSize,nbsheadersize=top_3_font_sizes
    print(max_size)
    toc_text_match=[]
    # Improved TOC matching with exact and substring matching
    toc_matches = []
    # NOTE(review): headers.remove(h) below mutates the list being iterated,
    # which silently skips the element after each removal — confirm intended.
    for h in headers:
        norm_text = h['norm_text']
        matching_toc_texts = []

        # Check both exact matches and substring matches
        for toc_norm, toc_text in toc_entries.items():
            # Exact match case
            if norm_text == toc_norm and len(toc_text)>4 and h['size']==max_size:
                matching_toc_texts.append(toc_text)
            # Substring match case (header is substring of TOC entry)
            elif norm_text in toc_norm and len(toc_text)>4 and h['size']==max_size:
                matching_toc_texts.append(toc_text)
            # Substring match case (TOC entry is substring of header)
            elif toc_norm in norm_text and len(toc_text)>4 and h['size']==max_size:
                matching_toc_texts.append(toc_text)

        if matching_toc_texts and h['size'] >= max_size * 0.9:
            # Prefer the longest TOC line that shares the most text with the header.
            best_match = max(matching_toc_texts,
                key=lambda x: (len(x), -len(x.replace(norm_text, ''))))
            h['text'] = normalize_text(clean_toc_entry(best_match))
            h['level'] = 0
            if h['text'] not in toc_text_match:
                toc_matches.append(h)
                toc_text_match.append(h['text'])
        elif matching_toc_texts and h['size'] < max_size * 0.9 and h['size'] > nbsheadersize : # h['size'] < max_size * 0.9 and h['size'] > max_size*0.75:
            print(h['text'],matching_toc_texts)
            headers.remove(h)
            continue


    # Remove duplicates - keep only first occurrence of each level 0 header
    unique_level0 = []
    seen_level0 = set()
    for h in toc_matches:
        # Use the cleaned text for duplicate checking
        cleaned_text = clean_toc_entry(h['text'])
        norm_cleaned_text = normalize(cleaned_text)

        if norm_cleaned_text not in seen_level0:
            seen_level0.add(norm_cleaned_text)
            # Update the header text with cleaned version
            h['text'] = cleaned_text
            unique_level0.append(h)
            print(f"Added unique header: {cleaned_text} (normalized: {norm_cleaned_text})")

    # Step 3: Process headers under each level 0 to identify level 1 format

    # First, group headers by their level 0 parent
    level0_headers = [h for h in headers if h['level'] == 0]
    header_groups = []

    for i, level0 in enumerate(level0_headers):
        start_idx = headers.index(level0)
        end_idx = headers.index(level0_headers[i+1]) if i+1 < len(level0_headers) else len(headers)
        group = headers[start_idx:end_idx]
        header_groups.append(group)

    # Now process each group to identify level 1 format
    for group in header_groups:
        level0 = group[0]
        level1_candidates = [h for h in group[1:] if h['level'] == -1]

        if not level1_candidates:
            continue

        # The first candidate is our reference level 1
        first_level1 = level1_candidates[0]
        level1_format = {
            'font': first_level1['font'],
            'color': first_level1['color'],
            'starts_with_number': is_numbered(first_level1['text']),
            'size': first_level1['size'],
            'bold': first_level1['bold']
            # 'italic': first_level1['italic']
        }

        # Assign levels based on the reference format
        for h in level1_candidates:
            current_format = {
                'font': h['font'],
                'color': h['color'],
                'starts_with_number': is_numbered(h['text']),
                'size': h['size'],
                'bold': h['bold']
                # 'italic': h['italic']
            }

            # Compare with level1 format
            if (current_format['font'] == level1_format['font'] and
                current_format['color'] == level1_format['color'] and
                current_format['starts_with_number'] == level1_format['starts_with_number'] and
                abs(current_format['size'] - level1_format['size']) <= 0.1 and
                current_format['bold'] == level1_format['bold'] ): #and
                # current_format['italic'] == level1_format['italic']):
                h['level'] = 1
            else:
                h['level'] = 2

    # Step 4: Assign levels to remaining unassigned headers
    unassigned = [h for h in headers if h['level'] == -1]
    if unassigned:
        # Cluster by size with tolerance
        sizes = sorted({h['size'] for h in unassigned}, reverse=True)
        clusters = []

        for size in sizes:
            found_cluster = False
            for cluster in clusters:
                # 10% tolerance merges near-identical sizes into one cluster.
                if abs(size - cluster['size']) <= max(size, cluster['size']) * 0.1:
                    cluster['headers'].extend([h for h in unassigned if abs(h['size'] - size) <= size * 0.1])
                    found_cluster = True
                    break
            if not found_cluster:
                clusters.append({
                    'size': size,
                    'headers': [h for h in unassigned if abs(h['size'] - size) <= size * 0.1]
                })

        # Assign levels starting from 1
        clusters.sort(key=lambda x: -x['size'])
        for i, cluster in enumerate(clusters):
            for h in cluster['headers']:
                base_level = i + 1
                # Bold headers are promoted one level (never above 1).
                if h['bold']:
                    base_level = max(1, base_level - 1)
                h['level'] = base_level

    # Step 5: Build hierarchy
    root = []
    stack = []

    # Create a set of normalized texts from unique_level0 to avoid duplicates
    unique_level0_texts = {h['norm_text'] for h in unique_level0}

    # Promote any remaining copies of unique_level0 headers to level 0;
    # the added_level0 set below deduplicates them during tree building.
    filtered_headers = []
    for h in headers:
        if h['norm_text'] in unique_level0_texts and h not in unique_level0:
            h['level'] = 0
        filtered_headers.append(h)

    # Combine all headers - unique_level0 first, then the filtered headers
    all_headers = unique_level0 + filtered_headers
    all_headers.sort(key=lambda h: (h['page'], h['y']))

    # Track which level 0 headers we've already added
    added_level0 = set()

    for header in all_headers:
        if header['level'] < 0:
            continue

        if header['level'] == 0:
            norm_text = header['norm_text']
            if norm_text in added_level0:
                continue
            added_level0.add(norm_text)

        # Pop stack until we find a parent
        while stack and stack[-1]['level'] >= header['level']:
            stack.pop()

        current_parent = stack[-1] if stack else None

        if current_parent:
            current_parent['children'].append(header)
        else:
            root.append(header)

        stack.append(header)

    # Step 6: Enforce proper nesting (child level strictly > parent level)
    def enforce_nesting(node_list, parent_level=-1):
        for node in node_list:
            if node['level'] <= parent_level:
                node['level'] = parent_level + 1
            enforce_nesting(node['children'], node['level'])

    enforce_nesting(root)
    # Drop childless level-0 roots (TOC artifacts with no content headers).
    root = [h for h in root if not (h['level'] == 0 and not h['children'])]
    return root
494
+
495
def adjust_levels_if_level0_not_in_toc(doc, toc_pages, root):
    """Demote every header by one level when no level-0 header is in the TOC.

    If at least one level-0 node in *root* appears verbatim (after whitespace
    normalization) in the text of the TOC pages, the hierarchy is trusted and
    left untouched. Otherwise every node's 'level' is incremented in place.

    Args:
        doc: open PyMuPDF document.
        toc_pages: iterable of TOC page indices.
        root: list of header-tree nodes (dicts with 'text', 'level', 'children').

    Returns:
        None; mutates *root* in place.
    """
    def _normalize(text):
        return re.sub(r'\s+', ' ', text.strip().lower())

    # Concatenate the raw text of all TOC pages for substring matching.
    toc_text = ""
    for pno in toc_pages:
        page = doc.load_page(pno)
        toc_text += page.get_text()
    toc_text_normalized = _normalize(toc_text)

    def _is_level0_in_toc_text(header):
        return header['level'] == 0 and _normalize(header['text']) in toc_text_normalized

    if any(_is_level0_in_toc_text(h) for h in root):
        return  # No change needed

    def _increase_levels(node_list):
        for node in node_list:
            node['level'] += 1
            _increase_levels(node['children'])

    # BUG FIX: the original defined increase_levels but never called it, so
    # the adjustment promised by the function name never happened.
    _increase_levels(root)
515
+
516
def assign_numbers_to_headers(headers, prefix=None):
    """Stamp hierarchical section numbers ('1', '1.2', ...) onto the tree.

    Each node gets a 'number' key; children are numbered recursively with
    the parent's number as prefix. Mutates in place.
    """
    for position, node in enumerate(headers, start=1):
        number = f"{prefix}.{position}" if prefix else str(position)
        node["number"] = number
        assign_numbers_to_headers(node["children"], number)
521
+
522
def print_tree_with_numbers(headers, indent=0):
    """Print the numbered header tree, one line per node, indented by depth."""
    for node in headers:
        size_part = f"size:{node['original_size']:.1f}" if 'original_size' in node else ""
        line = (f"{node.get('number', '?')} {node['text']} "
                f"(Level {node['level']}, p:{node['page']+1}, {size_part})")
        print(" " * indent + line)
        print_tree_with_numbers(node["children"], indent + 1)
529
+
530
+
531
def highlight_boxes(doc, highlights, fixed_width=500):  # Set your desired width here
    """Draw a yellow highlight plus a red '[To be billed]' label on each box.

    Args:
        doc: open PyMuPDF document (mutated in place).
        highlights: mapping of page number -> bbox (x0, y0, x1, y1).
        fixed_width: width of the centered highlight rectangle in points.

    Boxes narrower than 10pt are skipped.
    """
    half_width = fixed_width / 2
    for page_num, bbox in highlights.items():
        page = doc.load_page(page_num)
        source_rect = fitz.Rect(bbox)
        rect_height = source_rect.height

        if source_rect.width <= 10:
            continue

        # Re-center the box horizontally at the fixed width, keeping the
        # original vertical extent.
        mid_x = page.rect.width / 2
        banded_rect = fitz.Rect(mid_x - half_width, source_rect.y0,
                                mid_x + half_width, source_rect.y1)

        # Translucent yellow highlight rectangle.
        highlight = page.add_rect_annot(banded_rect)
        highlight.set_colors(stroke=(1, 1, 0), fill=(1, 1, 0))
        highlight.set_opacity(0.3)
        highlight.update()

        # Right-aligned red label inside the same rectangle.
        label = page.add_freetext_annot(
            banded_rect,
            "[To be billed]",
            fontsize=15,
            fontname='helv',
            text_color=(1, 0, 0),
            rotate=page.rotation,
            align=2  # right alignment
        )
        label.update()
565
+
566
def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
    """Collect (header, path) pairs for every childless header below level 1.

    *path* is the list of ancestor texts (including the leaf's own text).
    Leaves at level 0 or 1 are excluded. Returns the accumulated *output*.
    """
    path = [] if path is None else path
    output = [] if output is None else output
    for node in listtoloop:
        trail = path + [node['text']]
        if node['children']:
            get_leaf_headers_with_paths(node['children'], trail, output)
        elif node['level'] not in (0, 1):
            output.append((node, trail))
    return output
579
+
580
+ # Add this helper function at the top of your code
581
def words_match_ratio(text1, text2):
    """Fraction of *text1*'s distinct words that also occur in *text2*.

    Returns 0.0 when either string has no words.
    """
    words1 = set(text1.split())
    words2 = set(text2.split())
    if words1 and words2:
        return len(words1 & words2) / len(words1)
    return 0.0
588
+
589
def same_start_word(s1, s2):
    """True when both strings have words and share the same (case-insensitive) first word."""
    first_words = s1.strip().split()
    second_words = s2.strip().split()
    return (bool(first_words) and bool(second_words)
            and first_words[0].lower() == second_words[0].lower())
598
+
599
+
600
+ def extract_section_under_header(pdf_path):
601
+ top_margin = 70
602
+ bottom_margin = 50
603
+ headertoContinue1 = False
604
+ headertoContinue2=False
605
+
606
+ # Optimized URL handling
607
+ if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
608
+ pdf_path = pdf_path.replace('dl=0', 'dl=1')
609
+
610
+ # Cache frequently used values
611
+ response = requests.get(pdf_path)
612
+ pdf_content = BytesIO(response.content)
613
+ if not pdf_content:
614
+ raise ValueError("No valid PDF content found.")
615
+
616
+ doc = fitz.open(stream=pdf_content, filetype="pdf")
617
+ docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
618
+ most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
619
+
620
+ # Precompute regex patterns
621
+ dot_pattern = re.compile(r'\.{3,}')
622
+ url_pattern = re.compile(r'https?://\S+|www\.\S+')
623
+
624
+ def get_toc_page_numbers(doc, max_pages_to_check=15):
625
+ toc_pages = []
626
+ for page_num in range(min(len(doc), max_pages_to_check)):
627
+ page = doc.load_page(page_num)
628
+ blocks = page.get_text("dict")["blocks"]
629
+
630
+ dot_line_count = 0
631
+ for block in blocks:
632
+ for line in block.get("lines", []):
633
+ line_text = get_spaced_text_from_spans(line["spans"]).strip()
634
+ if dot_pattern.search(line_text):
635
+ dot_line_count += 1
636
+
637
+ if dot_line_count >= 3:
638
+ toc_pages.append(page_num)
639
+
640
+ return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
641
+
642
+ toc_pages = get_toc_page_numbers(doc)
643
+
644
+ headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
645
+ doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
646
+ )
647
+
648
+ hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
649
+ listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
650
+ print('listofHeaderstoMarkup',listofHeaderstoMarkup)
651
+ # Precompute all children headers once
652
+ allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
653
+ allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
654
+
655
+ df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"])
656
+ dictionaryNBS={}
657
+ data_list_JSON = []
658
+
659
+ if len(top_3_font_sizes)==3:
660
+ mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes
661
+ elif len(top_3_font_sizes)==2:
662
+ mainHeaderFontSize= top_3_font_sizes[0]
663
+ subHeaderFontSize= top_3_font_sizes[1]
664
+ subsubheaderFontSize= top_3_font_sizes[1]
665
+
666
+ print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
667
+
668
+ # Preload all pages to avoid repeated loading
669
+ # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
670
+
671
+ for heading_to_searchDict, paths in listofHeaderstoMarkup:
672
+ heading_to_search = heading_to_searchDict['text']
673
+ heading_to_searchPageNum = heading_to_searchDict['page']
674
+
675
+ print('headertosearch', heading_to_search)
676
+
677
+ # Initialize variables
678
+ headertoContinue1 = False
679
+ headertoContinue2 = False
680
+ matched_header_line = None
681
+ done = False
682
+ collecting = False
683
+ collected_lines = []
684
+ page_highlights = {}
685
+ current_bbox = {}
686
+ last_y1s = {}
687
+ mainHeader = ''
688
+ subHeader = ''
689
+ matched_header_line_norm = heading_to_search
690
+ break_collecting = False
691
+ heading_norm = normalize_text(heading_to_search)
692
+ paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
693
+
694
+ for page_num in range(heading_to_searchPageNum,len(doc)):
695
+ if page_num in toc_pages:
696
+ continue
697
+ if break_collecting:
698
+ break
699
+ page=doc[page_num]
700
+ page_height = page.rect.height
701
+ blocks = page.get_text("dict")["blocks"]
702
+
703
+ for block in blocks:
704
+ if break_collecting:
705
+ break
706
+
707
+ lines = block.get("lines", [])
708
+ i = 0
709
+ while i < len(lines):
710
+ if break_collecting:
711
+ break
712
+
713
+ spans = lines[i].get("spans", [])
714
+ if not spans:
715
+ i += 1
716
+ continue
717
+
718
+ y0 = spans[0]["bbox"][1]
719
+ y1 = spans[0]["bbox"][3]
720
+ if y0 < top_margin or y1 > (page_height - bottom_margin):
721
+ i += 1
722
+ continue
723
+
724
+ line_text = get_spaced_text_from_spans(spans).lower()
725
+ line_text_norm = normalize_text(line_text)
726
+
727
+ # Combine with next line if available
728
+ if i + 1 < len(lines):
729
+ next_spans = lines[i + 1].get("spans", [])
730
+ next_line_text = get_spaced_text_from_spans(next_spans).lower()
731
+ combined_line_norm = normalize_text(line_text + " " + next_line_text)
732
+ else:
733
+ combined_line_norm = line_text_norm
734
+
735
+ # Check if we should continue processing
736
+ if combined_line_norm and combined_line_norm in paths[0]:
737
+ print(combined_line_norm)
738
+ headertoContinue1 = combined_line_norm
739
+ if combined_line_norm and combined_line_norm in paths[-2]:
740
+ print(combined_line_norm)
741
+ headertoContinue2 = combined_line_norm
742
+
743
+ # Optimized header matching
744
+ existsfull = (
745
+ ( combined_line_norm in allchildrenheaders_set or
746
+ combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm
747
+ )
748
+
749
+ # New word-based matching
750
+ current_line_words = set(combined_line_norm.split())
751
+ heading_words = set(heading_norm.split())
752
+ all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0
753
+
754
+ substring_match = (
755
+ heading_norm in combined_line_norm or
756
+ combined_line_norm in heading_norm or
757
+ all_words_match # Include the new word-based matching
758
+ )
759
+ # substring_match = (
760
+ # heading_norm in combined_line_norm or
761
+ # combined_line_norm in heading_norm
762
+ # )
763
+
764
+ if (substring_match and existsfull and not collecting and
765
+ len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ):
766
+
767
+ # Check header conditions more efficiently
768
+ header_spans = [
769
+ span for span in spans
770
+ if (is_header(span, most_common_font_size, most_common_color, most_common_font)
771
+ # and span['size'] >= subsubheaderFontSize
772
+ and span['size'] < mainHeaderFontSize)
773
+ ]
774
+ if header_spans:
775
+ collecting = True
776
+ matched_header_font_size = max(span["size"] for span in header_spans)
777
+ print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
778
+
779
+ collected_lines.append(line_text)
780
+ valid_spans = [span for span in spans if span.get("bbox")]
781
+
782
+ if valid_spans:
783
+ x0s = [span["bbox"][0] for span in valid_spans]
784
+ x1s = [span["bbox"][2] for span in valid_spans]
785
+ y0s = [span["bbox"][1] for span in valid_spans]
786
+ y1s = [span["bbox"][3] for span in valid_spans]
787
+
788
+ header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
789
+
790
+ if page_num in current_bbox:
791
+ cb = current_bbox[page_num]
792
+ current_bbox[page_num] = [
793
+ min(cb[0], header_bbox[0]),
794
+ min(cb[1], header_bbox[1]),
795
+ max(cb[2], header_bbox[2]),
796
+ max(cb[3], header_bbox[3])
797
+ ]
798
+ else:
799
+ current_bbox[page_num] = header_bbox
800
+ last_y1s[page_num] = header_bbox[3]
801
+ x0, y0, x1, y1 = header_bbox
802
+
803
+ zoom = 200
804
+ left = int(x0)
805
+ top = int(y0)
806
+ zoom_str = f"{zoom},{left},{top}"
807
+ pageNumberFound = page_num + 1
808
+
809
+ # Build the query parameters
810
+ params = {
811
+ 'pdfLink': pdf_path, # Your PDF link
812
+ 'keyword': heading_to_search, # Your keyword (could be a string or list)
813
+ }
814
+
815
+ # URL encode each parameter
816
+ encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
817
+
818
+ # Construct the final encoded link
819
+ encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
820
+
821
+ # Correctly construct the final URL with page and zoom
822
+ final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
823
+
824
+ # Get current date and time
825
+ now = datetime.now()
826
+
827
+ # Format the output
828
+ formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
829
+ # Optionally, add the URL to a DataFrame
830
+
831
+
832
+ data_entry = {
833
+ "NBSLink": final_url,
834
+ "Subject": heading_to_search,
835
+ "Page": str(pageNumberFound),
836
+ "Author": "ADR",
837
+ "Creation Date": formatted_time,
838
+ "Layer": "Initial",
839
+ "Code": "to be added",
840
+ "head above 1": paths[-2],
841
+ "head above 2": paths[0]
842
+ }
843
+ data_list_JSON.append(data_entry)
844
+
845
+ # Convert list to JSON
846
+ json_output = json.dumps(data_list_JSON, indent=4)
847
+
848
+ print("Final URL:", final_url)
849
+ i += 2
850
+ continue
851
+ else:
852
+ if (substring_match and not collecting and
853
+ len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ):
854
+
855
+ # Calculate word match percentage
856
+ word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100
857
+
858
+ # Check if at least 70% of header words exist in this line
859
+ meets_word_threshold = word_match_percent >= 100
860
+
861
+ # Check header conditions (including word threshold)
862
+ header_spans = [
863
+ span for span in spans
864
+ if (is_header(span, most_common_font_size, most_common_color, most_common_font)
865
+ # and span['size'] >= subsubheaderFontSize
866
+ and span['size'] < mainHeaderFontSize)
867
+ ]
868
+
869
+ if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
870
+ collecting = True
871
+ matched_header_font_size = max(span["size"] for span in header_spans)
872
+ print(f"📥 Start collecting after header: {combined_line_norm} "
873
+ f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
874
+
875
+ collected_lines.append(line_text)
876
+ valid_spans = [span for span in spans if span.get("bbox")]
877
+
878
+ if valid_spans:
879
+ x0s = [span["bbox"][0] for span in valid_spans]
880
+ x1s = [span["bbox"][2] for span in valid_spans]
881
+ y0s = [span["bbox"][1] for span in valid_spans]
882
+ y1s = [span["bbox"][3] for span in valid_spans]
883
+
884
+ header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
885
+
886
+ if page_num in current_bbox:
887
+ cb = current_bbox[page_num]
888
+ current_bbox[page_num] = [
889
+ min(cb[0], header_bbox[0]),
890
+ min(cb[1], header_bbox[1]),
891
+ max(cb[2], header_bbox[2]),
892
+ max(cb[3], header_bbox[3])
893
+ ]
894
+ else:
895
+ current_bbox[page_num] = header_bbox
896
+
897
+ last_y1s[page_num] = header_bbox[3]
898
+ x0, y0, x1, y1 = header_bbox
899
+ zoom = 200
900
+ left = int(x0)
901
+ top = int(y0)
902
+ zoom_str = f"{zoom},{left},{top}"
903
+ pageNumberFound = page_num + 1
904
+
905
+ # Build the query parameters
906
+ params = {
907
+ 'pdfLink': pdf_path, # Your PDF link
908
+ 'keyword': heading_to_search, # Your keyword (could be a string or list)
909
+ }
910
+
911
+ # URL encode each parameter
912
+ encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
913
+
914
+ # Construct the final encoded link
915
+ encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
916
+
917
+ # Correctly construct the final URL with page and zoom
918
+ final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
919
+
920
+ # Get current date and time
921
+ now = datetime.now()
922
+
923
+ # Format the output
924
+ formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
925
+ # Optionally, add the URL to a DataFrame
926
+
927
+
928
+ data_entry = {
929
+ "NBSLink": final_url,
930
+ "Subject": heading_to_search,
931
+ "Page": str(pageNumberFound),
932
+ "Author": "ADR",
933
+ "Creation Date": formatted_time,
934
+ "Layer": "Initial",
935
+ "Code": "to be added",
936
+ "head above 1": paths[-2],
937
+ "head above 2": paths[0]
938
+ }
939
+ data_list_JSON.append(data_entry)
940
+
941
+ # Convert list to JSON
942
+ json_output = json.dumps(data_list_JSON, indent=4)
943
+
944
+ print("Final URL:", final_url)
945
+ i += 2
946
+ continue
947
+ if collecting:
948
+ norm_line = normalize_text(line_text)
949
+
950
+ # Optimized URL check
951
+ if url_pattern.match(norm_line):
952
+ line_is_header = False
953
+ else:
954
+ line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans)
955
+
956
+ if line_is_header:
957
+ header_font_size = max(span["size"] for span in spans)
958
+ is_probably_real_header = (
959
+ header_font_size >= matched_header_font_size and
960
+ is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
961
+ len(line_text.strip()) > 2
962
+ )
963
+
964
+ if (norm_line != matched_header_line_norm and
965
+ norm_line != heading_norm and
966
+ is_probably_real_header):
967
+ if line_text not in heading_norm:
968
+ print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
969
+ collecting = False
970
+ done = True
971
+ headertoContinue1 = False
972
+ headertoContinue2=False
973
+ for page_num, bbox in current_bbox.items():
974
+ bbox[3] = last_y1s.get(page_num, bbox[3])
975
+ page_highlights[page_num] = bbox
976
+ highlight_boxes(docHighlights, page_highlights)
977
+
978
+ break_collecting = True
979
+ break
980
+
981
+ if break_collecting:
982
+ break
983
+
984
+ collected_lines.append(line_text)
985
+ valid_spans = [span for span in spans if span.get("bbox")]
986
+ if valid_spans:
987
+ x0s = [span["bbox"][0] for span in valid_spans]
988
+ x1s = [span["bbox"][2] for span in valid_spans]
989
+ y0s = [span["bbox"][1] for span in valid_spans]
990
+ y1s = [span["bbox"][3] for span in valid_spans]
991
+
992
+ line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
993
+
994
+ if page_num in current_bbox:
995
+ cb = current_bbox[page_num]
996
+ current_bbox[page_num] = [
997
+ min(cb[0], line_bbox[0]),
998
+ min(cb[1], line_bbox[1]),
999
+ max(cb[2], line_bbox[2]),
1000
+ max(cb[3], line_bbox[3])
1001
+ ]
1002
+ else:
1003
+ current_bbox[page_num] = line_bbox
1004
+
1005
+ last_y1s[page_num] = line_bbox[3]
1006
+ i += 1
1007
+
1008
+ if not done:
1009
+ for page_num, bbox in current_bbox.items():
1010
+ bbox[3] = last_y1s.get(page_num, bbox[3])
1011
+ page_highlights[page_num] = bbox
1012
+ highlight_boxes(docHighlights, page_highlights)
1013
+
1014
+ # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)
1015
+
1016
+ pdf_bytes = BytesIO()
1017
+ docHighlights.save(pdf_bytes)
1018
+ print('JSONN',json_output)
1019
+ return pdf_bytes.getvalue(), docHighlights , json_output