Marthee committed on
Commit
17a558a
·
verified ·
1 Parent(s): ce2c42f

Upload findspecsv1.py

Browse files
Files changed (1) hide show
  1. findspecsv1.py +603 -0
findspecsv1.py ADDED
@@ -0,0 +1,603 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """FindSpecsTrial(Retrieving+boundingBoxes).ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1mFuB1gtGuVh3NlOnNTzOFnDVuWSwn18q
8
+ """
9
+
10
+
11
+ import fitz # PyMuPDF
12
+ from io import BytesIO
13
+ import re
14
+ import requests
15
+ import pandas as pd
16
+ from collections import Counter
17
+ import fitz # PyMuPDF
18
+ import re
19
+ import urllib.parse
20
+ import pandas as pd
21
+ import math
22
+ import random
23
+ # import tempfile
24
+ # from fpdf import FPDF
25
+ import json
26
+ from datetime import datetime
27
+
28
+ baselink='https://marthee-nbslink.hf.space/view-pdf?'
29
+
30
def get_repeated_texts(pdf_document, threshold=0.85):
    """
    Find span texts that recur on most pages, together with their usual style.

    A text is counted at most once per page, and only the size/colour of the
    first span carrying it on that page is recorded.

    :param pdf_document: An opened PDF document exposing ``page_count`` and ``load_page``.
    :param threshold: Fraction of pages a text must appear on to count as "repeated"
        (a minimum of two pages is always required).
    :return: A list of dicts with keys ``text``, ``font_size`` and ``color``; size and
        colour are the values recorded most often for that text.
    """
    page_total = pdf_document.page_count
    pages_per_text = Counter()
    styles_per_text = defaultdict(list)

    for page_index in range(page_total):
        current_page = pdf_document.load_page(page_index)
        page_blocks = current_page.get_text("dict")["blocks"]

        # Texts already counted on this page, so duplicates on one page count once.
        texts_on_page = set()

        for blk in page_blocks:
            if "lines" not in blk:
                continue
            for ln in blk["lines"]:
                for sp in ln["spans"]:
                    stripped = sp["text"].strip()
                    if not stripped or stripped in texts_on_page:
                        continue
                    texts_on_page.add(stripped)
                    pages_per_text[stripped] += 1
                    styles_per_text[stripped].append({
                        "font_size": sp.get("size"),
                        "color": sp.get("color")
                    })

    # A text must be present on at least this many pages to be reported.
    required_pages = max(2, int(threshold * page_total))

    repeated = []
    for stripped, hits in pages_per_text.items():
        if hits < required_pages:
            continue
        records = styles_per_text[stripped]
        size_list = [record["font_size"] for record in records]
        color_list = [record["color"] for record in records]
        repeated.append({
            "text": stripped,
            "font_size": max(set(size_list), key=size_list.count),
            "color": max(set(color_list), key=color_list.count)
        })

    return repeated
83
+
84
def get_regular_font_size_and_color(doc):
    """
    Determine the dominant text style of a document.

    Every span on every page contributes one vote for its size, colour and font.

    :param doc: An opened document supporting ``len()`` and ``load_page``.
    :return: Tuple ``(font_size, color, font)`` holding the most frequent value of
        each attribute; each element is ``None`` when the document has no spans.
    """
    size_votes = Counter()
    color_votes = Counter()
    font_votes = Counter()

    for page_index in range(len(doc)):
        page_blocks = doc.load_page(page_index).get_text("dict")["blocks"]
        for blk in page_blocks:
            # Blocks without a "lines" entry carry no text spans.
            if "lines" not in blk:
                continue
            for ln in blk["lines"]:
                for sp in ln["spans"]:
                    size_votes[sp['size']] += 1
                    color_votes[sp['color']] += 1
                    font_votes[sp['font']] += 1

    def _winner(votes):
        # Most frequent key, or None when nothing was counted.
        if not votes:
            return None
        return votes.most_common(1)[0][0]

    return _winner(size_votes), _winner(color_votes), _winner(font_votes)
106
+
107
+ import re
108
+ from collections import defaultdict
109
+ import fitz # PyMuPDF
110
+ import requests
111
+ from io import BytesIO
112
+
113
def normalize_text(text):
    """Return *text* trimmed, lower-cased, with each whitespace run collapsed to one space."""
    trimmed_lower = text.strip().lower()
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', trimmed_lower)
115
+
116
def get_spaced_text_from_spans(spans):
    """Join the stripped text of every span with single spaces and normalize the result."""
    pieces = [span["text"].strip() for span in spans]
    joined = " ".join(pieces)
    return normalize_text(joined)
118
+
119
def is_header(span, most_common_font_size, most_common_color, most_common_font):
    """
    Decide whether a text span is styled like a heading.

    Italic/oblique spans are never headers. Otherwise a span qualifies when it is
    larger than the baseline size, uses a font other than the baseline font
    (case-insensitive), or is bold. The baseline colour is accepted for signature
    compatibility but is not consulted.

    :param span: Span dict with at least ``size`` and ``font``.
    :param most_common_font_size: Baseline body-text font size.
    :param most_common_color: Baseline colour (unused).
    :param most_common_font: Baseline font name.
    :return: Truthy when the span looks like a header.
    """
    font_lower = span.get("font", "").lower()
    if "italic" in font_lower or "oblique" in font_lower:
        return False

    bold_flag = "bold" in font_lower or span.get("bold", False)

    if span["size"] > most_common_font_size:
        return True
    if span["font"].lower() != most_common_font.lower():
        return True
    return bold_flag
131
+
132
def merge_consecutive_words(headers):
    """
    Merge adjacent entries whose space-joined form is itself an entry of *headers*.

    Walks the list left to right; when ``headers[i] + ' ' + headers[i + 1]`` occurs
    anywhere in *headers*, that joined string is emitted and both entries are
    consumed. Otherwise the single entry is emitted unchanged.

    :param headers: List of header strings.
    :return: New list with qualifying neighbours merged.
    """
    merged = []
    total = len(headers)
    pos = 0
    while pos < total:
        has_neighbour = pos + 1 < total
        if has_neighbour and headers[pos] + ' ' + headers[pos + 1] in headers:
            merged.append(headers[pos] + ' ' + headers[pos + 1])
            pos += 2
            continue
        merged.append(headers[pos])
        pos += 1
    return merged
143
+
144
+
145
def extract_headers(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin):
    """
    Collect header-styled text from every non-TOC page of *doc*.

    Spans lying in the top/bottom margins, empty spans, URL-like spans and spans
    matching the page-number/date/boilerplate filters are skipped. Remaining spans
    that ``is_header`` accepts are grouped by ``(page number, rounded top Y)`` and
    the texts of each group are joined with spaces.

    :param doc: Opened document supporting ``len()`` and ``load_page``.
    :param toc_pages: Page numbers to skip.
    :param most_common_font_size: Baseline font size passed to ``is_header``.
    :param most_common_color: Baseline colour passed to ``is_header``.
    :param most_common_font: Baseline font name passed to ``is_header``.
    :param top_margin: Spans whose top Y is below this value are ignored.
    :param bottom_margin: Spans whose bottom Y exceeds ``page height - bottom_margin`` are ignored.
    :return: ``(headers, top_3_font_sizes)`` where each header is
        ``[text, size, pageNum, y]`` and the second item holds up to three of the
        largest distinct header sizes, in descending order.
    """
    print("Font baseline:", most_common_font_size, most_common_color, most_common_font)

    def _is_noise(text):
        # Page counters, symbol-only strings, section terminators, dates,
        # month-name prefixes and "specification:" lines are not headers.
        return (
            'page' in text
            or not re.search(r'[a-z0-9]', text)
            or 'end of section' in text
            or re.search(r'page\s+\d+\s+of\s+\d+', text)
            or re.search(r'\b(?:\d{1,2}[/-])?\d{1,2}[/-]\d{2,4}\b', text)
            or re.search(r'\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)', text)
            or 'specification:' in text
        )

    rows = defaultdict(list)

    for pageNum in range(len(doc)):
        if pageNum in toc_pages:
            continue
        page = doc.load_page(pageNum)
        lowest_allowed_y = page.rect.height - bottom_margin
        page_dict = page.get_text("dict")

        for block in page_dict['blocks']:
            # Only blocks of type 0 are processed; they are expected to hold 'lines'.
            if block['type'] != 0:
                continue

            for line in block['lines']:
                for span in line['spans']:
                    top_y = span['bbox'][1]
                    bottom_y = span['bbox'][3]
                    if top_y < top_margin or bottom_y > lowest_allowed_y:
                        continue

                    raw_text = normalize_text(span.get('text', ''))
                    if not raw_text:
                        continue
                    if raw_text.startswith(('http://www', 'www')):
                        continue
                    if _is_noise(raw_text):
                        continue

                    # Drop dotted/dashed leaders and whatever follows them.
                    cleaned = normalize_text(re.sub(r'[.\-]{4,}.*$', '', raw_text).strip())

                    if is_header(span, most_common_font_size, most_common_color, most_common_font):
                        rows[(pageNum, round(top_y))].append({
                            "text": cleaned,
                            "size": span["size"],
                            "pageNum": pageNum
                        })

    headers = []
    for (pageNum, y), group in sorted(rows.items()):
        lead = group[0]
        joined_text = " ".join(item['text'] for item in group)
        headers.append([joined_text, lead['size'], lead['pageNum'], y])

    # Largest distinct header sizes first; at most three are kept.
    distinct_sizes = Counter(entry[1] for entry in headers)
    top_3_font_sizes = sorted(distinct_sizes.keys(), reverse=True)[:3]

    return headers, top_3_font_sizes
208
+
209
class ColorManager:
    """
    Hand out RGB colours: first the palette entries in order, then random colours
    that stay away from every colour already issued or present in the palette.
    """

    def __init__(self, palette, min_distance=100):
        """
        :param palette: List of ``(r, g, b)`` tuples handed out first, in order.
        :param min_distance: Minimum Euclidean RGB distance a generated colour
            should keep from every known colour.
        """
        self.palette = palette.copy()
        # Palette colours count as "used" so generated colours avoid them too.
        self.used_colors = palette.copy()
        self.idx = 0
        self.min_distance = min_distance

    def color_distance(self, c1, c2):
        """Return the Euclidean distance between two colour tuples."""
        return math.sqrt(sum((a - b) ** 2 for a, b in zip(c1, c2)))

    def generate_new_color(self):
        """
        Return a random colour and record it in ``used_colors``.

        The first candidate farther than ``min_distance`` from every used colour
        wins. If none of the sampled candidates qualifies (the colour space is
        saturated), the most distinct candidate seen is returned instead of
        raising, so a long-lived shared manager keeps working.
        """
        max_attempts = 1000
        best_candidate = None
        best_separation = -1.0
        for _ in range(max_attempts):
            candidate = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
            separation = min(
                (self.color_distance(candidate, existing) for existing in self.used_colors),
                default=math.inf,
            )
            if separation > self.min_distance:
                self.used_colors.append(candidate)
                return candidate
            if separation > best_separation:
                best_candidate, best_separation = candidate, separation

        # No candidate met min_distance: degrade gracefully to the best one found.
        self.used_colors.append(best_candidate)
        return best_candidate

    def get_next_color(self):
        """Return the next palette colour, or a generated one once the palette is exhausted."""
        if self.idx < len(self.palette):
            color = self.palette[self.idx]
        else:
            color = self.generate_new_color()
        self.idx += 1
        return color
235
+
236
# Fixed highlight palette: (R, G, B) tuples with 0-255 channels, handed out in
# this order before ColorManager starts generating random colours.
color_palette = [
    (255, 0, 0), (0, 0, 255), (0, 255, 255), (0, 64, 0), (255, 204, 0),
    (255, 128, 64), (255, 0, 128), (255, 128, 192), (128, 128, 255),
    (128, 64, 0), (0, 255, 0), (0, 200, 0), (255, 128, 255), (128, 0, 255),
    (0, 128, 192), (128, 0, 128), (128, 0, 0), (0, 128, 255), (149, 1, 70),
    (255, 182, 128), (222, 48, 71), (240, 0, 112), (255, 0, 255),
    (192, 46, 65), (0, 0, 128), (0, 128, 64), (255, 255, 0), (128, 0, 80),
    (255, 255, 128), (90, 255, 140), (255, 200, 20), (91, 16, 51),
    (90, 105, 138), (114, 10, 138), (36, 82, 78), (225, 105, 190),
    (108, 150, 170), (11, 35, 75), (42, 176, 170), (255, 176, 170),
    (209, 151, 15), (81, 27, 85), (226, 106, 122), (67, 119, 149),
    (159, 179, 140), (159, 179, 30), (255, 85, 198), (255, 27, 85),
    (188, 158, 8), (140, 188, 120), (59, 61, 52), (65, 81, 21),
    (212, 255, 174), (15, 164, 90), (41, 217, 245), (213, 23, 182),
    (11, 85, 169), (78, 153, 239), (0, 66, 141), (64, 98, 232),
    (140, 112, 255), (57, 33, 154), (194, 117, 252), (116, 92, 135),
    (74, 43, 98), (188, 13, 123), (129, 58, 91), (255, 128, 100),
    (171, 122, 145), (255, 98, 98), (222, 48, 77)
]

# One module-level manager shared by every call in this module, so its position
# in the palette persists for the lifetime of the process.
color_manager = ColorManager(color_palette)
259
+
260
def highlight_boxes(doc, highlights,color):
    """
    Draw one translucent, filled rectangle annotation per entry of *highlights*.

    :param doc: Opened document exposing ``load_page``.
    :param highlights: Mapping of page number to a bounding box accepted by ``fitz.Rect``.
    :param color: Iterable of RGB channels in the 0-255 range; used for both
        stroke and fill after scaling to 0-1.
    """
    for page_index, box in highlights.items():
        target_page = doc.load_page(page_index)
        area = fitz.Rect(box)
        annotation = target_page.add_rect_annot(area)

        # Scale the 0-255 channels to the 0-1 floats passed to set_colors.
        unit_rgb = tuple(channel / 255 for channel in color)

        annotation.set_colors(stroke=unit_rgb, fill=unit_rgb)
        annotation.set_opacity(0.3)
        annotation.update()
272
+
273
+
274
def find_full_line_in_toc(doc, toc_pages, substring):
    """
    Return the first table-of-contents line that contains *substring*.

    Matching is done on normalized text. The returned line is cut at the first
    run of two or more dots (leader dots) and stripped.

    :param doc: Opened document exposing ``load_page``.
    :param toc_pages: Page numbers to search, in order.
    :param substring: Text to look for.
    :return: The matching line without its dotted leader, or ``None`` if no line matches.
    """
    needle = normalize_text(substring)

    for toc_page in toc_pages:
        page_blocks = doc.load_page(toc_page).get_text("dict")["blocks"]
        for blk in page_blocks:
            for ln in blk.get("lines", []):
                candidate = get_spaced_text_from_spans(ln.get("spans", [])).strip()
                if needle not in normalize_text(candidate):
                    continue
                # First hit wins: keep only the part before the leader dots.
                return re.split(r'\.{2,}', candidate)[0].strip()
    return None
293
+
294
def _bbox_of_spans(spans):
    """Return ``[x0, y0, x1, y1]`` enclosing every span that has a bbox, or ``None``."""
    boxes = [span["bbox"] for span in spans if span.get("bbox")]
    if not boxes:
        return None
    return [
        min(box[0] for box in boxes),
        min(box[1] for box in boxes),
        max(box[2] for box in boxes),
        max(box[3] for box in boxes),
    ]


def _extend_page_bbox(page_boxes, page_num, box):
    """Grow the bbox stored for *page_num* in *page_boxes* so it also encloses *box*."""
    known = page_boxes.get(page_num)
    if known is None:
        page_boxes[page_num] = box
        return
    page_boxes[page_num] = [
        min(known[0], box[0]),
        min(known[1], box[1]),
        max(known[2], box[2]),
        max(known[3], box[3]),
    ]


def extract_section_under_header(pdf_path, target_header_LIST):
    """
    Download a PDF, locate each requested heading, highlight the section below it
    and build a markup record per closed section.

    For every heading the pages after the table of contents are scanned line by
    line. Collection starts at the first header-styled line containing the
    heading and stops at the next header-styled line whose font size is at least
    that of the heading. The collected area is highlighted with a colour taken
    from the shared ``color_manager``.

    NOTE(review): this function was reconstructed from a flattened diff view in
    which indentation was lost; the nesting of the "start collecting" block, of
    ``i += 2`` and of the fall-through after the stop branch follows the most
    plausible reading and should be confirmed against the repository copy.

    :param pdf_path: HTTP(S)/Dropbox URL of the PDF (``dl=0`` is rewritten to ``dl=1``).
    :param target_header_LIST: Iterable of heading strings to search for.
    :return: ``(pdf_bytes, doc, df, json_output)`` - the annotated PDF as bytes,
        the open document, an empty DataFrame with the markup columns, and the
        markup records as a JSON string. ``None`` is returned when a heading has
        no matching line at all (unchanged legacy behaviour).
    :raises ValueError: If *pdf_path* is not a URL or the download is empty.
    """
    top_margin = 70
    bottom_margin = 50

    df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"])
    data_list_JSON = []
    # Bound up front so the final print/return work even if no section is closed.
    json_output = json.dumps(data_list_JSON, indent=4)

    if not (pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path)):
        raise ValueError("No valid PDF content found.")
    pdf_path = pdf_path.replace('dl=0', 'dl=1')

    response = requests.get(pdf_path)
    # Check the payload itself: a BytesIO object is always truthy.
    if not response.content:
        raise ValueError("No valid PDF content found.")
    pdf_content = BytesIO(response.content)

    doc = fitz.open(stream=pdf_content, filetype="pdf")
    most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)

    def get_toc_page_numbers(doc, max_pages_to_check=15):
        """Return pages 0..N where N is the last early page with >= 3 dotted-leader lines."""
        toc_pages = []
        for page_num in range(min(len(doc), max_pages_to_check)):
            page = doc.load_page(page_num)
            blocks = page.get_text("dict")["blocks"]

            dot_line_count = 0
            for block in blocks:
                for line in block.get("lines", []):
                    line_text = get_spaced_text_from_spans(line["spans"]).strip()
                    if re.search(r'\.{3,}', line_text):
                        dot_line_count += 1

            if dot_line_count >= 3:
                toc_pages.append(page_num)
        if toc_pages:
            # Everything up to the last TOC page is treated as front matter.
            return list(range(0, toc_pages[-1] + 1))
        return toc_pages

    toc_pages = get_toc_page_numbers(doc)

    headers, top_3_font_sizes = extract_headers(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin)
    # Pad with None so documents with fewer than three header sizes still unpack.
    mainHeaderFontSize, subHeaderFontSize, _ = (list(top_3_font_sizes) + [None] * 3)[:3]
    print("Detected headers:", headers)
    headers_set = {normalize_text(h[0]) for h in headers}

    print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
    for heading_to_search in target_header_LIST:
        print('headertosearch',heading_to_search)
        matched_header_line = None
        done = False
        target_header = normalize_text(heading_to_search)

        if target_header not in headers_set:
            # Fall back to the line sharing the most words with the heading.
            print(f"Header '{target_header}' not found. Searching for best match...")
            heading_words = set(target_header.split())
            best_match_score = 0
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                blocks = page.get_text("dict")["blocks"]

                for block in blocks:
                    for line in block.get("lines", []):
                        line_text = " ".join(span["text"].strip() for span in line.get("spans", []))
                        if not line_text:
                            continue
                        line_words = set(re.findall(r'\w+', line_text.lower()))
                        match_count = len(heading_words & line_words)

                        if match_count > best_match_score:
                            best_match_score = match_count
                            matched_header_line = line_text.strip()

            if matched_header_line:
                print(f"✅ Best match: '{matched_header_line}' with score {best_match_score}")
            else:
                print("❌ No suitable match found.")
                # NOTE(review): aborts the whole call and returns None, dropping
                # work done for earlier headings - confirm callers expect this.
                return
        else:
            matched_header_line = target_header  # Exact match

        matched_header_font_size = most_common_font_size
        collecting = False
        collected_lines = []
        page_highlights = {}
        current_bbox = {}
        last_y1s = {}
        mainHeader = ''
        subHeader = ''
        # Position and page of the heading line, used for the viewer link.
        left = 0
        top = 0
        header_page_num = None
        matched_header_line_norm = normalize_text(matched_header_line)
        color = color_manager.get_next_color()

        for page_num in range(len(doc)):
            if page_num in toc_pages:
                continue

            page = doc.load_page(page_num)
            page_height = page.rect.height
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                lines = block.get("lines", [])
                i = 0
                while i < len(lines):
                    spans = lines[i].get("spans", [])
                    if not spans:
                        i += 1
                        continue

                    # Ignore running headers/footers in the page margins.
                    y0 = spans[0]["bbox"][1]
                    y1 = spans[0]["bbox"][3]
                    if y0 < top_margin or y1 > (page_height - bottom_margin):
                        i += 1
                        continue

                    line_text = get_spaced_text_from_spans(spans).lower()
                    line_text_norm = normalize_text(line_text)

                    # Headings may wrap, so also consider this line joined with the next.
                    if i + 1 < len(lines):
                        next_spans = lines[i + 1].get("spans", [])
                        next_line_text = get_spaced_text_from_spans(next_spans).lower()
                        combined_line = (line_text + " " + next_line_text).strip()
                        combined_line_norm = normalize_text(combined_line)
                    else:
                        combined_line = line_text
                        combined_line_norm = line_text_norm

                    # Track the closest main/sub header seen before the target heading.
                    if not done and not collecting:
                        for span in spans:
                            if len(normalize_text(span['text'])) > 1:
                                if is_header(span, most_common_font_size, most_common_color, most_common_font):
                                    for header in headers:
                                        header_text, header_size, header_page, header_y = header

                                        if combined_line_norm in header_text:
                                            print('comb:,',combined_line_norm)
                                            if header_size == mainHeaderFontSize:
                                                mainHeader = find_full_line_in_toc(doc, toc_pages, combined_line_norm)
                                                print('main:', mainHeader)

                                            elif header_size == subHeaderFontSize:
                                                subHeader = combined_line_norm
                                                print('sub:', subHeader)

                    # Start collecting if we find the target header
                    if matched_header_line_norm in combined_line_norm and not collecting:
                        if any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans):
                            collecting = True
                            header_page_num = page_num
                            header_font_sizes = [span["size"] for span in spans if is_header(span, most_common_font_size, most_common_color, most_common_font)]
                            if header_font_sizes:
                                matched_header_font_size = max(header_font_sizes)
                            print(f"📥 Start collecting after header: {combined_line} (Font size: {matched_header_font_size})")

                            # The heading line itself belongs to the section.
                            collected_lines.append(line_text)

                            valid_spans = [span for span in spans if span.get("bbox")]
                            header_bbox = _bbox_of_spans(spans)
                            if header_bbox is not None:
                                left = int(valid_spans[0]["bbox"][0])
                                top = int(valid_spans[0]["bbox"][1])
                                _extend_page_bbox(current_bbox, page_num, header_bbox)
                                last_y1s[page_num] = header_bbox[3]
                            # Skip the following line too: it was part of combined_line.
                            i += 2
                            continue

                    if collecting:
                        # URL-like lines are never treated as headers.
                        if re.match(r'https?://\S+|www\.\S+', line_text_norm):
                            line_is_header = False
                        else:
                            line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans)

                        if line_is_header:
                            header_font_size = max(span["size"] for span in spans)

                            is_probably_real_header = (
                                header_font_size >= matched_header_font_size and
                                is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
                                len(line_text.strip()) > 2
                            )

                            if (line_text_norm != matched_header_line_norm and
                                    combined_line_norm != matched_header_line_norm and
                                    is_probably_real_header):
                                print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
                                collecting = False
                                done = True
                                result_text = (matched_header_line + "\n" + "\n".join(collected_lines)).strip().lower()
                                print("\n📄 Final collected section (early return):\n" , mainHeader,subHeader)
                                print(result_text)

                                # A distinct loop variable keeps the outer page_num intact.
                                for boxed_page, bbox in current_bbox.items():
                                    # Clamp the box to the last collected line on that page.
                                    bbox[3] = last_y1s.get(boxed_page, bbox[3])
                                    page_highlights[boxed_page] = bbox
                                highlight_boxes(doc, page_highlights,color)

                                zoom = 200
                                zoom_str = f"{zoom},{left},{top}"
                                # left/top are the heading's coordinates, so the link must
                                # target the page on which the heading was found.
                                pageNumberFound = header_page_num + 1
                                params = {
                                    'pdfLink': pdf_path,
                                    'keyword': heading_to_search,
                                }

                                # Percent-encode each query parameter, then join them.
                                encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
                                encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
                                final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"

                                formatted_time = datetime.now().strftime("%d/%m/%Y %I:%M:%S %p")
                                # Only sections with a resolved main header are recorded;
                                # mainHeader is '' or None when none was found.
                                if mainHeader:
                                    data_entry = {
                                        "NBSLink": final_url,
                                        "Subject": 'Markup (initial)',
                                        "Page": str(pageNumberFound),
                                        "Author": "ADR",
                                        "Creation Date": formatted_time,
                                        "Layer": "Initial",
                                        "Code": heading_to_search,
                                        "head above 1": mainHeader,
                                        "head above 2": subHeader
                                    }
                                    data_list_JSON.append(data_entry)

                                json_output = json.dumps(data_list_JSON, indent=4)

                        # NOTE(review): as in the original, the stop line falls through
                        # and is appended after the highlight has been drawn.
                        collected_lines.append(line_text)
                        line_bbox = _bbox_of_spans(spans)
                        if line_bbox is not None:
                            _extend_page_bbox(current_bbox, page_num, line_bbox)
                            last_y1s[page_num] = line_bbox[3]

                    i += 1

        # NOTE(review): a section still open at the end of the document is never
        # highlighted or recorded - confirm whether that is intended.

    print("\n📄 Final collected section:\n")

    pdf_bytes = BytesIO()
    doc.save(pdf_bytes)
    print('JSONN',json_output)
    return pdf_bytes.getvalue(), doc , df, json_output
599
+
600
+
601
+
602
+
603
+