Marthee committed on
Commit
b539e20
·
verified ·
1 Parent(s): 8e1f0d3

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +261 -1417
InitialMarkups.py CHANGED
@@ -1,1435 +1,279 @@
1
- # -*- coding: utf-8 -*-
2
- """Copy of FindSpecsTrial(Retrieving+boundingBoxes)-InitialMarkups(ALL)_CleanedUp.ipynb
3
-
4
- Automatically generated by Colab.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/12XfVkmKmN3oVjHhLVE0_GgkftgArFEK2
8
- """
9
# Base URL of the FindConsole "initial markups" space: plain PDF viewer endpoint.
baselink='https://findconsole-initialmarkups.hf.space/view-pdf?'

# Same space, viewer endpoint that renders the PDF with highlight overlays.
newlink='https://findconsole-initialmarkups.hf.space/view-highlight?'
12
-
13
-
14
- from urllib.parse import urlparse, unquote
15
- import os
16
- from io import BytesIO
17
- import re
18
- import requests
19
- import pandas as pd
20
- import fitz # PyMuPDF
21
- import re
22
- import urllib.parse
23
- import pandas as pd
24
- import math
25
- import random
26
  import json
27
- from datetime import datetime
28
- from collections import defaultdict, Counter
29
- import difflib
30
- from fuzzywuzzy import fuzz
31
-
32
def filteredJsons(pdf_path,filteredjsonsfromrawan):
    """Run the Rawan-style section extraction on *pdf_path* for the given heading list."""
    extract_section_under_headerRawan(
        pdf_path=pdf_path,
        listofheadingsfromrawan=filteredjsonsfromrawan,
    )
35
-
36
-
37
-
38
-
39
def get_regular_font_size_and_color(doc):
    """Return the dominant (font size, color, font name) of the body text in *doc*.

    Every span on every page is tallied; the most frequent value of each
    attribute is returned. Each element is None when the document has no spans.
    """
    size_samples = []
    color_samples = []
    font_samples = []

    # Walk every span on every page and record its styling attributes.
    for page_index in range(len(doc)):
        page = doc.load_page(page_index)
        for block in page.get_text("dict")["blocks"]:
            if "lines" not in block:
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    size_samples.append(span['size'])
                    color_samples.append(span['color'])
                    font_samples.append(span['font'])

    def _dominant(samples):
        # Most frequent value, or None when nothing was collected.
        if not samples:
            return None
        return Counter(samples).most_common(1)[0][0]

    return _dominant(size_samples), _dominant(color_samples), _dominant(font_samples)
61
-
62
def normalize_text(text):
    """Lower-case *text*, trim it, and collapse every whitespace run to one space.

    None is treated as an empty string.
    """
    if text is None:
        return ""
    lowered = text.strip().lower()
    return re.sub(r'\s+', ' ', lowered)
66
-
67
def get_spaced_text_from_spans(spans):
    """Join the stripped text of each span with single spaces and normalize the result."""
    pieces = [span["text"].strip() for span in spans]
    return normalize_text(" ".join(pieces))
69
-
70
def is_header(span, most_common_font_size, most_common_color, most_common_font):
    """Heuristically decide whether *span* looks like a heading.

    A span counts as a header when it is larger than the body-text size or
    uses a different font family than the body text. (The bold clause is
    subsumed by the size check but kept for clarity of intent.)
    """
    font_name = span.get("font", "").lower()
    bold = "bold" in font_name or span.get("bold", False)
    larger_than_body = span["size"] > most_common_font_size
    different_font = span["font"].lower() != most_common_font.lower()
    return larger_than_body or different_font or (bold and larger_than_body)
81
-
82
def add_span_to_nearest_group(span_y, grouped_dict, pageNum=None, threshold=0.5):
    """Return the key of an existing (page, y) group within *threshold* of *span_y*.

    When *pageNum* is given, only groups on that page are considered. If no
    group is close enough, a new key (pageNum, span_y) is returned.
    """
    for key in grouped_dict:
        page, group_y = key
        if pageNum is not None and page != pageNum:
            continue
        if abs(group_y - span_y) <= threshold:
            return key
    return (pageNum, span_y)
89
-
90
def extract_headers(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin):
    """Collect candidate section headers from every non-TOC page of *doc*.

    Spans that look like headings (per is_header) are gathered, vertically
    adjacent spans of similar size are merged into multi-line headers, and
    the dominant header font sizes are derived.

    Returns:
        headers: list of [text, size, page_num, y] entries.
        top_3_font_sizes: the three most prominent header sizes, largest
            first; padded to length 3 whenever any size qualifies so callers
            can safely unpack three values.
        smallest_font_size: smallest qualifying header size, or None.
        spans: flat list of every raw span that contributed to a header.
    """
    print("Font baseline:", most_common_font_size, most_common_color, most_common_font)

    grouped_headers = defaultdict(list)
    spans = []
    # Maximum vertical distance between lines to consider as part of same header
    line_merge_threshold = 1.5

    for pageNum in range(len(doc)):
        if pageNum in toc_pages:
            continue
        page = doc.load_page(pageNum)
        page_height = page.rect.height
        text_instances = page.get_text("dict")

        # First pass: collect all potential header spans on this page.
        potential_header_spans = []
        for block in text_instances['blocks']:
            if block['type'] != 0:  # skip non-text blocks (images, drawings)
                continue

            for line in block['lines']:
                for span in line['spans']:
                    span_y0 = span['bbox'][1]
                    span_y1 = span['bbox'][3]

                    # Ignore running headers/footers in the page margins.
                    if span_y0 < top_margin or span_y1 > (page_height - bottom_margin):
                        continue

                    span_text = normalize_text(span.get('text', ''))
                    if not span_text:
                        continue
                    if span_text.startswith('http://www') or span_text.startswith('www'):
                        continue
                    # Drop page numbers, dates, separators and other boilerplate.
                    if any((
                        'page' in span_text,
                        not re.search(r'[a-z0-9]', span_text),
                        'end of section' in span_text,
                        re.search(r'page\s+\d+\s+of\s+\d+', span_text),
                        re.search(r'\b(?:\d{1,2}[/-])?\d{1,2}[/-]\d{2,4}\b', span_text),
                        'specification:' in span_text
                    )):
                        continue

                    # Strip trailing dot/dash leaders (TOC-style "....." runs).
                    cleaned_text = re.sub(r'[.\-]{4,}.*$', '', span_text).strip()
                    cleaned_text = normalize_text(cleaned_text)

                    if is_header(span, most_common_font_size, most_common_color, most_common_font):
                        potential_header_spans.append({
                            'text': cleaned_text,
                            'size': span['size'],
                            'pageNum': pageNum,
                            'y0': span_y0,
                            'y1': span_y1,
                            'x0': span['bbox'][0],
                            'x1': span['bbox'][2],
                            'span': span
                        })

        # Sort spans by vertical position (top to bottom).
        potential_header_spans.sort(key=lambda s: (s['pageNum'], s['y0']))

        # Second pass: merge vertically close spans of similar size into
        # one multi-line header.
        i = 0
        while i < len(potential_header_spans):
            current = potential_header_spans[i]
            header_text = current['text']
            header_size = current['size']
            header_page = current['pageNum']
            min_y = current['y0']
            max_y = current['y1']
            spans_group = [current['span']]

            # Look ahead for adjacent lines that belong to the same header.
            j = i + 1
            while j < len(potential_header_spans):
                next_span = potential_header_spans[j]
                if (next_span['pageNum'] == header_page and
                        next_span['y0'] - max_y < line_merge_threshold and
                        abs(next_span['size'] - header_size) < 0.5):
                    header_text += " " + next_span['text']
                    max_y = next_span['y1']
                    spans_group.append(next_span['span'])
                    j += 1
                else:
                    break

            # Record the merged header.
            grouped_headers[(header_page, min_y)].append({
                "text": header_text.strip(),
                "size": header_size,
                "pageNum": header_page,
                "spans": spans_group
            })
            spans.extend(spans_group)
            i = j  # Skip the spans we've already merged.

    # Flatten grouped headers into [text, size, page, y] records.
    headers = []
    for (pageNum, y), header_groups in sorted(grouped_headers.items()):
        for group in header_groups:
            headers.append([
                group['text'],
                group['size'],
                group['pageNum'],
                y
            ])

    font_sizes = [size for _, size, _, _ in headers]
    font_size_counts = Counter(font_sizes)

    # Keep only font sizes that appear at least 3 times.
    valid_font_sizes = [size for size, count in font_size_counts.items() if count >= 3]
    valid_font_sizes_sorted = sorted(valid_font_sizes, reverse=True)

    # Fix: callers unpack exactly three sizes from this value. The original
    # only padded the two-size case and crashed downstream when a single size
    # qualified, so pad by repeating the smallest size until three entries exist.
    top_3_font_sizes = valid_font_sizes_sorted[:3]
    while 0 < len(top_3_font_sizes) < 3:
        top_3_font_sizes.append(top_3_font_sizes[-1])

    # Smallest qualifying header size (None when nothing qualified).
    smallest_font_size = min(valid_font_sizes) if valid_font_sizes else None

    return headers, top_3_font_sizes, smallest_font_size, spans
218
-
219
def is_numbered(text):
    """Return True when *text* (ignoring surrounding whitespace) starts with a digit."""
    return re.match(r'^\d', text.strip()) is not None
221
-
222
def is_similar(a, b, threshold=0.85):
    """Return True when the difflib similarity ratio of *a* and *b* exceeds *threshold*."""
    ratio = difflib.SequenceMatcher(None, a, b).ratio()
    return ratio > threshold
224
-
225
def normalize(text):
    """Lower-case *text*, drop dot-leader runs, and collapse whitespace."""
    lowered = text.lower()
    without_dots = re.sub(r'\.{2,}', '', lowered)   # remove long dot runs
    collapsed = re.sub(r'\s+', ' ', without_dots)   # squeeze whitespace
    return collapsed.strip()
230
-
231
def clean_toc_entry(toc_text):
    """Remove page numbers and formatting from TOC entries.

    Everything from the first run of dots/whitespace followed by digits is
    dropped, then leftover dots and spaces are stripped from both ends.
    """
    without_page_number = re.sub(r'[\.\s]+\d+.*$', '', toc_text)
    return without_page_number.strip('. ')
235
-
236
def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin=70, bottom_margin=70):
    """Build a tree of header dicts (linked via 'children') for *doc*.

    Level-0 headers are the largest headers confirmed by the table of
    contents; deeper levels are inferred from adjacency and font
    size/style clusters. Returns the list of root header nodes.
    """
    # Extract headers with margin handling.
    headers_list, top_3_font_sizes, smallest_font_size, spans = extract_headers(
        doc,
        toc_pages=toc_pages,
        most_common_font_size=most_common_font_size,
        most_common_color=most_common_color,
        most_common_font=most_common_font,
        top_margin=top_margin,
        bottom_margin=bottom_margin
    )

    # Step 1: Collect and filter potential headers.
    headers = []

    # Extract TOC entries so exact level-0 header texts can be matched.
    toc_entries = {}
    for pno in toc_pages:
        page = doc.load_page(pno)
        toc_text = page.get_text()
        for line in toc_text.split('\n'):
            clean_line = line.strip()
            if clean_line:
                norm_line = normalize(clean_line)
                toc_entries[norm_line] = clean_line  # store original text

    for h in headers_list:
        text, size, pageNum, y = h[:4]
        page = doc.load_page(pageNum)
        page_height = page.rect.height

        # Skip margin areas.
        if y < top_margin or y > (page_height - bottom_margin):
            continue

        norm_text = normalize(text)
        if len(norm_text) > 2 and size >= most_common_font_size:
            headers.append({
                "text": text,
                "page": pageNum,
                "y": y,
                "size": size,
                "bold": h[4] if len(h) > 4 else False,
                # "italic": h[5] if len(h) > 5 else False,
                "color": h[6] if len(h) > 6 else None,
                "font": h[7] if len(h) > 7 else None,
                "children": [],
                "is_numbered": is_numbered(text),
                "original_size": size,
                "norm_text": norm_text,
                "level": -1  # -1 means "level not assigned yet"
            })

    # Sort by page and vertical position.
    headers.sort(key=lambda h: (h['page'], h['y']))

    # Step 2: Detect consecutive headers (same page, nearly the same y)
    # and assign provisional parent/child levels.
    i = 0
    while i < len(headers) - 1:
        current = headers[i]
        next_header = headers[i + 1]

        if (current['page'] == next_header['page'] and
                abs(current['y'] - next_header['y']) < 20):  # 20pt ~ "same line"

            # Case 1: both unassigned - current becomes 1, next becomes 2.
            if current['level'] == -1 and next_header['level'] == -1:
                current['level'] = 1
                next_header['level'] = 2
                i += 1  # skip next header since we processed it

            # Case 2: current unassigned, next assigned - current one level above.
            elif current['level'] == -1 and next_header['level'] != -1:
                current['level'] = max(1, next_header['level'] - 1)

            # Case 3: current assigned, next unassigned - next one level below.
            elif current['level'] != -1 and next_header['level'] == -1:
                next_header['level'] = current['level'] + 1
                i += 1  # skip next header since we processed it
        i += 1

    # Step 2b: Identify level-0 headers (largest size and present in the TOC).
    max_size, subheaderSize, nbsheadersize = top_3_font_sizes
    print(max_size)
    toc_text_match = []
    toc_matches = []
    # Bug fix: iterate over a snapshot — the original looped over `headers`
    # while calling headers.remove(h), which silently skipped the element
    # following every removal.
    for h in list(headers):
        norm_text = h['norm_text']
        matching_toc_texts = []

        # Check exact matches and substring matches in both directions.
        for toc_norm, toc_original in toc_entries.items():
            if norm_text == toc_norm and len(toc_original) > 4 and h['size'] == max_size:
                matching_toc_texts.append(toc_original)
            elif norm_text in toc_norm and len(toc_original) > 4 and h['size'] == max_size:
                matching_toc_texts.append(toc_original)
            elif toc_norm in norm_text and len(toc_original) > 4 and h['size'] == max_size:
                matching_toc_texts.append(toc_original)

        if matching_toc_texts and h['size'] >= max_size * 0.9:
            # Prefer the longest TOC entry that overlaps this header the most.
            best_match = max(matching_toc_texts,
                             key=lambda x: (len(x), -len(x.replace(norm_text, ''))))
            h['text'] = normalize_text(clean_toc_entry(best_match))
            h['level'] = 0
            if h['text'] not in toc_text_match:
                toc_matches.append(h)
                toc_text_match.append(h['text'])
        elif matching_toc_texts and h['size'] < max_size * 0.9 and h['size'] > nbsheadersize:
            # Mid-sized TOC echoes of real headers are treated as noise.
            print(h['text'], matching_toc_texts)
            headers.remove(h)
            continue

    # Keep only the first occurrence of each level-0 header.
    unique_level0 = []
    seen_level0 = set()
    for h in toc_matches:
        cleaned_text = clean_toc_entry(h['text'])
        norm_cleaned_text = normalize(cleaned_text)

        if norm_cleaned_text not in seen_level0:
            seen_level0.add(norm_cleaned_text)
            h['text'] = cleaned_text
            unique_level0.append(h)
            print(f"Added unique header: {cleaned_text} (normalized: {norm_cleaned_text})")

    # Step 3: Within each level-0 group, use the first unassigned header's
    # formatting as the reference "level 1" style.
    level0_headers = [h for h in headers if h['level'] == 0]
    header_groups = []

    for i, level0 in enumerate(level0_headers):
        start_idx = headers.index(level0)
        end_idx = headers.index(level0_headers[i + 1]) if i + 1 < len(level0_headers) else len(headers)
        header_groups.append(headers[start_idx:end_idx])

    for group in header_groups:
        level1_candidates = [h for h in group[1:] if h['level'] == -1]

        if not level1_candidates:
            continue

        # The first candidate defines the reference level-1 format.
        first_level1 = level1_candidates[0]
        level1_format = {
            'font': first_level1['font'],
            'color': first_level1['color'],
            'starts_with_number': is_numbered(first_level1['text']),
            'size': first_level1['size'],
            'bold': first_level1['bold']
        }

        for h in level1_candidates:
            current_format = {
                'font': h['font'],
                'color': h['color'],
                'starts_with_number': is_numbered(h['text']),
                'size': h['size'],
                'bold': h['bold']
            }

            # Headers matching the reference format become level 1, others level 2.
            if (current_format['font'] == level1_format['font'] and
                    current_format['color'] == level1_format['color'] and
                    current_format['starts_with_number'] == level1_format['starts_with_number'] and
                    abs(current_format['size'] - level1_format['size']) <= 0.1 and
                    current_format['bold'] == level1_format['bold']):
                h['level'] = 1
            else:
                h['level'] = 2

    # Step 4: Cluster any still-unassigned headers by font size (10%
    # tolerance) and assign levels from largest to smallest.
    unassigned = [h for h in headers if h['level'] == -1]
    if unassigned:
        sizes = sorted({h['size'] for h in unassigned}, reverse=True)
        clusters = []

        for size in sizes:
            found_cluster = False
            for cluster in clusters:
                if abs(size - cluster['size']) <= max(size, cluster['size']) * 0.1:
                    cluster['headers'].extend([h for h in unassigned if abs(h['size'] - size) <= size * 0.1])
                    found_cluster = True
                    break
            if not found_cluster:
                clusters.append({
                    'size': size,
                    'headers': [h for h in unassigned if abs(h['size'] - size) <= size * 0.1]
                })

        clusters.sort(key=lambda x: -x['size'])
        for i, cluster in enumerate(clusters):
            for h in cluster['headers']:
                base_level = i + 1
                if h['bold']:
                    base_level = max(1, base_level - 1)  # bold promotes one level
                h['level'] = base_level

    # Step 5: Build the tree with a stack, keeping each level-0 header once.
    root = []
    stack = []

    unique_level0_texts = {h['norm_text'] for h in unique_level0}

    # Promote any remaining copies of the unique level-0 headers.
    filtered_headers = []
    for h in headers:
        if h['norm_text'] in unique_level0_texts and h not in unique_level0:
            h['level'] = 0
        filtered_headers.append(h)

    # Combine all headers - unique_level0 first, then the filtered headers.
    all_headers = unique_level0 + filtered_headers
    all_headers.sort(key=lambda h: (h['page'], h['y']))

    # Track which level-0 headers were already added.
    added_level0 = set()

    for header in all_headers:
        if header['level'] < 0:
            continue

        if header['level'] == 0:
            norm_text = header['norm_text']
            if norm_text in added_level0:
                continue
            added_level0.add(norm_text)

        # Pop the stack until a shallower header can act as parent.
        while stack and stack[-1]['level'] >= header['level']:
            stack.pop()

        current_parent = stack[-1] if stack else None

        if current_parent:
            current_parent['children'].append(header)
        else:
            root.append(header)

        stack.append(header)

    # Step 6: Enforce strictly increasing levels down every branch.
    def enforce_nesting(node_list, parent_level=-1):
        for node in node_list:
            if node['level'] <= parent_level:
                node['level'] = parent_level + 1
            enforce_nesting(node['children'], node['level'])

    enforce_nesting(root)
    # Drop childless level-0 headers (usually stray TOC echoes).
    root = [h for h in root if not (h['level'] == 0 and not h['children'])]
    return root
505
-
506
def adjust_levels_if_level0_not_in_toc(doc, toc_pages, root):
    """Demote every header one level when no level-0 text appears in the TOC.

    The text of all *toc_pages* is concatenated and normalized; if at least
    one level-0 header in *root* occurs in it, the hierarchy is left alone.
    Otherwise every node's level is increased by one, in place.
    """
    def normalize(text):
        return re.sub(r'\s+', ' ', text.strip().lower())

    toc_text = ""
    for pno in toc_pages:
        page = doc.load_page(pno)
        toc_text += page.get_text()
    toc_text_normalized = normalize(toc_text)

    def is_level0_in_toc_text(header):
        return header['level'] == 0 and normalize(header['text']) in toc_text_normalized

    if any(is_level0_in_toc_text(h) for h in root):
        return  # At least one level-0 header is confirmed by the TOC; no change needed.

    def increase_levels(node_list):
        for node in node_list:
            node['level'] += 1
            increase_levels(node['children'])

    # Bug fix: the original defined increase_levels but never invoked it,
    # making this function a no-op; apply the adjustment it is named for.
    increase_levels(root)
526
-
527
def assign_numbers_to_headers(headers, prefix=None):
    """Assign hierarchical outline numbers ("1", "1.2", ...) to every header in place."""
    for position, header in enumerate(headers, start=1):
        number = f"{prefix}.{position}" if prefix else str(position)
        header["number"] = number
        assign_numbers_to_headers(header["children"], number)
532
-
533
def print_tree_with_numbers(headers, indent=0):
    """Pretty-print the numbered header tree, one line per header, children indented."""
    for node in headers:
        if 'original_size' in node:
            size_info = f"size:{node['original_size']:.1f}"
        else:
            size_info = ""
        number = node.get('number', '?')
        print(" " * indent +
              f"{number} {node['text']} " +
              f"(Level {node['level']}, p:{node['page']+1}, {size_info})")
        print_tree_with_numbers(node["children"], indent + 1)
540
-
541
-
542
def highlight_boxes(doc, highlights, stringtowrite, fixed_width=500): # Set your desired width here
    """Annotate pages with a fixed-width highlight box plus a billing label.

    Args:
        doc: open fitz (PyMuPDF) document, annotated in place.
        highlights: mapping of page number -> bbox [x0, y0, x1, y1].
        stringtowrite: label text; values starting with 'Not' (e.g.
            'Not to be billed') get a grey box, everything else yellow.
        fixed_width: width in points of the drawn box, centered on the page.
    """
    for page_num, bbox in highlights.items():
        page = doc.load_page(page_num)
        page_width = page.rect.width

        # Get original rect for vertical coordinates
        orig_rect = fitz.Rect(bbox)
        rect_height = orig_rect.height
        # Only annotate regions tall/wide enough to be a real section body.
        if rect_height > 30:
            if orig_rect.width > 10:
                # Center horizontally using fixed width
                center_x = page_width / 2
                new_x0 = center_x - fixed_width / 2
                new_x1 = center_x + fixed_width / 2
                new_rect = fitz.Rect(new_x0, orig_rect.y0, new_x1, orig_rect.y1)

                # Add highlight rectangle
                annot = page.add_rect_annot(new_rect)
                if stringtowrite.startswith('Not'):
                    # Grey for "Not to be billed" sections.
                    annot.set_colors(stroke=(0.5, 0.5, 0.5), fill=(0.5, 0.5, 0.5))
                else:
                    # Yellow for billable sections.
                    annot.set_colors(stroke=(1, 1, 0), fill=(1, 1, 0))

                annot.set_opacity(0.3)
                annot.update()

                # Add right-aligned freetext annotation inside the fixed-width box
                text = '['+stringtowrite +']'
                annot1 = page.add_freetext_annot(
                    new_rect,
                    text,
                    fontsize=15,
                    fontname='helv',
                    text_color=(1, 0, 0),
                    rotate=page.rotation,
                    align=2 # right alignment
                )
                annot1.update()
580
-
581
def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
    """Collect (header, path) pairs for every leaf header deeper than level 1.

    *path* is the list of ancestor texts leading to each node; *output*
    accumulates results across the recursion and is returned.
    """
    path = [] if path is None else path
    output = [] if output is None else output
    for node in listtoloop:
        trail = path + [node['text']]
        children = node['children']
        if children:
            get_leaf_headers_with_paths(children, trail, output)
        elif node['level'] not in (0, 1):
            output.append((node, trail))
    return output
594
-
595
- # Add this helper function at the top of your code
596
def words_match_ratio(text1, text2):
    """Fraction of text1's distinct words that also occur in text2 (0.0 if either is empty)."""
    first_words = set(text1.split())
    second_words = set(text2.split())
    if not first_words or not second_words:
        return 0.0
    shared = first_words & second_words
    return len(shared) / len(first_words)
603
-
604
def same_start_word(s1, s2):
    """True when both strings have words and share the same first word, case-insensitively."""
    first_words = s1.strip().split()
    second_words = s2.strip().split()
    if not first_words or not second_words:
        return False
    return first_words[0].lower() == second_words[0].lower()
613
-
614
 
615
- def extract_section_under_header(pdf_path):
616
- top_margin = 70
617
- bottom_margin = 50
618
- headertoContinue1 = False
619
- headertoContinue2=False
 
 
620
 
621
- parsed_url = urlparse(pdf_path)
622
- filename = os.path.basename(parsed_url.path)
623
- filename = unquote(filename) # decode URL-encoded characters
624
-
625
- # Optimized URL handling
626
- if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
627
- pdf_path = pdf_path.replace('dl=0', 'dl=1')
628
-
629
- # Cache frequently used values
630
- response = requests.get(pdf_path)
631
- pdf_content = BytesIO(response.content)
632
- if not pdf_content:
633
- raise ValueError("No valid PDF content found.")
634
-
635
- doc = fitz.open(stream=pdf_content, filetype="pdf")
636
- docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
637
- most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
638
-
639
- # Precompute regex patterns
640
- dot_pattern = re.compile(r'\.{3,}')
641
- url_pattern = re.compile(r'https?://\S+|www\.\S+')
642
-
643
- def get_toc_page_numbers(doc, max_pages_to_check=15):
644
- toc_pages = []
645
- for page_num in range(min(len(doc), max_pages_to_check)):
646
- page = doc.load_page(page_num)
647
- blocks = page.get_text("dict")["blocks"]
648
-
649
- dot_line_count = 0
650
- for block in blocks:
651
- for line in block.get("lines", []):
652
- line_text = get_spaced_text_from_spans(line["spans"]).strip()
653
- if dot_pattern.search(line_text):
654
- dot_line_count += 1
655
-
656
- if dot_line_count >= 3:
657
- toc_pages.append(page_num)
658
-
659
- return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
660
-
661
- toc_pages = get_toc_page_numbers(doc)
662
-
663
- headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
664
- doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
665
  )
666
 
667
- hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
668
- listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
669
- print('listofHeaderstoMarkup',listofHeaderstoMarkup)
670
- # Precompute all children headers once
671
- allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
672
- allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
673
-
674
- df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"])
675
- dictionaryNBS={}
676
- data_list_JSON = []
677
-
678
- if len(top_3_font_sizes)==3:
679
- mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes
680
- elif len(top_3_font_sizes)==2:
681
- mainHeaderFontSize= top_3_font_sizes[0]
682
- subHeaderFontSize= top_3_font_sizes[1]
683
- subsubheaderFontSize= top_3_font_sizes[1]
684
-
685
- print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
686
-
687
- # Preload all pages to avoid repeated loading
688
- # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
689
-
690
- for heading_to_searchDict, paths in listofHeaderstoMarkup:
691
- heading_to_search = heading_to_searchDict['text']
692
- heading_to_searchPageNum = heading_to_searchDict['page']
693
-
694
- print('headertosearch', heading_to_search)
695
-
696
- # Initialize variables
697
- headertoContinue1 = False
698
- headertoContinue2 = False
699
- matched_header_line = None
700
- done = False
701
- collecting = False
702
- collected_lines = []
703
- page_highlights = {}
704
- current_bbox = {}
705
- last_y1s = {}
706
- mainHeader = ''
707
- subHeader = ''
708
- matched_header_line_norm = heading_to_search
709
- break_collecting = False
710
- heading_norm = normalize_text(heading_to_search)
711
- paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
712
-
713
- for page_num in range(heading_to_searchPageNum,len(doc)):
714
- if page_num in toc_pages:
715
- continue
716
- if break_collecting:
717
- break
718
- page=doc[page_num]
719
- page_height = page.rect.height
720
- blocks = page.get_text("dict")["blocks"]
721
-
722
- for block in blocks:
723
- if break_collecting:
724
- break
725
-
726
- lines = block.get("lines", [])
727
- i = 0
728
- while i < len(lines):
729
- if break_collecting:
730
- break
731
-
732
- spans = lines[i].get("spans", [])
733
- if not spans:
734
- i += 1
735
- continue
736
-
737
- y0 = spans[0]["bbox"][1]
738
- y1 = spans[0]["bbox"][3]
739
- if y0 < top_margin or y1 > (page_height - bottom_margin):
740
- i += 1
741
- continue
742
-
743
- line_text = get_spaced_text_from_spans(spans).lower()
744
- line_text_norm = normalize_text(line_text)
745
-
746
- # Combine with next line if available
747
- if i + 1 < len(lines):
748
- next_spans = lines[i + 1].get("spans", [])
749
- next_line_text = get_spaced_text_from_spans(next_spans).lower()
750
- combined_line_norm = normalize_text(line_text + " " + next_line_text)
751
- else:
752
- combined_line_norm = line_text_norm
753
-
754
- # Check if we should continue processing
755
- if combined_line_norm and combined_line_norm in paths[0]:
756
- print(combined_line_norm)
757
- headertoContinue1 = combined_line_norm
758
- if combined_line_norm and combined_line_norm in paths[-2]:
759
- print(combined_line_norm)
760
- headertoContinue2 = combined_line_norm
761
- if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
762
- stringtowrite='Not to be billed'
763
- else:
764
- stringtowrite='To be billed'
765
- # Optimized header matching
766
- existsfull = (
767
- ( combined_line_norm in allchildrenheaders_set or
768
- combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm
769
- )
770
-
771
- # New word-based matching
772
- current_line_words = set(combined_line_norm.split())
773
- heading_words = set(heading_norm.split())
774
- all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0
775
-
776
- substring_match = (
777
- heading_norm in combined_line_norm or
778
- combined_line_norm in heading_norm or
779
- all_words_match # Include the new word-based matching
780
- )
781
- # substring_match = (
782
- # heading_norm in combined_line_norm or
783
- # combined_line_norm in heading_norm
784
- # )
785
-
786
- if (substring_match and existsfull and not collecting and
787
- len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ):
788
-
789
- # Check header conditions more efficiently
790
- header_spans = [
791
- span for span in spans
792
- if (is_header(span, most_common_font_size, most_common_color, most_common_font)
793
- # and span['size'] >= subsubheaderFontSize
794
- and span['size'] < mainHeaderFontSize)
795
- ]
796
- if header_spans:
797
- collecting = True
798
- matched_header_font_size = max(span["size"] for span in header_spans)
799
- print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
800
-
801
- collected_lines.append(line_text)
802
- valid_spans = [span for span in spans if span.get("bbox")]
803
-
804
- if valid_spans:
805
- x0s = [span["bbox"][0] for span in valid_spans]
806
- x1s = [span["bbox"][2] for span in valid_spans]
807
- y0s = [span["bbox"][1] for span in valid_spans]
808
- y1s = [span["bbox"][3] for span in valid_spans]
809
-
810
- header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
811
-
812
- if page_num in current_bbox:
813
- cb = current_bbox[page_num]
814
- current_bbox[page_num] = [
815
- min(cb[0], header_bbox[0]),
816
- min(cb[1], header_bbox[1]),
817
- max(cb[2], header_bbox[2]),
818
- max(cb[3], header_bbox[3])
819
- ]
820
- else:
821
- current_bbox[page_num] = header_bbox
822
- last_y1s[page_num] = header_bbox[3]
823
- x0, y0, x1, y1 = header_bbox
824
-
825
- zoom = 200
826
- left = int(x0)
827
- top = int(y0)
828
- zoom_str = f"{zoom},{left},{top}"
829
- pageNumberFound = page_num + 1
830
-
831
- # Build the query parameters
832
- params = {
833
- 'pdfLink': pdf_path, # Your PDF link
834
- 'keyword': heading_to_search, # Your keyword (could be a string or list)
835
- }
836
-
837
- # URL encode each parameter
838
- encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
839
-
840
- # Construct the final encoded link
841
- encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
842
-
843
- # Correctly construct the final URL with page and zoom
844
- final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
845
-
846
- # Get current date and time
847
- now = datetime.now()
848
-
849
- # Format the output
850
- formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
851
- # Optionally, add the URL to a DataFrame
852
-
853
-
854
- data_entry = {
855
- "NBSLink": final_url,
856
- "Subject": heading_to_search,
857
- "Page": str(pageNumberFound),
858
- "Author": "ADR",
859
- "Creation Date": formatted_time,
860
- "Layer": "Initial",
861
- "Code": stringtowrite,
862
- "head above 1": paths[-2],
863
- "head above 2": paths[0],
864
- "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
865
- }
866
- data_list_JSON.append(data_entry)
867
-
868
- # Convert list to JSON
869
- json_output = json.dumps(data_list_JSON, indent=4)
870
-
871
- print("Final URL:", final_url)
872
- i += 2
873
- continue
874
- else:
875
- if (substring_match and not collecting and
876
- len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ):
877
-
878
- # Calculate word match percentage
879
- word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100
880
-
881
- # Check if at least 70% of header words exist in this line
882
- meets_word_threshold = word_match_percent >= 100
883
-
884
- # Check header conditions (including word threshold)
885
- header_spans = [
886
- span for span in spans
887
- if (is_header(span, most_common_font_size, most_common_color, most_common_font)
888
- # and span['size'] >= subsubheaderFontSize
889
- and span['size'] < mainHeaderFontSize)
890
- ]
891
-
892
- if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
893
- collecting = True
894
- matched_header_font_size = max(span["size"] for span in header_spans)
895
- print(f"📥 Start collecting after header: {combined_line_norm} "
896
- f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
897
-
898
- collected_lines.append(line_text)
899
- valid_spans = [span for span in spans if span.get("bbox")]
900
-
901
- if valid_spans:
902
- x0s = [span["bbox"][0] for span in valid_spans]
903
- x1s = [span["bbox"][2] for span in valid_spans]
904
- y0s = [span["bbox"][1] for span in valid_spans]
905
- y1s = [span["bbox"][3] for span in valid_spans]
906
-
907
- header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
908
-
909
- if page_num in current_bbox:
910
- cb = current_bbox[page_num]
911
- current_bbox[page_num] = [
912
- min(cb[0], header_bbox[0]),
913
- min(cb[1], header_bbox[1]),
914
- max(cb[2], header_bbox[2]),
915
- max(cb[3], header_bbox[3])
916
- ]
917
- else:
918
- current_bbox[page_num] = header_bbox
919
-
920
- last_y1s[page_num] = header_bbox[3]
921
- x0, y0, x1, y1 = header_bbox
922
- zoom = 200
923
- left = int(x0)
924
- top = int(y0)
925
- zoom_str = f"{zoom},{left},{top}"
926
- pageNumberFound = page_num + 1
927
-
928
- # Build the query parameters
929
- params = {
930
- 'pdfLink': pdf_path, # Your PDF link
931
- 'keyword': heading_to_search, # Your keyword (could be a string or list)
932
- }
933
-
934
- # URL encode each parameter
935
- encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
936
-
937
- # Construct the final encoded link
938
- encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
939
-
940
- # Correctly construct the final URL with page and zoom
941
- final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
942
-
943
- # Get current date and time
944
- now = datetime.now()
945
-
946
- # Format the output
947
- formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
948
- # Optionally, add the URL to a DataFrame
949
-
950
-
951
- data_entry = {
952
- "NBSLink": final_url,
953
- "Subject": heading_to_search,
954
- "Page": str(pageNumberFound),
955
- "Author": "ADR",
956
- "Creation Date": formatted_time,
957
- "Layer": "Initial",
958
- "Code": stringtowrite,
959
- "head above 1": paths[-2],
960
- "head above 2": paths[0],
961
- "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
962
- }
963
- data_list_JSON.append(data_entry)
964
-
965
- # Convert list to JSON
966
- json_output = json.dumps(data_list_JSON, indent=4)
967
-
968
- print("Final URL:", final_url)
969
- i += 2
970
- continue
971
- if collecting:
972
- norm_line = normalize_text(line_text)
973
-
974
- # Optimized URL check
975
- if url_pattern.match(norm_line):
976
- line_is_header = False
977
- else:
978
- line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans)
979
-
980
- if line_is_header:
981
- header_font_size = max(span["size"] for span in spans)
982
- is_probably_real_header = (
983
- header_font_size >= matched_header_font_size and
984
- is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
985
- len(line_text.strip()) > 2
986
- )
987
-
988
- if (norm_line != matched_header_line_norm and
989
- norm_line != heading_norm and
990
- is_probably_real_header):
991
- if line_text not in heading_norm:
992
- print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
993
- collecting = False
994
- done = True
995
- headertoContinue1 = False
996
- headertoContinue2=False
997
- for page_num, bbox in current_bbox.items():
998
- bbox[3] = last_y1s.get(page_num, bbox[3])
999
- page_highlights[page_num] = bbox
1000
- highlight_boxes(docHighlights, page_highlights,stringtowrite)
1001
-
1002
- break_collecting = True
1003
- break
1004
-
1005
- if break_collecting:
1006
- break
1007
-
1008
- collected_lines.append(line_text)
1009
- valid_spans = [span for span in spans if span.get("bbox")]
1010
- if valid_spans:
1011
- x0s = [span["bbox"][0] for span in valid_spans]
1012
- x1s = [span["bbox"][2] for span in valid_spans]
1013
- y0s = [span["bbox"][1] for span in valid_spans]
1014
- y1s = [span["bbox"][3] for span in valid_spans]
1015
-
1016
- line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
1017
-
1018
- if page_num in current_bbox:
1019
- cb = current_bbox[page_num]
1020
- current_bbox[page_num] = [
1021
- min(cb[0], line_bbox[0]),
1022
- min(cb[1], line_bbox[1]),
1023
- max(cb[2], line_bbox[2]),
1024
- max(cb[3], line_bbox[3])
1025
- ]
1026
- else:
1027
- current_bbox[page_num] = line_bbox
1028
-
1029
- last_y1s[page_num] = line_bbox[3]
1030
- i += 1
1031
-
1032
- if not done:
1033
- for page_num, bbox in current_bbox.items():
1034
- bbox[3] = last_y1s.get(page_num, bbox[3])
1035
- page_highlights[page_num] = bbox
1036
- if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
1037
- stringtowrite='Not to be billed'
1038
- else:
1039
- stringtowrite='To be billed'
1040
- highlight_boxes(docHighlights, page_highlights,stringtowrite)
1041
-
1042
- # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)
1043
-
1044
- pdf_bytes = BytesIO()
1045
- docHighlights.save(pdf_bytes)
1046
- print('JSONN',json_output)
1047
- return pdf_bytes.getvalue(), docHighlights , json_output
1048
-
1049
-
1050
-
1051
-
1052
def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incomingheader=0):
    """Re-highlight the section(s) under the requested heading(s) and rewrite their links.

    Variant of extract_section_under_header used by the /view-highlight route:
    instead of building a fresh JSON summary it updates each incoming heading
    dict's 'NBSLink' to point at the `newlink` viewer and returns the
    highlighted PDF.

    Args:
        pdf_path: URL of the source PDF (Dropbox share links are rewritten to
            direct-download form).
        headingjson: either a single heading string, or a list of dicts with at
            least 'Subject' and 'Page' keys ('head above 1' when present
            overrides `incomingheader`).
        pagenum: 0-based page to start searching from when `headingjson` is a
            plain string.
        incomingheader: parent-heading text used to decide the billed /
            not-billed annotation. NOTE(review): the default of 0 is not a
            string, so the `'installation' in incomingheader` checks below
            would raise TypeError if no dict supplies 'head above 1' — confirm
            callers always pass a string.

    Returns:
        (pdf_bytes, docHighlights, newjsonList): raw bytes of the highlighted
        PDF, the open PyMuPDF document with annotations, and the heading dicts
        with updated 'NBSLink' values.
    """
    top_margin = 70      # ignore text above this y (page header region)
    bottom_margin = 50   # ignore text below page_height - this (footer region)
    # Optimized URL handling
    if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
        pdf_path = pdf_path.replace('dl=0', 'dl=1')  # force direct download on Dropbox links

    # Cache frequently used values
    response = requests.get(pdf_path)
    pdf_content = BytesIO(response.content)
    if not pdf_content:  # NOTE(review): BytesIO objects are always truthy, so this guard never fires
        raise ValueError("No valid PDF content found.")

    # Two copies of the document: `doc` is scanned, `docHighlights` receives annotations.
    doc = fitz.open(stream=pdf_content, filetype="pdf")
    docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
    most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)

    # Precompute regex patterns
    dot_pattern = re.compile(r'\.{3,}')            # dotted leader lines typical of TOC rows
    url_pattern = re.compile(r'https?://\S+|www\.\S+')

    def get_toc_page_numbers(doc, max_pages_to_check=15):
        # A page containing >= 3 dotted-leader lines is treated as a TOC page;
        # every page up to the last such page is skipped during the search.
        toc_pages = []
        for page_num in range(min(len(doc), max_pages_to_check)):
            page = doc.load_page(page_num)
            blocks = page.get_text("dict")["blocks"]

            dot_line_count = 0
            for block in blocks:
                for line in block.get("lines", []):
                    line_text = get_spaced_text_from_spans(line["spans"]).strip()
                    if dot_pattern.search(line_text):
                        dot_line_count += 1

            if dot_line_count >= 3:
                toc_pages.append(page_num)

        return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages

    toc_pages = get_toc_page_numbers(doc)

    headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
        doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
    )

    # Normalise the incoming heading spec to a list of subject strings.
    listofheadingsfromrawan=[]
    if type(headingjson) == str:
        listofheadingsfromrawan.append(headingjson)
        headingjson=[headingjson]
    else:
        for item in headingjson:
            listofheadingsfromrawan.append(normalize_text(item['Subject']))
    print('hereeeeeeeeeeeeeee0',listofheadingsfromrawan)
    # Precompute all children headers once
    allchildrenheaders = listofheadingsfromrawan
    print('hereeeeeeeeeeeeeee00',allchildrenheaders)
    allchildrenheaders_set = set(allchildrenheaders)  # For faster lookups

    # NOTE(review): df and data_list_JSON are never used in this variant.
    df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"])
    data_list_JSON = []

    # Unpack up to three dominant header font sizes detected by extract_headers.
    if len(top_3_font_sizes)==3:
        mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes
    elif len(top_3_font_sizes)==2:
        mainHeaderFontSize= top_3_font_sizes[0]
        subHeaderFontSize= top_3_font_sizes[1]
        subsubheaderFontSize= top_3_font_sizes[1]

    print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)

    newjsonList=[]
    for heading_to_searchDict in headingjson:
        # Each entry may be a bare subject string or a dict from the summary JSON.
        if type(heading_to_searchDict) == str:
            heading_to_search = heading_to_searchDict
            heading_to_searchPageNum = pagenum
        else:
            heading_to_search = heading_to_searchDict['Subject']
            heading_to_searchPageNum = int(heading_to_searchDict['Page'])-1  # JSON pages are 1-based
            incomingheader = heading_to_searchDict['head above 1']

        print('hereeeeeeeeeeeeeee0',heading_to_searchPageNum)
        done = False                 # True once a closing header ended the section
        collecting = False           # True while lines belong to the target section
        collected_lines = []
        page_highlights = {}         # page -> final bbox passed to highlight_boxes
        current_bbox = {}            # page -> running union bbox of collected lines
        last_y1s = {}                # page -> bottom y of the last collected line
        mainHeader = ''
        subHeader = ''
        matched_header_line_norm = heading_to_search
        break_collecting = False
        heading_norm = normalize_text(heading_to_search)

        for page_num in range(heading_to_searchPageNum,len(doc)):
            print('hereeeeeeeeeeeeeee1')
            if page_num in toc_pages:
                continue
            if break_collecting:
                break
            page=doc[page_num]
            page_height = page.rect.height
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                if break_collecting:
                    break

                lines = block.get("lines", [])
                i = 0
                while i < len(lines):
                    if break_collecting:
                        break

                    spans = lines[i].get("spans", [])
                    if not spans:
                        i += 1
                        continue

                    # Skip lines that fall inside the header/footer margins.
                    y0 = spans[0]["bbox"][1]
                    y1 = spans[0]["bbox"][3]
                    if y0 < top_margin or y1 > (page_height - bottom_margin):
                        i += 1
                        continue

                    line_text = get_spaced_text_from_spans(spans).lower()
                    line_text_norm = normalize_text(line_text)

                    # Combine with next line if available
                    if i + 1 < len(lines):
                        next_spans = lines[i + 1].get("spans", [])
                        next_line_text = get_spaced_text_from_spans(next_spans).lower()
                        combined_line_norm = normalize_text(line_text + " " + next_line_text)
                    else:
                        combined_line_norm = line_text_norm
                    # Optimized header matching
                    existsfull = (
                        ( combined_line_norm in allchildrenheaders_set or
                         combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm
                    )

                    # New word-based matching
                    current_line_words = set(combined_line_norm.split())
                    heading_words = set(heading_norm.split())
                    all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0

                    substring_match = (
                        heading_norm in combined_line_norm or
                        combined_line_norm in heading_norm or
                        all_words_match  # Include the new word-based matching
                    )

                    if (substring_match and existsfull and not collecting and
                        len(combined_line_norm) > 0 ):  # and (headertoContinue1 or headertoContinue2) ):

                        # Check header conditions more efficiently
                        header_spans = [
                            span for span in spans
                            if (is_header(span, most_common_font_size, most_common_color, most_common_font)
                                # and span['size'] >= subsubheaderFontSize
                                and span['size'] < mainHeaderFontSize)
                        ]
                        if header_spans:
                            collecting = True
                            matched_header_font_size = max(span["size"] for span in header_spans)
                            print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")

                            collected_lines.append(line_text)
                            valid_spans = [span for span in spans if span.get("bbox")]

                            if valid_spans:
                                x0s = [span["bbox"][0] for span in valid_spans]
                                x1s = [span["bbox"][2] for span in valid_spans]
                                y0s = [span["bbox"][1] for span in valid_spans]
                                y1s = [span["bbox"][3] for span in valid_spans]

                                header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]

                                # Merge this line's bbox into the page's running union.
                                if page_num in current_bbox:
                                    cb = current_bbox[page_num]
                                    current_bbox[page_num] = [
                                        min(cb[0], header_bbox[0]),
                                        min(cb[1], header_bbox[1]),
                                        max(cb[2], header_bbox[2]),
                                        max(cb[3], header_bbox[3])
                                    ]
                                else:
                                    current_bbox[page_num] = header_bbox
                                last_y1s[page_num] = header_bbox[3]
                                x0, y0, x1, y1 = header_bbox

                                zoom = 200
                                left = int(x0)
                                top = int(y0)
                                zoom_str = f"{zoom},{left},{top}"
                                pageNumberFound = page_num + 1

                                # Build the query parameters
                                params = {
                                    'pdfLink': pdf_path,  # Your PDF link
                                    'keyword': heading_to_search,  # Your keyword (could be a string or list)
                                }

                                # URL encode each parameter
                                encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}

                                # Construct the final encoded link
                                encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])

                                # Correctly construct the final URL with page and zoom
                                final_url = f"{newlink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"

                                # Get current date and time
                                now = datetime.now()

                                # Format the output
                                formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
                                # Optionally, add the URL to a DataFrame
                                new_url= final_url
                                if type(heading_to_searchDict) != str:
                                    heading_to_searchDict['NBSLink']=new_url
                                    newjsonList.append(heading_to_searchDict)
                                print("Final URL:", final_url)
                            i += 2  # skip the lookahead line merged into combined_line_norm
                            continue
                    else:
                        if (substring_match and not collecting and
                            len(combined_line_norm) > 0):  # and (headertoContinue1 or headertoContinue2) ):

                            # Calculate word match percentage
                            word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100

                            # Check if at least 70% of header words exist in this line
                            # NOTE(review): threshold is actually 100%, not 70% as the comment says.
                            meets_word_threshold = word_match_percent >= 100

                            # Check header conditions (including word threshold)
                            header_spans = [
                                span for span in spans
                                if (is_header(span, most_common_font_size, most_common_color, most_common_font)
                                    # and span['size'] >= subsubheaderFontSize
                                    and span['size'] < mainHeaderFontSize)
                            ]

                            if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
                                collecting = True
                                matched_header_font_size = max(span["size"] for span in header_spans)
                                print(f"📥 Start collecting after header: {combined_line_norm} "
                                      f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")

                                collected_lines.append(line_text)
                                valid_spans = [span for span in spans if span.get("bbox")]

                                if valid_spans:
                                    x0s = [span["bbox"][0] for span in valid_spans]
                                    x1s = [span["bbox"][2] for span in valid_spans]
                                    y0s = [span["bbox"][1] for span in valid_spans]
                                    y1s = [span["bbox"][3] for span in valid_spans]

                                    header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]

                                    if page_num in current_bbox:
                                        cb = current_bbox[page_num]
                                        current_bbox[page_num] = [
                                            min(cb[0], header_bbox[0]),
                                            min(cb[1], header_bbox[1]),
                                            max(cb[2], header_bbox[2]),
                                            max(cb[3], header_bbox[3])
                                        ]
                                    else:
                                        current_bbox[page_num] = header_bbox

                                    last_y1s[page_num] = header_bbox[3]
                                    x0, y0, x1, y1 = header_bbox
                                    zoom = 200
                                    left = int(x0)
                                    top = int(y0)
                                    zoom_str = f"{zoom},{left},{top}"
                                    pageNumberFound = page_num + 1

                                    # Build the query parameters
                                    params = {
                                        'pdfLink': pdf_path,  # Your PDF link
                                        'keyword': heading_to_search,  # Your keyword (could be a string or list)
                                    }

                                    # URL encode each parameter
                                    encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}

                                    # Construct the final encoded link
                                    encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])

                                    # Correctly construct the final URL with page and zoom
                                    final_url = f"{newlink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
                                    new_url= final_url
                                    if type(heading_to_searchDict) != str:
                                        heading_to_searchDict['NBSLink']=new_url
                                        newjsonList.append(heading_to_searchDict)
                                    print("Final URL:", final_url)
                                i += 2
                                continue
                    if collecting:
                        norm_line = normalize_text(line_text)

                        # Optimized URL check: URLs are never treated as section headers.
                        if url_pattern.match(norm_line):
                            line_is_header = False
                        else:
                            line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans)

                        if line_is_header:
                            header_font_size = max(span["size"] for span in spans)
                            is_probably_real_header = (
                                header_font_size >= matched_header_font_size and
                                is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
                                len(line_text.strip()) > 2
                            )

                            # A new header of equal/larger font ends the section.
                            if (norm_line != matched_header_line_norm and
                                norm_line != heading_norm and
                                is_probably_real_header):
                                if line_text not in heading_norm:
                                    print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
                                    collecting = False
                                    done = True
                                    headertoContinue1 = False
                                    headertoContinue2=False
                                    for page_num, bbox in current_bbox.items():
                                        bbox[3] = last_y1s.get(page_num, bbox[3])
                                        page_highlights[page_num] = bbox

                                    # NOTE(review): unlike the sibling function, no .lower() is
                                    # applied to incomingheader here — confirm casing of callers.
                                    if 'installation' in incomingheader or 'execution' in incomingheader or 'miscellaneous items' in incomingheader :
                                        stringtowrite='Not to be billed'
                                    else:
                                        stringtowrite='To be billed'
                                    highlight_boxes(docHighlights, page_highlights,stringtowrite)

                                    break_collecting = True
                                    break

                        if break_collecting:
                            break

                        # Regular body line inside the section: record it and grow the bbox.
                        collected_lines.append(line_text)
                        valid_spans = [span for span in spans if span.get("bbox")]
                        if valid_spans:
                            x0s = [span["bbox"][0] for span in valid_spans]
                            x1s = [span["bbox"][2] for span in valid_spans]
                            y0s = [span["bbox"][1] for span in valid_spans]
                            y1s = [span["bbox"][3] for span in valid_spans]

                            line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]

                            if page_num in current_bbox:
                                cb = current_bbox[page_num]
                                current_bbox[page_num] = [
                                    min(cb[0], line_bbox[0]),
                                    min(cb[1], line_bbox[1]),
                                    max(cb[2], line_bbox[2]),
                                    max(cb[3], line_bbox[3])
                                ]
                            else:
                                current_bbox[page_num] = line_bbox

                            last_y1s[page_num] = line_bbox[3]
                    i += 1

        # Section ran to end of document without hitting a closing header:
        # flush whatever was collected.
        if not done:
            for page_num, bbox in current_bbox.items():
                bbox[3] = last_y1s.get(page_num, bbox[3])
                page_highlights[page_num] = bbox
            if 'installation' in incomingheader or 'execution' in incomingheader or 'miscellaneous items' in incomingheader :
                stringtowrite='Not to be billed'
            else:
                stringtowrite='To be billed'
            highlight_boxes(docHighlights, page_highlights,stringtowrite)

    # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)

    pdf_bytes = BytesIO()
    docHighlights.save(pdf_bytes)
    return pdf_bytes.getvalue(), docHighlights , newjsonList
 
 
 
1
import datetime
import json
import time
import urllib
import urllib.parse
from io import BytesIO
from threading import Thread
from urllib.parse import quote

import requests
from flask import (Flask, abort, jsonify, redirect, render_template, request,
                   send_file, url_for)

import Find_Hyperlinking_text
import findspecsv1
import InitialMarkups
import tsadropboxretrieval
14
app = Flask(__name__)

# Page number of the most recent text hit; used in the download filename of
# the PDFs served by the view routes.
pageNumTextFound = 0
# Public base URL of this deployment (Hugging Face Space).
BASE_URL = "https://findconsole-initialmarkups.hf.space"
# Simulate a backend readiness flag (replace with actual check if possible)
backend_ready = False
# @app.route("/")
# def thismain():
#     print('Home page loaded')
#     return render_template("gui.html")
24
+
25
@app.route("/keepaliveapii", methods=["GET", "POST"])
def keepaliveapi():
    """Health-check endpoint pinged externally to keep the Space awake."""
    try:
        print('Keepalive pinged')
    except Exception as error:
        # Defensive: report any unexpected failure as a JSON 500.
        print('Error in keepalive:', error)
        return jsonify(status="error", message=str(error)), 500
    return 'alivee'
33
+
34
+
35
+
36
@app.route("/")
def home():
    """Landing page: show a wake-up page until the backend is ready, then
    forward the original query string to the PDF viewer route."""
    global backend_ready
    # If backend not ready, show loading page
    if not backend_ready:
        return render_template("wake_and_redirect.html")
    # BUGFIX: Flask endpoints are keyed by the view *function* name, not the
    # URL rule — the '/view-pdf' rule is handled by download_pdf(), so
    # url_for("view_pdf") would raise BuildError. (redirect/url_for also had
    # to be added to the flask import.)
    return redirect(url_for("download_pdf", **request.args))
45
+ ################################################################################################################################################################
46
+ ################################################################################################################################################################
47
+ ##################### Main console ###########################################################################################################
48
+ ################################################################################################################################################################
49
+ ################################################################################################################################################################
50
+
51
@app.route('/view-pdf', methods=['GET'])
def download_pdf():
    """Serve the fully marked-up version of the PDF named by ?pdfLink=..."""
    # Parse the raw query string ourselves so percent-encoded links survive.
    raw_query = request.query_string.decode()
    query_params = urllib.parse.parse_qs(raw_query)
    raw_link = query_params.get('pdfLink', [None])[0]

    if not raw_link:
        return "Missing pdfLink parameter.", 400

    # Undo the URL-encoding applied when the link was built.
    target_link = urllib.parse.unquote(raw_link)
    print("Extracted PDF Link:", target_link)

    try:
        # First tuple element is the annotated PDF as raw bytes.
        annotated_bytes = InitialMarkups.extract_section_under_header(target_link)[0]
    except Exception as e:
        print("Error during PDF extraction:", e)
        return "PDF could not be processed.", 500

    # Reject empty results and anything that is not a real PDF stream.
    if annotated_bytes is None or not annotated_bytes.startswith(b"%PDF"):
        return "PDF content not found or broken.", 404

    return send_file(
        BytesIO(annotated_bytes),
        mimetype='application/pdf',
        as_attachment=False,
        download_name=f"annotated_page_{pageNumTextFound}.pdf"
    )
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
@app.route('/api/process-data', methods=['POST'])
def receive_pdf_data():
    """Run the full markup pass on a PDF and upload both outputs to Dropbox.

    The source link comes from finddata() (defined elsewhere in this module —
    TODO confirm it returns a single link string). Responds with JSON holding
    Dropbox links for the marked-up PDF and its markup-summary table.
    """
    # NOTE(review): pdf_content is declared global but never assigned here.
    global pdf_content, pageNumTextFound

    # Get PDF link and keyword from finddata()
    pdfLink = finddata()

    if not pdfLink :
        return jsonify({"error": "'pdfLink' must be provided."}), 400

    try:
        print(pdfLink)


        pdfbytes, pdf_document,tablepdfoutput= InitialMarkups.extract_section_under_header(pdfLink)
        dbxTeam= tsadropboxretrieval.ADR_Access_DropboxTeam('user')

        # Get metadata using the shared link
        metadata = dbxTeam.sharing_get_shared_link_metadata(pdfLink)
        dbPath='/TSA JOBS/ADR Test/FIND/'
        pdflink= tsadropboxretrieval.uploadanyFile(doc=pdf_document,path=dbPath,pdfname=metadata.name) #doc=doc,pdfname=path,pdfpath=pdfpath+'Measured Plan/
        print('LINKS0',pdflink)

        dbPath='/TSA JOBS/ADR Test/FIND/'
        # Summary table reuses the source name with a ' Markup Summary' suffix.
        tablepdfLink=tsadropboxretrieval.uploadanyFile(doc=tablepdfoutput,path=dbPath,pdfname=metadata.name.rsplit(".pdf", 1)[0] +' Markup Summary'+'.pdf')
        print(f"PDF successfully uploaded to Dropbox at")
        print('LINKS1',tablepdfLink)
        return jsonify({
            "message": "PDF processed successfully.",
            "PDF_MarkedUp": pdflink,
            'Table_PDF_Markup_Summary': tablepdfLink
        })

    except Exception as e:
        return jsonify({"error": str(e)}), 500
119
+ ################################################################################################################################################################
120
+ ################################################################################################################################################################
121
+ ##################### Not to billed not markuped up ###########################################################################################################
122
+ ################################################################################################################################################################
123
+ ################################################################################################################################################################
124
@app.route('/findapitobebilled', methods=['GET','POST'])
def findapitobebilled():
    """Run the 'to be billed only' markup pass and return its JSON summary."""
    global jsonoutput
    try:
        print('In process [Try]')
        payload = request.get_json()
        # Extracting values
        source_link = payload.get('filePath')
        _pdf_bytes, _pdf_document, summary_json = InitialMarkups.extract_section_under_header_tobebilledOnly(source_link)
        # Cache the summary so /view-highlight can look pages up later.
        jsonoutput = summary_json
        return jsonify(summary_json)
    except Exception as e:
        return jsonify({"error": str(e)}), 500
 
 
 
137
 
138
+
139
@app.route('/view-pdf-tobebilled', methods=['GET'])
def download_pdf_tobebilled():
    """Serve the 'to be billed only' marked-up PDF for ?pdfLink=..."""
    # Parse the raw query string ourselves so percent-encoded links survive.
    raw_query = request.query_string.decode()
    query_params = urllib.parse.parse_qs(raw_query)
    raw_link = query_params.get('pdfLink', [None])[0]

    if not raw_link:
        return "Missing pdfLink parameter.", 400

    # Undo the URL-encoding applied when the link was built.
    target_link = urllib.parse.unquote(raw_link)
    print("Extracted PDF Link:", target_link)

    try:
        # First tuple element is the annotated PDF as raw bytes.
        annotated_bytes = InitialMarkups.extract_section_under_header_tobebilledOnly(target_link)[0]
    except Exception as e:
        print("Error during PDF extraction:", e)
        return "PDF could not be processed.", 500

    # Reject empty results and anything that is not a real PDF stream.
    if annotated_bytes is None or not annotated_bytes.startswith(b"%PDF"):
        return "PDF content not found or broken.", 404

    return send_file(
        BytesIO(annotated_bytes),
        mimetype='application/pdf',
        as_attachment=False,
        download_name=f"annotated_page_{pageNumTextFound}.pdf"
    )
170
 
171
+ ################################################################################################################################################################
172
+ ################################################################################################################################################################
173
+ ##################### For final markups - view one highlight at a time - not used yet ###########################################################################################################
174
+ ################################################################################################################################################################
175
+ ################################################################################################################################################################
176
+
177
+
178
@app.route('/view-highlight', methods=['GET','POST'])
def download_pdfHighlight():
    """Serve a PDF with one heading's section highlighted.

    Looks the keyword up in the cached `jsonoutput` summary (populated by the
    /findapi* routes) to recover its page and parent heading, then re-runs the
    Rawan highlighter for just that heading.
    """
    # Manually parse the query parameters
    full_query_string = request.query_string.decode()  # Get raw query string
    parsed_params = urllib.parse.parse_qs(full_query_string)  # Parse it
    # Extract pdfLink and keyword manually
    pdf_link = parsed_params.get('pdfLink', [None])[0]
    keyword = parsed_params.get('keyword', [None])[0]
    if not pdf_link :
        return "Missing required parameters.", 400

    # Decode the extracted values
    pdf_link = urllib.parse.unquote(pdf_link)

    print("Extracted PDF Link:", pdf_link)
    print("Extracted Keywords:", keyword)
    createDF=False
    global jsonoutput
    matching_item = next((item for item in jsonoutput if item.get("Subject") == keyword), None)

    if matching_item:
        page_number = int(matching_item.get("Page"))-1  # summary pages are 1-based
        stringtowrite = matching_item.get("head above 1")
        print(f"Page number for '{keyword}': {page_number}")
    else:
        page_number=0
        # BUGFIX: stringtowrite was only assigned in the matched branch, so an
        # unknown keyword raised UnboundLocalError at the call below. Fall back
        # to an empty parent heading (treated as billable by the highlighter).
        stringtowrite = ''
        print("No match found.")
    pdf_content = InitialMarkups.extract_section_under_headerRawan(pdf_link,keyword,page_number,stringtowrite)[0]
    if pdf_content is None:
        return "PDF content not found.", 404

    pdf_bytes = BytesIO(pdf_content)
    return send_file(
        pdf_bytes,
        mimetype='application/pdf',
        as_attachment=False,
        download_name=f"annotated_page_{pageNumTextFound}.pdf"
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
 
219
 
220
@app.route('/findapiFilteredHeadings', methods=['GET', 'POST'])
def findapiFilteredHeadings():
    """Locate the supplied headings inside a PDF and return the match table.

    Expects a JSON request body with:
        filePath        -- URL of the PDF to analyse.
        listofheadings  -- list/JSON structure of heading entries to search for.

    Returns:
        The per-heading match table as JSON.  The table is also cached in the
        module-level ``jsonoutput`` so the highlight view can reuse it.

    On failure returns a JSON error payload: 400 for a missing ``filePath``,
    500 for anything raised during extraction.
    """
    try:
        print('In process [Try]')
        data = request.get_json()
        # Guard early: without this, a missing/empty body crashes deep inside
        # the PDF extractor and surfaces as an opaque 500.
        if not data or not data.get('filePath'):
            return jsonify({"error": "Missing required parameter 'filePath'."}), 400
        pdfLink = data.get('filePath')
        print(pdfLink)
        listofheadings = data.get('listofheadings')  # already JSON-decoded by get_json()
        print(listofheadings)
        pdfbytes, pdf_document, tablepdfoutput = InitialMarkups.extract_section_under_headerRawan(pdfLink, listofheadings)
        global jsonoutput
        # NOTE(review): module-level shared state; not safe if Flask serves
        # concurrent requests — consider a per-session store. TODO confirm.
        jsonoutput = tablepdfoutput
        return jsonify(tablepdfoutput)
    except Exception as e:
        # Top-level route boundary: report the failure as JSON, not an HTML 500.
        return jsonify({"error": str(e)}), 500
236
+
237
+
238
+
239
+ ################################################################################################################################################################
240
+ ################################################################################################################################################################
241
+ ##################### For Rawan - MC Connection ###########################################################################################################
242
+ ################################################################################################################################################################
243
+ ################################################################################################################################################################
244
+
245
@app.route('/findapi', methods=['GET', 'POST'])
def findapi():
    """Extract the full section/heading table from the PDF named in the request.

    Reads ``filePath`` (a PDF URL) from the JSON body, runs the markup
    extractor over it, caches the resulting table in the module-level
    ``jsonoutput`` for later highlight requests, and returns the table as
    JSON.  Any failure is reported as ``{"error": ...}`` with HTTP 500.
    """
    global jsonoutput
    try:
        print('In process [Try]')
        payload = request.get_json()
        pdf_url = payload.get('filePath')
        _, _, table = InitialMarkups.extract_section_under_header(pdf_url)
        jsonoutput = table
        return jsonify(table)
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500
258
+
259
+ ############################################# Testing #################################################
260
+
261
def finddata():
    """Return a hard-coded (PDF URL, keyword list) pair used for local testing."""
    sample_pdf = (
        'https://www.dropbox.com/scl/fi/hnp4mqigb51a5kp89kgfa/'
        '00801-ARC-20-ZZ-S-A-0002.pdf'
        '?rlkey=45abeoebzqw4qwnslnei6dkd6&st=m4yrcjm2&dl=1'
    )
    sample_keywords = ['115 INTEGRATED MRI ROOM LININGS', '310 ACCURACY']
    return sample_pdf, sample_keywords
265
 
266
+ ########################################### Running #####################################################
267
+ #_________________________________________________________________________________________________________________________
268
+ #_________________________________________________________________________________________________________________________
269
 
270
+ #_________________________________________________________________________________________________________________________
271
+ #_________________________________________________________________________________________________________________________
 
 
 
 
 
 
 
272
 
273
+ #_________________________________________________________________________________________________________________________
274
+ #_________________________________________________________________________________________________________________________
275
 
 
 
 
276
 
277
if __name__ == '__main__':
    # Bind on all interfaces so the server is reachable from outside the
    # container; presumably 7860 matches the hosting platform's expected
    # port (HuggingFace Spaces convention) — TODO confirm.
    app.run(host='0.0.0.0', port=7860)
279
+