Marthee committed on
Commit
10128bc
·
verified ·
1 Parent(s): e2ef6e1

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +89 -512
InitialMarkups.py CHANGED
@@ -6,12 +6,14 @@ Automatically generated by Colab.
6
  Original file is located at
7
  https://colab.research.google.com/drive/12XfVkmKmN3oVjHhLVE0_GgkftgArFEK2
8
  """
9
- baselink='https://findconsole-initialmarkups.hf.space/view-pdf?'
10
-
11
- newlink='https://findconsole-initialmarkups.hf.space/view-highlight?'
12
- tobebilledonlyLink='https://findconsole-initialmarkups.hf.space/view-pdf-tobebilled?'
 
 
 
13
 
14
- import tsadropboxretrieval
15
  from urllib.parse import urlparse, unquote
16
  import os
17
  from io import BytesIO
@@ -29,13 +31,11 @@ from datetime import datetime
29
  from collections import defaultdict, Counter
30
  import difflib
31
  from fuzzywuzzy import fuzz
 
 
32
 
33
- def filteredJsons(pdf_path,filteredjsonsfromrawan):
34
- # for heading in subjects:
35
- extract_section_under_headerRawan (pdf_path=pdf_path,listofheadingsfromrawan=filteredjsonsfromrawan)
36
-
37
 
38
-
39
  def changepdflinks(data_list_JSON, pdflink):
40
  print('henaaaa weee',data_list_JSON)
41
 
@@ -75,7 +75,6 @@ def changepdflinks(data_list_JSON, pdflink):
75
 
76
  return data_list_JSON
77
 
78
-
79
  def get_regular_font_size_and_color(doc):
80
  font_sizes = []
81
  colors = []
@@ -239,7 +238,7 @@ def extract_headers(doc, toc_pages, most_common_font_size, most_common_color, mo
239
  font_size_counts = Counter(font_sizes)
240
 
241
  # Filter font sizes that appear at least 3 times
242
- valid_font_sizes = [size for size, count in font_size_counts.items() if count >= 3]
243
 
244
  # Sort in descending order
245
  valid_font_sizes_sorted = sorted(valid_font_sizes, reverse=True)
@@ -649,8 +648,6 @@ def same_start_word(s1, s2):
649
  return False
650
 
651
 
652
-
653
-
654
  def extract_section_under_header(multiplePDF_Paths):
655
  filenames=[]
656
  keywords = {'installation', 'execution', 'miscellaneous items', 'workmanship', 'testing', 'labeling'}
@@ -1096,7 +1093,7 @@ def extract_section_under_header(multiplePDF_Paths):
1096
  jsonCombined.extend(json_output1)
1097
  combined_json_str = json.dumps(jsonCombined, indent=1)
1098
  return pdf_bytes.getvalue(), docHighlights , combined_json_str
1099
-
1100
  ########################################################################################################################################################
1101
  ########################################################################################################################################################
1102
 
@@ -1105,6 +1102,10 @@ def extract_section_under_header(multiplePDF_Paths):
1105
  def extract_section_under_header_tobebilledOnly(pdf_path):
1106
  Alltexttobebilled=''
1107
  alltextWithoutNotbilled=''
 
 
 
 
1108
  top_margin = 70
1109
  bottom_margin = 50
1110
  headertoContinue1 = False
@@ -1125,7 +1126,12 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1125
  raise ValueError("No valid PDF content found.")
1126
 
1127
  doc = fitz.open(stream=pdf_content, filetype="pdf")
1128
- docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
 
 
 
 
 
1129
  most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
1130
 
1131
  # Precompute regex patterns
@@ -1145,7 +1151,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1145
  if dot_pattern.search(line_text):
1146
  dot_line_count += 1
1147
 
1148
- if dot_line_count >= 3:
1149
  toc_pages.append(page_num)
1150
 
1151
  return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
@@ -1163,7 +1169,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1163
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
1164
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
1165
 
1166
- df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"])
1167
  dictionaryNBS={}
1168
  data_list_JSON = []
1169
 
@@ -1182,7 +1188,9 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1182
  for heading_to_searchDict, paths in listofHeaderstoMarkup:
1183
  heading_to_search = heading_to_searchDict['text']
1184
  heading_to_searchPageNum = heading_to_searchDict['page']
1185
-
 
 
1186
  # Initialize variables
1187
  headertoContinue1 = False
1188
  headertoContinue2 = False
@@ -1199,7 +1207,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1199
  break_collecting = False
1200
  heading_norm = normalize_text(heading_to_search)
1201
  paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
1202
-
1203
  for page_num in range(heading_to_searchPageNum,len(doc)):
1204
  if page_num in toc_pages:
1205
  continue
@@ -1240,7 +1248,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1240
  combined_line_norm = normalize_text(line_text + " " + next_line_text)
1241
  else:
1242
  combined_line_norm = line_text_norm
1243
-
1244
  # Check if we should continue processing
1245
  if combined_line_norm and combined_line_norm in paths[0]:
1246
 
@@ -1249,6 +1257,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1249
 
1250
  headertoContinue2 = combined_line_norm
1251
  if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
 
1252
  stringtowrite='Not to be billed'
1253
  else:
1254
  stringtowrite='To be billed'
@@ -1289,7 +1298,8 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1289
  collecting = True
1290
  matched_header_font_size = max(span["size"] for span in header_spans)
1291
  Alltexttobebilled+= ' '+ combined_line_norm
1292
- collected_lines.append(line_text)
 
1293
  valid_spans = [span for span in spans if span.get("bbox")]
1294
 
1295
  if valid_spans:
@@ -1352,7 +1362,9 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1352
  "Code": stringtowrite,
1353
  "head above 1": paths[-2],
1354
  "head above 2": paths[0],
 
1355
  "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
 
1356
  }
1357
  data_list_JSON.append(data_entry)
1358
 
@@ -1383,6 +1395,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1383
  collecting = True
1384
  matched_header_font_size = max(span["size"] for span in header_spans)
1385
  Alltexttobebilled+= ' '+ combined_line_norm
 
1386
  collected_lines.append(line_text)
1387
  valid_spans = [span for span in spans if span.get("bbox")]
1388
 
@@ -1446,6 +1459,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1446
  "Code": stringtowrite,
1447
  "head above 1": paths[-2],
1448
  "head above 2": paths[0],
 
1449
  "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
1450
  }
1451
  data_list_JSON.append(data_entry)
@@ -1458,7 +1472,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1458
  continue
1459
  if collecting:
1460
  norm_line = normalize_text(line_text)
1461
-
1462
  # Optimized URL check
1463
  if url_pattern.match(norm_line):
1464
  line_is_header = False
@@ -1492,7 +1506,9 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1492
  if break_collecting:
1493
  break
1494
 
 
1495
  collected_lines.append(line_text)
 
1496
  valid_spans = [span for span in spans if span.get("bbox")]
1497
  if valid_spans:
1498
  x0s = [span["bbox"][0] for span in valid_spans]
@@ -1528,12 +1544,20 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1528
 
1529
  # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)
1530
 
 
 
 
1531
  pdf_bytes = BytesIO()
1532
  docHighlights.save(pdf_bytes)
1533
- return pdf_bytes.getvalue(), docHighlights , json_output , Alltexttobebilled , alltextWithoutNotbilled
 
 
1534
 
1535
 
1536
  def extract_section_under_header_tobebilled2(pdf_path):
 
 
 
1537
  top_margin = 70
1538
  bottom_margin = 50
1539
  headertoContinue1 = False
@@ -1574,7 +1598,7 @@ def extract_section_under_header_tobebilled2(pdf_path):
1574
  if dot_pattern.search(line_text):
1575
  dot_line_count += 1
1576
 
1577
- if dot_line_count >= 3:
1578
  toc_pages.append(page_num)
1579
 
1580
  return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
@@ -1592,7 +1616,7 @@ def extract_section_under_header_tobebilled2(pdf_path):
1592
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
1593
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
1594
 
1595
- df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"])
1596
  dictionaryNBS={}
1597
  data_list_JSON = []
1598
  currentgroupname=''
@@ -1683,7 +1707,11 @@ def extract_section_under_header_tobebilled2(pdf_path):
1683
  if combined_line_norm and combined_line_norm in paths[-2]:
1684
 
1685
  headertoContinue2 = combined_line_norm
1686
- if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
 
 
 
 
1687
  stringtowrite='Not to be billed'
1688
  else:
1689
  stringtowrite='To be billed'
@@ -1723,13 +1751,13 @@ def extract_section_under_header_tobebilled2(pdf_path):
1723
  # and span['size'] >= subsubheaderFontSize
1724
  and span['size'] < mainHeaderFontSize)
1725
  ]
1726
- if header_spans:
1727
  collecting = True
1728
  # if stringtowrite=='To be billed':
1729
  # Alltexttobebilled+='\n'
1730
  matched_header_font_size = max(span["size"] for span in header_spans)
1731
 
1732
- collected_lines.append(line_text)
1733
  valid_spans = [span for span in spans if span.get("bbox")]
1734
 
1735
  if valid_spans:
@@ -1792,6 +1820,7 @@ def extract_section_under_header_tobebilled2(pdf_path):
1792
  "Code": stringtowrite,
1793
  "head above 1": paths[-2],
1794
  "head above 2": paths[0],
 
1795
  "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
1796
  }
1797
  data_list_JSON.append(data_entry)
@@ -1819,7 +1848,7 @@ def extract_section_under_header_tobebilled2(pdf_path):
1819
  and span['size'] < mainHeaderFontSize)
1820
  ]
1821
 
1822
- if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
1823
  collecting = True
1824
  if stringtowrite=='To be billed':
1825
  Alltexttobebilled+='\n'
@@ -1890,6 +1919,7 @@ def extract_section_under_header_tobebilled2(pdf_path):
1890
  "Code": stringtowrite,
1891
  "head above 1": paths[-2],
1892
  "head above 2": paths[0],
 
1893
  "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
1894
  }
1895
  data_list_JSON.append(data_entry)
@@ -1972,15 +2002,22 @@ def extract_section_under_header_tobebilled2(pdf_path):
1972
 
1973
  # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)
1974
 
 
 
 
1975
  pdf_bytes = BytesIO()
1976
  docHighlights.save(pdf_bytes)
 
 
 
 
1977
 
1978
- return pdf_bytes.getvalue(), docHighlights , json_output, Alltexttobebilled
1979
 
1980
 
1981
 
1982
 
1983
- def extract_section_under_header_tobebilled2marthe(multiplePDF_Paths):
 
1984
  # keywordstoSkip=["installation", "execution", "miscellaneous items", "workmanship", "testing", "labeling"]
1985
  filenames=[]
1986
  keywords = {'installation', 'execution', 'miscellaneous items', 'workmanship', 'testing', 'labeling'}
@@ -1989,6 +2026,8 @@ def extract_section_under_header_tobebilled2marthe(multiplePDF_Paths):
1989
  arrayofPDFS=multiplePDF_Paths.split(',')
1990
  print(multiplePDF_Paths)
1991
  print(arrayofPDFS)
 
 
1992
  df = pd.DataFrame(columns=["PDF Name","NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2","BodyText"])
1993
  for pdf_path in arrayofPDFS:
1994
  headertoContinue1 = False
@@ -2029,7 +2068,7 @@ def extract_section_under_header_tobebilled2marthe(multiplePDF_Paths):
2029
  if dot_pattern.search(line_text):
2030
  dot_line_count += 1
2031
 
2032
- if dot_line_count >= 3:
2033
  toc_pages.append(page_num)
2034
 
2035
  return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
@@ -2050,6 +2089,7 @@ def extract_section_under_header_tobebilled2marthe(multiplePDF_Paths):
2050
  # df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2","BodyText"])
2051
  dictionaryNBS={}
2052
  data_list_JSON = []
 
2053
  currentgroupname=''
2054
  if len(top_3_font_sizes)==3:
2055
  mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes
@@ -2258,7 +2298,8 @@ def extract_section_under_header_tobebilled2marthe(multiplePDF_Paths):
2258
  data_list_JSON.append(data_entry)
2259
 
2260
  # Convert list to JSON
2261
- json_output = json.dumps(data_list_JSON, indent=4)
 
2262
 
2263
  i += 2
2264
  continue
@@ -2358,7 +2399,8 @@ def extract_section_under_header_tobebilled2marthe(multiplePDF_Paths):
2358
  data_list_JSON.append(data_entry)
2359
 
2360
  # Convert list to JSON
2361
- json_output = json.dumps(data_list_JSON, indent=4)
 
2362
 
2363
 
2364
  i += 2
@@ -2432,489 +2474,24 @@ def extract_section_under_header_tobebilled2marthe(multiplePDF_Paths):
2432
  else:
2433
  stringtowrite='To be billed'
2434
  highlight_boxes(docHighlights, page_highlights,stringtowrite)
2435
-
2436
- # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)
2437
-
2438
- pdf_bytes = BytesIO()
2439
- docHighlights.save(pdf_bytes)
2440
- print(filenames)
2441
- return pdf_bytes.getvalue(), docHighlights , json_output, Alltexttobebilled , filenames
2442
-
2443
-
2444
-
2445
- def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths):
2446
- # keywordstoSkip=["installation", "execution", "miscellaneous items", "workmanship", "testing", "labeling"]
2447
- filenames=[]
2448
- keywords = {'installation', 'execution', 'miscellaneous items', 'workmanship', 'testing', 'labeling'}
2449
- top_margin = 70
2450
- bottom_margin = 50
2451
- arrayofPDFS=multiplePDF_Paths.split(',')
2452
- print(multiplePDF_Paths)
2453
- print(arrayofPDFS)
2454
- docarray=[]
2455
- jsons=[]
2456
- df = pd.DataFrame(columns=["PDF Name","NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2","BodyText"])
2457
- for pdf_path in arrayofPDFS:
2458
- headertoContinue1 = False
2459
- headertoContinue2=False
2460
- Alltexttobebilled=''
2461
- parsed_url = urlparse(pdf_path)
2462
- filename = os.path.basename(parsed_url.path)
2463
- filename = unquote(filename) # decode URL-encoded characters
2464
- print(filename)
2465
- filenames.append(filename)
2466
- # Optimized URL handling
2467
- if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
2468
- pdf_path = pdf_path.replace('dl=0', 'dl=1')
2469
-
2470
- # Cache frequently used values
2471
- response = requests.get(pdf_path)
2472
- pdf_content = BytesIO(response.content)
2473
- if not pdf_content:
2474
- raise ValueError("No valid PDF content found.")
2475
-
2476
- doc = fitz.open(stream=pdf_content, filetype="pdf")
2477
- docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
2478
- most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
2479
-
2480
- # Precompute regex patterns
2481
- dot_pattern = re.compile(r'\.{3,}')
2482
- url_pattern = re.compile(r'https?://\S+|www\.\S+')
2483
-
2484
- def get_toc_page_numbers(doc, max_pages_to_check=15):
2485
- toc_pages = []
2486
- for page_num in range(min(len(doc), max_pages_to_check)):
2487
- page = doc.load_page(page_num)
2488
- blocks = page.get_text("dict")["blocks"]
2489
-
2490
- dot_line_count = 0
2491
- for block in blocks:
2492
- for line in block.get("lines", []):
2493
- line_text = get_spaced_text_from_spans(line["spans"]).strip()
2494
- if dot_pattern.search(line_text):
2495
- dot_line_count += 1
2496
-
2497
- if dot_line_count >= 3:
2498
- toc_pages.append(page_num)
2499
-
2500
- return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
2501
-
2502
- toc_pages = get_toc_page_numbers(doc)
2503
-
2504
- headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
2505
- doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
2506
- )
2507
-
2508
- hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
2509
- listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
2510
-
2511
- # Precompute all children headers once
2512
- allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
2513
- allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
2514
-
2515
- # df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2","BodyText"])
2516
- dictionaryNBS={}
2517
- data_list_JSON = []
2518
- currentgroupname=''
2519
- if len(top_3_font_sizes)==3:
2520
- mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes
2521
- elif len(top_3_font_sizes)==2:
2522
- mainHeaderFontSize= top_3_font_sizes[0]
2523
- subHeaderFontSize= top_3_font_sizes[1]
2524
- subsubheaderFontSize= top_3_font_sizes[1]
2525
-
2526
 
2527
-
2528
- # Preload all pages to avoid repeated loading
2529
- # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
2530
-
2531
- for heading_to_searchDict, paths in listofHeaderstoMarkup:
2532
- heading_to_search = heading_to_searchDict['text']
2533
- heading_to_searchPageNum = heading_to_searchDict['page']
2534
-
2535
- # Initialize variables
2536
- headertoContinue1 = False
2537
- headertoContinue2 = False
2538
- matched_header_line = None
2539
- done = False
2540
- collecting = False
2541
- collected_lines = []
2542
- page_highlights = {}
2543
- current_bbox = {}
2544
- last_y1s = {}
2545
- mainHeader = ''
2546
- subHeader = ''
2547
- matched_header_line_norm = heading_to_search
2548
- break_collecting = False
2549
- heading_norm = normalize_text(heading_to_search)
2550
- paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
2551
- for page_num in range(heading_to_searchPageNum,len(doc)):
2552
- # print(heading_to_search)
2553
- if paths[0].strip().lower() != currentgroupname.strip().lower():
2554
- Alltexttobebilled+= paths[0] +'\n'
2555
- currentgroupname=paths[0]
2556
- # print(paths[0])
2557
-
2558
-
2559
- if page_num in toc_pages:
2560
- continue
2561
- if break_collecting:
2562
- break
2563
- page=doc[page_num]
2564
- page_height = page.rect.height
2565
- blocks = page.get_text("dict")["blocks"]
2566
-
2567
- for block in blocks:
2568
- if break_collecting:
2569
- break
2570
-
2571
- lines = block.get("lines", [])
2572
- i = 0
2573
- while i < len(lines):
2574
- if break_collecting:
2575
- break
2576
-
2577
- spans = lines[i].get("spans", [])
2578
- if not spans:
2579
- i += 1
2580
- continue
2581
-
2582
- y0 = spans[0]["bbox"][1]
2583
- y1 = spans[0]["bbox"][3]
2584
- if y0 < top_margin or y1 > (page_height - bottom_margin):
2585
- i += 1
2586
- continue
2587
-
2588
- line_text = get_spaced_text_from_spans(spans).lower()
2589
- line_text_norm = normalize_text(line_text)
2590
-
2591
- # Combine with next line if available
2592
- if i + 1 < len(lines):
2593
- next_spans = lines[i + 1].get("spans", [])
2594
- next_line_text = get_spaced_text_from_spans(next_spans).lower()
2595
- combined_line_norm = normalize_text(line_text + " " + next_line_text)
2596
- else:
2597
- combined_line_norm = line_text_norm
2598
-
2599
- # Check if we should continue processing
2600
- if combined_line_norm and combined_line_norm in paths[0]:
2601
-
2602
- headertoContinue1 = combined_line_norm
2603
- if combined_line_norm and combined_line_norm in paths[-2]:
2604
-
2605
- headertoContinue2 = combined_line_norm
2606
- # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2607
- last_path = paths[-2].lower()
2608
- # if any(word in paths[-2].lower() for word in keywordstoSkip):
2609
- # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() or 'workmanship' in paths[-2].lower() or 'testing' in paths[-2].lower() or 'labeling' in paths[-2].lower():
2610
- if any(keyword in last_path for keyword in keywords):
2611
- stringtowrite='Not to be billed'
2612
- else:
2613
- stringtowrite='To be billed'
2614
- if stringtowrite=='To be billed':
2615
- # Alltexttobebilled+= combined_line_norm #################################################
2616
- if matched_header_line_norm in combined_line_norm:
2617
- Alltexttobebilled+='\n'
2618
- Alltexttobebilled+= ' '+combined_line_norm
2619
- # Optimized header matching
2620
- existsfull = (
2621
- ( combined_line_norm in allchildrenheaders_set or
2622
- combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm
2623
- )
2624
-
2625
- # New word-based matching
2626
- current_line_words = set(combined_line_norm.split())
2627
- heading_words = set(heading_norm.split())
2628
- all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0
2629
-
2630
- substring_match = (
2631
- heading_norm in combined_line_norm or
2632
- combined_line_norm in heading_norm or
2633
- all_words_match # Include the new word-based matching
2634
- )
2635
- # substring_match = (
2636
- # heading_norm in combined_line_norm or
2637
- # combined_line_norm in heading_norm
2638
- # )
2639
-
2640
- if (substring_match and existsfull and not collecting and
2641
- len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ):
2642
-
2643
- # Check header conditions more efficiently
2644
- header_spans = [
2645
- span for span in spans
2646
- if (is_header(span, most_common_font_size, most_common_color, most_common_font)
2647
- # and span['size'] >= subsubheaderFontSize
2648
- and span['size'] < mainHeaderFontSize)
2649
- ]
2650
- if header_spans and stringtowrite.startswith('To'):
2651
- collecting = True
2652
- # if stringtowrite=='To be billed':
2653
- # Alltexttobebilled+='\n'
2654
- matched_header_font_size = max(span["size"] for span in header_spans)
2655
-
2656
- # collected_lines.append(line_text)
2657
- valid_spans = [span for span in spans if span.get("bbox")]
2658
-
2659
- if valid_spans:
2660
- x0s = [span["bbox"][0] for span in valid_spans]
2661
- x1s = [span["bbox"][2] for span in valid_spans]
2662
- y0s = [span["bbox"][1] for span in valid_spans]
2663
- y1s = [span["bbox"][3] for span in valid_spans]
2664
-
2665
- header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2666
-
2667
- if page_num in current_bbox:
2668
- cb = current_bbox[page_num]
2669
- current_bbox[page_num] = [
2670
- min(cb[0], header_bbox[0]),
2671
- min(cb[1], header_bbox[1]),
2672
- max(cb[2], header_bbox[2]),
2673
- max(cb[3], header_bbox[3])
2674
- ]
2675
- else:
2676
- current_bbox[page_num] = header_bbox
2677
- last_y1s[page_num] = header_bbox[3]
2678
- x0, y0, x1, y1 = header_bbox
2679
-
2680
- zoom = 200
2681
- left = int(x0)
2682
- top = int(y0)
2683
- zoom_str = f"{zoom},{left},{top}"
2684
- pageNumberFound = page_num + 1
2685
-
2686
- # Build the query parameters
2687
- params = {
2688
- 'pdfLink': pdf_path, # Your PDF link
2689
- 'keyword': heading_to_search, # Your keyword (could be a string or list)
2690
- }
2691
-
2692
- # URL encode each parameter
2693
- encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2694
-
2695
- # Construct the final encoded link
2696
- encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2697
-
2698
- # Correctly construct the final URL with page and zoom
2699
- final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2700
-
2701
- # Get current date and time
2702
- now = datetime.now()
2703
-
2704
- # Format the output
2705
- formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2706
- # Optionally, add the URL to a DataFrame
2707
-
2708
-
2709
- data_entry = {
2710
- "PDF Name":filename,
2711
- "NBSLink": final_url,
2712
- "Subject": heading_to_search,
2713
- "Page": str(pageNumberFound),
2714
- "Author": "ADR",
2715
- "Creation Date": formatted_time,
2716
- "Layer": "Initial",
2717
- "Code": stringtowrite,
2718
- "head above 1": paths[-2],
2719
- "head above 2": paths[0],
2720
- "BodyText":collected_lines,
2721
- "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2722
- }
2723
- data_list_JSON.append(data_entry)
2724
-
2725
- # Convert list to JSON
2726
- json_output = json.dumps(data_list_JSON, indent=4)
2727
-
2728
- i += 2
2729
- continue
2730
- else:
2731
- if (substring_match and not collecting and
2732
- len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ):
2733
-
2734
- # Calculate word match percentage
2735
- word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100
2736
-
2737
- # Check if at least 70% of header words exist in this line
2738
- meets_word_threshold = word_match_percent >= 100
2739
-
2740
- # Check header conditions (including word threshold)
2741
- header_spans = [
2742
- span for span in spans
2743
- if (is_header(span, most_common_font_size, most_common_color, most_common_font)
2744
- # and span['size'] >= subsubheaderFontSize
2745
- and span['size'] < mainHeaderFontSize)
2746
- ]
2747
-
2748
- if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'):
2749
- collecting = True
2750
- if stringtowrite=='To be billed':
2751
- Alltexttobebilled+='\n'
2752
- # if stringtowrite=='To be billed':
2753
- # Alltexttobebilled+= ' '+ combined_line_norm
2754
- matched_header_font_size = max(span["size"] for span in header_spans)
2755
-
2756
- collected_lines.append(line_text)
2757
- valid_spans = [span for span in spans if span.get("bbox")]
2758
-
2759
- if valid_spans:
2760
- x0s = [span["bbox"][0] for span in valid_spans]
2761
- x1s = [span["bbox"][2] for span in valid_spans]
2762
- y0s = [span["bbox"][1] for span in valid_spans]
2763
- y1s = [span["bbox"][3] for span in valid_spans]
2764
-
2765
- header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2766
-
2767
- if page_num in current_bbox:
2768
- cb = current_bbox[page_num]
2769
- current_bbox[page_num] = [
2770
- min(cb[0], header_bbox[0]),
2771
- min(cb[1], header_bbox[1]),
2772
- max(cb[2], header_bbox[2]),
2773
- max(cb[3], header_bbox[3])
2774
- ]
2775
- else:
2776
- current_bbox[page_num] = header_bbox
2777
-
2778
- last_y1s[page_num] = header_bbox[3]
2779
- x0, y0, x1, y1 = header_bbox
2780
- zoom = 200
2781
- left = int(x0)
2782
- top = int(y0)
2783
- zoom_str = f"{zoom},{left},{top}"
2784
- pageNumberFound = page_num + 1
2785
-
2786
- # Build the query parameters
2787
- params = {
2788
- 'pdfLink': pdf_path, # Your PDF link
2789
- 'keyword': heading_to_search, # Your keyword (could be a string or list)
2790
- }
2791
-
2792
- # URL encode each parameter
2793
- encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2794
-
2795
- # Construct the final encoded link
2796
- encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2797
-
2798
- # Correctly construct the final URL with page and zoom
2799
- final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2800
-
2801
- # Get current date and time
2802
- now = datetime.now()
2803
-
2804
- # Format the output
2805
- formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2806
- # Optionally, add the URL to a DataFrame
2807
-
2808
-
2809
- data_entry = {
2810
- "PDF Name":filename,
2811
- "NBSLink": final_url,
2812
- "Subject": heading_to_search,
2813
- "Page": str(pageNumberFound),
2814
- "Author": "ADR",
2815
- "Creation Date": formatted_time,
2816
- "Layer": "Initial",
2817
- "Code": stringtowrite,
2818
- "head above 1": paths[-2],
2819
- "head above 2": paths[0],
2820
- "BodyText":collected_lines,
2821
- "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2822
- }
2823
- data_list_JSON.append(data_entry)
2824
-
2825
- # Convert list to JSON
2826
- json_output = json.dumps(data_list_JSON, indent=4)
2827
-
2828
-
2829
- i += 2
2830
- continue
2831
- if collecting:
2832
- norm_line = normalize_text(line_text)
2833
-
2834
- # Optimized URL check
2835
- if url_pattern.match(norm_line):
2836
- line_is_header = False
2837
- else:
2838
- line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans)
2839
-
2840
- if line_is_header:
2841
- header_font_size = max(span["size"] for span in spans)
2842
- is_probably_real_header = (
2843
- header_font_size >= matched_header_font_size and
2844
- is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
2845
- len(line_text.strip()) > 2
2846
- )
2847
-
2848
- if (norm_line != matched_header_line_norm and
2849
- norm_line != heading_norm and
2850
- is_probably_real_header):
2851
- if line_text not in heading_norm:
2852
- collecting = False
2853
- done = True
2854
- headertoContinue1 = False
2855
- headertoContinue2=False
2856
- for page_num, bbox in current_bbox.items():
2857
- bbox[3] = last_y1s.get(page_num, bbox[3])
2858
- page_highlights[page_num] = bbox
2859
- highlight_boxes(docHighlights, page_highlights,stringtowrite)
2860
-
2861
- break_collecting = True
2862
- break
2863
-
2864
- if break_collecting:
2865
- break
2866
-
2867
- collected_lines.append(line_text)
2868
- valid_spans = [span for span in spans if span.get("bbox")]
2869
- if valid_spans:
2870
- x0s = [span["bbox"][0] for span in valid_spans]
2871
- x1s = [span["bbox"][2] for span in valid_spans]
2872
- y0s = [span["bbox"][1] for span in valid_spans]
2873
- y1s = [span["bbox"][3] for span in valid_spans]
2874
-
2875
- line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2876
-
2877
- if page_num in current_bbox:
2878
- cb = current_bbox[page_num]
2879
- current_bbox[page_num] = [
2880
- min(cb[0], line_bbox[0]),
2881
- min(cb[1], line_bbox[1]),
2882
- max(cb[2], line_bbox[2]),
2883
- max(cb[3], line_bbox[3])
2884
- ]
2885
- else:
2886
- current_bbox[page_num] = line_bbox
2887
-
2888
- last_y1s[page_num] = line_bbox[3]
2889
- i += 1
2890
-
2891
- if not done:
2892
- for page_num, bbox in current_bbox.items():
2893
- bbox[3] = last_y1s.get(page_num, bbox[3])
2894
- page_highlights[page_num] = bbox
2895
- if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2896
- stringtowrite='Not to be billed'
2897
- else:
2898
- stringtowrite='To be billed'
2899
- highlight_boxes(docHighlights, page_highlights,stringtowrite)
2900
- docarray.append(docHighlights)
2901
- jsons.append(json_output)
2902
- # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)
2903
-
2904
- dbxTeam = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
2905
- dbPath = '/TSA JOBS/ADR Test/FIND/'
2906
- jsonCombined=[]
2907
- for i in range(len(arrayofPDFS)):
2908
- singlepdf=arrayofPDFS[i]
2909
-
2910
- metadata = dbxTeam.sharing_get_shared_link_metadata(singlepdf)
2911
  pdf_bytes = BytesIO()
2912
  docHighlights.save(pdf_bytes)
2913
  pdflink = tsadropboxretrieval.uploadanyFile(doc=docarray[i], path=dbPath, pdfname=filenames[i])
 
 
 
2914
  json_output1=changepdflinks(jsons[i],pdflink)
2915
  jsonCombined.extend(json_output1)
2916
  combined_json_str = json.dumps(jsonCombined, indent=1)
2917
  print(combined_json_str)
2918
  return pdf_bytes.getvalue(), docHighlights , combined_json_str, Alltexttobebilled , filenames
2919
-
2920
-
 
6
  Original file is located at
7
  https://colab.research.google.com/drive/12XfVkmKmN3oVjHhLVE0_GgkftgArFEK2
8
  """
9
+ baselink='https://adr.trevorsadd.co.uk/api/view-pdf?'
10
+
11
+ newlink='https://adr.trevorsadd.co.uk/api/view-highlight?'
12
+ tobebilledonlyLink='https://adr.trevorsadd.co.uk/api/view-pdf-tobebilled?'
13
+
14
+
15
+
16
 
 
17
  from urllib.parse import urlparse, unquote
18
  import os
19
  from io import BytesIO
 
31
  from collections import defaultdict, Counter
32
  import difflib
33
  from fuzzywuzzy import fuzz
34
+ import copy
35
+ import tsadropboxretrieval
36
 
37
+
 
 
 
38
 
 
39
  def changepdflinks(data_list_JSON, pdflink):
40
  print('henaaaa weee',data_list_JSON)
41
 
 
75
 
76
  return data_list_JSON
77
 
 
78
  def get_regular_font_size_and_color(doc):
79
  font_sizes = []
80
  colors = []
 
238
  font_size_counts = Counter(font_sizes)
239
 
240
  # Keep every font size that appears at least once (threshold lowered from 3 to 1)
241
+ valid_font_sizes = [size for size, count in font_size_counts.items() if count >= 1]
242
 
243
  # Sort in descending order
244
  valid_font_sizes_sorted = sorted(valid_font_sizes, reverse=True)
 
648
  return False
649
 
650
 
 
 
651
  def extract_section_under_header(multiplePDF_Paths):
652
  filenames=[]
653
  keywords = {'installation', 'execution', 'miscellaneous items', 'workmanship', 'testing', 'labeling'}
 
1093
  jsonCombined.extend(json_output1)
1094
  combined_json_str = json.dumps(jsonCombined, indent=1)
1095
  return pdf_bytes.getvalue(), docHighlights , combined_json_str
1096
+
1097
  ########################################################################################################################################################
1098
  ########################################################################################################################################################
1099
 
 
1102
  def extract_section_under_header_tobebilledOnly(pdf_path):
1103
  Alltexttobebilled=''
1104
  alltextWithoutNotbilled=''
1105
+ # keywordstoSkip=["installation", "execution", "miscellaneous items", "workmanship", "testing", "labeling"]
1106
+
1107
+
1108
+
1109
  top_margin = 70
1110
  bottom_margin = 50
1111
  headertoContinue1 = False
 
1126
  raise ValueError("No valid PDF content found.")
1127
 
1128
  doc = fitz.open(stream=pdf_content, filetype="pdf")
1129
+ docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
1130
+ parsed_url = urlparse(pdf_path)
1131
+ filename = os.path.basename(parsed_url.path)
1132
+ filename = unquote(filename) # decode URL-encoded characters
1133
+
1134
+
1135
  most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
1136
 
1137
  # Precompute regex patterns
 
1151
  if dot_pattern.search(line_text):
1152
  dot_line_count += 1
1153
 
1154
+ if dot_line_count >= 1:
1155
  toc_pages.append(page_num)
1156
 
1157
  return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
 
1169
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
1170
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
1171
 
1172
+ df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2",'BodyText'])
1173
  dictionaryNBS={}
1174
  data_list_JSON = []
1175
 
 
1188
  for heading_to_searchDict, paths in listofHeaderstoMarkup:
1189
  heading_to_search = heading_to_searchDict['text']
1190
  heading_to_searchPageNum = heading_to_searchDict['page']
1191
+
1192
+
1193
+
1194
  # Initialize variables
1195
  headertoContinue1 = False
1196
  headertoContinue2 = False
 
1207
  break_collecting = False
1208
  heading_norm = normalize_text(heading_to_search)
1209
  paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
1210
+
1211
  for page_num in range(heading_to_searchPageNum,len(doc)):
1212
  if page_num in toc_pages:
1213
  continue
 
1248
  combined_line_norm = normalize_text(line_text + " " + next_line_text)
1249
  else:
1250
  combined_line_norm = line_text_norm
1251
+
1252
  # Check if we should continue processing
1253
  if combined_line_norm and combined_line_norm in paths[0]:
1254
 
 
1257
 
1258
  headertoContinue2 = combined_line_norm
1259
  if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
1260
+ # if any(word in paths[-2].lower() for word in keywordstoSkip):
1261
  stringtowrite='Not to be billed'
1262
  else:
1263
  stringtowrite='To be billed'
 
1298
  collecting = True
1299
  matched_header_font_size = max(span["size"] for span in header_spans)
1300
  Alltexttobebilled+= ' '+ combined_line_norm
1301
+
1302
+ # collected_lines.append(line_text)
1303
  valid_spans = [span for span in spans if span.get("bbox")]
1304
 
1305
  if valid_spans:
 
1362
  "Code": stringtowrite,
1363
  "head above 1": paths[-2],
1364
  "head above 2": paths[0],
1365
+ "BodyText": collected_lines,
1366
  "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
1367
+
1368
  }
1369
  data_list_JSON.append(data_entry)
1370
 
 
1395
  collecting = True
1396
  matched_header_font_size = max(span["size"] for span in header_spans)
1397
  Alltexttobebilled+= ' '+ combined_line_norm
1398
+
1399
  collected_lines.append(line_text)
1400
  valid_spans = [span for span in spans if span.get("bbox")]
1401
 
 
1459
  "Code": stringtowrite,
1460
  "head above 1": paths[-2],
1461
  "head above 2": paths[0],
1462
+ "BodyText": collected_lines,
1463
  "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
1464
  }
1465
  data_list_JSON.append(data_entry)
 
1472
  continue
1473
  if collecting:
1474
  norm_line = normalize_text(line_text)
1475
+
1476
  # Optimized URL check
1477
  if url_pattern.match(norm_line):
1478
  line_is_header = False
 
1506
  if break_collecting:
1507
  break
1508
 
1509
+
1510
  collected_lines.append(line_text)
1511
+
1512
  valid_spans = [span for span in spans if span.get("bbox")]
1513
  if valid_spans:
1514
  x0s = [span["bbox"][0] for span in valid_spans]
 
1544
 
1545
  # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)
1546
 
1547
+ dbxTeam = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
1548
+ metadata = dbxTeam.sharing_get_shared_link_metadata(pdf_path)
1549
+ dbPath = '/TSA JOBS/ADR Test/FIND/'
1550
  pdf_bytes = BytesIO()
1551
  docHighlights.save(pdf_bytes)
1552
+ pdflink = tsadropboxretrieval.uploadanyFile(doc=docHighlights, path=dbPath, pdfname=filename)
1553
+ json_output=changepdflinks(json_output,pdflink)
1554
+ return pdf_bytes.getvalue(), docHighlights , json_output , Alltexttobebilled , alltextWithoutNotbilled , filename
1555
 
1556
 
1557
  def extract_section_under_header_tobebilled2(pdf_path):
1558
+ # keywordstoSkip=["installation", "execution", "miscellaneous items", "workmanship", "testing", "labeling"]
1559
+
1560
+ keywords = {'installation', 'execution', 'miscellaneous items', 'workmanship', 'testing', 'labeling'}
1561
  top_margin = 70
1562
  bottom_margin = 50
1563
  headertoContinue1 = False
 
1598
  if dot_pattern.search(line_text):
1599
  dot_line_count += 1
1600
 
1601
+ if dot_line_count >= 1:
1602
  toc_pages.append(page_num)
1603
 
1604
  return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
 
1616
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
1617
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
1618
 
1619
+ df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2","BodyText"])
1620
  dictionaryNBS={}
1621
  data_list_JSON = []
1622
  currentgroupname=''
 
1707
  if combined_line_norm and combined_line_norm in paths[-2]:
1708
 
1709
  headertoContinue2 = combined_line_norm
1710
+ # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
1711
+ last_path = paths[-2].lower()
1712
+ # if any(word in paths[-2].lower() for word in keywordstoSkip):
1713
+ # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() or 'workmanship' in paths[-2].lower() or 'testing' in paths[-2].lower() or 'labeling' in paths[-2].lower():
1714
+ if any(keyword in last_path for keyword in keywords):
1715
  stringtowrite='Not to be billed'
1716
  else:
1717
  stringtowrite='To be billed'
 
1751
  # and span['size'] >= subsubheaderFontSize
1752
  and span['size'] < mainHeaderFontSize)
1753
  ]
1754
+ if header_spans and stringtowrite.startswith('To'):
1755
  collecting = True
1756
  # if stringtowrite=='To be billed':
1757
  # Alltexttobebilled+='\n'
1758
  matched_header_font_size = max(span["size"] for span in header_spans)
1759
 
1760
+ # collected_lines.append(line_text)
1761
  valid_spans = [span for span in spans if span.get("bbox")]
1762
 
1763
  if valid_spans:
 
1820
  "Code": stringtowrite,
1821
  "head above 1": paths[-2],
1822
  "head above 2": paths[0],
1823
+ "BodyText":collected_lines,
1824
  "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
1825
  }
1826
  data_list_JSON.append(data_entry)
 
1848
  and span['size'] < mainHeaderFontSize)
1849
  ]
1850
 
1851
+ if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'):
1852
  collecting = True
1853
  if stringtowrite=='To be billed':
1854
  Alltexttobebilled+='\n'
 
1919
  "Code": stringtowrite,
1920
  "head above 1": paths[-2],
1921
  "head above 2": paths[0],
1922
+ "BodyText":collected_lines,
1923
  "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
1924
  }
1925
  data_list_JSON.append(data_entry)
 
2002
 
2003
  # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)
2004
 
2005
+ dbxTeam = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
2006
+ metadata = dbxTeam.sharing_get_shared_link_metadata(pdf_path)
2007
+ dbPath = '/TSA JOBS/ADR Test/FIND/'
2008
  pdf_bytes = BytesIO()
2009
  docHighlights.save(pdf_bytes)
2010
+ pdflink = tsadropboxretrieval.uploadanyFile(doc=docHighlights, path=dbPath, pdfname=filename)
2011
+ json_output=changepdflinks(json_output,pdflink)
2012
+ return pdf_bytes.getvalue(), docHighlights , json_output, Alltexttobebilled , filename
2013
+
2014
 
 
2015
 
2016
 
2017
 
2018
 
2019
+
2020
+ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2021
  # keywordstoSkip=["installation", "execution", "miscellaneous items", "workmanship", "testing", "labeling"]
2022
  filenames=[]
2023
  keywords = {'installation', 'execution', 'miscellaneous items', 'workmanship', 'testing', 'labeling'}
 
2026
  arrayofPDFS=multiplePDF_Paths.split(',')
2027
  print(multiplePDF_Paths)
2028
  print(arrayofPDFS)
2029
+ docarray=[]
2030
+ jsons=[]
2031
  df = pd.DataFrame(columns=["PDF Name","NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2","BodyText"])
2032
  for pdf_path in arrayofPDFS:
2033
  headertoContinue1 = False
 
2068
  if dot_pattern.search(line_text):
2069
  dot_line_count += 1
2070
 
2071
+ if dot_line_count >= 1:
2072
  toc_pages.append(page_num)
2073
 
2074
  return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
 
2089
  # df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2","BodyText"])
2090
  dictionaryNBS={}
2091
  data_list_JSON = []
2092
+ json_output=[]
2093
  currentgroupname=''
2094
  if len(top_3_font_sizes)==3:
2095
  mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes
 
2298
  data_list_JSON.append(data_entry)
2299
 
2300
  # Convert list to JSON
2301
+ # json_output = [data_list_JSON]
2302
+ # json_output = json.dumps(data_list_JSON, indent=4)
2303
 
2304
  i += 2
2305
  continue
 
2399
  data_list_JSON.append(data_entry)
2400
 
2401
  # Convert list to JSON
2402
+ # json_output = [data_list_JSON]
2403
+ # json_output = json.dumps(data_list_JSON, indent=4)
2404
 
2405
 
2406
  i += 2
 
2474
  else:
2475
  stringtowrite='To be billed'
2476
  highlight_boxes(docHighlights, page_highlights,stringtowrite)
2477
+ docarray.append(docHighlights)
2478
+ jsons.append(data_list_JSON)
2479
+ dbxTeam = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
2480
+ dbPath = '/TSA JOBS/ADR Test/FIND/'
2481
+ jsonCombined=[]
2482
+ for i in range(len(arrayofPDFS)):
2483
+ singlepdf=arrayofPDFS[i]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2484
 
2485
+ metadata = dbxTeam.sharing_get_shared_link_metadata(singlepdf)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2486
  pdf_bytes = BytesIO()
2487
  docHighlights.save(pdf_bytes)
2488
  pdflink = tsadropboxretrieval.uploadanyFile(doc=docarray[i], path=dbPath, pdfname=filenames[i])
2489
+ # json_copy = copy.deepcopy(jsons[i])
2490
+ # Update links for this JSON
2491
+ # json_output1 = changepdflinks(json_copy, pdflink)
2492
  json_output1=changepdflinks(jsons[i],pdflink)
2493
  jsonCombined.extend(json_output1)
2494
  combined_json_str = json.dumps(jsonCombined, indent=1)
2495
  print(combined_json_str)
2496
  return pdf_bytes.getvalue(), docHighlights , combined_json_str, Alltexttobebilled , filenames
2497
+