Marthee committed on
Commit
1bf5d60
·
verified ·
1 Parent(s): 069ecb9

Update Find_Hyperlinking_text.py

Browse files
Files changed (1) hide show
  1. Find_Hyperlinking_text.py +226 -189
Find_Hyperlinking_text.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import fitz # PyMuPDF
2
  from io import BytesIO
3
  import re
@@ -10,7 +11,8 @@ import urllib.parse
10
  import pandas as pd
11
  import tempfile
12
  from fpdf import FPDF
13
-
 
14
 
15
  baselink='https://marthee-nbslink.hf.space/view-pdf?'
16
  class PDF(FPDF):
@@ -29,11 +31,15 @@ def save_df_to_pdf(df):
29
  pdf.set_right_margin(margin)
30
 
31
  pdf.add_page()
32
- pdf.set_font("Arial", size=10)
 
 
 
 
33
 
34
- # Set column widths and calculate total table width
35
- col_width = 50
36
- num_cols = 4
37
  table_width = col_width * num_cols
38
 
39
  # Get page width and calculate left alignment
@@ -44,23 +50,20 @@ def save_df_to_pdf(df):
44
 
45
  # Table headers
46
  pdf.set_fill_color(200, 200, 200) # Light gray background
47
- pdf.set_font("Arial", "B", 10)
48
- headers = ["NBS Link", "NBS", "Head Above 1", "Head Above 2"]
49
-
50
- # Draw table headers
51
  for header in headers:
52
  pdf.cell(col_width, 8, header, border=1, fill=True, align="C")
53
  pdf.ln()
54
 
55
- pdf.set_font("Arial", size=9)
56
 
57
- # Add rows
58
  for _, row in df.iterrows():
59
  x_start = start_x # Ensure every row starts at the same position
60
  y_start = pdf.get_y()
61
 
62
  # Calculate max height needed for this row
63
- text_lines = {col: pdf.multi_cell(col_width, 5, row[col], border=0, align="L", split_only=True) for col in ["NBS", "head above 1", "head above 2"]}
64
  max_lines = max(len(lines) for lines in text_lines.values())
65
  max_height = max_lines * 5
66
 
@@ -73,7 +76,7 @@ def save_df_to_pdf(df):
73
  pdf.set_xy(x_start + col_width, y_start)
74
 
75
  # Draw each cell manually, ensuring equal height
76
- for i, col_name in enumerate(["NBS", "head above 1", "head above 2"]):
77
  x_col = x_start + col_width * (i + 1)
78
  y_col = y_start
79
  pdf.multi_cell(col_width, 5, row[col_name], border=0, align="L") # Draw text
@@ -82,11 +85,8 @@ def save_df_to_pdf(df):
82
 
83
  # Move to the next row
84
  pdf.ln(max_height)
85
- # Save PDF to memory instead of a file
86
- # pdf_output = BytesIO()
87
- # pdf_output = 'output.pdf'
88
- pdf_output = pdf.output(dest="S").encode("latin1") # Returns the PDF as a byte string
89
-
90
  return pdf_output
91
 
92
 
@@ -96,7 +96,7 @@ def normalize_text(text):
96
  text = text.lower().strip()
97
  text = re.sub(r'\s+', ' ', text) # Normalize multiple spaces
98
  return re.sub(r'[^\w\s]', '', text) # Remove punctuation
99
- def get_repeated_texts(pdf_document, threshold=0.9):
100
  """
101
  Identify text that appears on most pages.
102
  :param pdf_document: The opened PDF document.
@@ -125,11 +125,9 @@ def annotate_text_from_pdf(pdfshareablelinks, LISTheading_to_search):
125
  """
126
  Annotates text under a specific heading in a PDF, highlights it,
127
  and constructs zoom coordinates for the first occurrence of the heading.
128
-
129
  Args:
130
  pdfshareablelinks (list): List of shareable links to PDFs.
131
  heading_to_search (str): The heading to search for in the PDF.
132
-
133
  Returns:
134
  Tuple: Annotated PDF bytes, count of heading occurrences, and zoom string.
135
  """
@@ -153,8 +151,9 @@ def annotate_text_from_pdf(pdfshareablelinks, LISTheading_to_search):
153
  # Open the PDF using PyMuPDF
154
  pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
155
  repeated_texts = get_repeated_texts(pdf_document)
156
- df = pd.DataFrame(columns=["NBSLink","NBS", 'head above 1', "head above 2"])
157
  dictionaryNBS={}
 
158
  for NBSindex, heading_to_search in enumerate(LISTheading_to_search):
159
  if NBSindex == len(LISTheading_to_search) - 1:
160
  flagAllNBSvisited = True
@@ -164,6 +163,8 @@ def annotate_text_from_pdf(pdfshareablelinks, LISTheading_to_search):
164
  f10_count = 0
165
  current_y = None
166
  highlight_rect = None
 
 
167
  zoom_str = None
168
  toc_flag = False
169
  span_font_goal = None
@@ -172,6 +173,7 @@ def annotate_text_from_pdf(pdfshareablelinks, LISTheading_to_search):
172
  groupheadings = []
173
  merged_groupheadings = []
174
  collectheader2 = False
 
175
  header2 = ''
176
  header2_first_span_size = 0
177
  previous_header = ''
@@ -213,177 +215,212 @@ def annotate_text_from_pdf(pdfshareablelinks, LISTheading_to_search):
213
  span_y = span['bbox'][1]
214
  span_font = span['font']
215
  span_size = span['size']
216
-
217
- if previous_y is None:
218
- previous_y = span_y # Initialize on first span
219
-
220
- # If same Y coordinate as previous, append to the current line
221
- if abs(span_y - previous_y) < 5: # Allow a small margin for OCR variations
222
- current_line_text += " " + span_text
223
- current_line_text = normalize_text(current_line_text)
224
- current_line_span_size = span_size
225
- else:
226
- # Store the complete line and reset for the new line
227
- if current_line_text.strip():
228
- all_text.append(current_line_text.strip())
229
-
230
- current_line_text = span_text # Start a new line
231
- previous_y = span_y # Update the reference Y
232
- text = span_text
233
- if collecting_text and span_font == span_font_goal and span_size == span_size_goal and span_text[0].isdigit():
234
- print(f"Ending collection at heading: {span_text}")
235
- print("merged_groupheadings:", merged_groupheadings)
236
- collecting_text = False
237
- continue
238
- if collecting_text:
239
- annot = page.add_highlight_annot(highlight_rect)
240
- annot.update()
241
-
242
- if 'Content' in span_text:
243
- toc_flag = True
244
- TOC_start = span_text
245
- print('content', TOC_start, span_size)
246
-
247
- if toc_flag:
248
- if 'Content' not in span_text:
249
- if current_y is None:
250
- current_y = span_y
251
- current_size = span_size # Initialize the reference span size
252
- # Check if the current span size deviates significantly
253
- if abs(span_size - current_size) > 1: # Threshold for size difference
254
- toc_flag = False
255
-
256
- if abs(current_y - span_y) < 5: # Allowing more flexibility for multi-line headings
257
- current_line += " " + span_text # Keep accumulating text
258
- else:
259
- if current_line.strip(): # Only process non-empty lines
260
- pattern = r"^([A-Za-z0-9\s\/\-,]+)(?=\.+)"
261
- match = re.match(pattern, current_line.strip())
262
-
263
- if match:
264
- groupheadings.append(match.group(1).strip())
265
- current_line = span_text
266
- current_y = span_y
267
- current_size = span_size # Update reference span size
268
- if len(groupheadings) > 0:
269
- pattern = re.compile(r"^[A-Za-z]\d{2} ") # Match headings starting with letter + 2 digits
270
- merged_groupheadings = []
271
- current_item = None # Start as None to avoid an initial blank entry
272
-
273
- for item in groupheadings:
274
- if pattern.match(item): # If item starts with correct pattern, it's a new heading
275
- if current_item: # Append only if current_item is not empty
276
- merged_groupheadings.append(current_item.strip())
277
- current_item = item # Start new heading
278
- else:
279
- if current_item:
280
- current_item += " " + item # Merge with previous heading
281
-
282
- # Append last merged item after loop
283
- if current_item:
284
- merged_groupheadings.append(current_item.strip())
285
- if span_text == first_word:
286
- print('First word found:', span_text)
287
- # Check if it's not the last span in the current line
288
- print(i + 1, len(spans))
289
- if i + 1 < len(spans):
290
- next_span_text = (spans[i + 1]['text'].strip())
291
- # Check if the next span's text is in the heading list
292
- if next_span_text.replace(" ", "") in heading_to_search.replace(" ", ""):
293
- text = (span_text + ' ' + next_span_text)
294
- # After processing the current line, check if there's a next line
295
- if first_word == span_text:
296
- if line_index + 1 < len(block.get('lines', [])):
297
- next_line = block['lines'][line_index + 1]
298
- # You can process the spans of the next line here
299
- for next_span in next_line.get('spans', []):
300
- next_span_text = next_span['text'].strip()
301
- text = span_text + ' ' + next_span_text
302
- if len(merged_groupheadings) > 0:
303
- if re.match(r"[A-Za-z]\d{2}", span_text) and span_size > 10:
304
- previous_header = span_text # Store last detected header
305
- print('previous_header', span_text)
306
- groupmainheadingFromArray = [item for item in merged_groupheadings if previous_header in item]
307
-
308
- if previous_header:
309
- if not collectheader2:
310
- if header2_first_span_size == 0:
311
- spanSizeHeader = 10
312
  else:
313
- spanSizeHeader = header2_first_span_size
314
-
315
- for item in groupmainheadingFromArray:
316
- if not any(normalize_text(current_line_text) in normalize_text(item) for item in groupmainheadingFromArray):
317
- if span_size >= spanSizeHeader:
318
- if not re.match(r"^\d{2}", current_line_text) and current_line_text not in repeated_texts and "Bold" in span["font"] :
319
- if len(header2) > 0:
320
- header2_first_span_size = span_size
321
- header2 = current_line_text
322
- print('header2', header2, span_size, spanSizeHeader)
323
-
324
- trimmed_text = text.replace(" ", "")
325
- if len(text) > 0:
326
- if text.split()[0] in heading_words:
327
- if len(trimmed_text) > 0 and (heading_to_search.replace(" ", "") in trimmed_text):
328
- print(trimmed_text, heading_to_search)
329
- f10_count += 1
330
- # Start collecting text under the second occurrence of the heading
331
- if f10_count == 1:
332
- collecting_text = True
333
- print(f"Starting collection under heading: {text}, {span_font}, {span_size}")
334
- collectheader2 = True
335
- NBS_heading = heading_to_searchNBS
336
- x0, y0, x1, y1 = highlight_rect
337
-
338
- span_font_goal = span_font # Capture the font at the first heading match
339
- span_size_goal = span_size # Capture the size at the first heading match
340
- zoom = 200
341
- left = int(x0)
342
- top = int(y0)
343
- zoom_str = f"{zoom},{left},{top}"
344
- pageNumberFound = page_num + 1
345
- dictionaryNBS[heading_to_searchNBS] = [pageNumberFound, zoom_str]
346
-
347
- annot = page.add_highlight_annot(highlight_rect)
348
- annot.update()
349
- groupmainheadingFromArray = [item for item in merged_groupheadings if previous_header in item]
350
-
351
- # Build the query parameters
352
- params = {
353
- 'pdfLink': link, # Your PDF link
354
- 'keyword': NBS_heading, # Your keyword (could be a string or list)
355
- }
356
-
357
- # URL encode each parameter
358
- encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
359
-
360
- # Construct the final encoded link
361
- encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
362
-
363
- # Correctly construct the final URL with page and zoom
364
- final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
365
-
366
- # Optionally, add the URL to a DataFrame
367
- if len(groupmainheadingFromArray) > 0:
368
- df = pd.concat([df, pd.DataFrame([{
369
- "NBSLink": final_url,
370
- "NBS": NBS_heading,
371
- 'head above 1': header2,
372
- "head above 2": groupmainheadingFromArray[0]
373
- }])], ignore_index=True)
374
-
375
- print("Final URL:", final_url)
376
-
377
- if collecting_text:
378
- annot = page.add_highlight_annot(highlight_rect)
379
- annot.update()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  if current_line.strip():
381
  all_text += current_line.strip() + '\n' # Append the current line
382
  print(df)
383
  print(dictionaryNBS)
384
- xx=save_df_to_pdf(df)
385
- outputpdfFitz =fitz.open('pdf',xx)
386
  pdf_bytes = BytesIO()
387
  pdf_document.save(pdf_bytes)
388
- return pdf_bytes.getvalue(), pdf_document , df,outputpdfFitz
389
-
 
1
+
2
  import fitz # PyMuPDF
3
  from io import BytesIO
4
  import re
 
11
  import pandas as pd
12
  import tempfile
13
  from fpdf import FPDF
14
+ import json
15
+ from datetime import datetime
16
 
17
  baselink='https://marthee-nbslink.hf.space/view-pdf?'
18
  class PDF(FPDF):
 
31
  pdf.set_right_margin(margin)
32
 
33
  pdf.add_page()
34
+ pdf.set_font("Arial", size=8) # Reduce font size to fit more text
35
+
36
+ # Table headers
37
+ headers = ["NBSLink", "Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"]
38
+ num_cols = len(headers)
39
 
40
+ # Calculate column width dynamically
41
+ max_table_width = pdf.w - 2 * margin # Total available width
42
+ col_width = max_table_width / num_cols # Distribute evenly
43
  table_width = col_width * num_cols
44
 
45
  # Get page width and calculate left alignment
 
50
 
51
  # Table headers
52
  pdf.set_fill_color(200, 200, 200) # Light gray background
53
+ pdf.set_font("Arial", "B", 8)
54
+
 
 
55
  for header in headers:
56
  pdf.cell(col_width, 8, header, border=1, fill=True, align="C")
57
  pdf.ln()
58
 
59
+ pdf.set_font("Arial", size=7) # Reduce font size for data rows
60
 
 
61
  for _, row in df.iterrows():
62
  x_start = start_x # Ensure every row starts at the same position
63
  y_start = pdf.get_y()
64
 
65
  # Calculate max height needed for this row
66
+ text_lines = {col: pdf.multi_cell(col_width, 5, row[col], border=0, align="L", split_only=True) for col in ["Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"]}
67
  max_lines = max(len(lines) for lines in text_lines.values())
68
  max_height = max_lines * 5
69
 
 
76
  pdf.set_xy(x_start + col_width, y_start)
77
 
78
  # Draw each cell manually, ensuring equal height
79
+ for i, col_name in enumerate(["Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"]):
80
  x_col = x_start + col_width * (i + 1)
81
  y_col = y_start
82
  pdf.multi_cell(col_width, 5, row[col_name], border=0, align="L") # Draw text
 
85
 
86
  # Move to the next row
87
  pdf.ln(max_height)
88
+ # Save PDF to memory
89
+ pdf_output = pdf.output(dest="S").encode("latin1")
 
 
 
90
  return pdf_output
91
 
92
 
 
96
  text = text.lower().strip()
97
  text = re.sub(r'\s+', ' ', text) # Normalize multiple spaces
98
  return re.sub(r'[^\w\s]', '', text) # Remove punctuation
99
+ def get_repeated_texts(pdf_document, threshold=0.85):
100
  """
101
  Identify text that appears on most pages.
102
  :param pdf_document: The opened PDF document.
 
125
  """
126
  Annotates text under a specific heading in a PDF, highlights it,
127
  and constructs zoom coordinates for the first occurrence of the heading.
 
128
  Args:
129
  pdfshareablelinks (list): List of shareable links to PDFs.
130
  heading_to_search (str): The heading to search for in the PDF.
 
131
  Returns:
132
  Tuple: Annotated PDF bytes, count of heading occurrences, and zoom string.
133
  """
 
151
  # Open the PDF using PyMuPDF
152
  pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
153
  repeated_texts = get_repeated_texts(pdf_document)
154
+ df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"])
155
  dictionaryNBS={}
156
+ data_list_JSON = []
157
  for NBSindex, heading_to_search in enumerate(LISTheading_to_search):
158
  if NBSindex == len(LISTheading_to_search) - 1:
159
  flagAllNBSvisited = True
 
163
  f10_count = 0
164
  current_y = None
165
  highlight_rect = None
166
+ highlight_rectEnding=None
167
+ highlight_rectBegin=None
168
  zoom_str = None
169
  toc_flag = False
170
  span_font_goal = None
 
173
  groupheadings = []
174
  merged_groupheadings = []
175
  collectheader2 = False
176
+ endingcontentFlag=True
177
  header2 = ''
178
  header2_first_span_size = 0
179
  previous_header = ''
 
215
  span_y = span['bbox'][1]
216
  span_font = span['font']
217
  span_size = span['size']
218
+ if normalize_text(span_text) not in repeated_texts and not (span_text.startswith('Page')):
219
+ if previous_y is None:
220
+ previous_y = span_y # Initialize on first span
221
+
222
+ # If same Y coordinate as previous, append to the current line
223
+ if abs(span_y - previous_y) < 5: # Allow a small margin for OCR variations
224
+ current_line_text += " " + span_text
225
+ current_line_text = normalize_text(current_line_text)
226
+ current_line_span_size = span_size
227
+ else:
228
+ # Store the complete line and reset for the new line
229
+ if current_line_text.strip():
230
+ all_text.append(current_line_text.strip())
231
+
232
+ current_line_text = span_text # Start a new line
233
+ previous_y = span_y # Update the reference Y
234
+ text = span_text
235
+ if collecting_text and span_font == span_font_goal and span_size == span_size_goal and span_text[0].isdigit():
236
+ print(f"Ending collection at heading: {span_text}")
237
+ highlight_rectEnding=highlight_rect
238
+ print("merged_groupheadings:", merged_groupheadings)
239
+ print('groupheadingss',groupheadings)
240
+ collecting_text = False
241
+ continue
242
+ if collecting_text:
243
+ annot = page.add_rect_annot(highlight_rect) # Create a rectangle annotation
244
+ annot.set_colors(stroke=(1, 0, 0)) # Set border color (Red)
245
+ annot.update() # Apply changes
246
+
247
+ if 'Content' in span_text:
248
+ toc_flag = True
249
+ TOC_start = span_text
250
+ print('content', TOC_start, span_size)
251
+
252
+ if toc_flag or endingcontentFlag:
253
+ if 'Content' not in span_text:
254
+ if current_y is None:
255
+ current_y = span_y
256
+ current_size = span_size # Initialize the reference span size
257
+ # Check if the current span size deviates significantly
258
+ if abs(span_size - current_size) > 1: # Threshold for size difference
259
+ toc_flag = False
260
+
261
+ if abs(current_y - span_y) < 5: # Allowing more flexibility for multi-line headings
262
+ current_line += " " + span_text # Keep accumulating text
263
+ else:
264
+ if current_line.strip(): # Only process non-empty lines
265
+ clean_text = re.sub(r'\.{5,}\d*$', '', current_line, flags=re.MULTILINE) # Remove dots and trailing numbers
266
+
267
+ print(clean_text.strip())
268
+ if clean_text:
269
+ groupheadings.append(clean_text)
270
+ # else:
271
+ # toc_flag = False
272
+
273
+ current_line = span_text
274
+ current_y = span_y
275
+ current_size = span_size # Update reference span size
276
+ # print('outofcurrent')
277
+ if len(groupheadings) > 0:
278
+ pattern = re.compile(r"^[A-Za-z]\d{2} ") # Match headings starting with letter + 2 digits
279
+ merged_groupheadings = []
280
+ current_item = None # Start as None to avoid an initial blank entry
281
+
282
+ for item in groupheadings:
283
+ if pattern.match(item): # If item starts with correct pattern, it's a new heading
284
+ if current_item: # Append only if current_item is not empty
285
+ if current_item not in merged_groupheadings:
286
+ extracted_text = re.split(r"\.{3,}", current_item)[0].strip()
287
+ merged_groupheadings.append(extracted_text.strip())
288
+ current_item = item # Start new heading
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  else:
290
+ if current_item:
291
+ if item not in current_item:
292
+ current_item += " " + item # Merge with previous heading
293
+
294
+ # Append last merged item after loop
295
+ if current_item:
296
+ if current_item not in merged_groupheadings:
297
+ extracted_text = re.split(r"\.{3,}", current_item)[0].strip()
298
+ merged_groupheadings.append(extracted_text.strip())
299
+ if span_text == first_word:
300
+ print('First word found:', span_text)
301
+ # Check if it's not the last span in the current line
302
+ print(i + 1, len(spans))
303
+ if i + 1 < len(spans):
304
+ next_span_text = (spans[i + 1]['text'].strip())
305
+ # Check if the next span's text is in the heading list
306
+ if next_span_text.replace(" ", "") in heading_to_search.replace(" ", ""):
307
+ text = (span_text + ' ' + next_span_text)
308
+ # After processing the current line, check if there's a next line
309
+ if first_word == span_text:
310
+ if line_index + 1 < len(block.get('lines', [])):
311
+ next_line = block['lines'][line_index + 1]
312
+ # You can process the spans of the next line here
313
+ for next_span in next_line.get('spans', []):
314
+ next_span_text = next_span['text'].strip()
315
+ text = span_text + ' ' + next_span_text
316
+ if len(merged_groupheadings) > 0:
317
+ if re.match(r"[A-Za-z]\d{2}", span_text) and span_size > 10:
318
+ toc_flag = False
319
+ endingcontentFlag=False
320
+ previous_header = span_text # Store last detected header
321
+ print('previous_header', span_text)
322
+
323
+ groupmainheadingFromArray = [item for item in merged_groupheadings if previous_header in item]
324
+
325
+ if previous_header:
326
+ if not collectheader2:
327
+ if header2_first_span_size == 0:
328
+ spanSizeHeader = 10
329
+ else:
330
+ spanSizeHeader = header2_first_span_size
331
+
332
+ for item in groupmainheadingFromArray:
333
+ if not any(normalize_text(current_line_text) in normalize_text(item) for item in groupmainheadingFromArray):
334
+ if not current_line_text[0].isdigit() :
335
+ if span_size >= spanSizeHeader:
336
+ if not re.match(r"^\d{2}", current_line_text) and current_line_text not in repeated_texts and "Bold" in span["font"] :
337
+ if len(header2) > 0 :
338
+ header2_first_span_size = span_size
339
+ header2 = current_line_text
340
+ print('header2', header2, span_size, spanSizeHeader)
341
+
342
+ trimmed_text = text.replace(" ", "")
343
+ if len(text) > 0:
344
+ if text.split()[0] in heading_words:
345
+ if len(trimmed_text) > 0 and (heading_to_search.replace(" ", "") in trimmed_text):
346
+ print(trimmed_text, heading_to_search)
347
+ f10_count += 1
348
+ # Start collecting text under the second occurrence of the heading
349
+ if f10_count == 1:
350
+ collecting_text = True
351
+ print(f"Starting collection under heading: {text}, {span_font}, {span_size}")
352
+ collectheader2 = True
353
+ NBS_heading = heading_to_searchNBS
354
+ highlight_rectBegin=highlight_rect
355
+ x0, y0, x1, y1 = highlight_rectBegin
356
+
357
+ span_font_goal = span_font # Capture the font at the first heading match
358
+ span_size_goal = span_size # Capture the size at the first heading match
359
+ zoom = 200
360
+ left = int(x0)
361
+ top = int(y0)
362
+ zoom_str = f"{zoom},{left},{top}"
363
+ pageNumberFound = page_num + 1
364
+ dictionaryNBS[heading_to_searchNBS] = [pageNumberFound, zoom_str]
365
+
366
+ annot = page.add_rect_annot(highlight_rect) # Create a rectangle annotation
367
+ annot.set_colors(stroke=(1, 0, 0)) # Set border color (Red)
368
+ annot.update() # Apply changes
369
+ groupmainheadingFromArray = [item for item in merged_groupheadings if previous_header in item]
370
+
371
+ # Build the query parameters
372
+ params = {
373
+ 'pdfLink': link, # Your PDF link
374
+ 'keyword': NBS_heading, # Your keyword (could be a string or list)
375
+ }
376
+
377
+ # URL encode each parameter
378
+ encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
379
+
380
+ # Construct the final encoded link
381
+ encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
382
+
383
+ # Correctly construct the final URL with page and zoom
384
+ final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
385
+
386
+ # Get current date and time
387
+ now = datetime.now()
388
+
389
+ # Format the output
390
+ formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
391
+ # Optionally, add the URL to a DataFrame
392
+
393
+
394
+ if len(groupmainheadingFromArray) > 0:
395
+ data_entry = {
396
+ "NBSLink": final_url,
397
+ "Subject": NBS_heading,
398
+ "Page": str(pageNumberFound),
399
+ "Author": "ADR",
400
+ "Creation Date": formatted_time,
401
+ "Layer": "Initial",
402
+ "Code": "to be added",
403
+ "head above 1": header2,
404
+ "head above 2": groupmainheadingFromArray[0]
405
+ }
406
+ data_list_JSON.append(data_entry)
407
+
408
+ # Convert list to JSON
409
+ json_output = json.dumps(data_list_JSON, indent=4)
410
+
411
+ print("Final URL:", final_url)
412
+
413
+ if collecting_text:
414
+ annot = page.add_rect_annot(highlight_rect) # Create a rectangle annotation
415
+ annot.set_colors(stroke=(1, 0, 0)) # Set border color (Red)
416
+ annot.update() # Apply changes
417
  if current_line.strip():
418
  all_text += current_line.strip() + '\n' # Append the current line
419
  print(df)
420
  print(dictionaryNBS)
421
+ # xx=save_df_to_pdf(df)
422
+ # outputpdfFitz =fitz.open('pdf',xx)
423
  pdf_bytes = BytesIO()
424
  pdf_document.save(pdf_bytes)
425
+ print('JSONN',json_output)
426
+ return pdf_bytes.getvalue(), pdf_document , df, json_output