Marthee committed on
Commit
2f352ee
·
verified ·
1 Parent(s): 3e216ca

Update Find_Hyperlinking_text.py

Browse files
Files changed (1) hide show
  1. Find_Hyperlinking_text.py +8 -4
Find_Hyperlinking_text.py CHANGED
@@ -6,7 +6,8 @@ import pandas as pd
6
  from collections import Counter
7
  import fitz # PyMuPDF
8
  import re
9
-
 
10
  def normalize_text(text):
11
  """Lowercase, remove extra spaces, and strip special characters."""
12
  text = text.lower().strip()
@@ -69,7 +70,7 @@ def annotate_text_from_pdf(pdfshareablelinks, LISTheading_to_search):
69
  # Open the PDF using PyMuPDF
70
  pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
71
  repeated_texts = get_repeated_texts(pdf_document)
72
- df = pd.DataFrame(columns=["NBS Link","NBS", 'head above 1', "head above 2"])
73
  dictionaryNBS={}
74
  for NBSindex, heading_to_search in enumerate(LISTheading_to_search):
75
  if NBSindex == len(LISTheading_to_search) - 1:
@@ -263,9 +264,12 @@ def annotate_text_from_pdf(pdfshareablelinks, LISTheading_to_search):
263
  annot = page.add_highlight_annot(highlight_rect)
264
  annot.update()
265
  groupmainheadingFromArray = [item for item in merged_groupheadings if previous_header in item]
266
-
 
 
267
  if len(groupmainheadingFromArray) > 0:
268
- df = pd.concat([df, pd.DataFrame([{"NBS": NBS_heading, 'head above 1': header2, "head above 2": groupmainheadingFromArray[0]}])], ignore_index=True)
 
269
  # Highlight the text
270
  if collecting_text:
271
  annot = page.add_highlight_annot(highlight_rect)
 
6
  from collections import Counter
7
  import fitz # PyMuPDF
8
  import re
9
+ import urllib.parse
10
+ baselink='https://marthee-nbslink.hf.space/view-pdf?'
11
  def normalize_text(text):
12
  """Lowercase, remove extra spaces, and strip special characters."""
13
  text = text.lower().strip()
 
70
  # Open the PDF using PyMuPDF
71
  pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
72
  repeated_texts = get_repeated_texts(pdf_document)
73
+ df = pd.DataFrame(columns=["NBSLink","NBS", 'head above 1', "head above 2"])
74
  dictionaryNBS={}
75
  for NBSindex, heading_to_search in enumerate(LISTheading_to_search):
76
  if NBSindex == len(LISTheading_to_search) - 1:
 
264
  annot = page.add_highlight_annot(highlight_rect)
265
  annot.update()
266
  groupmainheadingFromArray = [item for item in merged_groupheadings if previous_header in item]
267
+
268
+ NBSlinkeach='pdfLink='+link+'&keyword='+NBS_heading+'#page='+str(pageNumberFound)+'&zoom='+str(highlight_rect)
269
+ encoded_link = urllib.parse.quote(NBSlinkeach, safe='')
270
  if len(groupmainheadingFromArray) > 0:
271
+ print('LINKLINK:',baselink+encoded_link)
272
+ df = pd.concat([df, pd.DataFrame([{"NBSLink":baselink+encoded_link,"NBS": NBS_heading, 'head above 1': header2, "head above 2": groupmainheadingFromArray[0]}])], ignore_index=True)
273
  # Highlight the text
274
  if collecting_text:
275
  annot = page.add_highlight_annot(highlight_rect)