InitialMarkups

Runtime error

App Files Files Community

Marthee committed on Feb 7, 2025

Commit

2f352ee

verified ·

1 Parent(s): 3e216ca

Update Find_Hyperlinking_text.py

Browse files

Files changed (1) hide show

Find_Hyperlinking_text.py +8 -4

Find_Hyperlinking_text.py CHANGED Viewed

@@ -6,7 +6,8 @@ import pandas as pd
 from collections import Counter
 import fitz  # PyMuPDF
 import re
 def normalize_text(text):
     """Lowercase, remove extra spaces, and strip special characters."""
     text = text.lower().strip()
@@ -69,7 +70,7 @@ def annotate_text_from_pdf(pdfshareablelinks, LISTheading_to_search):
     # Open the PDF using PyMuPDF
     pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
     repeated_texts = get_repeated_texts(pdf_document)
-    df = pd.DataFrame(columns=["NBS Link","NBS", 'head above 1', "head above 2"])
     dictionaryNBS={}
     for NBSindex, heading_to_search in enumerate(LISTheading_to_search):
         if NBSindex == len(LISTheading_to_search) - 1:
@@ -263,9 +264,12 @@ def annotate_text_from_pdf(pdfshareablelinks, LISTheading_to_search):
                                             annot = page.add_highlight_annot(highlight_rect)
                                             annot.update()
                                             groupmainheadingFromArray = [item for item in merged_groupheadings if previous_header in item]
                                             if len(groupmainheadingFromArray) > 0:
-                                                df = pd.concat([df, pd.DataFrame([{"NBS": NBS_heading, 'head above 1': header2, "head above 2": groupmainheadingFromArray[0]}])], ignore_index=True)
                                         # Highlight the text
                             if collecting_text:
                                 annot = page.add_highlight_annot(highlight_rect)

 from collections import Counter
 import fitz  # PyMuPDF
 import re
+import urllib.parse
+baselink='https://marthee-nbslink.hf.space/view-pdf?'
 def normalize_text(text):
     """Lowercase, remove extra spaces, and strip special characters."""
     text = text.lower().strip()
     # Open the PDF using PyMuPDF
     pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
     repeated_texts = get_repeated_texts(pdf_document)
+    df = pd.DataFrame(columns=["NBSLink","NBS", 'head above 1', "head above 2"])
     dictionaryNBS={}
     for NBSindex, heading_to_search in enumerate(LISTheading_to_search):
         if NBSindex == len(LISTheading_to_search) - 1:
                                             annot = page.add_highlight_annot(highlight_rect)
                                             annot.update()
                                             groupmainheadingFromArray = [item for item in merged_groupheadings if previous_header in item]
+                                            NBSlinkeach='pdfLink='+link+'&keyword='+NBS_heading+'#page='+str(pageNumberFound)+'&zoom='+str(highlight_rect)
+                                            encoded_link = urllib.parse.quote(NBSlinkeach, safe='')
                                             if len(groupmainheadingFromArray) > 0:
+                                                print('LINKLINK:',baselink+encoded_link)
+                                                df = pd.concat([df, pd.DataFrame([{"NBSLink":baselink+encoded_link,"NBS": NBS_heading, 'head above 1': header2, "head above 2": groupmainheadingFromArray[0]}])], ignore_index=True)
                                         # Highlight the text
                             if collecting_text:
                                 annot = page.add_highlight_annot(highlight_rect)