Spaces:
Runtime error
Runtime error
Update Find_Hyperlinking_text.py
Browse files
Find_Hyperlinking_text.py
CHANGED
|
@@ -6,7 +6,8 @@ import pandas as pd
|
|
| 6 |
from collections import Counter
|
| 7 |
import fitz # PyMuPDF
|
| 8 |
import re
|
| 9 |
-
|
|
|
|
| 10 |
def normalize_text(text):
|
| 11 |
"""Lowercase, remove extra spaces, and strip special characters."""
|
| 12 |
text = text.lower().strip()
|
|
@@ -69,7 +70,7 @@ def annotate_text_from_pdf(pdfshareablelinks, LISTheading_to_search):
|
|
| 69 |
# Open the PDF using PyMuPDF
|
| 70 |
pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
|
| 71 |
repeated_texts = get_repeated_texts(pdf_document)
|
| 72 |
-
df = pd.DataFrame(columns=["
|
| 73 |
dictionaryNBS={}
|
| 74 |
for NBSindex, heading_to_search in enumerate(LISTheading_to_search):
|
| 75 |
if NBSindex == len(LISTheading_to_search) - 1:
|
|
@@ -263,9 +264,12 @@ def annotate_text_from_pdf(pdfshareablelinks, LISTheading_to_search):
|
|
| 263 |
annot = page.add_highlight_annot(highlight_rect)
|
| 264 |
annot.update()
|
| 265 |
groupmainheadingFromArray = [item for item in merged_groupheadings if previous_header in item]
|
| 266 |
-
|
|
|
|
|
|
|
| 267 |
if len(groupmainheadingFromArray) > 0:
|
| 268 |
-
|
|
|
|
| 269 |
# Highlight the text
|
| 270 |
if collecting_text:
|
| 271 |
annot = page.add_highlight_annot(highlight_rect)
|
|
|
|
| 6 |
from collections import Counter
|
| 7 |
import fitz # PyMuPDF
|
| 8 |
import re
|
| 9 |
+
import urllib.parse
|
| 10 |
+
baselink='https://marthee-nbslink.hf.space/view-pdf?'
|
| 11 |
def normalize_text(text):
|
| 12 |
"""Lowercase, remove extra spaces, and strip special characters."""
|
| 13 |
text = text.lower().strip()
|
|
|
|
| 70 |
# Open the PDF using PyMuPDF
|
| 71 |
pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
|
| 72 |
repeated_texts = get_repeated_texts(pdf_document)
|
| 73 |
+
df = pd.DataFrame(columns=["NBSLink","NBS", 'head above 1', "head above 2"])
|
| 74 |
dictionaryNBS={}
|
| 75 |
for NBSindex, heading_to_search in enumerate(LISTheading_to_search):
|
| 76 |
if NBSindex == len(LISTheading_to_search) - 1:
|
|
|
|
| 264 |
annot = page.add_highlight_annot(highlight_rect)
|
| 265 |
annot.update()
|
| 266 |
groupmainheadingFromArray = [item for item in merged_groupheadings if previous_header in item]
|
| 267 |
+
|
| 268 |
+
NBSlinkeach='pdfLink='+link+'&keyword='+NBS_heading+'#page='+str(pageNumberFound)+'&zoom='+str(highlight_rect)
|
| 269 |
+
encoded_link = urllib.parse.quote(NBSlinkeach, safe='')
|
| 270 |
if len(groupmainheadingFromArray) > 0:
|
| 271 |
+
print('LINKLINK:',baselink+encoded_link)
|
| 272 |
+
df = pd.concat([df, pd.DataFrame([{"NBSLink":baselink+encoded_link,"NBS": NBS_heading, 'head above 1': header2, "head above 2": groupmainheadingFromArray[0]}])], ignore_index=True)
|
| 273 |
# Highlight the text
|
| 274 |
if collecting_text:
|
| 275 |
annot = page.add_highlight_annot(highlight_rect)
|