Spaces:
Runtime error
Runtime error
Update InitialMarkups.py
Browse files- InitialMarkups.py +42 -55
InitialMarkups.py
CHANGED
|
@@ -39,43 +39,30 @@ import json
|
|
| 39 |
import copy
|
| 40 |
import urllib.parse
|
| 41 |
|
| 42 |
-
def changepdflinks(data_list_JSON, pdflink):
|
| 43 |
-
print('Received JSON:', data_list_JSON,pdflink)
|
| 44 |
-
|
| 45 |
-
# Ensure list of dicts
|
| 46 |
-
if isinstance(data_list_JSON, str):
|
| 47 |
-
try:
|
| 48 |
-
data_list_JSON = json.loads(data_list_JSON)
|
| 49 |
-
except json.JSONDecodeError:
|
| 50 |
-
raise ValueError(f"Invalid JSON string passed: {data_list_JSON[:200]}")
|
| 51 |
-
elif not isinstance(data_list_JSON, list):
|
| 52 |
-
raise ValueError(f"Input must be JSON string or list, got {type(data_list_JSON)}")
|
| 53 |
-
|
| 54 |
-
data_list_JSON = copy.deepcopy(data_list_JSON)
|
| 55 |
-
|
| 56 |
-
for entry in data_list_JSON:
|
| 57 |
-
old_url = entry.get("NBSLink", "")
|
| 58 |
-
print('old_url',old_url)
|
| 59 |
-
if not old_url:
|
| 60 |
-
continue
|
| 61 |
|
| 62 |
-
|
| 63 |
-
fragment = parsed.fragment # e.g. "page=3&zoom=150"
|
| 64 |
-
print("Fragment:", fragment)
|
| 65 |
|
| 66 |
-
|
| 67 |
-
|
| 68 |
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
return data_list_JSON
|
| 79 |
|
| 80 |
|
| 81 |
|
|
@@ -875,19 +862,19 @@ def extract_section_under_header(multiplePDF_Paths):
|
|
| 875 |
pageNumberFound = page_num + 1
|
| 876 |
|
| 877 |
# Build the query parameters
|
| 878 |
-
params = {
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
}
|
| 882 |
|
| 883 |
-
# URL encode each parameter
|
| 884 |
-
encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
|
| 885 |
|
| 886 |
-
# Construct the final encoded link
|
| 887 |
-
encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
|
| 888 |
|
| 889 |
-
# Correctly construct the final URL with page and zoom
|
| 890 |
-
final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
|
| 891 |
|
| 892 |
# Get current date and time
|
| 893 |
now = datetime.now()
|
|
@@ -898,7 +885,7 @@ def extract_section_under_header(multiplePDF_Paths):
|
|
| 898 |
|
| 899 |
|
| 900 |
data_entry = {
|
| 901 |
-
"NBSLink":
|
| 902 |
"Subject": heading_to_search,
|
| 903 |
"Page": str(pageNumberFound),
|
| 904 |
"Author": "ADR",
|
|
@@ -969,19 +956,19 @@ def extract_section_under_header(multiplePDF_Paths):
|
|
| 969 |
pageNumberFound = page_num + 1
|
| 970 |
|
| 971 |
# Build the query parameters
|
| 972 |
-
params = {
|
| 973 |
-
|
| 974 |
-
|
| 975 |
-
}
|
| 976 |
|
| 977 |
-
# URL encode each parameter
|
| 978 |
-
encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
|
| 979 |
|
| 980 |
-
# Construct the final encoded link
|
| 981 |
-
encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
|
| 982 |
|
| 983 |
-
# Correctly construct the final URL with page and zoom
|
| 984 |
-
final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
|
| 985 |
|
| 986 |
# Get current date and time
|
| 987 |
now = datetime.now()
|
|
@@ -992,7 +979,7 @@ def extract_section_under_header(multiplePDF_Paths):
|
|
| 992 |
|
| 993 |
|
| 994 |
data_entry = {
|
| 995 |
-
"NBSLink":
|
| 996 |
"Subject": heading_to_search,
|
| 997 |
"Page": str(pageNumberFound),
|
| 998 |
"Author": "ADR",
|
|
|
|
| 39 |
import copy
|
| 40 |
import urllib.parse
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
+
import urllib.parse
|
|
|
|
|
|
|
| 44 |
|
| 45 |
+
def setpdflinks(json_data, pdf_path):
    """Rewrite each entry's ``NBSLink`` into a full PDF-viewer URL.

    Every entry's current ``NBSLink`` value is treated as the URL fragment
    (presumably something like ``page=3&zoom=150`` -- confirm against the
    producer of these entries) and is appended verbatim, without URL
    encoding, after ``#``. A missing ``NBSLink`` yields an empty fragment.

    Args:
        json_data: Iterable of dict entries, or a JSON string that decodes
            to such a list. ``None`` or an empty value yields ``[]``.
        pdf_path: Link to the PDF; percent-encoded in full (``safe=''``)
            and passed to the viewer as the ``pdfLink`` query parameter.

    Returns:
        A new list holding the same entry objects. The entries themselves
        are updated in place, so the caller's dicts are modified too.

    Raises:
        json.JSONDecodeError: If ``json_data`` is a string that is not
            valid JSON (a ``ValueError`` subclass).
    """
    # Local import so this block does not depend on the file-level imports.
    import json

    base_viewer_link = "https://findconsole-initialmarkups.hf.space/view-pdf?"

    # Nothing to rewrite; also avoids encoding pdf_path when it is unused.
    if not json_data:
        return []

    # Accept a serialized payload as well as an already-decoded list.
    if isinstance(json_data, str):
        json_data = json.loads(json_data)

    # The PDF link is the same for every entry, so encode it only once.
    encoded_pdf_link = urllib.parse.quote(pdf_path, safe='')
    url_prefix = f"{base_viewer_link}pdfLink={encoded_pdf_link}#"

    updated_json = []
    for entry in json_data:
        fragment = entry.get("NBSLink", "")
        # Replace the stored fragment with the complete viewer URL.
        entry["NBSLink"] = f"{url_prefix}{fragment}"
        updated_json.append(entry)

    return updated_json
|
| 65 |
|
|
|
|
| 66 |
|
| 67 |
|
| 68 |
|
|
|
|
| 862 |
pageNumberFound = page_num + 1
|
| 863 |
|
| 864 |
# Build the query parameters
|
| 865 |
+
# params = {
|
| 866 |
+
# 'pdfLink': pdf_path, # Your PDF link
|
| 867 |
+
# 'keyword': heading_to_search, # Your keyword (could be a string or list)
|
| 868 |
+
# }
|
| 869 |
|
| 870 |
+
# # URL encode each parameter
|
| 871 |
+
# encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
|
| 872 |
|
| 873 |
+
# # Construct the final encoded link
|
| 874 |
+
# encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
|
| 875 |
|
| 876 |
+
# # Correctly construct the final URL with page and zoom
|
| 877 |
+
# final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
|
| 878 |
|
| 879 |
# Get current date and time
|
| 880 |
now = datetime.now()
|
|
|
|
| 885 |
|
| 886 |
|
| 887 |
data_entry = {
|
| 888 |
+
"NBSLink": zoom_str,
|
| 889 |
"Subject": heading_to_search,
|
| 890 |
"Page": str(pageNumberFound),
|
| 891 |
"Author": "ADR",
|
|
|
|
| 956 |
pageNumberFound = page_num + 1
|
| 957 |
|
| 958 |
# Build the query parameters
|
| 959 |
+
# params = {
|
| 960 |
+
# 'pdfLink': pdf_path, # Your PDF link
|
| 961 |
+
# 'keyword': heading_to_search, # Your keyword (could be a string or list)
|
| 962 |
+
# }
|
| 963 |
|
| 964 |
+
# # URL encode each parameter
|
| 965 |
+
# encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
|
| 966 |
|
| 967 |
+
# # Construct the final encoded link
|
| 968 |
+
# encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
|
| 969 |
|
| 970 |
+
# # Correctly construct the final URL with page and zoom
|
| 971 |
+
# final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
|
| 972 |
|
| 973 |
# Get current date and time
|
| 974 |
now = datetime.now()
|
|
|
|
| 979 |
|
| 980 |
|
| 981 |
data_entry = {
|
| 982 |
+
"NBSLink": zoom_str,
|
| 983 |
"Subject": heading_to_search,
|
| 984 |
"Page": str(pageNumberFound),
|
| 985 |
"Author": "ADR",
|