Marthee committed on
Commit
242d2f6
·
verified ·
1 Parent(s): 4a2eef2

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +42 -55
InitialMarkups.py CHANGED
@@ -39,43 +39,30 @@ import json
39
  import copy
40
  import urllib.parse
41
 
42
- def changepdflinks(data_list_JSON, pdflink):
43
- print('Received JSON:', data_list_JSON,pdflink)
44
-
45
- # Ensure list of dicts
46
- if isinstance(data_list_JSON, str):
47
- try:
48
- data_list_JSON = json.loads(data_list_JSON)
49
- except json.JSONDecodeError:
50
- raise ValueError(f"Invalid JSON string passed: {data_list_JSON[:200]}")
51
- elif not isinstance(data_list_JSON, list):
52
- raise ValueError(f"Input must be JSON string or list, got {type(data_list_JSON)}")
53
-
54
- data_list_JSON = copy.deepcopy(data_list_JSON)
55
-
56
- for entry in data_list_JSON:
57
- old_url = entry.get("NBSLink", "")
58
- print('old_url',old_url)
59
- if not old_url:
60
- continue
61
 
62
- parsed = urllib.parse.urlparse(old_url)
63
- fragment = parsed.fragment # e.g. "page=3&zoom=150"
64
- print("Fragment:", fragment)
65
 
66
- # Encode the provided PDF link for safe use in query string
67
- encoded_pdf = urllib.parse.quote(pdflink, safe='')
68
 
69
- # Build final view link
70
- if fragment:
71
- new_url = f"https://findconsole-initialmarkups.hf.space/view-pdf?pdfLink={encoded_pdf}#{fragment}"
72
- else:
73
- new_url = f"https://findconsole-initialmarkups.hf.space/view-pdf?pdfLink={encoded_pdf}"
74
- print('urlsent:',pdflink)
75
- print("New URL:", new_url)
76
- entry["NBSLink"] = new_url
 
 
 
 
 
 
 
 
 
77
 
78
- return data_list_JSON
79
 
80
 
81
 
@@ -875,19 +862,19 @@ def extract_section_under_header(multiplePDF_Paths):
875
  pageNumberFound = page_num + 1
876
 
877
  # Build the query parameters
878
- params = {
879
- 'pdfLink': pdf_path, # Your PDF link
880
- 'keyword': heading_to_search, # Your keyword (could be a string or list)
881
- }
882
 
883
- # URL encode each parameter
884
- encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
885
 
886
- # Construct the final encoded link
887
- encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
888
 
889
- # Correctly construct the final URL with page and zoom
890
- final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
891
 
892
  # Get current date and time
893
  now = datetime.now()
@@ -898,7 +885,7 @@ def extract_section_under_header(multiplePDF_Paths):
898
 
899
 
900
  data_entry = {
901
- "NBSLink": final_url,
902
  "Subject": heading_to_search,
903
  "Page": str(pageNumberFound),
904
  "Author": "ADR",
@@ -969,19 +956,19 @@ def extract_section_under_header(multiplePDF_Paths):
969
  pageNumberFound = page_num + 1
970
 
971
  # Build the query parameters
972
- params = {
973
- 'pdfLink': pdf_path, # Your PDF link
974
- 'keyword': heading_to_search, # Your keyword (could be a string or list)
975
- }
976
 
977
- # URL encode each parameter
978
- encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
979
 
980
- # Construct the final encoded link
981
- encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
982
 
983
- # Correctly construct the final URL with page and zoom
984
- final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
985
 
986
  # Get current date and time
987
  now = datetime.now()
@@ -992,7 +979,7 @@ def extract_section_under_header(multiplePDF_Paths):
992
 
993
 
994
  data_entry = {
995
- "NBSLink": final_url,
996
  "Subject": heading_to_search,
997
  "Page": str(pageNumberFound),
998
  "Author": "ADR",
 
39
  import copy
40
  import urllib.parse
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
+ import urllib.parse
 
 
44
 
45
def setpdflinks(json_data, pdf_path):
    """Rewrite each entry's ``NBSLink`` into a full PDF-viewer URL.

    Every entry is expected to carry, under ``"NBSLink"``, the URL fragment
    to open the viewer at. The fragment is appended to a viewer URL that
    points at ``pdf_path``. Entries are updated in place, and the same dict
    objects are returned in a new list.

    Args:
        json_data: Iterable of dict entries; each may hold an ``"NBSLink"``
            value.
        pdf_path: Link to the PDF; it is percent-encoded before being placed
            in the ``pdfLink`` query parameter.

    Returns:
        A list containing the (mutated) entries, in their original order.
    """
    base_viewer_link = "https://findconsole-initialmarkups.hf.space/view-pdf?"

    # The PDF link is identical for every entry, so encode it only once.
    encoded_pdf_link = urllib.parse.quote(pdf_path, safe='')
    viewer_url = f"{base_viewer_link}pdfLink={encoded_pdf_link}"

    updated_json = []
    for entry in json_data:
        raw_link = entry.get("NBSLink")
        # A missing or None value means "no fragment", not the text "None".
        fragment = "" if raw_link is None else str(raw_link)

        # If the entry already holds a viewer URL (e.g. the function is run
        # a second time), keep only its fragment instead of nesting the whole
        # URL inside the new one.
        if fragment.startswith(base_viewer_link):
            fragment = urllib.parse.urlparse(fragment).fragment

        # Only add '#' when there is something to put after it.
        if fragment:
            entry["NBSLink"] = f"{viewer_url}#{fragment}"
        else:
            entry["NBSLink"] = viewer_url

        updated_json.append(entry)

    return updated_json
65
 
 
66
 
67
 
68
 
 
862
  pageNumberFound = page_num + 1
863
 
864
  # Build the query parameters
865
+ # params = {
866
+ # 'pdfLink': pdf_path, # Your PDF link
867
+ # 'keyword': heading_to_search, # Your keyword (could be a string or list)
868
+ # }
869
 
870
+ # # URL encode each parameter
871
+ # encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
872
 
873
+ # # Construct the final encoded link
874
+ # encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
875
 
876
+ # # Correctly construct the final URL with page and zoom
877
+ # final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
878
 
879
  # Get current date and time
880
  now = datetime.now()
 
885
 
886
 
887
  data_entry = {
888
+ "NBSLink": zoom_str,
889
  "Subject": heading_to_search,
890
  "Page": str(pageNumberFound),
891
  "Author": "ADR",
 
956
  pageNumberFound = page_num + 1
957
 
958
  # Build the query parameters
959
+ # params = {
960
+ # 'pdfLink': pdf_path, # Your PDF link
961
+ # 'keyword': heading_to_search, # Your keyword (could be a string or list)
962
+ # }
963
 
964
+ # # URL encode each parameter
965
+ # encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
966
 
967
+ # # Construct the final encoded link
968
+ # encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
969
 
970
+ # # Correctly construct the final URL with page and zoom
971
+ # final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
972
 
973
  # Get current date and time
974
  now = datetime.now()
 
979
 
980
 
981
  data_entry = {
982
+ "NBSLink": zoom_str,
983
  "Subject": heading_to_search,
984
  "Page": str(pageNumberFound),
985
  "Author": "ADR",