Marthee committed on
Commit
e0663da
·
verified ·
1 Parent(s): a29a783

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +19 -13
InitialMarkups.py CHANGED
@@ -34,11 +34,12 @@ from fuzzywuzzy import fuzz
34
  import copy
35
  import tsadropboxretrieval
36
 
 
37
 
38
  def changepdflinks(data_list_JSON, pdflink):
39
  print('Received JSON:', data_list_JSON)
40
 
41
- # Ensure it's a list of dicts
42
  if isinstance(data_list_JSON, str):
43
  try:
44
  data_list_JSON = json.loads(data_list_JSON)
@@ -47,29 +48,34 @@ def changepdflinks(data_list_JSON, pdflink):
47
  elif not isinstance(data_list_JSON, list):
48
  raise ValueError(f"Input must be JSON string or list, got {type(data_list_JSON)}")
49
 
 
 
 
50
  for entry in data_list_JSON:
51
  old_url = entry.get("NBSLink", "")
52
  if not old_url:
53
  continue
54
 
55
  parsed = urllib.parse.urlparse(old_url)
56
- query = urllib.parse.parse_qs(parsed.query)
57
-
58
- # Replace only if present
59
- if "pdfLink" in query:
60
- # Decode old link for readability
61
- decoded_pdf_link = urllib.parse.unquote(query["pdfLink"][0])
62
-
63
- # Assign the new one (encode once)
64
- query["pdfLink"] = [urllib.parse.quote(pdflink, safe=":/")]
65
 
66
- new_query = urllib.parse.urlencode(query, doseq=True)
67
- new_url = urllib.parse.urlunparse(parsed._replace(query=new_query))
68
- entry["NBSLink"] = new_url
 
 
69
 
 
 
 
 
 
 
 
 
70
  return data_list_JSON
71
 
72
 
 
73
  def get_regular_font_size_and_color(doc):
74
  font_sizes = []
75
  colors = []
 
34
  import copy
35
  import tsadropboxretrieval
36
 
37
+ import json, urllib.parse, copy
38
 
39
  def changepdflinks(data_list_JSON, pdflink):
40
  print('Received JSON:', data_list_JSON)
41
 
42
+ # Ensure list of dicts
43
  if isinstance(data_list_JSON, str):
44
  try:
45
  data_list_JSON = json.loads(data_list_JSON)
 
48
  elif not isinstance(data_list_JSON, list):
49
  raise ValueError(f"Input must be JSON string or list, got {type(data_list_JSON)}")
50
 
51
+ # Work on a safe copy
52
+ data_list_JSON = copy.deepcopy(data_list_JSON)
53
+
54
  for entry in data_list_JSON:
55
  old_url = entry.get("NBSLink", "")
56
  if not old_url:
57
  continue
58
 
59
  parsed = urllib.parse.urlparse(old_url)
 
 
 
 
 
 
 
 
 
60
 
61
+ # Extract page/zoom fragment (if present)
62
+ fragment = parsed.fragment # e.g. "page=3&zoom=150"
63
+ print(fragment)
64
+ # Encode the new pdf link safely
65
+ encoded_pdf = urllib.parse.quote(pdflink, safe=":/?=&")
66
 
67
+ # Construct the new final link: encoded pdf link + old fragment
68
+ if fragment:
69
+ new_url = f"{encoded_pdf}#{fragment}"
70
+ else:
71
+ new_url = encoded_pdf
72
+ print(newurl)
73
+ entry["NBSLink"] = 'https://findconsole-initialmarkups.hf.space/view-pdf?pdfLink='+new_url
74
+ print(entry["NBSLink"] )
75
  return data_list_JSON
76
 
77
 
78
+
79
  def get_regular_font_size_and_color(doc):
80
  font_sizes = []
81
  colors = []