Marthee committed on
Commit
44c9429
·
verified ·
1 Parent(s): 8a9992f

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +19 -34
InitialMarkups.py CHANGED
@@ -34,23 +34,18 @@ from fuzzywuzzy import fuzz
34
  import copy
35
  import tsadropboxretrieval
36
 
37
-
38
 
39
  def changepdflinks(data_list_JSON, pdflink):
40
- print("changepdflinks called with:", pdflink)
41
-
 
42
  if isinstance(data_list_JSON, str):
43
- data_list_JSON = json.loads(data_list_JSON)
44
-
45
- # Normalize link (decode if double-encoded)
46
- prev, decoded = None, pdflink
47
- while decoded != prev:
48
- prev = decoded
49
- decoded = urllib.parse.unquote(decoded)
50
- safe_pdf_link = decoded.strip()
51
-
52
- # Re-encode ONCE for embedding in ?pdfLink=
53
- encoded_pdf_link = urllib.parse.quote(safe_pdf_link, safe='')
54
 
55
  for entry in data_list_JSON:
56
  old_url = entry.get("NBSLink", "")
@@ -60,30 +55,20 @@ def changepdflinks(data_list_JSON, pdflink):
60
  parsed = urllib.parse.urlparse(old_url)
61
  query = urllib.parse.parse_qs(parsed.query)
62
 
63
- # Replace pdfLink safely
64
  if "pdfLink" in query:
65
- query["pdfLink"] = [encoded_pdf_link]
66
-
67
- # Rebuild query with proper escaping for all other params
68
- query_parts = []
69
- for key, values in query.items():
70
- for value in values:
71
- query_parts.append(f"{key}={value}")
72
 
73
- new_query = "&".join(query_parts)
74
-
75
- new_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}?{new_query}"
76
- if parsed.fragment:
77
- new_url += f"#{parsed.fragment}"
78
-
79
- entry["NBSLink"] = new_url
80
-
81
- print("\n--- URL Update ---")
82
- print("OLD:", old_url)
83
- print("NEW:", new_url)
84
 
85
  return data_list_JSON
86
-
87
 
88
  def get_regular_font_size_and_color(doc):
89
  font_sizes = []
 
34
  import copy
35
  import tsadropboxretrieval
36
 
 
37
 
38
  def changepdflinks(data_list_JSON, pdflink):
39
+ print('Received JSON:', data_list_JSON)
40
+
41
+ # Ensure it's a list of dicts
42
  if isinstance(data_list_JSON, str):
43
+ try:
44
+ data_list_JSON = json.loads(data_list_JSON)
45
+ except json.JSONDecodeError:
46
+ raise ValueError(f"Invalid JSON string passed: {data_list_JSON[:200]}")
47
+ elif not isinstance(data_list_JSON, list):
48
+ raise ValueError(f"Input must be JSON string or list, got {type(data_list_JSON)}")
 
 
 
 
 
49
 
50
  for entry in data_list_JSON:
51
  old_url = entry.get("NBSLink", "")
 
55
  parsed = urllib.parse.urlparse(old_url)
56
  query = urllib.parse.parse_qs(parsed.query)
57
 
58
+ # Replace only if present
59
  if "pdfLink" in query:
60
+ # Decode old link for readability
61
+ decoded_pdf_link = urllib.parse.unquote(query["pdfLink"][0])
62
+
63
+ # Assign the new one (encode once)
64
+ query["pdfLink"] = [urllib.parse.quote(pdflink, safe=":/")]
 
 
65
 
66
+ new_query = urllib.parse.urlencode(query, doseq=True)
67
+ new_url = urllib.parse.urlunparse(parsed._replace(query=new_query))
68
+ entry["NBSLink"] = new_url
 
 
 
 
 
 
 
 
69
 
70
  return data_list_JSON
71
+
72
 
73
  def get_regular_font_size_and_color(doc):
74
  font_sizes = []