Marthee committed on
Commit
8a9992f
·
verified ·
1 Parent(s): 3af82b0

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +17 -32
InitialMarkups.py CHANGED
@@ -34,34 +34,24 @@ from fuzzywuzzy import fuzz
34
  import copy
35
  import tsadropboxretrieval
36
 
37
-
38
-
39
  def changepdflinks(data_list_JSON, pdflink):
40
  print("changepdflinks called with:", pdflink)
41
 
42
- # --- Validate input format ---
43
  if isinstance(data_list_JSON, str):
44
- if data_list_JSON.strip().startswith('['):
45
- try:
46
- data_list_JSON = json.loads(data_list_JSON)
47
- except json.JSONDecodeError:
48
- raise ValueError(f"Invalid JSON string passed: {data_list_JSON[:200]}")
49
- else:
50
- raise ValueError(f"Expected JSON string but got something else: {data_list_JSON[:200]}")
51
- elif not isinstance(data_list_JSON, list):
52
- raise ValueError(f"Input must be JSON string or list, got {type(data_list_JSON)}")
53
-
54
- # --- Normalize the incoming PDF link (avoid double-encoding) ---
55
- pdflink = pdflink.strip()
56
- # If somehow encoded multiple times, decode until stable
57
- prev = None
58
- decoded = pdflink
59
  while decoded != prev:
60
  prev = decoded
61
  decoded = urllib.parse.unquote(decoded)
62
- safe_pdf_link = decoded
 
 
 
63
 
64
- # --- Update each entry ---
65
  for entry in data_list_JSON:
66
  old_url = entry.get("NBSLink", "")
67
  if not old_url:
@@ -70,36 +60,31 @@ def changepdflinks(data_list_JSON, pdflink):
70
  parsed = urllib.parse.urlparse(old_url)
71
  query = urllib.parse.parse_qs(parsed.query)
72
 
73
- # Only replace the pdfLink parameter
74
  if "pdfLink" in query:
75
- query["pdfLink"] = [safe_pdf_link]
76
 
77
- # Rebuild query manually to avoid double encoding
78
  query_parts = []
79
  for key, values in query.items():
80
  for value in values:
81
- if key == "pdfLink":
82
- query_parts.append(f"{key}={value}")
83
- else:
84
- query_parts.append(f"{key}={urllib.parse.quote_plus(value)}")
85
 
86
  new_query = "&".join(query_parts)
87
 
88
- # Preserve everything else
89
- new_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
90
- if new_query:
91
- new_url += f"?{new_query}"
92
  if parsed.fragment:
93
  new_url += f"#{parsed.fragment}"
94
 
95
  entry["NBSLink"] = new_url
96
 
97
- # Debugging
98
  print("\n--- URL Update ---")
99
  print("OLD:", old_url)
100
  print("NEW:", new_url)
101
 
102
  return data_list_JSON
 
 
103
  def get_regular_font_size_and_color(doc):
104
  font_sizes = []
105
  colors = []
 
34
  import copy
35
  import tsadropboxretrieval
36
 
37
+
38
+
39
  def changepdflinks(data_list_JSON, pdflink):
40
  print("changepdflinks called with:", pdflink)
41
 
 
42
  if isinstance(data_list_JSON, str):
43
+ data_list_JSON = json.loads(data_list_JSON)
44
+
45
+ # Normalize link (decode if double-encoded)
46
+ prev, decoded = None, pdflink
 
 
 
 
 
 
 
 
 
 
 
47
  while decoded != prev:
48
  prev = decoded
49
  decoded = urllib.parse.unquote(decoded)
50
+ safe_pdf_link = decoded.strip()
51
+
52
+ # Re-encode ONCE for embedding in ?pdfLink=
53
+ encoded_pdf_link = urllib.parse.quote(safe_pdf_link, safe='')
54
 
 
55
  for entry in data_list_JSON:
56
  old_url = entry.get("NBSLink", "")
57
  if not old_url:
 
60
  parsed = urllib.parse.urlparse(old_url)
61
  query = urllib.parse.parse_qs(parsed.query)
62
 
63
+ # Replace pdfLink safely
64
  if "pdfLink" in query:
65
+ query["pdfLink"] = [encoded_pdf_link]
66
 
67
+ # Rebuild query with proper escaping for all other params
68
  query_parts = []
69
  for key, values in query.items():
70
  for value in values:
71
+ query_parts.append(f"{key}={value}")
 
 
 
72
 
73
  new_query = "&".join(query_parts)
74
 
75
+ new_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}?{new_query}"
 
 
 
76
  if parsed.fragment:
77
  new_url += f"#{parsed.fragment}"
78
 
79
  entry["NBSLink"] = new_url
80
 
 
81
  print("\n--- URL Update ---")
82
  print("OLD:", old_url)
83
  print("NEW:", new_url)
84
 
85
  return data_list_JSON
86
+
87
+
88
  def get_regular_font_size_and_color(doc):
89
  font_sizes = []
90
  colors = []