Marthee committed on
Commit
3af82b0
·
verified ·
1 Parent(s): 6eb951f

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +40 -15
InitialMarkups.py CHANGED
@@ -37,8 +37,9 @@ import tsadropboxretrieval
37
 
38
 
39
  def changepdflinks(data_list_JSON, pdflink):
40
- print('henaaaa weee',data_list_JSON)
41
-
 
42
  if isinstance(data_list_JSON, str):
43
  if data_list_JSON.strip().startswith('['):
44
  try:
@@ -50,31 +51,55 @@ def changepdflinks(data_list_JSON, pdflink):
50
  elif not isinstance(data_list_JSON, list):
51
  raise ValueError(f"Input must be JSON string or list, got {type(data_list_JSON)}")
52
 
53
- # Loop through all entries and update their NBSLink
 
 
 
 
 
 
 
 
 
 
54
  for entry in data_list_JSON:
55
  old_url = entry.get("NBSLink", "")
 
 
56
 
57
- # Parse URL and query params
58
  parsed = urllib.parse.urlparse(old_url)
59
  query = urllib.parse.parse_qs(parsed.query)
60
 
61
- # Replace only the 'pdfLink' parameter if present
62
  if "pdfLink" in query:
63
- query["pdfLink"] = [pdflink]
 
 
 
 
 
 
 
 
 
64
 
65
- # Rebuild query string
66
- new_query = urllib.parse.urlencode(query, doseq=True)
67
 
68
- # Rebuild full URL with the same fragment (page/zoom)
69
- new_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}?{new_query}"
70
- if parsed.fragment:
71
- new_url += f"#{parsed.fragment}"
 
 
72
 
73
- # Update the entry
74
- entry["NBSLink"] = new_url
75
 
76
- return data_list_JSON
 
 
 
77
 
 
78
  def get_regular_font_size_and_color(doc):
79
  font_sizes = []
80
  colors = []
 
37
 
38
 
39
  def changepdflinks(data_list_JSON, pdflink):
40
+ print("changepdflinks called with:", pdflink)
41
+
42
+ # --- Validate input format ---
43
  if isinstance(data_list_JSON, str):
44
  if data_list_JSON.strip().startswith('['):
45
  try:
 
51
  elif not isinstance(data_list_JSON, list):
52
  raise ValueError(f"Input must be JSON string or list, got {type(data_list_JSON)}")
53
 
54
+ # --- Normalize the incoming PDF link (avoid double-encoding) ---
55
+ pdflink = pdflink.strip()
56
+ # If somehow encoded multiple times, decode until stable
57
+ prev = None
58
+ decoded = pdflink
59
+ while decoded != prev:
60
+ prev = decoded
61
+ decoded = urllib.parse.unquote(decoded)
62
+ safe_pdf_link = decoded
63
+
64
+ # --- Update each entry ---
65
  for entry in data_list_JSON:
66
  old_url = entry.get("NBSLink", "")
67
+ if not old_url:
68
+ continue
69
 
 
70
  parsed = urllib.parse.urlparse(old_url)
71
  query = urllib.parse.parse_qs(parsed.query)
72
 
73
+ # Only replace the pdfLink parameter
74
  if "pdfLink" in query:
75
+ query["pdfLink"] = [safe_pdf_link]
76
+
77
+ # Rebuild query manually to avoid double encoding
78
+ query_parts = []
79
+ for key, values in query.items():
80
+ for value in values:
81
+ if key == "pdfLink":
82
+ query_parts.append(f"{key}={value}")
83
+ else:
84
+ query_parts.append(f"{key}={urllib.parse.quote_plus(value)}")
85
 
86
+ new_query = "&".join(query_parts)
 
87
 
88
+ # Preserve everything else
89
+ new_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
90
+ if new_query:
91
+ new_url += f"?{new_query}"
92
+ if parsed.fragment:
93
+ new_url += f"#{parsed.fragment}"
94
 
95
+ entry["NBSLink"] = new_url
 
96
 
97
+ # Debugging
98
+ print("\n--- URL Update ---")
99
+ print("OLD:", old_url)
100
+ print("NEW:", new_url)
101
 
102
+ return data_list_JSON
103
  def get_regular_font_size_and_color(doc):
104
  font_sizes = []
105
  colors = []