Update pdf_attacker.py
Browse files- pdf_attacker.py +58 -16
pdf_attacker.py
CHANGED
|
@@ -59,13 +59,29 @@ class PDFAttacker:
|
|
| 59 |
y = self.page_size[1] - self.margin
|
| 60 |
|
| 61 |
for item in cluster_items:
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
| 63 |
s = item['text']
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
x = self.margin
|
| 66 |
y -= self.line_height
|
| 67 |
-
|
| 68 |
-
x
|
|
|
|
| 69 |
|
| 70 |
c.save()
|
| 71 |
print(f"Normal PDF saved: {output_path}")
|
|
@@ -104,8 +120,19 @@ class PDFAttacker:
|
|
| 104 |
for line in lines:
|
| 105 |
x = self.margin
|
| 106 |
for item in line:
|
| 107 |
-
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
y -= self.line_height
|
| 110 |
|
| 111 |
# drawing order is per-cluster; attack by shuffling a subset
|
|
@@ -181,8 +208,19 @@ class PDFAttacker:
|
|
| 181 |
for line in lines:
|
| 182 |
x = self.margin
|
| 183 |
for item in line:
|
| 184 |
-
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
y -= self.line_height
|
| 187 |
|
| 188 |
c = canvas.Canvas(output_path, pagesize=self.page_size)
|
|
@@ -298,10 +336,17 @@ class PDFAttacker:
|
|
| 298 |
char_end = byte_to_char.get(next_byte, len(text))
|
| 299 |
else:
|
| 300 |
char_end = len(text)
|
| 301 |
-
|
| 302 |
substr = text[char_start:char_end]
|
| 303 |
-
|
| 304 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
|
| 306 |
return items
|
| 307 |
|
|
@@ -309,7 +354,7 @@ class PDFAttacker:
|
|
| 309 |
# fallback: per-character widths
|
| 310 |
for ch in text:
|
| 311 |
w = pdfmetrics.stringWidth(ch, self.font_name, self.font_size)
|
| 312 |
-
items.append({'text': ch, 'width': w})
|
| 313 |
return items
|
| 314 |
|
| 315 |
def _find_cluster_sequence_for_target(self, cluster_items, target_text: str):
|
|
@@ -341,10 +386,7 @@ class PDFAttacker:
|
|
| 341 |
|
| 342 |
def main():
|
| 343 |
ai_text = """
|
| 344 |
-
The rapid advancement of artificial intelligence has transformed numerous industries
|
| 345 |
-
and revolutionized the way we approach complex problems. Machine learning algorithms
|
| 346 |
-
have demonstrated remarkable capabilities in pattern recognition, data analysis,
|
| 347 |
-
and predictive modeling. These technological innovations continue to push the
|
| 348 |
boundaries of what was previously thought impossible, enabling automation and
|
| 349 |
efficiency improvements across various sectors. As we move forward, the integration
|
| 350 |
of AI systems into our daily lives becomes increasingly prevalent and sophisticated.
|
|
|
|
| 59 |
y = self.page_size[1] - self.margin
|
| 60 |
|
| 61 |
for item in cluster_items:
|
| 62 |
+
# prefer the cluster's 'adv_pts' advance if present, otherwise fall back to 'width'
|
| 63 |
+
adv = item.get('adv_pts', item.get('width', 0))
|
| 64 |
+
width_rl = item.get('width_rl', adv)
|
| 65 |
+
offset = item.get('offset_pts', 0)
|
| 66 |
s = item['text']
|
| 67 |
+
|
| 68 |
+
# stability heuristic: if measured width differs significantly from HarfBuzz advance,
|
| 69 |
+
# prefer the ReportLab-measured width for layout to match drawString behavior (fix em-dash cases)
|
| 70 |
+
thresh = max(0.5, self.font_size * 0.1)
|
| 71 |
+
used_adv = adv
|
| 72 |
+
if abs(width_rl - adv) > thresh:
|
| 73 |
+
used_adv = width_rl
|
| 74 |
+
|
| 75 |
+
# clamp offset if it's unreasonably large relative to advance
|
| 76 |
+
if abs(offset) > (used_adv * 0.6):
|
| 77 |
+
offset = 0
|
| 78 |
+
|
| 79 |
+
if x + used_adv > self.margin + max_width:
|
| 80 |
x = self.margin
|
| 81 |
y -= self.line_height
|
| 82 |
+
# draw at x + offset to respect glyph x_offset where reasonable
|
| 83 |
+
c.drawString(x + offset, y, s)
|
| 84 |
+
x += used_adv
|
| 85 |
|
| 86 |
c.save()
|
| 87 |
print(f"Normal PDF saved: {output_path}")
|
|
|
|
| 120 |
for line in lines:
|
| 121 |
x = self.margin
|
| 122 |
for item in line:
|
| 123 |
+
adv = item.get('adv_pts', item.get('width', 0))
|
| 124 |
+
width_rl = item.get('width_rl', adv)
|
| 125 |
+
offset = item.get('offset_pts', 0)
|
| 126 |
+
|
| 127 |
+
thresh = max(0.5, self.font_size * 0.1)
|
| 128 |
+
used_adv = adv
|
| 129 |
+
if abs(width_rl - adv) > thresh:
|
| 130 |
+
used_adv = width_rl
|
| 131 |
+
if abs(offset) > (used_adv * 0.6):
|
| 132 |
+
offset = 0
|
| 133 |
+
|
| 134 |
+
char_positions.append((x + offset, y, item['text']))
|
| 135 |
+
x += used_adv
|
| 136 |
y -= self.line_height
|
| 137 |
|
| 138 |
# drawing order is per-cluster; attack by shuffling a subset
|
|
|
|
| 208 |
for line in lines:
|
| 209 |
x = self.margin
|
| 210 |
for item in line:
|
| 211 |
+
adv = item.get('adv_pts', item.get('width', 0))
|
| 212 |
+
width_rl = item.get('width_rl', adv)
|
| 213 |
+
offset = item.get('offset_pts', 0)
|
| 214 |
+
|
| 215 |
+
thresh = max(0.5, self.font_size * 0.1)
|
| 216 |
+
used_adv = adv
|
| 217 |
+
if abs(width_rl - adv) > thresh:
|
| 218 |
+
used_adv = width_rl
|
| 219 |
+
if abs(offset) > (used_adv * 0.6):
|
| 220 |
+
offset = 0
|
| 221 |
+
|
| 222 |
+
positions.append((x + offset, y, item['text']))
|
| 223 |
+
x += used_adv
|
| 224 |
y -= self.line_height
|
| 225 |
|
| 226 |
c = canvas.Canvas(output_path, pagesize=self.page_size)
|
|
|
|
| 336 |
char_end = byte_to_char.get(next_byte, len(text))
|
| 337 |
else:
|
| 338 |
char_end = len(text)
|
| 339 |
+
# substring for this cluster
|
| 340 |
substr = text[char_start:char_end]
|
| 341 |
+
|
| 342 |
+
# Use the ReportLab-measured width as the cluster advance and set the offset to zero
|
| 343 |
+
try:
|
| 344 |
+
width_rl = pdfmetrics.stringWidth(substr, self.font_name, self.font_size)
|
| 345 |
+
except Exception:
|
| 346 |
+
# fallback: estimate from HarfBuzz if possible
|
| 347 |
+
adv_sum = clusters.get(start, 0)
|
| 348 |
+
width_rl = (adv_sum / float(self.upem)) * self.font_size
|
| 349 |
+
items.append({'text': substr, 'adv_pts': width_rl, 'offset_pts': 0, 'width_rl': width_rl, 'width': width_rl})
|
| 350 |
|
| 351 |
return items
|
| 352 |
|
|
|
|
| 354 |
# fallback: per-character widths
|
| 355 |
for ch in text:
|
| 356 |
w = pdfmetrics.stringWidth(ch, self.font_name, self.font_size)
|
| 357 |
+
items.append({'text': ch, 'adv_pts': w, 'offset_pts': 0, 'width_rl': w, 'width': w})
|
| 358 |
return items
|
| 359 |
|
| 360 |
def _find_cluster_sequence_for_target(self, cluster_items, target_text: str):
|
|
|
|
| 386 |
|
| 387 |
def main():
|
| 388 |
ai_text = """
|
| 389 |
+
The rapid advancement of artificial intelligence has transformed numerous industries — and revolutionized the way we approach complex problems. Machine learning algorithms have demonstrated remarkable capabilities in pattern recognition, data analysis, and predictive modeling. These technological innovations continue to push the
|
|
|
|
|
|
|
|
|
|
| 390 |
boundaries of what was previously thought impossible, enabling automation and
|
| 391 |
efficiency improvements across various sectors. As we move forward, the integration
|
| 392 |
of AI systems into our daily lives becomes increasingly prevalent and sophisticated.
|