Taha493 committed on
Commit
dbbfa34
·
verified ·
1 Parent(s): 114cdfa

Update babeldoc/format/pdf/document_il/midend/typesetting.py

Browse files
babeldoc/format/pdf/document_il/midend/typesetting.py CHANGED
@@ -199,11 +199,11 @@ class TypesettingUnit:
199
  unicode = self.try_get_unicode()
200
  if unicode:
201
  return unicode in [
202
- "£",
203
- "،",
204
- "!",
205
- ":",
206
- ")",
207
  ]
208
  return False
209
 
@@ -241,20 +241,20 @@ class TypesettingUnit:
241
  return False
242
  assert len(unicode) == 1, "Unicode must be a single character"
243
  if unicode in [
244
- "(",
245
- ")",
246
- "،",
247
- "。",
248
- "、",
249
- "ï¼›",
250
- ":",
251
- "?",
252
- "!",
253
- ")",
254
- "،",
255
- "!",
256
- ":",
257
- ")",
258
  ]:
259
  return True
260
  if unicode:
@@ -327,33 +327,33 @@ class TypesettingUnit:
327
  "?",
328
  "!",
329
  # Chinese punctuation
330
- "،", # Comma
331
- "。", # Period
332
- ":", # Colon
333
- "ï¼›", # Semicolon
334
- "?", # Question mark
335
- "!", # Exclamation mark
336
- "、", # Enumeration comma
337
  # Closing brackets
338
  ")", # Right parenthesis
339
  "]", # Right square bracket
340
  "}", # Right curly bracket
341
- ")", # Right parenthesis
342
- "】", # Right square bracket
343
- "》", # Right double angle bracket
344
- "』", # Right single quotation mark
345
- "」", # Right corner bracket
346
  # Connected line symbols
347
- "–", # EN DASH
348
- "—", # EM DASH
349
  # Special punctuation
350
- "·", # Middle dot
351
- "…", # Ellipsis
352
- "°", # Degree symbol
353
  # Slash
354
  "/", # Slash
355
- "/", # Fullwidth solidus
356
- "‰", # Per mille sign
357
  ]
358
  return False
359
 
@@ -376,21 +376,21 @@ class TypesettingUnit:
376
  # Opening brackets
377
  """, # Left double quotation mark
378
  "'", # Left single quotation mark
379
- "《", # Left double angle bracket
380
- "『", # Left single quotation mark
381
  # Opening brackets
382
  "(", # Left parenthesis
383
  "[", # Left square bracket
384
  "{", # Left curly bracket
385
- "(", # Left parenthesis
386
- "【", # Left square bracket
387
- "《", # Left double angle bracket
388
- "『", # Left single quotation mark
389
  # Cannot appear at end of line - combined with closing brackets
390
  """, # Right double quotation mark
391
  "'", # Right single quotation mark
392
- "》", # Right double angle bracket
393
- "』", # Right single quotation mark
394
  ]
395
 
396
  def passthrough(
@@ -1316,6 +1316,76 @@ class Typesetting:
1316
  return True
1317
  return False
1318
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1319
  def _calculate_rtl_margin_shift(self, paragraph: il_version_1.PdfParagraph, page: il_version_1.Page) -> float:
1320
  """Calculate the X shift needed for RTL margin mirroring.
1321
 
@@ -1325,6 +1395,8 @@ class Typesetting:
1325
  IMPORTANT: When there are sidebars or page decorations, we mirror within the
1326
  CONTENT AREA (excluding decorations), not the full page width.
1327
 
 
 
1328
  Args:
1329
  paragraph: The paragraph to mirror
1330
  page: The page containing the paragraph
@@ -1352,6 +1424,22 @@ class Typesetting:
1352
  new_x = content_left + original_right_margin
1353
  shift_x = new_x - box.x
1354
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1355
  return shift_x
1356
 
1357
  def _get_content_area_bounds(self, page: il_version_1.Page) -> tuple[float, float]:
@@ -1469,16 +1557,28 @@ class Typesetting:
1469
  def _apply_rtl_margin_shift_to_composition(
1470
  self,
1471
  composition: list[PdfParagraphComposition],
1472
- shift_x: float
 
1473
  ) -> None:
1474
  """Apply RTL margin shift to all characters in a composition list.
1475
 
1476
  Args:
1477
  composition: List of paragraph compositions to shift
1478
  shift_x: The X shift to apply
 
1479
  """
1480
  if shift_x == 0:
1481
  return
 
 
 
 
 
 
 
 
 
 
1482
 
1483
  for comp in composition:
1484
  if comp.pdf_character:
@@ -1486,9 +1586,28 @@ class Typesetting:
1486
  if char.box:
1487
  char.box.x += shift_x
1488
  char.box.x2 += shift_x
 
 
 
 
 
 
 
 
 
 
1489
  if char.visual_bbox and char.visual_bbox.box:
1490
  char.visual_bbox.box.x += shift_x
1491
  char.visual_bbox.box.x2 += shift_x
 
 
 
 
 
 
 
 
 
1492
  elif comp.pdf_formula:
1493
  formula = comp.pdf_formula
1494
  if formula.box:
@@ -1731,12 +1850,25 @@ class Typesetting:
1731
  if shift_x != 0:
1732
  self._apply_rtl_margin_shift_to_composition(
1733
  paragraph.pdf_paragraph_composition,
1734
- shift_x
 
1735
  )
1736
  # Also update the paragraph box to reflect the new position
1737
  if paragraph.box:
1738
  paragraph.box.x += shift_x
1739
  paragraph.box.x2 += shift_x
 
 
 
 
 
 
 
 
 
 
 
 
1740
  else:
1741
  # Use precomputed scale factor to layout typesetting units
1742
  precomputed_scale = (
@@ -1761,12 +1893,25 @@ class Typesetting:
1761
  if shift_x != 0:
1762
  self._apply_rtl_margin_shift_to_composition(
1763
  paragraph.pdf_paragraph_composition,
1764
- shift_x
 
1765
  )
1766
  # Also update the paragraph box to reflect the new position
1767
  if paragraph.box:
1768
  paragraph.box.x += shift_x
1769
  paragraph.box.x2 += shift_x
 
 
 
 
 
 
 
 
 
 
 
 
1770
 
1771
  def _is_arabic_char(self, char: str) -> bool:
1772
  """Check if character is Arabic - OPTIMIZED"""
@@ -1841,9 +1986,12 @@ class Typesetting:
1841
  arabic_word_spacing = space_width * ARABIC_WORD_SPACING_RATIO
1842
  line_units_map: dict[int, list[TypesettingUnit]] = {}
1843
 
 
 
 
1844
  i = 0
1845
  safety_counter = 0
1846
- max_iterations = len(typesetting_units) * 2 # Safety limit
1847
 
1848
  while i < len(typesetting_units) and safety_counter < max_iterations:
1849
  safety_counter += 1
@@ -1871,54 +2019,136 @@ class Typesetting:
1871
  if current_x == box.x and word_units and word_units[0].is_space:
1872
  continue
1873
 
1874
- # Check if needs new line
1875
- if current_x + word_width > box.x2 and current_x > box.x:
1876
- current_x = box.x
1877
- if current_line_heights:
1878
- max_height = max(current_line_heights)
1879
- mode_height = statistics.mode(current_line_heights) if len(current_line_heights) > 1 else max_height
1880
- current_y -= max(mode_height * line_skip, max_height * 1.05)
1881
- line_ys.append(current_y)
1882
- current_line_heights = []
1883
-
1884
- if current_y < box.y:
1885
- all_units_fit = False
1886
-
1887
- current_line_index += 1
1888
 
1889
- # Place word units
1890
- relocated_word_units = []
1891
- for unit in word_units:
1892
- if unit.is_space and current_x == box.x:
1893
- continue
1894
-
1895
- unit_width = unit.width * scale
1896
- unit_height = unit.height * scale
1897
-
1898
- # CJK spacing
1899
- if (last_unit and last_unit.is_cjk_char ^ unit.is_cjk_char
1900
- and not unit.is_space and current_x > box.x):
1901
- current_x += space_width * 0.5
 
 
 
 
 
 
 
 
 
 
 
 
 
1902
 
1903
- relocated_unit = unit.relocate(current_x, current_y, scale)
1904
- relocated_unit.line_id = current_line_index
1905
- relocated_word_units.append(relocated_unit)
1906
- typeset_units.append(relocated_unit)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1907
 
1908
- if not unit.is_space:
1909
- current_line_heights.append(unit_height)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1910
 
1911
- current_x = relocated_unit.box.x2
1912
- last_unit = relocated_unit
1913
-
1914
- if relocated_word_units:
1915
- relocated_word_units[-1].ends_word = True
1916
- line_units_map.setdefault(current_line_index, []).extend(relocated_word_units)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1917
 
1918
 
1919
  # Apply post-layout spacing so wrapping stays unchanged
 
1920
  if typeset_units and line_units_map:
1921
- self._apply_arabic_word_spacing(line_units_map, arabic_word_spacing)
1922
 
1923
  # Right-align Arabic lines (but NOT table content)
1924
  # Check if this paragraph is inside a table by examining layout_label
@@ -1969,10 +2199,22 @@ class Typesetting:
1969
  self._justify_arabic_line(line_units, box.x, box.x2, line_min_x, line_max_x)
1970
  else:
1971
  # For last line or non-justified paragraphs:
1972
- # Just right-align the line
 
 
1973
  target_right_position = box.x2
1974
  shift_x = target_right_position - line_max_x
1975
 
 
 
 
 
 
 
 
 
 
 
1976
  for unit in line_units:
1977
  self._shift_unit_x(unit, shift_x)
1978
  else:
@@ -1990,7 +2232,7 @@ class Typesetting:
1990
  and not last_unit.mixed_character_blacklist and not unit.mixed_character_blacklist
1991
  and current_x > box.x and unit.try_get_unicode() != " "
1992
  and last_unit.try_get_unicode() != " "
1993
- and last_unit.try_get_unicode() not in ["、", "،", "。", ":", "!", "?"]):
1994
  current_x += space_width * 0.5
1995
 
1996
  if use_english_line_break:
@@ -2043,18 +2285,54 @@ class Typesetting:
2043
  self,
2044
  line_units_map: dict[int, list[TypesettingUnit]],
2045
  spacing: float,
 
2046
  ):
2047
- """Apply additional spacing between Arabic words post layout."""
 
 
 
2048
  if spacing <= 0:
2049
  return
2050
 
2051
- for units in line_units_map.values():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2052
  cumulative_shift = 0.0
2053
  for unit in units:
2054
  if cumulative_shift:
2055
  self._shift_unit_x(unit, cumulative_shift)
2056
  if getattr(unit, "ends_word", False):
2057
- cumulative_shift += spacing
2058
 
2059
  def _shift_unit_x(self, unit: TypesettingUnit, shift_x: float):
2060
  """Shift a typesetting unit horizontally."""
@@ -2091,6 +2369,8 @@ class Typesetting:
2091
  - Extend to the left margin (target_left)
2092
  - Space between words is distributed evenly to fill the gap
2093
 
 
 
2094
  Args:
2095
  line_units: List of typesetting units in the line
2096
  target_left: Target left edge (box.x)
@@ -2106,7 +2386,15 @@ class Typesetting:
2106
 
2107
  if current_width <= 0 or target_width <= current_width:
2108
  # Line already fills the space or is wider, just right-align
 
2109
  shift_x = target_right - current_right
 
 
 
 
 
 
 
2110
  for unit in line_units:
2111
  self._shift_unit_x(unit, shift_x)
2112
  return
 
199
  unicode = self.try_get_unicode()
200
  if unicode:
201
  return unicode in [
202
+ "£",
203
+ "،",
204
+ "!",
205
+ ":",
206
+ ")",
207
  ]
208
  return False
209
 
 
241
  return False
242
  assert len(unicode) == 1, "Unicode must be a single character"
243
  if unicode in [
244
+ "(",
245
+ ")",
246
+ "،",
247
+ "。",
248
+ "、",
249
+ "ï¼›",
250
+ ":",
251
+ "?",
252
+ "!",
253
+ ")",
254
+ "،",
255
+ "!",
256
+ ":",
257
+ ")",
258
  ]:
259
  return True
260
  if unicode:
 
327
  "?",
328
  "!",
329
  # Chinese punctuation
330
+ "،", # Comma
331
+ "。", # Period
332
+ ":", # Colon
333
+ "ï¼›", # Semicolon
334
+ "?", # Question mark
335
+ "!", # Exclamation mark
336
+ "、", # Enumeration comma
337
  # Closing brackets
338
  ")", # Right parenthesis
339
  "]", # Right square bracket
340
  "}", # Right curly bracket
341
+ ")", # Right parenthesis
342
+ "】", # Right square bracket
343
+ "》", # Right double angle bracket
344
+ "』", # Right single quotation mark
345
+ "」", # Right corner bracket
346
  # Connected line symbols
347
+ "–", # EN DASH
348
+ "—", # EM DASH
349
  # Special punctuation
350
+ "·", # Middle dot
351
+ "…", # Ellipsis
352
+ "°", # Degree symbol
353
  # Slash
354
  "/", # Slash
355
+ "/", # Fullwidth solidus
356
+ "‰", # Per mille sign
357
  ]
358
  return False
359
 
 
376
  # Opening brackets
377
  """, # Left double quotation mark
378
  "'", # Left single quotation mark
379
+ "《", # Left double angle bracket
380
+ "『", # Left single quotation mark
381
  # Opening brackets
382
  "(", # Left parenthesis
383
  "[", # Left square bracket
384
  "{", # Left curly bracket
385
+ "(", # Left parenthesis
386
+ "【", # Left square bracket
387
+ "《", # Left double angle bracket
388
+ "『", # Left single quotation mark
389
  # Cannot appear at end of line - combined with closing brackets
390
  """, # Right double quotation mark
391
  "'", # Right single quotation mark
392
+ "》", # Right double angle bracket
393
+ "』", # Right single quotation mark
394
  ]
395
 
396
  def passthrough(
 
1316
  return True
1317
  return False
1318
 
1319
def _get_effective_arabic_box(
    self,
    paragraph: il_version_1.PdfParagraph,
    page: il_version_1.Page,
    original_box: Box,
) -> Box:
    """Return the layout box to use for Arabic text, kept inside the page.

    The box is projected to where it would sit after applying the shift
    returned by ``_calculate_rtl_margin_shift``. If that projected position
    crosses a safety margin of 2% of the crop-box width on either side, a
    narrower copy of ``original_box`` is returned so the text laid out in
    it does not overflow the page.

    Args:
        paragraph: The paragraph being laid out.
        page: The page containing the paragraph; its crop box defines the
            page bounds.
        original_box: The bounding box to start from.

    Returns:
        ``original_box`` itself when the page has no crop box or when the
        projected box stays inside the safety margins; otherwise a new
        ``Box`` with the same ``y``/``y2`` and a reduced horizontal extent.
        The returned box is never wider than ``original_box``.
    """
    if not page.cropbox or not page.cropbox.box:
        # Without page bounds there is nothing to clamp against.
        return original_box

    page_box = page.cropbox.box
    page_width = page_box.x2 - page_box.x

    # Safety margins: 2% of the page width in from each edge.
    safe_left = page_box.x + page_width * 0.02
    safe_right = page_box.x2 - page_width * 0.02

    # Project the box to its position after the RTL margin shift.
    shift_x = self._calculate_rtl_margin_shift(paragraph, page)
    shifted_left = original_box.x + shift_x
    shifted_right = original_box.x2 + shift_x

    if shifted_right > safe_right:
        # Trim the overflow off the right-hand side of the box.
        overflow = shifted_right - safe_right
        new_x2 = original_box.x2 - overflow

        # Do not shrink below 10% of the page width. The floor is capped
        # at the original right edge: a box that was already narrower than
        # the minimum must not be widened, which would make the overflow
        # worse rather than better.
        min_width = page_width * 0.1
        if new_x2 - original_box.x < min_width:
            new_x2 = min(original_box.x2, original_box.x + min_width)

        return Box(
            x=original_box.x,
            y=original_box.y,
            x2=new_x2,
            y2=original_box.y2,
        )

    if shifted_left < safe_left:
        # Not expected for RTL mirroring, but trim the left side if the
        # projected box crosses the left safety margin.
        overflow = safe_left - shifted_left
        return Box(
            x=original_box.x + overflow,
            y=original_box.y,
            x2=original_box.x2,
            y2=original_box.y2,
        )

    return original_box
1388
+
1389
  def _calculate_rtl_margin_shift(self, paragraph: il_version_1.PdfParagraph, page: il_version_1.Page) -> float:
1390
  """Calculate the X shift needed for RTL margin mirroring.
1391
 
 
1395
  IMPORTANT: When there are sidebars or page decorations, we mirror within the
1396
  CONTENT AREA (excluding decorations), not the full page width.
1397
 
1398
+ The shift is clamped to ensure the paragraph stays within page bounds.
1399
+
1400
  Args:
1401
  paragraph: The paragraph to mirror
1402
  page: The page containing the paragraph
 
1424
  new_x = content_left + original_right_margin
1425
  shift_x = new_x - box.x
1426
 
1427
+ # CLAMP: Ensure the paragraph's right edge doesn't exceed content_right after shifting
1428
+ # new_x2 = box.x2 + shift_x should be <= content_right
1429
+ new_x2 = box.x2 + shift_x
1430
+ if new_x2 > content_right:
1431
+ # Reduce the shift to keep the right edge at content_right
1432
+ overflow = new_x2 - content_right
1433
+ shift_x -= overflow
1434
+
1435
+ # CLAMP: Ensure the paragraph's left edge doesn't go below content_left
1436
+ # new_x = box.x + shift_x should be >= content_left
1437
+ new_x_clamped = box.x + shift_x
1438
+ if new_x_clamped < content_left:
1439
+ # Increase the shift to keep the left edge at content_left
1440
+ underflow = content_left - new_x_clamped
1441
+ shift_x += underflow
1442
+
1443
  return shift_x
1444
 
1445
  def _get_content_area_bounds(self, page: il_version_1.Page) -> tuple[float, float]:
 
1557
  def _apply_rtl_margin_shift_to_composition(
1558
  self,
1559
  composition: list[PdfParagraphComposition],
1560
+ shift_x: float,
1561
+ page: il_version_1.Page = None
1562
  ) -> None:
1563
  """Apply RTL margin shift to all characters in a composition list.
1564
 
1565
  Args:
1566
  composition: List of paragraph compositions to shift
1567
  shift_x: The X shift to apply
1568
+ page: Optional page for clamping to page bounds
1569
  """
1570
  if shift_x == 0:
1571
  return
1572
+
1573
+ # Get page bounds for clamping if page is provided
1574
+ clamp_right = None
1575
+ clamp_left = None
1576
+ if page and page.cropbox and page.cropbox.box:
1577
+ page_box = page.cropbox.box
1578
+ # Use a small margin (2%) from page edges for safety
1579
+ page_margin = (page_box.x2 - page_box.x) * 0.02
1580
+ clamp_left = page_box.x + page_margin
1581
+ clamp_right = page_box.x2 - page_margin
1582
 
1583
  for comp in composition:
1584
  if comp.pdf_character:
 
1586
  if char.box:
1587
  char.box.x += shift_x
1588
  char.box.x2 += shift_x
1589
+ # Clamp to page bounds if clamping is enabled
1590
+ if clamp_right is not None:
1591
+ if char.box.x2 > clamp_right:
1592
+ overflow = char.box.x2 - clamp_right
1593
+ char.box.x -= overflow
1594
+ char.box.x2 -= overflow
1595
+ if char.box.x < clamp_left:
1596
+ underflow = clamp_left - char.box.x
1597
+ char.box.x += underflow
1598
+ char.box.x2 += underflow
1599
  if char.visual_bbox and char.visual_bbox.box:
1600
  char.visual_bbox.box.x += shift_x
1601
  char.visual_bbox.box.x2 += shift_x
1602
+ if clamp_right is not None:
1603
+ if char.visual_bbox.box.x2 > clamp_right:
1604
+ overflow = char.visual_bbox.box.x2 - clamp_right
1605
+ char.visual_bbox.box.x -= overflow
1606
+ char.visual_bbox.box.x2 -= overflow
1607
+ if char.visual_bbox.box.x < clamp_left:
1608
+ underflow = clamp_left - char.visual_bbox.box.x
1609
+ char.visual_bbox.box.x += underflow
1610
+ char.visual_bbox.box.x2 += underflow
1611
  elif comp.pdf_formula:
1612
  formula = comp.pdf_formula
1613
  if formula.box:
 
1850
  if shift_x != 0:
1851
  self._apply_rtl_margin_shift_to_composition(
1852
  paragraph.pdf_paragraph_composition,
1853
+ shift_x,
1854
+ page # Pass page for clamping
1855
  )
1856
  # Also update the paragraph box to reflect the new position
1857
  if paragraph.box:
1858
  paragraph.box.x += shift_x
1859
  paragraph.box.x2 += shift_x
1860
+ # Clamp paragraph box to page bounds
1861
+ if page.cropbox and page.cropbox.box:
1862
+ page_box = page.cropbox.box
1863
+ page_margin = (page_box.x2 - page_box.x) * 0.02
1864
+ if paragraph.box.x2 > page_box.x2 - page_margin:
1865
+ overflow = paragraph.box.x2 - (page_box.x2 - page_margin)
1866
+ paragraph.box.x -= overflow
1867
+ paragraph.box.x2 -= overflow
1868
+ if paragraph.box.x < page_box.x + page_margin:
1869
+ underflow = (page_box.x + page_margin) - paragraph.box.x
1870
+ paragraph.box.x += underflow
1871
+ paragraph.box.x2 += underflow
1872
  else:
1873
  # Use precomputed scale factor to layout typesetting units
1874
  precomputed_scale = (
 
1893
  if shift_x != 0:
1894
  self._apply_rtl_margin_shift_to_composition(
1895
  paragraph.pdf_paragraph_composition,
1896
+ shift_x,
1897
+ page # Pass page for clamping
1898
  )
1899
  # Also update the paragraph box to reflect the new position
1900
  if paragraph.box:
1901
  paragraph.box.x += shift_x
1902
  paragraph.box.x2 += shift_x
1903
+ # Clamp paragraph box to page bounds
1904
+ if page.cropbox and page.cropbox.box:
1905
+ page_box = page.cropbox.box
1906
+ page_margin = (page_box.x2 - page_box.x) * 0.02
1907
+ if paragraph.box.x2 > page_box.x2 - page_margin:
1908
+ overflow = paragraph.box.x2 - (page_box.x2 - page_margin)
1909
+ paragraph.box.x -= overflow
1910
+ paragraph.box.x2 -= overflow
1911
+ if paragraph.box.x < page_box.x + page_margin:
1912
+ underflow = (page_box.x + page_margin) - paragraph.box.x
1913
+ paragraph.box.x += underflow
1914
+ paragraph.box.x2 += underflow
1915
 
1916
  def _is_arabic_char(self, char: str) -> bool:
1917
  """Check if character is Arabic - OPTIMIZED"""
 
1986
  arabic_word_spacing = space_width * ARABIC_WORD_SPACING_RATIO
1987
  line_units_map: dict[int, list[TypesettingUnit]] = {}
1988
 
1989
+ # Calculate effective line width for wrapping
1990
+ effective_line_width = box.x2 - box.x
1991
+
1992
  i = 0
1993
  safety_counter = 0
1994
+ max_iterations = len(typesetting_units) * 3 # Increased safety limit for char-level breaks
1995
 
1996
  while i < len(typesetting_units) and safety_counter < max_iterations:
1997
  safety_counter += 1
 
2019
  if current_x == box.x and word_units and word_units[0].is_space:
2020
  continue
2021
 
2022
+ # Calculate remaining space on current line
2023
+ remaining_space = box.x2 - current_x
 
 
 
 
 
 
 
 
 
 
 
 
2024
 
2025
+ # Case 1: Word fits on current line
2026
+ if word_width <= remaining_space:
2027
+ # Place all word units on current line
2028
+ relocated_word_units = []
2029
+ for unit in word_units:
2030
+ if unit.is_space and current_x == box.x:
2031
+ continue
2032
+
2033
+ unit_width = unit.width * scale
2034
+ unit_height = unit.height * scale
2035
+
2036
+ # CJK spacing
2037
+ if (last_unit and last_unit.is_cjk_char ^ unit.is_cjk_char
2038
+ and not unit.is_space and current_x > box.x):
2039
+ current_x += space_width * 0.5
2040
+
2041
+ relocated_unit = unit.relocate(current_x, current_y, scale)
2042
+ relocated_unit.line_id = current_line_index
2043
+ relocated_word_units.append(relocated_unit)
2044
+ typeset_units.append(relocated_unit)
2045
+
2046
+ if not unit.is_space:
2047
+ current_line_heights.append(unit_height)
2048
+
2049
+ current_x = relocated_unit.box.x2
2050
+ last_unit = relocated_unit
2051
 
2052
+ if relocated_word_units:
2053
+ relocated_word_units[-1].ends_word = True
2054
+ line_units_map.setdefault(current_line_index, []).extend(relocated_word_units)
2055
+
2056
+ # Case 2: Word is too long - need to handle line break
2057
+ else:
2058
+ # Check if we're not at the start of line - wrap to next line first
2059
+ if current_x > box.x:
2060
+ current_x = box.x
2061
+ if current_line_heights:
2062
+ max_height = max(current_line_heights)
2063
+ mode_height = statistics.mode(current_line_heights) if len(current_line_heights) > 1 else max_height
2064
+ current_y -= max(mode_height * line_skip, max_height * 1.05)
2065
+ line_ys.append(current_y)
2066
+ current_line_heights = []
2067
+
2068
+ if current_y < box.y:
2069
+ all_units_fit = False
2070
+
2071
+ current_line_index += 1
2072
 
2073
+ # Now check if word fits on a fresh line
2074
+ if word_width <= effective_line_width:
2075
+ # Word fits on a line by itself
2076
+ relocated_word_units = []
2077
+ for unit in word_units:
2078
+ if unit.is_space and current_x == box.x:
2079
+ continue
2080
+
2081
+ unit_width = unit.width * scale
2082
+ unit_height = unit.height * scale
2083
+
2084
+ relocated_unit = unit.relocate(current_x, current_y, scale)
2085
+ relocated_unit.line_id = current_line_index
2086
+ relocated_word_units.append(relocated_unit)
2087
+ typeset_units.append(relocated_unit)
2088
+
2089
+ if not unit.is_space:
2090
+ current_line_heights.append(unit_height)
2091
+
2092
+ current_x = relocated_unit.box.x2
2093
+ last_unit = relocated_unit
2094
+
2095
+ if relocated_word_units:
2096
+ relocated_word_units[-1].ends_word = True
2097
+ line_units_map.setdefault(current_line_index, []).extend(relocated_word_units)
2098
 
2099
+ else:
2100
+ # Word is too long even for a full line - break at character level
2101
+ # This handles long Arabic strings without spaces
2102
+ relocated_word_units = []
2103
+ for unit_idx, unit in enumerate(word_units):
2104
+ if unit.is_space and current_x == box.x:
2105
+ continue
2106
+
2107
+ unit_width = unit.width * scale
2108
+ unit_height = unit.height * scale
2109
+
2110
+ # Check if this character fits on current line
2111
+ if current_x + unit_width > box.x2 and current_x > box.x:
2112
+ # Start a new line
2113
+ # Mark previous unit as ending a "word" (partial word)
2114
+ if relocated_word_units:
2115
+ relocated_word_units[-1].ends_word = True
2116
+ line_units_map.setdefault(current_line_index, []).extend(relocated_word_units)
2117
+ relocated_word_units = []
2118
+
2119
+ current_x = box.x
2120
+ if current_line_heights:
2121
+ max_height = max(current_line_heights)
2122
+ mode_height = statistics.mode(current_line_heights) if len(current_line_heights) > 1 else max_height
2123
+ current_y -= max(mode_height * line_skip, max_height * 1.05)
2124
+ line_ys.append(current_y)
2125
+ current_line_heights = []
2126
+
2127
+ if current_y < box.y:
2128
+ all_units_fit = False
2129
+
2130
+ current_line_index += 1
2131
+
2132
+ relocated_unit = unit.relocate(current_x, current_y, scale)
2133
+ relocated_unit.line_id = current_line_index
2134
+ relocated_word_units.append(relocated_unit)
2135
+ typeset_units.append(relocated_unit)
2136
+
2137
+ if not unit.is_space:
2138
+ current_line_heights.append(unit_height)
2139
+
2140
+ current_x = relocated_unit.box.x2
2141
+ last_unit = relocated_unit
2142
+
2143
+ if relocated_word_units:
2144
+ relocated_word_units[-1].ends_word = True
2145
+ line_units_map.setdefault(current_line_index, []).extend(relocated_word_units)
2146
 
2147
 
2148
  # Apply post-layout spacing so wrapping stays unchanged
2149
+ # Pass box to ensure spacing doesn't cause overflow
2150
  if typeset_units and line_units_map:
2151
+ self._apply_arabic_word_spacing(line_units_map, arabic_word_spacing, box)
2152
 
2153
  # Right-align Arabic lines (but NOT table content)
2154
  # Check if this paragraph is inside a table by examining layout_label
 
2199
  self._justify_arabic_line(line_units, box.x, box.x2, line_min_x, line_max_x)
2200
  else:
2201
  # For last line or non-justified paragraphs:
2202
+ # Right-align the line, but CLAMP to prevent left-side overflow
2203
+
2204
+ # Calculate shift to align right edge with box.x2
2205
  target_right_position = box.x2
2206
  shift_x = target_right_position - line_max_x
2207
 
2208
+ # CRITICAL: After shifting, the LEFT edge must not go below box.x
2209
+ # new_left = line_min_x + shift_x must be >= box.x
2210
+ new_left_after_shift = line_min_x + shift_x
2211
+
2212
+ if new_left_after_shift < box.x:
2213
+ # Shifting would push left edge beyond left margin
2214
+ # Clamp the shift so left edge stays at box.x
2215
+ shift_x = box.x - line_min_x
2216
+
2217
+ # Apply the clamped shift
2218
  for unit in line_units:
2219
  self._shift_unit_x(unit, shift_x)
2220
  else:
 
2232
  and not last_unit.mixed_character_blacklist and not unit.mixed_character_blacklist
2233
  and current_x > box.x and unit.try_get_unicode() != " "
2234
  and last_unit.try_get_unicode() != " "
2235
+ and last_unit.try_get_unicode() not in ["、", "،", "。", ":", "!", "?"]):
2236
  current_x += space_width * 0.5
2237
 
2238
  if use_english_line_break:
 
2285
  self,
2286
  line_units_map: dict[int, list[TypesettingUnit]],
2287
  spacing: float,
2288
+ box: Box = None,
2289
  ):
2290
+ """Apply additional spacing between Arabic words post layout.
2291
+
2292
+ If box is provided, ensures the total line width doesn't exceed box width.
2293
+ """
2294
  if spacing <= 0:
2295
  return
2296
 
2297
+ for line_idx, units in line_units_map.items():
2298
+ if not units:
2299
+ continue
2300
+
2301
+ # Count word gaps
2302
+ word_gap_count = sum(1 for unit in units if getattr(unit, "ends_word", False))
2303
+ if word_gap_count == 0:
2304
+ continue
2305
+
2306
+ # Calculate total spacing that would be added
2307
+ total_spacing = word_gap_count * spacing
2308
+
2309
+ # If box is provided, check if spacing would cause overflow
2310
+ actual_spacing = spacing
2311
+ if box is not None:
2312
+ box_width = box.x2 - box.x
2313
+ # Get current line width
2314
+ line_min_x = min(u.box.x for u in units if u.box and u.box.x is not None)
2315
+ line_max_x = max(u.box.x2 for u in units if u.box and u.box.x2 is not None)
2316
+ current_width = line_max_x - line_min_x
2317
+
2318
+ # Calculate available space for word spacing
2319
+ available_space = box_width - current_width
2320
+
2321
+ if available_space <= 0:
2322
+ # Line already at or beyond box width, skip spacing
2323
+ continue
2324
+
2325
+ # If total spacing would exceed available space, reduce it
2326
+ if total_spacing > available_space:
2327
+ actual_spacing = available_space / word_gap_count
2328
+
2329
+ # Apply the (possibly reduced) spacing
2330
  cumulative_shift = 0.0
2331
  for unit in units:
2332
  if cumulative_shift:
2333
  self._shift_unit_x(unit, cumulative_shift)
2334
  if getattr(unit, "ends_word", False):
2335
+ cumulative_shift += actual_spacing
2336
 
2337
  def _shift_unit_x(self, unit: TypesettingUnit, shift_x: float):
2338
  """Shift a typesetting unit horizontally."""
 
2369
  - Extend to the left margin (target_left)
2370
  - Space between words is distributed evenly to fill the gap
2371
 
2372
+ IMPORTANT: The line must stay within bounds - left edge >= target_left
2373
+
2374
  Args:
2375
  line_units: List of typesetting units in the line
2376
  target_left: Target left edge (box.x)
 
2386
 
2387
  if current_width <= 0 or target_width <= current_width:
2388
  # Line already fills the space or is wider, just right-align
2389
+ # But clamp to prevent left-side overflow!
2390
  shift_x = target_right - current_right
2391
+
2392
+ # Check if shifting would push left edge beyond target_left
2393
+ new_left = current_left + shift_x
2394
+ if new_left < target_left:
2395
+ # Clamp shift to keep left edge at target_left
2396
+ shift_x = target_left - current_left
2397
+
2398
  for unit in line_units:
2399
  self._shift_unit_x(unit, shift_x)
2400
  return