dhruv575 committed on
Commit
98df35b
·
1 Parent(s): be12533

Attempting something new

Browse files
Files changed (2) hide show
  1. app/email_new_converter.py +23 -72
  2. app/post_process.py +42 -67
app/email_new_converter.py CHANGED
@@ -575,14 +575,16 @@ class EmailNewConverter:
575
  def _fix_section_header_tables(self, soup: BeautifulSoup) -> None:
576
  """Restore section header tables to the simplest, most compatible 2-column layout.
577
 
578
- Section headers like "Hot Polymarkets" and "Volume" are in a table with two <td>s.
579
- Desktop clients handle many layout hints, but mobile Gmail can mis-render when we add
580
- `table-layout:fixed` or explicit widths/min-widths. The most reliable pattern is:
581
- - header table: width="100%"
582
- - second <td>: align="right" (and optional text-align:right), no forced widths
 
 
 
583
  """
584
  # Find all section header tables - they're inside .wrap_h divs and have two <td>s
585
- # One with the section title, one with align="right" for the label
586
  wrap_h_divs = soup.find_all("div", class_="wrap_h")
587
 
588
  for wrap_h in wrap_h_divs:
@@ -591,17 +593,14 @@ class EmailNewConverter:
591
  for table in tables:
592
  tds = table.find_all("td")
593
  if len(tds) == 2:
594
- # This is likely a section header table
595
- # First td: section title (left)
596
- # Second td: label like "Volume" (right)
597
-
598
  first_td = tds[0]
599
  second_td = tds[1]
600
 
601
- # Ensure the header table is full width using HTML attribute.
602
  table["width"] = "100%"
603
 
604
- # Remove table-layout constraints (common cause of mobile misalignment).
605
  table_style = table.get("style", "")
606
  if table_style and "table-layout" in table_style.lower():
607
  table["style"] = re.sub(
@@ -611,7 +610,8 @@ class EmailNewConverter:
611
  flags=re.IGNORECASE,
612
  ).strip("; ")
613
 
614
- # Remove width/min-width constraints on both header cells (let table algorithm work).
 
615
  for td in (first_td, second_td):
616
  if td.get("width"):
617
  del td["width"]
@@ -619,71 +619,22 @@ class EmailNewConverter:
619
  if td_style:
620
  td_style = re.sub(r'width\s*:\s*[^;]+;?\s*', '', td_style, flags=re.IGNORECASE)
621
  td_style = re.sub(r'min-width\s*:\s*[^;]+;?\s*', '', td_style, flags=re.IGNORECASE)
622
- # We previously added white-space:nowrap; remove it to match legacy behavior.
623
  td_style = re.sub(r'white-space\s*:\s*[^;]+;?\s*', '', td_style, flags=re.IGNORECASE)
 
624
  td["style"] = td_style.strip("; ")
625
 
626
- # Use 3-column layout with spacer cell for reliable right-alignment on mobile.
627
- # This is more reliable than 2-column layouts on mobile email clients.
628
- # Structure: [Title] [Spacer] [Subtitle]
629
-
630
- # Get the current content of both cells
631
- first_td_contents = list(first_td.children)
632
 
633
- # Extract content from second_td, handling nested tables if present
634
  second_td_style = second_td.get("style", "")
635
- nested_table = second_td.find("table")
636
- if nested_table:
637
- # If there's a nested table, get content from its td
638
- nested_td = nested_table.find("td")
639
- if nested_td:
640
- second_td_contents = list(nested_td.children)
641
- # Use nested_td style if available, otherwise use second_td style
642
- nested_style = nested_td.get("style", "")
643
- if nested_style:
644
- second_td_style = nested_style
645
- else:
646
- second_td_contents = list(second_td.children)
647
- else:
648
- second_td_contents = list(second_td.children)
649
-
650
- # Get the row containing these cells
651
- tr = first_td.find_parent("tr")
652
- if not tr:
653
- continue
654
-
655
- # Clear the row
656
- tr.clear()
657
-
658
- # First cell: title (left)
659
- new_first_td = soup.new_tag("td")
660
- first_td_style = first_td.get("style", "")
661
- if first_td_style:
662
- # Remove padding-right if it exists (we'll use spacer instead)
663
- first_td_style = re.sub(r'padding-right\s*:\s*[^;]+;?\s*', '', first_td_style, flags=re.IGNORECASE)
664
- new_first_td["style"] = first_td_style.strip("; ")
665
- for content in first_td_contents:
666
- new_first_td.append(content)
667
- tr.append(new_first_td)
668
-
669
- # Second cell: spacer (flexible width, pushes content apart)
670
- # No width specified - table algorithm will give it remaining space
671
- spacer_td = soup.new_tag("td")
672
- spacer_td["style"] = "font-size:1px;line-height:1px;"
673
- spacer_td.append("\u00A0") # Non-breaking space to prevent collapse
674
- tr.append(spacer_td)
675
-
676
- # Third cell: subtitle (right)
677
- new_second_td = soup.new_tag("td")
678
- if second_td_style:
679
- new_second_td["style"] = second_td_style
680
- new_second_td["align"] = "right"
681
- new_second_td["valign"] = "bottom"
682
- for content in second_td_contents:
683
- new_second_td.append(content)
684
- tr.append(new_second_td)
685
 
686
- logger.info("Fixed section header table layout using nested table for mobile compatibility")
687
 
688
  def _normalize_style_attributes(self, soup: BeautifulSoup) -> None:
689
  """Normalize style attributes for consistent email client parsing.
 
575
  def _fix_section_header_tables(self, soup: BeautifulSoup) -> None:
576
  """Restore section header tables to the simplest, most compatible 2-column layout.
577
 
578
+ Based on analysis of working emails (original_msg.eml), the most reliable pattern is:
579
+ - Simple 2-column table with width="100%"
580
+ - Second cell has align="right" attribute
581
+ - NO table-layout:fixed
582
+ - NO explicit widths on cells
583
+ - NO spacer cells or nested tables
584
+
585
+ This matches the original template structure that worked on mobile.
586
  """
587
  # Find all section header tables - they're inside .wrap_h divs and have two <td>s
 
588
  wrap_h_divs = soup.find_all("div", class_="wrap_h")
589
 
590
  for wrap_h in wrap_h_divs:
 
593
  for table in tables:
594
  tds = table.find_all("td")
595
  if len(tds) == 2:
596
+ # This is a section header table
 
 
 
597
  first_td = tds[0]
598
  second_td = tds[1]
599
 
600
+ # Ensure the header table is full width using HTML attribute
601
  table["width"] = "100%"
602
 
603
+ # Remove table-layout constraints (causes mobile misalignment)
604
  table_style = table.get("style", "")
605
  if table_style and "table-layout" in table_style.lower():
606
  table["style"] = re.sub(
 
610
  flags=re.IGNORECASE,
611
  ).strip("; ")
612
 
613
+ # Remove width/min-width/white-space constraints on both cells
614
+ # Let the table algorithm handle spacing naturally
615
  for td in (first_td, second_td):
616
  if td.get("width"):
617
  del td["width"]
 
619
  if td_style:
620
  td_style = re.sub(r'width\s*:\s*[^;]+;?\s*', '', td_style, flags=re.IGNORECASE)
621
  td_style = re.sub(r'min-width\s*:\s*[^;]+;?\s*', '', td_style, flags=re.IGNORECASE)
 
622
  td_style = re.sub(r'white-space\s*:\s*[^;]+;?\s*', '', td_style, flags=re.IGNORECASE)
623
+ td_style = re.sub(r'padding-right\s*:\s*[^;]+;?\s*', '', td_style, flags=re.IGNORECASE)
624
  td["style"] = td_style.strip("; ")
625
 
626
+ # Ensure second cell has align="right" attribute
627
+ second_td["align"] = "right"
 
 
 
 
628
 
629
+ # Add text-align:right as fallback in inline style
630
  second_td_style = second_td.get("style", "")
631
+ if "text-align" not in second_td_style.lower():
632
+ if second_td_style and not second_td_style.endswith(";"):
633
+ second_td_style += ";"
634
+ second_td_style += "text-align:right;"
635
+ second_td["style"] = second_td_style.strip("; ")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
636
 
637
+ logger.info("Fixed section header table to simple 2-column layout for mobile compatibility")
638
 
639
  def _normalize_style_attributes(self, soup: BeautifulSoup) -> None:
640
  """Normalize style attributes for consistent email client parsing.
app/post_process.py CHANGED
@@ -354,14 +354,16 @@ def fix_cloudinary_image_transformations(html_content: str) -> str:
354
 
355
 
356
  def fix_section_header_alignment(html_content: str) -> str:
357
- """Restore section header tables to the simplest, most compatible layout.
358
 
359
- Insight from prior working emails: the most reliable pattern for mobile clients is the
360
- default table algorithm (no table-layout:fixed, no forced widths/min-widths on the header cells).
 
 
 
 
361
 
362
- This function strips those constraints while preserving the core semantics:
363
- - header table remains width="100%"
364
- - second cell remains align="right" (with optional text-align:right fallback)
365
  """
366
  from bs4 import BeautifulSoup
367
 
@@ -372,91 +374,64 @@ def fix_section_header_alignment(html_content: str) -> str:
372
  for wrap_h in wrap_h_divs:
373
  tables = wrap_h.find_all("table", role="presentation")
374
  for table in tables:
375
- # Identify candidate header table: exactly one row with 2 or 3 tds.
376
  tr = table.find("tr")
377
  if not tr:
378
  continue
379
  tds = tr.find_all("td", recursive=False)
380
 
381
- # If already 3 columns, skip (already converted)
382
- if len(tds) == 3:
383
- continue
384
-
385
- # Must have exactly 2 columns to convert
386
  if len(tds) != 2:
387
  continue
388
 
389
  first_td, second_td = tds[0], tds[1]
390
 
391
- # Keep header table full width via attribute (legacy behavior).
392
  if table.get("width") != "100%":
393
  table["width"] = "100%"
394
  modified = True
395
 
396
- # Strip table-layout from table style if present.
397
  table_style = table.get("style", "")
398
  if table_style and "table-layout" in table_style.lower():
399
  new_table_style = re.sub(r"table-layout\s*:\s*[^;]+;?\s*", "", table_style, flags=re.IGNORECASE).strip("; ")
400
  table["style"] = new_table_style
401
  modified = True
402
 
403
- # Convert 2-column to 3-column layout with spacer cell
404
- # Get the current content of both cells
405
- first_td_contents = list(first_td.children)
406
-
407
- # Get styles first
408
- first_td_style = first_td.get("style", "")
409
- second_td_style = second_td.get("style", "")
410
-
411
- # Extract content from second_td, handling nested tables if present
412
- nested_table = second_td.find("table")
413
- if nested_table:
414
- # If there's a nested table, get content from its td
415
- nested_td = nested_table.find("td")
416
- if nested_td:
417
- second_td_contents = list(nested_td.children)
418
- # Use nested_td style if available, otherwise use second_td style
419
- nested_style = nested_td.get("style", "")
420
- if nested_style:
421
- second_td_style = nested_style
422
- else:
423
- second_td_contents = list(second_td.children)
424
- else:
425
- second_td_contents = list(second_td.children)
426
-
427
- # Clear the row
428
- tr.clear()
429
-
430
- # First cell: title (left)
431
- new_first_td = soup.new_tag("td")
432
- if first_td_style:
433
- # Remove padding-right if it exists (we'll use spacer instead)
434
- first_td_style = re.sub(r"padding-right\s*:\s*[^;]+;?\s*", "", first_td_style, flags=re.IGNORECASE)
435
- new_first_td["style"] = first_td_style.strip("; ")
436
- for content in first_td_contents:
437
- new_first_td.append(content)
438
- tr.append(new_first_td)
439
-
440
- # Second cell: spacer (flexible width, pushes content apart)
441
- spacer_td = soup.new_tag("td")
442
- spacer_td["style"] = "font-size:1px;line-height:1px;"
443
- spacer_td.append("\u00A0") # Non-breaking space to prevent collapse
444
- tr.append(spacer_td)
445
 
446
- # Third cell: subtitle (right)
447
- new_second_td = soup.new_tag("td")
448
- if second_td_style:
449
- new_second_td["style"] = second_td_style
450
- new_second_td["align"] = "right"
451
- new_second_td["valign"] = "bottom"
452
- for content in second_td_contents:
453
- new_second_td.append(content)
454
- tr.append(new_second_td)
455
 
456
- modified = True
 
 
 
 
 
 
 
457
 
458
  if modified:
459
- print(" Restored section header table layout for mobile")
460
  return str(soup)
461
 
462
  print(" No section header tables found that need alignment fixes")
 
354
 
355
 
356
  def fix_section_header_alignment(html_content: str) -> str:
357
+ """Restore section header tables to the simplest, most compatible 2-column layout.
358
 
359
+ Based on analysis of working emails (original_msg.eml), the most reliable pattern is:
360
+ - Simple 2-column table with width="100%"
361
+ - Second cell has align="right" attribute
362
+ - NO table-layout:fixed
363
+ - NO explicit widths on cells
364
+ - NO spacer cells or nested tables
365
 
366
+ This matches the original template structure that worked on mobile.
 
 
367
  """
368
  from bs4 import BeautifulSoup
369
 
 
374
  for wrap_h in wrap_h_divs:
375
  tables = wrap_h.find_all("table", role="presentation")
376
  for table in tables:
377
+ # Identify candidate header table: exactly one row with 2 tds
378
  tr = table.find("tr")
379
  if not tr:
380
  continue
381
  tds = tr.find_all("td", recursive=False)
382
 
383
+ # Must have exactly 2 columns
 
 
 
 
384
  if len(tds) != 2:
385
  continue
386
 
387
  first_td, second_td = tds[0], tds[1]
388
 
389
+ # Ensure header table is full width via attribute
390
  if table.get("width") != "100%":
391
  table["width"] = "100%"
392
  modified = True
393
 
394
+ # Strip table-layout from table style if present
395
  table_style = table.get("style", "")
396
  if table_style and "table-layout" in table_style.lower():
397
  new_table_style = re.sub(r"table-layout\s*:\s*[^;]+;?\s*", "", table_style, flags=re.IGNORECASE).strip("; ")
398
  table["style"] = new_table_style
399
  modified = True
400
 
401
+ # Remove width/min-width/white-space/padding-right constraints on both cells
402
+ for td in (first_td, second_td):
403
+ if td.get("width"):
404
+ del td["width"]
405
+ modified = True
406
+
407
+ td_style = td.get("style", "")
408
+ if td_style:
409
+ original_style = td_style
410
+ td_style = re.sub(r"width\s*:\s*[^;]+;?\s*", "", td_style, flags=re.IGNORECASE)
411
+ td_style = re.sub(r"min-width\s*:\s*[^;]+;?\s*", "", td_style, flags=re.IGNORECASE)
412
+ td_style = re.sub(r"white-space\s*:\s*[^;]+;?\s*", "", td_style, flags=re.IGNORECASE)
413
+ td_style = re.sub(r"padding-right\s*:\s*[^;]+;?\s*", "", td_style, flags=re.IGNORECASE)
414
+ td_style = td_style.strip("; ")
415
+ if td_style != original_style:
416
+ td["style"] = td_style
417
+ modified = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
 
419
+ # Ensure second cell has align="right" attribute
420
+ if second_td.get("align") != "right":
421
+ second_td["align"] = "right"
422
+ modified = True
 
 
 
 
 
423
 
424
+ # Add text-align:right as fallback in inline style
425
+ second_td_style = second_td.get("style", "")
426
+ if "text-align" not in second_td_style.lower():
427
+ if second_td_style and not second_td_style.endswith(";"):
428
+ second_td_style += ";"
429
+ second_td_style += "text-align:right;"
430
+ second_td["style"] = second_td_style.strip("; ")
431
+ modified = True
432
 
433
  if modified:
434
+ print(" Restored section header tables to simple 2-column layout for mobile")
435
  return str(soup)
436
 
437
  print(" No section header tables found that need alignment fixes")