Spaces:
Sleeping
Sleeping
dhruv575 committed on
Commit ·
98df35b
1
Parent(s): be12533
Attempting something new
Browse files
- app/email_new_converter.py +23 -72
- app/post_process.py +42 -67
app/email_new_converter.py
CHANGED
|
@@ -575,14 +575,16 @@ class EmailNewConverter:
|
|
| 575 |
def _fix_section_header_tables(self, soup: BeautifulSoup) -> None:
|
| 576 |
"""Restore section header tables to the simplest, most compatible 2-column layout.
|
| 577 |
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
-
|
| 582 |
-
-
|
|
|
|
|
|
|
|
|
|
| 583 |
"""
|
| 584 |
# Find all section header tables - they're inside .wrap_h divs and have two <td>s
|
| 585 |
-
# One with the section title, one with align="right" for the label
|
| 586 |
wrap_h_divs = soup.find_all("div", class_="wrap_h")
|
| 587 |
|
| 588 |
for wrap_h in wrap_h_divs:
|
|
@@ -591,17 +593,14 @@ class EmailNewConverter:
|
|
| 591 |
for table in tables:
|
| 592 |
tds = table.find_all("td")
|
| 593 |
if len(tds) == 2:
|
| 594 |
-
# This is
|
| 595 |
-
# First td: section title (left)
|
| 596 |
-
# Second td: label like "Volume" (right)
|
| 597 |
-
|
| 598 |
first_td = tds[0]
|
| 599 |
second_td = tds[1]
|
| 600 |
|
| 601 |
-
# Ensure the header table is full width using HTML attribute
|
| 602 |
table["width"] = "100%"
|
| 603 |
|
| 604 |
-
# Remove table-layout constraints (
|
| 605 |
table_style = table.get("style", "")
|
| 606 |
if table_style and "table-layout" in table_style.lower():
|
| 607 |
table["style"] = re.sub(
|
|
@@ -611,7 +610,8 @@ class EmailNewConverter:
|
|
| 611 |
flags=re.IGNORECASE,
|
| 612 |
).strip("; ")
|
| 613 |
|
| 614 |
-
# Remove width/min-width constraints on both
|
|
|
|
| 615 |
for td in (first_td, second_td):
|
| 616 |
if td.get("width"):
|
| 617 |
del td["width"]
|
|
@@ -619,71 +619,22 @@ class EmailNewConverter:
|
|
| 619 |
if td_style:
|
| 620 |
td_style = re.sub(r'width\s*:\s*[^;]+;?\s*', '', td_style, flags=re.IGNORECASE)
|
| 621 |
td_style = re.sub(r'min-width\s*:\s*[^;]+;?\s*', '', td_style, flags=re.IGNORECASE)
|
| 622 |
-
# We previously added white-space:nowrap; remove it to match legacy behavior.
|
| 623 |
td_style = re.sub(r'white-space\s*:\s*[^;]+;?\s*', '', td_style, flags=re.IGNORECASE)
|
|
|
|
| 624 |
td["style"] = td_style.strip("; ")
|
| 625 |
|
| 626 |
-
#
|
| 627 |
-
|
| 628 |
-
# Structure: [Title] [Spacer] [Subtitle]
|
| 629 |
-
|
| 630 |
-
# Get the current content of both cells
|
| 631 |
-
first_td_contents = list(first_td.children)
|
| 632 |
|
| 633 |
-
#
|
| 634 |
second_td_style = second_td.get("style", "")
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
second_td_contents = list(nested_td.children)
|
| 641 |
-
# Use nested_td style if available, otherwise use second_td style
|
| 642 |
-
nested_style = nested_td.get("style", "")
|
| 643 |
-
if nested_style:
|
| 644 |
-
second_td_style = nested_style
|
| 645 |
-
else:
|
| 646 |
-
second_td_contents = list(second_td.children)
|
| 647 |
-
else:
|
| 648 |
-
second_td_contents = list(second_td.children)
|
| 649 |
-
|
| 650 |
-
# Get the row containing these cells
|
| 651 |
-
tr = first_td.find_parent("tr")
|
| 652 |
-
if not tr:
|
| 653 |
-
continue
|
| 654 |
-
|
| 655 |
-
# Clear the row
|
| 656 |
-
tr.clear()
|
| 657 |
-
|
| 658 |
-
# First cell: title (left)
|
| 659 |
-
new_first_td = soup.new_tag("td")
|
| 660 |
-
first_td_style = first_td.get("style", "")
|
| 661 |
-
if first_td_style:
|
| 662 |
-
# Remove padding-right if it exists (we'll use spacer instead)
|
| 663 |
-
first_td_style = re.sub(r'padding-right\s*:\s*[^;]+;?\s*', '', first_td_style, flags=re.IGNORECASE)
|
| 664 |
-
new_first_td["style"] = first_td_style.strip("; ")
|
| 665 |
-
for content in first_td_contents:
|
| 666 |
-
new_first_td.append(content)
|
| 667 |
-
tr.append(new_first_td)
|
| 668 |
-
|
| 669 |
-
# Second cell: spacer (flexible width, pushes content apart)
|
| 670 |
-
# No width specified - table algorithm will give it remaining space
|
| 671 |
-
spacer_td = soup.new_tag("td")
|
| 672 |
-
spacer_td["style"] = "font-size:1px;line-height:1px;"
|
| 673 |
-
spacer_td.append("\u00A0") # Non-breaking space to prevent collapse
|
| 674 |
-
tr.append(spacer_td)
|
| 675 |
-
|
| 676 |
-
# Third cell: subtitle (right)
|
| 677 |
-
new_second_td = soup.new_tag("td")
|
| 678 |
-
if second_td_style:
|
| 679 |
-
new_second_td["style"] = second_td_style
|
| 680 |
-
new_second_td["align"] = "right"
|
| 681 |
-
new_second_td["valign"] = "bottom"
|
| 682 |
-
for content in second_td_contents:
|
| 683 |
-
new_second_td.append(content)
|
| 684 |
-
tr.append(new_second_td)
|
| 685 |
|
| 686 |
-
logger.info("Fixed section header table
|
| 687 |
|
| 688 |
def _normalize_style_attributes(self, soup: BeautifulSoup) -> None:
|
| 689 |
"""Normalize style attributes for consistent email client parsing.
|
|
|
|
| 575 |
def _fix_section_header_tables(self, soup: BeautifulSoup) -> None:
|
| 576 |
"""Restore section header tables to the simplest, most compatible 2-column layout.
|
| 577 |
|
| 578 |
+
Based on analysis of working emails (original_msg.eml), the most reliable pattern is:
|
| 579 |
+
- Simple 2-column table with width="100%"
|
| 580 |
+
- Second cell has align="right" attribute
|
| 581 |
+
- NO table-layout:fixed
|
| 582 |
+
- NO explicit widths on cells
|
| 583 |
+
- NO spacer cells or nested tables
|
| 584 |
+
|
| 585 |
+
This matches the original template structure that worked on mobile.
|
| 586 |
"""
|
| 587 |
# Find all section header tables - they're inside .wrap_h divs and have two <td>s
|
|
|
|
| 588 |
wrap_h_divs = soup.find_all("div", class_="wrap_h")
|
| 589 |
|
| 590 |
for wrap_h in wrap_h_divs:
|
|
|
|
| 593 |
for table in tables:
|
| 594 |
tds = table.find_all("td")
|
| 595 |
if len(tds) == 2:
|
| 596 |
+
# This is a section header table
|
|
|
|
|
|
|
|
|
|
| 597 |
first_td = tds[0]
|
| 598 |
second_td = tds[1]
|
| 599 |
|
| 600 |
+
# Ensure the header table is full width using HTML attribute
|
| 601 |
table["width"] = "100%"
|
| 602 |
|
| 603 |
+
# Remove table-layout constraints (causes mobile misalignment)
|
| 604 |
table_style = table.get("style", "")
|
| 605 |
if table_style and "table-layout" in table_style.lower():
|
| 606 |
table["style"] = re.sub(
|
|
|
|
| 610 |
flags=re.IGNORECASE,
|
| 611 |
).strip("; ")
|
| 612 |
|
| 613 |
+
# Remove width/min-width/white-space constraints on both cells
|
| 614 |
+
# Let the table algorithm handle spacing naturally
|
| 615 |
for td in (first_td, second_td):
|
| 616 |
if td.get("width"):
|
| 617 |
del td["width"]
|
|
|
|
| 619 |
if td_style:
|
| 620 |
td_style = re.sub(r'width\s*:\s*[^;]+;?\s*', '', td_style, flags=re.IGNORECASE)
|
| 621 |
td_style = re.sub(r'min-width\s*:\s*[^;]+;?\s*', '', td_style, flags=re.IGNORECASE)
|
|
|
|
| 622 |
td_style = re.sub(r'white-space\s*:\s*[^;]+;?\s*', '', td_style, flags=re.IGNORECASE)
|
| 623 |
+
td_style = re.sub(r'padding-right\s*:\s*[^;]+;?\s*', '', td_style, flags=re.IGNORECASE)
|
| 624 |
td["style"] = td_style.strip("; ")
|
| 625 |
|
| 626 |
+
# Ensure second cell has align="right" attribute
|
| 627 |
+
second_td["align"] = "right"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 628 |
|
| 629 |
+
# Add text-align:right as fallback in inline style
|
| 630 |
second_td_style = second_td.get("style", "")
|
| 631 |
+
if "text-align" not in second_td_style.lower():
|
| 632 |
+
if second_td_style and not second_td_style.endswith(";"):
|
| 633 |
+
second_td_style += ";"
|
| 634 |
+
second_td_style += "text-align:right;"
|
| 635 |
+
second_td["style"] = second_td_style.strip("; ")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 636 |
|
| 637 |
+
logger.info("Fixed section header table to simple 2-column layout for mobile compatibility")
|
| 638 |
|
| 639 |
def _normalize_style_attributes(self, soup: BeautifulSoup) -> None:
|
| 640 |
"""Normalize style attributes for consistent email client parsing.
|
app/post_process.py
CHANGED
|
@@ -354,14 +354,16 @@ def fix_cloudinary_image_transformations(html_content: str) -> str:
|
|
| 354 |
|
| 355 |
|
| 356 |
def fix_section_header_alignment(html_content: str) -> str:
|
| 357 |
-
"""Restore section header tables to the simplest, most compatible layout.
|
| 358 |
|
| 359 |
-
|
| 360 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 361 |
|
| 362 |
-
This
|
| 363 |
-
- header table remains width="100%"
|
| 364 |
-
- second cell remains align="right" (with optional text-align:right fallback)
|
| 365 |
"""
|
| 366 |
from bs4 import BeautifulSoup
|
| 367 |
|
|
@@ -372,91 +374,64 @@ def fix_section_header_alignment(html_content: str) -> str:
|
|
| 372 |
for wrap_h in wrap_h_divs:
|
| 373 |
tables = wrap_h.find_all("table", role="presentation")
|
| 374 |
for table in tables:
|
| 375 |
-
# Identify candidate header table: exactly one row with 2
|
| 376 |
tr = table.find("tr")
|
| 377 |
if not tr:
|
| 378 |
continue
|
| 379 |
tds = tr.find_all("td", recursive=False)
|
| 380 |
|
| 381 |
-
#
|
| 382 |
-
if len(tds) == 3:
|
| 383 |
-
continue
|
| 384 |
-
|
| 385 |
-
# Must have exactly 2 columns to convert
|
| 386 |
if len(tds) != 2:
|
| 387 |
continue
|
| 388 |
|
| 389 |
first_td, second_td = tds[0], tds[1]
|
| 390 |
|
| 391 |
-
#
|
| 392 |
if table.get("width") != "100%":
|
| 393 |
table["width"] = "100%"
|
| 394 |
modified = True
|
| 395 |
|
| 396 |
-
# Strip table-layout from table style if present
|
| 397 |
table_style = table.get("style", "")
|
| 398 |
if table_style and "table-layout" in table_style.lower():
|
| 399 |
new_table_style = re.sub(r"table-layout\s*:\s*[^;]+;?\s*", "", table_style, flags=re.IGNORECASE).strip("; ")
|
| 400 |
table["style"] = new_table_style
|
| 401 |
modified = True
|
| 402 |
|
| 403 |
-
#
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
if nested_style:
|
| 421 |
-
second_td_style = nested_style
|
| 422 |
-
else:
|
| 423 |
-
second_td_contents = list(second_td.children)
|
| 424 |
-
else:
|
| 425 |
-
second_td_contents = list(second_td.children)
|
| 426 |
-
|
| 427 |
-
# Clear the row
|
| 428 |
-
tr.clear()
|
| 429 |
-
|
| 430 |
-
# First cell: title (left)
|
| 431 |
-
new_first_td = soup.new_tag("td")
|
| 432 |
-
if first_td_style:
|
| 433 |
-
# Remove padding-right if it exists (we'll use spacer instead)
|
| 434 |
-
first_td_style = re.sub(r"padding-right\s*:\s*[^;]+;?\s*", "", first_td_style, flags=re.IGNORECASE)
|
| 435 |
-
new_first_td["style"] = first_td_style.strip("; ")
|
| 436 |
-
for content in first_td_contents:
|
| 437 |
-
new_first_td.append(content)
|
| 438 |
-
tr.append(new_first_td)
|
| 439 |
-
|
| 440 |
-
# Second cell: spacer (flexible width, pushes content apart)
|
| 441 |
-
spacer_td = soup.new_tag("td")
|
| 442 |
-
spacer_td["style"] = "font-size:1px;line-height:1px;"
|
| 443 |
-
spacer_td.append("\u00A0") # Non-breaking space to prevent collapse
|
| 444 |
-
tr.append(spacer_td)
|
| 445 |
|
| 446 |
-
#
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
new_second_td["align"] = "right"
|
| 451 |
-
new_second_td["valign"] = "bottom"
|
| 452 |
-
for content in second_td_contents:
|
| 453 |
-
new_second_td.append(content)
|
| 454 |
-
tr.append(new_second_td)
|
| 455 |
|
| 456 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
|
| 458 |
if modified:
|
| 459 |
-
print(" Restored section header
|
| 460 |
return str(soup)
|
| 461 |
|
| 462 |
print(" No section header tables found that need alignment fixes")
|
|
|
|
| 354 |
|
| 355 |
|
| 356 |
def fix_section_header_alignment(html_content: str) -> str:
|
| 357 |
+
"""Restore section header tables to the simplest, most compatible 2-column layout.
|
| 358 |
|
| 359 |
+
Based on analysis of working emails (original_msg.eml), the most reliable pattern is:
|
| 360 |
+
- Simple 2-column table with width="100%"
|
| 361 |
+
- Second cell has align="right" attribute
|
| 362 |
+
- NO table-layout:fixed
|
| 363 |
+
- NO explicit widths on cells
|
| 364 |
+
- NO spacer cells or nested tables
|
| 365 |
|
| 366 |
+
This matches the original template structure that worked on mobile.
|
|
|
|
|
|
|
| 367 |
"""
|
| 368 |
from bs4 import BeautifulSoup
|
| 369 |
|
|
|
|
| 374 |
for wrap_h in wrap_h_divs:
|
| 375 |
tables = wrap_h.find_all("table", role="presentation")
|
| 376 |
for table in tables:
|
| 377 |
+
# Identify candidate header table: exactly one row with 2 tds
|
| 378 |
tr = table.find("tr")
|
| 379 |
if not tr:
|
| 380 |
continue
|
| 381 |
tds = tr.find_all("td", recursive=False)
|
| 382 |
|
| 383 |
+
# Must have exactly 2 columns
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
if len(tds) != 2:
|
| 385 |
continue
|
| 386 |
|
| 387 |
first_td, second_td = tds[0], tds[1]
|
| 388 |
|
| 389 |
+
# Ensure header table is full width via attribute
|
| 390 |
if table.get("width") != "100%":
|
| 391 |
table["width"] = "100%"
|
| 392 |
modified = True
|
| 393 |
|
| 394 |
+
# Strip table-layout from table style if present
|
| 395 |
table_style = table.get("style", "")
|
| 396 |
if table_style and "table-layout" in table_style.lower():
|
| 397 |
new_table_style = re.sub(r"table-layout\s*:\s*[^;]+;?\s*", "", table_style, flags=re.IGNORECASE).strip("; ")
|
| 398 |
table["style"] = new_table_style
|
| 399 |
modified = True
|
| 400 |
|
| 401 |
+
# Remove width/min-width/white-space/padding-right constraints on both cells
|
| 402 |
+
for td in (first_td, second_td):
|
| 403 |
+
if td.get("width"):
|
| 404 |
+
del td["width"]
|
| 405 |
+
modified = True
|
| 406 |
+
|
| 407 |
+
td_style = td.get("style", "")
|
| 408 |
+
if td_style:
|
| 409 |
+
original_style = td_style
|
| 410 |
+
td_style = re.sub(r"width\s*:\s*[^;]+;?\s*", "", td_style, flags=re.IGNORECASE)
|
| 411 |
+
td_style = re.sub(r"min-width\s*:\s*[^;]+;?\s*", "", td_style, flags=re.IGNORECASE)
|
| 412 |
+
td_style = re.sub(r"white-space\s*:\s*[^;]+;?\s*", "", td_style, flags=re.IGNORECASE)
|
| 413 |
+
td_style = re.sub(r"padding-right\s*:\s*[^;]+;?\s*", "", td_style, flags=re.IGNORECASE)
|
| 414 |
+
td_style = td_style.strip("; ")
|
| 415 |
+
if td_style != original_style:
|
| 416 |
+
td["style"] = td_style
|
| 417 |
+
modified = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
|
| 419 |
+
# Ensure second cell has align="right" attribute
|
| 420 |
+
if second_td.get("align") != "right":
|
| 421 |
+
second_td["align"] = "right"
|
| 422 |
+
modified = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 423 |
|
| 424 |
+
# Add text-align:right as fallback in inline style
|
| 425 |
+
second_td_style = second_td.get("style", "")
|
| 426 |
+
if "text-align" not in second_td_style.lower():
|
| 427 |
+
if second_td_style and not second_td_style.endswith(";"):
|
| 428 |
+
second_td_style += ";"
|
| 429 |
+
second_td_style += "text-align:right;"
|
| 430 |
+
second_td["style"] = second_td_style.strip("; ")
|
| 431 |
+
modified = True
|
| 432 |
|
| 433 |
if modified:
|
| 434 |
+
print(" Restored section header tables to simple 2-column layout for mobile")
|
| 435 |
return str(soup)
|
| 436 |
|
| 437 |
print(" No section header tables found that need alignment fixes")
|