Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Post-process Polygraph newsletter HTML: | |
| 1. Convert Polymarket links to Dub tracking links | |
| 2. Minify HTML for Gmail compatibility (removes @font-face style tags, whitespace) | |
| Usage: python post_process.py <input_html_file> <dub_api_key> [output_html_file] | |
| IMPORTANT: This script uses conservative minification to preserve email formatting. | |
| It does NOT remove: | |
| - role="none" attributes (needed for table alignment in Gmail) | |
| - Quoted attribute values (some email clients require them) | |
| - font-style properties (even non-standard ones like Bold/Regular) | |
| - Style tags (except those containing @font-face rules) | |
| """ | |
| import os | |
| import re | |
| import sys | |
| import html | |
| import requests | |
| from urllib.parse import urlparse | |
| # ============================================================================= | |
| # DUB LINK CONVERSION | |
| # ============================================================================= | |
def create_dub_tracking_link(url: str, title: str, api_key: str = None, tags: list = None) -> str:
    """Create a Dub tracking short link for *url*.

    Returns the Dub short link on success, or the original *url* on any
    failure (missing key, API error, suspicious response). Tags default to
    ["polygraph"] when not supplied; an explicitly empty list suppresses
    tagging entirely.
    """
    if not api_key:
        print(" Warning: Dub API key not provided, using original URL")
        return url
    try:
        tag_names = ["polygraph"] if tags is None else tags
        # Truncate overly long titles so the Dub dashboard stays readable.
        display_title = title[:50] + "..." if len(title) > 50 else title
        body = {
            "url": url,
            "workspaceId": "ws_cm7dm89q90000qmskmss62vla",
            "title": display_title,
            "comments": "Polymarket daily email",
            "trackConversion": True,
        }
        if tag_names:
            body["tagNames"] = tag_names
        resp = requests.post(
            "https://api.dub.co/links",
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
            },
            json=body,
            timeout=10,
        )
        if resp.status_code not in (200, 201):
            print(f" Warning: Dub API error {resp.status_code}")
            return url
        data = resp.json()
        short_link = data.get('shortLink') or data.get('shortUrl') or data.get('link')
        if not short_link:
            print(" Warning: No short link in response")
            return url
        # Sanity-check the returned link before trusting it.
        if not short_link.startswith('http'):
            print(f" Warning: Invalid dub link format: {short_link}")
            return url
        if 'polymarket.com' in short_link:
            print(" Warning: Dub returned original URL")
            return url
        if all(domain not in short_link for domain in ('poly.market', 'dub.co', 'dub.sh')):
            print(f" Warning: Unexpected domain: {short_link}")
            return url
        print(f" + {short_link}")
        return short_link
    except Exception as e:
        print(f" Warning: Dub exception: {e}")
        return url
def extract_market_title(url: str) -> str:
    """Derive a human-readable title from a Polymarket URL.

    The last path segment is de-slugged and title-cased, e.g.
    ``.../will-x-happen`` -> ``"Will X Happen"``, truncated to 50 chars.
    Falls back to "Polymarket Market" when the URL has no usable path
    segment.

    Bug fix: ``''.strip('/').split('/')`` yields ``['']`` (always truthy),
    so the old ``if parts:`` guard never fired and URLs with an empty path
    (e.g. "https://polymarket.com/") returned "" instead of the fallback.
    """
    path = urlparse(url).path
    slug = path.strip('/').split('/')[-1]
    if slug:
        return slug.replace('-', ' ').title()[:50]
    return "Polymarket Market"
def convert_polymarket_links(html_content: str, api_key: str = None) -> str:
    """Convert all Polymarket links in HTML to Dub tracking links.

    Scans href attributes for polymarket.com URLs and splices in the Dub
    short link for each one. Returns the HTML unchanged when no API key is
    supplied or no links are found.
    """
    if not api_key:
        print(" Skipping link conversion: No Dub API key provided")
        return html_content
    href_re = r'href=["\']([^"\']*polymarket\.com[^"\']*)["\']'
    hits = list(re.finditer(href_re, html_content))
    if not hits:
        print(" No Polymarket links found")
        return html_content
    print(f" Found {len(hits)} Polymarket link(s) to convert")
    converted_count = 0
    # Walk backwards so earlier match offsets stay valid while we splice.
    for hit in hits[::-1]:
        decoded = html.unescape(hit.group(1))
        short = create_dub_tracking_link(decoded, extract_market_title(decoded),
                                         api_key=api_key, tags=["polygraph"])
        if short != decoded:
            start, end = hit.span(1)
            html_content = html_content[:start] + short + html_content[end:]
            converted_count += 1
    print(f" Converted {converted_count} link(s) to Dub tracking links")
    return html_content
| # ============================================================================= | |
| # HTML MINIFICATION (Conservative - preserves email formatting) | |
| # ============================================================================= | |
def remove_font_face_styles(html_content: str) -> str:
    """Remove <style> blocks containing @font-face rules from anywhere in the document.

    Email clients don't support @font-face, so these should be removed.

    Bug fix: the old pattern ``<style[^>]*>.*?@font-face.*?</style>`` (with
    DOTALL) could begin at an earlier, innocent <style> block, cross its
    ``</style>``, and delete BOTH blocks when a later block contained
    @font-face. Each <style> block is now matched individually and removed
    only if it itself contains @font-face (case-insensitive, matching the
    old IGNORECASE behavior). Also drops the unused ``original_content``
    variable.
    """
    style_re = re.compile(r'<style[^>]*>.*?</style>', flags=re.DOTALL | re.IGNORECASE)
    removed = 0

    def _strip_font_face(match):
        nonlocal removed
        block = match.group(0)
        if '@font-face' in block.lower():
            removed += 1
            return ''  # drop the whole <style> block
        return block   # keep non-@font-face styles untouched

    html_content = style_re.sub(_strip_font_face, html_content)
    if removed:
        print(f" Found {removed} style tag(s) containing @font-face")
        for _ in range(removed):
            print(" Removed @font-face style tag")
    else:
        print(" No @font-face style tags found")
    return html_content
def minify_whitespace(html_content: str) -> str:
    """Remove unnecessary whitespace while preserving structure.

    IMPORTANT: This is conservative - it only removes:
    - Blank lines
    - Leading/trailing whitespace on lines
    - Whitespace between tags
    It does NOT modify attribute values or remove attributes.
    """
    # Strip every line, drop the blank ones, and collapse onto one line.
    compact = ''.join(
        stripped
        for stripped in (line.strip() for line in html_content.split('\n'))
        if stripped
    )
    # Whitespace sitting strictly between tags carries no content — drop it.
    return re.sub(r'>\s+<', '><', compact)
def fix_header_centering(html_content: str) -> str:
    """Ensure header container has margin:0 auto for centering."""
    # Only rewrites the exact container <div> missing the margin rule;
    # containers that already carry margin:0 auto won't match this pattern.
    container_re = r'<div class="container" style="max-width:500px;\s*padding:0 16px;\s*text-align:center"'
    centered = '<div class="container" style="max-width:500px; margin:0 auto; padding:0 16px; text-align:center"'
    return re.sub(container_re, centered, html_content)
def fix_story_box_headlines(html_content: str) -> str:
    """Add margin-top:0 to story box h4 elements to prevent empty line above headlines."""
    # Targets single-quoted style attributes ending in margin-bottom:8px.
    h4_re = r"<h4 style='[^']*margin-bottom:8px'[^>]*>"

    def _ensure_margin_top(m):
        tag = m.group(0)
        if 'margin-top:0' in tag:
            # Already fixed — leave the tag untouched.
            return tag
        return tag.replace("margin-bottom:8px'", "margin-bottom:8px; margin-top:0'")

    return re.sub(h4_re, _ensure_margin_top, html_content)
def fix_unsubscribe_link(html_content: str) -> str:
    """Fix unsubscribe link href to use Customer.io merge tag.

    Finds links containing "unsubscribe here" text and updates their href
    from "#" (or empty) to {% manage_subscription_preferences_url %}.
    """
    placeholder_href = r'href=["\']#["\']'
    merge_href = 'href="{% manage_subscription_preferences_url %}"'

    def _retarget(tag):
        # Swap the placeholder href for the Customer.io merge tag.
        return re.sub(placeholder_href, merge_href, tag, flags=re.IGNORECASE)

    fixed_any = False

    # Case 1: the whole phrase "unsubscribe here" lives inside the anchor.
    full_phrase = r'(<a[^>]*href=["\']#["\'][^>]*>.*?unsubscribe\s+here.*?</a>)'
    if re.search(full_phrase, html_content, re.IGNORECASE | re.DOTALL):
        html_content = re.sub(full_phrase, lambda m: _retarget(m.group(0)),
                              html_content, flags=re.IGNORECASE | re.DOTALL)
        fixed_any = True
        print(" Fixed unsubscribe link href to {% manage_subscription_preferences_url %}")

    # Case 2: only "here" is linked, directly preceded by the word "unsubscribe".
    here_only = r'(unsubscribe\s+)(<a[^>]*href=["\']#["\'][^>]*>.*?here.*?</a>)'
    if re.search(here_only, html_content, re.IGNORECASE | re.DOTALL):
        html_content = re.sub(here_only, lambda m: m.group(1) + _retarget(m.group(2)),
                              html_content, flags=re.IGNORECASE | re.DOTALL)
        fixed_any = True
        print(" Fixed unsubscribe 'here' link href to {% manage_subscription_preferences_url %}")

    if not fixed_any:
        print(" No unsubscribe links found that need href fixing")
    return html_content
def fix_cloudinary_image_transformations(html_content: str) -> str:
    """Add Cloudinary transformations to market images that are missing them.

    This ensures all market images (48px square images) have proper transformations
    for cropping, while preserving aspect ratio for footer and top news images.

    Only bare /upload/ URLs are touched; URLs that already carry w_/c_/h_
    transformations, and any image classified as footer / top-news /
    last-word, are returned unchanged.
    """
    modified = False
    fixed_count = 0
    # Pattern to match img tags with Cloudinary URLs
    # Matches: <img ... src="https://res.cloudinary.com/.../upload/v123/..." ...>
    pattern = r'(<img[^>]*src=["\'])(https://res\.cloudinary\.com/[^"\']+)(["\'][^>]*>)'
    def add_transformations(match):
        # Called once per <img> match by re.sub; mutates the outer counters.
        nonlocal modified, fixed_count
        prefix = match.group(1)
        url = match.group(2)
        suffix = match.group(3)
        full_tag = match.group(0)
        # Check if URL already has transformations
        has_transformations = "/upload/w_" in url or "/upload/c_" in url or "/upload/h_" in url
        if has_transformations:
            return full_tag  # Already has transformations, skip
        # Check if this is a market image (48px width) vs footer/top news
        # Market images have width="48" or style containing "width:48px"
        is_market_image = False
        is_footer_or_top_news = False
        # Check for width="48" attribute
        if 'width="48"' in full_tag or "width='48'" in full_tag:
            is_market_image = True
        # Check for style with width:48px
        if re.search(r'width\s*:\s*48px', full_tag, re.IGNORECASE):
            is_market_image = True
        # Check if it's a footer image (alt="Polymarket" or in footer section)
        if 'alt="Polymarket"' in full_tag or 'alt=\'Polymarket\'' in full_tag:
            is_footer_or_top_news = True
        # Check if it's a top news image (inside top_news_box or alt="Top news image")
        if 'alt="Top news image"' in full_tag or 'alt=\'Top news image\'' in full_tag:
            is_footer_or_top_news = True
        # Check if it's a last word image (inside last_word_box or alt="Last word image")
        if 'alt="Last word image"' in full_tag or 'alt=\'Last word image\'' in full_tag:
            is_footer_or_top_news = True
        if not is_footer_or_top_news:
            # Check if it's inside a top_news_box or last_word_box by looking for the class in nearby HTML
            # NOTE(review): this searches the ORIGINAL html_content captured by
            # the closure (not the partially substituted output), and .find()
            # locates the FIRST occurrence of this exact tag — duplicate
            # identical <img> tags would all be classified by the first one's
            # context. Confirm duplicates cannot occur in practice.
            img_pos = html_content.find(full_tag)
            if img_pos != -1:
                # Check if there's a top_news_box or last_word_box before this img tag (within 1000 chars)
                before_img = html_content[max(0, img_pos - 1000):img_pos]
                # Look for top_news_box or last_word_box class (handle both quoted and unquoted, with/without spaces)
                if re.search(r'class\s*=\s*["\']?[^"\'>]*(top_news_box|last_word_box)', before_img, re.IGNORECASE):
                    is_footer_or_top_news = True
        # Only add transformations to market images (48px square images)
        if is_market_image and not is_footer_or_top_news:
            # Extract target size from width attribute or style
            target_size = 48  # Default for market images
            # Try to extract from width attribute
            width_match = re.search(r'width=["\'](\d+)["\']', full_tag, re.IGNORECASE)
            if width_match:
                target_size = int(width_match.group(1))
            else:
                # Try to extract from style
                style_match = re.search(r'width\s*:\s*(\d+)px', full_tag, re.IGNORECASE)
                if style_match:
                    target_size = int(style_match.group(1))
            # Add transformations to the URL
            # c_fill crops to a square of target_size; q_auto/f_auto let
            # Cloudinary pick quality and format.
            transformation = f"w_{target_size},h_{target_size},c_fill,g_center,q_auto,f_auto"
            # Insert transformations into the URL
            if "/upload/" in url:
                parts = url.split("/upload/", 1)
                if len(parts) == 2:
                    path_part = parts[1]
                    new_url = f"{parts[0]}/upload/{transformation}/{path_part}"
                    modified = True
                    fixed_count += 1
                    return f"{prefix}{new_url}{suffix}"
        return full_tag  # No changes needed
    # Process all img tags
    html_content = re.sub(pattern, add_transformations, html_content, flags=re.IGNORECASE)
    if modified:
        print(f" Fixed {fixed_count} Cloudinary image URL(s) by adding transformations")
    else:
        print(f" No Cloudinary images found that need transformation fixes")
    return html_content
def fix_section_header_alignment(html_content: str) -> str:
    """Restore section header tables to the simplest, most compatible 2-column layout.

    Based on analysis of working emails (original_msg.eml), the most reliable pattern is:
    - Simple 2-column table with width="100%"
    - Second cell has align="right" attribute
    - NO table-layout:fixed
    - NO explicit widths on cells
    - NO spacer cells or nested tables
    This matches the original template structure that worked on mobile.

    Returns the re-serialized soup when anything changed, otherwise the
    input string unchanged.
    """
    # Third-party dependency; imported lazily so the rest of the script
    # works even where bs4 isn't installed (as long as this step isn't hit).
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    modified = False
    wrap_h_divs = soup.find_all("div", class_="wrap_h")
    for wrap_h in wrap_h_divs:
        tables = wrap_h.find_all("table", role="presentation")
        for table in tables:
            # Identify candidate header table: exactly one row with 2 tds
            tr = table.find("tr")
            if not tr:
                continue
            tds = tr.find_all("td", recursive=False)
            # Must have exactly 2 columns
            if len(tds) != 2:
                continue
            first_td, second_td = tds[0], tds[1]
            # Ensure header table is full width via attribute
            if table.get("width") != "100%":
                table["width"] = "100%"
                modified = True
            # Strip table-layout from table style if present
            table_style = table.get("style", "")
            if table_style and "table-layout" in table_style.lower():
                new_table_style = re.sub(r"table-layout\s*:\s*[^;]+;?\s*", "", table_style, flags=re.IGNORECASE).strip("; ")
                table["style"] = new_table_style
                modified = True
            # Remove width/min-width/white-space/padding-right constraints on both cells
            for td in (first_td, second_td):
                if td.get("width"):
                    del td["width"]
                    modified = True
                td_style = td.get("style", "")
                if td_style:
                    original_style = td_style
                    td_style = re.sub(r"width\s*:\s*[^;]+;?\s*", "", td_style, flags=re.IGNORECASE)
                    td_style = re.sub(r"min-width\s*:\s*[^;]+;?\s*", "", td_style, flags=re.IGNORECASE)
                    td_style = re.sub(r"white-space\s*:\s*[^;]+;?\s*", "", td_style, flags=re.IGNORECASE)
                    td_style = re.sub(r"padding-right\s*:\s*[^;]+;?\s*", "", td_style, flags=re.IGNORECASE)
                    td_style = td_style.strip("; ")
                    if td_style != original_style:
                        td["style"] = td_style
                        modified = True
            # Ensure second cell has align="right" attribute
            if second_td.get("align") != "right":
                second_td["align"] = "right"
                modified = True
            # Add text-align:right as fallback in inline style
            second_td_style = second_td.get("style", "")
            if "text-align" not in second_td_style.lower():
                # Clean up the style first - remove trailing semicolons and spaces
                second_td_style = second_td_style.strip().rstrip(";").strip()
                # Add text-align:right
                if second_td_style:
                    second_td_style += ";text-align:right"
                else:
                    second_td_style = "text-align:right"
                second_td["style"] = second_td_style
                modified = True
    if modified:
        # NOTE(review): str(soup) re-serializes the WHOLE document via
        # html.parser, which may normalize attribute quoting/void tags
        # elsewhere in the email — confirm this is acceptable downstream.
        print(" Restored section header tables to simple 2-column layout for mobile")
        return str(soup)
    print(" No section header tables found that need alignment fixes")
    return html_content
def add_link_spacing(html_content: str) -> str:
    """Add spaces before specific links to ensure proper spacing in email clients.

    This function works on the final HTML string and adds spaces before:
    - "Get Priority Access" links
    - "support@polymarket.com" links
    - "unsubscribe here" links
    """
    # Each rule: (pattern matching a non-space char glued to the link, log line).
    # Applied in order; order matters because earlier substitutions can
    # change what later patterns see.
    rules = [
        (r'([^\s&>])(<a[^>]*>.*?Get Priority Access)',
         " Added space before 'Get Priority Access' link"),
        (r'([^\s&>])(<a[^>]*mailto[^>]*>.*?support@polymarket\.com)',
         " Added space before 'support@polymarket.com' link"),
        (r'([^\s&>])(<a[^>]*>.*?unsubscribe here)',
         " Added space before 'unsubscribe here' link"),
        (r'([^\s&>])(<a[^>]*href[^>]*manage_subscription[^>]*>.*?here)',
         " Added space before 'unsubscribe here' link (manage_subscription)"),
    ]
    flags = re.IGNORECASE | re.DOTALL
    touched = False
    for pattern, message in rules:
        if re.search(pattern, html_content, flags):
            html_content = re.sub(pattern, r'\1 \2', html_content, flags=flags)
            touched = True
            print(message)
    if not touched:
        print(" No links found that need spacing")
    return html_content
def minify_html(html_content: str) -> str:
    """Apply conservative minification steps.

    This function is designed to reduce file size for Gmail while
    preserving all formatting attributes that email clients need.
    Runs the fix-up pipeline in a fixed order and reports size deltas.
    """
    size_before = len(html_content)
    print(f" Starting minification (original size: {size_before:,} bytes)")

    # Step 1: Remove @font-face style tags (email clients don't support them)
    html_content = remove_font_face_styles(html_content)
    size_now = len(html_content)
    if size_now != size_before:
        print(f" After removing @font-face: {size_now:,} bytes (saved {size_before - size_now:,} bytes)")

    # Step 2: Minify whitespace (conservative - only removes safe whitespace)
    html_content = minify_whitespace(html_content)
    print(f" After minifying whitespace: {len(html_content):,} bytes")

    # Step 3: Fix header centering (add margin:0 auto if missing)
    html_content = fix_header_centering(html_content)
    if 'margin:0 auto' in html_content:
        print(f" Fixed header centering (added margin:0 auto)")

    # Step 4: Fix story box headlines (add margin-top:0 to prevent empty line)
    html_content = fix_story_box_headlines(html_content)
    if "margin-top:0" in html_content:
        print(f" Fixed story box headlines (added margin-top:0)")

    # Step 5: Fix unsubscribe link href (update from # to {% manage_subscription_preferences_url %})
    html_content = fix_unsubscribe_link(html_content)
    # Step 6: Fix Cloudinary image URLs (add transformations to market images)
    html_content = fix_cloudinary_image_transformations(html_content)
    # Step 7: Fix section header alignment for mobile
    html_content = fix_section_header_alignment(html_content)
    # Step 8: Add spaces before links (ensures proper spacing in email clients)
    html_content = add_link_spacing(html_content)

    size_after = len(html_content)
    saved = size_before - size_after
    saved_pct = (saved / size_before * 100) if size_before > 0 else 0
    print(f" Final size: {size_after:,} bytes (saved {saved:,} bytes, {saved_pct:.1f}%)")
    return html_content
| # ============================================================================= | |
| # VALIDATION | |
| # ============================================================================= | |
def validate_html(html_content: str) -> list:
    """Validate the HTML structure and return any errors.

    Entries prefixed with "Warning:" are advisory; the rest are treated as
    real errors by the caller. An empty list means the document passed.
    """
    errors = []
    # Required structural tokens, checked in a fixed order.
    required = [
        ('<html', "Missing <html> tag"),
        ('</html>', "Missing </html> closing tag"),
        ('<head', "Missing <head> tag"),
        ('<body', "Missing <body> tag"),
    ]
    for token, message in required:
        if token not in html_content:
            errors.append(message)
    # Check for balanced tables
    table_opens = html_content.count('<table')
    table_closes = html_content.count('</table>')
    if table_opens != table_closes:
        errors.append(f"Unbalanced tables: {table_opens} opens, {table_closes} closes")
    # Check container structure
    if 'max-width:500px' not in html_content:
        errors.append("Container max-width missing")
    # Check that we preserved important attributes
    if 'role="none"' not in html_content:
        errors.append("Warning: role='none' attributes missing (may affect table alignment)")
    if 'cellpadding="0"' not in html_content:
        errors.append("Warning: quoted cellpadding attributes missing (may affect email clients)")
    return errors
| # ============================================================================= | |
| # MAIN | |
| # ============================================================================= | |
def main():
    """CLI entry point.

    Reads the input HTML file, converts Polymarket links to Dub tracking
    links, applies conservative minification, validates the result, checks
    the Gmail clipping threshold, and writes the processed HTML to the
    output file.

    argv: input_html_file, dub_api_key, [output_html_file]
    Exits with status 1 when required arguments are missing.
    """
    if len(sys.argv) < 3:
        print("Usage: python post_process.py <input_html_file> <dub_api_key> [output_html_file]")
        print("\nThis script:")
        print(" 1. Converts Polymarket links to Dub tracking links")
        print(" 2. Minifies HTML for Gmail compatibility (conservative)")
        print("\nArguments:")
        print(" input_html_file: Path to input HTML file")
        print(" dub_api_key: Dub API key for link tracking")
        print(" output_html_file: (Optional) Path to output file (default: input_final.html)")
        print("\nConservative minification preserves:")
        print(" - role='none' attributes (table alignment)")
        print(" - Quoted attribute values (email client compatibility)")
        print(" - font-style properties (even non-standard ones)")
        sys.exit(1)
    input_file = sys.argv[1]
    dub_api_key = sys.argv[2]
    # Default output: replace .html with _final.html
    # NOTE(review): str.replace swaps the FIRST '.html' anywhere in the
    # path, not just the suffix — confirm inputs never contain '.html'
    # mid-path.
    if len(sys.argv) > 3:
        output_file = sys.argv[3]
    else:
        output_file = input_file.replace('.html', '_final.html')
    # Read input
    print(f"Reading {input_file}...")
    with open(input_file, 'r', encoding='utf-8') as f:
        html_content = f.read()
    original_size = len(html_content)
    print(f" Original size: {original_size:,} bytes\n")
    # Step 1: Convert Polymarket links to Dub tracking links
    print("Step 1: Converting Polymarket links to Dub tracking links...")
    html_content = convert_polymarket_links(html_content, api_key=dub_api_key)
    print()
    # Step 2: Minify HTML (conservative)
    print("Step 2: Minifying HTML for Gmail (conservative)...")
    html_content = minify_html(html_content)
    print()
    # Validate
    print("Validating HTML structure...")
    errors = validate_html(html_content)
    # validate_html mixes advisory "Warning:" entries with hard errors;
    # split them so the report distinguishes the two.
    warnings = [e for e in errors if e.startswith("Warning:")]
    real_errors = [e for e in errors if not e.startswith("Warning:")]
    if real_errors:
        print(" ERRORS:")
        for e in real_errors:
            print(f" - {e}")
    if warnings:
        print(" Warnings:")
        for w in warnings:
            print(f" - {w}")
    if not errors:
        print(" Structure OK")
    # Check Gmail compatibility
    # Gmail clips messages at ~102 KB; warn (but still write) above that.
    final_size = len(html_content)
    gmail_limit = 102000
    if final_size < gmail_limit:
        print(f" Gmail: OK ({final_size:,} < {gmail_limit:,} bytes)")
    else:
        print(f" Gmail: WARNING - may be clipped ({final_size:,} >= {gmail_limit:,} bytes)")
    print()
    # Write output
    print(f"Writing to {output_file}...")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html_content)
    print(f"\nDone! Saved to {output_file}")
    print(f" Original: {original_size:,} bytes")
    print(f" Final: {final_size:,} bytes")
    # NOTE(review): assumes the input file is non-empty — a zero-byte input
    # would raise ZeroDivisionError in the percentage below; confirm
    # upstream guarantees.
    print(f" Saved: {original_size - final_size:,} bytes ({((original_size - final_size) / original_size * 100):.1f}%)")
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()