Spaces:

priyadip
/

HTML_to_JPG_PNG_PDF

Sleeping

App Files Files Community

priyadip commited on Mar 11

Commit

b572903

verified ·

1 Parent(s): 5b0602c

Upload app.py

Browse files

Files changed (1) hide show

app.py +267 -0

app.py ADDED Viewed

	@@ -0,0 +1,267 @@

+"""
+HTML → JPG / PNG / PDF Exporter
+Accepts any uploaded HTML file, auto-detects its exact pixel dimensions,
+and exports pixel-perfect PNG, JPG, and PDF — nothing discarded.
+"""
+import base64
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+import urllib.request
+from pathlib import Path
+import gradio as gr
+from PIL import Image
+# ── One-time Playwright Chromium install ─────────────────────────────────────
def _install_chromium():
    """Install Playwright's Chromium build together with its system packages.

    Runs ``python -m playwright install --with-deps chromium`` using the
    current interpreter and prints the tail of the installer output.

    A non-zero exit status is reported explicitly (exit code plus the tail of
    stderr) instead of being hidden behind whatever happened to be on stdout.
    The failure is reported rather than raised, so importing this module never
    aborts because of the installer.
    """
    print("[setup] Installing Playwright Chromium + system deps …")
    r = subprocess.run(
        [sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"],
        capture_output=True, text=True,
    )
    if r.returncode != 0:
        # On failure the useful diagnostics are on stderr; prefer it.
        print(f"[setup] install FAILED (exit code {r.returncode})")
        print("[setup]", r.stderr[-400:] or r.stdout[-400:])
        return
    print("[setup]", r.stdout[-400:] or r.stderr[-400:])


# Executed once at import time, before any rendering code can run.
_install_chromium()
+# ── Image inliner: download every external <img src> → base64 data URI ───────
def _inline_images(html: str, *, fetch=None) -> str:
    """Replace external ``src`` URLs in *html* with base64 ``data:`` URIs.

    Every ``src="http(s)://..."`` or ``src='http(s)://...'`` attribute value is
    fetched and substituted in place, so the renderer does not have to request
    it over the network. The pattern is purely textual: it is not restricted
    to ``<img>`` and matches any attribute text ending in ``src=``.

    Args:
        html: HTML markup to rewrite.
        fetch: Optional callable ``fetch(url) -> str`` that returns a complete
            ``data:`` URI for *url* and raises on failure. When omitted, the
            URL is downloaded over HTTP(S) with a desktop-browser User-Agent
            and a 15 second timeout.

    Returns:
        The markup with each successfully fetched URL inlined. An attribute
        whose fetch raised is left exactly as it was written.
    """
    # Function-scope import: the `html` parameter shadows the module name.
    from html import unescape as _unescape

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        )
    }

    def _download(url: str) -> str:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = resp.read()
            # get_content_type() never returns an empty value (it defaults to
            # text/plain), so the SVG fallback has to key off the raw header.
            if resp.headers.get("Content-Type"):
                ctype = resp.headers.get_content_type()
            else:
                ctype = "image/svg+xml"
        return f"data:{ctype};base64,{base64.b64encode(data).decode()}"

    fetch_uri = _download if fetch is None else fetch

    # Decoded URL -> data URI, or None when the fetch failed. Repeated
    # references to the same resource are fetched (and logged) only once.
    cache: dict[str, str | None] = {}

    pattern = re.compile(
        r'src="(https?://[^"]+)"'    # double-quoted value -> group 1
        r"|src='(https?://[^']+)'"   # single-quoted value -> group 2
    )

    def replace(match):
        if match.group(1) is not None:
            quote, raw = '"', match.group(1)
        else:
            quote, raw = "'", match.group(2)
        # Attribute values are HTML-escaped (e.g. `&amp;` in query strings);
        # decode before using the text as a real URL.
        url = _unescape(raw)
        if url not in cache:
            try:
                cache[url] = fetch_uri(url)
                print(f"[inline] OK  {url[:70]}")
            except Exception as exc:  # best effort: keep the original src
                cache[url] = None
                print(f"[inline] FAIL {url[:70]} — {exc}")
        data_uri = cache[url]
        if data_uri is None:
            return match.group(0)
        return f"src={quote}{data_uri}{quote}"

    return pattern.sub(replace, html)
+# ── Core renderer ─────────────────────────────────────────────────────────────
def _render_to_png(html_path: str, out_png: str) -> tuple[int, int]:
    """Render an HTML file in headless Chromium and save a PNG screenshot.

    Steps:
      1. Read the HTML, inline external ``src`` URLs via ``_inline_images``
         and write the result next to the input as ``<html_path>.inlined.html``.
      2. Load that file in headless Chromium with an 8192×8192 viewport.
      3. Measure the first element child of ``<body>`` (computed CSS size,
         then offset size), falling back to the document scroll size.
      4. Resize the viewport to the measured size and screenshot that area.

    Args:
        html_path: Path of the HTML file to render.
        out_png: Path the PNG screenshot is written to.

    Returns:
        ``(width, height)`` of the captured image in pixels, each at least 1.
    """
    from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
    from playwright.sync_api import sync_playwright

    # ── Step 1: inline all external images ───────────────────────────────────
    raw_html = Path(html_path).read_text(encoding="utf-8", errors="replace")
    inlined_path = Path(html_path + ".inlined.html")
    inlined_path.write_text(_inline_images(raw_html), encoding="utf-8")
    # as_uri() percent-encodes spaces, '#', '?' … and needs an absolute path.
    url = inlined_path.resolve().as_uri()

    # ── Step 2: render ────────────────────────────────────────────────────────
    with sync_playwright() as p:
        browser = p.chromium.launch(args=["--no-sandbox", "--disable-dev-shm-usage"])
        try:
            page = browser.new_page(viewport={"width": 8192, "height": 8192})
            page.goto(url)
            try:
                page.wait_for_load_state("networkidle", timeout=30_000)
            except PlaywrightTimeoutError:
                # Pages holding a connection open never go idle; the document
                # itself is loaded, so capture it anyway instead of failing.
                print("[render] network not idle after 30 s — continuing")

            # Wait for any remaining <img> (e.g. failed inlines kept as http src)
            page.evaluate("""async () => {
            const imgs = Array.from(document.querySelectorAll('img'));
            await Promise.all(imgs.map(img => {
                if (img.complete) return Promise.resolve();
                return new Promise(resolve => {
                    img.addEventListener('load',  resolve);
                    img.addEventListener('error', resolve);
                });
            }));
        }""")
            # Fixed grace period for late-loading web fonts to finish painting.
            page.wait_for_timeout(2_500)

            # ── Step 3: detect exact content size ────────────────────────────
            dims = page.evaluate("""() => {
            const el = document.body && document.body.firstElementChild;
            if (el) {
                const s  = window.getComputedStyle(el);
                const sw = parseFloat(s.width);
                const sh = parseFloat(s.height);
                if (sw > 0 && sh > 0) return { w: Math.round(sw), h: Math.round(sh) };
                if (el.offsetWidth > 0)
                    return { w: el.offsetWidth, h: el.offsetHeight };
            }
            return {
                w: document.documentElement.scrollWidth,
                h: document.documentElement.scrollHeight,
            };
        }""")
            width = max(int(dims["w"]), 1)
            height = max(int(dims["h"]), 1)

            # ── Step 4: resize viewport → repaint → screenshot ───────────────
            page.set_viewport_size({"width": width, "height": height})
            page.wait_for_timeout(1_000)  # give the page time to repaint
            page.screenshot(
                path=out_png,
                full_page=False,
                clip={"x": 0, "y": 0, "width": width, "height": height},
                type="png",
            )
        finally:
            # Close the browser even when a step above raised.
            browser.close()
    return width, height
+# ── Format converters ─────────────────────────────────────────────────────────
def _to_jpg(png: str, jpg: str, quality: int = 95):
    """Convert the PNG at *png* to a JPEG written to *jpg*.

    The image is converted to RGB and saved with the given *quality* and
    ``subsampling=0`` (no chroma subsampling).

    Args:
        png: Path of the source PNG.
        jpg: Path of the JPEG to write.
        quality: JPEG quality setting passed to Pillow.
    """
    # Image.open keeps the file handle open; the context manager releases it.
    with Image.open(png) as source:
        source.convert("RGB").save(
            jpg, "JPEG", quality=quality, subsampling=0
        )
def _to_pdf(png: str, pdf: str, dpi: int = 150):
    """Embed the PNG at *png* as a single-page PDF written to *pdf*.

    All pixels are kept; *dpi* is passed as the PDF ``resolution`` and so only
    sets the declared print size of the page.

    Args:
        png: Path of the source PNG.
        pdf: Path of the PDF to write.
        dpi: Resolution declared for the embedded image.
    """
    # Image.open keeps the file handle open; the context manager releases it.
    with Image.open(png) as source:
        source.convert("RGB").save(
            pdf, "PDF", resolution=float(dpi), save_all=False
        )
+# ── Gradio handler ────────────────────────────────────────────────────────────
# Radio-button label → DPI value passed to the PDF writer. The labels are
# shown verbatim in the UI and used as lookup keys, so their exact spelling
# (including the double space after two-digit values) matters.
DPI_MAP: dict[str, int] = {
    "72  DPI — screen / web": 72,
    "96  DPI — standard screen": 96,
    "150 DPI — print (default)": 150,
    "200 DPI — high quality print": 200,
    "300 DPI — press / poster": 300,
}
def process(html_file, dpi_label, progress=gr.Progress()):
    """Convert an uploaded HTML file to PNG, JPG and PDF.

    Args:
        html_file: Path of the uploaded HTML file, or ``None`` when nothing
            was uploaded.
        dpi_label: One of the keys of ``DPI_MAP``; selects the PDF DPI.
        progress: Gradio progress tracker. The ``gr.Progress()`` default is
            kept in the signature on purpose — presumably Gradio injects the
            live tracker through it; confirm against the Gradio docs.

    Returns:
        ``(png, png, jpg, pdf, info)``: the PNG path twice (preview and
        download), the JPG path, the PDF path, and a Markdown summary of the
        detected size and file sizes.

    Raises:
        gr.Error: if no file was uploaded, the DPI label is unknown, or the
            rendering / conversion fails.
    """
    if html_file is None:
        raise gr.Error("Please upload an HTML file first.")
    dpi = DPI_MAP.get(dpi_label)
    if dpi is None:
        raise gr.Error("Please choose one of the listed PDF DPI options.")

    work = tempfile.mkdtemp(prefix="html2img_")
    html_dest = os.path.join(work, "page.html")
    png = os.path.join(work, "export.png")
    jpg = os.path.join(work, "export.jpg")
    pdf = os.path.join(work, "export.pdf")

    try:
        shutil.copy(str(html_file), html_dest)
        progress(0.05, desc="Downloading & inlining external images …")
        W, H = _render_to_png(html_dest, png)
        progress(0.70, desc=f"Captured {W} × {H} px — building outputs …")
        _to_jpg(png, jpg)
        progress(0.85, desc="JPG done — converting to PDF …")
        _to_pdf(png, pdf, dpi=dpi)
        progress(1.00, desc="All three files ready!")
        info = (
            f"**Detected size:** {W} × {H} px &nbsp;·&nbsp; "
            f"PNG {os.path.getsize(png)//1024} KB &nbsp;·&nbsp; "
            f"JPG {os.path.getsize(jpg)//1024} KB &nbsp;·&nbsp; "
            f"PDF {os.path.getsize(pdf)//1024} KB"
        )
    except Exception as exc:
        # Nothing usable was produced: drop the working directory and show a
        # readable message in the UI instead of a raw traceback.
        shutil.rmtree(work, ignore_errors=True)
        raise gr.Error(f"Conversion failed: {exc}") from exc

    # On success the working directory is left in place because the returned
    # paths point into it.
    return png, png, jpg, pdf, info
+# ── UI ────────────────────────────────────────────────────────────────────────
# Markdown shown at the top of the page (text kept exactly as rendered).
_INTRO_MD = """
        # 🖼 HTML → JPG · PNG · PDF Exporter
        Upload **any HTML file**. The app auto-detects its exact pixel size from
        the rendered layout and exports three pixel-perfect files — nothing cropped or discarded.
        """

# Collapsible explanation shown below the download row.
_HOW_IT_WORKS_MD = """
        <details>
        <summary>ℹ️ How it works</summary>
        | Step | What happens |
        |------|-------------|
        | 1. Inline images | Every `src="https://..."` in the HTML is downloaded and replaced with a base64 data URI — no CDN calls during render |
        | 2. Detect size | Chromium loads the modified HTML in an 8192 × 8192 viewport; the computed CSS size of the top-level element is read via JS |
        | 3. Screenshot | Viewport is resized to exactly that size and a lossless PNG is captured |
        | 4. JPG | Converted from PNG at quality 95, no chroma subsampling |
        | 5. PDF | PNG pixels embedded into a PDF page; DPI sets declared print size only |
        **Works with:** inline CSS/JS, CDN fonts/icons, SVG, web fonts, gradients.
        </details>
        """

with gr.Blocks(title="HTML → JPG / PNG / PDF") as demo:
    gr.Markdown(_INTRO_MD)

    with gr.Row():
        # Left column: inputs, the Convert button and the result summary.
        with gr.Column(scale=1):
            html_upload = gr.File(
                label="📂 Upload HTML file",
                file_types=[".html", ".htm"],
                type="filepath",
            )
            dpi_radio = gr.Radio(
                choices=list(DPI_MAP),
                value="150 DPI — print (default)",
                label="PDF DPI  (only affects physical print size, not pixel count)",
            )
            run_btn = gr.Button("🚀  Convert", variant="primary", size="lg")
            info_md = gr.Markdown("_Upload an HTML file and click Convert._")

        # Right column: preview of the captured PNG.
        with gr.Column(scale=2):
            preview = gr.Image(
                label="Preview (PNG — full resolution)",
                elem_id="preview",
                interactive=False,
            )

    gr.Markdown("### ⬇ Download exports")
    with gr.Row():
        out_png = gr.File(label="📥 PNG  (lossless)")
        out_jpg = gr.File(label="📥 JPG  (Q95, near-lossless)")
        out_pdf = gr.File(label="📥 PDF  (chosen DPI, all pixels kept)")

    gr.Markdown(_HOW_IT_WORKS_MD)

    # The output order must match the tuple returned by `process`.
    run_btn.click(
        fn=process,
        inputs=[html_upload, dpi_radio],
        outputs=[preview, out_png, out_jpg, out_pdf, info_md],
    )

if __name__ == "__main__":
    demo.launch()