priyadip committed on
Commit
b572903
·
verified ·
1 Parent(s): 5b0602c

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +267 -0
app.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HTML → JPG / PNG / PDF Exporter
3
+ Accepts any uploaded HTML file, auto-detects its exact pixel dimensions,
4
+ and exports pixel-perfect PNG, JPG, and PDF — nothing discarded.
5
+ """
6
+
7
+ import base64
8
+ import os
9
+ import re
10
+ import shutil
11
+ import subprocess
12
+ import sys
13
+ import tempfile
14
+ import urllib.request
15
+ from pathlib import Path
16
+
17
+ import gradio as gr
18
+ from PIL import Image
19
+
20
+
21
+ # ── One-time Playwright Chromium install ─────────────────────────────────────
22
def _install_chromium():
    """Install Playwright's Chromium build together with its system deps.

    Runs ``python -m playwright install --with-deps chromium`` using the
    current interpreter and prints the last 400 characters of the installer
    output.  A failed install is reported but not raised, so the module can
    still be imported when a browser is already present.
    """
    print("[setup] Installing Playwright Chromium + system deps …")
    result = subprocess.run(
        [sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        # On failure the reason is on stderr; preferring stdout would hide it.
        tail = result.stderr[-400:] or result.stdout[-400:]
        print(f"[setup] playwright install failed (exit {result.returncode}):", tail)
    else:
        print("[setup]", result.stdout[-400:] or result.stderr[-400:])


# Runs once at import time so the browser exists before the first request.
_install_chromium()
31
+
32
+
33
+ # ── Image inliner: download every external <img src> β†’ base64 data URI ───────
34
+ def _inline_images(html: str) -> str:
35
+ """
36
+ Replaces every src="https://..." inside <img> tags with a base64
37
+ data URI so Playwright never needs to make CDN requests during render.
38
+ CDNs (e.g. simpleicons.org) often block headless Chromium; inlining
39
+ eliminates that problem entirely.
40
+ """
41
+ headers = {
42
+ "User-Agent": (
43
+ "Mozilla/5.0 (X11; Linux x86_64) "
44
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
45
+ "Chrome/124.0.0.0 Safari/537.36"
46
+ )
47
+ }
48
+
49
+ def replace(match):
50
+ url = match.group(1)
51
+ try:
52
+ req = urllib.request.Request(url, headers=headers)
53
+ with urllib.request.urlopen(req, timeout=15) as resp:
54
+ data = resp.read()
55
+ ctype = resp.headers.get_content_type() or "image/svg+xml"
56
+ b64 = base64.b64encode(data).decode()
57
+ print(f"[inline] OK {url[:70]}")
58
+ return f'src="data:{ctype};base64,{b64}"'
59
+ except Exception as exc:
60
+ print(f"[inline] FAIL {url[:70]} β€” {exc}")
61
+ return match.group(0) # leave original src on failure
62
+
63
+ # Match src="http..." inside any tag (covers <img> and background src attrs)
64
+ return re.sub(r'src="(https?://[^"]+)"', replace, html)
65
+
66
+
67
+ # ── Core renderer ─────────────────────────────────────────────────────────────
68
def _render_to_png(html_path: str, out_png: str) -> tuple[int, int]:
    """Render an HTML file in headless Chromium and capture it as a PNG.

    Steps:
      1. Read the HTML and inline external ``src`` resources with
         ``_inline_images``; the result is written next to the input as
         ``<html_path>.inlined.html``.
      2. Load that file in headless Chromium using an 8192×8192 viewport.
      3. Measure the first child element of ``<body>`` (computed CSS size,
         then offset size), falling back to the document's scroll size.
      4. Resize the viewport to the measured size and take the screenshot.

    Args:
        html_path: Path of the HTML file to render.
        out_png: Path the PNG screenshot is written to.

    Returns:
        ``(width, height)`` of the captured image in pixels, each at least 1.
    """
    from playwright.sync_api import sync_playwright

    # Step 1: inline all external images.
    raw_html = Path(html_path).read_text(encoding="utf-8", errors="replace")
    inlined = _inline_images(raw_html)

    inlined_path = Path(html_path + ".inlined.html")
    inlined_path.write_text(inlined, encoding="utf-8")

    # as_uri() percent-encodes spaces, '#', '?' etc. and needs an absolute
    # path, which resolve() guarantees.
    url = inlined_path.resolve().as_uri()

    # Step 2: render.
    with sync_playwright() as p:
        browser = p.chromium.launch(args=["--no-sandbox", "--disable-dev-shm-usage"])
        try:
            page = browser.new_page(viewport={"width": 8192, "height": 8192})

            page.goto(url)
            # With images inlined, "networkidle" mostly waits for remaining
            # network resources such as web-font stylesheets.
            page.wait_for_load_state("networkidle", timeout=30_000)

            # Wait for any remaining <img> (e.g. failed inlines kept as http src).
            page.evaluate("""async () => {
                const imgs = Array.from(document.querySelectorAll('img'));
                await Promise.all(imgs.map(img => {
                    if (img.complete) return Promise.resolve();
                    return new Promise(resolve => {
                        img.addEventListener('load', resolve);
                        img.addEventListener('error', resolve);
                    });
                }));
            }""")

            # Give web fonts time to finish painting glyphs.
            page.wait_for_timeout(2_500)

            # Step 3: detect the exact content size.
            dims = page.evaluate("""() => {
                const el = document.body && document.body.firstElementChild;
                if (el) {
                    const s = window.getComputedStyle(el);
                    const sw = parseFloat(s.width);
                    const sh = parseFloat(s.height);
                    if (sw > 0 && sh > 0) return { w: Math.round(sw), h: Math.round(sh) };
                    if (el.offsetWidth > 0)
                        return { w: el.offsetWidth, h: el.offsetHeight };
                }
                return {
                    w: document.documentElement.scrollWidth,
                    h: document.documentElement.scrollHeight,
                };
            }""")

            W = max(int(dims["w"]), 1)
            H = max(int(dims["h"]), 1)

            # Step 4: resize viewport, let the page repaint, then screenshot.
            page.set_viewport_size({"width": W, "height": H})
            page.wait_for_timeout(1_000)

            page.screenshot(
                path=out_png,
                full_page=False,
                clip={"x": 0, "y": 0, "width": W, "height": H},
                type="png",
            )
        finally:
            # Always release the browser process, even when a step above raised.
            browser.close()

    return W, H
145
+
146
+
147
+ # ── Format converters ─────────────────────────────────────────────────────────
148
def _to_jpg(png: str, jpg: str, quality: int = 95):
    """Convert a PNG file to an RGB JPEG.

    Args:
        png: Path of the source PNG.
        jpg: Path the JPEG is written to.
        quality: JPEG quality passed to the encoder.

    The image is saved with ``subsampling=0`` (no chroma subsampling).
    """
    # The context manager closes the source file handle once saving is done.
    with Image.open(png) as source:
        source.convert("RGB").save(jpg, "JPEG", quality=quality, subsampling=0)
152
+
153
+
154
def _to_pdf(png: str, pdf: str, dpi: int = 150):
    """Embed a PNG into a single-page PDF.

    Args:
        png: Path of the source PNG.
        pdf: Path the PDF is written to.
        dpi: Resolution passed to the PDF writer; per the UI text it sets the
            declared print size only, not the pixel count.
    """
    # The context manager closes the source file handle once saving is done.
    with Image.open(png) as source:
        source.convert("RGB").save(
            pdf, "PDF", resolution=float(dpi), save_all=False
        )
159
+
160
+
161
+ # ── Gradio handler ────────────────────────────────────────────────────────────
162
# Radio-button label -> PDF resolution in DPI.  The labels double as the
# radio choices in the UI, so their text must stay in sync with the default
# value configured there.
DPI_MAP = {
    "72 DPI β€” screen / web": 72,
    "96 DPI β€” standard screen": 96,
    "150 DPI β€” print (default)": 150,
    "200 DPI β€” high quality print": 200,
    "300 DPI β€” press / poster": 300,
}
169
+
170
def process(html_file, dpi_label, progress=gr.Progress()):
    """Gradio click handler: convert an uploaded HTML file to PNG, JPG and PDF.

    Args:
        html_file: Path of the uploaded HTML file, or ``None`` when nothing
            has been uploaded.
        dpi_label: Key of ``DPI_MAP`` selecting the PDF resolution.
        progress: Gradio progress tracker used to report each stage.

    Returns:
        ``(png, png, jpg, pdf, info)``: the PNG path twice (preview and
        download), the JPG path, the PDF path, and a Markdown summary of the
        detected size and file sizes.

    Raises:
        gr.Error: If no file was uploaded, the DPI label is unknown, or the
            conversion fails.
    """
    if html_file is None:
        raise gr.Error("Please upload an HTML file first.")
    if dpi_label not in DPI_MAP:
        raise gr.Error(f"Unknown DPI option: {dpi_label!r}")

    dpi = DPI_MAP[dpi_label]
    work = tempfile.mkdtemp(prefix="html2img_")

    png = os.path.join(work, "export.png")
    jpg = os.path.join(work, "export.jpg")
    pdf = os.path.join(work, "export.pdf")

    try:
        html_dest = os.path.join(work, "page.html")
        shutil.copy(str(html_file), html_dest)

        progress(0.05, desc="Downloading & inlining external images …")
        W, H = _render_to_png(html_dest, png)
        progress(0.70, desc=f"Captured {W} × {H} px — building outputs …")

        _to_jpg(png, jpg)
        progress(0.85, desc="JPG done — converting to PDF …")

        _to_pdf(png, pdf, dpi=dpi)
        progress(1.00, desc="All three files ready!")

        info = (
            f"**Detected size:** {W} × {H} px &nbsp;·&nbsp; "
            f"PNG {os.path.getsize(png)//1024} KB &nbsp;·&nbsp; "
            f"JPG {os.path.getsize(jpg)//1024} KB &nbsp;·&nbsp; "
            f"PDF {os.path.getsize(pdf)//1024} KB"
        )
    except Exception as exc:
        # Nothing usable was produced: drop the temp dir instead of leaking
        # it, and surface the cause in the UI.
        shutil.rmtree(work, ignore_errors=True)
        raise gr.Error(f"Conversion failed: {exc}") from exc

    return png, png, jpg, pdf, info
201
+
202
+
203
+ # ── UI ────────────────────────────────────────────────────────────────────────
204
with gr.Blocks(title="HTML → JPG / PNG / PDF") as demo:

    gr.Markdown(
        """
        # 🖼 HTML → JPG · PNG · PDF Exporter
        Upload **any HTML file**. The app auto-detects its exact pixel size from
        the rendered layout and exports three pixel-perfect files — nothing cropped or discarded.
        """
    )

    with gr.Row():
        # Left column: inputs and status text.
        with gr.Column(scale=1):
            html_upload = gr.File(
                label="📂 Upload HTML file",
                file_types=[".html", ".htm"],
                type="filepath",
            )
            dpi_radio = gr.Radio(
                choices=list(DPI_MAP.keys()),
                # Look the default label up by value so it cannot drift from
                # the keys of DPI_MAP.
                value=next(
                    (label for label, dpi in DPI_MAP.items() if dpi == 150), None
                ),
                label="PDF DPI (only affects physical print size, not pixel count)",
            )
            run_btn = gr.Button("🚀 Convert", variant="primary", size="lg")
            info_md = gr.Markdown("_Upload an HTML file and click Convert._")

        # Right column: rendered preview.
        with gr.Column(scale=2):
            preview = gr.Image(
                label="Preview (PNG — full resolution)",
                elem_id="preview",
                interactive=False,
            )

    gr.Markdown("### ⬇ Download exports")
    with gr.Row():
        out_png = gr.File(label="📥 PNG (lossless)")
        out_jpg = gr.File(label="📥 JPG (Q95, near-lossless)")
        out_pdf = gr.File(label="📥 PDF (chosen DPI, all pixels kept)")

    gr.Markdown(
        """
        <details>
        <summary>ℹ️ How it works</summary>

        | Step | What happens |
        |------|-------------|
        | 1. Inline images | Every `src="https://..."` in the HTML is downloaded and replaced with a base64 data URI — no CDN calls during render |
        | 2. Detect size | Chromium loads the modified HTML in an 8192 × 8192 viewport; the computed CSS size of the top-level element is read via JS |
        | 3. Screenshot | Viewport is resized to exactly that size and a lossless PNG is captured |
        | 4. JPG | Converted from PNG at quality 95, no chroma subsampling |
        | 5. PDF | PNG pixels embedded into a PDF page; DPI sets declared print size only |

        **Works with:** inline CSS/JS, CDN fonts/icons, SVG, web fonts, gradients.
        </details>
        """
    )

    # Output order matches the tuple returned by process().
    run_btn.click(
        fn=process,
        inputs=[html_upload, dpi_radio],
        outputs=[preview, out_png, out_jpg, out_pdf, info_md],
    )

if __name__ == "__main__":
    demo.launch()