"""Gradio demo for distill-structure model."""
import json
import re
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
# Identifier handed to from_pretrained() for both the tokenizer and the model.
MODEL_ID = "nahidstaq/distill-structure"

# System prompt text. Adjacent literals are concatenated by the parser into a
# single line; each fragment carries its own trailing space where one is
# needed.
SYSTEM = (
    "You are an HTML structure analyzer. "
    "Given a compact DOM representation of a web page (with headings removed), "
    "identify the logical sections. "
    "Output a JSON array of sections, each with title, start_text, "
    "content_type, and assets fields."
)

# Module-level cache slots filled in by _load(); both stay None until then.
_model = _tokenizer = None
def _load():
    """Return the shared ``(model, tokenizer)`` pair, loading it on first use.

    On the first call the tokenizer and the causal-LM weights for ``MODEL_ID``
    are loaded with ``device_map="auto"``; the weights use bfloat16 when CUDA
    is available and float32 otherwise, and the model is put in eval mode.
    Subsequent calls return the objects cached in the module globals.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    global _model, _tokenizer

    # Fast path: everything is already in memory.
    if _model is not None:
        return _model, _tokenizer

    weights_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

    # The tokenizer is stored before the model is loaded; if the model load
    # raises, _model stays None and the next call goes through this path again.
    _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    _model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        dtype=weights_dtype,
        device_map="auto",
    )
    _model.eval()
    return _model, _tokenizer
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _compact_dom(html: str) -> str:
    """Render *html* as a compact, indented outline of its DOM.

    Headings (``h1``-``h6``), ``script``, ``style`` and ``head`` elements are
    dropped first. The tree below ``<body>`` (or below the parsed root when no
    ``<body>`` is found) is then written one element per line, indented by one
    space per nesting level:

    * ``img`` becomes a single ``<img>`` line carrying its ``alt`` text, if any.
    * ``a`` becomes a single line with its (truncated) ``href`` and text.
    * ``td`` / ``th`` recurse into element children while ``depth < 8``;
      otherwise they are reduced to their first 60 characters of text, and
      omitted when empty.
    * anything nested deeper than level 7 is collapsed to a ``[... text...]``
      summary of at most 80 characters.
    * every other element shows its tag, its ``id`` / ``class`` / ``role``
      attributes (30 characters each) and up to 50 characters of leading text,
      followed by its children.

    Args:
        html: Raw HTML markup.

    Returns:
        The outline, cut to 4096 characters followed by a ``... (truncated)``
        marker when it is longer. If lxml cannot parse the input, the first
        3000 characters of *html* are returned unchanged instead.
    """
    # Function-scope import: lxml is loaded on first call, not at module import.
    from lxml import html as lxml_html

    try:
        doc = lxml_html.fromstring(html)
    except Exception:
        # Deliberate best-effort fallback: hand back a prefix of the raw markup.
        return html[:3000]

    for tag in ("h1", "h2", "h3", "h4", "h5", "h6", "script", "style", "head"):
        # findall() returns a list, so dropping nodes while looping is safe.
        for el in doc.findall(f".//{tag}"):
            if el.getparent() is not None:
                # drop_tree() keeps the text that follows the element (its
                # tail); parent.remove(el) would silently discard that text.
                el.drop_tree()

    def _walk(el, depth=0):
        """Return the outline lines for *el* and its subtree ("" to skip it)."""
        # Comments and processing instructions have a non-string ``tag``.
        if not hasattr(el, "tag") or not isinstance(el.tag, str):
            return ""
        tag = el.tag
        indent = " " * depth

        # NOTE(review): the exact <img> / <a> line format should match what
        # the model was fine-tuned on — confirm against the training pipeline.
        if tag == "img":
            alt = el.get("alt", "")
            return f'{indent}<img alt="{alt}">' if alt else f"{indent}<img>"

        if tag == "a":
            text = (el.text_content() or "").strip()[:40]
            href = (el.get("href") or "")[:60]
            return f'{indent}<a href="{href}">{text}</a>'

        if tag in ("td", "th"):
            # Recurse into the cell if it has element children, otherwise
            # reduce it to a short text snippet.
            children = [c for c in el if hasattr(c, "tag") and isinstance(c.tag, str)]
            if children and depth < 8:
                lines = [f"{indent}<{tag}>"]
                rendered = (_walk(child, depth + 1) for child in children)
                lines.extend(r for r in rendered if r)
                return "\n".join(lines)
            text = (el.text_content() or "").strip()[:60]
            return f"{indent}<{tag}> {text}" if text else ""

        if depth > 7:
            # Too deep: summarise the whole subtree by its flattened text.
            text = (el.text_content() or "").strip()[:80]
            return f"{indent}[... {text}...]" if text else ""

        text = (el.text or "").strip()[:50]
        attrs = "".join(
            f' {name}="{value[:30]}"'
            for name, value in ((a, el.get(a)) for a in ("id", "class", "role"))
            if value
        )
        line = f"{indent}<{tag}{attrs}>"
        if text:
            line += f" {text}"
        lines = [line]
        rendered = (_walk(child, depth + 1) for child in el)
        lines.extend(r for r in rendered if r)
        return "\n".join(lines)

    # Compare against None explicitly: an lxml element with no child elements
    # is falsy, so ``find(...) or doc`` would skip a childless <body>.
    body = doc.find(".//body")
    if body is None:
        body = doc
    result = _walk(body)

    # Truncate to 4096 chars
    if len(result) > 4096:
        result = result[:4096] + "\n... (truncated)"
    return result
def _extract_title(html: str) -> str:
m = re.search(r"
Our Amazing Product
Features
Pricing
| Plan | Price |
|---|---|
| Starter | $9/mo |
| Pro | $29/mo |
Yes! 14 days free, no credit card required.
""" with gr.Blocks(title="distill-structure", theme=gr.themes.Soft()) as demo: gr.Markdown("# distill-structure\nHTML section analyzer — fine-tuned Qwen3.5-2B") with gr.Tabs(): with gr.Tab("Paste HTML"): with gr.Row(): with gr.Column(): html_input = gr.Textbox( label="HTML", placeholder="Paste HTML here...", lines=15, value=EXAMPLE_HTML, ) title_input = gr.Textbox(label="Page title (optional)", placeholder="Auto-detected from