"""Gradio demo for distill-structure model.""" import json import re import gradio as gr import torch from transformers import AutoModelForCausalLM, AutoTokenizer # --------------------------------------------------------------------------- # Model # --------------------------------------------------------------------------- MODEL_ID = "nahidstaq/distill-structure" SYSTEM = ( "You are an HTML structure analyzer. Given a compact DOM representation " "of a web page (with headings removed), identify the logical sections. " "Output a JSON array of sections, each with title, start_text, content_type, and assets fields." ) _model = None _tokenizer = None def _load(): global _model, _tokenizer if _model is None: device = "cuda" if torch.cuda.is_available() else "cpu" dtype = torch.bfloat16 if device == "cuda" else torch.float32 _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) _model = AutoModelForCausalLM.from_pretrained( MODEL_ID, dtype=dtype, device_map="auto" ) _model.eval() return _model, _tokenizer # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _compact_dom(html: str) -> str: from lxml import html as lxml_html try: doc = lxml_html.fromstring(html) except Exception: return html[:3000] for tag in ("h1", "h2", "h3", "h4", "h5", "h6", "script", "style", "head"): for el in doc.findall(f".//{tag}"): p = el.getparent() if p is not None: p.remove(el) def _walk(el, depth=0): if not hasattr(el, "tag") or not isinstance(el.tag, str): return "" tag = el.tag indent = " " * depth if tag == "img": alt = el.get("alt", "") return f'{indent} {alt}

Our Amazing Product

Welcome to the best product you've ever seen.

Features

Lightning fast
Easy to use
Affordable pricing

Pricing

Plan	Price
Starter	$9/mo
Pro	$29/mo

FAQ

Is there a free trial?

Yes! 14 days free, no credit card required.

""" with gr.Blocks(title="distill-structure", theme=gr.themes.Soft()) as demo: gr.Markdown("# distill-structure\nHTML section analyzer — fine-tuned Qwen3.5-2B") with gr.Tabs(): with gr.Tab("Paste HTML"): with gr.Row(): with gr.Column(): html_input = gr.Textbox( label="HTML", placeholder="Paste HTML here...", lines=15, value=EXAMPLE_HTML, ) title_input = gr.Textbox(label="Page title (optional)", placeholder="Auto-detected from ") btn_html = gr.Button("Analyze", variant="primary") with gr.Column(): sections_out = gr.Markdown(label="Sections") raw_out = gr.Textbox(label="Raw JSON output", lines=10) btn_html.click(analyze_html, inputs=[html_input, title_input], outputs=[sections_out, raw_out]) with gr.Tab("From URL"): with gr.Row(): with gr.Column(): url_input = gr.Textbox(label="URL", placeholder="https://news.ycombinator.com") btn_url = gr.Button("Fetch & Analyze", variant="primary") html_preview = gr.Textbox(label="Fetched HTML (preview)", lines=8) with gr.Column(): sections_out2 = gr.Markdown(label="Sections") raw_out2 = gr.Textbox(label="Raw JSON output", lines=10) btn_url.click(analyze_url, inputs=[url_input], outputs=[html_preview, sections_out2, raw_out2]) gr.Markdown(""" --- **Model**: [nahidstaq/distill-structure](https://huggingface.co/nahidstaq/distill-structure) · **Base**: Qwen3.5-2B · **Task**: HTML structure analysis """) if __name__ == "__main__": demo.launch()