# `spaces` is kept as the first import, exactly as found.
# NOTE(review): presumably ZeroGPU wants it imported before torch — confirm.
import spaces
import os
import json
import html
import threading
import torch
import gradio as gr
from huggingface_hub import snapshot_download
from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer

# --------------------------------------------------------------------------- #
# Models — both LFM2.5-VL Extract checkpoints, loaded eagerly for ZeroGPU.     #
# --------------------------------------------------------------------------- #
MODEL_IDS = {
    "450M": "LiquidAI/LFM2.5-VL-450M-Extract",
    "1.6B": "LiquidAI/LFM2.5-VL-1.6B-Extract",
}


def load_processor(mid):
    """Download the processor files for *mid* and return a loaded AutoProcessor.

    Side effect: (re)writes ``preprocessor_config.json`` inside the downloaded
    snapshot directory on every call, built from the ``image_processor``
    section of ``processor_config.json``.

    Args:
        mid: Hub repository id to download from.

    Returns:
        The object returned by ``AutoProcessor.from_pretrained`` for the local
        snapshot directory.
    """
    # These repos ship the image-processor config nested inside
    # processor_config.json but lack a top-level preprocessor_config.json,
    # which AutoImageProcessor needs. Materialize one from the nested dict.
    local = snapshot_download(mid, allow_patterns=["*.json", "*.jinja", "*.txt"])
    pre = os.path.join(local, "preprocessor_config.json")
    # Context manager so the handle is closed deterministically.
    with open(os.path.join(local, "processor_config.json"), encoding="utf-8") as f:
        cfg = json.load(f)
    # `or {}` also covers an explicit null value, which dict() would reject.
    img = dict(cfg.get("image_processor") or {})
    # Drop image_processor_type: lfm2_vl ships only a *Fast* processor, so the
    # exact "Lfm2VlImageProcessor" name won't resolve. Without it, AutoImageProcessor
    # falls back to the lfm2_vl model_type mapping (which picks the Fast class)
    # while still reading every param from this file. (Overwrite each boot.)
    img.pop("image_processor_type", None)
    with open(pre, "w", encoding="utf-8") as f:
        json.dump(img, f)
    return AutoProcessor.from_pretrained(local, trust_remote_code=True)


# Load every checkpoint at import time; both dicts are keyed like MODEL_IDS.
processors, models = {}, {}
for key, mid in MODEL_IDS.items():
    processors[key] = load_processor(mid)
    models[key] = AutoModelForImageTextToText.from_pretrained(
        mid, dtype=torch.bfloat16, trust_remote_code=True
    ).to("cuda")


# --------------------------------------------------------------------------- #
# Schema presets — fill the visual field builder with a single click.
# --------------------------------------------------------------------------- #
# Each example's fields are tailored to what is actually visible in its image.
PRESETS = {
    "wood": {
        "label": "🪵 Wood surface",
        "fields": [
            {"name": "wood_color", "description": "The overall coloration of the wood surface"},
            {"name": "wood_texture", "description": "The tactile quality of the wood surface"},
            {"name": "grain_pattern", "description": "The pattern of the wood grain"},
        ],
    },
    "receipt": {
        "label": "🧾 Receipt",
        "fields": [
            {"name": "total_amount", "description": "The total amount printed on the receipt"},
            {"name": "cash_paid", "description": "The amount of cash tendered"},
            {"name": "change_due", "description": "The change given back"},
            {"name": "gst_rate", "description": "The GST / tax percentage shown"},
        ],
    },
    "nutrition": {
        "label": "🥫 Nutrition label",
        "fields": [
            {"name": "product_name", "description": "The name of the product on the label"},
            {"name": "brand", "description": "The brand shown on the label"},
            {"name": "net_weight", "description": "The net or drained weight"},
            {"name": "servings_per_container", "description": "Number of servings per container"},
            {"name": "best_before_date", "description": "The best-before or expiry date"},
        ],
    },
    "card": {
        "label": "💼 Business card",
        "fields": [
            {"name": "full_name", "description": "The person's full name"},
            {"name": "job_title", "description": "Their job title or role"},
            {"name": "company", "description": "Company name or website"},
            {"name": "email", "description": "Email address"},
            {"name": "phone", "description": "Phone number"},
        ],
    },
    "product": {
        "label": "🛍️ Product photo",
        "fields": [
            {"name": "product_type", "description": "What kind of product this is"},
            {"name": "brand", "description": "The brand, if a logo is visible"},
            {"name": "primary_color", "description": "The dominant color of the product"},
            {"name": "accent_colors", "description": "Secondary or accent colors"},
            {"name": "closure_type", "description": "How the item fastens or closes"},
        ],
    },
}

# One example image per preset, swapped in when a preset chip is clicked.
EXAMPLE_IMAGES = {
    "wood": "sample_wood.png",
    "receipt": "ex_receipt.jpg",
    "nutrition": "ex_nutrition.jpg",
    "card": "ex_card.jpg",
    "product": "ex_product.jpg",
}


def load_example(key):
    """Return the example image path for preset *key*, or a no-op update.

    Unknown keys (including the empty string) yield ``gr.update()`` so the
    image component is left as it is.
    """
    path = EXAMPLE_IMAGES.get(key)
    return path if path else gr.update()


# --------------------------------------------------------------------------- #
# Inference                                                                   #
# --------------------------------------------------------------------------- #
def build_system_prompt(fields):
    """Render *fields* as ``name: description`` lines inside the system prompt.

    Args:
        fields: Iterable of dicts with a ``name`` and an optional
            ``description``. Entries whose name is missing or blank are
            skipped; a missing or null description renders as empty text.

    Returns:
        The complete system prompt string.
    """
    lines = []
    for field in fields:
        # `or ""` tolerates explicit nulls, which .strip() would choke on.
        name = str(field.get("name") or "").strip()
        if not name:
            continue
        description = str(field.get("description") or "").strip()
        lines.append(f"{name}: {description}")
    schema = "\n".join(lines)
    return (
        "Extract the following from the image:\n\n"
        f"{schema}\n\n"
        "Respond with only a JSON object. Do not include any text outside the JSON."
    )


def parse_json(text):
    """Pull a JSON object out of model output.

    A leading Markdown code fence (optionally tagged ``json``) is stripped
    first. The span from the first ``{`` to the last ``}`` is then parsed; if
    that fails (for example because trailing text contains another ``}``), the
    first complete JSON value starting at the first ``{`` is decoded instead.

    Returns:
        The decoded object, or ``None`` when nothing parses.
    """
    text = text.strip()
    if text.startswith("```"):
        text = text.split("```", 2)[1] if "```" in text[3:] else text[3:]
        if text.lower().startswith("json"):
            text = text[4:]
    start = text.find("{")
    if start == -1:
        return None
    end = text.rfind("}")
    try:
        return json.loads(text[start : end + 1])
    except json.JSONDecodeError:
        pass
    # Fallback: decode just the first object and ignore whatever follows it.
    try:
        obj, _ = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        return None
    return obj


# NOTE(review): in the text under review, the literals in the HTML helpers
# below hold only line breaks around their interpolated values — no tags —
# while the stylesheet in this file defines classes such as `.lq-result-shell`,
# `.lq-empty`, `.lq-terminal` and `.lq-card`. The markup may have been lost
# before review; the literals are reproduced as found. Confirm against the
# deployed file before relying on these helpers.
def shell(inner):
    """Wrap *inner* in the outer result container string."""
    return f"\n{inner}\n"


def placeholder_html(msg="Build a schema, drop an image, and hit Extract."):
    """Return the empty-state panel showing *msg* (HTML-escaped)."""
    return shell(
        "\n💧\n"
        f"\n{html.escape(msg)}\n"
    )


def stream_html(acc):
    """Return the in-progress panel showing the text generated so far."""
    body = html.escape(acc) if acc else ""
    return shell(
        "\nextracting…\n"
        f"\n{body}\n"
    )


def value_html(v):
    """Render one extracted value as an HTML-safe string.

    Lists are rendered item by item, dicts as indented JSON, booleans via
    their ``str()`` form, and ``None`` / ``""`` as a dash placeholder.
    """
    if isinstance(v, list):
        return "".join(f"{html.escape(str(x))}" for x in v)
    if isinstance(v, dict):
        return f"\n{html.escape(json.dumps(v, indent=2, ensure_ascii=False))}\n"
    if isinstance(v, bool):
        return f"{v}"
    if v is None or v == "":
        return "—"
    return html.escape(str(v))


def cards_html(acc):
    """Render the final model output *acc* as one card per extracted field.

    When *acc* does not contain a JSON object, the raw text is shown
    (HTML-escaped) instead.
    """
    obj = parse_json(acc)
    if not isinstance(obj, dict):
        return shell(
            "\n"
            f"\n{html.escape(acc)}\n"
        )
    # NOTE(review): `pretty` is not interpolated by any literal below as found;
    # presumably it fed markup that is missing here — confirm before removing.
    pretty = json.dumps(obj, indent=2, ensure_ascii=False)  # noqa: F841
    cards = "".join(
        "\n"
        f"\n{html.escape(str(k))}\n"
        f"\n{value_html(v)}\n"
        for k, v in obj.items()
    )
    return shell(
        "\n"
        f"{len(obj)} fields extracted"
        "\n"
        f"\n{cards}\n"
    )


def _usable_fields(schema_json):
    """Decode the schema payload and keep only well-formed field entries.

    Anything that is not a JSON list yields ``[]``; list items that are not
    dicts, or whose ``name`` is not a non-blank string, are dropped.
    """
    try:
        raw = json.loads(schema_json) if schema_json else []
    except (json.JSONDecodeError, TypeError):
        return []
    if not isinstance(raw, list):
        return []
    return [
        f
        for f in raw
        if isinstance(f, dict) and isinstance(f.get("name"), str) and f["name"].strip()
    ]


@spaces.GPU(duration=60)
def extract(image, model_key, schema_json):
    """Stream a schema-guided extraction for *image* as HTML snippets.

    Args:
        image: The uploaded image, or ``None`` when nothing was dropped.
        model_key: Key into ``MODEL_IDS``; unknown keys fall back to "1.6B".
        schema_json: JSON text expected to hold a list of
            ``{"name": ..., "description": ...}`` dicts.

    Yields:
        A placeholder panel when the image or schema is missing, then the
        streaming panel after each generated piece of text, and finally either
        the rendered cards or a failure message if generation raised.
    """
    if image is None:
        yield placeholder_html("Please drop an image first.")
        return
    fields = _usable_fields(schema_json)
    if not fields:
        yield placeholder_html("Add at least one field to extract.")
        return

    key = model_key if model_key in MODEL_IDS else "1.6B"
    proc, model = processors[key], models[key]

    conversation = [
        {"role": "system", "content": build_system_prompt(fields)},
        {"role": "user", "content": [{"type": "image", "image": image}]},
    ]
    inputs = proc.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
        tokenize=True,
    ).to(model.device)

    tok = getattr(proc, "tokenizer", proc)
    streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = dict(**inputs, max_new_tokens=512, do_sample=False, streamer=streamer)

    failure = []

    def _generate():
        # Runs in the worker thread. If generate() raises, nothing would ever
        # close the streamer and the loop below would block forever, so record
        # the error and end the stream explicitly.
        try:
            model.generate(**gen_kwargs)
        except Exception as exc:  # thread boundary: reported to the caller below
            failure.append(exc)
            streamer.end()

    thread = threading.Thread(target=_generate)
    thread.start()

    acc = ""
    yield stream_html("")
    for piece in streamer:
        acc += piece
        yield stream_html(acc)
    thread.join()

    if failure:
        yield placeholder_html(f"Extraction failed: {failure[0]}")
        return
    yield cards_html(acc)


# --------------------------------------------------------------------------- #
# Front-end: custom HTML schema-builder widget + result viewer glue.
# --------------------------------------------------------------------------- #
# NOTE(review): as found, this literal is a single space and contains no
# "__PRESETS__" token, so the replace() below changes nothing. Presumably the
# script that consumes PRESETS is missing from the text under review — confirm.
HEAD = """ """.replace("__PRESETS__", json.dumps(PRESETS))

# Stylesheet handed to launch(). Written as adjacent literals so each rule sits
# on its own source line; the resulting string value is unchanged.
CSS = (
    " :root { --lq-orange:#ff7a00; --lq-amber:#ffb000; --lq-lime:#c6e600; } "
    ".gradio-container { max-width: 1280px !important; margin: 0 auto !important; --layout-gap: 20px !important; } "
    "#lq-schema-store, #lq-preset-store { display: none !important; } "
    "#lq-fields { padding: 0 !important; border: none !important; } "
    "#lq-hero { padding: 6px 2px 2px; margin-bottom: 2px; } "
    "#lq-hero h1 { margin:0; font-size: 1.75rem; font-weight: 800; letter-spacing:-.02em; color:#e0590a; -webkit-text-fill-color:#e0590a; } "
    "#lq-hero p { margin:.3rem 0 0; color:#8a6a2a; font-size:.95rem; white-space:nowrap; } "
    "/* schema builder */ "
    "#lq-schema-builder { display:block; } "
    ".lq-row { display:flex; gap:.5rem; margin-bottom:.45rem; align-items:center; } "
    ".lq-row input { border:1px solid rgba(180,160,120,.4); border-radius:11px; padding:.55rem .7rem; font-size:.9rem; background:#fffdf8; transition:border .12s,box-shadow .12s; } "
    ".lq-row input:focus { outline:none; border-color:var(--lq-orange); box-shadow:0 0 0 3px rgba(255,150,0,.16); } "
    ".lq-name { width:34%; font-family:ui-monospace,Menlo,monospace; font-weight:600; color:#b35900; } "
    ".lq-desc { flex:1; } "
    ".lq-del { border:none; background:#fff0e0; color:#e06a00; width:30px; height:30px; border-radius:9px; font-size:1.1rem; line-height:1; cursor:pointer; flex:none; transition:all .12s; } "
    ".lq-del:hover { background:#ffd9b0; } "
    ".lq-add { margin-top:.3rem; border:1.5px dashed rgba(255,140,0,.5); background:transparent; color:#c25e00; border-radius:11px; padding:.5rem .8rem; font-weight:600; font-size:.86rem; cursor:pointer; width:100%; transition:all .12s; } "
    ".lq-add:hover { background:#fff6ea; border-color:var(--lq-orange); } "
    "/* examples */ "
    "#lq-examples { margin-top:.4rem; } "
    ".lq-ex-label { font-size:.78rem; font-weight:700; text-transform:uppercase; letter-spacing:.05em; color:#b88a3a; margin-bottom:.5rem; } \n"
    ".lq-ex-grid { display:flex; flex-wrap:wrap; gap:.45rem; } "
    ".lq-ex { border:1px solid rgba(180,160,120,.35); background:#fffdf8; color:#7a5a20; border-radius:11px; padding:.45rem .7rem; font-size:.85rem; font-weight:600; cursor:pointer; transition:all .12s ease; } "
    ".lq-ex:hover { transform:translateY(-1px); border-color:var(--lq-orange); color:#b35900; box-shadow:0 5px 14px -8px rgba(255,140,0,.6); background:#fff; } "
    "/* hover magnifier loupe */ "
    ".lq-loupe { position:fixed; border-radius:50%; pointer-events:none; display:none; z-index:9999; background-repeat:no-repeat; background-color:#fff; border:3px solid #fff; box-shadow:0 8px 24px -6px rgba(120,70,0,.5), 0 0 0 2px var(--lq-orange); } "
    "#lq-image img { cursor:crosshair; } "
    "/* result viewer */ "
    ".lq-result-shell { border-radius:18px; min-height:360px; padding:16px; background:linear-gradient(160deg,#fffaf0,#fff7e6); border:1px solid rgba(255,160,0,.22); } "
    ".lq-empty { display:flex; flex-direction:column; align-items:center; justify-content:center; height:330px; color:#c79a4a; gap:.6rem; } "
    ".lq-drop { font-size:3.2rem; animation:lq-bob 2.4s ease-in-out infinite; } "
    "@keyframes lq-bob { 0%,100%{transform:translateY(0)} 50%{transform:translateY(-9px)} } "
    ".lq-empty-txt { font-size:.96rem; } "
    ".lq-stream-head { display:flex; align-items:center; gap:.5rem; font-weight:700; color:#c25e00; margin-bottom:.6rem; font-size:.9rem; } "
    ".lq-pulse { width:9px; height:9px; border-radius:50%; background:var(--lq-orange); animation:lq-pulse 1s ease-in-out infinite; } "
    "@keyframes lq-pulse { 0%,100%{opacity:.3;transform:scale(.8)} 50%{opacity:1;transform:scale(1.25)} } "
    ".lq-terminal { font-family:ui-monospace,Menlo,monospace; font-size:.86rem; white-space:pre-wrap; word-break:break-word; color:#5a4a20; background:#fffdf6; border-radius:12px; padding:14px; border:1px solid rgba(200,170,90,.3); margin:0; } "
    ".lq-caret { display:inline-block; width:7px; height:1.05em; background:var(--lq-orange); vertical-align:-2px; margin-left:1px; animation:lq-blink 1s \n"
    "step-end infinite; } "
    "@keyframes lq-blink { 50%{opacity:0} } "
    ".lq-toolbar { display:flex; align-items:center; gap:.5rem; margin-bottom:.8rem; } "
    ".lq-count { font-size:.82rem; font-weight:700; color:#a85f00; } "
    ".lq-spacer { flex:1; } "
    ".lq-btn { border:1px solid rgba(255,150,0,.4); background:#fff; color:#b35900; border-radius:9px; padding:.36rem .7rem; font-size:.82rem; font-weight:600; cursor:pointer; transition:all .12s; } "
    ".lq-btn:hover { background:#fff3e2; } "
    ".lq-btn-go { background:linear-gradient(90deg,var(--lq-orange),var(--lq-amber)); color:#fff; border-color:transparent; } "
    ".lq-cards { display:grid; grid-template-columns:repeat(auto-fill,minmax(150px,1fr)); gap:.7rem; } "
    ".lq-card { background:#fff; border-radius:14px; padding:.75rem .85rem; border:1px solid rgba(255,160,0,.22); box-shadow:0 6px 16px -12px rgba(255,140,0,.6); animation:lq-pop .4s cubic-bezier(.2,.9,.3,1.2) both; } "
    "@keyframes lq-pop { from{opacity:0;transform:translateY(10px) scale(.96)} to{opacity:1;transform:none} } "
    ".lq-key { font-family:ui-monospace,Menlo,monospace; font-size:.72rem; font-weight:700; text-transform:uppercase; letter-spacing:.04em; color:#d98000; margin-bottom:.3rem; } "
    ".lq-val { font-size:.98rem; color:#3a3320; font-weight:600; word-break:break-word; } "
    ".lq-tag { display:inline-block; background:#fff1d6; color:#b35900; border-radius:7px; padding:.12rem .45rem; font-size:.82rem; margin:.1rem .2rem .1rem 0; font-weight:600; } "
    ".lq-bool { padding:.1rem .4rem; border-radius:6px; font-size:.85rem; } "
    ".lq-true { background:#e7f7c4; color:#5a7a00; } "
    ".lq-false { background:#ffe0d6; color:#c2410c; } "
    ".lq-null { color:#bca; } "
    ".lq-nested { font-size:.78rem; margin:0; white-space:pre-wrap; } "
    ".lq-raw { font-family:ui-monospace,Menlo,monospace; font-size:.84rem; white-space:pre-wrap; word-break:break-word; background:#fffdf6; border-radius:12px; padding:14px; margin:0; border:1px solid rgba(200,170,90,.3); color:#5a4a20; } "
    "#lq-go { font-size:1.02rem !important; font-weight:700 !important; } "
)

# NOTE(review): the gr.HTML literals below carry no tags in the text under
# review, although CSS targets #lq-hero, #lq-schema-builder and #lq-examples.
# They are reproduced as found — confirm against the deployed file.
with gr.Blocks(title="Liquid Image → JSON") as demo:
    # Page header.
    gr.HTML(
        "\n"
        "\n\n💧 Liquid Image → JSON\n\n"
        "\n\nDefine the fields you want, drop an image, and watch "
        "LFM2.5-VL Extract turn pixels into clean structured JSON.\n\n"
        "\n"
    )
    with gr.Row(equal_height=False):
        # Left column: inputs (image, model choice, schema, trigger button).
        with gr.Column(scale=5):
            image = gr.Image(type="pil", label="Image", height=300, elem_id="lq-image")
            model_key = gr.Radio(
                choices=[("450M · ⚑ fastest", "450M"), ("1.6B · 🎯 most accurate", "1.6B")],
                value="1.6B",
                show_label=False,
            )
            gr.Markdown("**Extraction schema** — name a field and describe what to pull out")
            gr.HTML("\n", elem_id="lq-fields")
            # Both textboxes are hidden by the #lq-schema-store / #lq-preset-store
            # rule in CSS; they only carry values into the callbacks below.
            schema_store = gr.Textbox(elem_id="lq-schema-store", value="[]")
            preset_store = gr.Textbox(elem_id="lq-preset-store", value="")
            go = gr.Button("💧 Extract JSON", variant="primary", elem_id="lq-go")
            gr.HTML("\n")
        # Right column: result viewer, starting on the empty-state panel.
        with gr.Column(scale=5):
            result = gr.HTML(placeholder_html())

    # Run extraction on click; swap the example image when the preset changes.
    go.click(extract, [image, model_key, schema_store], result)
    preset_store.change(load_example, preset_store, image)


if __name__ == "__main__":
    demo.queue(max_size=20).launch(
        theme=gr.themes.Citrus(), css=CSS, head=HEAD, ssr_mode=False
    )