# Source: Hugging Face Space file uploaded by multimodalart (HF Staff).
# Commit 0e26381 (verified): "Upload app.py with huggingface_hub"
import spaces
import os
import json
import html
import threading
import torch
import gradio as gr
from huggingface_hub import snapshot_download
from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
# --------------------------------------------------------------------------- #
# Models — both LFM2.5-VL Extract checkpoints, loaded eagerly for ZeroGPU. #
# --------------------------------------------------------------------------- #
# Maps a short size label to the Hugging Face repo id of the matching
# checkpoint. The same labels are the values of the model-selector radio and
# the keys of the `processors` / `models` dicts filled in at import time.
MODEL_IDS = {
    "450M": "LiquidAI/LFM2.5-VL-450M-Extract",
    "1.6B": "LiquidAI/LFM2.5-VL-1.6B-Extract",
}
def load_processor(mid):
    """Download a checkpoint's config files and build its processor.

    These repos ship the image-processor config nested inside
    processor_config.json but lack a top-level preprocessor_config.json,
    which AutoImageProcessor needs. A flat preprocessor_config.json is
    materialized from the nested dict before the processor is loaded.

    Args:
        mid: Hugging Face repo id of the checkpoint.

    Returns:
        Whatever ``AutoProcessor.from_pretrained`` builds from the local
        snapshot directory.
    """
    local = snapshot_download(mid, allow_patterns=["*.json", "*.jinja", "*.txt"])
    pre = os.path.join(local, "preprocessor_config.json")

    # Use context managers so both file handles are closed deterministically
    # instead of relying on garbage collection.
    with open(os.path.join(local, "processor_config.json"), encoding="utf-8") as src:
        cfg = json.load(src)

    # `or {}` also covers an explicit null entry, which dict() cannot take.
    img = dict(cfg.get("image_processor") or {})

    # Drop image_processor_type: lfm2_vl ships only a *Fast* processor, so the
    # exact "Lfm2VlImageProcessor" name won't resolve. Without it, AutoImageProcessor
    # falls back to the lfm2_vl model_type mapping (which picks the Fast class)
    # while still reading every param from this file. (Overwrite each boot.)
    img.pop("image_processor_type", None)

    with open(pre, "w", encoding="utf-8") as dst:
        json.dump(img, dst)

    return AutoProcessor.from_pretrained(local, trust_remote_code=True)
# Eagerly build one processor and one bfloat16 model per entry in MODEL_IDS,
# keyed by the same size label, and move each model to CUDA at import time.
processors = {}
models = {}
for key, mid in MODEL_IDS.items():
    processors[key] = load_processor(mid)
    loaded = AutoModelForImageTextToText.from_pretrained(
        mid,
        dtype=torch.bfloat16,
        trust_remote_code=True,
    )
    models[key] = loaded.to("cuda")
# --------------------------------------------------------------------------- #
# Schema presets — fill the visual field builder with a single click. #
# --------------------------------------------------------------------------- #
# Each example's fields are tailored to what is actually visible in its image.
def _preset(label, *pairs):
    """Build one preset entry from a chip label and (name, description) pairs."""
    return {
        "label": label,
        "fields": [
            {"name": field_name, "description": field_desc}
            for field_name, field_desc in pairs
        ],
    }


# Preset key -> {"label": chip text, "fields": [{"name", "description"}, ...]}.
# This table is serialized to JSON and embedded in the page script, so key
# order and field order are significant.
PRESETS = {
    "wood": _preset(
        "🪵 Wood surface",
        ("wood_color", "The overall coloration of the wood surface"),
        ("wood_texture", "The tactile quality of the wood surface"),
        ("grain_pattern", "The pattern of the wood grain"),
    ),
    "receipt": _preset(
        "🧾 Receipt",
        ("total_amount", "The total amount printed on the receipt"),
        ("cash_paid", "The amount of cash tendered"),
        ("change_due", "The change given back"),
        ("gst_rate", "The GST / tax percentage shown"),
    ),
    "nutrition": _preset(
        "🥫 Nutrition label",
        ("product_name", "The name of the product on the label"),
        ("brand", "The brand shown on the label"),
        ("net_weight", "The net or drained weight"),
        ("servings_per_container", "Number of servings per container"),
        ("best_before_date", "The best-before or expiry date"),
    ),
    "card": _preset(
        "💼 Business card",
        ("full_name", "The person's full name"),
        ("job_title", "Their job title or role"),
        ("company", "Company name or website"),
        ("email", "Email address"),
        ("phone", "Phone number"),
    ),
    "product": _preset(
        "🛍️ Product photo",
        ("product_type", "What kind of product this is"),
        ("brand", "The brand, if a logo is visible"),
        ("primary_color", "The dominant color of the product"),
        ("accent_colors", "Secondary or accent colors"),
        ("closure_type", "How the item fastens or closes"),
    ),
}
# One example image per preset, swapped in when a preset chip is clicked.
# Keys mirror the PRESETS keys; each value is the image path that
# load_example() returns for that preset.
# NOTE(review): the paths are relative, presumably to the app's working
# directory; confirm the files are shipped alongside this script.
EXAMPLE_IMAGES = {
    "wood": "sample_wood.png",
    "receipt": "ex_receipt.jpg",
    "nutrition": "ex_nutrition.jpg",
    "card": "ex_card.jpg",
    "product": "ex_product.jpg",
}
def load_example(key):
    """Return the example image path for preset *key*.

    When the key is unknown (or maps to an empty path) a no-op
    ``gr.update()`` is returned so the image component is left untouched.
    """
    image_path = EXAMPLE_IMAGES.get(key)
    if not image_path:
        return gr.update()
    return image_path
# --------------------------------------------------------------------------- #
# Inference #
# --------------------------------------------------------------------------- #
def build_system_prompt(fields):
    """Render the extraction schema as the system prompt sent to the model.

    Each field with a non-blank name becomes one ``name: description`` line.
    Null or non-string names/descriptions are tolerated: ``None`` is treated
    as empty and other values are converted with ``str()``, so a malformed
    schema entry is skipped or rendered instead of raising.

    Args:
        fields: Iterable of dicts with a ``name`` key and an optional
            ``description`` key.

    Returns:
        The full system prompt string.
    """

    def clean(value):
        # None (e.g. JSON null) counts as empty; anything else is stringified.
        return "" if value is None else str(value).strip()

    schema_lines = []
    for field in fields:
        name = clean(field.get("name"))
        if not name:
            continue
        schema_lines.append(f"{name}: {clean(field.get('description'))}")
    schema = "\n".join(schema_lines)

    return (
        "Extract the following from the image:\n\n"
        f"{schema}\n\n"
        "Respond with only a JSON object. Do not include any text outside the JSON."
    )
def parse_json(text):
    """Return the first JSON object embedded in *text*, or ``None``.

    The model may wrap its answer in a Markdown code fence or surround it
    with extra prose. Instead of slicing from the first ``{`` to the last
    ``}`` (which breaks as soon as trailing text contains a brace), every
    ``{`` is tried as the start of an object and the first one that decodes
    wins; anything after the decoded object is ignored.

    Args:
        text: Raw model output.

    Returns:
        The decoded ``dict``, or ``None`` when *text* is not a string or
        contains no decodable JSON object.
    """
    if not isinstance(text, str):
        return None

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one value from the beginning of the string and
            # tolerates trailing data; starting at "{" guarantees a dict.
            obj, _end = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
        else:
            return obj
    return None
def shell(inner):
    """Wrap *inner* markup in the result-panel container ``div``."""
    return '<div class="lq-result-shell">{}</div>'.format(inner)
def placeholder_html(msg="Build a schema, drop an image, and hit Extract."):
    """Return the empty-state panel showing *msg* (HTML-escaped) under a droplet."""
    safe_msg = html.escape(msg)
    empty_state = (
        '<div class="lq-empty"><div class="lq-drop">💧</div>'
        f'<div class="lq-empty-txt">{safe_msg}</div></div>'
    )
    return shell(empty_state)
def stream_html(acc):
    """Return the in-progress panel: a status header plus the text streamed so far.

    *acc* is HTML-escaped before being placed in the terminal-style ``<pre>``;
    a falsy *acc* yields an empty body followed by the caret.
    """
    if acc:
        escaped = html.escape(acc)
    else:
        escaped = ""
    header = '<div class="lq-stream-head"><span class="lq-pulse"></span>extracting…</div>'
    terminal = f'<pre class="lq-terminal">{escaped}<span class="lq-caret"></span></pre>'
    return shell(header + terminal)
def value_html(v):
    """Render one extracted JSON value as an HTML fragment.

    * list  -> one ``lq-tag`` chip per item (empty list -> the null dash)
    * dict  -> pretty-printed JSON inside ``<pre class="lq-nested">``
    * bool  -> ``lq-bool`` badge
    * None / empty string -> the null dash
    * anything else -> its escaped ``str()``

    List items that are not plain strings are serialized as JSON, so nested
    objects, ``null`` and booleans read as JSON rather than Python repr
    (``{'a': 1}``, ``None``, ``True``).

    Args:
        v: A value decoded from the model's JSON output.

    Returns:
        An HTML string with all dynamic text escaped.
    """
    null_html = '<span class="lq-null">—</span>'

    def tag_text(item):
        # Strings are shown verbatim; everything else goes through json.dumps.
        if isinstance(item, str):
            return item
        return json.dumps(item, ensure_ascii=False)

    if isinstance(v, list):
        if not v:
            # An empty list previously rendered as nothing at all.
            return null_html
        return "".join(
            f'<span class="lq-tag">{html.escape(tag_text(item))}</span>' for item in v
        )
    if isinstance(v, dict):
        nested = json.dumps(v, indent=2, ensure_ascii=False)
        return f'<pre class="lq-nested">{html.escape(nested)}</pre>'
    if isinstance(v, bool):
        return f'<span class="lq-bool lq-{str(v).lower()}">{v}</span>'
    if v is None or v == "":
        return null_html
    return html.escape(str(v))
def cards_html(acc):
    """Render the final model output as the result panel.

    If *acc* contains a JSON object, each top-level key becomes a card and a
    toolbar offers a cards/raw toggle plus a copy button. Otherwise the raw
    text is shown with only a copy button.

    The hidden ``#lq-raw`` element always carries the text that the page
    script copies to the clipboard; ``#lq-rawview`` is the element swapped in
    by the Raw toggle.

    Args:
        acc: The complete text generated by the model.

    Returns:
        HTML for the result panel.
    """
    obj = parse_json(acc)
    if not isinstance(obj, dict):
        return shell(
            '<div class="lq-toolbar"><button data-lq-copy class="lq-btn">Copy JSON</button></div>'
            f'<pre id="lq-raw" class="lq-raw">{html.escape(acc)}</pre>'
        )

    escaped_pretty = html.escape(json.dumps(obj, indent=2, ensure_ascii=False))
    # Build all cards in one join instead of repeated string concatenation;
    # the per-card animation delay staggers their entrance.
    cards = "".join(
        f'<div class="lq-card" style="animation-delay:{idx * 55}ms">'
        f'<div class="lq-key">{html.escape(str(k))}</div>'
        f'<div class="lq-val">{value_html(v)}</div></div>'
        for idx, (k, v) in enumerate(obj.items())
    )
    # Avoid "1 fields extracted".
    noun = "field" if len(obj) == 1 else "fields"
    return shell(
        '<div class="lq-toolbar">'
        f'<span class="lq-count">{len(obj)} {noun} extracted</span>'
        '<span class="lq-spacer"></span>'
        '<button data-lq-toggle class="lq-btn">⌗ Raw</button>'
        '<button data-lq-copy class="lq-btn lq-btn-go">Copy JSON</button>'
        "</div>"
        f'<div id="lq-cards" class="lq-cards">{cards}</div>'
        f'<pre id="lq-rawview" class="lq-raw" style="display:none">{escaped_pretty}</pre>'
        f'<pre id="lq-raw" style="display:none">{escaped_pretty}</pre>'
    )
@spaces.GPU(duration=60)
def extract(image, model_key, schema_json):
    """Run structured extraction on *image* and stream the result as HTML.

    Args:
        image: The uploaded image, or ``None`` when nothing was provided.
        model_key: Size label of the model to use; unknown values fall back
            to ``"1.6B"``.
        schema_json: JSON text holding a list of ``{"name", "description"}``
            objects, as written by the schema-builder widget.

    Yields:
        HTML fragments: a placeholder for invalid input, the streaming
        terminal view while tokens arrive, and finally the cards view (or a
        failure placeholder if generation raised).
    """
    if image is None:
        yield placeholder_html("Please drop an image first.")
        return

    try:
        parsed = json.loads(schema_json) if schema_json else []
    except (json.JSONDecodeError, TypeError):
        parsed = []

    # The schema comes from the client, so accept only a list of objects with
    # a non-blank string name and normalize the description to a string.
    fields = []
    for entry in parsed if isinstance(parsed, list) else []:
        if not isinstance(entry, dict):
            continue
        name = entry.get("name")
        if not isinstance(name, str) or not name.strip():
            continue
        description = entry.get("description")
        fields.append(
            {
                "name": name,
                "description": description if isinstance(description, str) else "",
            }
        )
    if not fields:
        yield placeholder_html("Add at least one field to extract.")
        return

    key = model_key if model_key in MODEL_IDS else "1.6B"
    proc, model = processors[key], models[key]
    conversation = [
        {"role": "system", "content": build_system_prompt(fields)},
        {"role": "user", "content": [{"type": "image", "image": image}]},
    ]
    inputs = proc.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
        tokenize=True,
    ).to(model.device)

    tok = getattr(proc, "tokenizer", proc)
    streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = dict(**inputs, max_new_tokens=512, do_sample=False, streamer=streamer)

    failures = []

    def run_generation():
        # If generate() raises, the streamer would never receive its stop
        # signal and the consumer loop below would block forever. Record the
        # error, end the stream, then re-raise so the traceback is still
        # reported by the thread's exception hook.
        try:
            model.generate(**gen_kwargs)
        except Exception as exc:
            failures.append(exc)
            streamer.end()
            raise

    thread = threading.Thread(target=run_generation)
    thread.start()

    acc = ""
    yield stream_html("")
    for piece in streamer:
        acc += piece
        yield stream_html(acc)
    thread.join()

    if failures:
        yield placeholder_html(f"Extraction failed: {failures[0]}")
        return
    yield cards_html(acc)
# --------------------------------------------------------------------------- #
# Front-end: custom HTML schema-builder widget + result viewer glue. #
# --------------------------------------------------------------------------- #
# Markup passed as `head=` to launch(). The embedded script:
#   * builds the schema-row editor inside #lq-schema-builder and mirrors its
#     rows, as a JSON list of {name, description}, into the textarea of the
#     hidden #lq-schema-store textbox, dispatching an `input` event after each
#     write (presumably so Gradio picks up the new value);
#   * renders one button per preset inside #lq-examples; clicking one refills
#     the rows from the preset and writes the preset key into the hidden
#     #lq-preset-store textbox;
#   * attaches a circular magnifier ("loupe") that follows the mouse over the
#     image inside #lq-image;
#   * handles clicks on [data-lq-copy] (copies the text of #lq-raw to the
#     clipboard) and [data-lq-toggle] (switches between #lq-cards and
#     #lq-rawview), the buttons emitted by cards_html().
# init() is re-run every 250 ms; each initializer returns early unless its
# mount element exists and has not been flagged as ready yet.
# The __PRESETS__ token is replaced with the JSON dump of PRESETS so the
# script works from the same preset table as the Python side.
HEAD = """
<script>
const LQ_PRESETS = __PRESETS__;
(function () {
function syncStore(mount) {
const store = document.querySelector('#lq-schema-store textarea');
if (!store) return;
const rows = [...mount.querySelectorAll('.lq-row')];
const data = rows.map(r => ({
name: r.querySelector('.lq-name').value,
description: r.querySelector('.lq-desc').value,
}));
const setter = Object.getOwnPropertyDescriptor(window.HTMLTextAreaElement.prototype, 'value').set;
setter.call(store, JSON.stringify(data));
store.dispatchEvent(new Event('input', { bubbles: true }));
}
function makeRow(mount, name, desc) {
const row = document.createElement('div');
row.className = 'lq-row';
row.innerHTML =
'<input class="lq-name" spellcheck="false" placeholder="field_name">' +
'<input class="lq-desc" placeholder="what should the model pull out?">' +
'<button class="lq-del" title="remove">×</button>';
row.querySelector('.lq-name').value = name || '';
row.querySelector('.lq-desc').value = desc || '';
row.querySelector('.lq-name').addEventListener('input', () => syncStore(mount));
row.querySelector('.lq-desc').addEventListener('input', () => syncStore(mount));
row.querySelector('.lq-del').addEventListener('click', () => { row.remove(); syncStore(mount); });
mount.querySelector('#lq-rows').appendChild(row);
}
function setPreset(key) {
const store = document.querySelector('#lq-preset-store textarea');
if (!store) return;
const setter = Object.getOwnPropertyDescriptor(window.HTMLTextAreaElement.prototype, 'value').set;
setter.call(store, key);
store.dispatchEvent(new Event('input', { bubbles: true }));
}
function loadPreset(mount, key) {
const p = LQ_PRESETS[key];
if (!p || !mount) return;
mount.querySelector('#lq-rows').innerHTML = '';
p.fields.forEach(f => makeRow(mount, f.name, f.description));
syncStore(mount);
setPreset(key);
}
function initBuilder() {
const mount = document.getElementById('lq-schema-builder');
if (!mount || mount.dataset.ready) return;
mount.dataset.ready = '1';
mount.innerHTML = '<div id="lq-rows"></div>' +
'<button class="lq-add" id="lq-add">+ add field</button>';
mount.querySelector('#lq-add').addEventListener('click', () => { makeRow(mount, '', ''); });
makeRow(mount, '', ''); makeRow(mount, '', ''); makeRow(mount, '', '');
syncStore(mount);
}
function initExamples() {
const ex = document.getElementById('lq-examples');
if (!ex || ex.dataset.ready) return;
ex.dataset.ready = '1';
let html = '<div class="lq-ex-label">Examples — click to try one</div><div class="lq-ex-grid">';
Object.keys(LQ_PRESETS).forEach(k => {
html += '<button class="lq-ex" data-preset="' + k + '">' + LQ_PRESETS[k].label + '</button>';
});
html += '</div>';
ex.innerHTML = html;
ex.querySelectorAll('.lq-ex').forEach(c => c.addEventListener('click', () =>
loadPreset(document.getElementById('lq-schema-builder'), c.dataset.preset)));
}
function initLoupe() {
const wrap = document.getElementById('lq-image');
if (!wrap || wrap.dataset.loupe) return;
wrap.dataset.loupe = '1';
const D = 190, r = D / 2, zoom = 2.8;
const loupe = document.createElement('div');
loupe.className = 'lq-loupe';
loupe.style.width = D + 'px';
loupe.style.height = D + 'px';
document.body.appendChild(loupe);
function previewImg() {
for (const im of wrap.querySelectorAll('img')) {
if (im.src && im.naturalWidth > 0 && im.offsetWidth > 48) return im;
}
return null;
}
wrap.addEventListener('mousemove', function (e) {
const img = previewImg();
if (!img) { loupe.style.display = 'none'; return; }
const rect = img.getBoundingClientRect();
if (e.clientX < rect.left || e.clientX > rect.right ||
e.clientY < rect.top || e.clientY > rect.bottom) {
loupe.style.display = 'none'; return;
}
const x = e.clientX - rect.left, y = e.clientY - rect.top;
loupe.style.display = 'block';
loupe.style.backgroundImage = 'url("' + img.src + '")';
loupe.style.backgroundSize = (rect.width * zoom) + 'px ' + (rect.height * zoom) + 'px';
loupe.style.backgroundPosition = (r - x * zoom) + 'px ' + (r - y * zoom) + 'px';
loupe.style.left = (e.clientX - r) + 'px';
loupe.style.top = (e.clientY - r) + 'px';
});
wrap.addEventListener('mouseleave', function () { loupe.style.display = 'none'; });
}
function init() { initBuilder(); initExamples(); initLoupe(); }
setInterval(init, 250);
document.addEventListener('click', function (e) {
const cp = e.target.closest('[data-lq-copy]');
if (cp) {
const raw = document.getElementById('lq-raw');
if (raw) {
navigator.clipboard.writeText(raw.textContent);
const t = cp.textContent; cp.textContent = 'copied ✓';
setTimeout(() => { cp.textContent = t; }, 1200);
}
}
const tg = e.target.closest('[data-lq-toggle]');
if (tg) {
const cards = document.getElementById('lq-cards');
const rawv = document.getElementById('lq-rawview');
if (cards && rawv) {
const showRaw = cards.style.display !== 'none';
cards.style.display = showRaw ? 'none' : 'grid';
rawv.style.display = showRaw ? 'block' : 'none';
tg.textContent = showRaw ? '▧ Cards' : '⌗ Raw';
}
}
});
})();
</script>
""".replace("__PRESETS__", json.dumps(PRESETS))
# Stylesheet passed as `css=` to launch(). All custom rules use the `lq-`
# prefix and are grouped by the widget they style: page hero, schema builder
# rows, example chips, the hover magnifier loupe, and the result viewer
# (empty state, streaming terminal, toolbar, cards, raw view).
# The #lq-schema-store / #lq-preset-store rule hides the two textboxes that
# the page script writes to, so they stay in the DOM without being shown.
CSS = """
:root { --lq-orange:#ff7a00; --lq-amber:#ffb000; --lq-lime:#c6e600; }
.gradio-container { max-width: 1280px !important; margin: 0 auto !important; --layout-gap: 20px !important; }
#lq-schema-store, #lq-preset-store { display: none !important; }
#lq-fields { padding: 0 !important; border: none !important; }
#lq-hero { padding: 6px 2px 2px; margin-bottom: 2px; }
#lq-hero h1 { margin:0; font-size: 1.75rem; font-weight: 800; letter-spacing:-.02em;
color:#e0590a; -webkit-text-fill-color:#e0590a; }
#lq-hero p { margin:.3rem 0 0; color:#8a6a2a; font-size:.95rem; white-space:nowrap; }
/* schema builder */
#lq-schema-builder { display:block; }
.lq-row { display:flex; gap:.5rem; margin-bottom:.45rem; align-items:center; }
.lq-row input { border:1px solid rgba(180,160,120,.4); border-radius:11px; padding:.55rem .7rem;
font-size:.9rem; background:#fffdf8; transition:border .12s,box-shadow .12s; }
.lq-row input:focus { outline:none; border-color:var(--lq-orange);
box-shadow:0 0 0 3px rgba(255,150,0,.16); }
.lq-name { width:34%; font-family:ui-monospace,Menlo,monospace; font-weight:600; color:#b35900; }
.lq-desc { flex:1; }
.lq-del { border:none; background:#fff0e0; color:#e06a00; width:30px; height:30px; border-radius:9px;
font-size:1.1rem; line-height:1; cursor:pointer; flex:none; transition:all .12s; }
.lq-del:hover { background:#ffd9b0; }
.lq-add { margin-top:.3rem; border:1.5px dashed rgba(255,140,0,.5); background:transparent;
color:#c25e00; border-radius:11px; padding:.5rem .8rem; font-weight:600; font-size:.86rem;
cursor:pointer; width:100%; transition:all .12s; }
.lq-add:hover { background:#fff6ea; border-color:var(--lq-orange); }
/* examples */
#lq-examples { margin-top:.4rem; }
.lq-ex-label { font-size:.78rem; font-weight:700; text-transform:uppercase; letter-spacing:.05em;
color:#b88a3a; margin-bottom:.5rem; }
.lq-ex-grid { display:flex; flex-wrap:wrap; gap:.45rem; }
.lq-ex { border:1px solid rgba(180,160,120,.35); background:#fffdf8; color:#7a5a20;
border-radius:11px; padding:.45rem .7rem; font-size:.85rem; font-weight:600; cursor:pointer;
transition:all .12s ease; }
.lq-ex:hover { transform:translateY(-1px); border-color:var(--lq-orange); color:#b35900;
box-shadow:0 5px 14px -8px rgba(255,140,0,.6); background:#fff; }
/* hover magnifier loupe */
.lq-loupe { position:fixed; border-radius:50%; pointer-events:none; display:none; z-index:9999;
background-repeat:no-repeat; background-color:#fff;
border:3px solid #fff; box-shadow:0 8px 24px -6px rgba(120,70,0,.5), 0 0 0 2px var(--lq-orange); }
#lq-image img { cursor:crosshair; }
/* result viewer */
.lq-result-shell { border-radius:18px; min-height:360px; padding:16px;
background:linear-gradient(160deg,#fffaf0,#fff7e6); border:1px solid rgba(255,160,0,.22); }
.lq-empty { display:flex; flex-direction:column; align-items:center; justify-content:center;
height:330px; color:#c79a4a; gap:.6rem; }
.lq-drop { font-size:3.2rem; animation:lq-bob 2.4s ease-in-out infinite; }
@keyframes lq-bob { 0%,100%{transform:translateY(0)} 50%{transform:translateY(-9px)} }
.lq-empty-txt { font-size:.96rem; }
.lq-stream-head { display:flex; align-items:center; gap:.5rem; font-weight:700; color:#c25e00;
margin-bottom:.6rem; font-size:.9rem; }
.lq-pulse { width:9px; height:9px; border-radius:50%; background:var(--lq-orange);
animation:lq-pulse 1s ease-in-out infinite; }
@keyframes lq-pulse { 0%,100%{opacity:.3;transform:scale(.8)} 50%{opacity:1;transform:scale(1.25)} }
.lq-terminal { font-family:ui-monospace,Menlo,monospace; font-size:.86rem; white-space:pre-wrap;
word-break:break-word; color:#5a4a20; background:#fffdf6; border-radius:12px; padding:14px;
border:1px solid rgba(200,170,90,.3); margin:0; }
.lq-caret { display:inline-block; width:7px; height:1.05em; background:var(--lq-orange);
vertical-align:-2px; margin-left:1px; animation:lq-blink 1s step-end infinite; }
@keyframes lq-blink { 50%{opacity:0} }
.lq-toolbar { display:flex; align-items:center; gap:.5rem; margin-bottom:.8rem; }
.lq-count { font-size:.82rem; font-weight:700; color:#a85f00; }
.lq-spacer { flex:1; }
.lq-btn { border:1px solid rgba(255,150,0,.4); background:#fff; color:#b35900; border-radius:9px;
padding:.36rem .7rem; font-size:.82rem; font-weight:600; cursor:pointer; transition:all .12s; }
.lq-btn:hover { background:#fff3e2; }
.lq-btn-go { background:linear-gradient(90deg,var(--lq-orange),var(--lq-amber)); color:#fff; border-color:transparent; }
.lq-cards { display:grid; grid-template-columns:repeat(auto-fill,minmax(150px,1fr)); gap:.7rem; }
.lq-card { background:#fff; border-radius:14px; padding:.75rem .85rem;
border:1px solid rgba(255,160,0,.22); box-shadow:0 6px 16px -12px rgba(255,140,0,.6);
animation:lq-pop .4s cubic-bezier(.2,.9,.3,1.2) both; }
@keyframes lq-pop { from{opacity:0;transform:translateY(10px) scale(.96)} to{opacity:1;transform:none} }
.lq-key { font-family:ui-monospace,Menlo,monospace; font-size:.72rem; font-weight:700;
text-transform:uppercase; letter-spacing:.04em; color:#d98000; margin-bottom:.3rem; }
.lq-val { font-size:.98rem; color:#3a3320; font-weight:600; word-break:break-word; }
.lq-tag { display:inline-block; background:#fff1d6; color:#b35900; border-radius:7px;
padding:.12rem .45rem; font-size:.82rem; margin:.1rem .2rem .1rem 0; font-weight:600; }
.lq-bool { padding:.1rem .4rem; border-radius:6px; font-size:.85rem; }
.lq-true { background:#e7f7c4; color:#5a7a00; } .lq-false { background:#ffe0d6; color:#c2410c; }
.lq-null { color:#bca; } .lq-nested { font-size:.78rem; margin:0; white-space:pre-wrap; }
.lq-raw { font-family:ui-monospace,Menlo,monospace; font-size:.84rem; white-space:pre-wrap;
word-break:break-word; background:#fffdf6; border-radius:12px; padding:14px; margin:0;
border:1px solid rgba(200,170,90,.3); color:#5a4a20; }
#lq-go { font-size:1.02rem !important; font-weight:700 !important; }
"""
# Page layout: hero banner on top, then a two-column row with the inputs
# (image, model choice, schema builder, hidden stores, button, example chips)
# on the left and the result viewer on the right.
with gr.Blocks(title="Liquid Image → JSON") as demo:
    gr.HTML(
        value=(
            '<div id="lq-hero">'
            "<h1>💧 Liquid Image → JSON</h1>"
            "<p>Define the fields you want, drop an image, and watch "
            "<b>LFM2.5-VL Extract</b> turn pixels into clean structured JSON.</p>"
            "</div>"
        )
    )

    with gr.Row(equal_height=False):
        # Left column: everything the user provides.
        with gr.Column(scale=5):
            image = gr.Image(
                type="pil",
                label="Image",
                height=300,
                elem_id="lq-image",
            )
            model_key = gr.Radio(
                choices=[
                    ("450M · ⚡ fastest", "450M"),
                    ("1.6B · 🎯 most accurate", "1.6B"),
                ],
                value="1.6B",
                show_label=False,
            )
            gr.Markdown(
                value="**Extraction schema** — name a field and describe what to pull out"
            )
            # Mount point filled in by the page script's schema builder.
            gr.HTML(value='<div id="lq-schema-builder"></div>', elem_id="lq-fields")
            # Hidden textboxes the page script writes the schema / preset key into.
            schema_store = gr.Textbox(value="[]", elem_id="lq-schema-store")
            preset_store = gr.Textbox(value="", elem_id="lq-preset-store")
            go = gr.Button(value="💧 Extract JSON", variant="primary", elem_id="lq-go")
            # Mount point for the example preset chips.
            gr.HTML(value='<div id="lq-examples"></div>')

        # Right column: the result viewer.
        with gr.Column(scale=5):
            result = gr.HTML(value=placeholder_html())

    go.click(
        fn=extract,
        inputs=[image, model_key, schema_store],
        outputs=result,
    )
    # A preset key written by the page script swaps in the matching example image.
    preset_store.change(fn=load_example, inputs=preset_store, outputs=image)

if __name__ == "__main__":
    demo.queue(max_size=20)
    demo.launch(
        theme=gr.themes.Citrus(),
        css=CSS,
        head=HEAD,
        ssr_mode=False,
    )