import spaces
import os
import json
import html
import threading
import torch
import gradio as gr
from huggingface_hub import snapshot_download
from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
# --------------------------------------------------------------------------- #
# Models — both LFM2.5-VL Extract checkpoints, loaded eagerly for ZeroGPU.    #
# --------------------------------------------------------------------------- #
# Maps the short size tag used as a lookup key to the Hugging Face repo id:
#   "450M" -> "LiquidAI/LFM2.5-VL-450M-Extract"
#   "1.6B" -> "LiquidAI/LFM2.5-VL-1.6B-Extract"
MODEL_IDS = {
    size: f"LiquidAI/LFM2.5-VL-{size}-Extract" for size in ("450M", "1.6B")
}
def load_processor(mid):
    """Download the processor files for repo *mid* and build its AutoProcessor.

    These repos ship the image-processor config nested inside
    ``processor_config.json`` but lack a top-level
    ``preprocessor_config.json``, which AutoImageProcessor needs. This
    function materializes one from the nested dict before loading.

    Args:
        mid: Hugging Face repo id to download from.

    Returns:
        The object returned by ``AutoProcessor.from_pretrained`` for the
        local snapshot directory.
    """
    local = snapshot_download(mid, allow_patterns=["*.json", "*.jinja", "*.txt"])
    pre = os.path.join(local, "preprocessor_config.json")

    # Read through a context manager so the handle is closed deterministically;
    # JSON is UTF-8 regardless of the platform's default locale encoding.
    with open(os.path.join(local, "processor_config.json"), encoding="utf-8") as f:
        cfg = json.load(f)

    # Copy so the pop below never mutates the loaded config; tolerate a
    # missing or null "image_processor" entry.
    img = dict(cfg.get("image_processor") or {})

    # Drop image_processor_type: lfm2_vl ships only a *Fast* processor, so the
    # exact "Lfm2VlImageProcessor" name won't resolve. Without it,
    # AutoImageProcessor falls back to the lfm2_vl model_type mapping (which
    # picks the Fast class) while still reading every param from this file.
    img.pop("image_processor_type", None)

    # Overwritten on every boot.
    with open(pre, "w", encoding="utf-8") as f:
        json.dump(img, f)

    return AutoProcessor.from_pretrained(local, trust_remote_code=True)
# Lookup tables keyed by the MODEL_IDS size tag. Both are filled at import
# time: each checkpoint is loaded in bfloat16 and moved to the "cuda" device.
processors = {}
models = {}
for key, mid in MODEL_IDS.items():
    processors[key] = load_processor(mid)
    models[key] = AutoModelForImageTextToText.from_pretrained(
        mid,
        dtype=torch.bfloat16,
        trust_remote_code=True,
    ).to("cuda")
# --------------------------------------------------------------------------- #
# Schema presets — fill the visual field builder with a single click.         #
# --------------------------------------------------------------------------- #
# Each example's fields are tailored to what is actually visible in its image.
# Preset schemas keyed by a short id. Each entry carries a display "label"
# and a list of "fields", every field being a {"name", "description"} pair.
# The labels start with an emoji; these had been stored as mis-decoded bytes
# (UTF-8 read through a Greek single-byte codec) and are restored here.
PRESETS = {
    "wood": {
        "label": "🪵 Wood surface",
        "fields": [
            {"name": "wood_color", "description": "The overall coloration of the wood surface"},
            {"name": "wood_texture", "description": "The tactile quality of the wood surface"},
            {"name": "grain_pattern", "description": "The pattern of the wood grain"},
        ],
    },
    "receipt": {
        "label": "🧾 Receipt",
        "fields": [
            {"name": "total_amount", "description": "The total amount printed on the receipt"},
            {"name": "cash_paid", "description": "The amount of cash tendered"},
            {"name": "change_due", "description": "The change given back"},
            {"name": "gst_rate", "description": "The GST / tax percentage shown"},
        ],
    },
    "nutrition": {
        "label": "🥫 Nutrition label",
        "fields": [
            {"name": "product_name", "description": "The name of the product on the label"},
            {"name": "brand", "description": "The brand shown on the label"},
            {"name": "net_weight", "description": "The net or drained weight"},
            {"name": "servings_per_container", "description": "Number of servings per container"},
            {"name": "best_before_date", "description": "The best-before or expiry date"},
        ],
    },
    "card": {
        "label": "💼 Business card",
        "fields": [
            {"name": "full_name", "description": "The person's full name"},
            {"name": "job_title", "description": "Their job title or role"},
            {"name": "company", "description": "Company name or website"},
            {"name": "email", "description": "Email address"},
            {"name": "phone", "description": "Phone number"},
        ],
    },
    "product": {
        "label": "🛍️ Product photo",
        "fields": [
            {"name": "product_type", "description": "What kind of product this is"},
            {"name": "brand", "description": "The brand, if a logo is visible"},
            {"name": "primary_color", "description": "The dominant color of the product"},
            {"name": "accent_colors", "description": "Secondary or accent colors"},
            {"name": "closure_type", "description": "How the item fastens or closes"},
        ],
    },
}
# Preset id -> path of the sample image shown when that preset's chip is
# clicked (one image per preset).
EXAMPLE_IMAGES = dict(
    wood="sample_wood.png",
    receipt="ex_receipt.jpg",
    nutrition="ex_nutrition.jpg",
    card="ex_card.jpg",
    product="ex_product.jpg",
)
def load_example(key):
    """Return the example image path registered for preset *key*.

    When *key* has no entry in ``EXAMPLE_IMAGES`` (or the entry is empty),
    an argument-less ``gr.update()`` is returned instead of a path.
    """
    example_path = EXAMPLE_IMAGES.get(key)
    if not example_path:
        return gr.update()
    return example_path
# --------------------------------------------------------------------------- #
# Inference #
# --------------------------------------------------------------------------- #
def build_system_prompt(fields):
    """Build the extraction system prompt from a list of schema fields.

    Args:
        fields: Iterable of dicts with a ``"name"`` key and an optional
            ``"description"`` key. Entries whose name is missing, ``None``,
            or blank after stripping are skipped. ``None`` is treated as an
            empty list.

    Returns:
        The prompt string: a fixed header, one ``name: description`` line per
        kept field (both stripped), and a fixed JSON-only instruction.
    """
    schema_lines = []
    for field in fields or []:
        # `or ""` covers keys that are present but hold None, which would
        # otherwise raise AttributeError on .strip().
        name = (field.get("name") or "").strip()
        if not name:
            continue
        description = (field.get("description") or "").strip()
        schema_lines.append(f"{name}: {description}")
    schema = "\n".join(schema_lines)
    return (
        "Extract the following from the image:\n\n"
        f"{schema}\n\n"
        "Respond with only a JSON object. Do not include any text outside the JSON."
    )
def parse_json(text):
    """Pull the first JSON object out of *text*.

    A leading Markdown code fence (with an optional ``json`` tag) is removed
    first; when a closing fence exists, anything after it is discarded.
    Decoding then starts at the first ``{`` and stops at the end of that
    object, so trailing text is ignored even if it contains a stray ``}``.

    Args:
        text: Raw string that may contain a JSON object.

    Returns:
        The decoded ``dict``, or ``None`` when there is no ``{`` or the text
        from that point on is not a complete, valid JSON object.
    """
    text = text.strip()
    if text.startswith("```"):
        # Keep only the fenced body when the fence is closed; otherwise just
        # drop the opening backticks.
        text = text.split("```", 2)[1] if "```" in text[3:] else text[3:]
        text = text[4:] if text.lower().startswith("json") else text

    start = text.find("{")
    if start == -1:
        return None
    try:
        # raw_decode parses exactly one value and reports where it ended,
        # rather than requiring the whole remainder to be valid JSON.
        obj, _end = json.JSONDecoder().raw_decode(text, start)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    return obj
# Return `inner` interpolated into an f-string template.
# NOTE(review): the template presumably wrapped `inner` in container markup
# that is missing here. As written, the single-quoted literal opens on one
# line and closes two lines later, which Python does not accept. Restore the
# literal from the original file before relying on this function.
def shell(inner):
return f'
{inner}
'
# Build the idle-state fragment: two adjacent string literals (an icon glyph
# and the HTML-escaped `msg`) are concatenated and passed through shell().
# `msg` defaults to a short usage hint.
# NOTE(review): both literals appear to have lost their surrounding markup
# and the glyph looks mis-decoded; each quoted literal spans several lines,
# which is not valid for single-quoted strings. Confirm against the original.
def placeholder_html(msg="Build a schema, drop an image, and hit Extract."):
return shell(
f'
π§
'
f'
{html.escape(msg)}
'
)
# Build the in-progress fragment: `acc` is HTML-escaped (or replaced by an
# empty string when falsy), appended to a status literal, and the result is
# passed through shell().
# NOTE(review): the status and body literals appear to have lost their
# markup and the trailing ellipsis looks mis-decoded; the quoted literals
# span several lines as shown. Confirm against the original file.
def stream_html(acc):
body = html.escape(acc) if acc else ""
return shell(
'
extractingβ¦
'
f'
{body}
'
)
# Render a single value `v` as an HTML fragment, dispatching on its type:
#   list      -> each item str()-ed, HTML-escaped, and concatenated
#   dict      -> a dedicated literal (see the review note below)
#   bool      -> formatted through an f-string without escaping
#   None / "" -> a fixed placeholder glyph
#   other     -> str(v), HTML-escaped
# NOTE(review): the per-item, dict, and bool literals appear to have lost
# their wrapping markup; the dict branch's literal is empty and spans two
# lines as shown, and the placeholder glyph looks like a mis-decoded dash.
# Confirm against the original file.
def value_html(v):
if isinstance(v, list):
return "".join(f'{html.escape(str(x))}' for x in v)
if isinstance(v, dict):
return f'
'
if isinstance(v, bool):
return f'{v}'
if v is None or v == "":
return 'β'
return html.escape(str(v))
def cards_html(acc):
obj = parse_json(acc)
if obj is None or not isinstance(obj, dict):
return shell(
''
f'
{html.escape(acc)}
'
)
pretty = json.dumps(obj, indent=2, ensure_ascii=False)
cards = ""
for idx, (k, v) in enumerate(obj.items()):
cards += (
f'