# Ferr0's picture
# ZeroGPU: Qwen2.5-3B + Outlines — schema-conformance demo
# 2a3835f verified
# Raw
# History Blame Contribute Delete
# 8.08 kB
"""Structured Output Playground — lock any LLM's output to a JSON schema.
A local model (Qwen2.5-3B-Instruct) extracts structured data from free text. With
*constrained decoding* on (Outlines), the decoder can only emit tokens that keep the
output conformant to the schema — right keys, right types, valid enums, every time.
Flip constraints off and the same model free-styles: it may wrap JSON in a markdown
fence, or — more subtly — return valid JSON that violates the schema. That contrast
*is* the demo.
Runs on ZeroGPU (H200). No external API, no secrets.
"""
import json
import os
import time
import gradio as gr
import jsonschema
import spaces
import torch
import outlines
from outlines.types import JsonSchema
from transformers import AutoModelForCausalLM, AutoTokenizer
from examples import CONTACT_TEXT, EXAMPLES
from schemas import CUSTOM_LABEL, PRESETS, preset_schema
# Model checkpoint and generation budget; both can be overridden through environment variables.
MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-3B-Instruct")
MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "384"))
# System message placed ahead of every extraction request (see build_prompt).
SYSTEM_PROMPT = (
"You are a precise information-extraction engine. You output a single JSON object and "
"nothing else. Put each value in the field whose meaning it matches, and never copy the "
"same value into two different fields."
)
# Initial contents of the "Custom JSON Schema" editor, pretty-printed with a 2-space indent.
DEFAULT_CUSTOM_SCHEMA = json.dumps(
{
"type": "object",
"properties": {
"summary": {"type": "string"},
"topics": {"type": "array", "items": {"type": "string"}},
"sentiment": {"type": "string", "enum": ["positive", "neutral", "negative"]},
},
"required": ["summary", "sentiment"],
},
indent=2,
)
# The tokenizer and model are loaded once, at import time, and the load is timed for the log.
print(f"[init] loading {MODEL_ID} …")
_t0 = time.perf_counter()
_tok = AutoTokenizer.from_pretrained(MODEL_ID)
# bfloat16 weights, placed on the CUDA device.
_hf = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16, device_map="cuda")
# Outlines wrapper around the HF model/tokenizer pair; this is the callable used by _generate.
model = outlines.from_transformers(_hf, _tok)
print(f"[init] model ready in {time.perf_counter() - _t0:.1f}s")
def resolve_schema(preset: str, custom_schema: str) -> dict:
    """Return the JSON Schema selected by ``preset`` as a parsed object.

    When ``preset`` is the custom option, ``custom_schema`` is parsed with
    ``json.loads`` (a malformed string raises ``json.JSONDecodeError``); any other
    preset is looked up through ``preset_schema``.
    """
    wants_custom = preset == CUSTOM_LABEL
    return json.loads(custom_schema) if wants_custom else preset_schema(preset)
def build_prompt(text: str, schema: dict) -> str:
    """Render the chat-formatted extraction prompt for ``text``.

    The user message names the schema's top-level ``properties`` (when there are
    any), embeds ``text`` between triple quotes, and asks for a bare JSON object.
    The system and user messages are passed through the tokenizer's chat template
    with the generation prompt appended, and the result is returned untokenized.
    """
    # Naming the fields keeps the model from mis-mapping; the grammar enforces structure.
    properties = schema.get("properties", {})
    field_names = ", ".join(properties.keys())
    if field_names:
        hint = f" with these fields: {field_names}"
    else:
        hint = ""

    user_message = "\n\n".join(
        (
            f"Extract the information from the text below as a JSON object{hint}.",
            f'Text:\n"""{text}"""',
            "Return only the JSON object.",
        )
    )
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_message},
    ]
    return _tok.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
@spaces.GPU(duration=60)
def _generate(prompt: str, preset: str, custom_schema: str, constraints_on: bool) -> str:
    """Run the model on ``prompt`` and return its output.

    With ``constraints_on`` false the model is called with only the token budget.
    Otherwise an output type is passed as well: a ``JsonSchema`` built from
    ``custom_schema`` for the custom preset, or the ``PRESETS`` entry for ``preset``.
    """
    # Simple, picklable args only (ZeroGPU forks a worker); rebuild the output type here.
    call_kwargs = {"max_new_tokens": MAX_NEW_TOKENS}
    if constraints_on:
        if preset == CUSTOM_LABEL:
            call_kwargs["output_type"] = JsonSchema(json.loads(custom_schema))
        else:
            call_kwargs["output_type"] = PRESETS[preset]
    return model(prompt, **call_kwargs)
def extract(text, preset, custom_schema, constraints_on):
    """Extract structured data from free text as JSON that conforms to a schema.

    With constraints on, the output is guaranteed valid against the chosen schema
    (right keys, types and enums) via constrained decoding.

    Args:
        text: The free text to extract structured information from.
        preset: Which schema to use — "Contact card", "Product", "Job posting",
            "Event", or "Custom (edit the schema)".
        custom_schema: A JSON Schema string; used only when preset is the Custom option.
        constraints_on: If true, force the output to match the schema (recommended).

    Returns:
        The extracted JSON (string) and a short validity/status badge (markdown).
    """
    text = (text or "").strip()
    if not text:
        return "", "Paste some text first."

    try:
        # A missing (None) schema is treated like an empty string, so it is reported
        # as invalid JSON instead of raising TypeError inside json.loads.
        schema = resolve_schema(preset, custom_schema or "")
    except json.JSONDecodeError as e:
        return "", f"❌ Your custom schema is not valid JSON: {e}"

    # Fail fast, before any generation: the prompt builder reads `schema` as a mapping,
    # and jsonschema.validate below would raise SchemaError on a malformed schema.
    if not isinstance(schema, dict):
        return "", "❌ The schema must be a JSON object, e.g. `{\"type\": \"object\"}`."
    try:
        jsonschema.validators.validator_for(schema).check_schema(schema)
    except jsonschema.SchemaError as e:
        return "", f"❌ The schema is not a valid JSON Schema: {e.message}"

    prompt = build_prompt(text, schema)
    t0 = time.perf_counter()
    raw = _generate(prompt, preset, custom_schema, constraints_on)
    dt = time.perf_counter() - t0

    # 1) Is it even JSON?
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError as e:
        why = (
            "the model wrapped its answer in a markdown code fence (```)"
            if raw.lstrip().startswith("```")
            else f"`{e}`"
        )
        note = (
            "But constraints were ON — please report this."
            if constraints_on
            else "Constrained decoding never emits a fence or prose — always parseable."
        )
        return raw, f"❌ **Not valid JSON** · {dt:.1f}s — {why}. {note}"

    # 2) Does it actually conform to the schema (types, enums, required)?
    try:
        jsonschema.validate(parsed, schema)
    except jsonschema.ValidationError as e:
        shown = json.dumps(parsed, indent=2, ensure_ascii=False)
        note = (
            "But constraints were ON — please report this."
            if constraints_on
            else "Constrained decoding would have *forced* the right type/enum here."
        )
        return shown, f"⚠️ **Valid JSON, but it breaks the schema** · {dt:.1f}s\n\n`{e.message}` — {note}"

    pretty = json.dumps(parsed, indent=2, ensure_ascii=False)
    extra = "" if constraints_on else " — the model complied this time, but nothing *forced* it to."
    return pretty, f"✅ **Valid & schema-conformant** · {dt:.1f}s{extra}"
def on_preset_change(preset):
    """Return a Gradio update that is visible only when the custom preset is selected."""
    is_custom = preset == CUSTOM_LABEL
    return gr.update(visible=is_custom)
# Markdown header passed to gr.Markdown at the top of the page.
INTRO = """
# 🔒 Structured Output Playground
**Lock any LLM's output to a JSON schema.** A local model (Qwen2.5-3B) extracts structured data from
free text. With **constraints ON**, the decoder can only emit tokens that keep the output
**conformant to your schema** — right keys, right *types*, valid *enums*, every time.
Flip **OFF** and the same model free-styles: it may wrap the JSON in a markdown fence, or — more
subtly — return *valid JSON that violates your schema* (a string where you asked for an integer, a
value outside your enum). A good model complies *often*; constrained decoding makes it **always**.
Runs on **ZeroGPU** (H200) · no external API, no secrets · built by
[Ferr0](https://huggingface.co/Ferr0) · [pixelium.win](https://pixelium.win) · [GitHub](https://github.com/ferr079)
"""
# Gradio UI: schema/text inputs, an output pane, example rows, and the event wiring.
# NOTE(review): leading indentation is missing in this copy of the file; the bodies of the
# `with` blocks below must be re-indented before this section can run — confirm the exact
# nesting against the original source.
with gr.Blocks(title="Structured Output Playground") as demo:
gr.Markdown(INTRO)
with gr.Row():
with gr.Column(scale=1):
# Schema picker: every preset name plus the custom option.
preset = gr.Dropdown(
choices=list(PRESETS.keys()) + [CUSTOM_LABEL],
value="Contact card",
label="Schema preset",
)
# Custom-schema editor; starts hidden and is toggled by on_preset_change.
custom = gr.Code(
value=DEFAULT_CUSTOM_SCHEMA,
language="json",
label="Custom JSON Schema",
visible=False,
)
# Source text to extract from, pre-filled with the contact example.
text = gr.Textbox(
value=CONTACT_TEXT,
lines=8,
label="Source text",
placeholder="Paste any text to extract from…",
)
constraints = gr.Checkbox(value=True, label="Constraints ON (force schema)")
go = gr.Button("Extract", variant="primary")
with gr.Column(scale=1):
# Outputs: the extracted JSON and the markdown status badge returned by extract().
out = gr.Code(label="Extracted JSON", language="json")
badge = gr.Markdown()
# Example rows supply values for the text, preset and constraints inputs.
gr.Examples(examples=EXAMPLES, inputs=[text, preset, constraints])
# Changing the preset updates the custom editor's visibility; api_name=False keeps this
# UI-only handler from being exposed as an API endpoint.
preset.change(on_preset_change, inputs=preset, outputs=custom, api_name=False)
# The button runs extract() and fills both outputs.
go.click(extract, inputs=[text, preset, custom, constraints], outputs=[out, badge])
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
demo.launch()