Spaces:
Sleeping
Sleeping
| """Gradio demo for StructFix: JSON/tool-call repair against a schema. | |
| The user provides: | |
| - A DSL schema (typed fields with enums and required flags), AND | |
| - A broken or loosely structured output: malformed JSON or free-form text (the model will try to repair it anyway) | |
| The model is expected to return a valid JSON output that matches the schema; the demo reports whether the result actually parses as JSON. | |
| """ | |
| import os | |
| os.environ.setdefault("DISABLE_ONNXRUNTIME_GPU", "1") | |
| import json | |
| import gradio as gr | |
| from transformers import AutoModelForSeq2SeqLM, AutoTokenizer | |
# Hugging Face Hub identifier of the checkpoint this demo loads.
MODEL_ID = "ottema/structfix-codet5p-220m"

# Module-level cache; both stay None until the first get_model() call.
_model = None
_tokenizer = None


def get_model():
    """Return the ``(model, tokenizer)`` pair, loading it on first use.

    The first call creates both objects from ``MODEL_ID`` and stores them in
    module globals; every later call returns the stored objects unchanged.
    """
    global _model, _tokenizer
    if _model is not None:
        # Already loaded: hand back the cached pair.
        return _model, _tokenizer
    _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    _model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
    return _model, _tokenizer
# Schema presets offered in the "Schema preset" dropdown, keyed by label.
# Each value is DSL text with one FIELD declaration per line; the empty
# "Custom" entry leaves the schema box blank for the user's own schema.
PRESET_SCHEMAS = {
    "Customer Support Ticket": "\n".join((
        "FIELD priority TYPE string VALUES low|medium|high|urgent REQUIRED yes",
        "FIELD category TYPE string VALUES billing|technical|account|other REQUIRED yes",
        "FIELD description TYPE string REQUIRED yes",
        "FIELD needs_callback TYPE boolean REQUIRED no",
    )),
    "Task Management": "\n".join((
        "FIELD task TYPE string REQUIRED yes",
        "FIELD assignee TYPE string REQUIRED no",
        "FIELD deadline TYPE string REQUIRED no",
        "FIELD priority TYPE string VALUES low|medium|high REQUIRED no",
        "FIELD status TYPE string VALUES todo|in_progress|done REQUIRED no",
    )),
    "Code Review Comment": "\n".join((
        "FIELD file_path TYPE string REQUIRED yes",
        "FIELD line_number TYPE integer REQUIRED no",
        "FIELD severity TYPE string VALUES nit|warning|error REQUIRED yes",
        "FIELD message TYPE string REQUIRED yes",
    )),
    "Restaurant Review": "\n".join((
        "FIELD rating TYPE integer REQUIRED yes",
        "FIELD cuisine TYPE string REQUIRED no",
        "FIELD price_range TYPE string VALUES $|$$|$$$|$$$$ REQUIRED no",
        "FIELD comments TYPE string REQUIRED no",
    )),
    "Custom (write your own)": "",
}
# Broken-output presets offered in the "Broken-output preset" dropdown, keyed
# by label. Each value is the raw text placed in the "Broken output" box; the
# empty "Custom" entry leaves the box blank.
PRESET_BROKEN = {
    "Missing required field": '{"priority":"high"}',
    "Invalid enum value": (
        '{"priority":"super-mega-urgent",'
        '"description":"Server crashed"}'
    ),
    "Wrong type": '{"priority":123,"description":"Database is down"}',
    "Markdown-wrapped JSON": (
        "```json\n"
        '{"status":"done","result":"Successfully deployed"}\n'
        "```"
    ),
    "Partial / truncated": '{"status":"succe',
    "Extra surrounding text": (
        "The system returned this response: "
        '{"task":"Review PR","assignee":"alice"}'
        " - end of response."
    ),
    "Code review (análise real)": (
        "Aqui está minha análise:\n\n"
        "```json\n"
        '{"file": "auth.py", "line": 42, "severity": "critical", '
        '"msg": "SQL injection vulnerability on user input"}\n'
        "```\n\n"
        "Por favor revisar."
    ),
    "API de suporte (vazio)": "A chamada retornou: {}",
    "Resposta multilíngue (PT-BR)": (
        "O sistema retornou isto: "
        '{"tarefa":"Revisar PR","responsavel":"alice","status":"em_andamento"}'
    ),
    "Resposta em PT-BR (schema inglês)": (
        '{"status":"concluido",'
        '"descricao":"Deploy foi realizado",'
        '"prioridade":5}'
    ),
    "Custom (write your own)": "",
}
# Rows for the Examples block: each row is [schema preset label,
# broken-output preset label]. The rows are generated group by group so the
# resulting order matches the order of the groups listed here.
EXAMPLES = [
    [schema_label, broken_label]
    for schema_label, broken_labels in (
        # Customer Support
        (
            "Customer Support Ticket",
            (
                "Missing required field",
                "Invalid enum value",
                "Wrong type",
                "Markdown-wrapped JSON",
                "API de suporte (vazio)",
            ),
        ),
        # Task Management
        ("Task Management", ("Partial / truncated", "Extra surrounding text")),
        # Code Review
        (
            "Code Review Comment",
            ("Code review (análise real)", "Markdown-wrapped JSON"),
        ),
        # Restaurant
        ("Restaurant Review", ("Wrong type", "Extra surrounding text")),
        # PT-BR test
        ("Task Management", ("Resposta multilíngue (PT-BR)",)),
        ("Customer Support Ticket", ("Resposta em PT-BR (schema inglês)",)),
    )
    for broken_label in broken_labels
]
def repair(schema, broken_output):
    """Ask the model to repair ``broken_output`` against ``schema``.

    Args:
        schema: Schema text in the demo's DSL. ``None``, empty, or
            whitespace-only input is rejected.
        broken_output: The malformed output to repair. ``None``, empty, or
            whitespace-only input is rejected.

    Returns:
        A ``(status, raw, pretty)`` tuple of strings:

        * ``status`` - a validation message, or whether the decoded model
          output parsed as JSON.
        * ``raw`` - the decoded model output (``""`` when input was rejected).
        * ``pretty`` - the output re-serialized with a 2-space indent when it
          parsed as JSON, otherwise the same text as ``raw``.
    """
    if not schema or not schema.strip():
        return "Empty schema.", "", ""
    if not broken_output or not broken_output.strip():
        return "Empty broken output.", "", ""

    model, tokenizer = get_model()

    # NOTE(review): blank lines may have been lost from this copy of the file;
    # confirm this prompt layout against the format the model was trained on.
    prompt = (
        "TASK repair_structured_output\n"
        "SPEC\n"
        f"{schema}\n"
        "BROKEN_OUTPUT\n"
        f"{broken_output}"
    )

    # The prompt is truncated to 512 tokens, so the tail of a very long
    # broken output may not reach the model.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(
        **inputs,
        max_length=256,
        num_beams=1,
        do_sample=False,
    )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Catch only what parsing/re-serializing can raise here: JSONDecodeError
    # (a ValueError subclass) and RecursionError for pathologically deep
    # nesting. Anything else is a real bug and should surface.
    try:
        pretty = json.dumps(json.loads(text), indent=2, ensure_ascii=False)
    except (ValueError, RecursionError):
        return "⚠️ Could not parse output as JSON", text, text
    return "✅ Valid JSON", text, pretty
# Build the demo UI: two input columns (schema, broken output), a Repair
# button, and three output widgets, all wired to repair().
# NOTE(review): indentation was lost in this copy of the file. The Row/Column
# nesting below (including whether the JSON code view sits inside the output
# Row) is a reconstruction - confirm the layout against the running app.
with gr.Blocks(title="Ottema StructFix Demo", theme=gr.themes.Soft()) as demo:
    # Header text shown at the top of the page.
    gr.Markdown(
        """# Ottema StructFix Demo
Repair broken JSON / tool-call output against a typed schema. Model: [`ottema/structfix-codet5p-220m`](https://huggingface.co/ottema/structfix-codet5p-220m) (220M params, Apache-2.0).
**Use cases:**
- LLM emits invalid JSON → fix it
- Missing required fields → fill or flag
- Wrong enum values → coerce to valid
- Markdown-wrapped output → strip the wrapper
- Partial / truncated JSON → complete it
- Schema-constrained generation recovery
**Pipeline:** user provides a DSL schema + a broken output. Model returns a valid JSON object that matches the schema.
"""
    )
    with gr.Row():
        # Schema side: a preset picker plus an editable text box that starts
        # out holding the default preset's text.
        with gr.Column():
            schema_preset = gr.Dropdown(
                choices=list(PRESET_SCHEMAS.keys()),
                value="Customer Support Ticket",
                label="Schema preset",
            )
            schema_input = gr.Textbox(
                label="Schema (DSL)",
                value=PRESET_SCHEMAS["Customer Support Ticket"],
                lines=6,
            )
        # Broken-output side: same preset-picker + editable-box pairing.
        with gr.Column():
            broken_preset = gr.Dropdown(
                choices=list(PRESET_BROKEN.keys()),
                value="Invalid enum value",
                label="Broken-output preset",
            )
            broken_input = gr.Textbox(
                label="Broken output (anything goes)",
                value=PRESET_BROKEN["Invalid enum value"],
                lines=4,
            )
    run_btn = gr.Button("Repair", variant="primary")
    # Read-only outputs, in the order of repair()'s return tuple.
    with gr.Row():
        status = gr.Textbox(label="Status", interactive=False)
        raw_output = gr.Textbox(label="Raw model output", interactive=False)
    pretty_output = gr.Code(label="Repaired JSON", language="json")
    # Choosing a preset overwrites the matching text box with the preset's
    # text; the "Custom (write your own)" entries map to "" and so clear it.
    schema_preset.change(lambda p: PRESET_SCHEMAS[p], inputs=[schema_preset], outputs=[schema_input])
    broken_preset.change(lambda p: PRESET_BROKEN[p], inputs=[broken_preset], outputs=[broken_input])
    # The button sends the text boxes (not the dropdowns) to repair().
    run_btn.click(repair, inputs=[schema_input, broken_input], outputs=[status, raw_output, pretty_output])
    # Examples target the two preset dropdowns rather than the text boxes.
    # NOTE(review): presumably the .change handlers above then refresh the
    # text boxes - verify with the Gradio version in use.
    gr.Examples(
        examples=EXAMPLES,
        inputs=[schema_preset, broken_preset],
        label="Click any example to load it. Then click Repair.",
    )
    # Footer with background information and credits.
    gr.Markdown(
        """---
**How it works:** the DSL schema is a compact typed-declaration language. Each line declares a field with name, type, optional enum, and required flag. The model was trained on 250k synthetic examples of broken outputs paired with their valid counterparts (see [`ottema/structfix-bench`](https://huggingface.co/datasets/ottema/structfix-bench)).
**Credits:** [CodeT5+](https://github.com/salesforce/CodeT5) backbone (Apache-2.0), fine-tuning and dataset by [Ottema](https://huggingface.co/ottema).
**Try the on-demand API:** see [ottema/structfix-codet5p-220m](https://huggingface.co/ottema/structfix-codet5p-220m) for the inference snippet."""
    )


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()