File size: 8,711 Bytes
a78dbe0
 
0d6ff7a
a78dbe0
 
0d6ff7a
d7ce295
a78dbe0
 
d7ce295
0d6ff7a
a78dbe0
d7ce295
 
be027a7
 
 
 
 
 
d7ce295
 
be027a7
d7ce295
a78dbe0
be027a7
d7ce295
0d6ff7a
d7ce295
a78dbe0
 
0d6ff7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be027a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d6ff7a
 
be027a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a78dbe0
 
be027a7
a78dbe0
be027a7
 
 
 
 
 
 
 
 
 
 
 
d7ce295
0d6ff7a
 
 
 
 
 
a78dbe0
 
 
 
0d6ff7a
 
a78dbe0
0d6ff7a
a78dbe0
 
0d6ff7a
be027a7
a78dbe0
0d6ff7a
 
 
be027a7
0d6ff7a
be027a7
a78dbe0
 
 
 
be027a7
 
 
 
 
 
a78dbe0
 
 
be027a7
a78dbe0
 
 
 
be027a7
 
a78dbe0
 
 
 
 
 
 
 
 
 
 
 
be027a7
 
 
 
 
 
 
a78dbe0
 
be027a7
a78dbe0
 
 
be027a7
a78dbe0
 
be027a7
 
a78dbe0
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
import gradio as gr
import json
import re
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, StoppingCriteria, StoppingCriteriaList
from peft import PeftModel

# ── Load model once at startup ──────────────────────────────
# Everything in this section runs at import time, so the tokenizer and the
# merged model are created once and shared by every request.
BASE_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
LORA_MODEL = "suneeldk/json-extract"  # ← change this

# The tokenizer is loaded from the adapter repo, not from BASE_MODEL.
tokenizer = AutoTokenizer.from_pretrained(LORA_MODEL)

# 4-bit NF4 quantization; float16 is used as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Base model is loaded quantized; device placement is delegated to
# device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
)

# Attach the LoRA adapter, fold it into the base weights, and switch to
# eval mode for inference.
# NOTE(review): the merge is performed on a 4-bit quantized base model —
# confirm the installed PEFT version supports this without unacceptable
# rounding error.
model = PeftModel.from_pretrained(base_model, LORA_MODEL)
model = model.merge_and_unload()
model.eval()


# ── Stop generation when JSON is complete ───────────────────
class StopOnJsonComplete(StoppingCriteria):
    """Stop generating once the new tokens contain a complete JSON object.

    The criterion decodes only the tokens produced after the prompt and
    scans them for a balanced top-level ``{...}``. Braces that appear
    inside double-quoted JSON strings are ignored, so a value such as
    ``"note": "use } here"`` does not end generation prematurely.

    Only the first sequence of the batch (``input_ids[0]``) is inspected.
    """

    def __init__(self, tokenizer, prompt_length):
        """Remember the tokenizer and how many leading tokens are prompt.

        Args:
            tokenizer: Object with a ``decode(ids, skip_special_tokens=...)``
                method used to turn generated ids back into text.
            prompt_length: Number of tokens at the start of ``input_ids``
                that belong to the prompt and must be skipped.
        """
        self.tokenizer = tokenizer
        self.prompt_length = prompt_length

    def __call__(self, input_ids, scores, **kwargs):
        """Return True when the generated text holds a closed JSON object.

        Args:
            input_ids: Token ids generated so far (prompt included); only
                row 0 is examined.
            scores: Unused; accepted to match the stopping-criteria call
                signature.
            **kwargs: Ignored.

        Returns:
            True if the decoded continuation starts with ``{`` and its
            top-level object has been closed, otherwise False.
        """
        new_tokens = input_ids[0][self.prompt_length:]
        text = self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        if not text.startswith("{"):
            return False

        depth = 0
        in_string = False  # currently inside a double-quoted JSON string
        escaped = False    # previous char was a backslash inside a string
        for char in text:
            if in_string:
                # Inside a string only an unescaped quote matters; braces
                # here are data, not structure.
                if escaped:
                    escaped = False
                elif char == "\\":
                    escaped = True
                elif char == '"':
                    in_string = False
                continue
            if char == '"':
                in_string = True
            elif char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth == 0:
                    return True  # top-level object is closed, stop
        return False


# ── Extract first valid JSON from text ──────────────────────
def extract_json(text):
    """Find and return the first complete JSON object in text.

    Every ``{`` in ``text`` is tried, left to right, as the start of a JSON
    document. Parsing is delegated to ``json.JSONDecoder.raw_decode``, which
    tolerates trailing data after the object and correctly handles braces
    inside string values. Stray ``}`` characters before the object do not
    prevent it from being found.

    Args:
        text: Arbitrary text that may contain a JSON object.

    Returns:
        The decoded object (a dict) for the first position that parses, or
        None when no position yields valid JSON.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the first valid document, so
            # trailing garbage after the object is ignored.
            parsed, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not a valid object from here; try the next opening brace.
            start = text.find("{", start + 1)
        else:
            return parsed
    return None


# ── Auto-detect schema from text ────────────────────────────
def auto_schema(text):
    """Guess an extraction schema from keywords found in ``text``.

    The text is lower-cased and checked against several keyword lists; each
    list that matches adds one or more fields to the schema. A ``note``
    field is always added. If nothing else matched, a default schema of
    amount / person / date / note is returned instead.

    Matching is plain substring containment, so short keywords such as
    "to", "on" or "at" also match inside longer words.

    Args:
        text: The user's input sentence.

    Returns:
        A dict mapping field names to type-hint strings (for example
        ``"number|null"``). Key insertion order is amount, person, date,
        time, item, quantity, location or from_location/to_location,
        topic, note.
    """
    text_lower = text.lower()
    schema = {}

    def mentions(keywords):
        """Return True if any keyword occurs as a substring of the text."""
        return any(k in text_lower for k in keywords)

    # The rupee sign is written as the real character; the previous
    # mis-encoded byte sequence could never match user input.
    money_keywords = ["paid", "sent", "received", "cost", "price", "rupees", "rs",
                      "₹", "$", "bought", "sold", "charged", "fee", "salary",
                      "budget", "owes", "owe", "lent", "borrowed", "fare", "rent"]
    # Any digit in the original text also counts as a hint for an amount.
    if mentions(money_keywords) or any(c.isdigit() for c in text):
        schema["amount"] = "number|null"

    person_keywords = ["to", "from", "with", "for", "by", "told", "asked",
                       "met", "called", "emailed", "messaged", "owes", "owe"]
    if mentions(person_keywords):
        schema["person"] = "string|null"

    date_keywords = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug",
                     "sep", "oct", "nov", "dec", "monday", "tuesday", "wednesday",
                     "thursday", "friday", "saturday", "sunday", "today", "tomorrow",
                     "yesterday", "morning", "evening", "night", "on", "at", "pm", "am"]
    if mentions(date_keywords):
        schema["date"] = "ISO date|null"
        # A time field is only considered once a date hint is present.
        if mentions(["pm", "am", "morning", "evening", "night", "at"]):
            schema["time"] = "string|null"

    item_keywords = ["bought", "ordered", "purchased", "delivered", "shipped",
                     "kg", "litre", "pieces", "items", "pack", "bottle"]
    if mentions(item_keywords):
        schema["item"] = "string|null"
        schema["quantity"] = "string|null"

    location_keywords = ["store", "shop", "restaurant", "station", "airport",
                         "hotel", "office"]
    if mentions(location_keywords):
        schema["location"] = "string|null"

    travel_keywords = ["train", "flight", "bus", "booked", "ticket", "pnr",
                       "travel", "trip", "journey"]
    if mentions(travel_keywords):
        schema["from_location"] = "string|null"
        schema["to_location"] = "string|null"
        # Travel uses an origin/destination pair instead of one location.
        schema.pop("location", None)

    meeting_keywords = ["meeting", "call", "discuss", "review", "presentation",
                        "interview", "appointment", "schedule"]
    if mentions(meeting_keywords):
        schema["topic"] = "string|null"

    schema["note"] = "string|null"

    # Only "note" present means nothing matched: fall back to the default.
    if len(schema) <= 1:
        schema = {
            "amount": "number|null",
            "person": "string|null",
            "date": "ISO date|null",
            "note": "string|null",
        }

    return schema


# ── Inference function ──────────────────────────────────────
@spaces.GPU
def extract(text, custom_schema):
    """Run the model on ``text`` and return (result, schema) as strings.

    Args:
        text: Sentence to extract structured data from.
        custom_schema: Optional JSON string describing the schema; when it
            is empty or whitespace the schema is chosen by ``auto_schema``.

    Returns:
        A 2-tuple of strings. The first is the pretty-printed JSON parsed
        from the model output, or the raw decoded output when no non-empty
        JSON object could be parsed. The second is the pretty-printed
        schema that was used. Blank input yields ``("", "")`` and an
        unparsable custom schema yields ``("Invalid JSON schema.", "")``.
    """
    if not text.strip():
        return "", ""

    # Pick the schema: user-supplied JSON wins, otherwise auto-detect.
    has_custom_schema = bool(custom_schema and custom_schema.strip())
    if has_custom_schema:
        try:
            schema = json.loads(custom_schema)
        except json.JSONDecodeError:
            return "Invalid JSON schema.", ""
    else:
        schema = auto_schema(text)

    # Build the prompt in the "### Input / Schema / Output" layout.
    prompt = f"### Input: {text}\n### Schema: {json.dumps(schema)}\n### Output:"
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    n_prompt_tokens = encoded["input_ids"].shape[1]

    # Greedy decoding, halted as soon as a full JSON object has appeared.
    stoppers = StoppingCriteriaList([StopOnJsonComplete(tokenizer, n_prompt_tokens)])
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=128,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            stopping_criteria=stoppers,
        )

    # Decode only the continuation, not the echoed prompt.
    continuation = generated[0][n_prompt_tokens:]
    raw_output = tokenizer.decode(continuation, skip_special_tokens=True).strip()

    # Keep just the JSON; fall back to the raw text if none was parsed.
    result = extract_json(raw_output)
    if not result:
        return raw_output, json.dumps(schema, indent=2)
    return json.dumps(result, indent=2, ensure_ascii=False), json.dumps(schema, indent=2)


# ── Example inputs ──────────────────────────────────────────
# Sample sentences offered in the UI. Each entry is a one-element list
# because the examples widget is wired to a single input (the text box).
examples = [
    ["Paid 500 to Ravi for lunch on Jan 5"],
    ["Meeting with Sarah at 3pm tomorrow to discuss the project budget of $10,000"],
    ["Bought 3 kg of rice from Krishna Stores for 250 rupees on March 10"],
    ["Booked a train from Chennai to Bangalore on April 10 for 750 rupees"],
    ["Ravi owes me 300 for last week's dinner"],
    ["Ordered 2 pizzas and 1 coke from Dominos for 850 rupees"],
]

# ── Gradio UI ───────────────────────────────────────────────
# Two-column layout: input text, button and optional custom schema on the
# left; extracted JSON and the schema that was used on the right.
with gr.Blocks(title="json-extract") as demo:
    # Intro text; the dash is a real em dash (it was previously mis-encoded
    # and rendered as garbage characters).
    gr.Markdown(
        """
        # json-extract
        Extract structured JSON from natural language text.

        Just type a sentence — the model auto-detects the right schema and extracts clean JSON.
        """
    )

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="e.g. Paid 500 to Ravi for lunch on Jan 5",
                lines=3,
            )
            btn = gr.Button("Extract", variant="primary")

            # Collapsed by default; leaving it empty selects auto-detection.
            with gr.Accordion("Advanced: Custom Schema (optional)", open=False):
                schema_input = gr.Textbox(
                    label="Custom JSON Schema",
                    placeholder='Leave empty for auto-detect, or enter e.g. {"amount": "number", "person": "string|null"}',
                    lines=3,
                )

        with gr.Column():
            output = gr.Textbox(label="Extracted JSON", lines=10)
            detected_schema = gr.Textbox(label="Schema Used", lines=5)

    # Examples only populate the text box; the custom schema stays as is.
    gr.Examples(
        examples=examples,
        inputs=[text_input],
    )

    # Both the button and pressing Enter in the text box run extraction.
    btn.click(fn=extract, inputs=[text_input, schema_input], outputs=[output, detected_schema])
    text_input.submit(fn=extract, inputs=[text_input, schema_input], outputs=[output, detected_schema])

demo.launch()