File size: 10,332 Bytes
865237e
 
 
 
 
 
 
 
 
 
 
 
 
 
37978e1
 
 
 
 
 
 
 
 
 
865237e
 
 
 
 
 
 
 
37978e1
de686dc
 
865237e
 
 
 
 
 
 
37978e1
 
 
de686dc
 
37978e1
865237e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37978e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
865237e
 
37978e1
865237e
 
 
 
37978e1
 
 
 
 
 
 
 
de686dc
 
 
 
 
37978e1
 
 
 
 
 
 
 
 
 
 
 
 
865237e
 
 
 
 
 
 
 
37978e1
865237e
 
 
 
 
 
37978e1
865237e
 
 
 
de686dc
 
 
 
37978e1
865237e
 
 
 
 
37978e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
de686dc
 
 
 
37978e1
 
 
 
 
 
 
 
865237e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37978e1
865237e
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
"""AI-generated form field definitions per commodity — cached in SQLite for consistency."""

from __future__ import annotations

import json
import os
import sqlite3
from typing import Any

from openai import OpenAI

from server.catalog import get_commodity, summarize_row
from server.pr_lines import INTERVAL_ORDER

# Options substituted when the model (or a legacy cache entry) emits a text
# field or a select/chips field with too few usable options.
GENERIC_SELECT_FALLBACK = (
    "Standard / typical requirement",
    "Enhanced vs baseline",
    "Economy / essential only",
    "Pilot or limited scope",
    "Strategic priority program",
    "Other — use specification notes below",
)

# System prompt sent to the chat model when generating a commodity form schema.
SCHEMA_GEN_SYSTEM = """You design procurement intake forms for a single catalogue commodity (segment → family → class → commodity).

Return ONE JSON object only (no markdown). Shape:
{
  "fields": [
    {
      "id": "stable_snake_case_id",
      "label": "Full question text shown to the user (no Q1/Q2 prefixes)",
      "type": "select" | "chips" | "number",
      "options": ["required for select and chips: 3–12 distinct, short option strings"],
      "unit": "ONLY for type number: short suffix shown next to the input (e.g. kg, lb, mm, in, %)"
    }
  ]
}

Rules:
- 3 to 7 fields. Labels must be clear procurement questions for THIS commodity type.
- Do NOT include: number of deliveries, delivery interval/frequency, year for scheduling, or a generic "other / free text specifications" field — the application collects those separately.
- **No open-ended typing:** NEVER use type "text" or "textarea". Users must tap choices only.
- Prefer **select** with 5–12 concise options for objectives, scope, methodology, audience, timing, risk, quality level, etc.
- Use **chips** for 3–8 mutually exclusive options when labels are short (single choice — same as select, shown as buttons).
- Use **number** only for true numeric values (counts, currency amounts, percentages, sizes, weights, dimensions).
- For **every number field**, set **"unit"** to the metric users should enter (e.g. `"kg"` for weight capacity, `"mm"` for seat depth, `"lb"` only if Imperial is explicit). Never leave unit ambiguous when the question is a measurement.
- Every option string must be self-contained (no reliance on free-form explanations). If a case might need nuance, add an option such as "Other — see specification notes below".
- Use stable `id` values (snake_case) — they are keys in saved data.
- Same commodity must always get the same structure when regenerated; the app caches by commodity code, but ids and intent must stay consistent if you see similar commodities.
"""


def ensure_form_schema_table(conn: sqlite3.Connection) -> None:
    """Create the ``commodity_form_schemas`` cache table when it is absent.

    Runs a ``CREATE TABLE IF NOT EXISTS`` statement on *conn* and commits, so
    calling it repeatedly is harmless.
    """
    ddl = """
        CREATE TABLE IF NOT EXISTS commodity_form_schemas (
            commodity_code INTEGER PRIMARY KEY,
            schema_json TEXT NOT NULL,
            updated_at TEXT DEFAULT CURRENT_TIMESTAMP
        )
        """
    conn.execute(ddl)
    conn.commit()


def _load_cached(conn: sqlite3.Connection, commodity_code: int) -> dict[str, Any] | None:
    """Return the cached schema for *commodity_code*, or ``None`` on a miss.

    A row whose ``schema_json`` cannot be parsed, or whose top-level JSON value
    is not an object, is treated as a miss so callers can rely on getting a
    ``dict`` (or ``None``) back.
    """
    row = conn.execute(
        "SELECT schema_json FROM commodity_form_schemas WHERE commodity_code = ?",
        (commodity_code,),
    ).fetchone()
    if not row:
        return None
    try:
        payload = json.loads(row[0])
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError; TypeError covers a NULL or
        # otherwise non-text column value.
        return None
    # Valid JSON is not necessarily an object (e.g. "[]" or "3").
    return payload if isinstance(payload, dict) else None


def _save_cache(conn: sqlite3.Connection, commodity_code: int, schema: dict[str, Any]) -> None:
    """Upsert the JSON-serialized *schema* for *commodity_code* and commit.

    An existing row for the same commodity code has its ``schema_json`` and
    ``updated_at`` columns overwritten.
    """
    serialized = json.dumps(schema, ensure_ascii=False)
    upsert_sql = """
        INSERT INTO commodity_form_schemas (commodity_code, schema_json, updated_at)
        VALUES (?, ?, datetime('now'))
        ON CONFLICT(commodity_code) DO UPDATE SET
          schema_json = excluded.schema_json,
          updated_at = datetime('now')
        """
    conn.execute(upsert_sql, (commodity_code, serialized))
    conn.commit()


# Raw field definitions for the generic schema returned by _fallback_schema().
_FALLBACK_FIELDS_RAW: list[dict[str, Any]] = [
    {
        "id": "primary_scope",
        "label": "What is the primary scope or geography for this requirement?",
        "type": "select",
        "options": [
            "Local / single site",
            "Regional",
            "National",
            "International",
            "Multi-region program",
            "To be determined",
        ],
    },
    {
        "id": "scale_band",
        "label": "What scale band best matches expected volume or spend?",
        "type": "chips",
        "options": [
            "Pilot / small",
            "Medium",
            "Large",
            "Enterprise-wide",
            "Not yet estimated",
        ],
    },
    {
        "id": "compliance_focus",
        "label": "Which compliance themes apply (if any)?",
        "type": "select",
        "options": [
            "None identified yet",
            "Data privacy / residency",
            "Safety / quality standards",
            "Financial / audit controls",
            "Industry-specific regulations",
            "Mixed — see specification notes",
        ],
    },
]


def _fallback_schema() -> dict[str, Any]:
    """Build the generic schema from ``_FALLBACK_FIELDS_RAW``.

    Each raw definition is shallow-copied and passed through
    ``_coerce_field_selectable``; the result is tagged ``"source": "fallback"``.
    """
    coerced_fields: list[dict[str, Any]] = []
    for raw_field in _FALLBACK_FIELDS_RAW:
        coerced_fields.append(_coerce_field_selectable(raw_field.copy()))
    return {"fields": coerced_fields, "source": "fallback"}


def _coerce_field_selectable(entry: dict[str, Any]) -> dict[str, Any]:
    """Ensure a field is selectable (select/chips) or number — never free-text.

    Returns a new dict; *entry* is not mutated.

    * ``number`` fields lose any ``options`` key; ``unit`` is stripped and
      truncated to 24 characters, or removed when blank.
    * Every other type (including ``text``/``textarea`` and unknown values)
      becomes ``select`` unless it is already ``select`` or ``chips``.
      Options are stringified, stripped, and de-duplicated in first-seen
      order; null and blank entries are dropped. When fewer than two distinct
      options remain, ``GENERIC_SELECT_FALLBACK`` is used instead.
    """
    typ = str(entry.get("type") or "select").lower()

    if typ == "number":
        out = {**entry, "type": "number"}
        out.pop("options", None)
        unit = str(out.get("unit") or "").strip()
        if unit:
            out["unit"] = unit[:24]
        else:
            out.pop("unit", None)
        return out

    if typ not in ("select", "chips"):
        # Covers "text", "textarea" and anything unrecognized.
        typ = "select"

    opts_raw = entry.get("options")
    clean: list[str] = []
    if isinstance(opts_raw, list):
        # Skip JSON nulls so they do not surface as the literal option "None".
        stripped = (str(o).strip() for o in opts_raw if o is not None)
        # dict.fromkeys keeps first-seen order while removing duplicates, so
        # repeats neither show twice nor count toward the two-option minimum.
        clean = list(dict.fromkeys(s for s in stripped if s))
    if len(clean) < 2:
        clean = list(GENERIC_SELECT_FALLBACK)
    return {**entry, "type": typ, "options": clean}


def _validate_and_normalize(raw: dict[str, Any]) -> dict[str, Any]:
    """Normalize a model-produced schema payload into selectable fields.

    Entries that are not dicts, lack a non-empty ``id`` or ``label``, or repeat
    an earlier ``id`` are dropped. Unrecognized types become ``select``. Each
    surviving field is passed through ``_coerce_field_selectable``.

    Returns ``{"fields": [...], "source": "openai"}``. When the payload is not
    a dict, its ``fields`` value is not a list, or no field survives, the
    result of ``_fallback_schema()`` is returned instead of raising.
    """
    # The parsed JSON may be any JSON value; only an object with a list of
    # fields is usable.
    fields_in = raw.get("fields") if isinstance(raw, dict) else None
    if not isinstance(fields_in, list):
        fields_in = []

    fields_out: list[dict[str, Any]] = []
    seen_ids: set[str] = set()
    for f in fields_in:
        if not isinstance(f, dict):
            continue
        fid = str(f.get("id") or "").strip()
        label = str(f.get("label") or "").strip()
        if not fid or not label or fid in seen_ids:
            continue
        seen_ids.add(fid)

        typ = str(f.get("type") or "select").lower()
        if typ not in ("select", "number", "text", "chips", "textarea"):
            typ = "select"

        entry: dict[str, Any] = {"id": fid, "label": label, "type": typ}
        opts = f.get("options")
        if typ in ("select", "chips") and isinstance(opts, list) and opts:
            entry["options"] = [str(o) for o in opts if str(o).strip()]
        if typ == "number":
            u = str(f.get("unit") or "").strip()
            if u:
                entry["unit"] = u[:24]
        # text/textarea are allowed through here; the coercion step converts
        # them to select.
        fields_out.append(_coerce_field_selectable(entry))

    if not fields_out:
        return _fallback_schema()
    return {"fields": fields_out, "source": "openai"}


def _coerce_cached_schema(cached: dict[str, Any]) -> dict[str, Any]:
    """Upgrade legacy cached schemas (text/textarea) to selectable controls.

    Fields that are not dicts, lack a non-empty ``id`` or ``label``, or repeat
    an earlier ``id`` are dropped; the rest are passed through
    ``_coerce_field_selectable``.

    Returns a copy of *cached* with the rebuilt ``fields``, a ``source``
    (defaulting to ``"cache"``) and ``interval_options``. When no field
    survives, the result of ``_fallback_schema()`` is returned, also carrying
    ``interval_options`` so every return path has the same shape.
    """
    fields_in = cached.get("fields")
    if not isinstance(fields_in, list):
        # A truthy non-list value (e.g. a number) cannot be iterated as fields.
        fields_in = []

    fields_out: list[dict[str, Any]] = []
    seen_ids: set[str] = set()
    for f in fields_in:
        if not isinstance(f, dict):
            continue
        fid = str(f.get("id") or "").strip()
        label = str(f.get("label") or "").strip()
        if not fid or not label or fid in seen_ids:
            continue
        seen_ids.add(fid)
        typ = str(f.get("type") or "select").lower()
        entry: dict[str, Any] = {"id": fid, "label": label, "type": typ}
        opts = f.get("options")
        if typ in ("select", "chips") and isinstance(opts, list) and opts:
            entry["options"] = [str(o) for o in opts if str(o).strip()]
        if typ == "number":
            u = str(f.get("unit") or "").strip()
            if u:
                entry["unit"] = u[:24]
        fields_out.append(_coerce_field_selectable(entry))

    if fields_out:
        out = {**cached, "fields": fields_out, "source": cached.get("source", "cache")}
    else:
        out = _fallback_schema()
    out["interval_options"] = INTERVAL_ORDER
    return out


def generate_schema_with_llm(row: dict[str, Any]) -> dict[str, Any]:
    """Ask the chat model for a form schema describing the commodity in *row*.

    Returns ``_fallback_schema()`` when ``OPENAI_API_KEY`` is unset or the
    reply is not valid JSON; otherwise returns the reply after
    ``_validate_and_normalize``. The model name comes from ``OPENAI_MODEL``
    (default ``"gpt-4o-mini"``).

    Exceptions raised by the OpenAI client call itself are not caught here and
    propagate to the caller.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        return _fallback_schema()

    summary = summarize_row(row)
    # Only these summary keys are forwarded to the model, in this order.
    prompt_keys = (
        "segment_code",
        "family_code",
        "class_code",
        "commodity_code",
        "path",
        "commodity_title",
        "commodity_definition",
    )
    user_block = json.dumps(
        {key: summary.get(key) for key in prompt_keys},
        ensure_ascii=False,
    )

    client = OpenAI(api_key=api_key)
    conversation = [
        {"role": "system", "content": SCHEMA_GEN_SYSTEM},
        {"role": "user", "content": user_block},
    ]
    completion = client.chat.completions.create(
        model=os.environ.get("OPENAI_MODEL", "gpt-4o-mini"),
        messages=conversation,
        temperature=0.2,
        response_format={"type": "json_object"},
    )

    content = completion.choices[0].message.content
    reply_text = content.strip() if content else ""
    try:
        parsed = json.loads(reply_text)
    except json.JSONDecodeError:
        return _fallback_schema()
    else:
        return _validate_and_normalize(parsed)


def get_or_create_schema(conn: sqlite3.Connection, commodity_code: int) -> dict[str, Any]:
    """Return the form schema for *commodity_code*, generating it on a cache miss.

    Lookup order:

    1. A cached schema with a non-empty ``fields`` value is upgraded via
       ``_coerce_cached_schema`` and returned.
    2. Otherwise the commodity row is fetched; if it does not exist, a dict
       with empty ``fields`` and ``"error": "commodity_not_found"`` is
       returned.
    3. Otherwise a schema is generated with ``generate_schema_with_llm``.

    Every returned dict carries ``interval_options``. A generated schema is
    written to the cache only when it has fields and is not the generic
    fallback, so a missing API key or an unusable model reply does not pin the
    commodity to the placeholder form.
    """
    ensure_form_schema_table(conn)

    cached = _load_cached(conn, commodity_code)
    if isinstance(cached, dict) and cached.get("fields"):
        schema = _coerce_cached_schema(cached)
        # Set here as well so the key is present on every return path.
        schema["interval_options"] = INTERVAL_ORDER
        return schema

    row = get_commodity(conn, commodity_code)
    if not row:
        return {"fields": [], "error": "commodity_not_found", "interval_options": INTERVAL_ORDER}

    schema = generate_schema_with_llm(row)
    # Persist only real generated schemas; the fallback is rebuilt on demand,
    # which lets a later request retry generation.
    if schema.get("fields") and schema.get("source") != "fallback":
        _save_cache(conn, commodity_code, schema)
    schema["interval_options"] = INTERVAL_ORDER
    return schema