| """AI-generated form field definitions per commodity — cached in SQLite for consistency.""" |
|
|
| from __future__ import annotations |
|
|
| import json |
| import os |
| import sqlite3 |
| from typing import Any |
|
|
| from openai import OpenAI |
|
|
| from server.catalog import get_commodity, summarize_row |
| from server.pr_lines import INTERVAL_ORDER |
|
|
| |
| GENERIC_SELECT_FALLBACK = ( |
| "Standard / typical requirement", |
| "Enhanced vs baseline", |
| "Economy / essential only", |
| "Pilot or limited scope", |
| "Strategic priority program", |
| "Other — use specification notes below", |
| ) |
|
|
| SCHEMA_GEN_SYSTEM = """You design procurement intake forms for a single catalogue commodity (segment → family → class → commodity). |
| |
| Return ONE JSON object only (no markdown). Shape: |
| { |
| "fields": [ |
| { |
| "id": "stable_snake_case_id", |
| "label": "Full question text shown to the user (no Q1/Q2 prefixes)", |
| "type": "select" | "chips" | "number", |
| "options": ["required for select and chips: 3–12 distinct, short option strings"], |
| "unit": "ONLY for type number: short suffix shown next to the input (e.g. kg, lb, mm, in, %)" |
| } |
| ] |
| } |
| |
| Rules: |
| - 3 to 7 fields. Labels must be clear procurement questions for THIS commodity type. |
| - Do NOT include: number of deliveries, delivery interval/frequency, year for scheduling, or a generic "other / free text specifications" field — the application collects those separately. |
| - **No open-ended typing:** NEVER use type "text" or "textarea". Users must tap choices only. |
| - Prefer **select** with 5–12 concise options for objectives, scope, methodology, audience, timing, risk, quality level, etc. |
| - Use **chips** for 3–8 mutually exclusive options when labels are short (single choice — same as select, shown as buttons). |
| - Use **number** only for true numeric values (counts, currency amounts, percentages, sizes, weights, dimensions). |
| - For **every number field**, set **"unit"** to the metric users should enter (e.g. `"kg"` for weight capacity, `"mm"` for seat depth, `"lb"` only if Imperial is explicit). Never leave unit ambiguous when the question is a measurement. |
| - Every option string must be self-contained (no reliance on free-form explanations). If a case might need nuance, add an option such as "Other — see specification notes below". |
| - Use stable `id` values (snake_case) — they are keys in saved data. |
| - Same commodity must always get the same structure when regenerated; the app caches by commodity code, but ids and intent must stay consistent if you see similar commodities. |
| """ |
|
|
|
|
| def ensure_form_schema_table(conn: sqlite3.Connection) -> None: |
| conn.execute( |
| """ |
| CREATE TABLE IF NOT EXISTS commodity_form_schemas ( |
| commodity_code INTEGER PRIMARY KEY, |
| schema_json TEXT NOT NULL, |
| updated_at TEXT DEFAULT CURRENT_TIMESTAMP |
| ) |
| """ |
| ) |
| conn.commit() |
|
|
|
|
| def _load_cached(conn: sqlite3.Connection, commodity_code: int) -> dict[str, Any] | None: |
| cur = conn.cursor() |
| cur.execute( |
| "SELECT schema_json FROM commodity_form_schemas WHERE commodity_code = ?", |
| (commodity_code,), |
| ) |
| row = cur.fetchone() |
| if not row: |
| return None |
| try: |
| return json.loads(row[0]) |
| except json.JSONDecodeError: |
| return None |
|
|
|
|
| def _save_cache(conn: sqlite3.Connection, commodity_code: int, schema: dict[str, Any]) -> None: |
| conn.execute( |
| """ |
| INSERT INTO commodity_form_schemas (commodity_code, schema_json, updated_at) |
| VALUES (?, ?, datetime('now')) |
| ON CONFLICT(commodity_code) DO UPDATE SET |
| schema_json = excluded.schema_json, |
| updated_at = datetime('now') |
| """, |
| (commodity_code, json.dumps(schema, ensure_ascii=False)), |
| ) |
| conn.commit() |
|
|
|
|
| _FALLBACK_FIELDS_RAW: list[dict[str, Any]] = [ |
| { |
| "id": "primary_scope", |
| "label": "What is the primary scope or geography for this requirement?", |
| "type": "select", |
| "options": [ |
| "Local / single site", |
| "Regional", |
| "National", |
| "International", |
| "Multi-region program", |
| "To be determined", |
| ], |
| }, |
| { |
| "id": "scale_band", |
| "label": "What scale band best matches expected volume or spend?", |
| "type": "chips", |
| "options": [ |
| "Pilot / small", |
| "Medium", |
| "Large", |
| "Enterprise-wide", |
| "Not yet estimated", |
| ], |
| }, |
| { |
| "id": "compliance_focus", |
| "label": "Which compliance themes apply (if any)?", |
| "type": "select", |
| "options": [ |
| "None identified yet", |
| "Data privacy / residency", |
| "Safety / quality standards", |
| "Financial / audit controls", |
| "Industry-specific regulations", |
| "Mixed — see specification notes", |
| ], |
| }, |
| ] |
|
|
|
|
| def _fallback_schema() -> dict[str, Any]: |
| return { |
| "fields": [_coerce_field_selectable(dict(f)) for f in _FALLBACK_FIELDS_RAW], |
| "source": "fallback", |
| } |
|
|
|
|
| def _coerce_field_selectable(entry: dict[str, Any]) -> dict[str, Any]: |
| """Ensure fields are selectable (select/chips) or number — never free-text.""" |
| typ = str(entry.get("type") or "select").lower() |
| if typ in ("text", "textarea"): |
| typ = "select" |
| elif typ == "number": |
| out = {**entry, "type": "number"} |
| out.pop("options", None) |
| unit = str(out.get("unit") or "").strip() |
| if unit: |
| out["unit"] = unit[:24] |
| else: |
| out.pop("unit", None) |
| return out |
| elif typ not in ("select", "chips"): |
| typ = "select" |
|
|
| opts_raw = entry.get("options") |
| clean: list[str] = [] |
| if isinstance(opts_raw, list): |
| clean = [str(o).strip() for o in opts_raw if str(o).strip()] |
| if len(clean) < 2: |
| clean = list(GENERIC_SELECT_FALLBACK) |
| return {**entry, "type": typ, "options": clean} |
|
|
|
|
| def _validate_and_normalize(raw: dict[str, Any]) -> dict[str, Any]: |
| fields_out: list[dict[str, Any]] = [] |
| seen_ids: set[str] = set() |
| for f in raw.get("fields") or []: |
| if not isinstance(f, dict): |
| continue |
| fid = str(f.get("id") or "").strip() |
| label = str(f.get("label") or "").strip() |
| typ = str(f.get("type") or "select").lower() |
| if not fid or not label: |
| continue |
| if fid in seen_ids: |
| continue |
| seen_ids.add(fid) |
| if typ not in ("select", "number", "text", "chips", "textarea"): |
| typ = "select" |
| opts = f.get("options") |
| entry: dict[str, Any] = {"id": fid, "label": label, "type": typ} |
| if typ in ("select", "chips") and isinstance(opts, list) and opts: |
| entry["options"] = [str(o) for o in opts if str(o).strip()] |
| if typ == "number": |
| u = str(f.get("unit") or "").strip() |
| if u: |
| entry["unit"] = u[:24] |
| fields_out.append(_coerce_field_selectable(entry)) |
| if len(fields_out) < 1: |
| return _fallback_schema() |
| return {"fields": fields_out, "source": "openai"} |
|
|
|
|
| def _coerce_cached_schema(cached: dict[str, Any]) -> dict[str, Any]: |
| """Upgrade legacy cached schemas (text/textarea) to selectable controls.""" |
| fields_in = cached.get("fields") or [] |
| fields_out: list[dict[str, Any]] = [] |
| seen_ids: set[str] = set() |
| for f in fields_in: |
| if not isinstance(f, dict): |
| continue |
| fid = str(f.get("id") or "").strip() |
| label = str(f.get("label") or "").strip() |
| if not fid or not label or fid in seen_ids: |
| continue |
| seen_ids.add(fid) |
| typ = str(f.get("type") or "select").lower() |
| entry: dict[str, Any] = {"id": fid, "label": label, "type": typ} |
| opts = f.get("options") |
| if typ in ("select", "chips") and isinstance(opts, list) and opts: |
| entry["options"] = [str(o) for o in opts if str(o).strip()] |
| if typ == "number": |
| u = str(f.get("unit") or "").strip() |
| if u: |
| entry["unit"] = u[:24] |
| fields_out.append(_coerce_field_selectable(entry)) |
| if len(fields_out) < 1: |
| return _fallback_schema() |
| out = {**cached, "fields": fields_out, "source": cached.get("source", "cache")} |
| out["interval_options"] = INTERVAL_ORDER |
| return out |
|
|
|
|
| def generate_schema_with_llm(row: dict[str, Any]) -> dict[str, Any]: |
| api_key = os.environ.get("OPENAI_API_KEY") |
| if not api_key: |
| return _fallback_schema() |
|
|
| s = summarize_row(row) |
| user_block = json.dumps( |
| { |
| "segment_code": s.get("segment_code"), |
| "family_code": s.get("family_code"), |
| "class_code": s.get("class_code"), |
| "commodity_code": s.get("commodity_code"), |
| "path": s.get("path"), |
| "commodity_title": s.get("commodity_title"), |
| "commodity_definition": s.get("commodity_definition"), |
| }, |
| ensure_ascii=False, |
| ) |
|
|
| client = OpenAI(api_key=api_key) |
| resp = client.chat.completions.create( |
| model=os.environ.get("OPENAI_MODEL", "gpt-4o-mini"), |
| messages=[ |
| {"role": "system", "content": SCHEMA_GEN_SYSTEM}, |
| {"role": "user", "content": user_block}, |
| ], |
| temperature=0.2, |
| response_format={"type": "json_object"}, |
| ) |
| text = (resp.choices[0].message.content or "").strip() |
| try: |
| parsed = json.loads(text) |
| except json.JSONDecodeError: |
| return _fallback_schema() |
| return _validate_and_normalize(parsed) |
|
|
|
|
| def get_or_create_schema(conn: sqlite3.Connection, commodity_code: int) -> dict[str, Any]: |
| ensure_form_schema_table(conn) |
| cached = _load_cached(conn, commodity_code) |
| if cached and cached.get("fields"): |
| return _coerce_cached_schema(cached) |
|
|
| row = get_commodity(conn, commodity_code) |
| if not row: |
| return {"fields": [], "error": "commodity_not_found", "interval_options": INTERVAL_ORDER} |
|
|
| schema = generate_schema_with_llm(row) |
| if schema.get("fields"): |
| _save_cache(conn, commodity_code, schema) |
| schema["interval_options"] = INTERVAL_ORDER |
| return schema |
|
|