""" Build SQLite + FTS5 index from procurement catalogue Excel export (UNv26 layout). Run: python -m server.build_db Env: UNSPSC_XLSX_PATH, UNSPSC_DB_PATH """ from __future__ import annotations import os import sqlite3 import sys from pathlib import Path import pandas as pd def _as_int(v) -> int | None: if pd.isna(v): return None try: return int(float(v)) except (TypeError, ValueError): return None def _as_str(v) -> str: if pd.isna(v): return "" return str(v).strip() def main() -> None: root = Path(__file__).resolve().parents[1] xlsx = Path(os.environ.get("UNSPSC_XLSX_PATH", root / "data" / "unspsc-english.xlsx")) db_path = Path(os.environ.get("UNSPSC_DB_PATH", root / "data" / "unspsc.db")) if not xlsx.exists(): print(f"Missing XLSX at {xlsx}", file=sys.stderr) sys.exit(1) db_path.parent.mkdir(parents=True, exist_ok=True) if db_path.exists(): db_path.unlink() print(f"Loading {xlsx} …") df = pd.read_excel(xlsx, sheet_name="Sheet1", header=12) print(f"Rows: {len(df)}") conn = sqlite3.connect(db_path) cur = conn.cursor() cur.execute("PRAGMA journal_mode=WAL;") cur.execute("PRAGMA synchronous=NORMAL;") cur.execute( """ CREATE TABLE commodities ( id INTEGER PRIMARY KEY AUTOINCREMENT, unspsc_key TEXT, version TEXT, segment INTEGER, segment_title TEXT, segment_definition TEXT, family INTEGER, family_title TEXT, family_definition TEXT, class INTEGER, class_title TEXT, class_definition TEXT, commodity INTEGER, commodity_title TEXT, commodity_definition TEXT, synonym TEXT, acronym TEXT, path_titles TEXT, commodity_code_text TEXT ); """ ) cur.execute( """ CREATE VIRTUAL TABLE commodities_fts USING fts5( segment_title, family_title, class_title, commodity_title, path_titles, commodity_code_text, content='commodities', content_rowid='id', tokenize='unicode61 remove_diacritics 1' ); """ ) rows: list[tuple] = [] for _, r in df.iterrows(): seg_t = _as_str(r.get("Segment Title")) fam_t = _as_str(r.get("Family Title")) cls_t = _as_str(r.get("Class Title")) com_t = _as_str(r.get("Commodity Title")) com_code = _as_int(r.get("Commodity")) if com_code is None: continue path_titles = " > ".join(p for p in (seg_t, fam_t, cls_t, com_t) if p) code_txt = str(com_code) rows.append( ( _as_str(r.get("Key")), _as_str(r.get("Version")), _as_int(r.get("Segment")), seg_t, _as_str(r.get("Segment Definition")), _as_int(r.get("Family")), fam_t, _as_str(r.get("Family Definition")), _as_int(r.get("Class")), cls_t, _as_str(r.get("Class Definition")), com_code, com_t, _as_str(r.get("Commodity Definition")), _as_str(r.get("Synonym")), _as_str(r.get("Acronym")), path_titles, code_txt, ) ) print(f"Inserting {len(rows)} commodities …") cur.executemany( """ INSERT INTO commodities ( unspsc_key, version, segment, segment_title, segment_definition, family, family_title, family_definition, class, class_title, class_definition, commodity, commodity_title, commodity_definition, synonym, acronym, path_titles, commodity_code_text ) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) """, rows, ) print("Rebuilding FTS index …") cur.execute("INSERT INTO commodities_fts(commodities_fts) VALUES('rebuild');") conn.commit() cur.execute("SELECT COUNT(*) FROM commodities;") (n,) = cur.fetchone() print(f"Done. commodities={n}, db={db_path}") conn.close() if __name__ == "__main__": main()