| """ |
| Build SQLite + FTS5 index from procurement catalogue Excel export (UNv26 layout). |
| Run: python -m server.build_db |
| Env: UNSPSC_XLSX_PATH, UNSPSC_DB_PATH |
| """ |
|
|
| from __future__ import annotations |
|
|
| import os |
| import sqlite3 |
| import sys |
| from pathlib import Path |
|
|
| import pandas as pd |
|
|
|
|
| def _as_int(v) -> int | None: |
| if pd.isna(v): |
| return None |
| try: |
| return int(float(v)) |
| except (TypeError, ValueError): |
| return None |
|
|
|
|
| def _as_str(v) -> str: |
| if pd.isna(v): |
| return "" |
| return str(v).strip() |
|
|
|
|
| def main() -> None: |
| root = Path(__file__).resolve().parents[1] |
| xlsx = Path(os.environ.get("UNSPSC_XLSX_PATH", root / "data" / "unspsc-english.xlsx")) |
| db_path = Path(os.environ.get("UNSPSC_DB_PATH", root / "data" / "unspsc.db")) |
|
|
| if not xlsx.exists(): |
| print(f"Missing XLSX at {xlsx}", file=sys.stderr) |
| sys.exit(1) |
|
|
| db_path.parent.mkdir(parents=True, exist_ok=True) |
| if db_path.exists(): |
| db_path.unlink() |
|
|
| print(f"Loading {xlsx} …") |
| df = pd.read_excel(xlsx, sheet_name="Sheet1", header=12) |
| print(f"Rows: {len(df)}") |
|
|
| conn = sqlite3.connect(db_path) |
| cur = conn.cursor() |
| cur.execute("PRAGMA journal_mode=WAL;") |
| cur.execute("PRAGMA synchronous=NORMAL;") |
|
|
| cur.execute( |
| """ |
| CREATE TABLE commodities ( |
| id INTEGER PRIMARY KEY AUTOINCREMENT, |
| unspsc_key TEXT, |
| version TEXT, |
| segment INTEGER, |
| segment_title TEXT, |
| segment_definition TEXT, |
| family INTEGER, |
| family_title TEXT, |
| family_definition TEXT, |
| class INTEGER, |
| class_title TEXT, |
| class_definition TEXT, |
| commodity INTEGER, |
| commodity_title TEXT, |
| commodity_definition TEXT, |
| synonym TEXT, |
| acronym TEXT, |
| path_titles TEXT, |
| commodity_code_text TEXT |
| ); |
| """ |
| ) |
|
|
| cur.execute( |
| """ |
| CREATE VIRTUAL TABLE commodities_fts USING fts5( |
| segment_title, |
| family_title, |
| class_title, |
| commodity_title, |
| path_titles, |
| commodity_code_text, |
| content='commodities', |
| content_rowid='id', |
| tokenize='unicode61 remove_diacritics 1' |
| ); |
| """ |
| ) |
|
|
| rows: list[tuple] = [] |
| for _, r in df.iterrows(): |
| seg_t = _as_str(r.get("Segment Title")) |
| fam_t = _as_str(r.get("Family Title")) |
| cls_t = _as_str(r.get("Class Title")) |
| com_t = _as_str(r.get("Commodity Title")) |
| com_code = _as_int(r.get("Commodity")) |
| if com_code is None: |
| continue |
| path_titles = " > ".join(p for p in (seg_t, fam_t, cls_t, com_t) if p) |
| code_txt = str(com_code) |
| rows.append( |
| ( |
| _as_str(r.get("Key")), |
| _as_str(r.get("Version")), |
| _as_int(r.get("Segment")), |
| seg_t, |
| _as_str(r.get("Segment Definition")), |
| _as_int(r.get("Family")), |
| fam_t, |
| _as_str(r.get("Family Definition")), |
| _as_int(r.get("Class")), |
| cls_t, |
| _as_str(r.get("Class Definition")), |
| com_code, |
| com_t, |
| _as_str(r.get("Commodity Definition")), |
| _as_str(r.get("Synonym")), |
| _as_str(r.get("Acronym")), |
| path_titles, |
| code_txt, |
| ) |
| ) |
|
|
| print(f"Inserting {len(rows)} commodities …") |
| cur.executemany( |
| """ |
| INSERT INTO commodities ( |
| unspsc_key, version, |
| segment, segment_title, segment_definition, |
| family, family_title, family_definition, |
| class, class_title, class_definition, |
| commodity, commodity_title, commodity_definition, |
| synonym, acronym, |
| path_titles, commodity_code_text |
| ) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) |
| """, |
| rows, |
| ) |
|
|
| print("Rebuilding FTS index …") |
| cur.execute("INSERT INTO commodities_fts(commodities_fts) VALUES('rebuild');") |
| conn.commit() |
|
|
| cur.execute("SELECT COUNT(*) FROM commodities;") |
| (n,) = cur.fetchone() |
| print(f"Done. commodities={n}, db={db_path}") |
| conn.close() |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|