PR-AGENT / server /build_db.py
Seth
Update
a44477f
"""
Build SQLite + FTS5 index from procurement catalogue Excel export (UNv26 layout).
Run: python -m server.build_db
Env: UNSPSC_XLSX_PATH, UNSPSC_DB_PATH
"""
from __future__ import annotations
import os
import sqlite3
import sys
from pathlib import Path
import pandas as pd
def _as_int(v) -> int | None:
if pd.isna(v):
return None
try:
return int(float(v))
except (TypeError, ValueError):
return None
def _as_str(v) -> str:
if pd.isna(v):
return ""
return str(v).strip()
def main() -> None:
root = Path(__file__).resolve().parents[1]
xlsx = Path(os.environ.get("UNSPSC_XLSX_PATH", root / "data" / "unspsc-english.xlsx"))
db_path = Path(os.environ.get("UNSPSC_DB_PATH", root / "data" / "unspsc.db"))
if not xlsx.exists():
print(f"Missing XLSX at {xlsx}", file=sys.stderr)
sys.exit(1)
db_path.parent.mkdir(parents=True, exist_ok=True)
if db_path.exists():
db_path.unlink()
print(f"Loading {xlsx} …")
df = pd.read_excel(xlsx, sheet_name="Sheet1", header=12)
print(f"Rows: {len(df)}")
conn = sqlite3.connect(db_path)
cur = conn.cursor()
cur.execute("PRAGMA journal_mode=WAL;")
cur.execute("PRAGMA synchronous=NORMAL;")
cur.execute(
"""
CREATE TABLE commodities (
id INTEGER PRIMARY KEY AUTOINCREMENT,
unspsc_key TEXT,
version TEXT,
segment INTEGER,
segment_title TEXT,
segment_definition TEXT,
family INTEGER,
family_title TEXT,
family_definition TEXT,
class INTEGER,
class_title TEXT,
class_definition TEXT,
commodity INTEGER,
commodity_title TEXT,
commodity_definition TEXT,
synonym TEXT,
acronym TEXT,
path_titles TEXT,
commodity_code_text TEXT
);
"""
)
cur.execute(
"""
CREATE VIRTUAL TABLE commodities_fts USING fts5(
segment_title,
family_title,
class_title,
commodity_title,
path_titles,
commodity_code_text,
content='commodities',
content_rowid='id',
tokenize='unicode61 remove_diacritics 1'
);
"""
)
rows: list[tuple] = []
for _, r in df.iterrows():
seg_t = _as_str(r.get("Segment Title"))
fam_t = _as_str(r.get("Family Title"))
cls_t = _as_str(r.get("Class Title"))
com_t = _as_str(r.get("Commodity Title"))
com_code = _as_int(r.get("Commodity"))
if com_code is None:
continue
path_titles = " > ".join(p for p in (seg_t, fam_t, cls_t, com_t) if p)
code_txt = str(com_code)
rows.append(
(
_as_str(r.get("Key")),
_as_str(r.get("Version")),
_as_int(r.get("Segment")),
seg_t,
_as_str(r.get("Segment Definition")),
_as_int(r.get("Family")),
fam_t,
_as_str(r.get("Family Definition")),
_as_int(r.get("Class")),
cls_t,
_as_str(r.get("Class Definition")),
com_code,
com_t,
_as_str(r.get("Commodity Definition")),
_as_str(r.get("Synonym")),
_as_str(r.get("Acronym")),
path_titles,
code_txt,
)
)
print(f"Inserting {len(rows)} commodities …")
cur.executemany(
"""
INSERT INTO commodities (
unspsc_key, version,
segment, segment_title, segment_definition,
family, family_title, family_definition,
class, class_title, class_definition,
commodity, commodity_title, commodity_definition,
synonym, acronym,
path_titles, commodity_code_text
) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
""",
rows,
)
print("Rebuilding FTS index …")
cur.execute("INSERT INTO commodities_fts(commodities_fts) VALUES('rebuild');")
conn.commit()
cur.execute("SELECT COUNT(*) FROM commodities;")
(n,) = cur.fetchone()
print(f"Done. commodities={n}, db={db_path}")
conn.close()
if __name__ == "__main__":
main()