Spaces:
Sleeping
Sleeping
| """Load the seed store data and turn it into retrievable documents. | |
| A :class:`Document` is one citable unit of knowledge — a policy section or a single | |
| product. Keeping products and policy sections as separate documents means the agent | |
| can cite exactly what it used ("Returns policy", "Ultralight 2-Person Tent"). | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import re | |
| from dataclasses import dataclass, field | |
| from pathlib import Path | |
| from typing import Any | |
| from .config import get_settings | |
| class Document: | |
| """One retrievable, citable knowledge unit.""" | |
| doc_id: str | |
| kind: str # "policy" | "product" | |
| title: str | |
| text: str | |
| metadata: dict[str, Any] = field(default_factory=dict) | |
| _HEADING = re.compile(r"^##\s+(.*)$", re.MULTILINE) | |
| def load_policy_documents(policies_path: Path) -> list[Document]: | |
| """Split ``policies.md`` into one document per ``##`` section.""" | |
| raw = policies_path.read_text(encoding="utf-8") | |
| docs: list[Document] = [] | |
| matches = list(_HEADING.finditer(raw)) | |
| for i, m in enumerate(matches): | |
| title = m.group(1).strip() | |
| start = m.end() | |
| end = matches[i + 1].start() if i + 1 < len(matches) else len(raw) | |
| body = raw[start:end].strip() | |
| if not body: | |
| continue | |
| # Repeat the section title so the heading (a strong topical signal) is | |
| # weighted into retrieval — e.g. a "shipping" query should land on the | |
| # Shipping section, not merely any section that mentions shipping in passing. | |
| docs.append( | |
| Document( | |
| doc_id=f"policy::{title.lower().replace(' ', '-')}", | |
| kind="policy", | |
| title=f"{title} policy", | |
| text=f"{title}. {title}. {body}", | |
| metadata={"section": title}, | |
| ) | |
| ) | |
| return docs | |
| def load_product_documents(catalog_path: Path) -> list[Document]: | |
| """Turn each catalog product into a retrievable document.""" | |
| data = json.loads(catalog_path.read_text(encoding="utf-8")) | |
| docs: list[Document] = [] | |
| for p in data.get("products", []): | |
| stock = "in stock" if p.get("in_stock") else "out of stock" | |
| tags = " ".join(p.get("tags", [])) | |
| text = ( | |
| f"{p['name']} (SKU {p['sku']}, {p['category']}). " | |
| f"Price ${p['price']:.2f}, {stock}. " | |
| f"Weight {p.get('weight_kg', 'n/a')} kg. " | |
| f"Warranty {p.get('warranty_years', 'n/a')} years. " | |
| f"{p['description']} Keywords: {tags}." | |
| ) | |
| docs.append( | |
| Document( | |
| doc_id=f"product::{p['sku']}", | |
| kind="product", | |
| title=p["name"], | |
| text=text, | |
| metadata=p, | |
| ) | |
| ) | |
| return docs | |
| def load_knowledge_base(data_dir: Path | None = None) -> list[Document]: | |
| """Load every policy and product document from the data directory.""" | |
| data_dir = data_dir or get_settings().data_dir | |
| docs = load_policy_documents(data_dir / "policies.md") | |
| docs += load_product_documents(data_dir / "catalog.json") | |
| return docs | |