LaelaZ's picture
Deploy SupportCopilot to HF Spaces (Docker)
8981bf6 verified
"""Load the seed store data and turn it into retrievable documents.
A :class:`Document` is one citable unit of knowledge — a policy section or a single
product. Keeping products and policy sections as separate documents means the agent
can cite exactly what it used ("Returns policy", "Ultralight 2-Person Tent").
"""
from __future__ import annotations
import json
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from .config import get_settings
@dataclass(frozen=True)
class Document:
"""One retrievable, citable knowledge unit."""
doc_id: str
kind: str # "policy" | "product"
title: str
text: str
metadata: dict[str, Any] = field(default_factory=dict)
_HEADING = re.compile(r"^##\s+(.*)$", re.MULTILINE)
def load_policy_documents(policies_path: Path) -> list[Document]:
"""Split ``policies.md`` into one document per ``##`` section."""
raw = policies_path.read_text(encoding="utf-8")
docs: list[Document] = []
matches = list(_HEADING.finditer(raw))
for i, m in enumerate(matches):
title = m.group(1).strip()
start = m.end()
end = matches[i + 1].start() if i + 1 < len(matches) else len(raw)
body = raw[start:end].strip()
if not body:
continue
# Repeat the section title so the heading (a strong topical signal) is
# weighted into retrieval — e.g. a "shipping" query should land on the
# Shipping section, not merely any section that mentions shipping in passing.
docs.append(
Document(
doc_id=f"policy::{title.lower().replace(' ', '-')}",
kind="policy",
title=f"{title} policy",
text=f"{title}. {title}. {body}",
metadata={"section": title},
)
)
return docs
def load_product_documents(catalog_path: Path) -> list[Document]:
"""Turn each catalog product into a retrievable document."""
data = json.loads(catalog_path.read_text(encoding="utf-8"))
docs: list[Document] = []
for p in data.get("products", []):
stock = "in stock" if p.get("in_stock") else "out of stock"
tags = " ".join(p.get("tags", []))
text = (
f"{p['name']} (SKU {p['sku']}, {p['category']}). "
f"Price ${p['price']:.2f}, {stock}. "
f"Weight {p.get('weight_kg', 'n/a')} kg. "
f"Warranty {p.get('warranty_years', 'n/a')} years. "
f"{p['description']} Keywords: {tags}."
)
docs.append(
Document(
doc_id=f"product::{p['sku']}",
kind="product",
title=p["name"],
text=text,
metadata=p,
)
)
return docs
def load_knowledge_base(data_dir: Path | None = None) -> list[Document]:
"""Load every policy and product document from the data directory."""
data_dir = data_dir or get_settings().data_dir
docs = load_policy_documents(data_dir / "policies.md")
docs += load_product_documents(data_dir / "catalog.json")
return docs