Spaces:

LaelaZ
/

ecom-support-copilot

Sleeping

App Files Files Community

ecom-support-copilot / supportcopilot /knowledge.py

LaelaZ

Deploy SupportCopilot to HF Spaces (Docker)

8981bf6 verified 5 days ago

raw

history blame contribute delete

3.15 kB

	"""Load the seed store data and turn it into retrievable documents.

	A :class:`Document` is one citable unit of knowledge — a policy section or a single
	product. Keeping products and policy sections as separate documents means the agent
	can cite exactly what it used ("Returns policy", "Ultralight 2-Person Tent").
	"""

	from __future__ import annotations

	import json
	import re
	from dataclasses import dataclass, field
	from pathlib import Path
	from typing import Any

	from .config import get_settings


	@dataclass(frozen=True)
	class Document:
	    """One retrievable, citable knowledge unit.

	    Instances are immutable and hashable, so they can be de-duplicated in a
	    ``set`` or used as ``dict`` keys.

	    Attributes:
	        doc_id: Identifier for the document (the loaders in this module use
	            ``"policy::<slug>"`` and ``"product::<sku>"``).
	        kind: Either ``"policy"`` or ``"product"``.
	        title: Human-readable name used when citing the document.
	        text: The text that is indexed for retrieval.
	        metadata: Free-form extra data about the document.
	    """

	    doc_id: str
	    kind: str  # "policy" | "product"
	    title: str
	    text: str
	    # ``metadata`` is a dict and therefore unhashable. It still takes part in
	    # equality, but is left out of the generated ``__hash__`` so that hashing
	    # a frozen Document does not raise ``TypeError``.
	    metadata: dict[str, Any] = field(default_factory=dict, hash=False)


	_HEADING = re.compile(r"^##\s+(.*)$", re.MULTILINE)


	def load_policy_documents(policies_path: Path) -> list[Document]:
	    """Read the policies Markdown file and emit one document per ``##`` section.

	    A section's body is the text between its heading and the next heading
	    (or the end of the file for the last section). Sections whose body is
	    empty after stripping whitespace are skipped.

	    Args:
	        policies_path: Path to the Markdown file, read as UTF-8.

	    Returns:
	        The policy documents in the order their headings appear in the file.
	    """
	    markdown = policies_path.read_text(encoding="utf-8")
	    headings = list(_HEADING.finditer(markdown))
	    # Every section stops where the next heading begins; the last one runs to EOF.
	    section_ends = [h.start() for h in headings[1:]] + [len(markdown)]

	    documents: list[Document] = []
	    for heading, section_end in zip(headings, section_ends):
	        section_body = markdown[heading.end():section_end].strip()
	        if section_body:
	            section = heading.group(1).strip()
	            slug = section.lower().replace(" ", "-")
	            # The section name is stated twice at the front of the text so the
	            # heading (a strong topical signal) carries extra weight in
	            # retrieval: a "shipping" query should hit the Shipping section
	            # rather than any section that only mentions shipping in passing.
	            weighted_text = f"{section}. {section}. {section_body}"
	            documents.append(
	                Document(
	                    doc_id=f"policy::{slug}",
	                    kind="policy",
	                    title=f"{section} policy",
	                    text=weighted_text,
	                    metadata={"section": section},
	                )
	            )
	    return documents


	def load_product_documents(catalog_path: Path) -> list[Document]:
	    """Build one retrievable document for every product in the catalog file.

	    The catalog is a UTF-8 JSON object; products are read from its
	    ``"products"`` key, and a missing key yields no documents. Each product
	    must provide ``name``, ``sku``, ``category``, ``price`` and
	    ``description``. ``in_stock``, ``tags``, ``weight_kg`` and
	    ``warranty_years`` are optional.

	    Args:
	        catalog_path: Path to the catalog JSON file.

	    Returns:
	        Product documents in catalog order. Each document's ``metadata`` is
	        the product's own catalog entry.

	    Raises:
	        KeyError: If a product lacks one of the required keys.
	    """
	    catalog = json.loads(catalog_path.read_text(encoding="utf-8"))
	    documents: list[Document] = []
	    for product in catalog.get("products", []):
	        if product.get("in_stock"):
	            availability = "in stock"
	        else:
	            availability = "out of stock"
	        keywords = " ".join(product.get("tags", []))
	        # One sentence per fact, joined with single spaces into the indexed text.
	        sentences = [
	            f"{product['name']} (SKU {product['sku']}, {product['category']}).",
	            f"Price ${product['price']:.2f}, {availability}.",
	            f"Weight {product.get('weight_kg', 'n/a')} kg.",
	            f"Warranty {product.get('warranty_years', 'n/a')} years.",
	            f"{product['description']} Keywords: {keywords}.",
	        ]
	        documents.append(
	            Document(
	                doc_id=f"product::{product['sku']}",
	                kind="product",
	                title=product["name"],
	                text=" ".join(sentences),
	                metadata=product,
	            )
	        )
	    return documents


	def load_knowledge_base(data_dir: Path | None = None) -> list[Document]:
	    """Load every policy and product document from the data directory.

	    Args:
	        data_dir: Directory containing ``policies.md`` and ``catalog.json``.
	            When omitted, ``get_settings().data_dir`` is used instead.

	    Returns:
	        All policy documents followed by all product documents.
	    """
	    data_dir = data_dir or get_settings().data_dir
	    policy_docs = load_policy_documents(data_dir / "policies.md")
	    product_docs = load_product_documents(data_dir / "catalog.json")
	    return [*policy_docs, *product_docs]