File size: 3,147 Bytes
8981bf6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""Load the seed store data and turn it into retrievable documents.

A :class:`Document` is one citable unit of knowledge — a policy section or a single
product. Keeping products and policy sections as separate documents means the agent
can cite exactly what it used ("Returns policy", "Ultralight 2-Person Tent").
"""

from __future__ import annotations

import json
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

from .config import get_settings


@dataclass(frozen=True)
class Document:
    """A single unit of knowledge that can be retrieved and cited on its own.

    The dataclass is frozen, so fields cannot be reassigned once an instance
    has been built. ``metadata`` defaults to a fresh empty dict per instance.
    """

    # Identifier string for this document.
    doc_id: str
    # Category of the document: "policy" | "product".
    kind: str
    # Human-readable name used when citing the document.
    title: str
    # The body of text associated with the document.
    text: str
    # Free-form extra data attached to the document.
    metadata: dict[str, Any] = field(default_factory=dict)


# Matches a level-two Markdown heading ("## Title") and captures the text after the
# marker. MULTILINE lets ``^``/``$`` anchor at every line, so headings anywhere in the
# file are found. Deeper headings ("###") do not match because whitespace must follow
# the second "#" directly.
_HEADING = re.compile(r"^##\s+(.*)$", re.MULTILINE)


def load_policy_documents(policies_path: Path) -> list[Document]:
    """Build one policy ``Document`` for every ``##`` section of ``policies.md``.

    The file at ``policies_path`` is read as UTF-8. Text before the first ``##``
    heading is ignored, and a section whose body is empty after stripping
    whitespace produces no document.
    """
    markdown = policies_path.read_text(encoding="utf-8")
    headings = list(_HEADING.finditer(markdown))
    # A section body runs from the end of its heading line up to the start of
    # the next heading; the final section runs to the end of the file.
    section_ends = [h.start() for h in headings[1:]] + [len(markdown)]

    documents: list[Document] = []
    for heading, section_end in zip(headings, section_ends):
        section = heading.group(1).strip()
        content = markdown[heading.end():section_end].strip()
        if content:
            slug = section.lower().replace(" ", "-")
            # The heading is a strong topical signal, so it is stated twice at
            # the front of the text to weight it into retrieval: a "shipping"
            # query should hit the Shipping section rather than any section
            # that merely mentions shipping in passing.
            documents.append(
                Document(
                    doc_id=f"policy::{slug}",
                    kind="policy",
                    title=f"{section} policy",
                    text=f"{section}. {section}. {content}",
                    metadata={"section": section},
                )
            )
    return documents


def load_product_documents(catalog_path: Path) -> list[Document]:
    """Turn each catalog product into a retrievable document.

    The JSON file at ``catalog_path`` is read as UTF-8 and one ``Document`` is
    produced per entry of its top-level ``"products"`` list. A missing or
    ``null`` ``"products"`` value yields an empty list.

    Every product must provide ``name``, ``sku``, ``category``, ``price`` and
    ``description``; ``price`` is formatted with ``:.2f``. The fields
    ``in_stock``, ``tags``, ``weight_kg`` and ``warranty_years`` are optional,
    and a JSON ``null`` is treated the same as an absent key for them.

    Raises:
        KeyError: if a product lacks one of the required fields.
    """
    data = json.loads(catalog_path.read_text(encoding="utf-8"))
    docs: list[Document] = []
    # ``or []`` covers both a missing key and an explicit null.
    for p in data.get("products") or []:
        stock = "in stock" if p.get("in_stock") else "out of stock"
        tags = " ".join(p.get("tags") or [])
        # ``dict.get(key, default)`` only falls back when the key is absent, so
        # an explicit null would otherwise be rendered as the text "None".
        # Compare against None (not truthiness) so a legitimate 0 is kept.
        weight = p.get("weight_kg")
        weight_text = "n/a" if weight is None else weight
        warranty = p.get("warranty_years")
        warranty_text = "n/a" if warranty is None else warranty
        text = (
            f"{p['name']} (SKU {p['sku']}, {p['category']}). "
            f"Price ${p['price']:.2f}, {stock}. "
            f"Weight {weight_text} kg. "
            f"Warranty {warranty_text} years. "
            f"{p['description']} Keywords: {tags}."
        )
        docs.append(
            Document(
                doc_id=f"product::{p['sku']}",
                kind="product",
                title=p["name"],
                text=text,
                # The raw catalog entry is kept as-is for downstream consumers.
                metadata=p,
            )
        )
    return docs


def load_knowledge_base(data_dir: Path | None = None) -> list[Document]:
    """Return all policy documents followed by all product documents.

    When ``data_dir`` is not given, the directory comes from
    ``get_settings().data_dir``. Policies are read from ``policies.md`` and
    products from ``catalog.json`` inside that directory.
    """
    root = data_dir if data_dir else get_settings().data_dir
    policies = load_policy_documents(root / "policies.md")
    products = load_product_documents(root / "catalog.json")
    return [*policies, *products]