| """Load the bundled offline seed corpus. |
| |
| The corpus is a set of JSON files in ``data/corpus/`` (one per source). Each |
| file has source metadata + a list of chunks. We flatten them into |
| ``RetrievedChunk``-shaped records (score filled in by the retriever). |
| """ |
| from __future__ import annotations |
|
|
| import json |
| from pathlib import Path |
| from typing import Dict, List |
|
|
| from app.config import get_settings |
|
|
|
|
class Chunk:
    """A single retrievable unit of grounded text plus its provenance.

    Instances are plain records: every constructor argument is stored
    unchanged on the attribute of the same name. ``__slots__`` fixes the
    attribute set, so instances carry no per-instance ``__dict__``.
    """

    __slots__ = (
        "source_id", "chunk_id", "title", "url", "publisher",
        "retrieved", "heading", "text", "tags",
    )

    def __init__(
        self,
        source_id: str,
        chunk_id: str,
        title: str,
        url: str,
        publisher: str,
        retrieved: str,
        heading: str,
        text: str,
        tags: List[str],
    ) -> None:
        """Store the given fields as-is (``tags`` is kept by reference)."""
        # Identity of the chunk and of the source it belongs to.
        self.source_id = source_id
        self.chunk_id = chunk_id
        # Provenance metadata.
        self.title = title
        self.url = url
        self.publisher = publisher
        self.retrieved = retrieved
        # Searchable content.
        self.heading = heading
        self.text = text
        self.tags = tags

    def __repr__(self) -> str:
        """Return a compact debug form; the body text is left out for brevity."""
        return f"{type(self).__name__}(key={self.key!r}, heading={self.heading!r})"

    @property
    def key(self) -> str:
        """Identifier of the form ``<source_id>::<chunk_id>``."""
        # f-string formatting tolerates non-string ids instead of raising
        # TypeError the way ``+`` concatenation does.
        return f"{self.source_id}::{self.chunk_id}"

    @property
    def search_text(self) -> str:
        """Text used for lexical/embedding matching (heading + tags + body).

        The three parts are joined with single spaces; empty parts are not
        dropped, so an empty heading or tag list leaves its separator behind.
        """
        joined_tags = " ".join(self.tags)
        return " ".join((self.heading, joined_tags, self.text))
|
|
|
|
def load_corpus(corpus_dir: Path | None = None) -> List[Chunk]:
    """Read every ``*.json`` source file and return a flat list of chunks.

    Args:
        corpus_dir: Directory holding the per-source JSON files. When
            ``None``, ``get_settings().corpus_path`` is used instead.

    Returns:
        One ``Chunk`` per entry in each file's ``"chunks"`` list, ordered by
        sorted file path and then by position within the file. The list is
        empty when the directory does not exist.

    Raises:
        KeyError: If a document with chunks lacks ``"source_id"``, or a chunk
            lacks ``"id"`` or ``"text"``.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    # Settings are consulted only when no directory is given, so an explicit
    # corpus_dir never touches application configuration.
    if corpus_dir is None:
        corpus_dir = get_settings().corpus_path
    base = Path(corpus_dir)  # also accepts a plain string path

    chunks: List[Chunk] = []
    if not base.is_dir():
        return chunks

    # Sorted so the output order does not depend on directory listing order.
    for path in sorted(base.glob("*.json")):
        # Files whose name starts with an underscore are skipped.
        if path.name.startswith("_"):
            continue
        with path.open("r", encoding="utf-8") as fh:
            doc = json.load(fh)

        source_url = doc.get("url", "")
        # ``or []`` treats an explicit JSON null the same as a missing key.
        for ch in doc.get("chunks") or []:
            chunks.append(
                Chunk(
                    source_id=doc["source_id"],
                    chunk_id=ch["id"],
                    title=doc.get("title", doc["source_id"]),
                    # A chunk-level URL overrides the source-level one.
                    url=ch.get("url", source_url),
                    publisher=doc.get("publisher", ""),
                    retrieved=doc.get("retrieved", ""),
                    heading=ch.get("heading", ""),
                    text=ch["text"],
                    tags=ch.get("tags") or [],
                )
            )
    return chunks
|
|
|
|
def corpus_stats(corpus_dir: Path = None) -> Dict[str, int]:
    """Summarize the corpus as counts of distinct sources and of chunks.

    The corpus is loaded with ``load_corpus(corpus_dir)``; the result maps
    ``"sources"`` to the number of distinct ``source_id`` values and
    ``"chunks"`` to the total number of loaded chunks.
    """
    loaded = load_corpus(corpus_dir)
    distinct_ids = set()
    for item in loaded:
        distinct_ids.add(item.source_id)
    summary = {
        "sources": len(distinct_ids),
        "chunks": len(loaded),
    }
    return summary
|
|