# File: parapilot/app/rag/corpus.py
# Uploaded by LaelaZ
# Commit message: Deploy ParaPilot to HF Spaces (Docker)
# Commit: d787a09 (verified)
"""Load the bundled offline seed corpus.
The corpus is a set of JSON files in ``data/corpus/`` (one per source). Each
file has source metadata + a list of chunks. We flatten them into
``RetrievedChunk``-shaped records (score filled in by the retriever).
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Dict, List
from app.config import get_settings
class Chunk:
    """One retrievable passage of grounded text together with its provenance.

    Instances are plain attribute holders. ``__slots__`` is declared, so no
    attributes beyond the ones listed there can be set on an instance.
    """

    __slots__ = (
        "source_id",
        "chunk_id",
        "title",
        "url",
        "publisher",
        "retrieved",
        "heading",
        "text",
        "tags",
    )

    def __init__(
        self,
        source_id: str,
        chunk_id: str,
        title: str,
        url: str,
        publisher: str,
        retrieved: str,
        heading: str,
        text: str,
        tags: List[str],
    ) -> None:
        """Store every argument unchanged on the attribute of the same name."""
        # Source-level provenance.
        self.source_id = source_id
        self.title = title
        self.publisher = publisher
        self.retrieved = retrieved
        # Chunk-level identity and content.
        self.chunk_id = chunk_id
        self.url = url
        self.heading = heading
        self.text = text
        self.tags = tags

    @property
    def key(self) -> str:
        """Return ``"<source_id>::<chunk_id>"``."""
        return "::".join((self.source_id, self.chunk_id))

    @property
    def search_text(self) -> str:
        """Text used for lexical/embedding matching (heading + tags + body).

        The three parts are joined with single spaces; the tags themselves are
        space-joined first.
        """
        joined_tags = " ".join(self.tags)
        return " ".join((self.heading, joined_tags, self.text))
def load_corpus(corpus_dir: Path | None = None) -> List[Chunk]:
    """Read every ``*.json`` source file and return a flat list of chunks.

    Args:
        corpus_dir: Directory holding one JSON file per source. When ``None``,
            ``get_settings().corpus_path`` is used; settings are not consulted
            at all when a directory is given.

    Returns:
        One ``Chunk`` per entry of each file's ``"chunks"`` list, ordered by
        sorted file path and then by position within the file. The list is
        empty when the directory does not exist. Files whose name starts with
        ``_`` are skipped, as are files whose ``"chunks"`` entry is missing,
        null, or empty.

    Raises:
        KeyError: If a source file that has chunks lacks ``"source_id"``, or
            one of its chunks lacks ``"id"`` or ``"text"``.
        json.JSONDecodeError: If a source file is not valid JSON.
    """
    if corpus_dir is None:
        # Only fall back to (and pay for) settings when no directory is given.
        base = get_settings().corpus_path
    else:
        base = corpus_dir

    chunks: List[Chunk] = []
    if not base.exists():
        return chunks

    # sorted() makes the result order independent of filesystem listing order.
    for path in sorted(base.glob("*.json")):
        if path.name.startswith("_"):
            continue
        with path.open("r", encoding="utf-8") as fh:
            doc = json.load(fh)

        records = doc.get("chunks")
        if not records:
            # Nothing to emit; also keeps a chunk-less file from needing
            # "source_id", since that key is only read once chunks exist.
            continue

        # Per-source values are identical for every chunk of this file, so
        # look them up once rather than once per chunk.
        source_id = doc["source_id"]
        title = doc.get("title", source_id)
        publisher = doc.get("publisher", "")
        retrieved = doc.get("retrieved", "")
        # Per-source default; a chunk may override url for granular linking.
        source_url = doc.get("url", "")

        for record in records:
            chunks.append(
                Chunk(
                    source_id=source_id,
                    chunk_id=record["id"],
                    title=title,
                    url=record.get("url", source_url),
                    publisher=publisher,
                    retrieved=retrieved,
                    heading=record.get("heading", ""),
                    text=record["text"],
                    # ``"tags": null`` would otherwise reach Chunk as None and
                    # break Chunk.search_text, which joins the tags.
                    tags=record.get("tags") or [],
                )
            )
    return chunks
def corpus_stats(corpus_dir: Path | None = None) -> Dict[str, int]:
    """Count the distinct sources and the total chunks in the corpus.

    Args:
        corpus_dir: Passed straight through to ``load_corpus``; ``None`` lets
            ``load_corpus`` choose its default directory.

    Returns:
        A dict with two keys: ``"sources"`` is the number of distinct
        ``source_id`` values among the loaded chunks, and ``"chunks"`` is the
        total number of loaded chunks.
    """
    chunks = load_corpus(corpus_dir)
    distinct_sources = {chunk.source_id for chunk in chunks}
    return {"sources": len(distinct_sources), "chunks": len(chunks)}