Spaces:

LaelaZ
/

parapilot

Sleeping

App Files Files Community

parapilot / app /rag /corpus.py

LaelaZ

Deploy ParaPilot to HF Spaces (Docker)

d787a09 verified 5 days ago

raw

history blame contribute delete

2.81 kB

	"""Load the bundled offline seed corpus.

	The corpus is a set of JSON files in ``data/corpus/`` (one per source). Each
	file has source metadata + a list of chunks. We flatten them into
	``RetrievedChunk``-shaped records (score filled in by the retriever).
	"""
	from __future__ import annotations

	import json
	from pathlib import Path
	from typing import Dict, List

	from app.config import get_settings


	class Chunk:
	    """A single retrievable unit of grounded text plus its provenance.

	    Attributes:
	        source_id: Identifier of the source document this chunk belongs to.
	        chunk_id: Identifier of the chunk; combined with ``source_id`` in
	            :attr:`key`.
	        title: Title string, stored as given.
	        url: URL string, stored as given.
	        publisher: Publisher string, stored as given.
	        retrieved: Provenance string, stored as given.
	        heading: Heading text; part of :attr:`search_text`.
	        text: Body text; part of :attr:`search_text`.
	        tags: Tag strings; part of :attr:`search_text`.
	    """

	    # Fixed attribute set: instances carry no per-instance ``__dict__``.
	    __slots__ = (
	        "source_id", "chunk_id", "title", "url", "publisher",
	        "retrieved", "heading", "text", "tags",
	    )

	    def __init__(
	        self,
	        source_id: str,
	        chunk_id: str,
	        title: str,
	        url: str,
	        publisher: str,
	        retrieved: str,
	        heading: str,
	        text: str,
	        tags: List[str],
	    ) -> None:
	        """Store every field verbatim; no validation or copying is done."""
	        self.source_id = source_id
	        self.chunk_id = chunk_id
	        self.title = title
	        self.url = url
	        self.publisher = publisher
	        self.retrieved = retrieved
	        self.heading = heading
	        self.text = text
	        self.tags = tags

	    def __repr__(self) -> str:
	        """Return a short debugging representation (identity + heading)."""
	        # Read the raw attributes rather than ``key`` so that repr() keeps
	        # working even if an identifier is not a string.
	        return (
	            f"{type(self).__name__}(source_id={self.source_id!r}, "
	            f"chunk_id={self.chunk_id!r}, heading={self.heading!r})"
	        )

	    @property
	    def key(self) -> str:
	        """Return ``"<source_id>::<chunk_id>"``."""
	        return self.source_id + "::" + self.chunk_id

	    @property
	    def search_text(self) -> str:
	        """Text used for lexical/embedding matching (heading + tags + body).

	        The three parts are joined with single spaces, so an empty heading
	        or an empty tag list still contributes its separator.
	        """
	        return " ".join([self.heading, " ".join(self.tags), self.text])


	def load_corpus(corpus_dir: Path | None = None) -> List[Chunk]:
	    """Read every ``*.json`` source file and return a flat list of chunks.

	    Args:
	        corpus_dir: Directory holding the per-source JSON files. A ``str``
	            path is accepted as well. When omitted (or empty), the
	            directory is taken from ``get_settings().corpus_path``.

	    Returns:
	        One ``Chunk`` per entry of each file's ``"chunks"`` list, with files
	        visited in sorted path order. The list is empty when the directory
	        does not exist or is not a directory.

	    Raises:
	        KeyError: If a source with chunks lacks ``"source_id"``, or a chunk
	            lacks ``"id"`` or ``"text"``.
	        json.JSONDecodeError: If a file does not contain valid JSON.
	    """
	    if corpus_dir:
	        base = Path(corpus_dir)
	    else:
	        # Application settings are consulted only when the caller did not
	        # choose a directory, so an explicit path works without them.
	        base = get_settings().corpus_path

	    chunks: List[Chunk] = []
	    if not base.is_dir():
	        return chunks

	    for path in sorted(base.glob("*.json")):
	        # Files whose name starts with an underscore are not loaded.
	        if path.name.startswith("_"):
	            continue
	        with path.open("r", encoding="utf-8") as fh:
	            doc = json.load(fh)

	        # ``or []`` also covers an explicit JSON ``null``.
	        entries = doc.get("chunks") or []
	        if not entries:
	            # Source-level fields are never read for a chunk-less file, so
	            # such a file does not need a "source_id".
	            continue

	        # Per-source defaults; a chunk may override url for granular linking.
	        source_id = doc["source_id"]
	        title = doc.get("title", source_id)
	        s_url = doc.get("url", "")
	        publisher = doc.get("publisher", "")
	        retrieved = doc.get("retrieved", "")

	        chunks.extend(
	            Chunk(
	                source_id=source_id,
	                chunk_id=ch["id"],
	                title=title,
	                url=ch.get("url", s_url),
	                publisher=publisher,
	                retrieved=retrieved,
	                # Normalise JSON ``null`` so the search text can always be
	                # built by joining heading, tags and body.
	                heading=ch.get("heading") or "",
	                text=ch["text"],
	                tags=ch.get("tags") or [],
	            )
	            for ch in entries
	        )
	    return chunks


	def corpus_stats(corpus_dir: Path = None) -> Dict[str, int]:
	    """Count the distinct sources and the total chunks in the corpus.

	    ``corpus_dir`` is forwarded unchanged to ``load_corpus``. The result
	    maps ``"sources"`` to the number of distinct ``source_id`` values and
	    ``"chunks"`` to the number of loaded chunks.
	    """
	    loaded = load_corpus(corpus_dir)
	    distinct_ids = set()
	    for record in loaded:
	        distinct_ids.add(record.source_id)
	    return {"sources": len(distinct_ids), "chunks": len(loaded)}