"""Load the bundled offline seed corpus.

The corpus is a set of JSON files in ``data/corpus/`` (one per source). Each
file holds source metadata plus a list of chunks. We flatten them into
``Chunk`` records, which are ``RetrievedChunk``-shaped (the score is filled in
later by the retriever).
"""
from __future__ import annotations

import json
from pathlib import Path
from typing import Dict, List

from app.config import get_settings


class Chunk:
    """A single retrievable unit of grounded text plus its provenance.

    Each constructor argument is stored unchanged on the attribute of the same
    name. ``__slots__`` is declared, so instances carry no per-instance
    ``__dict__``.

    Attributes:
        source_id: Identifier of the source the chunk belongs to.
        chunk_id: Identifier of the chunk within that source.
        title: Title associated with the source.
        url: Link associated with the chunk.
        publisher: Publisher associated with the source.
        retrieved: Retrieval marker for the source, stored as a string
            (presumably a date; the format is not enforced here).
        heading: Heading associated with the chunk.
        text: Body text of the chunk.
        tags: Tag strings attached to the chunk. The list is stored by
            reference, not copied.
    """

    __slots__ = (
        "source_id", "chunk_id", "title", "url", "publisher",
        "retrieved", "heading", "text", "tags",
    )

    def __init__(
        self,
        source_id: str,
        chunk_id: str,
        title: str,
        url: str,
        publisher: str,
        retrieved: str,
        heading: str,
        text: str,
        tags: List[str],
    ) -> None:
        self.source_id = source_id
        self.chunk_id = chunk_id
        self.title = title
        self.url = url
        self.publisher = publisher
        self.retrieved = retrieved
        self.heading = heading
        self.text = text
        self.tags = tags

    def __repr__(self) -> str:
        """Return a short debugging representation that names the chunk."""
        # The body text is deliberately left out so log lines stay compact.
        return (
            f"{type(self).__name__}(source_id={self.source_id!r}, "
            f"chunk_id={self.chunk_id!r}, heading={self.heading!r})"
        )

    @property
    def key(self) -> str:
        """Composite identifier: ``source_id`` and ``chunk_id`` joined by ``::``."""
        return self.source_id + "::" + self.chunk_id

    @property
    def search_text(self) -> str:
        """Text used for lexical/embedding matching (heading + tags + body).

        The three parts are joined with single spaces; tags are themselves
        space-joined. Empty parts are not dropped, so an empty heading or tag
        list leaves the separating spaces in place.
        """
        return " ".join([self.heading, " ".join(self.tags), self.text])


def _str_field(record: dict, key: str, default: str = "") -> str:
    """Return ``record[key]``, or *default* if the key is absent or JSON ``null``."""
    value = record.get(key)
    return default if value is None else value


def load_corpus(corpus_dir: Path | None = None) -> List[Chunk]:
    """Read every ``*.json`` source file and return a flat list of chunks.

    Args:
        corpus_dir: Directory to scan. When omitted (or falsy), the
            ``corpus_path`` attribute of ``get_settings()`` is used instead.

    Returns:
        One ``Chunk`` per entry of each file's ``"chunks"`` list. Files are
        visited in sorted path order and chunks keep their in-file order.
        An empty list is returned when the directory does not exist.

    Raises:
        KeyError: If a source that has chunks lacks ``"source_id"``, or a
            chunk lacks ``"id"`` or ``"text"``.
        json.JSONDecodeError: If a matched file is not valid JSON.
    """
    # Settings are resolved lazily so that an explicit directory never
    # depends on application configuration being loadable.
    base = corpus_dir or get_settings().corpus_path
    chunks: List[Chunk] = []

    if not base.exists():
        return chunks

    for path in sorted(base.glob("*.json")):
        # Files whose name starts with an underscore are skipped.
        if path.name.startswith("_"):
            continue
        with path.open("r", encoding="utf-8") as fh:
            doc = json.load(fh)

        # An explicit JSON ``null`` bypasses ``dict.get`` defaults, so optional
        # fields are normalised: null is treated the same as a missing key.
        raw_chunks = doc.get("chunks") or []
        if not raw_chunks:
            # Nothing to emit; required source fields are not needed either.
            continue

        # Per-source values are read once rather than once per chunk.
        source_id = doc["source_id"]
        title = _str_field(doc, "title", source_id)
        publisher = _str_field(doc, "publisher")
        retrieved = _str_field(doc, "retrieved")
        # Per-source default; a chunk may override url for granular linking.
        source_url = _str_field(doc, "url")

        chunks.extend(
            Chunk(
                source_id=source_id,
                chunk_id=ch["id"],
                title=title,
                url=_str_field(ch, "url", source_url),
                publisher=publisher,
                retrieved=retrieved,
                heading=_str_field(ch, "heading"),
                text=ch["text"],
                tags=ch.get("tags") or [],
            )
            for ch in raw_chunks
        )
    return chunks


def corpus_stats(corpus_dir: Path = None) -> Dict[str, int]:
    """Summarise the corpus as a count of distinct sources and of chunks.

    ``corpus_dir`` is passed straight through to ``load_corpus``. The result
    has two keys: ``"sources"`` (number of distinct ``source_id`` values among
    the loaded chunks) and ``"chunks"`` (total number of loaded chunks).
    """
    loaded = load_corpus(corpus_dir)

    # Collect each source id once; a set discards the repeats.
    distinct_source_ids = set()
    for chunk in loaded:
        distinct_source_ids.add(chunk.source_id)

    return {
        "sources": len(distinct_source_ids),
        "chunks": len(loaded),
    }