# File size: 6,732 Bytes

"""Generate markdown resource views and search index from catalog JSON.

Usage:
    python scripts/generate_resource_views.py
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any


# Maps each catalog category key to a pair of
# (path of the generated README, heading used as that page's title).
# Paths are relative to the working directory the script is run from.
# Insertion order is the order in which the category pages are written.
CATEGORY_CONFIG: dict[str, tuple[str, str]] = dict(
    dataset=("resources/datasets/README.md", "Datasets"),
    model=("resources/models/README.md", "Models"),
    benchmark=("resources/benchmarks/README.md", "Benchmarks"),
    tool=("resources/tools/README.md", "Tools"),
    paper=("resources/papers/README.md", "Papers"),
    project=("resources/projects/README.md", "Projects"),
    code=("resources/codes/README.md", "Code"),
)


def _load_catalog(path: Path) -> dict[str, Any]:
    return json.loads(path.read_text(encoding="utf-8"))


def _escape_cell(value: str) -> str:
    return value.replace("|", "\\|").strip()


def _marker_text(markers: list[str]) -> str:
    return ", ".join(f"`{marker}`" for marker in markers)


def _resource_row(resource: dict[str, Any]) -> str:
    """Build one Markdown table row for a catalog resource.

    The row has four cells: title, a link labelled with the resource's source,
    a link labelled with the evidence text (followed by the markers in
    parentheses when there are any), and the primary use.

    Every piece of free text placed in the row goes through ``_escape_cell``,
    including the source label and the marker list, so a ``|`` in any of them
    cannot split the row into extra columns. URLs are emitted verbatim.

    Raises:
        KeyError: if ``resource`` or its ``pashto_evidence`` entry lacks one of
            the keys read here.
    """
    evidence = resource["pashto_evidence"]
    evidence_text = _escape_cell(evidence["evidence_text"])
    # A pipe inside a backtick code span still terminates a table cell, so the
    # rendered marker list needs the same escaping as the other cell text.
    markers = _escape_cell(_marker_text(evidence["markers"]))
    if markers:
        evidence_text = f"{evidence_text} ({markers})"
    return (
        f"| {_escape_cell(resource['title'])} | "
        f"[{_escape_cell(resource['source'])}]({resource['url']}) | "
        f"[{evidence_text}]({evidence['evidence_url']}) | "
        f"{_escape_cell(resource['primary_use'])} |"
    )


def _write_markdown_table(path: Path, title: str, resources: list[dict[str, Any]]) -> None:
    lines = [
        f"# {title}",
        "",
        "## Verified Pashto Resources",
        "",
        "| Resource | Link | Pashto Evidence | Primary Use |",
        "|---|---|---|---|",
    ]

    if resources:
        lines.extend(_resource_row(resource) for resource in resources)
    else:
        lines.append("| _None yet_ | - | - | - |")

    lines.extend(
        [
            "",
            "## Maintenance",
            "- Source of truth: [../catalog/resources.json](../catalog/resources.json)",
            "- Validation: [../../scripts/validate_resource_catalog.py](../../scripts/validate_resource_catalog.py)",
            "- Generated by: [../../scripts/generate_resource_views.py](../../scripts/generate_resource_views.py)",
            "",
        ]
    )
    path.write_text("\n".join(lines), encoding="utf-8")


def _write_resources_home(path: Path, counts: dict[str, int], total_verified: int) -> None:
    lines = [
        "# Resources",
        "",
        "Structured, Pashto-focused resource tracking lives in this folder.",
        "",
        "## Sections",
        f"- Datasets ({counts.get('dataset', 0)}): [datasets/README.md](datasets/README.md)",
        f"- Models ({counts.get('model', 0)}): [models/README.md](models/README.md)",
        f"- Benchmarks ({counts.get('benchmark', 0)}): [benchmarks/README.md](benchmarks/README.md)",
        f"- Tools ({counts.get('tool', 0)}): [tools/README.md](tools/README.md)",
        f"- Papers ({counts.get('paper', 0)}): [papers/README.md](papers/README.md)",
        f"- Projects ({counts.get('project', 0)}): [projects/README.md](projects/README.md)",
        f"- Code ({counts.get('code', 0)}): [codes/README.md](codes/README.md)",
        "",
        "## Machine-Readable Catalog",
        "- Canonical catalog: [catalog/resources.json](catalog/resources.json)",
        "- Candidate feed: [catalog/pending_candidates.json](catalog/pending_candidates.json)",
        "- Schema: [schema/resource.schema.json](schema/resource.schema.json)",
        "",
        "## Update Rule",
        "- Add only validated resources with explicit Pashto relevance.",
        "- Keep every external reference clickable using markdown links.",
        "- Run `python scripts/validate_resource_catalog.py` before opening a PR.",
        "- Run `python scripts/generate_resource_views.py` after catalog changes.",
        "",
        f"Verified resource count: `{total_verified}`",
        "",
    ]
    path.write_text("\n".join(lines), encoding="utf-8")


def _build_search_payload(resources: list[dict[str, Any]], updated_on: str) -> dict[str, Any]:
    search_items: list[dict[str, Any]] = []
    for resource in resources:
        evidence = resource["pashto_evidence"]
        search_items.append(
            {
                "id": resource["id"],
                "title": resource["title"],
                "url": resource["url"],
                "category": resource["category"],
                "source": resource["source"],
                "status": resource["status"],
                "summary": resource["summary"],
                "primary_use": resource["primary_use"],
                "tasks": resource.get("tasks", []),
                "tags": resource["tags"],
                "evidence_text": evidence["evidence_text"],
                "evidence_url": evidence["evidence_url"],
                "markers": evidence["markers"],
            }
        )

    return {
        "generated_on": f"{updated_on}T00:00:00Z",
        "count": len(search_items),
        "resources": search_items,
    }


def main() -> int:
    """Regenerate all derived views from the catalog and return exit status 0.

    Reads ``resources/catalog/resources.json`` and then writes, in this order:
    one README per entry in ``CATEGORY_CONFIG`` (verified resources only,
    sorted case-insensitively by title), ``resources/README.md`` with the
    per-category counts, and ``docs/search/resources.json`` covering every
    resource regardless of status. All paths are relative to the current
    working directory.
    """
    catalog = _load_catalog(Path("resources/catalog/resources.json"))
    resources: list[dict[str, Any]] = catalog.get("resources", [])
    updated_on = catalog.get("updated_on", "1970-01-01")

    verified: list[dict[str, Any]] = []
    for entry in resources:
        if entry.get("status") == "verified":
            verified.append(entry)

    # Start with an empty bucket per configured category so that categories
    # without any verified resource still get a page and a zero count.
    grouped: dict[str, list[dict[str, Any]]] = {}
    for category in CATEGORY_CONFIG:
        grouped[category] = []
    for entry in verified:
        bucket = grouped.get(entry.get("category"))
        # Verified resources in a category that is not configured are left out
        # of the per-category pages but still counted in the verified total.
        if bucket is not None:
            bucket.append(entry)

    def _title_key(item: dict[str, Any]) -> str:
        return item["title"].lower()

    for category in CATEGORY_CONFIG:
        readme_path, heading = CATEGORY_CONFIG[category]
        destination = Path(readme_path)
        destination.parent.mkdir(parents=True, exist_ok=True)
        ordered = list(grouped[category])
        ordered.sort(key=_title_key)
        _write_markdown_table(destination, heading, ordered)

    counts = {category: len(grouped[category]) for category in grouped}
    _write_resources_home(Path("resources/README.md"), counts, len(verified))

    search_payload = _build_search_payload(resources, updated_on)
    index_path = Path("docs/search/resources.json")
    index_path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(search_payload, ensure_ascii=False, indent=2)
    index_path.write_text(serialized + "\n", encoding="utf-8")

    summary = (
        "Generated resources markdown and search index: "
        f"{len(verified)} verified resources, {len(resources)} total resources"
    )
    print(summary)
    return 0


# Run the generator only when this file is executed as a script; the value
# returned by main() is passed to SystemExit and becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())