# pashto-language-resources/scripts/generate_resource_views.py
# Last change: "Expand resource cycle for projects/code and promote new
# Pashto sources" — musaw, commit 081627f
"""Generate markdown resource views and search index from catalog JSON.
Usage:
python scripts/generate_resource_views.py
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
# Maps each catalog resource category to its rendered markdown view:
# category key -> (output README path, human-readable page title).
CATEGORY_CONFIG = {
    "dataset": ("resources/datasets/README.md", "Datasets"),
    "model": ("resources/models/README.md", "Models"),
    "benchmark": ("resources/benchmarks/README.md", "Benchmarks"),
    "tool": ("resources/tools/README.md", "Tools"),
    "paper": ("resources/papers/README.md", "Papers"),
    "project": ("resources/projects/README.md", "Projects"),
    "code": ("resources/codes/README.md", "Code"),
}
def _load_catalog(path: Path) -> dict[str, Any]:
return json.loads(path.read_text(encoding="utf-8"))
def _escape_cell(value: str) -> str:
return value.replace("|", "\\|").strip()
def _marker_text(markers: list[str]) -> str:
return ", ".join(f"`{marker}`" for marker in markers)
def _resource_row(resource: dict[str, Any]) -> str:
    """Format one catalog resource as a four-column markdown table row."""
    evidence = resource["pashto_evidence"]
    evidence_cell = _escape_cell(evidence["evidence_text"])
    marker_part = _marker_text(evidence["markers"])
    if marker_part:
        evidence_cell = f"{evidence_cell} ({marker_part})"
    cells = [
        _escape_cell(resource["title"]),
        f"[{resource['source']}]({resource['url']})",
        f"[{evidence_cell}]({evidence['evidence_url']})",
        _escape_cell(resource["primary_use"]),
    ]
    return "| " + " | ".join(cells) + " |"
def _write_markdown_table(path: Path, title: str, resources: list[dict[str, Any]]) -> None:
    """Write a category README at *path* containing a table of *resources*.

    A placeholder row is emitted when *resources* is empty, and every page
    ends with a maintenance footer pointing at the canonical catalog and
    the validation/generation scripts.
    """
    header = [
        f"# {title}",
        "",
        "## Verified Pashto Resources",
        "",
        "| Resource | Link | Pashto Evidence | Primary Use |",
        "|---|---|---|---|",
    ]
    if resources:
        body = [_resource_row(item) for item in resources]
    else:
        body = ["| _None yet_ | - | - | - |"]
    footer = [
        "",
        "## Maintenance",
        "- Source of truth: [../catalog/resources.json](../catalog/resources.json)",
        "- Validation: [../../scripts/validate_resource_catalog.py](../../scripts/validate_resource_catalog.py)",
        "- Generated by: [../../scripts/generate_resource_views.py](../../scripts/generate_resource_views.py)",
        "",
    ]
    path.write_text("\n".join(header + body + footer), encoding="utf-8")
def _write_resources_home(path: Path, counts: dict[str, int], total_verified: int) -> None:
lines = [
"# Resources",
"",
"Structured, Pashto-focused resource tracking lives in this folder.",
"",
"## Sections",
f"- Datasets ({counts.get('dataset', 0)}): [datasets/README.md](datasets/README.md)",
f"- Models ({counts.get('model', 0)}): [models/README.md](models/README.md)",
f"- Benchmarks ({counts.get('benchmark', 0)}): [benchmarks/README.md](benchmarks/README.md)",
f"- Tools ({counts.get('tool', 0)}): [tools/README.md](tools/README.md)",
f"- Papers ({counts.get('paper', 0)}): [papers/README.md](papers/README.md)",
f"- Projects ({counts.get('project', 0)}): [projects/README.md](projects/README.md)",
f"- Code ({counts.get('code', 0)}): [codes/README.md](codes/README.md)",
"",
"## Machine-Readable Catalog",
"- Canonical catalog: [catalog/resources.json](catalog/resources.json)",
"- Candidate feed: [catalog/pending_candidates.json](catalog/pending_candidates.json)",
"- Schema: [schema/resource.schema.json](schema/resource.schema.json)",
"",
"## Update Rule",
"- Add only validated resources with explicit Pashto relevance.",
"- Keep every external reference clickable using markdown links.",
"- Run `python scripts/validate_resource_catalog.py` before opening a PR.",
"- Run `python scripts/generate_resource_views.py` after catalog changes.",
"",
f"Verified resource count: `{total_verified}`",
"",
]
path.write_text("\n".join(lines), encoding="utf-8")
def _build_search_payload(resources: list[dict[str, Any]], updated_on: str) -> dict[str, Any]:
search_items: list[dict[str, Any]] = []
for resource in resources:
evidence = resource["pashto_evidence"]
search_items.append(
{
"id": resource["id"],
"title": resource["title"],
"url": resource["url"],
"category": resource["category"],
"source": resource["source"],
"status": resource["status"],
"summary": resource["summary"],
"primary_use": resource["primary_use"],
"tasks": resource.get("tasks", []),
"tags": resource["tags"],
"evidence_text": evidence["evidence_text"],
"evidence_url": evidence["evidence_url"],
"markers": evidence["markers"],
}
)
return {
"generated_on": f"{updated_on}T00:00:00Z",
"count": len(search_items),
"resources": search_items,
}
def main() -> int:
    """Regenerate all markdown views and the search index from the catalog.

    Reads ``resources/catalog/resources.json`` relative to the working
    directory, writes one README per category plus the resources home
    page, and emits ``docs/search/resources.json``. Returns 0 on success;
    failures surface as exceptions.
    """
    catalog = _load_catalog(Path("resources/catalog/resources.json"))
    resources: list[dict[str, Any]] = catalog.get("resources", [])
    updated_on = catalog.get("updated_on", "1970-01-01")

    verified = [res for res in resources if res.get("status") == "verified"]

    # Bucket verified resources by category; unknown categories are dropped.
    grouped: dict[str, list[dict[str, Any]]] = {key: [] for key in CATEGORY_CONFIG}
    for res in verified:
        bucket = grouped.get(res.get("category"))
        if bucket is not None:
            bucket.append(res)

    # One README per category, rows sorted case-insensitively by title.
    for key, (file_path, title) in CATEGORY_CONFIG.items():
        target = Path(file_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        ordered = sorted(grouped[key], key=lambda item: item["title"].lower())
        _write_markdown_table(target, title, ordered)

    counts = {key: len(items) for key, items in grouped.items()}
    _write_resources_home(Path("resources/README.md"), counts, len(verified))

    # The search index intentionally includes every resource, not just
    # the verified subset shown in the markdown views.
    search_payload = _build_search_payload(resources, updated_on)
    index_path = Path("docs/search/resources.json")
    index_path.parent.mkdir(parents=True, exist_ok=True)
    index_path.write_text(
        json.dumps(search_payload, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )

    print(
        "Generated resources markdown and search index: "
        f"{len(verified)} verified resources, {len(resources)} total resources"
    )
    return 0
if __name__ == "__main__":
raise SystemExit(main())