"""Generate markdown resource views and search index from catalog JSON.

Usage:
    python scripts/generate_resource_views.py
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
# Maps each catalog category key to a pair of
# (path of the generated README, heading used as that README's title).
# Every README lives at resources/<folder>/README.md, so only the folder
# name and heading are spelled out per category.
CATEGORY_CONFIG = {
    category: (f"resources/{folder}/README.md", heading)
    for category, folder, heading in (
        ("dataset", "datasets", "Datasets"),
        ("model", "models", "Models"),
        ("benchmark", "benchmarks", "Benchmarks"),
        ("tool", "tools", "Tools"),
        ("paper", "papers", "Papers"),
        ("project", "projects", "Projects"),
        ("code", "codes", "Code"),
    )
}
def _load_catalog(path: Path) -> dict[str, Any]:
return json.loads(path.read_text(encoding="utf-8"))
def _escape_cell(value: str) -> str:
return value.replace("|", "\\|").strip()
def _marker_text(markers: list[str]) -> str:
return ", ".join(f"`{marker}`" for marker in markers)
def _resource_row(resource: dict[str, Any]) -> str:
    """Build the Markdown table row for one catalog resource.

    The row has four cells: the title, a link labelled with the source and
    pointing at the resource URL, a link labelled with the evidence text
    (plus the markers in parentheses, when there are any) pointing at the
    evidence URL, and the primary use.

    Every piece of free text placed in a cell goes through ``_escape_cell``.
    That includes the source label and the marker text, since an unescaped
    ``|`` in either would split the row into extra columns.

    Raises:
        KeyError: if ``resource`` or its ``pashto_evidence`` mapping lacks
            one of the keys read here.
    """
    evidence = resource["pashto_evidence"]
    evidence_label = _escape_cell(evidence["evidence_text"])
    # Pipes must be escaped even inside the backtick code spans of markers.
    markers = _escape_cell(_marker_text(evidence["markers"]))
    if markers:
        evidence_label = f"{evidence_label} ({markers})"
    cells = (
        _escape_cell(resource["title"]),
        f"[{_escape_cell(resource['source'])}]({resource['url']})",
        f"[{evidence_label}]({evidence['evidence_url']})",
        _escape_cell(resource["primary_use"]),
    )
    return "| " + " | ".join(cells) + " |"
def _write_markdown_table(path: Path, title: str, resources: list[dict[str, Any]]) -> None:
lines = [
f"# {title}",
"",
"## Verified Pashto Resources",
"",
"| Resource | Link | Pashto Evidence | Primary Use |",
"|---|---|---|---|",
]
if resources:
lines.extend(_resource_row(resource) for resource in resources)
else:
lines.append("| _None yet_ | - | - | - |")
lines.extend(
[
"",
"## Maintenance",
"- Source of truth: [../catalog/resources.json](../catalog/resources.json)",
"- Validation: [../../scripts/validate_resource_catalog.py](../../scripts/validate_resource_catalog.py)",
"- Generated by: [../../scripts/generate_resource_views.py](../../scripts/generate_resource_views.py)",
"",
]
)
path.write_text("\n".join(lines), encoding="utf-8")
def _write_resources_home(path: Path, counts: dict[str, int], total_verified: int) -> None:
lines = [
"# Resources",
"",
"Structured, Pashto-focused resource tracking lives in this folder.",
"",
"## Sections",
f"- Datasets ({counts.get('dataset', 0)}): [datasets/README.md](datasets/README.md)",
f"- Models ({counts.get('model', 0)}): [models/README.md](models/README.md)",
f"- Benchmarks ({counts.get('benchmark', 0)}): [benchmarks/README.md](benchmarks/README.md)",
f"- Tools ({counts.get('tool', 0)}): [tools/README.md](tools/README.md)",
f"- Papers ({counts.get('paper', 0)}): [papers/README.md](papers/README.md)",
f"- Projects ({counts.get('project', 0)}): [projects/README.md](projects/README.md)",
f"- Code ({counts.get('code', 0)}): [codes/README.md](codes/README.md)",
"",
"## Machine-Readable Catalog",
"- Canonical catalog: [catalog/resources.json](catalog/resources.json)",
"- Candidate feed: [catalog/pending_candidates.json](catalog/pending_candidates.json)",
"- Schema: [schema/resource.schema.json](schema/resource.schema.json)",
"",
"## Update Rule",
"- Add only validated resources with explicit Pashto relevance.",
"- Keep every external reference clickable using markdown links.",
"- Run `python scripts/validate_resource_catalog.py` before opening a PR.",
"- Run `python scripts/generate_resource_views.py` after catalog changes.",
"",
f"Verified resource count: `{total_verified}`",
"",
]
path.write_text("\n".join(lines), encoding="utf-8")
def _build_search_payload(resources: list[dict[str, Any]], updated_on: str) -> dict[str, Any]:
search_items: list[dict[str, Any]] = []
for resource in resources:
evidence = resource["pashto_evidence"]
search_items.append(
{
"id": resource["id"],
"title": resource["title"],
"url": resource["url"],
"category": resource["category"],
"source": resource["source"],
"status": resource["status"],
"summary": resource["summary"],
"primary_use": resource["primary_use"],
"tasks": resource.get("tasks", []),
"tags": resource["tags"],
"evidence_text": evidence["evidence_text"],
"evidence_url": evidence["evidence_url"],
"markers": evidence["markers"],
}
)
return {
"generated_on": f"{updated_on}T00:00:00Z",
"count": len(search_items),
"resources": search_items,
}
def main() -> int:
    """Regenerate every derived view from the resource catalog.

    Reads ``resources/catalog/resources.json`` (relative to the current
    working directory), then writes:

    * one README per entry of ``CATEGORY_CONFIG``, listing the resources
      with status ``"verified"`` in that category, sorted by lower-cased
      title;
    * ``resources/README.md`` with per-category counts and the total number
      of verified resources;
    * ``docs/search/resources.json`` containing all resources, whatever
      their status.

    Returns:
        Always 0.
    """
    catalog = _load_catalog(Path("resources/catalog/resources.json"))
    all_resources: list[dict[str, Any]] = catalog.get("resources", [])
    catalog_date = catalog.get("updated_on", "1970-01-01")

    verified_resources = [
        entry for entry in all_resources if entry.get("status") == "verified"
    ]

    # Start with an empty bucket per known category so every README is
    # written even when a category has no verified resources.
    by_category: dict[str, list[dict[str, Any]]] = {name: [] for name in CATEGORY_CONFIG}
    for entry in verified_resources:
        bucket = by_category.get(entry.get("category"))
        # Verified resources with an unknown category are left out of the
        # per-category pages.
        if bucket is not None:
            bucket.append(entry)

    for name, (relative_path, heading) in CATEGORY_CONFIG.items():
        destination = Path(relative_path)
        destination.parent.mkdir(parents=True, exist_ok=True)
        ordered = sorted(by_category[name], key=lambda entry: entry["title"].lower())
        _write_markdown_table(destination, heading, ordered)

    category_counts = {name: len(bucket) for name, bucket in by_category.items()}
    _write_resources_home(Path("resources/README.md"), category_counts, len(verified_resources))

    index_path = Path("docs/search/resources.json")
    index_path.parent.mkdir(parents=True, exist_ok=True)
    index_document = _build_search_payload(all_resources, catalog_date)
    index_path.write_text(
        json.dumps(index_document, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )

    print(
        "Generated resources markdown and search index: "
        f"{len(verified_resources)} verified resources, {len(all_resources)} total resources"
    )
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())