| | """Validate the machine-readable Pashto resource catalog. |
| | |
| | Usage: |
| | python scripts/validate_resource_catalog.py |
| | python scripts/validate_resource_catalog.py --catalog resources/catalog/resources.json |
| | """ |
| |
|
| | from __future__ import annotations |
| |
|
import argparse
import json
import re
import sys
from datetime import date
from pathlib import Path
from typing import Any
from urllib.parse import urlparse
| |
|
| |
|
# Closed vocabularies for catalog entry fields.
ALLOWED_CATEGORIES = {"dataset", "model", "benchmark", "tool", "paper", "project", "code"}
ALLOWED_SOURCES = {
    "huggingface",
    "mozilla",
    "kaggle",
    "github",
    "gitlab",
    "arxiv",
    "openalex",
    "crossref",
    "zenodo",
    "dataverse",
    "datacite",
    "meta",
    "other",
}
ALLOWED_STATUS = {"verified", "candidate"}
# Resource ids: lowercase alphanumeric first char, then alphanumerics, '.', '_', '-'.
RESOURCE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9._-]*$")
# Categories that must carry explicit Pashto evidence (enforced in validate_resource).
STRICT_PASHTO_CATEGORIES = {"model", "paper", "tool", "code", "project"}
# Standalone "pushto": lookarounds block adjacent ascii letters/digits, but unlike
# \b they still allow '_' neighbours (e.g. matches inside "x_pushto").
PASHTO_PUSHTO_WORD_RE = re.compile(r"(?<![a-z0-9])pushto(?![a-z0-9])")
# Pashto language codes: ps, ps_af, pus, pbt_arab / pbt-arab (word-bounded).
PASHTO_CODE_RE = re.compile(r"\b(ps(_af)?|pus|pbt[_-]?arab)\b")
| |
|
| |
|
| | def _load_json(path: Path) -> dict[str, Any]: |
| | return json.loads(path.read_text(encoding="utf-8")) |
| |
|
| |
|
| | def _is_valid_http_url(value: str) -> bool: |
| | parsed = urlparse(value) |
| | return parsed.scheme in {"http", "https"} and bool(parsed.netloc) |
| |
|
| |
|
| | def _validate_iso_date(value: str) -> bool: |
| | try: |
| | date.fromisoformat(value) |
| | except ValueError: |
| | return False |
| | return True |
| |
|
| |
|
def _contains_pashto_marker(value: str) -> bool:
    """Return True when *value* mentions Pashto by name, language code, or native script."""
    if not isinstance(value, str):
        return False

    lowered = value.casefold()
    for latin_name in ("pashto", "pukhto", "pakhto"):
        if latin_name in lowered:
            return True

    if PASHTO_PUSHTO_WORD_RE.search(lowered) or PASHTO_CODE_RE.search(lowered):
        return True

    # Native-script spellings are checked against the original (case-insensitive
    # folding is irrelevant for Arabic-script text).
    return "پښتو" in value or "پشتو" in value
| |
|
| |
|
def _has_pashto_evidence(evidence: dict[str, Any]) -> bool:
    """Return True when any textual field of *evidence* carries a Pashto marker."""
    candidates: list[str] = []

    # Collect the scalar string fields, skipping anything mistyped.
    for key in ("evidence_text", "evidence_url"):
        value = evidence.get(key)
        if isinstance(value, str):
            candidates.append(value)

    markers = evidence.get("markers")
    if isinstance(markers, list):
        candidates.extend(item for item in markers if isinstance(item, str))

    return any(_contains_pashto_marker(text) for text in candidates)
| |
|
| |
|
def validate_resource(resource: dict[str, Any], index: int) -> list[str]:
    """Validate a single catalog entry and collect error messages.

    Args:
        resource: One entry from the catalog's ``resources`` array.
        index: Position of the entry in the array, used to prefix messages.

    Returns:
        Human-readable error strings; an empty list means the entry is valid.
    """
    errors: list[str] = []
    prefix = f"resource[{index}]"

    required_fields = {
        "id",
        "title",
        "url",
        "category",
        "source",
        "status",
        "summary",
        "primary_use",
        "pashto_evidence",
        "tags",
    }
    missing = sorted(required_fields - resource.keys())
    if missing:
        # The field-level checks below index into the resource directly,
        # so bail out as soon as anything required is absent.
        errors.append(f"{prefix} missing required fields: {', '.join(missing)}")
        return errors

    rid = resource["id"]
    if not isinstance(rid, str) or not RESOURCE_ID_RE.fullmatch(rid):
        errors.append(f"{prefix}.id must match {RESOURCE_ID_RE.pattern}")

    title = resource["title"]
    if not isinstance(title, str) or len(title.strip()) < 3:
        errors.append(f"{prefix}.title must be a non-empty string")

    url = resource["url"]
    if not isinstance(url, str) or not _is_valid_http_url(url):
        errors.append(f"{prefix}.url must be a valid http/https URL")

    category = resource["category"]
    if category not in ALLOWED_CATEGORIES:
        errors.append(f"{prefix}.category must be one of {sorted(ALLOWED_CATEGORIES)}")

    source = resource["source"]
    if source not in ALLOWED_SOURCES:
        errors.append(f"{prefix}.source must be one of {sorted(ALLOWED_SOURCES)}")

    status = resource["status"]
    if status not in ALLOWED_STATUS:
        errors.append(f"{prefix}.status must be one of {sorted(ALLOWED_STATUS)}")

    summary = resource["summary"]
    if not isinstance(summary, str) or len(summary.strip()) < 10:
        errors.append(f"{prefix}.summary must be at least 10 characters")

    primary_use = resource["primary_use"]
    if not isinstance(primary_use, str) or len(primary_use.strip()) < 3:
        errors.append(f"{prefix}.primary_use must be a non-empty string")

    # "tasks" is optional but, when present, must be a list of strings.
    if "tasks" in resource and not (
        isinstance(resource["tasks"], list)
        and all(isinstance(item, str) and item.strip() for item in resource["tasks"])
    ):
        errors.append(f"{prefix}.tasks must be a list of strings")

    tags = resource["tags"]
    if not (isinstance(tags, list) and tags and all(isinstance(tag, str) and tag.strip() for tag in tags)):
        errors.append(f"{prefix}.tags must be a non-empty list of strings")

    evidence = resource["pashto_evidence"]
    if not isinstance(evidence, dict):
        # The remaining checks all read evidence keys; stop here.
        errors.append(f"{prefix}.pashto_evidence must be an object")
        return errors

    for key in ("evidence_text", "evidence_url", "markers"):
        if key not in evidence:
            errors.append(f"{prefix}.pashto_evidence missing '{key}'")

    evidence_text = evidence.get("evidence_text")
    if not isinstance(evidence_text, str) or len(evidence_text.strip()) < 3:
        # Message made consistent with title/primary_use wording: the check
        # requires a non-trivial string, not merely "a string".
        errors.append(f"{prefix}.pashto_evidence.evidence_text must be a non-empty string")

    evidence_url = evidence.get("evidence_url")
    if not isinstance(evidence_url, str) or not _is_valid_http_url(evidence_url):
        errors.append(f"{prefix}.pashto_evidence.evidence_url must be a valid http/https URL")

    markers = evidence.get("markers")
    if not (isinstance(markers, list) and markers and all(isinstance(marker, str) and marker.strip() for marker in markers)):
        errors.append(f"{prefix}.pashto_evidence.markers must be a non-empty list of strings")

    # Strict categories must be demonstrably Pashto-centric somewhere visible.
    if category in STRICT_PASHTO_CATEGORIES and not (
        _contains_pashto_marker(title)
        or _contains_pashto_marker(url)
        or _has_pashto_evidence(evidence)
    ):
        errors.append(
            f"{prefix} must be Pashto-centric for category '{category}' "
            "(include a Pashto marker in title, URL, or pashto_evidence)"
        )

    return errors
| |
|
| |
|
def validate_catalog(catalog: dict[str, Any]) -> list[str]:
    """Validate the whole catalog document and return every error found."""
    errors: list[str] = []

    # Top-level shape first; without these keys nothing else can be checked.
    errors.extend(
        f"catalog missing required top-level key: {key}"
        for key in ("version", "updated_on", "resources")
        if key not in catalog
    )
    if errors:
        return errors

    version = catalog["version"]
    if not (isinstance(version, str) and re.fullmatch(r"^\d+\.\d+\.\d+$", version)):
        errors.append("catalog.version must look like '1.0.0'")

    updated_on = catalog["updated_on"]
    if not (isinstance(updated_on, str) and _validate_iso_date(updated_on)):
        errors.append("catalog.updated_on must be a valid ISO date (YYYY-MM-DD)")

    resources = catalog["resources"]
    if not isinstance(resources, list):
        errors.append("catalog.resources must be a list")
        return errors

    seen_ids: set[str] = set()
    for position, entry in enumerate(resources):
        if not isinstance(entry, dict):
            errors.append(f"resource[{position}] must be an object")
            continue
        errors.extend(validate_resource(entry, position))
        entry_id = entry.get("id")
        if not isinstance(entry_id, str):
            continue
        if entry_id in seen_ids:
            errors.append(f"duplicate resource id: {entry_id}")
        seen_ids.add(entry_id)

    return errors
| |
|
| |
|
def main() -> int:
    """CLI entry point: load and validate the resource catalog.

    Returns:
        Process exit code: 0 when the catalog is valid, 1 on any failure.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--catalog", default="resources/catalog/resources.json")
    parser.add_argument("--schema", default="resources/schema/resource.schema.json")
    args = parser.parse_args()

    catalog_path = Path(args.catalog)
    schema_path = Path(args.schema)

    # Diagnostics go to stderr so that stdout carries only the success line
    # and stays clean for pipelines.
    if not catalog_path.exists():
        print(f"Missing catalog file: {catalog_path}", file=sys.stderr)
        return 1
    if not schema_path.exists():
        print(f"Missing schema file: {schema_path}", file=sys.stderr)
        return 1

    try:
        schema = _load_json(schema_path)
        catalog = _load_json(catalog_path)
    except json.JSONDecodeError as exc:
        print(f"Invalid JSON: {exc}", file=sys.stderr)
        return 1

    # NOTE(review): the schema file is only sanity-checked for a '$schema' key;
    # the catalog is validated by the hand-written rules below, not against the
    # JSON Schema itself.
    if not isinstance(schema, dict) or "$schema" not in schema:
        print("Schema file must be a JSON object with a '$schema' key", file=sys.stderr)
        return 1

    errors = validate_catalog(catalog)
    if errors:
        print("Resource catalog validation failed:", file=sys.stderr)
        for error in errors:
            print(f"- {error}", file=sys.stderr)
        return 1

    print(f"Resource catalog valid: {len(catalog['resources'])} resources")
    return 0
| |
|
| |
|
# Script entry point; SystemExit propagates main()'s exit code to the shell.
if __name__ == "__main__":
    raise SystemExit(main())
| |
|