musaw commited on
Commit ·
f13fd7c
1
Parent(s): fb472d7
Add automated Pashto resource catalog, search UI, and sync workflow
Browse files- .github/ISSUE_TEMPLATE/resource_addition.md +34 -0
- .github/workflows/ci.yml +9 -0
- .github/workflows/resource_sync.yml +52 -0
- CONTRIBUTING.md +4 -6
- README.md +4 -0
- docs/README.md +11 -6
- docs/index.md +7 -1
- docs/resource_automation.md +36 -0
- docs/resource_catalog.md +23 -16
- docs/search/index.html +418 -0
- docs/search/resources.json +595 -0
- resources/README.md +15 -6
- resources/benchmarks/README.md +12 -11
- resources/catalog/README.md +14 -0
- resources/catalog/pending_candidates.json +474 -0
- resources/catalog/resource.template.json +25 -0
- resources/catalog/resources.json +645 -0
- resources/datasets/README.md +14 -13
- resources/models/README.md +12 -13
- resources/papers/README.md +15 -0
- resources/schema/resource.schema.json +142 -0
- resources/tools/README.md +10 -14
- scripts/README.md +24 -6
- scripts/generate_resource_views.py +174 -0
- scripts/sync_resources.py +283 -0
- scripts/validate_resource_catalog.py +207 -0
- tests/test_validate_resource_catalog.py +45 -0
.github/ISSUE_TEMPLATE/resource_addition.md
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
name: Resource addition
|
| 3 |
+
about: Propose a new Pashto-related dataset/model/tool/paper for the catalog
|
| 4 |
+
title: "[resource] "
|
| 5 |
+
labels: ["docs", "help wanted"]
|
| 6 |
+
assignees: []
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## Resource type
|
| 10 |
+
- [ ] Dataset
|
| 11 |
+
- [ ] Model
|
| 12 |
+
- [ ] Benchmark
|
| 13 |
+
- [ ] Tool
|
| 14 |
+
- [ ] Paper
|
| 15 |
+
|
| 16 |
+
## Resource URL
|
| 17 |
+
<!-- Add one canonical URL -->
|
| 18 |
+
|
| 19 |
+
## Why this is Pashto-relevant
|
| 20 |
+
<!-- Include explicit markers such as Pashto, ps, pus, pbt_Arab, ps_af -->
|
| 21 |
+
|
| 22 |
+
## Pashto evidence link
|
| 23 |
+
<!-- Link to the exact line/page/model card proving Pashto support -->
|
| 24 |
+
|
| 25 |
+
## Suggested primary use in this repository
|
| 26 |
+
<!-- Example: ASR baseline, MT benchmark, NLP pretraining -->
|
| 27 |
+
|
| 28 |
+
## License and usage notes
|
| 29 |
+
<!-- Include known license terms and restrictions -->
|
| 30 |
+
|
| 31 |
+
## Checklist
|
| 32 |
+
- [ ] Link is clickable markdown format
|
| 33 |
+
- [ ] Evidence is explicit and verifiable
|
| 34 |
+
- [ ] Not already present in `resources/catalog/resources.json`
|
.github/workflows/ci.yml
CHANGED
|
@@ -23,6 +23,15 @@ jobs:
|
|
| 23 |
python -m pip install --upgrade pip
|
| 24 |
python -m pip install -e ".[dev]"
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
- name: Check markdown links format
|
| 27 |
run: python scripts/check_links.py
|
| 28 |
|
|
|
|
| 23 |
python -m pip install --upgrade pip
|
| 24 |
python -m pip install -e ".[dev]"
|
| 25 |
|
| 26 |
+
- name: Validate resource catalog
|
| 27 |
+
run: python scripts/validate_resource_catalog.py
|
| 28 |
+
|
| 29 |
+
- name: Generate resource views
|
| 30 |
+
run: python scripts/generate_resource_views.py
|
| 31 |
+
|
| 32 |
+
- name: Ensure generated files are committed
|
| 33 |
+
run: git diff --exit-code
|
| 34 |
+
|
| 35 |
- name: Check markdown links format
|
| 36 |
run: python scripts/check_links.py
|
| 37 |
|
.github/workflows/resource_sync.yml
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Resource Sync
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
schedule:
|
| 5 |
+
- cron: "0 4 * * 1"
|
| 6 |
+
workflow_dispatch:
|
| 7 |
+
|
| 8 |
+
permissions:
|
| 9 |
+
contents: write
|
| 10 |
+
pull-requests: write
|
| 11 |
+
|
| 12 |
+
jobs:
|
| 13 |
+
sync:
|
| 14 |
+
runs-on: ubuntu-latest
|
| 15 |
+
steps:
|
| 16 |
+
- name: Checkout
|
| 17 |
+
uses: actions/checkout@v4
|
| 18 |
+
|
| 19 |
+
- name: Set up Python
|
| 20 |
+
uses: actions/setup-python@v5
|
| 21 |
+
with:
|
| 22 |
+
python-version: "3.11"
|
| 23 |
+
|
| 24 |
+
- name: Install dependencies
|
| 25 |
+
run: |
|
| 26 |
+
python -m pip install --upgrade pip
|
| 27 |
+
python -m pip install -e ".[dev]"
|
| 28 |
+
|
| 29 |
+
- name: Sync candidate resources
|
| 30 |
+
run: python scripts/sync_resources.py --limit 20
|
| 31 |
+
|
| 32 |
+
- name: Validate catalog
|
| 33 |
+
run: python scripts/validate_resource_catalog.py
|
| 34 |
+
|
| 35 |
+
- name: Create review PR
|
| 36 |
+
uses: peter-evans/create-pull-request@v6
|
| 37 |
+
with:
|
| 38 |
+
branch: bot/resource-sync
|
| 39 |
+
delete-branch: true
|
| 40 |
+
commit-message: "chore(resources): sync candidate feed"
|
| 41 |
+
title: "chore(resources): sync Pashto resource candidates"
|
| 42 |
+
body: |
|
| 43 |
+
Automated weekly candidate sync.
|
| 44 |
+
|
| 45 |
+
Scope:
|
| 46 |
+
- Updates `resources/catalog/pending_candidates.json`
|
| 47 |
+
- Leaves verified catalog unchanged for maintainer review
|
| 48 |
+
labels: |
|
| 49 |
+
resource-update
|
| 50 |
+
needs-review
|
| 51 |
+
add-paths: |
|
| 52 |
+
resources/catalog/pending_candidates.json
|
CONTRIBUTING.md
CHANGED
|
@@ -24,12 +24,10 @@ Then contribute here by opening an issue/PR with:
|
|
| 24 |
- what concrete follow-up is needed in this repository.
|
| 25 |
|
| 26 |
## 🔍 External Resource Contribution Rules
|
| 27 |
-
- Add
|
| 28 |
-
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
- how it can be used in this repository,
|
| 32 |
-
- practical applications.
|
| 33 |
- Prefer official pages and model/dataset cards over third-party reposts.
|
| 34 |
|
| 35 |
## 🔄 Contribution Flow
|
|
|
|
| 24 |
- what concrete follow-up is needed in this repository.
|
| 25 |
|
| 26 |
## 🔍 External Resource Contribution Rules
|
| 27 |
+
- Add or update entries in [resources/catalog/resources.json](resources/catalog/resources.json) using [resources/catalog/resource.template.json](resources/catalog/resource.template.json).
|
| 28 |
+
- Validate catalog changes with `python scripts/validate_resource_catalog.py`.
|
| 29 |
+
- Regenerate resource docs and search data with `python scripts/generate_resource_views.py`.
|
| 30 |
+
- Use [docs/resource_catalog.md](docs/resource_catalog.md) and [docs/resource_automation.md](docs/resource_automation.md) for full rules.
|
|
|
|
|
|
|
| 31 |
- Prefer official pages and model/dataset cards over third-party reposts.
|
| 32 |
|
| 33 |
## 🔄 Contribution Flow
|
README.md
CHANGED
|
@@ -17,6 +17,7 @@ Community-led open-source project to make Pashto a first-class language in AI sp
|
|
| 17 |
- GitHub: [Pukhto_Pashto](https://github.com/Musawer1214/Pukhto_Pashto)
|
| 18 |
- Hugging Face: [Musawer14/Pukhto_Pashto](https://huggingface.co/Musawer14/Pukhto_Pashto)
|
| 19 |
- GitHub Pages (About): [Pukhto_Pashto Site](https://musawer1214.github.io/Pukhto_Pashto/)
|
|
|
|
| 20 |
|
| 21 |
## 🎯 Core Goal
|
| 22 |
- Build open datasets, benchmarks, and models for Pashto ASR, TTS, and NLP.
|
|
@@ -41,10 +42,13 @@ Community-led open-source project to make Pashto a first-class language in AI sp
|
|
| 41 |
## 📚 Verified Resource Catalog
|
| 42 |
The project tracks validated external resources in:
|
| 43 |
- [docs/resource_catalog.md](docs/resource_catalog.md) (master index)
|
|
|
|
|
|
|
| 44 |
- [resources/datasets/README.md](resources/datasets/README.md)
|
| 45 |
- [resources/models/README.md](resources/models/README.md)
|
| 46 |
- [resources/benchmarks/README.md](resources/benchmarks/README.md)
|
| 47 |
- [resources/tools/README.md](resources/tools/README.md)
|
|
|
|
| 48 |
|
| 49 |
## 🎙️ Featured Dataset: Common Voice Pashto
|
| 50 |
- Dataset: Common Voice Scripted Speech 24.0 - Pashto
|
|
|
|
| 17 |
- GitHub: [Pukhto_Pashto](https://github.com/Musawer1214/Pukhto_Pashto)
|
| 18 |
- Hugging Face: [Musawer14/Pukhto_Pashto](https://huggingface.co/Musawer14/Pukhto_Pashto)
|
| 19 |
- GitHub Pages (About): [Pukhto_Pashto Site](https://musawer1214.github.io/Pukhto_Pashto/)
|
| 20 |
+
- GitHub Pages (Resource Search): [Pashto Resource Search](https://musawer1214.github.io/Pukhto_Pashto/search/)
|
| 21 |
|
| 22 |
## 🎯 Core Goal
|
| 23 |
- Build open datasets, benchmarks, and models for Pashto ASR, TTS, and NLP.
|
|
|
|
| 42 |
## 📚 Verified Resource Catalog
|
| 43 |
The project tracks validated external resources in:
|
| 44 |
- [docs/resource_catalog.md](docs/resource_catalog.md) (master index)
|
| 45 |
+
- [resources/catalog/resources.json](resources/catalog/resources.json) (canonical machine-readable catalog)
|
| 46 |
+
- [resources/schema/resource.schema.json](resources/schema/resource.schema.json) (catalog schema)
|
| 47 |
- [resources/datasets/README.md](resources/datasets/README.md)
|
| 48 |
- [resources/models/README.md](resources/models/README.md)
|
| 49 |
- [resources/benchmarks/README.md](resources/benchmarks/README.md)
|
| 50 |
- [resources/tools/README.md](resources/tools/README.md)
|
| 51 |
+
- [resources/papers/README.md](resources/papers/README.md)
|
| 52 |
|
| 53 |
## 🎙️ Featured Dataset: Common Voice Pashto
|
| 54 |
- Dataset: Common Voice Scripted Speech 24.0 - Pashto
|
docs/README.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
This folder is the main documentation entry point for contributors.
|
| 4 |
|
| 5 |
-
##
|
| 6 |
- Project purpose: [../PROJECT_PURPOSE.md](../PROJECT_PURPOSE.md)
|
| 7 |
- Contributing guide: [../CONTRIBUTING.md](../CONTRIBUTING.md)
|
| 8 |
- Governance: [../GOVERNANCE.md](../GOVERNANCE.md)
|
|
@@ -10,7 +10,7 @@ This folder is the main documentation entry point for contributors.
|
|
| 10 |
- Roadmap: [../ROADMAP.md](../ROADMAP.md)
|
| 11 |
- Changelog: [../CHANGELOG.md](../CHANGELOG.md)
|
| 12 |
|
| 13 |
-
##
|
| 14 |
- Workstreams: [workstreams.md](workstreams.md)
|
| 15 |
- Dataset guidelines: [dataset_guidelines.md](dataset_guidelines.md)
|
| 16 |
- Pashto normalization policy: [pashto_normalization_v0.1.md](pashto_normalization_v0.1.md)
|
|
@@ -19,17 +19,22 @@ This folder is the main documentation entry point for contributors.
|
|
| 19 |
- Release checklist: [release_checklist.md](release_checklist.md)
|
| 20 |
- Platforms and publish flow: [platforms.md](platforms.md)
|
| 21 |
- GitHub operations: [github_operations.md](github_operations.md)
|
|
|
|
| 22 |
|
| 23 |
-
##
|
| 24 |
- Master resource index: [resource_catalog.md](resource_catalog.md)
|
|
|
|
| 25 |
- Structured resources folder: [../resources/README.md](../resources/README.md)
|
| 26 |
|
| 27 |
-
##
|
| 28 |
- Scripts overview: [../scripts/README.md](../scripts/README.md)
|
| 29 |
- Link checker: [../scripts/check_links.py](../scripts/check_links.py)
|
|
|
|
|
|
|
|
|
|
| 30 |
- Normalization validator: [../scripts/validate_normalization.py](../scripts/validate_normalization.py)
|
| 31 |
|
| 32 |
-
##
|
| 33 |
- Benchmark result format: [../benchmarks/results/README.md](../benchmarks/results/README.md)
|
| 34 |
- Benchmark schema: [../benchmarks/schema/benchmark_result.schema.json](../benchmarks/schema/benchmark_result.schema.json)
|
| 35 |
- Experiment run cards: [../experiments/README.md](../experiments/README.md)
|
|
|
|
| 1 |
+
# Documentation Hub
|
| 2 |
|
| 3 |
This folder is the main documentation entry point for contributors.
|
| 4 |
|
| 5 |
+
## Start here
|
| 6 |
- Project purpose: [../PROJECT_PURPOSE.md](../PROJECT_PURPOSE.md)
|
| 7 |
- Contributing guide: [../CONTRIBUTING.md](../CONTRIBUTING.md)
|
| 8 |
- Governance: [../GOVERNANCE.md](../GOVERNANCE.md)
|
|
|
|
| 10 |
- Roadmap: [../ROADMAP.md](../ROADMAP.md)
|
| 11 |
- Changelog: [../CHANGELOG.md](../CHANGELOG.md)
|
| 12 |
|
| 13 |
+
## Core documentation
|
| 14 |
- Workstreams: [workstreams.md](workstreams.md)
|
| 15 |
- Dataset guidelines: [dataset_guidelines.md](dataset_guidelines.md)
|
| 16 |
- Pashto normalization policy: [pashto_normalization_v0.1.md](pashto_normalization_v0.1.md)
|
|
|
|
| 19 |
- Release checklist: [release_checklist.md](release_checklist.md)
|
| 20 |
- Platforms and publish flow: [platforms.md](platforms.md)
|
| 21 |
- GitHub operations: [github_operations.md](github_operations.md)
|
| 22 |
+
- Resource automation: [resource_automation.md](resource_automation.md)
|
| 23 |
|
| 24 |
+
## Resource tracking
|
| 25 |
- Master resource index: [resource_catalog.md](resource_catalog.md)
|
| 26 |
+
- GitHub Pages search: [search/index.html](search/index.html)
|
| 27 |
- Structured resources folder: [../resources/README.md](../resources/README.md)
|
| 28 |
|
| 29 |
+
## Tooling
|
| 30 |
- Scripts overview: [../scripts/README.md](../scripts/README.md)
|
| 31 |
- Link checker: [../scripts/check_links.py](../scripts/check_links.py)
|
| 32 |
+
- Resource catalog validator: [../scripts/validate_resource_catalog.py](../scripts/validate_resource_catalog.py)
|
| 33 |
+
- Resource view generator: [../scripts/generate_resource_views.py](../scripts/generate_resource_views.py)
|
| 34 |
+
- Candidate sync script: [../scripts/sync_resources.py](../scripts/sync_resources.py)
|
| 35 |
- Normalization validator: [../scripts/validate_normalization.py](../scripts/validate_normalization.py)
|
| 36 |
|
| 37 |
+
## Evaluation and experiments
|
| 38 |
- Benchmark result format: [../benchmarks/results/README.md](../benchmarks/results/README.md)
|
| 39 |
- Benchmark schema: [../benchmarks/schema/benchmark_result.schema.json](../benchmarks/schema/benchmark_result.schema.json)
|
| 40 |
- Experiment run cards: [../experiments/README.md](../experiments/README.md)
|
docs/index.md
CHANGED
|
@@ -21,7 +21,13 @@ title: About Pukhto Pashto
|
|
| 21 |
- `benchmarks/`: benchmark schema, result format, and metric guidance.
|
| 22 |
- `experiments/`: reproducible run cards and experiment records.
|
| 23 |
- `docs/`: policies, roadmap, release process, and operating guides.
|
| 24 |
-
- `resources/`: verified external Pashto datasets, models, tools, and
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
## Project References
|
| 27 |
|
|
|
|
| 21 |
- `benchmarks/`: benchmark schema, result format, and metric guidance.
|
| 22 |
- `experiments/`: reproducible run cards and experiment records.
|
| 23 |
- `docs/`: policies, roadmap, release process, and operating guides.
|
| 24 |
+
- `resources/`: verified external Pashto datasets, models, tools, benchmarks, and papers.
|
| 25 |
+
|
| 26 |
+
## Search Resources
|
| 27 |
+
|
| 28 |
+
- Search UI: [Pashto Resource Search](search/)
|
| 29 |
+
- Resource index docs: [resource_catalog.md](resource_catalog.md)
|
| 30 |
+
- Machine-readable catalog: [../resources/catalog/resources.json](../resources/catalog/resources.json)
|
| 31 |
|
| 32 |
## Project References
|
| 33 |
|
docs/resource_automation.md
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Resource Automation
|
| 2 |
+
|
| 3 |
+
This repository uses a semi-automated process to keep Pashto resources current while preserving human review.
|
| 4 |
+
|
| 5 |
+
## Goals
|
| 6 |
+
- Discover new Pashto-relevant resources from trusted public endpoints.
|
| 7 |
+
- Keep a machine-readable canonical catalog.
|
| 8 |
+
- Prevent unreviewed low-confidence resources from directly entering verified lists.
|
| 9 |
+
|
| 10 |
+
## Files involved
|
| 11 |
+
- Canonical verified catalog: [../resources/catalog/resources.json](../resources/catalog/resources.json)
|
| 12 |
+
- Candidate feed: [../resources/catalog/pending_candidates.json](../resources/catalog/pending_candidates.json)
|
| 13 |
+
- Catalog schema: [../resources/schema/resource.schema.json](../resources/schema/resource.schema.json)
|
| 14 |
+
- Search export: [search/resources.json](search/resources.json)
|
| 15 |
+
|
| 16 |
+
## Scripts
|
| 17 |
+
- Validate catalog: `python scripts/validate_resource_catalog.py`
|
| 18 |
+
- Generate markdown and search index: `python scripts/generate_resource_views.py`
|
| 19 |
+
- Sync new candidates: `python scripts/sync_resources.py --limit 20`
|
| 20 |
+
|
| 21 |
+
## GitHub Actions
|
| 22 |
+
- CI (`.github/workflows/ci.yml`) enforces:
|
| 23 |
+
- catalog validation
|
| 24 |
+
- generated file consistency
|
| 25 |
+
- markdown link checks
|
| 26 |
+
- tests
|
| 27 |
+
- Resource Sync (`.github/workflows/resource_sync.yml`) runs weekly and opens a PR with candidate updates.
|
| 28 |
+
|
| 29 |
+
## Review flow
|
| 30 |
+
1. Inspect candidate entries in `resources/catalog/pending_candidates.json`.
|
| 31 |
+
2. Select useful items and move them into `resources/catalog/resources.json`.
|
| 32 |
+
3. Set `status` to `verified` only after checking evidence and license.
|
| 33 |
+
4. Run:
|
| 34 |
+
- `python scripts/validate_resource_catalog.py`
|
| 35 |
+
- `python scripts/generate_resource_views.py`
|
| 36 |
+
5. Commit and open PR.
|
docs/resource_catalog.md
CHANGED
|
@@ -1,36 +1,43 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
Last updated: `2026-02-15`
|
| 4 |
|
| 5 |
This index points to validated Pashto-related resources tracked in structured files.
|
| 6 |
|
| 7 |
-
##
|
| 8 |
-
- Verify
|
| 9 |
-
- Verify explicit Pashto support markers (`ps`, `ps_af`, `
|
| 10 |
- Include only resources with practical use for this repository.
|
| 11 |
|
| 12 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
- Datasets: [../resources/datasets/README.md](../resources/datasets/README.md)
|
| 14 |
- Models: [../resources/models/README.md](../resources/models/README.md)
|
| 15 |
- Benchmarks: [../resources/benchmarks/README.md](../resources/benchmarks/README.md)
|
| 16 |
-
- Tools
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
-
##
|
| 19 |
- Data workspace: [../data/README.md](../data/README.md)
|
| 20 |
- ASR workspace: [../asr/README.md](../asr/README.md)
|
| 21 |
- TTS workspace: [../tts/README.md](../tts/README.md)
|
| 22 |
- Benchmarks workspace: [../benchmarks/README.md](../benchmarks/README.md)
|
| 23 |
- Applications workspace: [../apps/desktop/README.md](../apps/desktop/README.md)
|
| 24 |
|
| 25 |
-
##
|
| 26 |
Before each release:
|
| 27 |
- Confirm links still resolve.
|
| 28 |
- Confirm Pashto support markers remain valid.
|
| 29 |
-
- Confirm license
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
- `
|
| 33 |
-
- `FLORES-200` benchmark reference with `pbt_Arab` language code coverage.
|
| 34 |
-
- `facebook/mms-1b-all` ASR model reference for multilingual Pashto transfer.
|
| 35 |
-
- `mdarhri/pashto-bert` model for Pashto NLP baseline work.
|
| 36 |
-
- Two Kaggle resources: Pashto isolated-word speech and Pashto word embeddings.
|
|
|
|
| 1 |
+
# Verified Pashto Resource Catalog
|
| 2 |
|
| 3 |
Last updated: `2026-02-15`
|
| 4 |
|
| 5 |
This index points to validated Pashto-related resources tracked in structured files.
|
| 6 |
|
| 7 |
+
## Validation method
|
| 8 |
+
- Verify source URL resolves to official page or canonical repository.
|
| 9 |
+
- Verify explicit Pashto support markers (`Pashto`, `ps`, `ps_af`, `pus`, `pbt_Arab`) where possible.
|
| 10 |
- Include only resources with practical use for this repository.
|
| 11 |
|
| 12 |
+
## Structured catalog
|
| 13 |
+
- Canonical JSON: [../resources/catalog/resources.json](../resources/catalog/resources.json)
|
| 14 |
+
- Candidate feed: [../resources/catalog/pending_candidates.json](../resources/catalog/pending_candidates.json)
|
| 15 |
+
- JSON schema: [../resources/schema/resource.schema.json](../resources/schema/resource.schema.json)
|
| 16 |
+
|
| 17 |
+
## Generated markdown views
|
| 18 |
- Datasets: [../resources/datasets/README.md](../resources/datasets/README.md)
|
| 19 |
- Models: [../resources/models/README.md](../resources/models/README.md)
|
| 20 |
- Benchmarks: [../resources/benchmarks/README.md](../resources/benchmarks/README.md)
|
| 21 |
+
- Tools: [../resources/tools/README.md](../resources/tools/README.md)
|
| 22 |
+
- Papers: [../resources/papers/README.md](../resources/papers/README.md)
|
| 23 |
+
|
| 24 |
+
## Search page
|
| 25 |
+
- GitHub Pages search UI: [search/index.html](search/index.html)
|
| 26 |
+
- Search data export: [search/resources.json](search/resources.json)
|
| 27 |
+
- Automation guide: [resource_automation.md](resource_automation.md)
|
| 28 |
|
| 29 |
+
## Workspace mapping
|
| 30 |
- Data workspace: [../data/README.md](../data/README.md)
|
| 31 |
- ASR workspace: [../asr/README.md](../asr/README.md)
|
| 32 |
- TTS workspace: [../tts/README.md](../tts/README.md)
|
| 33 |
- Benchmarks workspace: [../benchmarks/README.md](../benchmarks/README.md)
|
| 34 |
- Applications workspace: [../apps/desktop/README.md](../apps/desktop/README.md)
|
| 35 |
|
| 36 |
+
## Maintenance rule
|
| 37 |
Before each release:
|
| 38 |
- Confirm links still resolve.
|
| 39 |
- Confirm Pashto support markers remain valid.
|
| 40 |
+
- Confirm license or usage terms are still compatible.
|
| 41 |
+
- Run:
|
| 42 |
+
- `python scripts/validate_resource_catalog.py`
|
| 43 |
+
- `python scripts/generate_resource_views.py`
|
|
|
|
|
|
|
|
|
|
|
|
docs/search/index.html
ADDED
|
@@ -0,0 +1,418 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!doctype html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="utf-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 6 |
+
<title>Pashto Resource Search</title>
|
| 7 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 8 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 9 |
+
<link href="https://fonts.googleapis.com/css2?family=IBM+Plex+Sans+Arabic:wght@400;500;700&family=Space+Grotesk:wght@500;700&display=swap" rel="stylesheet">
|
| 10 |
+
<style>
|
| 11 |
+
:root {
|
| 12 |
+
--bg: #f6f4ec;
|
| 13 |
+
--panel: #fffef9;
|
| 14 |
+
--ink: #1d2a24;
|
| 15 |
+
--muted: #4c6158;
|
| 16 |
+
--line: #d6ddd7;
|
| 17 |
+
--brand: #106b53;
|
| 18 |
+
--brand-soft: #e0f0ea;
|
| 19 |
+
--accent: #c76a1a;
|
| 20 |
+
--accent-soft: #f7e9d8;
|
| 21 |
+
--shadow: 0 12px 30px rgba(29, 42, 36, 0.08);
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
* { box-sizing: border-box; }
|
| 25 |
+
|
| 26 |
+
body {
|
| 27 |
+
margin: 0;
|
| 28 |
+
font-family: "IBM Plex Sans Arabic", "Segoe UI", sans-serif;
|
| 29 |
+
color: var(--ink);
|
| 30 |
+
background:
|
| 31 |
+
radial-gradient(circle at 8% 12%, #fce4c4 0, rgba(252, 228, 196, 0) 35%),
|
| 32 |
+
radial-gradient(circle at 92% 6%, #d8ece5 0, rgba(216, 236, 229, 0) 38%),
|
| 33 |
+
var(--bg);
|
| 34 |
+
min-height: 100vh;
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
.wrap {
|
| 38 |
+
max-width: 1100px;
|
| 39 |
+
margin: 0 auto;
|
| 40 |
+
padding: 24px 18px 44px;
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
.hero {
|
| 44 |
+
background: linear-gradient(120deg, #fff, #f7fbf9);
|
| 45 |
+
border: 1px solid var(--line);
|
| 46 |
+
box-shadow: var(--shadow);
|
| 47 |
+
border-radius: 18px;
|
| 48 |
+
padding: 20px 18px;
|
| 49 |
+
margin-bottom: 16px;
|
| 50 |
+
transform: translateY(8px);
|
| 51 |
+
opacity: 0;
|
| 52 |
+
animation: rise 500ms ease forwards;
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
.eyebrow {
|
| 56 |
+
letter-spacing: 0.07em;
|
| 57 |
+
text-transform: uppercase;
|
| 58 |
+
color: var(--accent);
|
| 59 |
+
font-family: "Space Grotesk", sans-serif;
|
| 60 |
+
font-weight: 700;
|
| 61 |
+
font-size: 12px;
|
| 62 |
+
margin-bottom: 6px;
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
h1 {
|
| 66 |
+
margin: 0 0 8px;
|
| 67 |
+
font-family: "Space Grotesk", sans-serif;
|
| 68 |
+
font-size: 33px;
|
| 69 |
+
line-height: 1.1;
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
.hero p {
|
| 73 |
+
margin: 0;
|
| 74 |
+
color: var(--muted);
|
| 75 |
+
line-height: 1.5;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
.controls {
|
| 79 |
+
margin-top: 14px;
|
| 80 |
+
display: grid;
|
| 81 |
+
grid-template-columns: 2.2fr 1fr 1fr 1fr 1fr;
|
| 82 |
+
gap: 10px;
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
.field {
|
| 86 |
+
display: flex;
|
| 87 |
+
flex-direction: column;
|
| 88 |
+
gap: 5px;
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
.field label {
|
| 92 |
+
font-size: 12px;
|
| 93 |
+
font-weight: 700;
|
| 94 |
+
color: var(--muted);
|
| 95 |
+
letter-spacing: 0.04em;
|
| 96 |
+
text-transform: uppercase;
|
| 97 |
+
font-family: "Space Grotesk", sans-serif;
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
input, select {
|
| 101 |
+
width: 100%;
|
| 102 |
+
border: 1px solid var(--line);
|
| 103 |
+
background: var(--panel);
|
| 104 |
+
color: var(--ink);
|
| 105 |
+
border-radius: 10px;
|
| 106 |
+
padding: 10px 11px;
|
| 107 |
+
font: inherit;
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
input:focus, select:focus {
|
| 111 |
+
outline: 2px solid #8cc9b5;
|
| 112 |
+
border-color: #8cc9b5;
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
.summary {
|
| 116 |
+
margin: 14px 2px 6px;
|
| 117 |
+
display: flex;
|
| 118 |
+
justify-content: space-between;
|
| 119 |
+
align-items: center;
|
| 120 |
+
gap: 12px;
|
| 121 |
+
color: var(--muted);
|
| 122 |
+
font-size: 14px;
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
.badge {
|
| 126 |
+
background: var(--brand-soft);
|
| 127 |
+
color: var(--brand);
|
| 128 |
+
border: 1px solid #b7dccc;
|
| 129 |
+
padding: 3px 9px;
|
| 130 |
+
border-radius: 999px;
|
| 131 |
+
font-weight: 600;
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
.grid {
|
| 135 |
+
display: grid;
|
| 136 |
+
grid-template-columns: repeat(auto-fill, minmax(260px, 1fr));
|
| 137 |
+
gap: 12px;
|
| 138 |
+
list-style: none;
|
| 139 |
+
padding: 0;
|
| 140 |
+
margin: 0;
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
.card {
|
| 144 |
+
border: 1px solid var(--line);
|
| 145 |
+
border-radius: 14px;
|
| 146 |
+
background: var(--panel);
|
| 147 |
+
box-shadow: var(--shadow);
|
| 148 |
+
padding: 13px 12px;
|
| 149 |
+
display: flex;
|
| 150 |
+
flex-direction: column;
|
| 151 |
+
gap: 9px;
|
| 152 |
+
opacity: 0;
|
| 153 |
+
animation: rise 420ms ease forwards;
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
.chips {
|
| 157 |
+
display: flex;
|
| 158 |
+
gap: 6px;
|
| 159 |
+
flex-wrap: wrap;
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
.chip {
|
| 163 |
+
border-radius: 999px;
|
| 164 |
+
padding: 2px 8px;
|
| 165 |
+
font-size: 11px;
|
| 166 |
+
font-weight: 700;
|
| 167 |
+
font-family: "Space Grotesk", sans-serif;
|
| 168 |
+
letter-spacing: 0.03em;
|
| 169 |
+
text-transform: uppercase;
|
| 170 |
+
border: 1px solid transparent;
|
| 171 |
+
}
|
| 172 |
+
|
| 173 |
+
.chip.category { background: var(--brand-soft); color: var(--brand); border-color: #b7dccc; }
|
| 174 |
+
.chip.source { background: var(--accent-soft); color: #955016; border-color: #efc89f; }
|
| 175 |
+
.chip.status { background: #eef3ff; color: #3e4f86; border-color: #d2dbf6; }
|
| 176 |
+
|
| 177 |
+
.title {
|
| 178 |
+
margin: 0;
|
| 179 |
+
font-size: 17px;
|
| 180 |
+
line-height: 1.3;
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
.title a {
|
| 184 |
+
color: var(--ink);
|
| 185 |
+
text-decoration: none;
|
| 186 |
+
border-bottom: 1px solid transparent;
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
.title a:hover {
|
| 190 |
+
border-bottom-color: currentColor;
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
.meta {
|
| 194 |
+
color: var(--muted);
|
| 195 |
+
font-size: 13px;
|
| 196 |
+
line-height: 1.45;
|
| 197 |
+
margin: 0;
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
.card footer {
|
| 201 |
+
margin-top: auto;
|
| 202 |
+
font-size: 12px;
|
| 203 |
+
color: var(--muted);
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
.empty {
|
| 207 |
+
border: 1px dashed #b9c4bd;
|
| 208 |
+
border-radius: 12px;
|
| 209 |
+
padding: 24px;
|
| 210 |
+
background: #fcfcfa;
|
| 211 |
+
color: var(--muted);
|
| 212 |
+
text-align: center;
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
@keyframes rise {
|
| 216 |
+
from { opacity: 0; transform: translateY(8px); }
|
| 217 |
+
to { opacity: 1; transform: translateY(0); }
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
@media (max-width: 900px) {
|
| 221 |
+
.controls {
|
| 222 |
+
grid-template-columns: 1fr 1fr;
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
.controls .field:first-child {
|
| 226 |
+
grid-column: span 2;
|
| 227 |
+
}
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
@media (max-width: 560px) {
|
| 231 |
+
h1 { font-size: 28px; }
|
| 232 |
+
.controls { grid-template-columns: 1fr; }
|
| 233 |
+
.controls .field:first-child { grid-column: span 1; }
|
| 234 |
+
.summary { flex-direction: column; align-items: flex-start; }
|
| 235 |
+
}
|
| 236 |
+
</style>
|
| 237 |
+
</head>
|
| 238 |
+
<body>
|
| 239 |
+
<main class="wrap">
|
| 240 |
+
<section class="hero">
|
| 241 |
+
<div class="eyebrow">Pukhto Pashto</div>
|
| 242 |
+
<h1>Pashto Technology Resource Search</h1>
|
| 243 |
+
<p>
|
| 244 |
+
Search and filter verified and candidate resources that support Pashto in ASR, TTS, NLP, translation, tooling, and research.
|
| 245 |
+
</p>
|
| 246 |
+
|
| 247 |
+
<div class="controls">
|
| 248 |
+
<div class="field">
|
| 249 |
+
<label for="q">Search</label>
|
| 250 |
+
<input id="q" type="search" placeholder="Try: ASR, pbt_Arab, translation, speech" />
|
| 251 |
+
</div>
|
| 252 |
+
<div class="field">
|
| 253 |
+
<label for="category">Category</label>
|
| 254 |
+
<select id="category"></select>
|
| 255 |
+
</div>
|
| 256 |
+
<div class="field">
|
| 257 |
+
<label for="source">Source</label>
|
| 258 |
+
<select id="source"></select>
|
| 259 |
+
</div>
|
| 260 |
+
<div class="field">
|
| 261 |
+
<label for="task">Task</label>
|
| 262 |
+
<select id="task"></select>
|
| 263 |
+
</div>
|
| 264 |
+
<div class="field">
|
| 265 |
+
<label for="status">Status</label>
|
| 266 |
+
<select id="status"></select>
|
| 267 |
+
</div>
|
| 268 |
+
</div>
|
| 269 |
+
</section>
|
| 270 |
+
|
| 271 |
+
<div class="summary">
|
| 272 |
+
<span id="countText">Loading resources...</span>
|
| 273 |
+
<span class="badge" id="generatedAt">Catalog timestamp: -</span>
|
| 274 |
+
</div>
|
| 275 |
+
|
| 276 |
+
<ul id="results" class="grid"></ul>
|
| 277 |
+
</main>
|
| 278 |
+
|
| 279 |
+
<script>
|
| 280 |
+
// View-model for the search page: the complete catalog plus the subset
// that survives the currently-active query and filters.
const state = {
  all: [],       // every resource loaded from resources.json
  filtered: []   // resources matching the active search/filter controls
};

// DOM handles looked up once at startup and reused by every handler.
const els = {
  q: document.getElementById("q"),
  category: document.getElementById("category"),
  source: document.getElementById("source"),
  task: document.getElementById("task"),
  status: document.getElementById("status"),
  results: document.getElementById("results"),
  countText: document.getElementById("countText"),
  generatedAt: document.getElementById("generatedAt")
};
|
| 295 |
+
|
| 296 |
+
// Return the distinct truthy entries of `values`, sorted ascending with
// locale-aware comparison (so non-ASCII labels order sensibly).
function uniqSorted(values) {
  const distinct = new Set();
  for (const value of values) {
    if (value) distinct.add(value);
  }
  return Array.from(distinct).sort((left, right) => left.localeCompare(right));
}
|
| 299 |
+
|
| 300 |
+
// Rebuild a <select>: a leading catch-all option (empty value, labelled
// `allLabel`) followed by one option per entry in `options`, where each
// option's value and visible text are the entry itself.
function fillSelect(select, options, allLabel) {
  select.innerHTML = "";
  const makeOption = (value, label) => {
    const option = document.createElement("option");
    option.value = value;
    option.textContent = label;
    return option;
  };
  select.appendChild(makeOption("", allLabel));
  for (const entry of options) {
    select.appendChild(makeOption(entry, entry));
  }
}
|
| 313 |
+
|
| 314 |
+
// Case-insensitive free-text match.
//
// Returns true when `query` (already trimmed and lower-cased by the
// caller) appears anywhere in the resource's searchable fields; an
// empty query matches every resource.
//
// Fix: missing scalar fields used to be joined as the literal string
// "undefined", so searching for "undefined" matched every resource.
// Falsy values are now dropped before building the haystack.
function matchesQuery(resource, query) {
  if (!query) return true;
  const hay = [
    resource.title,
    resource.summary,
    resource.primary_use,
    resource.category,
    resource.source,
    resource.status,
    ...(resource.tags || []),
    ...(resource.tasks || []),
    ...(resource.markers || []),
    resource.evidence_text
  ]
    .filter(Boolean)
    .join(" ")
    .toLowerCase();
  return hay.includes(query);
}
|
| 330 |
+
|
| 331 |
+
// Recompute state.filtered from the current control values and render.
// A resource survives when it matches the free-text query AND every
// non-empty dropdown selection (empty selection = "all").
function applyFilters() {
  const query = els.q.value.trim().toLowerCase();
  const wanted = {
    category: els.category.value,
    source: els.source.value,
    task: els.task.value,
    status: els.status.value
  };

  state.filtered = state.all.filter((resource) => {
    if (!matchesQuery(resource, query)) return false;
    if (wanted.category && resource.category !== wanted.category) return false;
    if (wanted.source && resource.source !== wanted.source) return false;
    if (wanted.status && resource.status !== wanted.status) return false;
    if (wanted.task && !(resource.tasks || []).includes(wanted.task)) return false;
    return true;
  });

  renderResults();
}
|
| 349 |
+
|
| 350 |
+
// Render one metadata chip as an HTML fragment.
//
// The label originates from fetched JSON and the fragment is injected
// via innerHTML, so the label is HTML-escaped to prevent markup/script
// injection. `cls` is one of the page's own class names ("category",
// "source", "status") and is used as-is.
function chip(label, cls) {
  const safe = String(label == null ? "" : label).replace(/[&<>"']/g, (ch) => (
    { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" }[ch]
  ));
  return `<span class="chip ${cls}">${safe}</span>`;
}
|
| 353 |
+
|
| 354 |
+
// Render state.filtered into the results list and refresh the count
// summary.
//
// Hardening: every resource-derived string is HTML-escaped before being
// placed in innerHTML (the catalog JSON is fetched data and must not be
// trusted as markup), and link hrefs are restricted to http(s) so a
// malicious "javascript:" URL in the catalog cannot execute.
function renderResults() {
  const items = state.filtered;
  els.countText.textContent = `${items.length} result${items.length === 1 ? "" : "s"} of ${state.all.length}`;

  if (!items.length) {
    els.results.innerHTML = `<li class="empty">No matches. Try broadening filters or changing keywords.</li>`;
    return;
  }

  // Local escapers so this block is self-contained.
  const esc = (value) => String(value == null ? "" : value).replace(/[&<>"']/g, (ch) => (
    { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" }[ch]
  ));
  const safeUrl = (url) => (/^https?:\/\//i.test(String(url || "")) ? esc(url) : "#");

  els.results.innerHTML = items.map((resource, idx) => `
    <li class="card" style="animation-delay:${Math.min(idx * 28, 320)}ms">
      <div class="chips">
        ${chip(resource.category, "category")}
        ${chip(resource.source, "source")}
        ${chip(resource.status, "status")}
      </div>
      <h2 class="title"><a href="${safeUrl(resource.url)}" target="_blank" rel="noreferrer">${esc(resource.title)}</a></h2>
      <p class="meta">${esc(resource.summary)}</p>
      <p class="meta"><strong>Primary use:</strong> ${esc(resource.primary_use)}</p>
      <p class="meta"><strong>Pashto evidence:</strong> <a href="${safeUrl(resource.evidence_url)}" target="_blank" rel="noreferrer">${esc(resource.evidence_text)}</a></p>
      <footer>
        ${(resource.tasks || []).length ? `Tasks: ${esc(resource.tasks.join(", "))}` : "Tasks: n/a"}
      </footer>
    </li>
  `).join("");
}
|
| 380 |
+
|
| 381 |
+
// Bootstrap: fetch the catalog JSON, populate the filter dropdowns,
// show the catalog timestamp, and run the initial render. Failures are
// reported inline in the UI rather than thrown.
async function load() {
  try {
    const response = await fetch("./resources.json", { cache: "no-store" });
    if (!response.ok) throw new Error(`HTTP ${response.status}`);
    const payload = await response.json();

    state.all = payload.resources || [];
    state.filtered = state.all.slice();

    fillSelect(els.category, uniqSorted(state.all.map((r) => r.category)), "All categories");
    fillSelect(els.source, uniqSorted(state.all.map((r) => r.source)), "All sources");
    fillSelect(els.status, uniqSorted(state.all.map((r) => r.status)), "All statuses");
    fillSelect(els.task, uniqSorted(state.all.flatMap((r) => r.tasks || [])), "All tasks");

    // generated_on is optional and may be malformed; fall back to
    // "unknown" rather than printing "Invalid Date".
    const generated = payload.generated_on ? new Date(payload.generated_on) : null;
    const stampIsValid = generated && !Number.isNaN(generated.getTime());
    els.generatedAt.textContent = stampIsValid
      ? `Catalog timestamp: ${generated.toISOString()}`
      : "Catalog timestamp: unknown";

    applyFilters();
  } catch (err) {
    els.countText.textContent = "Failed to load resources";
    els.results.innerHTML = `<li class="empty">Could not load search data. ${String(err)}</li>`;
  }
}
|
| 409 |
+
|
| 410 |
+
// Re-filter on every input/change event from the search box and the
// four dropdowns, then kick off the initial data load.
const controls = [els.q, els.category, els.source, els.task, els.status];
for (const control of controls) {
  control.addEventListener("input", applyFilters);
  control.addEventListener("change", applyFilters);
}

load();
|
| 416 |
+
</script>
|
| 417 |
+
</body>
|
| 418 |
+
</html>
|
docs/search/resources.json
ADDED
|
@@ -0,0 +1,595 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"generated_on": "2026-02-15T00:00:00Z",
|
| 3 |
+
"count": 25,
|
| 4 |
+
"resources": [
|
| 5 |
+
{
|
| 6 |
+
"id": "dataset-common-voice-ps-v24",
|
| 7 |
+
"title": "Common Voice Scripted Speech 24.0 - Pashto",
|
| 8 |
+
"url": "https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14",
|
| 9 |
+
"category": "dataset",
|
| 10 |
+
"source": "mozilla",
|
| 11 |
+
"status": "verified",
|
| 12 |
+
"summary": "Large open Pashto speech dataset for ASR training and evaluation.",
|
| 13 |
+
"primary_use": "ASR training and evaluation",
|
| 14 |
+
"tasks": [
|
| 15 |
+
"asr"
|
| 16 |
+
],
|
| 17 |
+
"tags": [
|
| 18 |
+
"pashto",
|
| 19 |
+
"speech",
|
| 20 |
+
"asr"
|
| 21 |
+
],
|
| 22 |
+
"evidence_text": "Official dataset page is for Pashto.",
|
| 23 |
+
"evidence_url": "https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14",
|
| 24 |
+
"markers": [
|
| 25 |
+
"Pashto"
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"id": "dataset-google-fleurs",
|
| 30 |
+
"title": "Google FLEURS",
|
| 31 |
+
"url": "https://huggingface.co/datasets/google/fleurs",
|
| 32 |
+
"category": "dataset",
|
| 33 |
+
"source": "huggingface",
|
| 34 |
+
"status": "verified",
|
| 35 |
+
"summary": "Standard multilingual speech benchmark dataset with Pashto subset.",
|
| 36 |
+
"primary_use": "Speech benchmark and external evaluation",
|
| 37 |
+
"tasks": [
|
| 38 |
+
"asr",
|
| 39 |
+
"benchmarking"
|
| 40 |
+
],
|
| 41 |
+
"tags": [
|
| 42 |
+
"pashto",
|
| 43 |
+
"speech",
|
| 44 |
+
"benchmark"
|
| 45 |
+
],
|
| 46 |
+
"evidence_text": "Dataset config includes ps_af.",
|
| 47 |
+
"evidence_url": "https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py",
|
| 48 |
+
"markers": [
|
| 49 |
+
"ps_af"
|
| 50 |
+
]
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"id": "dataset-oscar-ps",
|
| 54 |
+
"title": "OSCAR Corpus",
|
| 55 |
+
"url": "https://huggingface.co/datasets/oscar-corpus/oscar",
|
| 56 |
+
"category": "dataset",
|
| 57 |
+
"source": "huggingface",
|
| 58 |
+
"status": "verified",
|
| 59 |
+
"summary": "Large web text corpus that includes Pashto text split.",
|
| 60 |
+
"primary_use": "Language modeling and lexicon expansion",
|
| 61 |
+
"tasks": [
|
| 62 |
+
"nlp"
|
| 63 |
+
],
|
| 64 |
+
"tags": [
|
| 65 |
+
"pashto",
|
| 66 |
+
"text",
|
| 67 |
+
"nlp"
|
| 68 |
+
],
|
| 69 |
+
"evidence_text": "Dataset includes unshuffled_deduplicated_ps split.",
|
| 70 |
+
"evidence_url": "https://huggingface.co/datasets/oscar-corpus/oscar",
|
| 71 |
+
"markers": [
|
| 72 |
+
"unshuffled_deduplicated_ps"
|
| 73 |
+
]
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"id": "dataset-wikipedia-ps",
|
| 77 |
+
"title": "Wikimedia Wikipedia",
|
| 78 |
+
"url": "https://huggingface.co/datasets/wikimedia/wikipedia",
|
| 79 |
+
"category": "dataset",
|
| 80 |
+
"source": "huggingface",
|
| 81 |
+
"status": "verified",
|
| 82 |
+
"summary": "Wikipedia corpus with Pashto edition for cleaner text resources.",
|
| 83 |
+
"primary_use": "Terminology and balanced text corpus",
|
| 84 |
+
"tasks": [
|
| 85 |
+
"nlp"
|
| 86 |
+
],
|
| 87 |
+
"tags": [
|
| 88 |
+
"pashto",
|
| 89 |
+
"text",
|
| 90 |
+
"nlp"
|
| 91 |
+
],
|
| 92 |
+
"evidence_text": "Dataset includes 20231101.ps subset.",
|
| 93 |
+
"evidence_url": "https://huggingface.co/datasets/wikimedia/wikipedia",
|
| 94 |
+
"markers": [
|
| 95 |
+
"20231101.ps"
|
| 96 |
+
]
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"id": "dataset-belebele-pbt-arab",
|
| 100 |
+
"title": "Belebele",
|
| 101 |
+
"url": "https://huggingface.co/datasets/facebook/belebele",
|
| 102 |
+
"category": "dataset",
|
| 103 |
+
"source": "huggingface",
|
| 104 |
+
"status": "verified",
|
| 105 |
+
"summary": "Reading comprehension dataset with Pashto script subset.",
|
| 106 |
+
"primary_use": "Comprehension and multilingual NLP benchmark",
|
| 107 |
+
"tasks": [
|
| 108 |
+
"nlp",
|
| 109 |
+
"benchmarking"
|
| 110 |
+
],
|
| 111 |
+
"tags": [
|
| 112 |
+
"pashto",
|
| 113 |
+
"nlp",
|
| 114 |
+
"benchmark"
|
| 115 |
+
],
|
| 116 |
+
"evidence_text": "Dataset includes pbt_Arab subset.",
|
| 117 |
+
"evidence_url": "https://huggingface.co/datasets/facebook/belebele",
|
| 118 |
+
"markers": [
|
| 119 |
+
"pbt_Arab"
|
| 120 |
+
]
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"id": "dataset-opus100-en-ps",
|
| 124 |
+
"title": "OPUS-100",
|
| 125 |
+
"url": "https://huggingface.co/datasets/Helsinki-NLP/opus-100",
|
| 126 |
+
"category": "dataset",
|
| 127 |
+
"source": "huggingface",
|
| 128 |
+
"status": "verified",
|
| 129 |
+
"summary": "Parallel corpus with English to Pashto split for MT tasks.",
|
| 130 |
+
"primary_use": "Machine translation training and evaluation",
|
| 131 |
+
"tasks": [
|
| 132 |
+
"mt",
|
| 133 |
+
"nlp"
|
| 134 |
+
],
|
| 135 |
+
"tags": [
|
| 136 |
+
"pashto",
|
| 137 |
+
"mt",
|
| 138 |
+
"parallel-corpus"
|
| 139 |
+
],
|
| 140 |
+
"evidence_text": "Dataset viewer includes en-ps split.",
|
| 141 |
+
"evidence_url": "https://huggingface.co/datasets/Helsinki-NLP/opus-100/viewer/en-ps",
|
| 142 |
+
"markers": [
|
| 143 |
+
"en-ps"
|
| 144 |
+
]
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"id": "dataset-kaggle-pashto-isolated-words",
|
| 148 |
+
"title": "Pashto Isolated Words Speech Dataset",
|
| 149 |
+
"url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
|
| 150 |
+
"category": "dataset",
|
| 151 |
+
"source": "kaggle",
|
| 152 |
+
"status": "verified",
|
| 153 |
+
"summary": "Speech dataset focused on isolated Pashto words.",
|
| 154 |
+
"primary_use": "Keyword spotting and constrained ASR experiments",
|
| 155 |
+
"tasks": [
|
| 156 |
+
"asr"
|
| 157 |
+
],
|
| 158 |
+
"tags": [
|
| 159 |
+
"pashto",
|
| 160 |
+
"speech",
|
| 161 |
+
"kaggle"
|
| 162 |
+
],
|
| 163 |
+
"evidence_text": "Dataset title explicitly states Pashto speech dataset.",
|
| 164 |
+
"evidence_url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
|
| 165 |
+
"markers": [
|
| 166 |
+
"Pashto"
|
| 167 |
+
]
|
| 168 |
+
},
|
| 169 |
+
{
|
| 170 |
+
"id": "dataset-kaggle-pashto-word-embeddings",
|
| 171 |
+
"title": "Pashto Word Embeddings",
|
| 172 |
+
"url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
|
| 173 |
+
"category": "dataset",
|
| 174 |
+
"source": "kaggle",
|
| 175 |
+
"status": "verified",
|
| 176 |
+
"summary": "Pretrained Pashto word vectors for classic NLP baselines.",
|
| 177 |
+
"primary_use": "Lexical semantics and lightweight NLP baselines",
|
| 178 |
+
"tasks": [
|
| 179 |
+
"nlp"
|
| 180 |
+
],
|
| 181 |
+
"tags": [
|
| 182 |
+
"pashto",
|
| 183 |
+
"nlp",
|
| 184 |
+
"embeddings",
|
| 185 |
+
"kaggle"
|
| 186 |
+
],
|
| 187 |
+
"evidence_text": "Dataset description states pretrained Pashto embeddings.",
|
| 188 |
+
"evidence_url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
|
| 189 |
+
"markers": [
|
| 190 |
+
"Pashto"
|
| 191 |
+
]
|
| 192 |
+
},
|
| 193 |
+
{
|
| 194 |
+
"id": "model-whisper-large-v3",
|
| 195 |
+
"title": "Whisper Large v3",
|
| 196 |
+
"url": "https://huggingface.co/openai/whisper-large-v3",
|
| 197 |
+
"category": "model",
|
| 198 |
+
"source": "huggingface",
|
| 199 |
+
"status": "verified",
|
| 200 |
+
"summary": "Strong multilingual ASR baseline suitable for Pashto bootstrapping.",
|
| 201 |
+
"primary_use": "ASR baseline and pseudo-labeling",
|
| 202 |
+
"tasks": [
|
| 203 |
+
"asr"
|
| 204 |
+
],
|
| 205 |
+
"tags": [
|
| 206 |
+
"pashto",
|
| 207 |
+
"asr",
|
| 208 |
+
"whisper"
|
| 209 |
+
],
|
| 210 |
+
"evidence_text": "Whisper tokenizer map includes ps language key.",
|
| 211 |
+
"evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
|
| 212 |
+
"markers": [
|
| 213 |
+
"ps"
|
| 214 |
+
]
|
| 215 |
+
},
|
| 216 |
+
{
|
| 217 |
+
"id": "model-mms-1b-all",
|
| 218 |
+
"title": "MMS 1B All",
|
| 219 |
+
"url": "https://huggingface.co/facebook/mms-1b-all",
|
| 220 |
+
"category": "model",
|
| 221 |
+
"source": "huggingface",
|
| 222 |
+
"status": "verified",
|
| 223 |
+
"summary": "Multilingual ASR model from MMS for low-resource transfer.",
|
| 224 |
+
"primary_use": "ASR transfer baseline",
|
| 225 |
+
"tasks": [
|
| 226 |
+
"asr"
|
| 227 |
+
],
|
| 228 |
+
"tags": [
|
| 229 |
+
"pashto",
|
| 230 |
+
"asr",
|
| 231 |
+
"mms"
|
| 232 |
+
],
|
| 233 |
+
"evidence_text": "MMS coverage table includes pus with ASR support.",
|
| 234 |
+
"evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
|
| 235 |
+
"markers": [
|
| 236 |
+
"pus"
|
| 237 |
+
]
|
| 238 |
+
},
|
| 239 |
+
{
|
| 240 |
+
"id": "model-mms-tts",
|
| 241 |
+
"title": "MMS TTS",
|
| 242 |
+
"url": "https://huggingface.co/facebook/mms-tts",
|
| 243 |
+
"category": "model",
|
| 244 |
+
"source": "huggingface",
|
| 245 |
+
"status": "verified",
|
| 246 |
+
"summary": "Multilingual TTS checkpoints useful for Pashto voice synthesis.",
|
| 247 |
+
"primary_use": "TTS baseline and transfer",
|
| 248 |
+
"tasks": [
|
| 249 |
+
"tts"
|
| 250 |
+
],
|
| 251 |
+
"tags": [
|
| 252 |
+
"pashto",
|
| 253 |
+
"tts",
|
| 254 |
+
"mms"
|
| 255 |
+
],
|
| 256 |
+
"evidence_text": "MMS coverage table includes pus with TTS support.",
|
| 257 |
+
"evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
|
| 258 |
+
"markers": [
|
| 259 |
+
"pus"
|
| 260 |
+
]
|
| 261 |
+
},
|
| 262 |
+
{
|
| 263 |
+
"id": "model-nllb-200-distilled-600m",
|
| 264 |
+
"title": "NLLB-200 Distilled 600M",
|
| 265 |
+
"url": "https://huggingface.co/facebook/nllb-200-distilled-600M",
|
| 266 |
+
"category": "model",
|
| 267 |
+
"source": "huggingface",
|
| 268 |
+
"status": "verified",
|
| 269 |
+
"summary": "General multilingual translation model with Pashto script token support.",
|
| 270 |
+
"primary_use": "Pashto translation baseline",
|
| 271 |
+
"tasks": [
|
| 272 |
+
"mt"
|
| 273 |
+
],
|
| 274 |
+
"tags": [
|
| 275 |
+
"pashto",
|
| 276 |
+
"mt",
|
| 277 |
+
"nllb"
|
| 278 |
+
],
|
| 279 |
+
"evidence_text": "Model special token map includes pbt_Arab.",
|
| 280 |
+
"evidence_url": "https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json",
|
| 281 |
+
"markers": [
|
| 282 |
+
"pbt_Arab"
|
| 283 |
+
]
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"id": "model-opus-mt-en-mul",
|
| 287 |
+
"title": "OPUS MT en-mul",
|
| 288 |
+
"url": "https://huggingface.co/Helsinki-NLP/opus-mt-en-mul",
|
| 289 |
+
"category": "model",
|
| 290 |
+
"source": "huggingface",
|
| 291 |
+
"status": "verified",
|
| 292 |
+
"summary": "Translation model that can route English into Pashto via multilingual set.",
|
| 293 |
+
"primary_use": "English to Pashto translation path",
|
| 294 |
+
"tasks": [
|
| 295 |
+
"mt"
|
| 296 |
+
],
|
| 297 |
+
"tags": [
|
| 298 |
+
"pashto",
|
| 299 |
+
"mt",
|
| 300 |
+
"opus"
|
| 301 |
+
],
|
| 302 |
+
"evidence_text": "Language list includes pus code.",
|
| 303 |
+
"evidence_url": "https://huggingface.co/Helsinki-NLP/opus-mt-en-mul",
|
| 304 |
+
"markers": [
|
| 305 |
+
"pus"
|
| 306 |
+
]
|
| 307 |
+
},
|
| 308 |
+
{
|
| 309 |
+
"id": "model-opus-mt-mul-en",
|
| 310 |
+
"title": "OPUS MT mul-en",
|
| 311 |
+
"url": "https://huggingface.co/Helsinki-NLP/opus-mt-mul-en",
|
| 312 |
+
"category": "model",
|
| 313 |
+
"source": "huggingface",
|
| 314 |
+
"status": "verified",
|
| 315 |
+
"summary": "Translation model for Pashto to English via multilingual encoder.",
|
| 316 |
+
"primary_use": "Pashto to English translation path",
|
| 317 |
+
"tasks": [
|
| 318 |
+
"mt"
|
| 319 |
+
],
|
| 320 |
+
"tags": [
|
| 321 |
+
"pashto",
|
| 322 |
+
"mt",
|
| 323 |
+
"opus"
|
| 324 |
+
],
|
| 325 |
+
"evidence_text": "Language list includes pus code.",
|
| 326 |
+
"evidence_url": "https://huggingface.co/Helsinki-NLP/opus-mt-mul-en",
|
| 327 |
+
"markers": [
|
| 328 |
+
"pus"
|
| 329 |
+
]
|
| 330 |
+
},
|
| 331 |
+
{
|
| 332 |
+
"id": "model-pashto-bert",
|
| 333 |
+
"title": "PashtoBERT",
|
| 334 |
+
"url": "https://huggingface.co/mdarhri/pashto-bert",
|
| 335 |
+
"category": "model",
|
| 336 |
+
"source": "huggingface",
|
| 337 |
+
"status": "verified",
|
| 338 |
+
"summary": "Pashto-specific encoder model for NLP transfer tasks.",
|
| 339 |
+
"primary_use": "Pashto NLP baseline encoder",
|
| 340 |
+
"tasks": [
|
| 341 |
+
"nlp"
|
| 342 |
+
],
|
| 343 |
+
"tags": [
|
| 344 |
+
"pashto",
|
| 345 |
+
"nlp",
|
| 346 |
+
"bert"
|
| 347 |
+
],
|
| 348 |
+
"evidence_text": "Model card states training on Pashto corpus data.",
|
| 349 |
+
"evidence_url": "https://huggingface.co/mdarhri/pashto-bert",
|
| 350 |
+
"markers": [
|
| 351 |
+
"Pashto"
|
| 352 |
+
]
|
| 353 |
+
},
|
| 354 |
+
{
|
| 355 |
+
"id": "benchmark-fleurs-ps-af",
|
| 356 |
+
"title": "FLEURS Pashto Benchmark",
|
| 357 |
+
"url": "https://huggingface.co/datasets/google/fleurs",
|
| 358 |
+
"category": "benchmark",
|
| 359 |
+
"source": "huggingface",
|
| 360 |
+
"status": "verified",
|
| 361 |
+
"summary": "Fixed multilingual speech benchmark with Pashto subset for WER and CER.",
|
| 362 |
+
"primary_use": "ASR benchmark reporting",
|
| 363 |
+
"tasks": [
|
| 364 |
+
"asr",
|
| 365 |
+
"benchmarking"
|
| 366 |
+
],
|
| 367 |
+
"tags": [
|
| 368 |
+
"pashto",
|
| 369 |
+
"benchmark",
|
| 370 |
+
"asr"
|
| 371 |
+
],
|
| 372 |
+
"evidence_text": "Dataset includes ps_af split.",
|
| 373 |
+
"evidence_url": "https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py",
|
| 374 |
+
"markers": [
|
| 375 |
+
"ps_af"
|
| 376 |
+
]
|
| 377 |
+
},
|
| 378 |
+
{
|
| 379 |
+
"id": "benchmark-common-voice-ps-v24",
|
| 380 |
+
"title": "Common Voice Pashto v24 Benchmark",
|
| 381 |
+
"url": "https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14",
|
| 382 |
+
"category": "benchmark",
|
| 383 |
+
"source": "mozilla",
|
| 384 |
+
"status": "verified",
|
| 385 |
+
"summary": "Core benchmark reference for project-level Pashto ASR tracking.",
|
| 386 |
+
"primary_use": "ASR baseline tracking",
|
| 387 |
+
"tasks": [
|
| 388 |
+
"asr",
|
| 389 |
+
"benchmarking"
|
| 390 |
+
],
|
| 391 |
+
"tags": [
|
| 392 |
+
"pashto",
|
| 393 |
+
"benchmark",
|
| 394 |
+
"asr"
|
| 395 |
+
],
|
| 396 |
+
"evidence_text": "Official Pashto split and versioned release.",
|
| 397 |
+
"evidence_url": "https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14",
|
| 398 |
+
"markers": [
|
| 399 |
+
"Pashto"
|
| 400 |
+
]
|
| 401 |
+
},
|
| 402 |
+
{
|
| 403 |
+
"id": "benchmark-belebele-pbt-arab",
|
| 404 |
+
"title": "Belebele Pashto Benchmark",
|
| 405 |
+
"url": "https://huggingface.co/datasets/facebook/belebele",
|
| 406 |
+
"category": "benchmark",
|
| 407 |
+
"source": "huggingface",
|
| 408 |
+
"status": "verified",
|
| 409 |
+
"summary": "Comprehension benchmark for multilingual NLP with Pashto variant.",
|
| 410 |
+
"primary_use": "NLP benchmark reporting",
|
| 411 |
+
"tasks": [
|
| 412 |
+
"nlp",
|
| 413 |
+
"benchmarking"
|
| 414 |
+
],
|
| 415 |
+
"tags": [
|
| 416 |
+
"pashto",
|
| 417 |
+
"benchmark",
|
| 418 |
+
"nlp"
|
| 419 |
+
],
|
| 420 |
+
"evidence_text": "Includes pbt_Arab language variant.",
|
| 421 |
+
"evidence_url": "https://huggingface.co/datasets/facebook/belebele",
|
| 422 |
+
"markers": [
|
| 423 |
+
"pbt_Arab"
|
| 424 |
+
]
|
| 425 |
+
},
|
| 426 |
+
{
|
| 427 |
+
"id": "benchmark-flores-200-pbt-arab",
|
| 428 |
+
"title": "FLORES-200 Pashto Benchmark",
|
| 429 |
+
"url": "https://github.com/facebookresearch/flores/tree/main/flores200",
|
| 430 |
+
"category": "benchmark",
|
| 431 |
+
"source": "github",
|
| 432 |
+
"status": "verified",
|
| 433 |
+
"summary": "Translation benchmark language inventory including Pashto script variant.",
|
| 434 |
+
"primary_use": "MT benchmark with BLEU and chrF",
|
| 435 |
+
"tasks": [
|
| 436 |
+
"mt",
|
| 437 |
+
"benchmarking"
|
| 438 |
+
],
|
| 439 |
+
"tags": [
|
| 440 |
+
"pashto",
|
| 441 |
+
"benchmark",
|
| 442 |
+
"mt"
|
| 443 |
+
],
|
| 444 |
+
"evidence_text": "Language list includes pbt_Arab.",
|
| 445 |
+
"evidence_url": "https://raw.githubusercontent.com/facebookresearch/flores/main/flores200/README.md",
|
| 446 |
+
"markers": [
|
| 447 |
+
"pbt_Arab"
|
| 448 |
+
]
|
| 449 |
+
},
|
| 450 |
+
{
|
| 451 |
+
"id": "tool-faster-whisper",
|
| 452 |
+
"title": "Faster-Whisper",
|
| 453 |
+
"url": "https://github.com/SYSTRAN/faster-whisper",
|
| 454 |
+
"category": "tool",
|
| 455 |
+
"source": "github",
|
| 456 |
+
"status": "verified",
|
| 457 |
+
"summary": "Optimized Whisper inference runtime for faster Pashto ASR experiments.",
|
| 458 |
+
"primary_use": "ASR inference acceleration",
|
| 459 |
+
"tasks": [
|
| 460 |
+
"asr"
|
| 461 |
+
],
|
| 462 |
+
"tags": [
|
| 463 |
+
"pashto",
|
| 464 |
+
"tooling",
|
| 465 |
+
"asr"
|
| 466 |
+
],
|
| 467 |
+
"evidence_text": "Whisper tokenizer includes ps and tool runs Whisper models.",
|
| 468 |
+
"evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
|
| 469 |
+
"markers": [
|
| 470 |
+
"ps"
|
| 471 |
+
]
|
| 472 |
+
},
|
| 473 |
+
{
|
| 474 |
+
"id": "tool-coqui-tts",
|
| 475 |
+
"title": "Coqui TTS",
|
| 476 |
+
"url": "https://github.com/coqui-ai/TTS",
|
| 477 |
+
"category": "tool",
|
| 478 |
+
"source": "github",
|
| 479 |
+
"status": "verified",
|
| 480 |
+
"summary": "Open toolkit for TTS training and inference used for Pashto experiments.",
|
| 481 |
+
"primary_use": "TTS training and inference",
|
| 482 |
+
"tasks": [
|
| 483 |
+
"tts"
|
| 484 |
+
],
|
| 485 |
+
"tags": [
|
| 486 |
+
"pashto",
|
| 487 |
+
"tooling",
|
| 488 |
+
"tts"
|
| 489 |
+
],
|
| 490 |
+
"evidence_text": "Can be paired with Pashto-supporting MMS checkpoints.",
|
| 491 |
+
"evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
|
| 492 |
+
"markers": [
|
| 493 |
+
"pus"
|
| 494 |
+
]
|
| 495 |
+
},
|
| 496 |
+
{
|
| 497 |
+
"id": "paper-whisper-2212-04356",
|
| 498 |
+
"title": "Robust Speech Recognition via Large-Scale Weak Supervision",
|
| 499 |
+
"url": "https://arxiv.org/abs/2212.04356",
|
| 500 |
+
"category": "paper",
|
| 501 |
+
"source": "arxiv",
|
| 502 |
+
"status": "verified",
|
| 503 |
+
"summary": "Whisper paper used as a foundational ASR reference for Pashto baselines.",
|
| 504 |
+
"primary_use": "ASR methodology reference",
|
| 505 |
+
"tasks": [
|
| 506 |
+
"asr",
|
| 507 |
+
"research"
|
| 508 |
+
],
|
| 509 |
+
"tags": [
|
| 510 |
+
"pashto",
|
| 511 |
+
"paper",
|
| 512 |
+
"asr"
|
| 513 |
+
],
|
| 514 |
+
"evidence_text": "Paired with tokenizer language map containing ps.",
|
| 515 |
+
"evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
|
| 516 |
+
"markers": [
|
| 517 |
+
"ps"
|
| 518 |
+
]
|
| 519 |
+
},
|
| 520 |
+
{
|
| 521 |
+
"id": "paper-mms-2305-13516",
|
| 522 |
+
"title": "Scaling Speech Technology to 1,000+ Languages",
|
| 523 |
+
"url": "https://arxiv.org/abs/2305.13516",
|
| 524 |
+
"category": "paper",
|
| 525 |
+
"source": "arxiv",
|
| 526 |
+
"status": "verified",
|
| 527 |
+
"summary": "MMS paper covering multilingual speech scaling and low-resource transfer.",
|
| 528 |
+
"primary_use": "ASR and TTS transfer reference",
|
| 529 |
+
"tasks": [
|
| 530 |
+
"asr",
|
| 531 |
+
"tts",
|
| 532 |
+
"research"
|
| 533 |
+
],
|
| 534 |
+
"tags": [
|
| 535 |
+
"pashto",
|
| 536 |
+
"paper",
|
| 537 |
+
"speech"
|
| 538 |
+
],
|
| 539 |
+
"evidence_text": "Coverage table marks pus support in MMS release.",
|
| 540 |
+
"evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
|
| 541 |
+
"markers": [
|
| 542 |
+
"pus"
|
| 543 |
+
]
|
| 544 |
+
},
|
| 545 |
+
{
|
| 546 |
+
"id": "paper-nllb-2207-04672",
|
| 547 |
+
"title": "No Language Left Behind",
|
| 548 |
+
"url": "https://arxiv.org/abs/2207.04672",
|
| 549 |
+
"category": "paper",
|
| 550 |
+
"source": "arxiv",
|
| 551 |
+
"status": "verified",
|
| 552 |
+
"summary": "NLLB paper supporting multilingual MT strategy for Pashto integration.",
|
| 553 |
+
"primary_use": "MT research reference",
|
| 554 |
+
"tasks": [
|
| 555 |
+
"mt",
|
| 556 |
+
"research"
|
| 557 |
+
],
|
| 558 |
+
"tags": [
|
| 559 |
+
"pashto",
|
| 560 |
+
"paper",
|
| 561 |
+
"mt"
|
| 562 |
+
],
|
| 563 |
+
"evidence_text": "Model usage in repo references pbt_Arab token support.",
|
| 564 |
+
"evidence_url": "https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json",
|
| 565 |
+
"markers": [
|
| 566 |
+
"pbt_Arab"
|
| 567 |
+
]
|
| 568 |
+
},
|
| 569 |
+
{
|
| 570 |
+
"id": "paper-fleurs-2205-12446",
|
| 571 |
+
"title": "FLEURS: Few-shot Learning Evaluation of Universal Representations of Speech",
|
| 572 |
+
"url": "https://arxiv.org/abs/2205.12446",
|
| 573 |
+
"category": "paper",
|
| 574 |
+
"source": "arxiv",
|
| 575 |
+
"status": "verified",
|
| 576 |
+
"summary": "FLEURS benchmark paper supporting multilingual speech evaluation including Pashto.",
|
| 577 |
+
"primary_use": "Speech benchmark methodology reference",
|
| 578 |
+
"tasks": [
|
| 579 |
+
"asr",
|
| 580 |
+
"benchmarking",
|
| 581 |
+
"research"
|
| 582 |
+
],
|
| 583 |
+
"tags": [
|
| 584 |
+
"pashto",
|
| 585 |
+
"paper",
|
| 586 |
+
"benchmark"
|
| 587 |
+
],
|
| 588 |
+
"evidence_text": "Dataset implementation includes ps_af language code.",
|
| 589 |
+
"evidence_url": "https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py",
|
| 590 |
+
"markers": [
|
| 591 |
+
"ps_af"
|
| 592 |
+
]
|
| 593 |
+
}
|
| 594 |
+
]
|
| 595 |
+
}
|
resources/README.md
CHANGED
|
@@ -1,14 +1,23 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
Structured, Pashto-focused resource tracking lives in this folder.
|
| 4 |
|
| 5 |
## Sections
|
| 6 |
-
- Datasets: [datasets/README.md](datasets/README.md)
|
| 7 |
-
- Models: [models/README.md](models/README.md)
|
| 8 |
-
- Benchmarks: [benchmarks/README.md](benchmarks/README.md)
|
| 9 |
-
- Tools
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
## Update Rule
|
| 12 |
- Add only validated resources with explicit Pashto relevance.
|
| 13 |
- Keep every external reference clickable using markdown links.
|
| 14 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Resources
|
| 2 |
|
| 3 |
Structured, Pashto-focused resource tracking lives in this folder.
|
| 4 |
|
| 5 |
## Sections
|
| 6 |
+
- Datasets (8): [datasets/README.md](datasets/README.md)
|
| 7 |
+
- Models (7): [models/README.md](models/README.md)
|
| 8 |
+
- Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
|
| 9 |
+
- Tools (2): [tools/README.md](tools/README.md)
|
| 10 |
+
- Papers (4): [papers/README.md](papers/README.md)
|
| 11 |
+
|
| 12 |
+
## Machine-Readable Catalog
|
| 13 |
+
- Canonical catalog: [catalog/resources.json](catalog/resources.json)
|
| 14 |
+
- Candidate feed: [catalog/pending_candidates.json](catalog/pending_candidates.json)
|
| 15 |
+
- Schema: [schema/resource.schema.json](schema/resource.schema.json)
|
| 16 |
|
| 17 |
## Update Rule
|
| 18 |
- Add only validated resources with explicit Pashto relevance.
|
| 19 |
- Keep every external reference clickable using markdown links.
|
| 20 |
+
- Run `python scripts/validate_resource_catalog.py` before opening a PR.
|
| 21 |
+
- Run `python scripts/generate_resource_views.py` after catalog changes.
|
| 22 |
+
|
| 23 |
+
Verified resource count: `25`
|
resources/benchmarks/README.md
CHANGED
|
@@ -1,14 +1,15 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
-
##
|
| 4 |
|
| 5 |
-
|
|
| 6 |
-
|---|---|---|
|
| 7 |
-
|
|
| 8 |
-
| Common Voice Pashto v24 | [
|
| 9 |
-
|
|
| 10 |
-
| FLORES-200 (
|
| 11 |
|
| 12 |
-
##
|
| 13 |
-
-
|
| 14 |
-
-
|
|
|
|
|
|
| 1 |
+
# Benchmarks
|
| 2 |
|
| 3 |
+
## Verified Pashto Resources
|
| 4 |
|
| 5 |
+
| Resource | Link | Pashto Evidence | Primary Use |
|
| 6 |
+
|---|---|---|---|
|
| 7 |
+
| Belebele Pashto Benchmark | [huggingface](https://huggingface.co/datasets/facebook/belebele) | [Includes pbt_Arab language variant. (`pbt_Arab`)](https://huggingface.co/datasets/facebook/belebele) | NLP benchmark reporting |
|
| 8 |
+
| Common Voice Pashto v24 Benchmark | [mozilla](https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14) | [Official Pashto split and versioned release. (`Pashto`)](https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14) | ASR baseline tracking |
|
| 9 |
+
| FLEURS Pashto Benchmark | [huggingface](https://huggingface.co/datasets/google/fleurs) | [Dataset includes ps_af split. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | ASR benchmark reporting |
|
| 10 |
+
| FLORES-200 Pashto Benchmark | [github](https://github.com/facebookresearch/flores/tree/main/flores200) | [Language list includes pbt_Arab. (`pbt_Arab`)](https://raw.githubusercontent.com/facebookresearch/flores/main/flores200/README.md) | MT benchmark with BLEU and chrF |
|
| 11 |
|
| 12 |
+
## Maintenance
|
| 13 |
+
- Source of truth: [../catalog/resources.json](../catalog/resources.json)
|
| 14 |
+
- Validation: [../../scripts/validate_resource_catalog.py](../../scripts/validate_resource_catalog.py)
|
| 15 |
+
- Generated by: [../../scripts/generate_resource_views.py](../../scripts/generate_resource_views.py)
|
resources/catalog/README.md
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Resource Catalog
|
| 2 |
+
|
| 3 |
+
This folder holds machine-readable resource data used by docs and GitHub Pages search.
|
| 4 |
+
|
| 5 |
+
## Files
|
| 6 |
+
- `resources.json`: canonical Pashto resource catalog (source of truth).
|
| 7 |
+
- `pending_candidates.json`: automation output for candidate resources requiring review.
|
| 8 |
+
- `resource.template.json`: starter template for adding a new resource entry.
|
| 9 |
+
|
| 10 |
+
## Required workflow
|
| 11 |
+
1. Update `resources.json`.
|
| 12 |
+
2. Run `python scripts/validate_resource_catalog.py`.
|
| 13 |
+
3. Run `python scripts/generate_resource_views.py`.
|
| 14 |
+
4. Commit both catalog and generated markdown/search files.
|
resources/catalog/pending_candidates.json
ADDED
|
@@ -0,0 +1,474 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"generated_on": "2026-02-15T09:45:32.641403+00:00",
|
| 3 |
+
"sources": [
|
| 4 |
+
"huggingface-datasets",
|
| 5 |
+
"huggingface-models"
|
| 6 |
+
],
|
| 7 |
+
"candidate_count": 20,
|
| 8 |
+
"candidates": [
|
| 9 |
+
{
|
| 10 |
+
"id": "candidate-hf-dataset-aamirhs-pashto",
|
| 11 |
+
"title": "aamirhs/pashto",
|
| 12 |
+
"url": "https://huggingface.co/datasets/aamirhs/pashto",
|
| 13 |
+
"category": "dataset",
|
| 14 |
+
"source": "huggingface",
|
| 15 |
+
"status": "candidate",
|
| 16 |
+
"summary": "Candidate dataset returned from Hugging Face search for Pashto.",
|
| 17 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 18 |
+
"tasks": [],
|
| 19 |
+
"pashto_evidence": {
|
| 20 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 21 |
+
"evidence_url": "https://huggingface.co/datasets/aamirhs/pashto",
|
| 22 |
+
"markers": [
|
| 23 |
+
"pashto"
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
"tags": [
|
| 27 |
+
"pashto",
|
| 28 |
+
"candidate",
|
| 29 |
+
"dataset"
|
| 30 |
+
]
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"id": "candidate-hf-dataset-aamirhs-pashto-audio-wav2vec",
|
| 34 |
+
"title": "aamirhs/pashto-audio-wav2vec",
|
| 35 |
+
"url": "https://huggingface.co/datasets/aamirhs/pashto-audio-wav2vec",
|
| 36 |
+
"category": "dataset",
|
| 37 |
+
"source": "huggingface",
|
| 38 |
+
"status": "candidate",
|
| 39 |
+
"summary": "Candidate dataset returned from Hugging Face search for Pashto.",
|
| 40 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 41 |
+
"tasks": [],
|
| 42 |
+
"pashto_evidence": {
|
| 43 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 44 |
+
"evidence_url": "https://huggingface.co/datasets/aamirhs/pashto-audio-wav2vec",
|
| 45 |
+
"markers": [
|
| 46 |
+
"pashto"
|
| 47 |
+
]
|
| 48 |
+
},
|
| 49 |
+
"tags": [
|
| 50 |
+
"pashto",
|
| 51 |
+
"candidate",
|
| 52 |
+
"dataset"
|
| 53 |
+
]
|
| 54 |
+
},
|
| 55 |
+
{
|
| 56 |
+
"id": "candidate-hf-dataset-aamirhs-pashto-test-1",
|
| 57 |
+
"title": "aamirhs/pashto_test_1",
|
| 58 |
+
"url": "https://huggingface.co/datasets/aamirhs/pashto_test_1",
|
| 59 |
+
"category": "dataset",
|
| 60 |
+
"source": "huggingface",
|
| 61 |
+
"status": "candidate",
|
| 62 |
+
"summary": "Candidate dataset returned from Hugging Face search for Pashto.",
|
| 63 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"pashto_evidence": {
|
| 66 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 67 |
+
"evidence_url": "https://huggingface.co/datasets/aamirhs/pashto_test_1",
|
| 68 |
+
"markers": [
|
| 69 |
+
"pashto"
|
| 70 |
+
]
|
| 71 |
+
},
|
| 72 |
+
"tags": [
|
| 73 |
+
"pashto",
|
| 74 |
+
"candidate",
|
| 75 |
+
"dataset"
|
| 76 |
+
]
|
| 77 |
+
},
|
| 78 |
+
{
|
| 79 |
+
"id": "candidate-hf-dataset-arsalagrey-pashto",
|
| 80 |
+
"title": "arsalagrey/pashto",
|
| 81 |
+
"url": "https://huggingface.co/datasets/arsalagrey/pashto",
|
| 82 |
+
"category": "dataset",
|
| 83 |
+
"source": "huggingface",
|
| 84 |
+
"status": "candidate",
|
| 85 |
+
"summary": "Candidate dataset returned from Hugging Face search for Pashto.",
|
| 86 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 87 |
+
"tasks": [],
|
| 88 |
+
"pashto_evidence": {
|
| 89 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 90 |
+
"evidence_url": "https://huggingface.co/datasets/arsalagrey/pashto",
|
| 91 |
+
"markers": [
|
| 92 |
+
"pashto"
|
| 93 |
+
]
|
| 94 |
+
},
|
| 95 |
+
"tags": [
|
| 96 |
+
"pashto",
|
| 97 |
+
"candidate",
|
| 98 |
+
"dataset"
|
| 99 |
+
]
|
| 100 |
+
},
|
| 101 |
+
{
|
| 102 |
+
"id": "candidate-hf-dataset-arsalagrey-pashto-books",
|
| 103 |
+
"title": "arsalagrey/pashto-books",
|
| 104 |
+
"url": "https://huggingface.co/datasets/arsalagrey/pashto-books",
|
| 105 |
+
"category": "dataset",
|
| 106 |
+
"source": "huggingface",
|
| 107 |
+
"status": "candidate",
|
| 108 |
+
"summary": "Candidate dataset returned from Hugging Face search for Pashto.",
|
| 109 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 110 |
+
"tasks": [],
|
| 111 |
+
"pashto_evidence": {
|
| 112 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 113 |
+
"evidence_url": "https://huggingface.co/datasets/arsalagrey/pashto-books",
|
| 114 |
+
"markers": [
|
| 115 |
+
"pashto"
|
| 116 |
+
]
|
| 117 |
+
},
|
| 118 |
+
"tags": [
|
| 119 |
+
"pashto",
|
| 120 |
+
"candidate",
|
| 121 |
+
"dataset"
|
| 122 |
+
]
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"id": "candidate-hf-dataset-arsalagrey-pashto-books-json",
|
| 126 |
+
"title": "arsalagrey/pashto-books-json",
|
| 127 |
+
"url": "https://huggingface.co/datasets/arsalagrey/pashto-books-json",
|
| 128 |
+
"category": "dataset",
|
| 129 |
+
"source": "huggingface",
|
| 130 |
+
"status": "candidate",
|
| 131 |
+
"summary": "Candidate dataset returned from Hugging Face search for Pashto.",
|
| 132 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 133 |
+
"tasks": [],
|
| 134 |
+
"pashto_evidence": {
|
| 135 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 136 |
+
"evidence_url": "https://huggingface.co/datasets/arsalagrey/pashto-books-json",
|
| 137 |
+
"markers": [
|
| 138 |
+
"pashto"
|
| 139 |
+
]
|
| 140 |
+
},
|
| 141 |
+
"tags": [
|
| 142 |
+
"pashto",
|
| 143 |
+
"candidate",
|
| 144 |
+
"dataset"
|
| 145 |
+
]
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"id": "candidate-hf-model-ihanif-wav2vec2-xls-r-300m-pashto",
|
| 149 |
+
"title": "ihanif/wav2vec2-xls-r-300m-pashto",
|
| 150 |
+
"url": "https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto",
|
| 151 |
+
"category": "model",
|
| 152 |
+
"source": "huggingface",
|
| 153 |
+
"status": "candidate",
|
| 154 |
+
"summary": "Candidate model returned from Hugging Face search for Pashto.",
|
| 155 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 156 |
+
"tasks": [],
|
| 157 |
+
"pashto_evidence": {
|
| 158 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 159 |
+
"evidence_url": "https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto",
|
| 160 |
+
"markers": [
|
| 161 |
+
"pashto"
|
| 162 |
+
]
|
| 163 |
+
},
|
| 164 |
+
"tags": [
|
| 165 |
+
"pashto",
|
| 166 |
+
"candidate",
|
| 167 |
+
"model"
|
| 168 |
+
]
|
| 169 |
+
},
|
| 170 |
+
{
|
| 171 |
+
"id": "candidate-hf-model-ihanif-wav2vec2-xls-r-300m-pashto-lm",
|
| 172 |
+
"title": "ihanif/wav2vec2-xls-r-300m-pashto-lm",
|
| 173 |
+
"url": "https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto-lm",
|
| 174 |
+
"category": "model",
|
| 175 |
+
"source": "huggingface",
|
| 176 |
+
"status": "candidate",
|
| 177 |
+
"summary": "Candidate model returned from Hugging Face search for Pashto.",
|
| 178 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 179 |
+
"tasks": [],
|
| 180 |
+
"pashto_evidence": {
|
| 181 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 182 |
+
"evidence_url": "https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto-lm",
|
| 183 |
+
"markers": [
|
| 184 |
+
"pashto"
|
| 185 |
+
]
|
| 186 |
+
},
|
| 187 |
+
"tags": [
|
| 188 |
+
"pashto",
|
| 189 |
+
"candidate",
|
| 190 |
+
"model"
|
| 191 |
+
]
|
| 192 |
+
},
|
| 193 |
+
{
|
| 194 |
+
"id": "candidate-hf-model-ihanif-whisper-base-pashto",
|
| 195 |
+
"title": "ihanif/whisper-base-pashto",
|
| 196 |
+
"url": "https://huggingface.co/ihanif/whisper-base-pashto",
|
| 197 |
+
"category": "model",
|
| 198 |
+
"source": "huggingface",
|
| 199 |
+
"status": "candidate",
|
| 200 |
+
"summary": "Candidate model returned from Hugging Face search for Pashto.",
|
| 201 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 202 |
+
"tasks": [],
|
| 203 |
+
"pashto_evidence": {
|
| 204 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 205 |
+
"evidence_url": "https://huggingface.co/ihanif/whisper-base-pashto",
|
| 206 |
+
"markers": [
|
| 207 |
+
"pashto"
|
| 208 |
+
]
|
| 209 |
+
},
|
| 210 |
+
"tags": [
|
| 211 |
+
"pashto",
|
| 212 |
+
"candidate",
|
| 213 |
+
"model"
|
| 214 |
+
]
|
| 215 |
+
},
|
| 216 |
+
{
|
| 217 |
+
"id": "candidate-hf-model-ihanif-whisper-large-pashto",
|
| 218 |
+
"title": "ihanif/whisper-large-pashto",
|
| 219 |
+
"url": "https://huggingface.co/ihanif/whisper-large-pashto",
|
| 220 |
+
"category": "model",
|
| 221 |
+
"source": "huggingface",
|
| 222 |
+
"status": "candidate",
|
| 223 |
+
"summary": "Candidate model returned from Hugging Face search for Pashto.",
|
| 224 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 225 |
+
"tasks": [],
|
| 226 |
+
"pashto_evidence": {
|
| 227 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 228 |
+
"evidence_url": "https://huggingface.co/ihanif/whisper-large-pashto",
|
| 229 |
+
"markers": [
|
| 230 |
+
"pashto"
|
| 231 |
+
]
|
| 232 |
+
},
|
| 233 |
+
"tags": [
|
| 234 |
+
"pashto",
|
| 235 |
+
"candidate",
|
| 236 |
+
"model"
|
| 237 |
+
]
|
| 238 |
+
},
|
| 239 |
+
{
|
| 240 |
+
"id": "candidate-hf-model-ihanif-whisper-medium-pashto",
|
| 241 |
+
"title": "ihanif/whisper-medium-pashto",
|
| 242 |
+
"url": "https://huggingface.co/ihanif/whisper-medium-pashto",
|
| 243 |
+
"category": "model",
|
| 244 |
+
"source": "huggingface",
|
| 245 |
+
"status": "candidate",
|
| 246 |
+
"summary": "Candidate model returned from Hugging Face search for Pashto.",
|
| 247 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 248 |
+
"tasks": [],
|
| 249 |
+
"pashto_evidence": {
|
| 250 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 251 |
+
"evidence_url": "https://huggingface.co/ihanif/whisper-medium-pashto",
|
| 252 |
+
"markers": [
|
| 253 |
+
"pashto"
|
| 254 |
+
]
|
| 255 |
+
},
|
| 256 |
+
"tags": [
|
| 257 |
+
"pashto",
|
| 258 |
+
"candidate",
|
| 259 |
+
"model"
|
| 260 |
+
]
|
| 261 |
+
},
|
| 262 |
+
{
|
| 263 |
+
"id": "candidate-hf-model-ihanif-whisper-medium-pashto-3e-7",
|
| 264 |
+
"title": "ihanif/whisper-medium-pashto-3e-7",
|
| 265 |
+
"url": "https://huggingface.co/ihanif/whisper-medium-pashto-3e-7",
|
| 266 |
+
"category": "model",
|
| 267 |
+
"source": "huggingface",
|
| 268 |
+
"status": "candidate",
|
| 269 |
+
"summary": "Candidate model returned from Hugging Face search for Pashto.",
|
| 270 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 271 |
+
"tasks": [],
|
| 272 |
+
"pashto_evidence": {
|
| 273 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 274 |
+
"evidence_url": "https://huggingface.co/ihanif/whisper-medium-pashto-3e-7",
|
| 275 |
+
"markers": [
|
| 276 |
+
"pashto"
|
| 277 |
+
]
|
| 278 |
+
},
|
| 279 |
+
"tags": [
|
| 280 |
+
"pashto",
|
| 281 |
+
"candidate",
|
| 282 |
+
"model"
|
| 283 |
+
]
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"id": "candidate-hf-model-ihanif-whisper-small-pashto",
|
| 287 |
+
"title": "ihanif/whisper-small-pashto",
|
| 288 |
+
"url": "https://huggingface.co/ihanif/whisper-small-pashto",
|
| 289 |
+
"category": "model",
|
| 290 |
+
"source": "huggingface",
|
| 291 |
+
"status": "candidate",
|
| 292 |
+
"summary": "Candidate model returned from Hugging Face search for Pashto.",
|
| 293 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 294 |
+
"tasks": [],
|
| 295 |
+
"pashto_evidence": {
|
| 296 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 297 |
+
"evidence_url": "https://huggingface.co/ihanif/whisper-small-pashto",
|
| 298 |
+
"markers": [
|
| 299 |
+
"pashto"
|
| 300 |
+
]
|
| 301 |
+
},
|
| 302 |
+
"tags": [
|
| 303 |
+
"pashto",
|
| 304 |
+
"candidate",
|
| 305 |
+
"model"
|
| 306 |
+
]
|
| 307 |
+
},
|
| 308 |
+
{
|
| 309 |
+
"id": "candidate-hf-model-ihanif-whisper-small-pashto-dropout",
|
| 310 |
+
"title": "ihanif/whisper-small-pashto-dropout",
|
| 311 |
+
"url": "https://huggingface.co/ihanif/whisper-small-pashto-dropout",
|
| 312 |
+
"category": "model",
|
| 313 |
+
"source": "huggingface",
|
| 314 |
+
"status": "candidate",
|
| 315 |
+
"summary": "Candidate model returned from Hugging Face search for Pashto.",
|
| 316 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 317 |
+
"tasks": [],
|
| 318 |
+
"pashto_evidence": {
|
| 319 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 320 |
+
"evidence_url": "https://huggingface.co/ihanif/whisper-small-pashto-dropout",
|
| 321 |
+
"markers": [
|
| 322 |
+
"pashto"
|
| 323 |
+
]
|
| 324 |
+
},
|
| 325 |
+
"tags": [
|
| 326 |
+
"pashto",
|
| 327 |
+
"candidate",
|
| 328 |
+
"model"
|
| 329 |
+
]
|
| 330 |
+
},
|
| 331 |
+
{
|
| 332 |
+
"id": "candidate-hf-model-ihanif-xls-r-1b-pashto",
|
| 333 |
+
"title": "ihanif/xls-r-1b-pashto",
|
| 334 |
+
"url": "https://huggingface.co/ihanif/xls-r-1b-pashto",
|
| 335 |
+
"category": "model",
|
| 336 |
+
"source": "huggingface",
|
| 337 |
+
"status": "candidate",
|
| 338 |
+
"summary": "Candidate model returned from Hugging Face search for Pashto.",
|
| 339 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 340 |
+
"tasks": [],
|
| 341 |
+
"pashto_evidence": {
|
| 342 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 343 |
+
"evidence_url": "https://huggingface.co/ihanif/xls-r-1b-pashto",
|
| 344 |
+
"markers": [
|
| 345 |
+
"pashto"
|
| 346 |
+
]
|
| 347 |
+
},
|
| 348 |
+
"tags": [
|
| 349 |
+
"pashto",
|
| 350 |
+
"candidate",
|
| 351 |
+
"model"
|
| 352 |
+
]
|
| 353 |
+
},
|
| 354 |
+
{
|
| 355 |
+
"id": "candidate-hf-dataset-koochikoo25-pashto-concatenated",
|
| 356 |
+
"title": "koochikoo25/Pashto-Concatenated",
|
| 357 |
+
"url": "https://huggingface.co/datasets/koochikoo25/Pashto-Concatenated",
|
| 358 |
+
"category": "dataset",
|
| 359 |
+
"source": "huggingface",
|
| 360 |
+
"status": "candidate",
|
| 361 |
+
"summary": "Candidate dataset returned from Hugging Face search for Pashto.",
|
| 362 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 363 |
+
"tasks": [],
|
| 364 |
+
"pashto_evidence": {
|
| 365 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 366 |
+
"evidence_url": "https://huggingface.co/datasets/koochikoo25/Pashto-Concatenated",
|
| 367 |
+
"markers": [
|
| 368 |
+
"pashto"
|
| 369 |
+
]
|
| 370 |
+
},
|
| 371 |
+
"tags": [
|
| 372 |
+
"pashto",
|
| 373 |
+
"candidate",
|
| 374 |
+
"dataset"
|
| 375 |
+
]
|
| 376 |
+
},
|
| 377 |
+
{
|
| 378 |
+
"id": "candidate-hf-dataset-nexdata-99-hours-pashto-spontaneous-dialogue-smartphone-speech-dataset",
|
| 379 |
+
"title": "Nexdata/99_Hours_Pashto_Spontaneous_Dialogue_Smartphone_speech_dataset",
|
| 380 |
+
"url": "https://huggingface.co/datasets/Nexdata/99_Hours_Pashto_Spontaneous_Dialogue_Smartphone_speech_dataset",
|
| 381 |
+
"category": "dataset",
|
| 382 |
+
"source": "huggingface",
|
| 383 |
+
"status": "candidate",
|
| 384 |
+
"summary": "Candidate dataset returned from Hugging Face search for Pashto.",
|
| 385 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 386 |
+
"tasks": [],
|
| 387 |
+
"pashto_evidence": {
|
| 388 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 389 |
+
"evidence_url": "https://huggingface.co/datasets/Nexdata/99_Hours_Pashto_Spontaneous_Dialogue_Smartphone_speech_dataset",
|
| 390 |
+
"markers": [
|
| 391 |
+
"pashto"
|
| 392 |
+
]
|
| 393 |
+
},
|
| 394 |
+
"tags": [
|
| 395 |
+
"pashto",
|
| 396 |
+
"candidate",
|
| 397 |
+
"dataset"
|
| 398 |
+
]
|
| 399 |
+
},
|
| 400 |
+
{
|
| 401 |
+
"id": "candidate-hf-dataset-saillab-alpaca-pashto-taco",
|
| 402 |
+
"title": "saillab/alpaca_pashto_taco",
|
| 403 |
+
"url": "https://huggingface.co/datasets/saillab/alpaca_pashto_taco",
|
| 404 |
+
"category": "dataset",
|
| 405 |
+
"source": "huggingface",
|
| 406 |
+
"status": "candidate",
|
| 407 |
+
"summary": "Candidate dataset returned from Hugging Face search for Pashto.",
|
| 408 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 409 |
+
"tasks": [],
|
| 410 |
+
"pashto_evidence": {
|
| 411 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 412 |
+
"evidence_url": "https://huggingface.co/datasets/saillab/alpaca_pashto_taco",
|
| 413 |
+
"markers": [
|
| 414 |
+
"pashto"
|
| 415 |
+
]
|
| 416 |
+
},
|
| 417 |
+
"tags": [
|
| 418 |
+
"pashto",
|
| 419 |
+
"candidate",
|
| 420 |
+
"dataset"
|
| 421 |
+
]
|
| 422 |
+
},
|
| 423 |
+
{
|
| 424 |
+
"id": "candidate-hf-model-zirak-ai-pashto-bert-v1",
|
| 425 |
+
"title": "zirak-ai/pashto-bert-v1",
|
| 426 |
+
"url": "https://huggingface.co/zirak-ai/pashto-bert-v1",
|
| 427 |
+
"category": "model",
|
| 428 |
+
"source": "huggingface",
|
| 429 |
+
"status": "candidate",
|
| 430 |
+
"summary": "Candidate model returned from Hugging Face search for Pashto.",
|
| 431 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 432 |
+
"tasks": [],
|
| 433 |
+
"pashto_evidence": {
|
| 434 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 435 |
+
"evidence_url": "https://huggingface.co/zirak-ai/pashto-bert-v1",
|
| 436 |
+
"markers": [
|
| 437 |
+
"pashto"
|
| 438 |
+
]
|
| 439 |
+
},
|
| 440 |
+
"tags": [
|
| 441 |
+
"pashto",
|
| 442 |
+
"candidate",
|
| 443 |
+
"model"
|
| 444 |
+
]
|
| 445 |
+
},
|
| 446 |
+
{
|
| 447 |
+
"id": "candidate-hf-dataset-zirak-ai-pashtoocr",
|
| 448 |
+
"title": "zirak-ai/PashtoOCR",
|
| 449 |
+
"url": "https://huggingface.co/datasets/zirak-ai/PashtoOCR",
|
| 450 |
+
"category": "dataset",
|
| 451 |
+
"source": "huggingface",
|
| 452 |
+
"status": "candidate",
|
| 453 |
+
"summary": "Candidate dataset returned from Hugging Face search for Pashto.",
|
| 454 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 455 |
+
"tasks": [],
|
| 456 |
+
"pashto_evidence": {
|
| 457 |
+
"evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
|
| 458 |
+
"evidence_url": "https://huggingface.co/datasets/zirak-ai/PashtoOCR",
|
| 459 |
+
"markers": [
|
| 460 |
+
"pashto"
|
| 461 |
+
]
|
| 462 |
+
},
|
| 463 |
+
"tags": [
|
| 464 |
+
"pashto",
|
| 465 |
+
"candidate",
|
| 466 |
+
"dataset"
|
| 467 |
+
]
|
| 468 |
+
}
|
| 469 |
+
],
|
| 470 |
+
"errors": [
|
| 471 |
+
"arxiv: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)>",
|
| 472 |
+
"semantic-scholar: HTTP Error 429: "
|
| 473 |
+
]
|
| 474 |
+
}
|
resources/catalog/resource.template.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"id": "example-resource-id",
|
| 3 |
+
"title": "Example Resource Title",
|
| 4 |
+
"url": "https://example.org/resource",
|
| 5 |
+
"category": "dataset",
|
| 6 |
+
"source": "other",
|
| 7 |
+
"status": "verified",
|
| 8 |
+
"summary": "One-line summary explaining why this resource matters for Pashto in technology.",
|
| 9 |
+
"primary_use": "ASR baseline",
|
| 10 |
+
"license": "Unknown",
|
| 11 |
+
"tasks": [
|
| 12 |
+
"asr"
|
| 13 |
+
],
|
| 14 |
+
"pashto_evidence": {
|
| 15 |
+
"evidence_text": "Resource page explicitly lists Pashto support.",
|
| 16 |
+
"evidence_url": "https://example.org/resource",
|
| 17 |
+
"markers": [
|
| 18 |
+
"Pashto"
|
| 19 |
+
]
|
| 20 |
+
},
|
| 21 |
+
"tags": [
|
| 22 |
+
"pashto",
|
| 23 |
+
"speech"
|
| 24 |
+
]
|
| 25 |
+
}
|
resources/catalog/resources.json
ADDED
|
@@ -0,0 +1,645 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": "1.0.0",
|
| 3 |
+
"updated_on": "2026-02-15",
|
| 4 |
+
"resources": [
|
| 5 |
+
{
|
| 6 |
+
"id": "dataset-common-voice-ps-v24",
|
| 7 |
+
"title": "Common Voice Scripted Speech 24.0 - Pashto",
|
| 8 |
+
"url": "https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14",
|
| 9 |
+
"category": "dataset",
|
| 10 |
+
"source": "mozilla",
|
| 11 |
+
"status": "verified",
|
| 12 |
+
"summary": "Large open Pashto speech dataset for ASR training and evaluation.",
|
| 13 |
+
"primary_use": "ASR training and evaluation",
|
| 14 |
+
"tasks": [
|
| 15 |
+
"asr"
|
| 16 |
+
],
|
| 17 |
+
"pashto_evidence": {
|
| 18 |
+
"evidence_text": "Official dataset page is for Pashto.",
|
| 19 |
+
"evidence_url": "https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14",
|
| 20 |
+
"markers": [
|
| 21 |
+
"Pashto"
|
| 22 |
+
]
|
| 23 |
+
},
|
| 24 |
+
"tags": [
|
| 25 |
+
"pashto",
|
| 26 |
+
"speech",
|
| 27 |
+
"asr"
|
| 28 |
+
]
|
| 29 |
+
},
|
| 30 |
+
{
|
| 31 |
+
"id": "dataset-google-fleurs",
|
| 32 |
+
"title": "Google FLEURS",
|
| 33 |
+
"url": "https://huggingface.co/datasets/google/fleurs",
|
| 34 |
+
"category": "dataset",
|
| 35 |
+
"source": "huggingface",
|
| 36 |
+
"status": "verified",
|
| 37 |
+
"summary": "Standard multilingual speech benchmark dataset with Pashto subset.",
|
| 38 |
+
"primary_use": "Speech benchmark and external evaluation",
|
| 39 |
+
"tasks": [
|
| 40 |
+
"asr",
|
| 41 |
+
"benchmarking"
|
| 42 |
+
],
|
| 43 |
+
"pashto_evidence": {
|
| 44 |
+
"evidence_text": "Dataset config includes ps_af.",
|
| 45 |
+
"evidence_url": "https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py",
|
| 46 |
+
"markers": [
|
| 47 |
+
"ps_af"
|
| 48 |
+
]
|
| 49 |
+
},
|
| 50 |
+
"tags": [
|
| 51 |
+
"pashto",
|
| 52 |
+
"speech",
|
| 53 |
+
"benchmark"
|
| 54 |
+
]
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"id": "dataset-oscar-ps",
|
| 58 |
+
"title": "OSCAR Corpus",
|
| 59 |
+
"url": "https://huggingface.co/datasets/oscar-corpus/oscar",
|
| 60 |
+
"category": "dataset",
|
| 61 |
+
"source": "huggingface",
|
| 62 |
+
"status": "verified",
|
| 63 |
+
"summary": "Large web text corpus that includes Pashto text split.",
|
| 64 |
+
"primary_use": "Language modeling and lexicon expansion",
|
| 65 |
+
"tasks": [
|
| 66 |
+
"nlp"
|
| 67 |
+
],
|
| 68 |
+
"pashto_evidence": {
|
| 69 |
+
"evidence_text": "Dataset includes unshuffled_deduplicated_ps split.",
|
| 70 |
+
"evidence_url": "https://huggingface.co/datasets/oscar-corpus/oscar",
|
| 71 |
+
"markers": [
|
| 72 |
+
"unshuffled_deduplicated_ps"
|
| 73 |
+
]
|
| 74 |
+
},
|
| 75 |
+
"tags": [
|
| 76 |
+
"pashto",
|
| 77 |
+
"text",
|
| 78 |
+
"nlp"
|
| 79 |
+
]
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"id": "dataset-wikipedia-ps",
|
| 83 |
+
"title": "Wikimedia Wikipedia",
|
| 84 |
+
"url": "https://huggingface.co/datasets/wikimedia/wikipedia",
|
| 85 |
+
"category": "dataset",
|
| 86 |
+
"source": "huggingface",
|
| 87 |
+
"status": "verified",
|
| 88 |
+
"summary": "Wikipedia corpus with Pashto edition for cleaner text resources.",
|
| 89 |
+
"primary_use": "Terminology and balanced text corpus",
|
| 90 |
+
"tasks": [
|
| 91 |
+
"nlp"
|
| 92 |
+
],
|
| 93 |
+
"pashto_evidence": {
|
| 94 |
+
"evidence_text": "Dataset includes 20231101.ps subset.",
|
| 95 |
+
"evidence_url": "https://huggingface.co/datasets/wikimedia/wikipedia",
|
| 96 |
+
"markers": [
|
| 97 |
+
"20231101.ps"
|
| 98 |
+
]
|
| 99 |
+
},
|
| 100 |
+
"tags": [
|
| 101 |
+
"pashto",
|
| 102 |
+
"text",
|
| 103 |
+
"nlp"
|
| 104 |
+
]
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
"id": "dataset-belebele-pbt-arab",
|
| 108 |
+
"title": "Belebele",
|
| 109 |
+
"url": "https://huggingface.co/datasets/facebook/belebele",
|
| 110 |
+
"category": "dataset",
|
| 111 |
+
"source": "huggingface",
|
| 112 |
+
"status": "verified",
|
| 113 |
+
"summary": "Reading comprehension dataset with Pashto script subset.",
|
| 114 |
+
"primary_use": "Comprehension and multilingual NLP benchmark",
|
| 115 |
+
"tasks": [
|
| 116 |
+
"nlp",
|
| 117 |
+
"benchmarking"
|
| 118 |
+
],
|
| 119 |
+
"pashto_evidence": {
|
| 120 |
+
"evidence_text": "Dataset includes pbt_Arab subset.",
|
| 121 |
+
"evidence_url": "https://huggingface.co/datasets/facebook/belebele",
|
| 122 |
+
"markers": [
|
| 123 |
+
"pbt_Arab"
|
| 124 |
+
]
|
| 125 |
+
},
|
| 126 |
+
"tags": [
|
| 127 |
+
"pashto",
|
| 128 |
+
"nlp",
|
| 129 |
+
"benchmark"
|
| 130 |
+
]
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"id": "dataset-opus100-en-ps",
|
| 134 |
+
"title": "OPUS-100",
|
| 135 |
+
"url": "https://huggingface.co/datasets/Helsinki-NLP/opus-100",
|
| 136 |
+
"category": "dataset",
|
| 137 |
+
"source": "huggingface",
|
| 138 |
+
"status": "verified",
|
| 139 |
+
"summary": "Parallel corpus with English to Pashto split for MT tasks.",
|
| 140 |
+
"primary_use": "Machine translation training and evaluation",
|
| 141 |
+
"tasks": [
|
| 142 |
+
"mt",
|
| 143 |
+
"nlp"
|
| 144 |
+
],
|
| 145 |
+
"pashto_evidence": {
|
| 146 |
+
"evidence_text": "Dataset viewer includes en-ps split.",
|
| 147 |
+
"evidence_url": "https://huggingface.co/datasets/Helsinki-NLP/opus-100/viewer/en-ps",
|
| 148 |
+
"markers": [
|
| 149 |
+
"en-ps"
|
| 150 |
+
]
|
| 151 |
+
},
|
| 152 |
+
"tags": [
|
| 153 |
+
"pashto",
|
| 154 |
+
"mt",
|
| 155 |
+
"parallel-corpus"
|
| 156 |
+
]
|
| 157 |
+
},
|
| 158 |
+
{
|
| 159 |
+
"id": "dataset-kaggle-pashto-isolated-words",
|
| 160 |
+
"title": "Pashto Isolated Words Speech Dataset",
|
| 161 |
+
"url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
|
| 162 |
+
"category": "dataset",
|
| 163 |
+
"source": "kaggle",
|
| 164 |
+
"status": "verified",
|
| 165 |
+
"summary": "Speech dataset focused on isolated Pashto words.",
|
| 166 |
+
"primary_use": "Keyword spotting and constrained ASR experiments",
|
| 167 |
+
"tasks": [
|
| 168 |
+
"asr"
|
| 169 |
+
],
|
| 170 |
+
"pashto_evidence": {
|
| 171 |
+
"evidence_text": "Dataset title explicitly states Pashto speech dataset.",
|
| 172 |
+
"evidence_url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
|
| 173 |
+
"markers": [
|
| 174 |
+
"Pashto"
|
| 175 |
+
]
|
| 176 |
+
},
|
| 177 |
+
"tags": [
|
| 178 |
+
"pashto",
|
| 179 |
+
"speech",
|
| 180 |
+
"kaggle"
|
| 181 |
+
]
|
| 182 |
+
},
|
| 183 |
+
{
|
| 184 |
+
"id": "dataset-kaggle-pashto-word-embeddings",
|
| 185 |
+
"title": "Pashto Word Embeddings",
|
| 186 |
+
"url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
|
| 187 |
+
"category": "dataset",
|
| 188 |
+
"source": "kaggle",
|
| 189 |
+
"status": "verified",
|
| 190 |
+
"summary": "Pretrained Pashto word vectors for classic NLP baselines.",
|
| 191 |
+
"primary_use": "Lexical semantics and lightweight NLP baselines",
|
| 192 |
+
"tasks": [
|
| 193 |
+
"nlp"
|
| 194 |
+
],
|
| 195 |
+
"pashto_evidence": {
|
| 196 |
+
"evidence_text": "Dataset description states pretrained Pashto embeddings.",
|
| 197 |
+
"evidence_url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
|
| 198 |
+
"markers": [
|
| 199 |
+
"Pashto"
|
| 200 |
+
]
|
| 201 |
+
},
|
| 202 |
+
"tags": [
|
| 203 |
+
"pashto",
|
| 204 |
+
"nlp",
|
| 205 |
+
"embeddings",
|
| 206 |
+
"kaggle"
|
| 207 |
+
]
|
| 208 |
+
},
|
| 209 |
+
{
|
| 210 |
+
"id": "model-whisper-large-v3",
|
| 211 |
+
"title": "Whisper Large v3",
|
| 212 |
+
"url": "https://huggingface.co/openai/whisper-large-v3",
|
| 213 |
+
"category": "model",
|
| 214 |
+
"source": "huggingface",
|
| 215 |
+
"status": "verified",
|
| 216 |
+
"summary": "Strong multilingual ASR baseline suitable for Pashto bootstrapping.",
|
| 217 |
+
"primary_use": "ASR baseline and pseudo-labeling",
|
| 218 |
+
"tasks": [
|
| 219 |
+
"asr"
|
| 220 |
+
],
|
| 221 |
+
"pashto_evidence": {
|
| 222 |
+
"evidence_text": "Whisper tokenizer map includes ps language key.",
|
| 223 |
+
"evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
|
| 224 |
+
"markers": [
|
| 225 |
+
"ps"
|
| 226 |
+
]
|
| 227 |
+
},
|
| 228 |
+
"tags": [
|
| 229 |
+
"pashto",
|
| 230 |
+
"asr",
|
| 231 |
+
"whisper"
|
| 232 |
+
]
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
"id": "model-mms-1b-all",
|
| 236 |
+
"title": "MMS 1B All",
|
| 237 |
+
"url": "https://huggingface.co/facebook/mms-1b-all",
|
| 238 |
+
"category": "model",
|
| 239 |
+
"source": "huggingface",
|
| 240 |
+
"status": "verified",
|
| 241 |
+
"summary": "Multilingual ASR model from MMS for low-resource transfer.",
|
| 242 |
+
"primary_use": "ASR transfer baseline",
|
| 243 |
+
"tasks": [
|
| 244 |
+
"asr"
|
| 245 |
+
],
|
| 246 |
+
"pashto_evidence": {
|
| 247 |
+
"evidence_text": "MMS coverage table includes pus with ASR support.",
|
| 248 |
+
"evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
|
| 249 |
+
"markers": [
|
| 250 |
+
"pus"
|
| 251 |
+
]
|
| 252 |
+
},
|
| 253 |
+
"tags": [
|
| 254 |
+
"pashto",
|
| 255 |
+
"asr",
|
| 256 |
+
"mms"
|
| 257 |
+
]
|
| 258 |
+
},
|
| 259 |
+
{
|
| 260 |
+
"id": "model-mms-tts",
|
| 261 |
+
"title": "MMS TTS",
|
| 262 |
+
"url": "https://huggingface.co/facebook/mms-tts",
|
| 263 |
+
"category": "model",
|
| 264 |
+
"source": "huggingface",
|
| 265 |
+
"status": "verified",
|
| 266 |
+
"summary": "Multilingual TTS checkpoints useful for Pashto voice synthesis.",
|
| 267 |
+
"primary_use": "TTS baseline and transfer",
|
| 268 |
+
"tasks": [
|
| 269 |
+
"tts"
|
| 270 |
+
],
|
| 271 |
+
"pashto_evidence": {
|
| 272 |
+
"evidence_text": "MMS coverage table includes pus with TTS support.",
|
| 273 |
+
"evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
|
| 274 |
+
"markers": [
|
| 275 |
+
"pus"
|
| 276 |
+
]
|
| 277 |
+
},
|
| 278 |
+
"tags": [
|
| 279 |
+
"pashto",
|
| 280 |
+
"tts",
|
| 281 |
+
"mms"
|
| 282 |
+
]
|
| 283 |
+
},
|
| 284 |
+
{
|
| 285 |
+
"id": "model-nllb-200-distilled-600m",
|
| 286 |
+
"title": "NLLB-200 Distilled 600M",
|
| 287 |
+
"url": "https://huggingface.co/facebook/nllb-200-distilled-600M",
|
| 288 |
+
"category": "model",
|
| 289 |
+
"source": "huggingface",
|
| 290 |
+
"status": "verified",
|
| 291 |
+
"summary": "General multilingual translation model with Pashto script token support.",
|
| 292 |
+
"primary_use": "Pashto translation baseline",
|
| 293 |
+
"tasks": [
|
| 294 |
+
"mt"
|
| 295 |
+
],
|
| 296 |
+
"pashto_evidence": {
|
| 297 |
+
"evidence_text": "Model special token map includes pbt_Arab.",
|
| 298 |
+
"evidence_url": "https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json",
|
| 299 |
+
"markers": [
|
| 300 |
+
"pbt_Arab"
|
| 301 |
+
]
|
| 302 |
+
},
|
| 303 |
+
"tags": [
|
| 304 |
+
"pashto",
|
| 305 |
+
"mt",
|
| 306 |
+
"nllb"
|
| 307 |
+
]
|
| 308 |
+
},
|
| 309 |
+
{
|
| 310 |
+
"id": "model-opus-mt-en-mul",
|
| 311 |
+
"title": "OPUS MT en-mul",
|
| 312 |
+
"url": "https://huggingface.co/Helsinki-NLP/opus-mt-en-mul",
|
| 313 |
+
"category": "model",
|
| 314 |
+
"source": "huggingface",
|
| 315 |
+
"status": "verified",
|
| 316 |
+
"summary": "Translation model that can route English into Pashto via multilingual set.",
|
| 317 |
+
"primary_use": "English to Pashto translation path",
|
| 318 |
+
"tasks": [
|
| 319 |
+
"mt"
|
| 320 |
+
],
|
| 321 |
+
"pashto_evidence": {
|
| 322 |
+
"evidence_text": "Language list includes pus code.",
|
| 323 |
+
"evidence_url": "https://huggingface.co/Helsinki-NLP/opus-mt-en-mul",
|
| 324 |
+
"markers": [
|
| 325 |
+
"pus"
|
| 326 |
+
]
|
| 327 |
+
},
|
| 328 |
+
"tags": [
|
| 329 |
+
"pashto",
|
| 330 |
+
"mt",
|
| 331 |
+
"opus"
|
| 332 |
+
]
|
| 333 |
+
},
|
| 334 |
+
{
|
| 335 |
+
"id": "model-opus-mt-mul-en",
|
| 336 |
+
"title": "OPUS MT mul-en",
|
| 337 |
+
"url": "https://huggingface.co/Helsinki-NLP/opus-mt-mul-en",
|
| 338 |
+
"category": "model",
|
| 339 |
+
"source": "huggingface",
|
| 340 |
+
"status": "verified",
|
| 341 |
+
"summary": "Translation model for Pashto to English via multilingual encoder.",
|
| 342 |
+
"primary_use": "Pashto to English translation path",
|
| 343 |
+
"tasks": [
|
| 344 |
+
"mt"
|
| 345 |
+
],
|
| 346 |
+
"pashto_evidence": {
|
| 347 |
+
"evidence_text": "Language list includes pus code.",
|
| 348 |
+
"evidence_url": "https://huggingface.co/Helsinki-NLP/opus-mt-mul-en",
|
| 349 |
+
"markers": [
|
| 350 |
+
"pus"
|
| 351 |
+
]
|
| 352 |
+
},
|
| 353 |
+
"tags": [
|
| 354 |
+
"pashto",
|
| 355 |
+
"mt",
|
| 356 |
+
"opus"
|
| 357 |
+
]
|
| 358 |
+
},
|
| 359 |
+
{
|
| 360 |
+
"id": "model-pashto-bert",
|
| 361 |
+
"title": "PashtoBERT",
|
| 362 |
+
"url": "https://huggingface.co/mdarhri/pashto-bert",
|
| 363 |
+
"category": "model",
|
| 364 |
+
"source": "huggingface",
|
| 365 |
+
"status": "verified",
|
| 366 |
+
"summary": "Pashto-specific encoder model for NLP transfer tasks.",
|
| 367 |
+
"primary_use": "Pashto NLP baseline encoder",
|
| 368 |
+
"tasks": [
|
| 369 |
+
"nlp"
|
| 370 |
+
],
|
| 371 |
+
"pashto_evidence": {
|
| 372 |
+
"evidence_text": "Model card states training on Pashto corpus data.",
|
| 373 |
+
"evidence_url": "https://huggingface.co/mdarhri/pashto-bert",
|
| 374 |
+
"markers": [
|
| 375 |
+
"Pashto"
|
| 376 |
+
]
|
| 377 |
+
},
|
| 378 |
+
"tags": [
|
| 379 |
+
"pashto",
|
| 380 |
+
"nlp",
|
| 381 |
+
"bert"
|
| 382 |
+
]
|
| 383 |
+
},
|
| 384 |
+
{
|
| 385 |
+
"id": "benchmark-fleurs-ps-af",
|
| 386 |
+
"title": "FLEURS Pashto Benchmark",
|
| 387 |
+
"url": "https://huggingface.co/datasets/google/fleurs",
|
| 388 |
+
"category": "benchmark",
|
| 389 |
+
"source": "huggingface",
|
| 390 |
+
"status": "verified",
|
| 391 |
+
"summary": "Fixed multilingual speech benchmark with Pashto subset for WER and CER.",
|
| 392 |
+
"primary_use": "ASR benchmark reporting",
|
| 393 |
+
"tasks": [
|
| 394 |
+
"asr",
|
| 395 |
+
"benchmarking"
|
| 396 |
+
],
|
| 397 |
+
"pashto_evidence": {
|
| 398 |
+
"evidence_text": "Dataset includes ps_af split.",
|
| 399 |
+
"evidence_url": "https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py",
|
| 400 |
+
"markers": [
|
| 401 |
+
"ps_af"
|
| 402 |
+
]
|
| 403 |
+
},
|
| 404 |
+
"tags": [
|
| 405 |
+
"pashto",
|
| 406 |
+
"benchmark",
|
| 407 |
+
"asr"
|
| 408 |
+
]
|
| 409 |
+
},
|
| 410 |
+
{
|
| 411 |
+
"id": "benchmark-common-voice-ps-v24",
|
| 412 |
+
"title": "Common Voice Pashto v24 Benchmark",
|
| 413 |
+
"url": "https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14",
|
| 414 |
+
"category": "benchmark",
|
| 415 |
+
"source": "mozilla",
|
| 416 |
+
"status": "verified",
|
| 417 |
+
"summary": "Core benchmark reference for project-level Pashto ASR tracking.",
|
| 418 |
+
"primary_use": "ASR baseline tracking",
|
| 419 |
+
"tasks": [
|
| 420 |
+
"asr",
|
| 421 |
+
"benchmarking"
|
| 422 |
+
],
|
| 423 |
+
"pashto_evidence": {
|
| 424 |
+
"evidence_text": "Official Pashto split and versioned release.",
|
| 425 |
+
"evidence_url": "https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14",
|
| 426 |
+
"markers": [
|
| 427 |
+
"Pashto"
|
| 428 |
+
]
|
| 429 |
+
},
|
| 430 |
+
"tags": [
|
| 431 |
+
"pashto",
|
| 432 |
+
"benchmark",
|
| 433 |
+
"asr"
|
| 434 |
+
]
|
| 435 |
+
},
|
| 436 |
+
{
|
| 437 |
+
"id": "benchmark-belebele-pbt-arab",
|
| 438 |
+
"title": "Belebele Pashto Benchmark",
|
| 439 |
+
"url": "https://huggingface.co/datasets/facebook/belebele",
|
| 440 |
+
"category": "benchmark",
|
| 441 |
+
"source": "huggingface",
|
| 442 |
+
"status": "verified",
|
| 443 |
+
"summary": "Comprehension benchmark for multilingual NLP with Pashto variant.",
|
| 444 |
+
"primary_use": "NLP benchmark reporting",
|
| 445 |
+
"tasks": [
|
| 446 |
+
"nlp",
|
| 447 |
+
"benchmarking"
|
| 448 |
+
],
|
| 449 |
+
"pashto_evidence": {
|
| 450 |
+
"evidence_text": "Includes pbt_Arab language variant.",
|
| 451 |
+
"evidence_url": "https://huggingface.co/datasets/facebook/belebele",
|
| 452 |
+
"markers": [
|
| 453 |
+
"pbt_Arab"
|
| 454 |
+
]
|
| 455 |
+
},
|
| 456 |
+
"tags": [
|
| 457 |
+
"pashto",
|
| 458 |
+
"benchmark",
|
| 459 |
+
"nlp"
|
| 460 |
+
]
|
| 461 |
+
},
|
| 462 |
+
{
|
| 463 |
+
"id": "benchmark-flores-200-pbt-arab",
|
| 464 |
+
"title": "FLORES-200 Pashto Benchmark",
|
| 465 |
+
"url": "https://github.com/facebookresearch/flores/tree/main/flores200",
|
| 466 |
+
"category": "benchmark",
|
| 467 |
+
"source": "github",
|
| 468 |
+
"status": "verified",
|
| 469 |
+
"summary": "Translation benchmark language inventory including Pashto script variant.",
|
| 470 |
+
"primary_use": "MT benchmark with BLEU and chrF",
|
| 471 |
+
"tasks": [
|
| 472 |
+
"mt",
|
| 473 |
+
"benchmarking"
|
| 474 |
+
],
|
| 475 |
+
"pashto_evidence": {
|
| 476 |
+
"evidence_text": "Language list includes pbt_Arab.",
|
| 477 |
+
"evidence_url": "https://raw.githubusercontent.com/facebookresearch/flores/main/flores200/README.md",
|
| 478 |
+
"markers": [
|
| 479 |
+
"pbt_Arab"
|
| 480 |
+
]
|
| 481 |
+
},
|
| 482 |
+
"tags": [
|
| 483 |
+
"pashto",
|
| 484 |
+
"benchmark",
|
| 485 |
+
"mt"
|
| 486 |
+
]
|
| 487 |
+
},
|
| 488 |
+
{
|
| 489 |
+
"id": "tool-faster-whisper",
|
| 490 |
+
"title": "Faster-Whisper",
|
| 491 |
+
"url": "https://github.com/SYSTRAN/faster-whisper",
|
| 492 |
+
"category": "tool",
|
| 493 |
+
"source": "github",
|
| 494 |
+
"status": "verified",
|
| 495 |
+
"summary": "Optimized Whisper inference runtime for faster Pashto ASR experiments.",
|
| 496 |
+
"primary_use": "ASR inference acceleration",
|
| 497 |
+
"tasks": [
|
| 498 |
+
"asr"
|
| 499 |
+
],
|
| 500 |
+
"pashto_evidence": {
|
| 501 |
+
"evidence_text": "Whisper tokenizer includes ps and tool runs Whisper models.",
|
| 502 |
+
"evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
|
| 503 |
+
"markers": [
|
| 504 |
+
"ps"
|
| 505 |
+
]
|
| 506 |
+
},
|
| 507 |
+
"tags": [
|
| 508 |
+
"pashto",
|
| 509 |
+
"tooling",
|
| 510 |
+
"asr"
|
| 511 |
+
]
|
| 512 |
+
},
|
| 513 |
+
{
|
| 514 |
+
"id": "tool-coqui-tts",
|
| 515 |
+
"title": "Coqui TTS",
|
| 516 |
+
"url": "https://github.com/coqui-ai/TTS",
|
| 517 |
+
"category": "tool",
|
| 518 |
+
"source": "github",
|
| 519 |
+
"status": "verified",
|
| 520 |
+
"summary": "Open toolkit for TTS training and inference used for Pashto experiments.",
|
| 521 |
+
"primary_use": "TTS training and inference",
|
| 522 |
+
"tasks": [
|
| 523 |
+
"tts"
|
| 524 |
+
],
|
| 525 |
+
"pashto_evidence": {
|
| 526 |
+
"evidence_text": "Can be paired with Pashto-supporting MMS checkpoints.",
|
| 527 |
+
"evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
|
| 528 |
+
"markers": [
|
| 529 |
+
"pus"
|
| 530 |
+
]
|
| 531 |
+
},
|
| 532 |
+
"tags": [
|
| 533 |
+
"pashto",
|
| 534 |
+
"tooling",
|
| 535 |
+
"tts"
|
| 536 |
+
]
|
| 537 |
+
},
|
| 538 |
+
{
|
| 539 |
+
"id": "paper-whisper-2212-04356",
|
| 540 |
+
"title": "Robust Speech Recognition via Large-Scale Weak Supervision",
|
| 541 |
+
"url": "https://arxiv.org/abs/2212.04356",
|
| 542 |
+
"category": "paper",
|
| 543 |
+
"source": "arxiv",
|
| 544 |
+
"status": "verified",
|
| 545 |
+
"summary": "Whisper paper used as a foundational ASR reference for Pashto baselines.",
|
| 546 |
+
"primary_use": "ASR methodology reference",
|
| 547 |
+
"tasks": [
|
| 548 |
+
"asr",
|
| 549 |
+
"research"
|
| 550 |
+
],
|
| 551 |
+
"pashto_evidence": {
|
| 552 |
+
"evidence_text": "Paired with tokenizer language map containing ps.",
|
| 553 |
+
"evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
|
| 554 |
+
"markers": [
|
| 555 |
+
"ps"
|
| 556 |
+
]
|
| 557 |
+
},
|
| 558 |
+
"tags": [
|
| 559 |
+
"pashto",
|
| 560 |
+
"paper",
|
| 561 |
+
"asr"
|
| 562 |
+
]
|
| 563 |
+
},
|
| 564 |
+
{
|
| 565 |
+
"id": "paper-mms-2305-13516",
|
| 566 |
+
"title": "Scaling Speech Technology to 1,000+ Languages",
|
| 567 |
+
"url": "https://arxiv.org/abs/2305.13516",
|
| 568 |
+
"category": "paper",
|
| 569 |
+
"source": "arxiv",
|
| 570 |
+
"status": "verified",
|
| 571 |
+
"summary": "MMS paper covering multilingual speech scaling and low-resource transfer.",
|
| 572 |
+
"primary_use": "ASR and TTS transfer reference",
|
| 573 |
+
"tasks": [
|
| 574 |
+
"asr",
|
| 575 |
+
"tts",
|
| 576 |
+
"research"
|
| 577 |
+
],
|
| 578 |
+
"pashto_evidence": {
|
| 579 |
+
"evidence_text": "Coverage table marks pus support in MMS release.",
|
| 580 |
+
"evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
|
| 581 |
+
"markers": [
|
| 582 |
+
"pus"
|
| 583 |
+
]
|
| 584 |
+
},
|
| 585 |
+
"tags": [
|
| 586 |
+
"pashto",
|
| 587 |
+
"paper",
|
| 588 |
+
"speech"
|
| 589 |
+
]
|
| 590 |
+
},
|
| 591 |
+
{
|
| 592 |
+
"id": "paper-nllb-2207-04672",
|
| 593 |
+
"title": "No Language Left Behind",
|
| 594 |
+
"url": "https://arxiv.org/abs/2207.04672",
|
| 595 |
+
"category": "paper",
|
| 596 |
+
"source": "arxiv",
|
| 597 |
+
"status": "verified",
|
| 598 |
+
"summary": "NLLB paper supporting multilingual MT strategy for Pashto integration.",
|
| 599 |
+
"primary_use": "MT research reference",
|
| 600 |
+
"tasks": [
|
| 601 |
+
"mt",
|
| 602 |
+
"research"
|
| 603 |
+
],
|
| 604 |
+
"pashto_evidence": {
|
| 605 |
+
"evidence_text": "Model usage in repo references pbt_Arab token support.",
|
| 606 |
+
"evidence_url": "https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json",
|
| 607 |
+
"markers": [
|
| 608 |
+
"pbt_Arab"
|
| 609 |
+
]
|
| 610 |
+
},
|
| 611 |
+
"tags": [
|
| 612 |
+
"pashto",
|
| 613 |
+
"paper",
|
| 614 |
+
"mt"
|
| 615 |
+
]
|
| 616 |
+
},
|
| 617 |
+
{
|
| 618 |
+
"id": "paper-fleurs-2205-12446",
|
| 619 |
+
"title": "FLEURS: Few-shot Learning Evaluation of Universal Representations of Speech",
|
| 620 |
+
"url": "https://arxiv.org/abs/2205.12446",
|
| 621 |
+
"category": "paper",
|
| 622 |
+
"source": "arxiv",
|
| 623 |
+
"status": "verified",
|
| 624 |
+
"summary": "FLEURS benchmark paper supporting multilingual speech evaluation including Pashto.",
|
| 625 |
+
"primary_use": "Speech benchmark methodology reference",
|
| 626 |
+
"tasks": [
|
| 627 |
+
"asr",
|
| 628 |
+
"benchmarking",
|
| 629 |
+
"research"
|
| 630 |
+
],
|
| 631 |
+
"pashto_evidence": {
|
| 632 |
+
"evidence_text": "Dataset implementation includes ps_af language code.",
|
| 633 |
+
"evidence_url": "https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py",
|
| 634 |
+
"markers": [
|
| 635 |
+
"ps_af"
|
| 636 |
+
]
|
| 637 |
+
},
|
| 638 |
+
"tags": [
|
| 639 |
+
"pashto",
|
| 640 |
+
"paper",
|
| 641 |
+
"benchmark"
|
| 642 |
+
]
|
| 643 |
+
}
|
| 644 |
+
]
|
| 645 |
+
}
|
resources/datasets/README.md
CHANGED
|
@@ -1,18 +1,19 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
-
## Pashto
|
| 4 |
|
| 5 |
| Resource | Link | Pashto Evidence | Primary Use |
|
| 6 |
|---|---|---|---|
|
| 7 |
-
|
|
| 8 |
-
|
|
| 9 |
-
|
|
| 10 |
-
|
|
| 11 |
-
|
|
| 12 |
-
|
|
| 13 |
-
| Pashto
|
| 14 |
-
|
|
| 15 |
|
| 16 |
-
##
|
| 17 |
-
-
|
| 18 |
-
-
|
|
|
|
|
|
| 1 |
+
# Datasets
|
| 2 |
|
| 3 |
+
## Verified Pashto Resources
|
| 4 |
|
| 5 |
| Resource | Link | Pashto Evidence | Primary Use |
|
| 6 |
|---|---|---|---|
|
| 7 |
+
| Belebele | [huggingface](https://huggingface.co/datasets/facebook/belebele) | [Dataset includes pbt_Arab subset. (`pbt_Arab`)](https://huggingface.co/datasets/facebook/belebele) | Comprehension and multilingual NLP benchmark |
|
| 8 |
+
| Common Voice Scripted Speech 24.0 - Pashto | [mozilla](https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14) | [Official dataset page is for Pashto. (`Pashto`)](https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14) | ASR training and evaluation |
|
| 9 |
+
| Google FLEURS | [huggingface](https://huggingface.co/datasets/google/fleurs) | [Dataset config includes ps_af. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark and external evaluation |
|
| 10 |
+
| OPUS-100 | [huggingface](https://huggingface.co/datasets/Helsinki-NLP/opus-100) | [Dataset viewer includes en-ps split. (`en-ps`)](https://huggingface.co/datasets/Helsinki-NLP/opus-100/viewer/en-ps) | Machine translation training and evaluation |
|
| 11 |
+
| OSCAR Corpus | [huggingface](https://huggingface.co/datasets/oscar-corpus/oscar) | [Dataset includes unshuffled_deduplicated_ps split. (`unshuffled_deduplicated_ps`)](https://huggingface.co/datasets/oscar-corpus/oscar) | Language modeling and lexicon expansion |
|
| 12 |
+
| Pashto Isolated Words Speech Dataset | [kaggle](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | [Dataset title explicitly states Pashto speech dataset. (`Pashto`)](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | Keyword spotting and constrained ASR experiments |
|
| 13 |
+
| Pashto Word Embeddings | [kaggle](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | [Dataset description states pretrained Pashto embeddings. (`Pashto`)](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | Lexical semantics and lightweight NLP baselines |
|
| 14 |
+
| Wikimedia Wikipedia | [huggingface](https://huggingface.co/datasets/wikimedia/wikipedia) | [Dataset includes 20231101.ps subset. (`20231101.ps`)](https://huggingface.co/datasets/wikimedia/wikipedia) | Terminology and balanced text corpus |
|
| 15 |
|
| 16 |
+
## Maintenance
|
| 17 |
+
- Source of truth: [../catalog/resources.json](../catalog/resources.json)
|
| 18 |
+
- Validation: [../../scripts/validate_resource_catalog.py](../../scripts/validate_resource_catalog.py)
|
| 19 |
+
- Generated by: [../../scripts/generate_resource_views.py](../../scripts/generate_resource_views.py)
|
resources/models/README.md
CHANGED
|
@@ -1,19 +1,18 @@
|
|
| 1 |
# Models
|
| 2 |
|
| 3 |
-
## Pashto
|
| 4 |
|
| 5 |
| Resource | Link | Pashto Evidence | Primary Use |
|
| 6 |
|---|---|---|---|
|
| 7 |
-
|
|
| 8 |
-
| MMS
|
| 9 |
-
|
|
| 10 |
-
|
|
| 11 |
-
|
|
| 12 |
-
|
|
| 13 |
-
|
|
| 14 |
-
| PashtoBERT | [Hugging Face - mdarhri/pashto-bert](https://huggingface.co/mdarhri/pashto-bert) | Model card states it is trained on Pashto corpus data | Pashto NLP encoder baseline |
|
| 15 |
|
| 16 |
-
##
|
| 17 |
-
-
|
| 18 |
-
-
|
| 19 |
-
-
|
|
|
|
| 1 |
# Models
|
| 2 |
|
| 3 |
+
## Verified Pashto Resources
|
| 4 |
|
| 5 |
| Resource | Link | Pashto Evidence | Primary Use |
|
| 6 |
|---|---|---|---|
|
| 7 |
+
| MMS 1B All | [huggingface](https://huggingface.co/facebook/mms-1b-all) | [MMS coverage table includes pus with ASR support. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | ASR transfer baseline |
|
| 8 |
+
| MMS TTS | [huggingface](https://huggingface.co/facebook/mms-tts) | [MMS coverage table includes pus with TTS support. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | TTS baseline and transfer |
|
| 9 |
+
| NLLB-200 Distilled 600M | [huggingface](https://huggingface.co/facebook/nllb-200-distilled-600M) | [Model special token map includes pbt_Arab. (`pbt_Arab`)](https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json) | Pashto translation baseline |
|
| 10 |
+
| OPUS MT en-mul | [huggingface](https://huggingface.co/Helsinki-NLP/opus-mt-en-mul) | [Language list includes pus code. (`pus`)](https://huggingface.co/Helsinki-NLP/opus-mt-en-mul) | English to Pashto translation path |
|
| 11 |
+
| OPUS MT mul-en | [huggingface](https://huggingface.co/Helsinki-NLP/opus-mt-mul-en) | [Language list includes pus code. (`pus`)](https://huggingface.co/Helsinki-NLP/opus-mt-mul-en) | Pashto to English translation path |
|
| 12 |
+
| PashtoBERT | [huggingface](https://huggingface.co/mdarhri/pashto-bert) | [Model card states training on Pashto corpus data. (`Pashto`)](https://huggingface.co/mdarhri/pashto-bert) | Pashto NLP baseline encoder |
|
| 13 |
+
| Whisper Large v3 | [huggingface](https://huggingface.co/openai/whisper-large-v3) | [Whisper tokenizer map includes ps language key. (`ps`)](https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py) | ASR baseline and pseudo-labeling |
|
|
|
|
| 14 |
|
| 15 |
+
## Maintenance
|
| 16 |
+
- Source of truth: [../catalog/resources.json](../catalog/resources.json)
|
| 17 |
+
- Validation: [../../scripts/validate_resource_catalog.py](../../scripts/validate_resource_catalog.py)
|
| 18 |
+
- Generated by: [../../scripts/generate_resource_views.py](../../scripts/generate_resource_views.py)
|
resources/papers/README.md
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Papers
|
| 2 |
+
|
| 3 |
+
## Verified Pashto Resources
|
| 4 |
+
|
| 5 |
+
| Resource | Link | Pashto Evidence | Primary Use |
|
| 6 |
+
|---|---|---|---|
|
| 7 |
+
| FLEURS: Few-shot Learning Evaluation of Universal Representations of Speech | [arxiv](https://arxiv.org/abs/2205.12446) | [Dataset implementation includes ps_af language code. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark methodology reference |
|
| 8 |
+
| No Language Left Behind | [arxiv](https://arxiv.org/abs/2207.04672) | [Model usage in repo references pbt_Arab token support. (`pbt_Arab`)](https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json) | MT research reference |
|
| 9 |
+
| Robust Speech Recognition via Large-Scale Weak Supervision | [arxiv](https://arxiv.org/abs/2212.04356) | [Paired with tokenizer language map containing ps. (`ps`)](https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py) | ASR methodology reference |
|
| 10 |
+
| Scaling Speech Technology to 1,000+ Languages | [arxiv](https://arxiv.org/abs/2305.13516) | [Coverage table marks pus support in MMS release. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | ASR and TTS transfer reference |
|
| 11 |
+
|
| 12 |
+
## Maintenance
|
| 13 |
+
- Source of truth: [../catalog/resources.json](../catalog/resources.json)
|
| 14 |
+
- Validation: [../../scripts/validate_resource_catalog.py](../../scripts/validate_resource_catalog.py)
|
| 15 |
+
- Generated by: [../../scripts/generate_resource_views.py](../../scripts/generate_resource_views.py)
|
resources/schema/resource.schema.json
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
| 3 |
+
"$id": "https://musawer1214.github.io/Pukhto_Pashto/resources/schema/resource.schema.json",
|
| 4 |
+
"title": "Pashto Resource Catalog",
|
| 5 |
+
"type": "object",
|
| 6 |
+
"additionalProperties": false,
|
| 7 |
+
"required": [
|
| 8 |
+
"version",
|
| 9 |
+
"updated_on",
|
| 10 |
+
"resources"
|
| 11 |
+
],
|
| 12 |
+
"properties": {
|
| 13 |
+
"version": {
|
| 14 |
+
"type": "string",
|
| 15 |
+
"pattern": "^\\d+\\.\\d+\\.\\d+$"
|
| 16 |
+
},
|
| 17 |
+
"updated_on": {
|
| 18 |
+
"type": "string",
|
| 19 |
+
"format": "date"
|
| 20 |
+
},
|
| 21 |
+
"resources": {
|
| 22 |
+
"type": "array",
|
| 23 |
+
"items": {
|
| 24 |
+
"$ref": "#/$defs/resource"
|
| 25 |
+
}
|
| 26 |
+
}
|
| 27 |
+
},
|
| 28 |
+
"$defs": {
|
| 29 |
+
"resource": {
|
| 30 |
+
"type": "object",
|
| 31 |
+
"additionalProperties": false,
|
| 32 |
+
"required": [
|
| 33 |
+
"id",
|
| 34 |
+
"title",
|
| 35 |
+
"url",
|
| 36 |
+
"category",
|
| 37 |
+
"source",
|
| 38 |
+
"status",
|
| 39 |
+
"summary",
|
| 40 |
+
"primary_use",
|
| 41 |
+
"pashto_evidence",
|
| 42 |
+
"tags"
|
| 43 |
+
],
|
| 44 |
+
"properties": {
|
| 45 |
+
"id": {
|
| 46 |
+
"type": "string",
|
| 47 |
+
"pattern": "^[a-z0-9][a-z0-9._-]*$"
|
| 48 |
+
},
|
| 49 |
+
"title": {
|
| 50 |
+
"type": "string",
|
| 51 |
+
"minLength": 3
|
| 52 |
+
},
|
| 53 |
+
"url": {
|
| 54 |
+
"type": "string",
|
| 55 |
+
"format": "uri",
|
| 56 |
+
"pattern": "^https?://"
|
| 57 |
+
},
|
| 58 |
+
"category": {
|
| 59 |
+
"type": "string",
|
| 60 |
+
"enum": [
|
| 61 |
+
"dataset",
|
| 62 |
+
"model",
|
| 63 |
+
"benchmark",
|
| 64 |
+
"tool",
|
| 65 |
+
"paper"
|
| 66 |
+
]
|
| 67 |
+
},
|
| 68 |
+
"source": {
|
| 69 |
+
"type": "string",
|
| 70 |
+
"enum": [
|
| 71 |
+
"huggingface",
|
| 72 |
+
"mozilla",
|
| 73 |
+
"kaggle",
|
| 74 |
+
"github",
|
| 75 |
+
"arxiv",
|
| 76 |
+
"meta",
|
| 77 |
+
"other"
|
| 78 |
+
]
|
| 79 |
+
},
|
| 80 |
+
"status": {
|
| 81 |
+
"type": "string",
|
| 82 |
+
"enum": [
|
| 83 |
+
"verified",
|
| 84 |
+
"candidate"
|
| 85 |
+
]
|
| 86 |
+
},
|
| 87 |
+
"summary": {
|
| 88 |
+
"type": "string",
|
| 89 |
+
"minLength": 10
|
| 90 |
+
},
|
| 91 |
+
"primary_use": {
|
| 92 |
+
"type": "string",
|
| 93 |
+
"minLength": 3
|
| 94 |
+
},
|
| 95 |
+
"license": {
|
| 96 |
+
"type": "string"
|
| 97 |
+
},
|
| 98 |
+
"tasks": {
|
| 99 |
+
"type": "array",
|
| 100 |
+
"items": {
|
| 101 |
+
"type": "string"
|
| 102 |
+
}
|
| 103 |
+
},
|
| 104 |
+
"pashto_evidence": {
|
| 105 |
+
"type": "object",
|
| 106 |
+
"additionalProperties": false,
|
| 107 |
+
"required": [
|
| 108 |
+
"evidence_text",
|
| 109 |
+
"evidence_url",
|
| 110 |
+
"markers"
|
| 111 |
+
],
|
| 112 |
+
"properties": {
|
| 113 |
+
"evidence_text": {
|
| 114 |
+
"type": "string",
|
| 115 |
+
"minLength": 3
|
| 116 |
+
},
|
| 117 |
+
"evidence_url": {
|
| 118 |
+
"type": "string",
|
| 119 |
+
"format": "uri",
|
| 120 |
+
"pattern": "^https?://"
|
| 121 |
+
},
|
| 122 |
+
"markers": {
|
| 123 |
+
"type": "array",
|
| 124 |
+
"minItems": 1,
|
| 125 |
+
"items": {
|
| 126 |
+
"type": "string",
|
| 127 |
+
"minLength": 1
|
| 128 |
+
}
|
| 129 |
+
}
|
| 130 |
+
}
|
| 131 |
+
},
|
| 132 |
+
"tags": {
|
| 133 |
+
"type": "array",
|
| 134 |
+
"minItems": 1,
|
| 135 |
+
"items": {
|
| 136 |
+
"type": "string"
|
| 137 |
+
}
|
| 138 |
+
}
|
| 139 |
+
}
|
| 140 |
+
}
|
| 141 |
+
}
|
| 142 |
+
}
|
resources/tools/README.md
CHANGED
|
@@ -1,17 +1,13 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
-
##
|
| 4 |
|
| 5 |
-
|
|
| 6 |
-
|---|---|---|
|
| 7 |
-
|
|
| 8 |
-
|
|
| 9 |
|
| 10 |
-
##
|
| 11 |
-
-
|
| 12 |
-
-
|
| 13 |
-
-
|
| 14 |
-
- FLEURS paper: [arXiv:2205.12446](https://arxiv.org/abs/2205.12446)
|
| 15 |
-
|
| 16 |
-
## Integration Path
|
| 17 |
-
- Desktop integration: [../../apps/desktop/README.md](../../apps/desktop/README.md)
|
|
|
|
| 1 |
+
# Tools
|
| 2 |
|
| 3 |
+
## Verified Pashto Resources
|
| 4 |
|
| 5 |
+
| Resource | Link | Pashto Evidence | Primary Use |
|
| 6 |
+
|---|---|---|---|
|
| 7 |
+
| Coqui TTS | [github](https://github.com/coqui-ai/TTS) | [Can be paired with Pashto-supporting MMS checkpoints. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | TTS training and inference |
|
| 8 |
+
| Faster-Whisper | [github](https://github.com/SYSTRAN/faster-whisper) | [Whisper tokenizer includes ps and tool runs Whisper models. (`ps`)](https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py) | ASR inference acceleration |
|
| 9 |
|
| 10 |
+
## Maintenance
|
| 11 |
+
- Source of truth: [../catalog/resources.json](../catalog/resources.json)
|
| 12 |
+
- Validation: [../../scripts/validate_resource_catalog.py](../../scripts/validate_resource_catalog.py)
|
| 13 |
+
- Generated by: [../../scripts/generate_resource_views.py](../../scripts/generate_resource_views.py)
|
|
|
|
|
|
|
|
|
|
|
|
scripts/README.md
CHANGED
|
@@ -1,10 +1,13 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
-
Automation scripts for
|
| 4 |
|
| 5 |
-
## Available
|
| 6 |
-
-
|
| 7 |
-
-
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
## Usage
|
| 10 |
|
|
@@ -13,7 +16,22 @@ Validate normalization seed file:
|
|
| 13 |
python scripts/validate_normalization.py data/processed/normalization_seed_v0.1.tsv
|
| 14 |
```
|
| 15 |
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
```bash
|
| 18 |
python scripts/check_links.py
|
| 19 |
```
|
|
|
|
| 1 |
+
# Scripts
|
| 2 |
|
| 3 |
+
Automation scripts for quality checks, resource catalog validation, and search index generation.
|
| 4 |
|
| 5 |
+
## Available scripts
|
| 6 |
+
- `validate_normalization.py`: validate normalization seed TSV format and rules.
|
| 7 |
+
- `check_links.py`: ensure markdown links are clickable (optional online reachability check).
|
| 8 |
+
- `validate_resource_catalog.py`: validate `resources/catalog/resources.json`.
|
| 9 |
+
- `generate_resource_views.py`: generate `resources/*/README.md`, `resources/README.md`, and `docs/search/resources.json` from the catalog.
|
| 10 |
+
- `sync_resources.py`: collect new candidate Pashto resources from public endpoints into `resources/catalog/pending_candidates.json`.
|
| 11 |
|
| 12 |
## Usage
|
| 13 |
|
|
|
|
| 16 |
python scripts/validate_normalization.py data/processed/normalization_seed_v0.1.tsv
|
| 17 |
```
|
| 18 |
|
| 19 |
+
Validate resource catalog:
|
| 20 |
+
```bash
|
| 21 |
+
python scripts/validate_resource_catalog.py
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
Generate markdown and search index from catalog:
|
| 25 |
+
```bash
|
| 26 |
+
python scripts/generate_resource_views.py
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
Sync candidate resources for maintainer review:
|
| 30 |
+
```bash
|
| 31 |
+
python scripts/sync_resources.py --limit 20
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
Check markdown links format:
|
| 35 |
```bash
|
| 36 |
python scripts/check_links.py
|
| 37 |
```
|
scripts/generate_resource_views.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Generate markdown resource views and search index from catalog JSON.
|
| 2 |
+
|
| 3 |
+
Usage:
|
| 4 |
+
python scripts/generate_resource_views.py
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Any
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Maps each catalog category to its (output markdown path, page title).
# Keys must stay in sync with the `category` enum in
# resources/schema/resource.schema.json and the validator's ALLOWED_CATEGORIES.
CATEGORY_CONFIG = {
    "dataset": ("resources/datasets/README.md", "Datasets"),
    "model": ("resources/models/README.md", "Models"),
    "benchmark": ("resources/benchmarks/README.md", "Benchmarks"),
    "tool": ("resources/tools/README.md", "Tools"),
    "paper": ("resources/papers/README.md", "Papers"),
}
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _load_catalog(path: Path) -> dict[str, Any]:
|
| 24 |
+
return json.loads(path.read_text(encoding="utf-8"))
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _escape_cell(value: str) -> str:
|
| 28 |
+
return value.replace("|", "\\|").strip()
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _marker_text(markers: list[str]) -> str:
|
| 32 |
+
return ", ".join(f"`{marker}`" for marker in markers)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _resource_row(resource: dict[str, Any]) -> str:
    """Format one verified catalog entry as a markdown table row.

    The evidence cell links the evidence text (plus any language markers)
    to the evidence URL.
    """
    proof = resource["pashto_evidence"]
    evidence_cell = _escape_cell(proof["evidence_text"])
    marker_list = _marker_text(proof["markers"])
    if marker_list:
        evidence_cell = f"{evidence_cell} ({marker_list})"
    columns = [
        _escape_cell(resource["title"]),
        f"[{resource['source']}]({resource['url']})",
        f"[{evidence_cell}]({proof['evidence_url']})",
        _escape_cell(resource["primary_use"]),
    ]
    return "| " + " | ".join(columns) + " |"
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _write_markdown_table(path: Path, title: str, resources: list[dict[str, Any]]) -> None:
    """Write one category README: heading, resource table, maintenance footer.

    Emits a placeholder row when *resources* is empty so the table stays valid.
    """
    body: list[str] = [
        f"# {title}",
        "",
        "## Verified Pashto Resources",
        "",
        "| Resource | Link | Pashto Evidence | Primary Use |",
        "|---|---|---|---|",
    ]

    if not resources:
        body.append("| _None yet_ | - | - | - |")
    else:
        for entry in resources:
            body.append(_resource_row(entry))

    footer = [
        "",
        "## Maintenance",
        "- Source of truth: [../catalog/resources.json](../catalog/resources.json)",
        "- Validation: [../../scripts/validate_resource_catalog.py](../../scripts/validate_resource_catalog.py)",
        "- Generated by: [../../scripts/generate_resource_views.py](../../scripts/generate_resource_views.py)",
        "",
    ]
    body.extend(footer)
    path.write_text("\n".join(body), encoding="utf-8")
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def _write_resources_home(path: Path, counts: dict[str, int], total_verified: int) -> None:
|
| 78 |
+
lines = [
|
| 79 |
+
"# Resources",
|
| 80 |
+
"",
|
| 81 |
+
"Structured, Pashto-focused resource tracking lives in this folder.",
|
| 82 |
+
"",
|
| 83 |
+
"## Sections",
|
| 84 |
+
f"- Datasets ({counts.get('dataset', 0)}): [datasets/README.md](datasets/README.md)",
|
| 85 |
+
f"- Models ({counts.get('model', 0)}): [models/README.md](models/README.md)",
|
| 86 |
+
f"- Benchmarks ({counts.get('benchmark', 0)}): [benchmarks/README.md](benchmarks/README.md)",
|
| 87 |
+
f"- Tools ({counts.get('tool', 0)}): [tools/README.md](tools/README.md)",
|
| 88 |
+
f"- Papers ({counts.get('paper', 0)}): [papers/README.md](papers/README.md)",
|
| 89 |
+
"",
|
| 90 |
+
"## Machine-Readable Catalog",
|
| 91 |
+
"- Canonical catalog: [catalog/resources.json](catalog/resources.json)",
|
| 92 |
+
"- Candidate feed: [catalog/pending_candidates.json](catalog/pending_candidates.json)",
|
| 93 |
+
"- Schema: [schema/resource.schema.json](schema/resource.schema.json)",
|
| 94 |
+
"",
|
| 95 |
+
"## Update Rule",
|
| 96 |
+
"- Add only validated resources with explicit Pashto relevance.",
|
| 97 |
+
"- Keep every external reference clickable using markdown links.",
|
| 98 |
+
"- Run `python scripts/validate_resource_catalog.py` before opening a PR.",
|
| 99 |
+
"- Run `python scripts/generate_resource_views.py` after catalog changes.",
|
| 100 |
+
"",
|
| 101 |
+
f"Verified resource count: `{total_verified}`",
|
| 102 |
+
"",
|
| 103 |
+
]
|
| 104 |
+
path.write_text("\n".join(lines), encoding="utf-8")
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def _build_search_payload(resources: list[dict[str, Any]], updated_on: str) -> dict[str, Any]:
|
| 108 |
+
search_items: list[dict[str, Any]] = []
|
| 109 |
+
for resource in resources:
|
| 110 |
+
evidence = resource["pashto_evidence"]
|
| 111 |
+
search_items.append(
|
| 112 |
+
{
|
| 113 |
+
"id": resource["id"],
|
| 114 |
+
"title": resource["title"],
|
| 115 |
+
"url": resource["url"],
|
| 116 |
+
"category": resource["category"],
|
| 117 |
+
"source": resource["source"],
|
| 118 |
+
"status": resource["status"],
|
| 119 |
+
"summary": resource["summary"],
|
| 120 |
+
"primary_use": resource["primary_use"],
|
| 121 |
+
"tasks": resource.get("tasks", []),
|
| 122 |
+
"tags": resource["tags"],
|
| 123 |
+
"evidence_text": evidence["evidence_text"],
|
| 124 |
+
"evidence_url": evidence["evidence_url"],
|
| 125 |
+
"markers": evidence["markers"],
|
| 126 |
+
}
|
| 127 |
+
)
|
| 128 |
+
|
| 129 |
+
return {
|
| 130 |
+
"generated_on": f"{updated_on}T00:00:00Z",
|
| 131 |
+
"count": len(search_items),
|
| 132 |
+
"resources": search_items,
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def main() -> int:
    """Regenerate all derived views from the canonical resource catalog.

    Reads ``resources/catalog/resources.json`` and rewrites:
    - one README per category under ``resources/`` (verified entries only),
    - the ``resources/README.md`` overview page,
    - the ``docs/search/resources.json`` search index (all entries).

    Returns:
        0 always; the value is used as the process exit code.
    """
    catalog_path = Path("resources/catalog/resources.json")
    catalog = _load_catalog(catalog_path)
    resources: list[dict[str, Any]] = catalog.get("resources", [])
    # Fallback date keeps output deterministic if the catalog omits the field.
    updated_on = catalog.get("updated_on", "1970-01-01")
    # Only verified entries are shown in the generated markdown tables.
    verified = [resource for resource in resources if resource.get("status") == "verified"]

    # Bucket verified entries by category; unknown categories are dropped.
    grouped: dict[str, list[dict[str, Any]]] = {category: [] for category in CATEGORY_CONFIG}
    for resource in verified:
        category = resource.get("category")
        if category in grouped:
            grouped[category].append(resource)

    for category, (file_path, title) in CATEGORY_CONFIG.items():
        output_path = Path(file_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # Case-insensitive sort keeps regeneration stable across runs.
        rows = sorted(grouped[category], key=lambda item: item["title"].lower())
        _write_markdown_table(output_path, title, rows)

    counts = {category: len(items) for category, items in grouped.items()}
    _write_resources_home(Path("resources/README.md"), counts, len(verified))

    # The search index intentionally includes candidates too, not only verified.
    search_payload = _build_search_payload(resources, updated_on)
    search_json_path = Path("docs/search/resources.json")
    search_json_path.parent.mkdir(parents=True, exist_ok=True)
    search_json_path.write_text(
        json.dumps(search_payload, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )

    print(
        "Generated resources markdown and search index: "
        f"{len(verified)} verified resources, {len(resources)} total resources"
    )
    return 0
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
if __name__ == "__main__":
|
| 174 |
+
raise SystemExit(main())
|
scripts/sync_resources.py
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Discover new Pashto-related resource candidates from public endpoints.
|
| 2 |
+
|
| 3 |
+
This script does not auto-merge into the main catalog. It writes candidates to
|
| 4 |
+
`resources/catalog/pending_candidates.json` for maintainer review.
|
| 5 |
+
|
| 6 |
+
Usage:
|
| 7 |
+
python scripts/sync_resources.py
|
| 8 |
+
python scripts/sync_resources.py --limit 20 --output resources/catalog/pending_candidates.json
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import argparse
|
| 14 |
+
import json
|
| 15 |
+
import re
|
| 16 |
+
import urllib.parse
|
| 17 |
+
import urllib.request
|
| 18 |
+
import xml.etree.ElementTree as ET
|
| 19 |
+
from datetime import datetime, timezone
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
from typing import Any
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
USER_AGENT = "pashto-resource-sync/1.0"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _slug(value: str) -> str:
|
| 28 |
+
value = value.lower()
|
| 29 |
+
value = re.sub(r"[^a-z0-9]+", "-", value)
|
| 30 |
+
value = re.sub(r"-+", "-", value).strip("-")
|
| 31 |
+
return value[:80] if value else "resource"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _fetch_json(url: str, timeout: float = 20.0) -> Any:
    """GET *url* and parse the UTF-8 response body as JSON.

    Sends the project User-Agent. Propagates urllib errors on network failure
    and json.JSONDecodeError on malformed payloads; callers in main() catch
    these per source.
    """
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return json.loads(response.read().decode("utf-8"))
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _fetch_text(url: str, timeout: float = 20.0) -> str:
    """GET *url* and return the response body decoded as UTF-8.

    Undecodable bytes are replaced rather than raising, so XML/text endpoints
    with stray bytes still yield a parseable string.
    """
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read().decode("utf-8", errors="replace")
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _candidate(
|
| 47 |
+
*,
|
| 48 |
+
rid: str,
|
| 49 |
+
title: str,
|
| 50 |
+
url: str,
|
| 51 |
+
category: str,
|
| 52 |
+
source: str,
|
| 53 |
+
summary: str,
|
| 54 |
+
evidence_text: str,
|
| 55 |
+
evidence_url: str,
|
| 56 |
+
markers: list[str],
|
| 57 |
+
tags: list[str],
|
| 58 |
+
) -> dict[str, Any]:
|
| 59 |
+
return {
|
| 60 |
+
"id": rid,
|
| 61 |
+
"title": title.strip(),
|
| 62 |
+
"url": url.strip(),
|
| 63 |
+
"category": category,
|
| 64 |
+
"source": source,
|
| 65 |
+
"status": "candidate",
|
| 66 |
+
"summary": summary.strip(),
|
| 67 |
+
"primary_use": "Needs maintainer review before promotion to verified catalog.",
|
| 68 |
+
"tasks": [],
|
| 69 |
+
"pashto_evidence": {
|
| 70 |
+
"evidence_text": evidence_text.strip(),
|
| 71 |
+
"evidence_url": evidence_url.strip(),
|
| 72 |
+
"markers": markers,
|
| 73 |
+
},
|
| 74 |
+
"tags": tags,
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def fetch_huggingface(kind: str, limit: int) -> list[dict[str, Any]]:
    """Search the Hugging Face Hub API for Pashto-related repos.

    Args:
        kind: "datasets" or "models"; any other value returns [] without a
            network call.
        limit: maximum number of results requested from the API.

    Returns:
        Candidate records (status "candidate") for each repo id found.
    """
    if kind not in {"datasets", "models"}:
        return []

    query = urllib.parse.urlencode({"search": "pashto", "limit": str(limit)})
    url = f"https://huggingface.co/api/{kind}?{query}"
    payload = _fetch_json(url)

    category = "dataset" if kind == "datasets" else "model"
    out: list[dict[str, Any]] = []
    for item in payload:
        # Older API payloads expose "modelId" instead of "id".
        repo_id = item.get("id") or item.get("modelId")
        if not repo_id:
            continue
        # Dataset pages live under the /datasets/ prefix; model pages do not.
        repo_url = f"https://huggingface.co/{'datasets/' if kind == 'datasets' else ''}{repo_id}"
        # kind[:-1] drops the plural "s" -> "dataset"/"model" in the id.
        rid = f"candidate-hf-{kind[:-1]}-{_slug(repo_id)}"
        out.append(
            _candidate(
                rid=rid,
                title=repo_id,
                url=repo_url,
                category=category,
                source="huggingface",
                summary=f"Candidate {category} returned from Hugging Face search for Pashto.",
                evidence_text="Matched by Pashto keyword in Hugging Face search results.",
                evidence_url=repo_url,
                markers=["pashto"],
                tags=["pashto", "candidate", category],
            )
        )
    return out
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def fetch_arxiv(limit: int) -> list[dict[str, Any]]:
    """Query the arXiv Atom API for Pashto-related papers.

    Args:
        limit: maximum number of Atom entries requested.

    Returns:
        Candidate paper records; entries missing a title or id link are skipped.
    """
    query = urllib.parse.urlencode(
        {"search_query": "all:pashto", "start": "0", "max_results": str(limit)}
    )
    # Use HTTPS: the arXiv export API is served over TLS, so the plain-HTTP
    # endpoint would leak the query in cleartext and depend on a redirect.
    url = f"https://export.arxiv.org/api/query?{query}"
    xml_text = _fetch_text(url)
    root = ET.fromstring(xml_text)
    # Atom XML namespace identifier (not a fetched URL).
    ns = {"atom": "http://www.w3.org/2005/Atom"}

    out: list[dict[str, Any]] = []
    for entry in root.findall("atom:entry", ns):
        title = (entry.findtext("atom:title", default="", namespaces=ns) or "").strip()
        # The Atom <id> element carries the canonical abstract URL.
        link = (entry.findtext("atom:id", default="", namespaces=ns) or "").strip()
        summary = (entry.findtext("atom:summary", default="", namespaces=ns) or "").strip()
        if not title or not link:
            continue

        rid = f"candidate-arxiv-{_slug(title)}"
        out.append(
            _candidate(
                rid=rid,
                title=title,
                url=link,
                category="paper",
                source="arxiv",
                # Abstracts can be long; cap the summary for the review feed.
                summary=summary[:240] if summary else "Candidate paper returned from arXiv query for Pashto.",
                evidence_text="Matched by arXiv query: all:pashto.",
                evidence_url=link,
                markers=["pashto"],
                tags=["pashto", "candidate", "paper"],
            )
        )
    return out
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def fetch_semantic_scholar(limit: int) -> list[dict[str, Any]]:
    """Search the Semantic Scholar Graph API for Pashto-related papers.

    Falls back to a constructed arXiv URL when the API omits a paper URL but
    provides an ArXiv external id; entries with no usable URL are skipped.

    Args:
        limit: maximum number of papers requested.

    Returns:
        Candidate paper records (source "other", since "semanticscholar" is not
        in the catalog's source enum).
    """
    fields = "title,url,abstract,year,externalIds"
    query = urllib.parse.urlencode(
        {"query": "pashto", "limit": str(limit), "fields": fields}
    )
    url = f"https://api.semanticscholar.org/graph/v1/paper/search?{query}"
    payload = _fetch_json(url)

    out: list[dict[str, Any]] = []
    for item in payload.get("data", []):
        title = (item.get("title") or "").strip()
        if not title:
            continue
        paper_url = (item.get("url") or "").strip()
        if not paper_url:
            # Fall back to the arXiv abstract page when only an ArXiv id exists.
            ext = item.get("externalIds") or {}
            arxiv_id = ext.get("ArXiv")
            if arxiv_id:
                paper_url = f"https://arxiv.org/abs/{arxiv_id}"
        if not paper_url:
            continue

        summary = (item.get("abstract") or "").strip()
        rid = f"candidate-s2-{_slug(title)}"
        out.append(
            _candidate(
                rid=rid,
                title=title,
                url=paper_url,
                category="paper",
                source="other",
                # Abstracts can be long; cap the summary for the review feed.
                summary=summary[:240] if summary else "Candidate paper returned from Semantic Scholar search for Pashto.",
                evidence_text="Matched by Semantic Scholar query: pashto.",
                evidence_url=paper_url,
                markers=["pashto"],
                tags=["pashto", "candidate", "paper"],
            )
        )
    return out
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def _dedupe_candidates(
|
| 188 |
+
candidates: list[dict[str, Any]],
|
| 189 |
+
existing_ids: set[str],
|
| 190 |
+
existing_urls: set[str],
|
| 191 |
+
) -> list[dict[str, Any]]:
|
| 192 |
+
unique: list[dict[str, Any]] = []
|
| 193 |
+
seen_ids = set(existing_ids)
|
| 194 |
+
seen_urls = set(existing_urls)
|
| 195 |
+
|
| 196 |
+
for item in candidates:
|
| 197 |
+
rid = item["id"]
|
| 198 |
+
url = item["url"].rstrip("/")
|
| 199 |
+
if rid in seen_ids or url in seen_urls:
|
| 200 |
+
continue
|
| 201 |
+
seen_ids.add(rid)
|
| 202 |
+
seen_urls.add(url)
|
| 203 |
+
unique.append(item)
|
| 204 |
+
return unique
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def main() -> int:
    """Collect new Pashto resource candidates and write the pending feed.

    Fetches from each configured public endpoint, dedupes against the verified
    catalog and within the batch, then writes the payload to --output. If the
    existing output file differs only in its timestamp, the file is left
    untouched so repeated runs (e.g. in CI) do not create noise commits.

    Returns:
        0 always; per-source failures are recorded in the payload's "errors"
        list instead of aborting the run.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--catalog", default="resources/catalog/resources.json")
    parser.add_argument("--output", default="resources/catalog/pending_candidates.json")
    parser.add_argument("--limit", type=int, default=15)
    args = parser.parse_args()

    catalog_path = Path(args.catalog)
    output_path = Path(args.output)

    catalog = json.loads(catalog_path.read_text(encoding="utf-8"))
    resources = catalog.get("resources", [])
    # Collect existing ids/URLs so already-cataloged resources are skipped.
    existing_ids = {resource.get("id", "") for resource in resources if isinstance(resource, dict)}
    existing_urls = {
        resource.get("url", "").rstrip("/")
        for resource in resources
        if isinstance(resource, dict) and isinstance(resource.get("url"), str)
    }

    all_candidates: list[dict[str, Any]] = []
    source_errors: list[str] = []
    sources_used: list[str] = []

    # Lambdas defer the network calls so each source is tried independently.
    fetch_steps = [
        ("huggingface-datasets", lambda: fetch_huggingface("datasets", args.limit)),
        ("huggingface-models", lambda: fetch_huggingface("models", args.limit)),
        ("arxiv", lambda: fetch_arxiv(args.limit)),
        ("semantic-scholar", lambda: fetch_semantic_scholar(args.limit)),
    ]

    for source_name, step in fetch_steps:
        try:
            results = step()
            all_candidates.extend(results)
            sources_used.append(source_name)
        # Broad catch is deliberate: one flaky endpoint must not fail the sync.
        except Exception as exc:  # noqa: BLE001
            source_errors.append(f"{source_name}: {exc}")

    unique_candidates = _dedupe_candidates(all_candidates, existing_ids, existing_urls)
    # Case-insensitive title sort keeps the feed stable across runs.
    unique_candidates = sorted(unique_candidates, key=lambda item: item["title"].lower())

    payload: dict[str, Any] = {
        "generated_on": datetime.now(timezone.utc).isoformat(),
        "sources": sources_used,
        "candidate_count": len(unique_candidates),
        "candidates": unique_candidates,
    }
    if source_errors:
        payload["errors"] = source_errors

    output_path.parent.mkdir(parents=True, exist_ok=True)
    if output_path.exists():
        try:
            old_payload = json.loads(output_path.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            # Corrupt previous output: fall through and overwrite it.
            old_payload = None
        if isinstance(old_payload, dict):
            # Compare everything except the timestamp to detect real changes.
            old_compare = {key: value for key, value in old_payload.items() if key != "generated_on"}
            new_compare = {key: value for key, value in payload.items() if key != "generated_on"}
            if old_compare == new_compare:
                print(
                    f"Candidate sync complete: {len(unique_candidates)} new candidates, "
                    f"{len(source_errors)} source errors, no file changes"
                )
                return 0

    output_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")

    print(
        f"Candidate sync complete: {len(unique_candidates)} new candidates, "
        f"{len(source_errors)} source errors"
    )
    return 0
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
if __name__ == "__main__":
|
| 283 |
+
raise SystemExit(main())
|
scripts/validate_resource_catalog.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Validate the machine-readable Pashto resource catalog.
|
| 2 |
+
|
| 3 |
+
Usage:
|
| 4 |
+
python scripts/validate_resource_catalog.py
|
| 5 |
+
python scripts/validate_resource_catalog.py --catalog resources/catalog/resources.json
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import argparse
|
| 11 |
+
import json
|
| 12 |
+
import re
|
| 13 |
+
from datetime import date
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import Any
|
| 16 |
+
from urllib.parse import urlparse
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# Closed vocabularies; must stay in sync with resources/schema/resource.schema.json.
ALLOWED_CATEGORIES = {"dataset", "model", "benchmark", "tool", "paper"}
ALLOWED_SOURCES = {"huggingface", "mozilla", "kaggle", "github", "arxiv", "meta", "other"}
ALLOWED_STATUS = {"verified", "candidate"}
# Resource ids: lowercase alphanumeric start, then alphanumerics, '.', '_', '-'.
RESOURCE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9._-]*$")
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _load_json(path: Path) -> dict[str, Any]:
|
| 26 |
+
return json.loads(path.read_text(encoding="utf-8"))
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _is_valid_http_url(value: str) -> bool:
|
| 30 |
+
parsed = urlparse(value)
|
| 31 |
+
return parsed.scheme in {"http", "https"} and bool(parsed.netloc)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _validate_iso_date(value: str) -> bool:
|
| 35 |
+
try:
|
| 36 |
+
date.fromisoformat(value)
|
| 37 |
+
except ValueError:
|
| 38 |
+
return False
|
| 39 |
+
return True
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def validate_resource(resource: dict[str, Any], index: int) -> list[str]:
    """Validate a single catalog entry and return human-readable error strings.

    Args:
        resource: one entry from the catalog's "resources" list.
        index: the entry's position, used to prefix each error message.

    Returns:
        A list of error messages; empty when the entry is valid. Returns early
        (without field-level checks) when required fields are missing, since
        the per-field checks below index those fields directly.
    """
    errors: list[str] = []
    prefix = f"resource[{index}]"

    required_fields = {
        "id",
        "title",
        "url",
        "category",
        "source",
        "status",
        "summary",
        "primary_use",
        "pashto_evidence",
        "tags",
    }
    # Sorted so the error message is deterministic regardless of dict order.
    missing = sorted(required_fields - resource.keys())
    if missing:
        errors.append(f"{prefix} missing required fields: {', '.join(missing)}")
        return errors

    rid = resource["id"]
    if not isinstance(rid, str) or not RESOURCE_ID_RE.fullmatch(rid):
        errors.append(f"{prefix}.id must match {RESOURCE_ID_RE.pattern}")

    title = resource["title"]
    if not isinstance(title, str) or len(title.strip()) < 3:
        errors.append(f"{prefix}.title must be a non-empty string")

    url = resource["url"]
    if not isinstance(url, str) or not _is_valid_http_url(url):
        errors.append(f"{prefix}.url must be a valid http/https URL")

    category = resource["category"]
    if category not in ALLOWED_CATEGORIES:
        errors.append(f"{prefix}.category must be one of {sorted(ALLOWED_CATEGORIES)}")

    source = resource["source"]
    if source not in ALLOWED_SOURCES:
        errors.append(f"{prefix}.source must be one of {sorted(ALLOWED_SOURCES)}")

    status = resource["status"]
    if status not in ALLOWED_STATUS:
        errors.append(f"{prefix}.status must be one of {sorted(ALLOWED_STATUS)}")

    summary = resource["summary"]
    if not isinstance(summary, str) or len(summary.strip()) < 10:
        errors.append(f"{prefix}.summary must be at least 10 characters")

    primary_use = resource["primary_use"]
    if not isinstance(primary_use, str) or len(primary_use.strip()) < 3:
        errors.append(f"{prefix}.primary_use must be a non-empty string")

    # "tasks" is optional; only validate its shape when present.
    if "tasks" in resource and not (
        isinstance(resource["tasks"], list)
        and all(isinstance(item, str) and item.strip() for item in resource["tasks"])
    ):
        errors.append(f"{prefix}.tasks must be a list of strings")

    tags = resource["tags"]
    if not (isinstance(tags, list) and tags and all(isinstance(tag, str) and tag.strip() for tag in tags)):
        errors.append(f"{prefix}.tags must be a non-empty list of strings")

    evidence = resource["pashto_evidence"]
    if not isinstance(evidence, dict):
        # Cannot inspect sub-fields of a non-dict; stop here for this entry.
        errors.append(f"{prefix}.pashto_evidence must be an object")
        return errors

    for key in ("evidence_text", "evidence_url", "markers"):
        if key not in evidence:
            errors.append(f"{prefix}.pashto_evidence missing '{key}'")

    evidence_text = evidence.get("evidence_text")
    if not isinstance(evidence_text, str) or len(evidence_text.strip()) < 3:
        errors.append(f"{prefix}.pashto_evidence.evidence_text must be a string")

    evidence_url = evidence.get("evidence_url")
    if not isinstance(evidence_url, str) or not _is_valid_http_url(evidence_url):
        errors.append(f"{prefix}.pashto_evidence.evidence_url must be a valid http/https URL")

    markers = evidence.get("markers")
    if not (isinstance(markers, list) and markers and all(isinstance(marker, str) and marker.strip() for marker in markers)):
        errors.append(f"{prefix}.pashto_evidence.markers must be a non-empty list of strings")

    return errors
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def validate_catalog(catalog: dict[str, Any]) -> list[str]:
    """Validate the full resource catalog and return a list of error messages.

    An empty list means the catalog is valid. The check proceeds in stages:
    required top-level keys (``version``, ``updated_on``, ``resources``),
    then the version/date formats, then each resource entry (delegated to
    ``validate_resource``) plus a duplicate-id check across entries.
    """
    errors: list[str] = []

    for key in ("version", "updated_on", "resources"):
        if key not in catalog:
            errors.append(f"catalog missing required top-level key: {key}")

    if errors:
        # Without the top-level keys, the lookups below would raise KeyError.
        return errors

    version = catalog["version"]
    # re.fullmatch already anchors the whole string, so explicit ^/$ are redundant.
    if not isinstance(version, str) or not re.fullmatch(r"\d+\.\d+\.\d+", version):
        errors.append("catalog.version must look like '1.0.0'")

    updated_on = catalog["updated_on"]
    if not isinstance(updated_on, str) or not _validate_iso_date(updated_on):
        errors.append("catalog.updated_on must be a valid ISO date (YYYY-MM-DD)")

    resources = catalog["resources"]
    if not isinstance(resources, list):
        # No point iterating a non-list; per-resource checks are skipped.
        errors.append("catalog.resources must be a list")
        return errors

    seen_ids: set[str] = set()
    for index, resource in enumerate(resources):
        if not isinstance(resource, dict):
            errors.append(f"resource[{index}] must be an object")
            continue
        errors.extend(validate_resource(resource, index))
        resource_id = resource.get("id")
        # Only string ids participate in duplicate detection; non-string ids
        # are already reported by validate_resource.
        if isinstance(resource_id, str):
            if resource_id in seen_ids:
                errors.append(f"duplicate resource id: {resource_id}")
            seen_ids.add(resource_id)

    return errors
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def main() -> int:
    """CLI entry point: validate the resource catalog against the schema file.

    Returns 0 when the catalog is valid; 1 on a missing input file, invalid
    JSON, a malformed schema file, or any validation error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--catalog", default="resources/catalog/resources.json")
    parser.add_argument("--schema", default="resources/schema/resource.schema.json")
    args = parser.parse_args()

    catalog_path = Path(args.catalog)
    schema_path = Path(args.schema)

    # Fail fast on missing inputs, reporting the catalog path first.
    for label, path in (("catalog", catalog_path), ("schema", schema_path)):
        if not path.exists():
            print(f"Missing {label} file: {path}")
            return 1

    try:
        schema = _load_json(schema_path)
        catalog = _load_json(catalog_path)
    except json.JSONDecodeError as exc:
        print(f"Invalid JSON: {exc}")
        return 1

    # Basic schema sanity check (this script enforces the validation rules directly).
    if not (isinstance(schema, dict) and "$schema" in schema):
        print("Schema file must be a JSON object with a '$schema' key")
        return 1

    errors = validate_catalog(catalog)
    if errors:
        print("Resource catalog validation failed:")
        print("\n".join(f"- {error}" for error in errors))
        return 1

    print(f"Resource catalog valid: {len(catalog['resources'])} resources")
    return 0
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
# Script entry point: exit with main()'s return code (0 = valid catalog).
if __name__ == "__main__":
    raise SystemExit(main())
|
tests/test_validate_resource_catalog.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from scripts.validate_resource_catalog import validate_catalog
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def _minimal_catalog() -> dict:
|
| 5 |
+
return {
|
| 6 |
+
"version": "1.0.0",
|
| 7 |
+
"updated_on": "2026-02-15",
|
| 8 |
+
"resources": [
|
| 9 |
+
{
|
| 10 |
+
"id": "dataset-example",
|
| 11 |
+
"title": "Example Dataset",
|
| 12 |
+
"url": "https://example.org/dataset",
|
| 13 |
+
"category": "dataset",
|
| 14 |
+
"source": "other",
|
| 15 |
+
"status": "verified",
|
| 16 |
+
"summary": "Useful Pashto example dataset for testing the validator.",
|
| 17 |
+
"primary_use": "Testing",
|
| 18 |
+
"pashto_evidence": {
|
| 19 |
+
"evidence_text": "Mentions Pashto in title.",
|
| 20 |
+
"evidence_url": "https://example.org/dataset",
|
| 21 |
+
"markers": ["Pashto"],
|
| 22 |
+
},
|
| 23 |
+
"tags": ["pashto", "test"],
|
| 24 |
+
}
|
| 25 |
+
],
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def test_validate_catalog_passes_for_minimal_valid_catalog() -> None:
    """A minimal, well-formed catalog produces no validation errors."""
    assert validate_catalog(_minimal_catalog()) == []
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def test_validate_catalog_fails_for_duplicate_ids() -> None:
    """Repeating a resource id is reported as a duplicate."""
    catalog = _minimal_catalog()
    duplicate = dict(catalog["resources"][0])
    catalog["resources"].append(duplicate)
    messages = validate_catalog(catalog)
    assert any("duplicate resource id" in message for message in messages)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def test_validate_catalog_fails_for_invalid_evidence_url() -> None:
    """A non-http(s) evidence URL is flagged by the validator."""
    catalog = _minimal_catalog()
    evidence = catalog["resources"][0]["pashto_evidence"]
    evidence["evidence_url"] = "not-a-url"
    assert any("evidence_url" in message for message in validate_catalog(catalog))
|