musaw committed on
Commit
2f53244
·
1 Parent(s): 4598659

Sync main snapshot to Hugging Face (no local binary banner)

Browse files
.gitattributes CHANGED
@@ -1,8 +1,7 @@
1
- * text=auto eol=lf
2
  *.md text eol=lf
3
  *.py text eol=lf
4
  *.tsv text eol=lf
5
-
6
  *.7z filter=lfs diff=lfs merge=lfs -text
7
  *.arrow filter=lfs diff=lfs merge=lfs -text
8
  *.bin filter=lfs diff=lfs merge=lfs -text
@@ -38,3 +37,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
38
  *.zip filter=lfs diff=lfs merge=lfs -text
39
  *.zst filter=lfs diff=lfs merge=lfs -text
40
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
1
+ * text=auto eol=lf
2
  *.md text eol=lf
3
  *.py text eol=lf
4
  *.tsv text eol=lf
 
5
  *.7z filter=lfs diff=lfs merge=lfs -text
6
  *.arrow filter=lfs diff=lfs merge=lfs -text
7
  *.bin filter=lfs diff=lfs merge=lfs -text
 
37
  *.zip filter=lfs diff=lfs merge=lfs -text
38
  *.zst filter=lfs diff=lfs merge=lfs -text
39
  *tfevents* filter=lfs diff=lfs merge=lfs -text
40
+
.github/workflows/resource_sync.yml CHANGED
@@ -32,6 +32,36 @@ jobs:
32
  - name: Validate catalog
33
  run: python scripts/validate_resource_catalog.py
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  - name: Create review PR
36
  uses: peter-evans/create-pull-request@v6
37
  with:
 
32
  - name: Validate catalog
33
  run: python scripts/validate_resource_catalog.py
34
 
35
+ - name: Ensure labels exist
36
+ uses: actions/github-script@v7
37
+ with:
38
+ script: |
39
+ const labels = [
40
+ { name: "resource-update", color: "0e8a16", description: "Automated resource catalog updates" },
41
+ { name: "needs-review", color: "fbca04", description: "Needs maintainer review before merge" }
42
+ ];
43
+ for (const label of labels) {
44
+ try {
45
+ await github.rest.issues.getLabel({
46
+ owner: context.repo.owner,
47
+ repo: context.repo.repo,
48
+ name: label.name
49
+ });
50
+ } catch (error) {
51
+ if (error.status === 404) {
52
+ await github.rest.issues.createLabel({
53
+ owner: context.repo.owner,
54
+ repo: context.repo.repo,
55
+ name: label.name,
56
+ color: label.color,
57
+ description: label.description
58
+ });
59
+ } else {
60
+ throw error;
61
+ }
62
+ }
63
+ }
64
+
65
  - name: Create review PR
66
  uses: peter-evans/create-pull-request@v6
67
  with:
CITATION.cff ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cff-version: 1.2.0
2
+ message: "If you use this repository, please cite it."
3
+ title: "Pashto Language Resources Hub (Pukhto/Pashto)"
4
+ type: software
5
+ version: 0.1.0
6
+ license: Apache-2.0
7
+ repository-code: "https://github.com/Musawer1214/pashto-language-resources"
8
+ url: "https://github.com/Musawer1214/pashto-language-resources"
9
+ authors:
10
+ - family-names: "Musawer"
11
+ given-names: "Musawer"
12
+ keywords:
13
+ - Pashto
14
+ - Pukhto
15
+ - Pushto
16
+ - ASR
17
+ - TTS
18
+ - NLP
19
+ - machine translation
20
+ - language resources
21
+
README.md CHANGED
@@ -1,63 +1,57 @@
1
- ---
2
- license: apache-2.0
3
- language:
4
- - ps
5
- tags:
6
- - pashto
7
- - asr
8
- - tts
9
- - nlp
10
- ---
11
 
12
- ![Pukhto Pashto Repository Banner](Repository_banner_Image.png)
13
 
14
- # Pukhto/Pashto Open Language Project
15
 
16
- Community-led open-source project to make Pashto a first-class language in speech and language technology.
17
 
18
- ## Project Links
19
- - GitHub Pages (About): [Pukhto_Pashto Site](https://musawer1214.github.io/Pukhto_Pashto/)
20
- - GitHub Pages (Resource Search): [Pashto Resource Search](https://musawer1214.github.io/Pukhto_Pashto/search/)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  ## Current Scope
 
23
  - Build open Pashto datasets, benchmarks, and model references for ASR, TTS, NLP, and MT.
24
- - Track practical tools, apps, and academic papers relevant to Pashto integration in technology.
25
  - Keep everything transparent, reproducible, and contribution-friendly.
26
 
27
- ## Resource System (Current)
28
 
29
- This repository now has a machine-readable and searchable resource pipeline:
30
 
31
  - Canonical catalog: [resources/catalog/resources.json](resources/catalog/resources.json)
32
  - Catalog schema: [resources/schema/resource.schema.json](resources/schema/resource.schema.json)
33
  - Candidate feed (auto-generated): [resources/catalog/pending_candidates.json](resources/catalog/pending_candidates.json)
34
- - Search UI: [docs/search/index.html](docs/search/index.html)
35
  - Search data export: [docs/search/resources.json](docs/search/resources.json)
36
- - Full index docs: [docs/resource_catalog.md](docs/resource_catalog.md)
37
  - Automation docs: [docs/resource_automation.md](docs/resource_automation.md)
38
- - Repeatable runbook: [docs/resource_cycle_runbook.md](docs/resource_cycle_runbook.md)
39
 
40
  ## How New Resources Are Added
41
 
42
- The process is semi-automatic:
43
-
44
- 1. Auto discovery:
45
- - Daily GitHub Action runs `.github/workflows/resource_sync.yml`.
46
- - It updates `resources/catalog/pending_candidates.json` and opens a review PR.
47
-
48
- 2. Manual review and promotion:
49
- - Maintainers inspect candidate quality, Pashto evidence, and license/usage compatibility.
50
- - Approved entries are moved into `resources/catalog/resources.json` with `status: verified`.
51
-
52
- 3. Regeneration and validation:
53
- - Run `python scripts/validate_resource_catalog.py`
54
- - Run `python scripts/generate_resource_views.py`
55
- - Commit generated updates (`resources/*/README.md` and `docs/search/resources.json`).
56
 
57
  Shortcut wrapper:
58
- - Run `python scripts/run_resource_cycle.py --limit 25`
59
-
60
- This prevents low-confidence links from being merged directly while still automating discovery.
61
 
62
  ## Quickstart
63
 
@@ -69,7 +63,15 @@ python scripts/check_links.py
69
  python -m pytest -q
70
  ```
71
 
 
 
 
 
 
 
 
72
  ## Documentation Map
 
73
  - Purpose: [PROJECT_PURPOSE.md](PROJECT_PURPOSE.md)
74
  - Contributing: [CONTRIBUTING.md](CONTRIBUTING.md)
75
  - Roadmap: [ROADMAP.md](ROADMAP.md)
@@ -82,6 +84,7 @@ python -m pytest -q
82
  - Resource automation: [docs/resource_automation.md](docs/resource_automation.md)
83
 
84
  ## Resource Sections
 
85
  - Datasets: [resources/datasets/README.md](resources/datasets/README.md)
86
  - Models: [resources/models/README.md](resources/models/README.md)
87
  - Benchmarks: [resources/benchmarks/README.md](resources/benchmarks/README.md)
@@ -91,6 +94,7 @@ python -m pytest -q
91
  - Code: [resources/codes/README.md](resources/codes/README.md)
92
 
93
  ## Workspaces
 
94
  - [data/](data/README.md): datasets, curation, metadata, quality
95
  - [asr/](asr/README.md): ASR baselines and experiments
96
  - [tts/](tts/README.md): TTS baselines and experiments
@@ -98,3 +102,6 @@ python -m pytest -q
98
  - [experiments/](experiments/README.md): reproducible run cards
99
  - [apps/desktop/](apps/desktop/README.md): user-facing integration references
100
  - [models/](models/README.md): model layout and release conventions
 
 
 
 
1
+ # Pashto Language Resources Hub (Pukhto/Pashto)
 
 
 
 
 
 
 
 
 
2
 
3
+ Open-source repository for Pashto language technology resources: datasets, models, benchmarks, ASR, TTS, NLP, and machine translation (MT).
4
 
5
+ This project curates verified Pashto resources and maintains reproducible tooling for discovery, validation, and documentation.
6
 
7
+ ![Pukhto Pashto Repository Banner](https://raw.githubusercontent.com/Musawer1214/pashto-language-resources/main/Repository_banner_Image.png)
8
 
9
+ ## Start Here
10
+
11
+ - Main resource search: [Pashto Resource Search](https://musawer1214.github.io/pashto-language-resources/search/)
12
+ - Project site: [Pashto Language Resources Hub](https://musawer1214.github.io/pashto-language-resources/)
13
+ - GitHub repository: [Musawer1214/pashto-language-resources](https://github.com/Musawer1214/pashto-language-resources)
14
+ - Hugging Face mirror: [Musawer14/pashto-language-resources](https://huggingface.co/Musawer14/pashto-language-resources)
15
+
16
+ ## If You Searched For
17
+
18
+ This repository is relevant to these search intents:
19
+
20
+ - Pashto datasets
21
+ - Pashto ASR model
22
+ - Pashto TTS resources
23
+ - Pashto NLP benchmark
24
+ - Pashto machine translation resources
25
+ - Pukhto language technology
26
+ - Pushto AI resources
27
 
28
  ## Current Scope
29
+
30
  - Build open Pashto datasets, benchmarks, and model references for ASR, TTS, NLP, and MT.
31
+ - Track practical tools, apps, and academic papers for Pashto integration in technology.
32
  - Keep everything transparent, reproducible, and contribution-friendly.
33
 
34
+ ## Resource System
35
 
36
+ Machine-readable and searchable resource pipeline:
37
 
38
  - Canonical catalog: [resources/catalog/resources.json](resources/catalog/resources.json)
39
  - Catalog schema: [resources/schema/resource.schema.json](resources/schema/resource.schema.json)
40
  - Candidate feed (auto-generated): [resources/catalog/pending_candidates.json](resources/catalog/pending_candidates.json)
41
+ - Search UI source: [docs/search/index.html](docs/search/index.html)
42
  - Search data export: [docs/search/resources.json](docs/search/resources.json)
43
+ - Resource index docs: [docs/resource_catalog.md](docs/resource_catalog.md)
44
  - Automation docs: [docs/resource_automation.md](docs/resource_automation.md)
45
+ - Cycle runbook: [docs/resource_cycle_runbook.md](docs/resource_cycle_runbook.md)
46
 
47
  ## How New Resources Are Added
48
 
49
+ 1. Auto discovery runs daily from `.github/workflows/resource_sync.yml` and updates `resources/catalog/pending_candidates.json` in a review PR.
50
+ 2. Manual review checks quality, Pashto evidence, and license compatibility before promoting entries into `resources/catalog/resources.json` with `status: verified`.
51
+ 3. Regeneration and validation runs `python scripts/validate_resource_catalog.py` and `python scripts/generate_resource_views.py`, then commits generated updates.
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  Shortcut wrapper:
54
+ - `python scripts/run_resource_cycle.py --limit 25`
 
 
55
 
56
  ## Quickstart
57
 
 
63
  python -m pytest -q
64
  ```
65
 
66
+ ## Discoverability And SEO
67
+
68
+ - Playbook: [docs/discoverability_seo.md](docs/discoverability_seo.md)
69
+ - Docs hub: [docs/README.md](docs/README.md)
70
+ - Resource search page: [docs/search/index.html](docs/search/index.html)
71
+ - Citation metadata: [CITATION.cff](CITATION.cff)
72
+
73
  ## Documentation Map
74
+
75
  - Purpose: [PROJECT_PURPOSE.md](PROJECT_PURPOSE.md)
76
  - Contributing: [CONTRIBUTING.md](CONTRIBUTING.md)
77
  - Roadmap: [ROADMAP.md](ROADMAP.md)
 
84
  - Resource automation: [docs/resource_automation.md](docs/resource_automation.md)
85
 
86
  ## Resource Sections
87
+
88
  - Datasets: [resources/datasets/README.md](resources/datasets/README.md)
89
  - Models: [resources/models/README.md](resources/models/README.md)
90
  - Benchmarks: [resources/benchmarks/README.md](resources/benchmarks/README.md)
 
94
  - Code: [resources/codes/README.md](resources/codes/README.md)
95
 
96
  ## Workspaces
97
+
98
  - [data/](data/README.md): datasets, curation, metadata, quality
99
  - [asr/](asr/README.md): ASR baselines and experiments
100
  - [tts/](tts/README.md): TTS baselines and experiments
 
102
  - [experiments/](experiments/README.md): reproducible run cards
103
  - [apps/desktop/](apps/desktop/README.md): user-facing integration references
104
  - [models/](models/README.md): model layout and release conventions
105
+
106
+
107
+
benchmarks/results/templates/tts_result.example.json CHANGED
@@ -4,7 +4,7 @@
4
  "name": "internal_prompt_set",
5
  "version": "v1",
6
  "split": "eval",
7
- "source_url": "https://github.com/Musawer1214/Pukhto_Pashto"
8
  },
9
  "model": {
10
  "name": "facebook/mms-tts",
@@ -26,3 +26,4 @@
26
  "config_ref": "tts/configs/mms_tts_eval.yaml"
27
  }
28
  }
 
 
4
  "name": "internal_prompt_set",
5
  "version": "v1",
6
  "split": "eval",
7
+ "source_url": "https://github.com/Musawer1214/pashto-language-resources"
8
  },
9
  "model": {
10
  "name": "facebook/mms-tts",
 
26
  "config_ref": "tts/configs/mms_tts_eval.yaml"
27
  }
28
  }
29
+
benchmarks/schema/benchmark_result.schema.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "$schema": "https://json-schema.org/draft/2020-12/schema",
3
- "$id": "https://github.com/Musawer1214/Pukhto_Pashto/benchmarks/schema/benchmark_result.schema.json",
4
  "title": "Pashto Benchmark Result",
5
  "type": "object",
6
  "required": [
@@ -67,3 +67,4 @@
67
  },
68
  "additionalProperties": false
69
  }
 
 
1
  {
2
  "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "https://github.com/Musawer1214/pashto-language-resources/benchmarks/schema/benchmark_result.schema.json",
4
  "title": "Pashto Benchmark Result",
5
  "type": "object",
6
  "required": [
 
67
  },
68
  "additionalProperties": false
69
  }
70
+
docs/README.md CHANGED
@@ -15,6 +15,7 @@ This folder is the main documentation entry point for contributors.
15
  - Dataset guidelines: [dataset_guidelines.md](dataset_guidelines.md)
16
  - Pashto normalization policy: [pashto_normalization_v0.1.md](pashto_normalization_v0.1.md)
17
  - Common Voice Pashto integration: [common_voice_pashto_24.md](common_voice_pashto_24.md)
 
18
  - Release process: [release_process.md](release_process.md)
19
  - Release checklist: [release_checklist.md](release_checklist.md)
20
  - Platforms and publish flow: [platforms.md](platforms.md)
 
15
  - Dataset guidelines: [dataset_guidelines.md](dataset_guidelines.md)
16
  - Pashto normalization policy: [pashto_normalization_v0.1.md](pashto_normalization_v0.1.md)
17
  - Common Voice Pashto integration: [common_voice_pashto_24.md](common_voice_pashto_24.md)
18
+ - Discoverability and SEO playbook: [discoverability_seo.md](discoverability_seo.md)
19
  - Release process: [release_process.md](release_process.md)
20
  - Release checklist: [release_checklist.md](release_checklist.md)
21
  - Platforms and publish flow: [platforms.md](platforms.md)
docs/_config.yml CHANGED
@@ -1,4 +1,21 @@
1
- title: Pukhto Pashto
2
- description: Community-led open Pashto AI language project
3
  theme: minima
4
  markdown: kramdown
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ title: Pashto Language Resources Hub
2
+ description: Open Pashto (Pukhto/Pushto) datasets, models, ASR, TTS, NLP, MT, and benchmark resources.
3
  theme: minima
4
  markdown: kramdown
5
+ url: "https://musawer1214.github.io"
6
+ baseurl: "/pashto-language-resources"
7
+ lang: en
8
+ locale: en_US
9
+ plugins:
10
+ - jekyll-seo-tag
11
+ - jekyll-sitemap
12
+
13
+ logo: "https://raw.githubusercontent.com/Musawer1214/pashto-language-resources/main/Repository_banner_Image.png"
14
+
15
+ social:
16
+ name: Pashto Language Resources Hub
17
+ links:
18
+ - https://github.com/Musawer1214/pashto-language-resources
19
+ - https://huggingface.co/Musawer14/pashto-language-resources
20
+
21
+
docs/discoverability_seo.md ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Discoverability and SEO Playbook
2
+
3
+ This playbook focuses on making the repository easier to find in:
4
+
5
+ - GitHub search
6
+ - Google/Bing search
7
+ - Academic and resource discovery channels
8
+
9
+ ## 1) Repository Rename Recommendation
10
+
11
+ Current slug is `pashto-language-resources`.
12
+
13
+ Previous slug was `Pukhto_Pashto`, which was less search-friendly due to underscore and mixed spelling.
14
+
15
+ Recommended slug options:
16
+
17
+ 1. `pashto-language-resources`
18
+ 2. `pashto-ai-resources`
19
+ 3. `pashto-language-tech`
20
+
21
+ Selection rule:
22
+ - Prefer the name that starts with `pashto` and includes a clear intent word like `resources`.
23
+
24
+ After rename:
25
+ - Update `docs/_config.yml` `baseurl`.
26
+ - Update hardcoded URLs in `README.md`, `docs/index.md`, and `docs/search/index.html`.
27
+ - Keep old links alive via GitHub redirect behavior, but still update links in-repo.
28
+
29
+ ## 2) GitHub About Section (Manual UI)
30
+
31
+ Set these in the repository's `Settings -> General` page and the About panel:
32
+
33
+ - Description:
34
+ - `Open-source Pashto (Pukhto/Pushto) datasets, ASR, TTS, NLP, MT, models, and benchmark resources.`
35
+ - Website:
36
+ - [GitHub Pages home](https://musawer1214.github.io/pashto-language-resources/) (or new slug after rename)
37
+ - Topics:
38
+ - `pashto`
39
+ - `pukhto`
40
+ - `pushto`
41
+ - `asr`
42
+ - `tts`
43
+ - `nlp`
44
+ - `machine-translation`
45
+ - `speech-recognition`
46
+ - `language-resources`
47
+ - `low-resource-languages`
48
+
49
+ ## 3) Content Signals
50
+
51
+ - Keep the first 160 characters of `README.md` clear and keyword-rich.
52
+ - Use consistent terminology across pages: `Pashto (Pukhto/Pushto)`.
53
+ - Publish regular updates in `CHANGELOG.md` and GitHub Releases.
54
+ - Keep `CITATION.cff` updated for scholarly reuse and citation.
55
+
56
+ ## 4) Pages SEO and Crawlability
57
+
58
+ Already included in this repository:
59
+
60
+ - `docs/_config.yml` with sitemap and SEO plugin support.
61
+ - `docs/robots.txt` with sitemap reference.
62
+ - Page-level metadata and structured data in `docs/search/index.html`.
63
+
64
+ Keep these updated when renaming slug or domain.
65
+
66
+ ## 5) External Discovery Boost
67
+
68
+ - Add the GitHub Pages search URL to:
69
+ - Hugging Face model/dataset cards
70
+ - Relevant community profiles and README links
71
+ - Conference/demo pages for Pashto language technology
72
+ - Ask contributors to link specific resource pages in blog posts or papers.
73
+
74
+ ## 6) Indexing Checklist (After Push)
75
+
76
+ 1. Push all changes to `main`.
77
+ 2. Verify GitHub Pages is serving:
78
+ - `/`
79
+ - `/search/`
80
+ - `/robots.txt`
81
+ - `/sitemap.xml`
82
+ 3. Add site property in Google Search Console.
83
+ 4. Submit sitemap URL:
84
+ - [Sitemap file](https://musawer1214.github.io/pashto-language-resources/sitemap.xml)
85
+ 5. Run URL Inspection and request indexing for:
86
+ - Home page
87
+ - Search page
88
+ 6. Recheck search visibility after 1 to 3 weeks.
89
+
docs/github_operations.md CHANGED
@@ -1,21 +1,32 @@
1
- # 🧰 GitHub Operations Guide
2
 
3
- This guide covers repository operations that can be versioned in code and
4
- manual steps that must be done in the GitHub web UI.
 
5
 
6
- ## In-Repo Operations (already versioned)
7
  - Issue templates: [../.github/ISSUE_TEMPLATE/](../.github/ISSUE_TEMPLATE/)
8
  - PR template: [../.github/PULL_REQUEST_TEMPLATE.md](../.github/PULL_REQUEST_TEMPLATE.md)
9
  - CI workflow: [../.github/workflows/ci.yml](../.github/workflows/ci.yml)
10
  - Release template: [../.github/release_template.md](../.github/release_template.md)
11
 
12
- ## Manual GitHub UI Steps (not stored in repo)
13
- - Create/update GitHub Project board.
 
14
  - Define milestone dates.
15
  - Configure branch protection rules.
16
  - Configure required status checks.
 
 
 
 
 
 
 
 
 
17
 
18
  ## Recommended Project Board Columns
 
19
  1. Backlog
20
  2. Ready
21
  3. In Progress
 
1
+ # GitHub Operations Guide
2
 
3
+ This guide covers repository operations that can be versioned in code and manual steps that must be done in the GitHub web UI.
4
+
5
+ ## In-Repo Operations (Versioned)
6
 
 
7
  - Issue templates: [../.github/ISSUE_TEMPLATE/](../.github/ISSUE_TEMPLATE/)
8
  - PR template: [../.github/PULL_REQUEST_TEMPLATE.md](../.github/PULL_REQUEST_TEMPLATE.md)
9
  - CI workflow: [../.github/workflows/ci.yml](../.github/workflows/ci.yml)
10
  - Release template: [../.github/release_template.md](../.github/release_template.md)
11
 
12
+ ## Manual GitHub UI Steps (Not Stored in Repo)
13
+
14
+ - Create or update GitHub Project board.
15
  - Define milestone dates.
16
  - Configure branch protection rules.
17
  - Configure required status checks.
18
+ - Update repository About description, website, and topics.
19
+ - Upload social preview image.
20
+
21
+ ## Discoverability-Specific GitHub UI Tasks
22
+
23
+ - About description should include: `Pashto`, `ASR`, `TTS`, `NLP`, and `machine translation`.
24
+ - Topics should include at least: `pashto`, `pukhto`, `pushto`, `asr`, `tts`, `nlp`, `language-resources`.
25
+ - Website should point to GitHub Pages search or docs home.
26
+ - After renaming the repository, update pinned repositories and profile links.
27
 
28
  ## Recommended Project Board Columns
29
+
30
  1. Backlog
31
  2. Ready
32
  3. In Progress
docs/index.md CHANGED
@@ -1,16 +1,17 @@
1
  ---
2
  layout: default
3
- title: About Pukhto Pashto
 
4
  ---
5
 
6
- # About This Repository
7
 
8
- `Pukhto_Pashto` is a community-led open project focused on making Pashto a first-class language in speech and language AI.
9
 
10
  ## Mission
11
 
12
- - Build open Pashto datasets for ASR, TTS, and NLP.
13
- - Publish reproducible baseline models and evaluation workflows.
14
  - Keep progress transparent, contributor-friendly, and public-benefit oriented.
15
 
16
  ## What Is In This Repository
@@ -23,28 +24,41 @@ title: About Pukhto Pashto
23
  - `docs/`: policies, roadmap, release process, and operating guides.
24
  - `resources/`: verified external Pashto datasets, models, tools, benchmarks, and papers.
25
 
26
- ## Search Resources
27
 
28
  - Search UI: [Pashto Resource Search](search/)
29
  - Resource index docs: [resource_catalog.md](resource_catalog.md)
30
- - Machine-readable catalog: [../resources/catalog/resources.json](../resources/catalog/resources.json)
31
 
32
  ## Project References
33
 
34
- - Repository: [Musawer1214/Pukhto_Pashto](https://github.com/Musawer1214/Pukhto_Pashto)
35
- - Hugging Face: [Musawer14/Pukhto_Pashto](https://huggingface.co/Musawer14/Pukhto_Pashto)
36
  - Purpose: [PROJECT_PURPOSE.md](../PROJECT_PURPOSE.md)
37
  - Roadmap: [ROADMAP.md](../ROADMAP.md)
38
  - Contributing: [CONTRIBUTING.md](../CONTRIBUTING.md)
39
 
40
  ## Contributing
41
 
42
- You can help by improving documentation, validating normalization rows, sharing verified resources, or contributing data and evaluation workflows.
43
 
44
  For contributor workflow and standards, start at:
45
  - [docs/README.md](README.md)
46
  - [community/COMMUNICATION.md](../community/COMMUNICATION.md)
47
 
 
 
 
 
 
 
 
 
 
 
 
48
  ## License
49
 
50
  This project is released under Apache 2.0. See [LICENSE](../LICENSE).
 
 
 
1
  ---
2
  layout: default
3
+ title: Pashto Language Resources Hub
4
+ description: Open-source Pashto (Pukhto/Pushto) datasets, models, benchmarks, ASR, TTS, NLP, and MT resources.
5
  ---
6
 
7
+ # Pashto Language Resources Hub
8
 
9
+ `pashto-language-resources` is a community-led open-source project focused on making Pashto (also written as Pukhto/Pushto) a first-class language in speech and language AI.
10
 
11
  ## Mission
12
 
13
+ - Build open Pashto datasets and resource indexes for ASR, TTS, NLP, and MT.
14
+ - Publish reproducible baseline models, benchmark schemas, and evaluation workflows.
15
  - Keep progress transparent, contributor-friendly, and public-benefit oriented.
16
 
17
  ## What Is In This Repository
 
24
  - `docs/`: policies, roadmap, release process, and operating guides.
25
  - `resources/`: verified external Pashto datasets, models, tools, benchmarks, and papers.
26
 
27
+ ## Search Pashto Resources
28
 
29
  - Search UI: [Pashto Resource Search](search/)
30
  - Resource index docs: [resource_catalog.md](resource_catalog.md)
31
+ - Machine-readable catalog (GitHub): [resources.json source](https://github.com/Musawer1214/pashto-language-resources/blob/main/resources/catalog/resources.json)
32
 
33
  ## Project References
34
 
35
+ - Repository: [Musawer1214/pashto-language-resources](https://github.com/Musawer1214/pashto-language-resources)
36
+ - Hugging Face: [Musawer14/pashto-language-resources](https://huggingface.co/Musawer14/pashto-language-resources)
37
  - Purpose: [PROJECT_PURPOSE.md](../PROJECT_PURPOSE.md)
38
  - Roadmap: [ROADMAP.md](../ROADMAP.md)
39
  - Contributing: [CONTRIBUTING.md](../CONTRIBUTING.md)
40
 
41
  ## Contributing
42
 
43
+ You can help by improving documentation, validating normalization rows, sharing verified resources, or contributing data, model, and evaluation workflows.
44
 
45
  For contributor workflow and standards, start at:
46
  - [docs/README.md](README.md)
47
  - [community/COMMUNICATION.md](../community/COMMUNICATION.md)
48
 
49
+ ## Search Terms
50
+
51
+ This project is relevant to searches like:
52
+
53
+ - Pashto datasets
54
+ - Pashto ASR model
55
+ - Pashto TTS resources
56
+ - Pashto NLP benchmark
57
+ - Pashto language technology
58
+ - Pukhto language resources
59
+
60
  ## License
61
 
62
  This project is released under Apache 2.0. See [LICENSE](../LICENSE).
63
+
64
+
docs/platforms.md CHANGED
@@ -1,27 +1,39 @@
1
- # 🌐 Platforms
2
 
3
- ## 🧭 Primary Platforms
4
- - GitHub: code, issues, pull requests, releases
5
- - Hugging Face Hub: models, datasets, demos
6
- - Community chat (Discord/Matrix): contributor coordination
 
 
 
7
 
8
- ## 📚 Resource Discovery and Validation
9
  - Use [docs/resource_catalog.md](resource_catalog.md) as the single source of truth for validated external resources.
10
  - Add new links only after checking official pages and explicit Pashto support markers.
11
 
12
- ## 📣 Publishing Expectations
13
- - Every release links to changelog + benchmark snapshot.
14
- - Every model links to dataset provenance and eval metrics.
15
- - Every new external link must include use-case notes and where it belongs in repo structure.
16
- - CI should pass before merging (`.github/workflows/ci.yml`).
 
 
 
17
 
18
- ## 🚀 Dual Publish Checklist (GitHub + Hugging Face)
19
  1. `git status` is clean except intended changes.
20
- 2. Docs and resource links updated.
21
- 3. Commit created with clear scope.
22
  4. Push to `origin` (GitHub).
23
  5. Push to `hf` (Hugging Face).
24
  6. Verify README render and link health on both platforms.
25
 
26
- ## 🧰 Operations Guide
 
 
 
 
 
 
 
27
  - GitHub operations and manual UI tasks: [github_operations.md](github_operations.md)
 
 
1
+ # Platforms
2
 
3
+ ## Primary Platforms
4
+
5
+ - GitHub: code, issues, pull requests, releases, and docs source.
6
+ - Hugging Face Hub: models, datasets, and demos.
7
+ - Community chat (Discord/Matrix): contributor coordination.
8
+
9
+ ## Resource Discovery and Validation
10
 
 
11
  - Use [docs/resource_catalog.md](resource_catalog.md) as the single source of truth for validated external resources.
12
  - Add new links only after checking official pages and explicit Pashto support markers.
13
 
14
+ ## Publishing Expectations
15
+
16
+ - Every release links to changelog and benchmark snapshot.
17
+ - Every model links to dataset provenance and evaluation metrics.
18
+ - Every new external link includes use-case notes and target location in repo structure.
19
+ - CI must pass before merging (`.github/workflows/ci.yml`).
20
+
21
+ ## Dual Publish Checklist (GitHub and Hugging Face)
22
 
 
23
  1. `git status` is clean except intended changes.
24
+ 2. Docs and resource links are updated.
25
+ 3. Commit message is scoped and explicit.
26
  4. Push to `origin` (GitHub).
27
  5. Push to `hf` (Hugging Face).
28
  6. Verify README render and link health on both platforms.
29
 
30
+ ## Discoverability Checklist
31
+
32
+ - Keep GitHub About description, topics, and website URL updated.
33
+ - Keep [docs/discoverability_seo.md](discoverability_seo.md) current with slug and sitemap URLs.
34
+ - Ensure links from Hugging Face cards point to both the repository and search page.
35
+
36
+ ## Operations Guide
37
+
38
  - GitHub operations and manual UI tasks: [github_operations.md](github_operations.md)
39
+
docs/resource_automation.md CHANGED
@@ -13,7 +13,11 @@ This repository uses a semi-automated process to keep Pashto resources current w
13
  - Hugging Face models
14
  - Hugging Face Spaces (projects)
15
  - GitHub repositories (projects and code)
16
- - Research-paper endpoints
 
 
 
 
17
 
18
  ## Files involved
19
  - Canonical verified catalog: [../resources/catalog/resources.json](../resources/catalog/resources.json)
 
13
  - Hugging Face models
14
  - Hugging Face Spaces (projects)
15
  - GitHub repositories (projects and code)
16
+ - GitLab repositories (projects and code)
17
+ - Zenodo records
18
+ - Dataverse datasets
19
+ - DataCite DOI records
20
+ - Research-paper endpoints (arXiv, Semantic Scholar, OpenAlex, Crossref)
21
 
22
  ## Files involved
23
  - Canonical verified catalog: [../resources/catalog/resources.json](../resources/catalog/resources.json)
docs/resource_cycle_runbook.md CHANGED
@@ -21,7 +21,7 @@ What it executes:
21
  4. `python scripts/check_links.py`
22
  5. `python -m pytest -q`
23
 
24
- Candidate sources in the sync step include Kaggle datasets, Hugging Face datasets/models/spaces, GitHub repositories, and paper endpoints.
25
 
26
  ## Discovery-only mode
27
  If you only want fresh candidates:
 
21
  4. `python scripts/check_links.py`
22
  5. `python -m pytest -q`
23
 
24
+ Candidate sources in the sync step include Kaggle datasets, Hugging Face datasets/models/spaces, GitHub repositories, GitLab repositories, Zenodo records, Dataverse datasets, DataCite DOI records, and paper endpoints (arXiv, Semantic Scholar, OpenAlex, Crossref).
25
 
26
  ## Discovery-only mode
27
  If you only want fresh candidates:
docs/robots.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ User-agent: *
2
+ Allow: /
3
+
4
+ Sitemap: https://musawer1214.github.io/pashto-language-resources/sitemap.xml
5
+
docs/search/index.html CHANGED
@@ -1,9 +1,46 @@
 
 
 
 
 
 
1
  <!doctype html>
2
  <html lang="en">
3
  <head>
4
  <meta charset="utf-8">
5
  <meta name="viewport" content="width=device-width, initial-scale=1">
6
  <title>Pashto Resource Search</title>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  <link rel="preconnect" href="https://fonts.googleapis.com">
8
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
9
  <link href="https://fonts.googleapis.com/css2?family=IBM+Plex+Sans+Arabic:wght@400;500;700&family=Space+Grotesk:wght@500;700&display=swap" rel="stylesheet">
@@ -131,6 +168,26 @@
131
  font-weight: 600;
132
  }
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  .grid {
135
  display: grid;
136
  grid-template-columns: repeat(auto-fill, minmax(260px, 1fr));
@@ -273,6 +330,26 @@
273
  <span class="badge" id="generatedAt">Catalog timestamp: -</span>
274
  </div>
275
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  <ul id="results" class="grid"></ul>
277
  </main>
278
 
@@ -416,3 +493,4 @@
416
  </script>
417
  </body>
418
  </html>
 
 
1
+ ---
2
+ layout: null
3
+ title: Pashto Resource Search
4
+ description: Search verified Pashto (Pukhto/Pushto) datasets, models, tools, benchmarks, papers, and projects.
5
+ permalink: /search/
6
+ ---
7
  <!doctype html>
8
  <html lang="en">
9
  <head>
10
  <meta charset="utf-8">
11
  <meta name="viewport" content="width=device-width, initial-scale=1">
12
  <title>Pashto Resource Search</title>
13
+ <meta name="description" content="Search verified Pashto (Pukhto/Pushto) resources for ASR, TTS, NLP, machine translation, tools, datasets, models, and benchmarks.">
14
+ <meta name="keywords" content="Pashto resources, Pukhto resources, Pashto datasets, Pashto ASR, Pashto TTS, Pashto NLP, Pashto machine translation, Pashto benchmarks">
15
+ <meta name="robots" content="index,follow,max-image-preview:large">
16
+ <link rel="canonical" href="https://musawer1214.github.io/pashto-language-resources/search/">
17
+ <meta property="og:type" content="website">
18
+ <meta property="og:site_name" content="Pashto Language Resources Hub">
19
+ <meta property="og:title" content="Pashto Resource Search">
20
+ <meta property="og:description" content="Discover verified Pashto datasets, models, tools, benchmarks, projects, and papers.">
21
+ <meta property="og:url" content="https://musawer1214.github.io/pashto-language-resources/search/">
22
+ <meta property="og:image" content="https://raw.githubusercontent.com/Musawer1214/pashto-language-resources/main/Repository_banner_Image.png">
23
+ <meta name="twitter:card" content="summary">
24
+ <meta name="twitter:title" content="Pashto Resource Search">
25
+ <meta name="twitter:description" content="Search and filter verified Pashto language technology resources.">
26
+ <meta name="twitter:image" content="https://raw.githubusercontent.com/Musawer1214/pashto-language-resources/main/Repository_banner_Image.png">
27
+ <script type="application/ld+json">
28
+ {
29
+ "@context": "https://schema.org",
30
+ "@type": "CollectionPage",
31
+ "name": "Pashto Resource Search",
32
+ "url": "https://musawer1214.github.io/pashto-language-resources/search/",
33
+ "description": "Search verified and candidate Pashto resources for ASR, TTS, NLP, MT, tools, datasets, and benchmarks.",
34
+ "inLanguage": "en",
35
+ "about": [
36
+ "Pashto datasets",
37
+ "Pashto ASR",
38
+ "Pashto TTS",
39
+ "Pashto NLP",
40
+ "Pashto machine translation"
41
+ ]
42
+ }
43
+ </script>
44
  <link rel="preconnect" href="https://fonts.googleapis.com">
45
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
46
  <link href="https://fonts.googleapis.com/css2?family=IBM+Plex+Sans+Arabic:wght@400;500;700&family=Space+Grotesk:wght@500;700&display=swap" rel="stylesheet">
 
168
  font-weight: 600;
169
  }
170
 
171
+ .crawl {
172
+ margin: 8px 2px 16px;
173
+ color: var(--muted);
174
+ font-size: 14px;
175
+ line-height: 1.5;
176
+ }
177
+
178
+ .crawl p {
179
+ margin: 6px 0;
180
+ }
181
+
182
+ .crawl-links {
183
+ margin: 8px 0 0;
184
+ padding-left: 18px;
185
+ }
186
+
187
+ .crawl-links li {
188
+ margin: 3px 0;
189
+ }
190
+
191
  .grid {
192
  display: grid;
193
  grid-template-columns: repeat(auto-fill, minmax(260px, 1fr));
 
330
  <span class="badge" id="generatedAt">Catalog timestamp: -</span>
331
  </div>
332
 
333
+ <section class="crawl" aria-label="Pashto resource categories">
334
+ <p>
335
+ Browse or search resource entries covering Pashto datasets, speech recognition (ASR), text-to-speech (TTS), NLP, translation, tools, and benchmarks.
336
+ </p>
337
+ <p>
338
+ Project overview: <a href="https://github.com/Musawer1214/pashto-language-resources">Musawer1214/pashto-language-resources</a>
339
+ </p>
340
+ <ul class="crawl-links">
341
+ <li><a href="https://github.com/Musawer1214/pashto-language-resources/blob/main/resources/datasets/README.md">Pashto datasets</a></li>
342
+ <li><a href="https://github.com/Musawer1214/pashto-language-resources/blob/main/resources/models/README.md">Pashto models</a></li>
343
+ <li><a href="https://github.com/Musawer1214/pashto-language-resources/blob/main/resources/benchmarks/README.md">Pashto benchmarks</a></li>
344
+ <li><a href="https://github.com/Musawer1214/pashto-language-resources/blob/main/resources/tools/README.md">Pashto tools</a></li>
345
+ <li><a href="https://github.com/Musawer1214/pashto-language-resources/blob/main/resources/papers/README.md">Pashto papers</a></li>
346
+ </ul>
347
+ </section>
348
+
349
+ <noscript>
350
+ JavaScript is needed for filtering, but the linked category pages above remain accessible.
351
+ </noscript>
352
+
353
  <ul id="results" class="grid"></ul>
354
  </main>
355
 
 
493
  </script>
494
  </body>
495
  </html>
496
+
docs/search/resources.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "generated_on": "2026-02-16T00:00:00Z",
3
- "count": 75,
4
  "resources": [
5
  {
6
  "id": "dataset-common-voice-ps-v24",
@@ -1846,6 +1846,517 @@
1846
  "markers": [
1847
  "pashto"
1848
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1849
  }
1850
  ]
1851
  }
 
1
  {
2
+ "generated_on": "2026-02-17T00:00:00Z",
3
+ "count": 95,
4
  "resources": [
5
  {
6
  "id": "dataset-common-voice-ps-v24",
 
1846
  "markers": [
1847
  "pashto"
1848
  ]
1849
+ },
1850
+ {
1851
+ "id": "dataset-hf-aamirhs-pashto-audio-wav2vec",
1852
+ "title": "aamirhs/pashto-audio-wav2vec",
1853
+ "url": "https://huggingface.co/datasets/aamirhs/pashto-audio-wav2vec",
1854
+ "category": "dataset",
1855
+ "source": "huggingface",
1856
+ "status": "verified",
1857
+ "summary": "Pashto speech dataset surfaced from Hugging Face candidate sync for ASR experiments.",
1858
+ "primary_use": "Pashto ASR data exploration and baseline training",
1859
+ "tasks": [
1860
+ "asr"
1861
+ ],
1862
+ "tags": [
1863
+ "pashto",
1864
+ "dataset",
1865
+ "huggingface",
1866
+ "speech",
1867
+ "asr"
1868
+ ],
1869
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1870
+ "evidence_url": "https://huggingface.co/datasets/aamirhs/pashto-audio-wav2vec",
1871
+ "markers": [
1872
+ "pashto"
1873
+ ]
1874
+ },
1875
+ {
1876
+ "id": "dataset-hf-alimuhammad73-pashto-poetry",
1877
+ "title": "AliMuhammad73/Pashto-Poetry",
1878
+ "url": "https://huggingface.co/datasets/AliMuhammad73/Pashto-Poetry",
1879
+ "category": "dataset",
1880
+ "source": "huggingface",
1881
+ "status": "verified",
1882
+ "summary": "Pashto poetry text dataset surfaced from Hugging Face candidate sync for NLP experiments.",
1883
+ "primary_use": "Pashto poetry corpus for language modeling and text analysis",
1884
+ "tasks": [
1885
+ "nlp"
1886
+ ],
1887
+ "tags": [
1888
+ "pashto",
1889
+ "dataset",
1890
+ "huggingface",
1891
+ "text",
1892
+ "poetry",
1893
+ "nlp"
1894
+ ],
1895
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1896
+ "evidence_url": "https://huggingface.co/datasets/AliMuhammad73/Pashto-Poetry",
1897
+ "markers": [
1898
+ "pashto"
1899
+ ]
1900
+ },
1901
+ {
1902
+ "id": "model-hf-aamirhs-wav2vec2-large-xls-r-300m-pashto-colab",
1903
+ "title": "aamirhs/wav2vec2-large-xls-r-300m-pashto-colab",
1904
+ "url": "https://huggingface.co/aamirhs/wav2vec2-large-xls-r-300m-pashto-colab",
1905
+ "category": "model",
1906
+ "source": "huggingface",
1907
+ "status": "verified",
1908
+ "summary": "Pashto ASR model checkpoint surfaced from Hugging Face candidate sync.",
1909
+ "primary_use": "Pashto ASR baseline and transfer-learning comparison",
1910
+ "tasks": [
1911
+ "asr"
1912
+ ],
1913
+ "tags": [
1914
+ "pashto",
1915
+ "model",
1916
+ "huggingface",
1917
+ "asr"
1918
+ ],
1919
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1920
+ "evidence_url": "https://huggingface.co/aamirhs/wav2vec2-large-xls-r-300m-pashto-colab",
1921
+ "markers": [
1922
+ "pashto"
1923
+ ]
1924
+ },
1925
+ {
1926
+ "id": "project-hf-space-aizazayyubi-pashto-asr",
1927
+ "title": "Aizazayyubi/pashto_asr",
1928
+ "url": "https://huggingface.co/spaces/Aizazayyubi/pashto_asr",
1929
+ "category": "project",
1930
+ "source": "huggingface",
1931
+ "status": "verified",
1932
+ "summary": "Pashto ASR interactive demo surfaced from Hugging Face Spaces candidate sync.",
1933
+ "primary_use": "Interactive Pashto ASR demo for qualitative evaluation",
1934
+ "tasks": [
1935
+ "asr",
1936
+ "demo"
1937
+ ],
1938
+ "tags": [
1939
+ "pashto",
1940
+ "project",
1941
+ "huggingface",
1942
+ "asr",
1943
+ "demo"
1944
+ ],
1945
+ "evidence_text": "Matched by Pashto keyword in Hugging Face Spaces search.",
1946
+ "evidence_url": "https://huggingface.co/spaces/Aizazayyubi/pashto_asr",
1947
+ "markers": [
1948
+ "pashto"
1949
+ ]
1950
+ },
1951
+ {
1952
+ "id": "paper-arxiv-from-scarcity-to-scale-pashto-common-voice",
1953
+ "title": "From Scarcity to Scale: A Release-Level Analysis of the Pashto Common Voice Dataset",
1954
+ "url": "http://arxiv.org/abs/2602.14062v1",
1955
+ "category": "paper",
1956
+ "source": "arxiv",
1957
+ "status": "verified",
1958
+ "summary": "Research paper analyzing Pashto Common Voice releases and dataset scaling characteristics.",
1959
+ "primary_use": "ASR data quality and release trend reference",
1960
+ "tasks": [
1961
+ "asr",
1962
+ "benchmarking"
1963
+ ],
1964
+ "tags": [
1965
+ "pashto",
1966
+ "paper",
1967
+ "arxiv",
1968
+ "asr",
1969
+ "common-voice"
1970
+ ],
1971
+ "evidence_text": "Matched by Pashto marker in paper title from arXiv query results.",
1972
+ "evidence_url": "http://arxiv.org/abs/2602.14062v1",
1973
+ "markers": [
1974
+ "pashto"
1975
+ ]
1976
+ },
1977
+ {
1978
+ "id": "paper-arxiv-tuning-traditional-pashto-text-classification",
1979
+ "title": "Tuning Traditional Language Processing Approaches for Pashto Text Classification",
1980
+ "url": "http://arxiv.org/abs/2305.03737v1",
1981
+ "category": "paper",
1982
+ "source": "arxiv",
1983
+ "status": "verified",
1984
+ "summary": "Research paper focused on Pashto text classification using traditional NLP approaches.",
1985
+ "primary_use": "Pashto text classification method reference",
1986
+ "tasks": [
1987
+ "nlp"
1988
+ ],
1989
+ "tags": [
1990
+ "pashto",
1991
+ "paper",
1992
+ "arxiv",
1993
+ "nlp",
1994
+ "classification"
1995
+ ],
1996
+ "evidence_text": "Matched by Pashto marker in paper title from arXiv query results.",
1997
+ "evidence_url": "http://arxiv.org/abs/2305.03737v1",
1998
+ "markers": [
1999
+ "pashto"
2000
+ ]
2001
+ },
2002
+ {
2003
+ "id": "dataset-dataverse-iarpa-babel-pashto-language-pack-v0-4by",
2004
+ "title": "IARPA Babel Pashto Language Pack IARPA-babel104b-v0.4bY",
2005
+ "url": "https://hdl.handle.net/11272.1/AB2/GLFN3X",
2006
+ "category": "dataset",
2007
+ "source": "dataverse",
2008
+ "status": "verified",
2009
+ "summary": "Pashto Babel language pack dataset for speech and language processing evaluation.",
2010
+ "primary_use": "Pashto speech dataset for ASR and language identification experiments",
2011
+ "tasks": [
2012
+ "asr",
2013
+ "benchmarking"
2014
+ ],
2015
+ "tags": [
2016
+ "pashto",
2017
+ "dataset",
2018
+ "dataverse",
2019
+ "speech",
2020
+ "asr",
2021
+ "babel"
2022
+ ],
2023
+ "evidence_text": "Dataverse metadata includes Pashto markers in dataset title or description.",
2024
+ "evidence_url": "https://hdl.handle.net/11272.1/AB2/GLFN3X",
2025
+ "markers": [
2026
+ "pashto"
2027
+ ]
2028
+ },
2029
+ {
2030
+ "id": "paper-arxiv-image-to-text-pashto-farsi-traditional-chinese",
2031
+ "title": "Development of a New Image-to-text Conversion System for Pashto, Farsi and Traditional Chinese",
2032
+ "url": "http://arxiv.org/abs/2005.08650v1",
2033
+ "category": "paper",
2034
+ "source": "arxiv",
2035
+ "status": "verified",
2036
+ "summary": "Research paper on image-to-text conversion including Pashto OCR.",
2037
+ "primary_use": "Pashto OCR method reference",
2038
+ "tasks": [
2039
+ "ocr",
2040
+ "nlp"
2041
+ ],
2042
+ "tags": [
2043
+ "pashto",
2044
+ "paper",
2045
+ "arxiv",
2046
+ "ocr"
2047
+ ],
2048
+ "evidence_text": "Matched by Pashto marker in paper title from arXiv query results.",
2049
+ "evidence_url": "http://arxiv.org/abs/2005.08650v1",
2050
+ "markers": [
2051
+ "pashto"
2052
+ ]
2053
+ },
2054
+ {
2055
+ "id": "paper-openalex-benchmark-pashto-handwritten-character-dataset-ocr",
2056
+ "title": "Benchmark Pashto Handwritten Character Dataset and Pashto Object Character Recognition (OCR) Using Deep Neural Network with Rule Activation Function",
2057
+ "url": "https://doi.org/10.1155/2021/6669672",
2058
+ "category": "paper",
2059
+ "source": "openalex",
2060
+ "status": "verified",
2061
+ "summary": "Research paper introducing a benchmark dataset and OCR approach for Pashto handwritten characters.",
2062
+ "primary_use": "Pashto handwritten OCR benchmark and methodology reference",
2063
+ "tasks": [
2064
+ "ocr",
2065
+ "benchmarking"
2066
+ ],
2067
+ "tags": [
2068
+ "pashto",
2069
+ "paper",
2070
+ "openalex",
2071
+ "ocr",
2072
+ "benchmark"
2073
+ ],
2074
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2075
+ "evidence_url": "https://doi.org/10.1155/2021/6669672",
2076
+ "markers": [
2077
+ "pashto"
2078
+ ]
2079
+ },
2080
+ {
2081
+ "id": "paper-openalex-asr-isolated-pashto-spoken-digits-mfcc-knn",
2082
+ "title": "Database development and automatic speech recognition of isolated Pashto spoken digits using MFCC and K-NN",
2083
+ "url": "https://doi.org/10.1007/s10772-014-9267-z",
2084
+ "category": "paper",
2085
+ "source": "openalex",
2086
+ "status": "verified",
2087
+ "summary": "Research paper on isolated Pashto spoken-digit ASR with MFCC and K-NN.",
2088
+ "primary_use": "Pashto ASR baseline method reference for digit recognition",
2089
+ "tasks": [
2090
+ "asr"
2091
+ ],
2092
+ "tags": [
2093
+ "pashto",
2094
+ "paper",
2095
+ "openalex",
2096
+ "asr",
2097
+ "speech"
2098
+ ],
2099
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2100
+ "evidence_url": "https://doi.org/10.1007/s10772-014-9267-z",
2101
+ "markers": [
2102
+ "pashto"
2103
+ ]
2104
+ },
2105
+ {
2106
+ "id": "paper-openalex-pashto-isolated-digits-recognition-dcnn",
2107
+ "title": "Pashto isolated digits recognition using deep convolutional neural network",
2108
+ "url": "https://doi.org/10.1016/j.heliyon.2020.e03372",
2109
+ "category": "paper",
2110
+ "source": "openalex",
2111
+ "status": "verified",
2112
+ "summary": "Research paper on Pashto isolated-digit recognition using deep convolutional neural networks.",
2113
+ "primary_use": "Pashto speech recognition research reference",
2114
+ "tasks": [
2115
+ "asr"
2116
+ ],
2117
+ "tags": [
2118
+ "pashto",
2119
+ "paper",
2120
+ "openalex",
2121
+ "asr",
2122
+ "deep-learning"
2123
+ ],
2124
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2125
+ "evidence_url": "https://doi.org/10.1016/j.heliyon.2020.e03372",
2126
+ "markers": [
2127
+ "pashto"
2128
+ ]
2129
+ },
2130
+ {
2131
+ "id": "paper-openalex-pashto-offensive-language-detection-benchmark-bert",
2132
+ "title": "Pashto offensive language detection: a benchmark dataset and monolingual Pashto BERT",
2133
+ "url": "https://doi.org/10.7717/peerj-cs.1617",
2134
+ "category": "paper",
2135
+ "source": "openalex",
2136
+ "status": "verified",
2137
+ "summary": "Research paper on Pashto offensive language detection with benchmark dataset and monolingual BERT model.",
2138
+ "primary_use": "Pashto NLP toxicity detection benchmark and model reference",
2139
+ "tasks": [
2140
+ "nlp",
2141
+ "benchmarking"
2142
+ ],
2143
+ "tags": [
2144
+ "pashto",
2145
+ "paper",
2146
+ "openalex",
2147
+ "nlp",
2148
+ "bert",
2149
+ "benchmark"
2150
+ ],
2151
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2152
+ "evidence_url": "https://doi.org/10.7717/peerj-cs.1617",
2153
+ "markers": [
2154
+ "pashto"
2155
+ ]
2156
+ },
2157
+ {
2158
+ "id": "paper-openalex-phti-pashto-handwritten-text-imagebase",
2159
+ "title": "PHTI: Pashto Handwritten Text Imagebase for Deep Learning Applications",
2160
+ "url": "https://doi.org/10.1109/access.2022.3216881",
2161
+ "category": "paper",
2162
+ "source": "openalex",
2163
+ "status": "verified",
2164
+ "summary": "Research paper describing PHTI, a Pashto handwritten text imagebase for deep learning.",
2165
+ "primary_use": "Pashto OCR dataset and benchmark reference",
2166
+ "tasks": [
2167
+ "ocr",
2168
+ "benchmarking"
2169
+ ],
2170
+ "tags": [
2171
+ "pashto",
2172
+ "paper",
2173
+ "openalex",
2174
+ "ocr",
2175
+ "dataset"
2176
+ ],
2177
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2178
+ "evidence_url": "https://doi.org/10.1109/access.2022.3216881",
2179
+ "markers": [
2180
+ "pashto"
2181
+ ]
2182
+ },
2183
+ {
2184
+ "id": "paper-openalex-recognition-of-pashto-handwritten-characters-deep-learning",
2185
+ "title": "Recognition of Pashto Handwritten Characters Based on Deep Learning",
2186
+ "url": "https://doi.org/10.3390/s20205884",
2187
+ "category": "paper",
2188
+ "source": "openalex",
2189
+ "status": "verified",
2190
+ "summary": "Research paper on deep-learning-based recognition of Pashto handwritten characters.",
2191
+ "primary_use": "Pashto OCR model reference for handwritten character recognition",
2192
+ "tasks": [
2193
+ "ocr"
2194
+ ],
2195
+ "tags": [
2196
+ "pashto",
2197
+ "paper",
2198
+ "openalex",
2199
+ "ocr",
2200
+ "deep-learning"
2201
+ ],
2202
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2203
+ "evidence_url": "https://doi.org/10.3390/s20205884",
2204
+ "markers": [
2205
+ "pashto"
2206
+ ]
2207
+ },
2208
+ {
2209
+ "id": "paper-openalex-kpti-katib-pashto-text-imagebase-benchmark",
2210
+ "title": "KPTI: Katib's Pashto Text Imagebase and Deep Learning Benchmark",
2211
+ "url": "https://doi.org/10.1109/icfhr.2016.0090",
2212
+ "category": "paper",
2213
+ "source": "openalex",
2214
+ "status": "verified",
2215
+ "summary": "Research paper introducing KPTI, a Pashto text imagebase and benchmark for handwritten recognition.",
2216
+ "primary_use": "Pashto OCR dataset and benchmarking reference",
2217
+ "tasks": [
2218
+ "ocr",
2219
+ "benchmarking"
2220
+ ],
2221
+ "tags": [
2222
+ "pashto",
2223
+ "paper",
2224
+ "openalex",
2225
+ "ocr",
2226
+ "benchmark"
2227
+ ],
2228
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2229
+ "evidence_url": "https://doi.org/10.1109/icfhr.2016.0090",
2230
+ "markers": [
2231
+ "pashto"
2232
+ ]
2233
+ },
2234
+ {
2235
+ "id": "paper-openalex-pioneer-dataset-handwritten-pashto-cnn",
2236
+ "title": "Pioneer dataset and recognition of Handwritten Pashto characters using Convolution Neural Networks",
2237
+ "url": "https://doi.org/10.1177/0020294020964826",
2238
+ "category": "paper",
2239
+ "source": "openalex",
2240
+ "status": "verified",
2241
+ "summary": "Research paper on a pioneer handwritten Pashto character dataset with CNN-based recognition.",
2242
+ "primary_use": "Pashto handwritten character recognition reference",
2243
+ "tasks": [
2244
+ "ocr",
2245
+ "benchmarking"
2246
+ ],
2247
+ "tags": [
2248
+ "pashto",
2249
+ "paper",
2250
+ "openalex",
2251
+ "ocr",
2252
+ "deep-learning"
2253
+ ],
2254
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2255
+ "evidence_url": "https://doi.org/10.1177/0020294020964826",
2256
+ "markers": [
2257
+ "pashto"
2258
+ ]
2259
+ },
2260
+ {
2261
+ "id": "paper-openalex-scale-rotation-invariant-ocr-pashto-mdlstm",
2262
+ "title": "Scale and rotation invariant OCR for Pashto cursive script using MDLSTM network",
2263
+ "url": "https://doi.org/10.1109/icdar.2015.7333931",
2264
+ "category": "paper",
2265
+ "source": "openalex",
2266
+ "status": "verified",
2267
+ "summary": "Research paper on scale- and rotation-invariant OCR for cursive Pashto using MDLSTM.",
2268
+ "primary_use": "Pashto OCR model architecture reference",
2269
+ "tasks": [
2270
+ "ocr"
2271
+ ],
2272
+ "tags": [
2273
+ "pashto",
2274
+ "paper",
2275
+ "openalex",
2276
+ "ocr",
2277
+ "mdlstm"
2278
+ ],
2279
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2280
+ "evidence_url": "https://doi.org/10.1109/icdar.2015.7333931",
2281
+ "markers": [
2282
+ "pashto"
2283
+ ]
2284
+ },
2285
+ {
2286
+ "id": "paper-openalex-recognizable-units-pashto-ocr",
2287
+ "title": "Recognizable units in Pashto language for OCR",
2288
+ "url": "https://doi.org/10.1109/icdar.2015.7333963",
2289
+ "category": "paper",
2290
+ "source": "openalex",
2291
+ "status": "verified",
2292
+ "summary": "Research paper defining recognizable units in Pashto for OCR workflows.",
2293
+ "primary_use": "Pashto OCR preprocessing and unit-design reference",
2294
+ "tasks": [
2295
+ "ocr"
2296
+ ],
2297
+ "tags": [
2298
+ "pashto",
2299
+ "paper",
2300
+ "openalex",
2301
+ "ocr"
2302
+ ],
2303
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2304
+ "evidence_url": "https://doi.org/10.1109/icdar.2015.7333963",
2305
+ "markers": [
2306
+ "pashto"
2307
+ ]
2308
+ },
2309
+ {
2310
+ "id": "paper-openalex-shape-analysis-pashto-script-image-database-ocr",
2311
+ "title": "Shape analysis of Pashto script and creation of image database for OCR",
2312
+ "url": "https://doi.org/10.1109/icet.2009.5353160",
2313
+ "category": "paper",
2314
+ "source": "openalex",
2315
+ "status": "verified",
2316
+ "summary": "Research paper on Pashto script shape analysis and image database creation for OCR.",
2317
+ "primary_use": "Pashto OCR dataset design and feature reference",
2318
+ "tasks": [
2319
+ "ocr",
2320
+ "benchmarking"
2321
+ ],
2322
+ "tags": [
2323
+ "pashto",
2324
+ "paper",
2325
+ "openalex",
2326
+ "ocr",
2327
+ "dataset"
2328
+ ],
2329
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2330
+ "evidence_url": "https://doi.org/10.1109/icet.2009.5353160",
2331
+ "markers": [
2332
+ "pashto"
2333
+ ]
2334
+ },
2335
+ {
2336
+ "id": "paper-openalex-speech-translation-low-resource-case-pashto",
2337
+ "title": "Speech translation for low-resource languages: the case of Pashto",
2338
+ "url": "https://doi.org/10.21437/interspeech.2005-723",
2339
+ "category": "paper",
2340
+ "source": "openalex",
2341
+ "status": "verified",
2342
+ "summary": "Research paper on speech translation for low-resource languages, including Pashto.",
2343
+ "primary_use": "Pashto speech translation and low-resource MT reference",
2344
+ "tasks": [
2345
+ "asr",
2346
+ "mt"
2347
+ ],
2348
+ "tags": [
2349
+ "pashto",
2350
+ "paper",
2351
+ "openalex",
2352
+ "speech",
2353
+ "translation"
2354
+ ],
2355
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2356
+ "evidence_url": "https://doi.org/10.21437/interspeech.2005-723",
2357
+ "markers": [
2358
+ "pashto"
2359
+ ]
2360
  }
2361
  ]
2362
  }
pyproject.toml CHANGED
@@ -7,6 +7,22 @@ name = "pukhto-pashto"
7
  version = "0.1.0"
8
  description = "Open Pashto language resources for ASR, TTS, NLP, and benchmarks"
9
  requires-python = ">=3.10"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  [project.optional-dependencies]
12
  dev = ["pytest>=8.0.0"]
@@ -17,3 +33,4 @@ python_files = ["test_*.py"]
17
 
18
  [tool.setuptools]
19
  packages = []
 
 
7
  version = "0.1.0"
8
  description = "Open Pashto language resources for ASR, TTS, NLP, and benchmarks"
9
  requires-python = ">=3.10"
10
+ readme = "README.md"
11
+ keywords = [
12
+ "pashto",
13
+ "pukhto",
14
+ "pushto",
15
+ "asr",
16
+ "tts",
17
+ "nlp",
18
+ "machine-translation",
19
+ "language-resources",
20
+ ]
21
+
22
+ [project.urls]
23
+ Homepage = "https://github.com/Musawer1214/pashto-language-resources"
24
+ Documentation = "https://musawer1214.github.io/pashto-language-resources/"
25
+ Repository = "https://github.com/Musawer1214/pashto-language-resources"
26
 
27
  [project.optional-dependencies]
28
  dev = ["pytest>=8.0.0"]
 
33
 
34
  [tool.setuptools]
35
  packages = []
36
+
resources/README.md CHANGED
@@ -3,12 +3,12 @@
3
  Structured, Pashto-focused resource tracking lives in this folder.
4
 
5
  ## Sections
6
- - Datasets (32): [datasets/README.md](datasets/README.md)
7
- - Models (15): [models/README.md](models/README.md)
8
  - Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
9
  - Tools (0): [tools/README.md](tools/README.md)
10
- - Papers (9): [papers/README.md](papers/README.md)
11
- - Projects (14): [projects/README.md](projects/README.md)
12
  - Code (1): [codes/README.md](codes/README.md)
13
 
14
  ## Machine-Readable Catalog
@@ -22,4 +22,4 @@ Structured, Pashto-focused resource tracking lives in this folder.
22
  - Run `python scripts/validate_resource_catalog.py` before opening a PR.
23
  - Run `python scripts/generate_resource_views.py` after catalog changes.
24
 
25
- Verified resource count: `75`
 
3
  Structured, Pashto-focused resource tracking lives in this folder.
4
 
5
  ## Sections
6
+ - Datasets (35): [datasets/README.md](datasets/README.md)
7
+ - Models (16): [models/README.md](models/README.md)
8
  - Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
9
  - Tools (0): [tools/README.md](tools/README.md)
10
+ - Papers (24): [papers/README.md](papers/README.md)
11
+ - Projects (15): [projects/README.md](projects/README.md)
12
  - Code (1): [codes/README.md](codes/README.md)
13
 
14
  ## Machine-Readable Catalog
 
22
  - Run `python scripts/validate_resource_catalog.py` before opening a PR.
23
  - Run `python scripts/generate_resource_views.py` after catalog changes.
24
 
25
+ Verified resource count: `95`
resources/catalog/pending_candidates.json CHANGED
The diff for this file is too large to render. See raw diff
 
resources/catalog/resources.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "version": "1.0.0",
3
- "updated_on": "2026-02-16",
4
  "resources": [
5
  {
6
  "id": "dataset-common-voice-ps-v24",
@@ -2008,6 +2008,557 @@
2008
  "other",
2009
  "nlp"
2010
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2011
  }
2012
  ]
2013
  }
 
1
  {
2
  "version": "1.0.0",
3
+ "updated_on": "2026-02-17",
4
  "resources": [
5
  {
6
  "id": "dataset-common-voice-ps-v24",
 
2008
  "other",
2009
  "nlp"
2010
  ]
2011
+ },
2012
+ {
2013
+ "id": "dataset-hf-aamirhs-pashto-audio-wav2vec",
2014
+ "title": "aamirhs/pashto-audio-wav2vec",
2015
+ "url": "https://huggingface.co/datasets/aamirhs/pashto-audio-wav2vec",
2016
+ "category": "dataset",
2017
+ "source": "huggingface",
2018
+ "status": "verified",
2019
+ "summary": "Pashto speech dataset surfaced from Hugging Face candidate sync for ASR experiments.",
2020
+ "primary_use": "Pashto ASR data exploration and baseline training",
2021
+ "tasks": [
2022
+ "asr"
2023
+ ],
2024
+ "pashto_evidence": {
2025
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
2026
+ "evidence_url": "https://huggingface.co/datasets/aamirhs/pashto-audio-wav2vec",
2027
+ "markers": [
2028
+ "pashto"
2029
+ ]
2030
+ },
2031
+ "tags": [
2032
+ "pashto",
2033
+ "dataset",
2034
+ "huggingface",
2035
+ "speech",
2036
+ "asr"
2037
+ ]
2038
+ },
2039
+ {
2040
+ "id": "dataset-hf-alimuhammad73-pashto-poetry",
2041
+ "title": "AliMuhammad73/Pashto-Poetry",
2042
+ "url": "https://huggingface.co/datasets/AliMuhammad73/Pashto-Poetry",
2043
+ "category": "dataset",
2044
+ "source": "huggingface",
2045
+ "status": "verified",
2046
+ "summary": "Pashto poetry text dataset surfaced from Hugging Face candidate sync for NLP experiments.",
2047
+ "primary_use": "Pashto poetry corpus for language modeling and text analysis",
2048
+ "tasks": [
2049
+ "nlp"
2050
+ ],
2051
+ "pashto_evidence": {
2052
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
2053
+ "evidence_url": "https://huggingface.co/datasets/AliMuhammad73/Pashto-Poetry",
2054
+ "markers": [
2055
+ "pashto"
2056
+ ]
2057
+ },
2058
+ "tags": [
2059
+ "pashto",
2060
+ "dataset",
2061
+ "huggingface",
2062
+ "text",
2063
+ "poetry",
2064
+ "nlp"
2065
+ ]
2066
+ },
2067
+ {
2068
+ "id": "model-hf-aamirhs-wav2vec2-large-xls-r-300m-pashto-colab",
2069
+ "title": "aamirhs/wav2vec2-large-xls-r-300m-pashto-colab",
2070
+ "url": "https://huggingface.co/aamirhs/wav2vec2-large-xls-r-300m-pashto-colab",
2071
+ "category": "model",
2072
+ "source": "huggingface",
2073
+ "status": "verified",
2074
+ "summary": "Pashto ASR model checkpoint surfaced from Hugging Face candidate sync.",
2075
+ "primary_use": "Pashto ASR baseline and transfer-learning comparison",
2076
+ "tasks": [
2077
+ "asr"
2078
+ ],
2079
+ "pashto_evidence": {
2080
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
2081
+ "evidence_url": "https://huggingface.co/aamirhs/wav2vec2-large-xls-r-300m-pashto-colab",
2082
+ "markers": [
2083
+ "pashto"
2084
+ ]
2085
+ },
2086
+ "tags": [
2087
+ "pashto",
2088
+ "model",
2089
+ "huggingface",
2090
+ "asr"
2091
+ ]
2092
+ },
2093
+ {
2094
+ "id": "project-hf-space-aizazayyubi-pashto-asr",
2095
+ "title": "Aizazayyubi/pashto_asr",
2096
+ "url": "https://huggingface.co/spaces/Aizazayyubi/pashto_asr",
2097
+ "category": "project",
2098
+ "source": "huggingface",
2099
+ "status": "verified",
2100
+ "summary": "Pashto ASR interactive demo surfaced from Hugging Face Spaces candidate sync.",
2101
+ "primary_use": "Interactive Pashto ASR demo for qualitative evaluation",
2102
+ "tasks": [
2103
+ "asr",
2104
+ "demo"
2105
+ ],
2106
+ "pashto_evidence": {
2107
+ "evidence_text": "Matched by Pashto keyword in Hugging Face Spaces search.",
2108
+ "evidence_url": "https://huggingface.co/spaces/Aizazayyubi/pashto_asr",
2109
+ "markers": [
2110
+ "pashto"
2111
+ ]
2112
+ },
2113
+ "tags": [
2114
+ "pashto",
2115
+ "project",
2116
+ "huggingface",
2117
+ "asr",
2118
+ "demo"
2119
+ ]
2120
+ },
2121
+ {
2122
+ "id": "paper-arxiv-from-scarcity-to-scale-pashto-common-voice",
2123
+ "title": "From Scarcity to Scale: A Release-Level Analysis of the Pashto Common Voice Dataset",
2124
+ "url": "http://arxiv.org/abs/2602.14062v1",
2125
+ "category": "paper",
2126
+ "source": "arxiv",
2127
+ "status": "verified",
2128
+ "summary": "Research paper analyzing Pashto Common Voice releases and dataset scaling characteristics.",
2129
+ "primary_use": "ASR data quality and release trend reference",
2130
+ "tasks": [
2131
+ "asr",
2132
+ "benchmarking"
2133
+ ],
2134
+ "pashto_evidence": {
2135
+ "evidence_text": "Matched by Pashto marker in paper title from arXiv query results.",
2136
+ "evidence_url": "http://arxiv.org/abs/2602.14062v1",
2137
+ "markers": [
2138
+ "pashto"
2139
+ ]
2140
+ },
2141
+ "tags": [
2142
+ "pashto",
2143
+ "paper",
2144
+ "arxiv",
2145
+ "asr",
2146
+ "common-voice"
2147
+ ]
2148
+ },
2149
+ {
2150
+ "id": "paper-arxiv-tuning-traditional-pashto-text-classification",
2151
+ "title": "Tuning Traditional Language Processing Approaches for Pashto Text Classification",
2152
+ "url": "http://arxiv.org/abs/2305.03737v1",
2153
+ "category": "paper",
2154
+ "source": "arxiv",
2155
+ "status": "verified",
2156
+ "summary": "Research paper focused on Pashto text classification using traditional NLP approaches.",
2157
+ "primary_use": "Pashto text classification method reference",
2158
+ "tasks": [
2159
+ "nlp"
2160
+ ],
2161
+ "pashto_evidence": {
2162
+ "evidence_text": "Matched by Pashto marker in paper title from arXiv query results.",
2163
+ "evidence_url": "http://arxiv.org/abs/2305.03737v1",
2164
+ "markers": [
2165
+ "pashto"
2166
+ ]
2167
+ },
2168
+ "tags": [
2169
+ "pashto",
2170
+ "paper",
2171
+ "arxiv",
2172
+ "nlp",
2173
+ "classification"
2174
+ ]
2175
+ },
2176
+ {
2177
+ "id": "dataset-dataverse-iarpa-babel-pashto-language-pack-v0-4by",
2178
+ "title": "IARPA Babel Pashto Language Pack IARPA-babel104b-v0.4bY",
2179
+ "url": "https://hdl.handle.net/11272.1/AB2/GLFN3X",
2180
+ "category": "dataset",
2181
+ "source": "dataverse",
2182
+ "status": "verified",
2183
+ "summary": "Pashto Babel language pack dataset for speech and language processing evaluation.",
2184
+ "primary_use": "Pashto speech dataset for ASR and language identification experiments",
2185
+ "tasks": [
2186
+ "asr",
2187
+ "benchmarking"
2188
+ ],
2189
+ "pashto_evidence": {
2190
+ "evidence_text": "Dataverse metadata includes Pashto markers in dataset title or description.",
2191
+ "evidence_url": "https://hdl.handle.net/11272.1/AB2/GLFN3X",
2192
+ "markers": [
2193
+ "pashto"
2194
+ ]
2195
+ },
2196
+ "tags": [
2197
+ "pashto",
2198
+ "dataset",
2199
+ "dataverse",
2200
+ "speech",
2201
+ "asr",
2202
+ "babel"
2203
+ ]
2204
+ },
2205
+ {
2206
+ "id": "paper-arxiv-image-to-text-pashto-farsi-traditional-chinese",
2207
+ "title": "Development of a New Image-to-text Conversion System for Pashto, Farsi and Traditional Chinese",
2208
+ "url": "http://arxiv.org/abs/2005.08650v1",
2209
+ "category": "paper",
2210
+ "source": "arxiv",
2211
+ "status": "verified",
2212
+ "summary": "Research paper on image-to-text conversion including Pashto OCR.",
2213
+ "primary_use": "Pashto OCR method reference",
2214
+ "tasks": [
2215
+ "ocr",
2216
+ "nlp"
2217
+ ],
2218
+ "pashto_evidence": {
2219
+ "evidence_text": "Matched by Pashto marker in paper title from arXiv query results.",
2220
+ "evidence_url": "http://arxiv.org/abs/2005.08650v1",
2221
+ "markers": [
2222
+ "pashto"
2223
+ ]
2224
+ },
2225
+ "tags": [
2226
+ "pashto",
2227
+ "paper",
2228
+ "arxiv",
2229
+ "ocr"
2230
+ ]
2231
+ },
2232
+ {
2233
+ "id": "paper-openalex-benchmark-pashto-handwritten-character-dataset-ocr",
2234
+ "title": "Benchmark Pashto Handwritten Character Dataset and Pashto Object Character Recognition (OCR) Using Deep Neural Network with Rule Activation Function",
2235
+ "url": "https://doi.org/10.1155/2021/6669672",
2236
+ "category": "paper",
2237
+ "source": "openalex",
2238
+ "status": "verified",
2239
+ "summary": "Research paper introducing a benchmark dataset and OCR approach for Pashto handwritten characters.",
2240
+ "primary_use": "Pashto handwritten OCR benchmark and methodology reference",
2241
+ "tasks": [
2242
+ "ocr",
2243
+ "benchmarking"
2244
+ ],
2245
+ "pashto_evidence": {
2246
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2247
+ "evidence_url": "https://doi.org/10.1155/2021/6669672",
2248
+ "markers": [
2249
+ "pashto"
2250
+ ]
2251
+ },
2252
+ "tags": [
2253
+ "pashto",
2254
+ "paper",
2255
+ "openalex",
2256
+ "ocr",
2257
+ "benchmark"
2258
+ ]
2259
+ },
2260
+ {
2261
+ "id": "paper-openalex-asr-isolated-pashto-spoken-digits-mfcc-knn",
2262
+ "title": "Database development and automatic speech recognition of isolated Pashto spoken digits using MFCC and K-NN",
2263
+ "url": "https://doi.org/10.1007/s10772-014-9267-z",
2264
+ "category": "paper",
2265
+ "source": "openalex",
2266
+ "status": "verified",
2267
+ "summary": "Research paper on isolated Pashto spoken-digit ASR with MFCC and K-NN.",
2268
+ "primary_use": "Pashto ASR baseline method reference for digit recognition",
2269
+ "tasks": [
2270
+ "asr"
2271
+ ],
2272
+ "pashto_evidence": {
2273
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2274
+ "evidence_url": "https://doi.org/10.1007/s10772-014-9267-z",
2275
+ "markers": [
2276
+ "pashto"
2277
+ ]
2278
+ },
2279
+ "tags": [
2280
+ "pashto",
2281
+ "paper",
2282
+ "openalex",
2283
+ "asr",
2284
+ "speech"
2285
+ ]
2286
+ },
2287
+ {
2288
+ "id": "paper-openalex-pashto-isolated-digits-recognition-dcnn",
2289
+ "title": "Pashto isolated digits recognition using deep convolutional neural network",
2290
+ "url": "https://doi.org/10.1016/j.heliyon.2020.e03372",
2291
+ "category": "paper",
2292
+ "source": "openalex",
2293
+ "status": "verified",
2294
+ "summary": "Research paper on Pashto isolated-digit recognition using deep convolutional neural networks.",
2295
+ "primary_use": "Pashto speech recognition research reference",
2296
+ "tasks": [
2297
+ "asr"
2298
+ ],
2299
+ "pashto_evidence": {
2300
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2301
+ "evidence_url": "https://doi.org/10.1016/j.heliyon.2020.e03372",
2302
+ "markers": [
2303
+ "pashto"
2304
+ ]
2305
+ },
2306
+ "tags": [
2307
+ "pashto",
2308
+ "paper",
2309
+ "openalex",
2310
+ "asr",
2311
+ "deep-learning"
2312
+ ]
2313
+ },
2314
+ {
2315
+ "id": "paper-openalex-pashto-offensive-language-detection-benchmark-bert",
2316
+ "title": "Pashto offensive language detection: a benchmark dataset and monolingual Pashto BERT",
2317
+ "url": "https://doi.org/10.7717/peerj-cs.1617",
2318
+ "category": "paper",
2319
+ "source": "openalex",
2320
+ "status": "verified",
2321
+ "summary": "Research paper on Pashto offensive language detection with benchmark dataset and monolingual BERT model.",
2322
+ "primary_use": "Pashto NLP toxicity detection benchmark and model reference",
2323
+ "tasks": [
2324
+ "nlp",
2325
+ "benchmarking"
2326
+ ],
2327
+ "pashto_evidence": {
2328
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2329
+ "evidence_url": "https://doi.org/10.7717/peerj-cs.1617",
2330
+ "markers": [
2331
+ "pashto"
2332
+ ]
2333
+ },
2334
+ "tags": [
2335
+ "pashto",
2336
+ "paper",
2337
+ "openalex",
2338
+ "nlp",
2339
+ "bert",
2340
+ "benchmark"
2341
+ ]
2342
+ },
2343
+ {
2344
+ "id": "paper-openalex-phti-pashto-handwritten-text-imagebase",
2345
+ "title": "PHTI: Pashto Handwritten Text Imagebase for Deep Learning Applications",
2346
+ "url": "https://doi.org/10.1109/access.2022.3216881",
2347
+ "category": "paper",
2348
+ "source": "openalex",
2349
+ "status": "verified",
2350
+ "summary": "Research paper describing PHTI, a Pashto handwritten text imagebase for deep learning.",
2351
+ "primary_use": "Pashto OCR dataset and benchmark reference",
2352
+ "tasks": [
2353
+ "ocr",
2354
+ "benchmarking"
2355
+ ],
2356
+ "pashto_evidence": {
2357
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2358
+ "evidence_url": "https://doi.org/10.1109/access.2022.3216881",
2359
+ "markers": [
2360
+ "pashto"
2361
+ ]
2362
+ },
2363
+ "tags": [
2364
+ "pashto",
2365
+ "paper",
2366
+ "openalex",
2367
+ "ocr",
2368
+ "dataset"
2369
+ ]
2370
+ },
2371
+ {
2372
+ "id": "paper-openalex-recognition-of-pashto-handwritten-characters-deep-learning",
2373
+ "title": "Recognition of Pashto Handwritten Characters Based on Deep Learning",
2374
+ "url": "https://doi.org/10.3390/s20205884",
2375
+ "category": "paper",
2376
+ "source": "openalex",
2377
+ "status": "verified",
2378
+ "summary": "Research paper on deep-learning-based recognition of Pashto handwritten characters.",
2379
+ "primary_use": "Pashto OCR model reference for handwritten character recognition",
2380
+ "tasks": [
2381
+ "ocr"
2382
+ ],
2383
+ "pashto_evidence": {
2384
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2385
+ "evidence_url": "https://doi.org/10.3390/s20205884",
2386
+ "markers": [
2387
+ "pashto"
2388
+ ]
2389
+ },
2390
+ "tags": [
2391
+ "pashto",
2392
+ "paper",
2393
+ "openalex",
2394
+ "ocr",
2395
+ "deep-learning"
2396
+ ]
2397
+ },
2398
+ {
2399
+ "id": "paper-openalex-kpti-katib-pashto-text-imagebase-benchmark",
2400
+ "title": "KPTI: Katib's Pashto Text Imagebase and Deep Learning Benchmark",
2401
+ "url": "https://doi.org/10.1109/icfhr.2016.0090",
2402
+ "category": "paper",
2403
+ "source": "openalex",
2404
+ "status": "verified",
2405
+ "summary": "Research paper introducing KPTI, a Pashto text imagebase and benchmark for handwritten recognition.",
2406
+ "primary_use": "Pashto OCR dataset and benchmarking reference",
2407
+ "tasks": [
2408
+ "ocr",
2409
+ "benchmarking"
2410
+ ],
2411
+ "pashto_evidence": {
2412
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2413
+ "evidence_url": "https://doi.org/10.1109/icfhr.2016.0090",
2414
+ "markers": [
2415
+ "pashto"
2416
+ ]
2417
+ },
2418
+ "tags": [
2419
+ "pashto",
2420
+ "paper",
2421
+ "openalex",
2422
+ "ocr",
2423
+ "benchmark"
2424
+ ]
2425
+ },
2426
+ {
2427
+ "id": "paper-openalex-pioneer-dataset-handwritten-pashto-cnn",
2428
+ "title": "Pioneer dataset and recognition of Handwritten Pashto characters using Convolution Neural Networks",
2429
+ "url": "https://doi.org/10.1177/0020294020964826",
2430
+ "category": "paper",
2431
+ "source": "openalex",
2432
+ "status": "verified",
2433
+ "summary": "Research paper on a pioneer handwritten Pashto character dataset with CNN-based recognition.",
2434
+ "primary_use": "Pashto handwritten character recognition reference",
2435
+ "tasks": [
2436
+ "ocr",
2437
+ "benchmarking"
2438
+ ],
2439
+ "pashto_evidence": {
2440
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2441
+ "evidence_url": "https://doi.org/10.1177/0020294020964826",
2442
+ "markers": [
2443
+ "pashto"
2444
+ ]
2445
+ },
2446
+ "tags": [
2447
+ "pashto",
2448
+ "paper",
2449
+ "openalex",
2450
+ "ocr",
2451
+ "deep-learning"
2452
+ ]
2453
+ },
2454
+ {
2455
+ "id": "paper-openalex-scale-rotation-invariant-ocr-pashto-mdlstm",
2456
+ "title": "Scale and rotation invariant OCR for Pashto cursive script using MDLSTM network",
2457
+ "url": "https://doi.org/10.1109/icdar.2015.7333931",
2458
+ "category": "paper",
2459
+ "source": "openalex",
2460
+ "status": "verified",
2461
+ "summary": "Research paper on scale- and rotation-invariant OCR for cursive Pashto using MDLSTM.",
2462
+ "primary_use": "Pashto OCR model architecture reference",
2463
+ "tasks": [
2464
+ "ocr"
2465
+ ],
2466
+ "pashto_evidence": {
2467
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2468
+ "evidence_url": "https://doi.org/10.1109/icdar.2015.7333931",
2469
+ "markers": [
2470
+ "pashto"
2471
+ ]
2472
+ },
2473
+ "tags": [
2474
+ "pashto",
2475
+ "paper",
2476
+ "openalex",
2477
+ "ocr",
2478
+ "mdlstm"
2479
+ ]
2480
+ },
2481
+ {
2482
+ "id": "paper-openalex-recognizable-units-pashto-ocr",
2483
+ "title": "Recognizable units in Pashto language for OCR",
2484
+ "url": "https://doi.org/10.1109/icdar.2015.7333963",
2485
+ "category": "paper",
2486
+ "source": "openalex",
2487
+ "status": "verified",
2488
+ "summary": "Research paper defining recognizable units in Pashto for OCR workflows.",
2489
+ "primary_use": "Pashto OCR preprocessing and unit-design reference",
2490
+ "tasks": [
2491
+ "ocr"
2492
+ ],
2493
+ "pashto_evidence": {
2494
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2495
+ "evidence_url": "https://doi.org/10.1109/icdar.2015.7333963",
2496
+ "markers": [
2497
+ "pashto"
2498
+ ]
2499
+ },
2500
+ "tags": [
2501
+ "pashto",
2502
+ "paper",
2503
+ "openalex",
2504
+ "ocr"
2505
+ ]
2506
+ },
2507
+ {
2508
+ "id": "paper-openalex-shape-analysis-pashto-script-image-database-ocr",
2509
+ "title": "Shape analysis of Pashto script and creation of image database for OCR",
2510
+ "url": "https://doi.org/10.1109/icet.2009.5353160",
2511
+ "category": "paper",
2512
+ "source": "openalex",
2513
+ "status": "verified",
2514
+ "summary": "Research paper on Pashto script shape analysis and image database creation for OCR.",
2515
+ "primary_use": "Pashto OCR dataset design and feature reference",
2516
+ "tasks": [
2517
+ "ocr",
2518
+ "benchmarking"
2519
+ ],
2520
+ "pashto_evidence": {
2521
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2522
+ "evidence_url": "https://doi.org/10.1109/icet.2009.5353160",
2523
+ "markers": [
2524
+ "pashto"
2525
+ ]
2526
+ },
2527
+ "tags": [
2528
+ "pashto",
2529
+ "paper",
2530
+ "openalex",
2531
+ "ocr",
2532
+ "dataset"
2533
+ ]
2534
+ },
2535
+ {
2536
+ "id": "paper-openalex-speech-translation-low-resource-case-pashto",
2537
+ "title": "Speech translation for low-resource languages: the case of Pashto",
2538
+ "url": "https://doi.org/10.21437/interspeech.2005-723",
2539
+ "category": "paper",
2540
+ "source": "openalex",
2541
+ "status": "verified",
2542
+ "summary": "Research paper on speech translation for low-resource languages, including Pashto.",
2543
+ "primary_use": "Pashto speech translation and low-resource MT reference",
2544
+ "tasks": [
2545
+ "asr",
2546
+ "mt"
2547
+ ],
2548
+ "pashto_evidence": {
2549
+ "evidence_text": "Matched by explicit Pashto marker in title from OpenAlex works search.",
2550
+ "evidence_url": "https://doi.org/10.21437/interspeech.2005-723",
2551
+ "markers": [
2552
+ "pashto"
2553
+ ]
2554
+ },
2555
+ "tags": [
2556
+ "pashto",
2557
+ "paper",
2558
+ "openalex",
2559
+ "speech",
2560
+ "translation"
2561
+ ]
2562
  }
2563
  ]
2564
  }
resources/datasets/README.md CHANGED
@@ -5,7 +5,9 @@
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
7
  | 99 Hours Pashto Spontaneous Dialogue Smartphone Speech Dataset | [huggingface](https://huggingface.co/datasets/Nexdata/99_Hours_Pashto_Spontaneous_Dialogue_Smartphone_speech_dataset) | [Dataset title explicitly includes Pashto and API metadata marks audio and text modalities. (`Pashto`)](https://huggingface.co/datasets/Nexdata/99_Hours_Pashto_Spontaneous_Dialogue_Smartphone_speech_dataset) | Spontaneous speech ASR training and robustness evaluation |
 
8
  | adnankhan769/proper_dataset_english_2_pashto | [huggingface](https://huggingface.co/datasets/adnankhan769/proper_dataset_english_2_pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/adnankhan769/proper_dataset_english_2_pashto) | Machine translation and bilingual corpus development |
 
9
  | alpaca-pashto-cleaned | [huggingface](https://huggingface.co/datasets/saillab/alpaca-pashto-cleaned) | [Dataset metadata includes language:ps and dataset name includes Pashto. (`ps`, `Pashto`)](https://huggingface.co/api/datasets/saillab/alpaca-pashto-cleaned) | Pashto instruction tuning and conversational NLP experiments |
10
  | Belebele | [huggingface](https://huggingface.co/datasets/facebook/belebele) | [Dataset includes pbt_Arab subset. (`pbt_Arab`)](https://huggingface.co/datasets/facebook/belebele) | Comprehension and multilingual NLP benchmark |
11
  | Common Voice 24.0: Pashto Speech Dataset | [kaggle](https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto) | ASR training and evaluation data source |
@@ -13,6 +15,7 @@
13
  | English to Pashto Sentences Dataset | [huggingface](https://huggingface.co/datasets/adnankhan769/english_to_pashto_sentences_dataset) | [Dataset ID explicitly states English-to-Pashto and includes Pashto-script sentence column. (`Pashto`)](https://huggingface.co/api/datasets/adnankhan769/english_to_pashto_sentences_dataset) | MT and bilingual sentence alignment baseline |
14
  | English-Pashto Language Dataset (EPLD) | [kaggle](https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld) | Machine translation and bilingual corpus development |
15
  | Google FLEURS | [huggingface](https://huggingface.co/datasets/google/fleurs) | [Dataset config includes ps_af. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark and external evaluation |
 
16
  | ihanif/pashto_asr_wer | [huggingface](https://huggingface.co/datasets/ihanif/pashto_asr_wer) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_asr_wer) | ASR training and evaluation data source |
17
  | ihanif/pashto_speech_20k | [huggingface](https://huggingface.co/datasets/ihanif/pashto_speech_20k) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_speech_20k) | ASR training and evaluation data source |
18
  | ihanif/pashto_speech_5k | [huggingface](https://huggingface.co/datasets/ihanif/pashto_speech_5k) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_speech_5k) | ASR training and evaluation data source |
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
7
  | 99 Hours Pashto Spontaneous Dialogue Smartphone Speech Dataset | [huggingface](https://huggingface.co/datasets/Nexdata/99_Hours_Pashto_Spontaneous_Dialogue_Smartphone_speech_dataset) | [Dataset title explicitly includes Pashto and API metadata marks audio and text modalities. (`Pashto`)](https://huggingface.co/datasets/Nexdata/99_Hours_Pashto_Spontaneous_Dialogue_Smartphone_speech_dataset) | Spontaneous speech ASR training and robustness evaluation |
8
+ | aamirhs/pashto-audio-wav2vec | [huggingface](https://huggingface.co/datasets/aamirhs/pashto-audio-wav2vec) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/aamirhs/pashto-audio-wav2vec) | Pashto ASR data exploration and baseline training |
9
  | adnankhan769/proper_dataset_english_2_pashto | [huggingface](https://huggingface.co/datasets/adnankhan769/proper_dataset_english_2_pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/adnankhan769/proper_dataset_english_2_pashto) | Machine translation and bilingual corpus development |
10
+ | AliMuhammad73/Pashto-Poetry | [huggingface](https://huggingface.co/datasets/AliMuhammad73/Pashto-Poetry) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/AliMuhammad73/Pashto-Poetry) | Pashto poetry corpus for language modeling and text analysis |
11
  | alpaca-pashto-cleaned | [huggingface](https://huggingface.co/datasets/saillab/alpaca-pashto-cleaned) | [Dataset metadata includes language:ps and dataset name includes Pashto. (`ps`, `Pashto`)](https://huggingface.co/api/datasets/saillab/alpaca-pashto-cleaned) | Pashto instruction tuning and conversational NLP experiments |
12
  | Belebele | [huggingface](https://huggingface.co/datasets/facebook/belebele) | [Dataset includes pbt_Arab subset. (`pbt_Arab`)](https://huggingface.co/datasets/facebook/belebele) | Comprehension and multilingual NLP benchmark |
13
  | Common Voice 24.0: Pashto Speech Dataset | [kaggle](https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto) | ASR training and evaluation data source |
 
15
  | English to Pashto Sentences Dataset | [huggingface](https://huggingface.co/datasets/adnankhan769/english_to_pashto_sentences_dataset) | [Dataset ID explicitly states English-to-Pashto and includes Pashto-script sentence column. (`Pashto`)](https://huggingface.co/api/datasets/adnankhan769/english_to_pashto_sentences_dataset) | MT and bilingual sentence alignment baseline |
16
  | English-Pashto Language Dataset (EPLD) | [kaggle](https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld) | Machine translation and bilingual corpus development |
17
  | Google FLEURS | [huggingface](https://huggingface.co/datasets/google/fleurs) | [Dataset config includes ps_af. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark and external evaluation |
18
+ | IARPA Babel Pashto Language Pack IARPA-babel104b-v0.4bY | [dataverse](https://hdl.handle.net/11272.1/AB2/GLFN3X) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://hdl.handle.net/11272.1/AB2/GLFN3X) | Pashto speech dataset for ASR and language identification experiments |
19
  | ihanif/pashto_asr_wer | [huggingface](https://huggingface.co/datasets/ihanif/pashto_asr_wer) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_asr_wer) | ASR training and evaluation data source |
20
  | ihanif/pashto_speech_20k | [huggingface](https://huggingface.co/datasets/ihanif/pashto_speech_20k) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_speech_20k) | ASR training and evaluation data source |
21
  | ihanif/pashto_speech_5k | [huggingface](https://huggingface.co/datasets/ihanif/pashto_speech_5k) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_speech_5k) | ASR training and evaluation data source |
resources/models/README.md CHANGED
@@ -4,6 +4,7 @@
4
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
 
7
  | ihanif/pashto-asr-base | [huggingface](https://huggingface.co/ihanif/pashto-asr-base) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/pashto-asr-base) | Pashto ASR baseline and model comparison |
8
  | ihanif/wav2vec2-xls-r-300m-pashto-lm | [huggingface](https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto-lm) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto-lm) | Pashto ASR baseline and model comparison |
9
  | ihanif/whisper-large-pashto | [huggingface](https://huggingface.co/ihanif/whisper-large-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/whisper-large-pashto) | Pashto ASR baseline and model comparison |
 
4
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
7
+ | aamirhs/wav2vec2-large-xls-r-300m-pashto-colab | [huggingface](https://huggingface.co/aamirhs/wav2vec2-large-xls-r-300m-pashto-colab) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/aamirhs/wav2vec2-large-xls-r-300m-pashto-colab) | Pashto ASR baseline and transfer-learning comparison |
8
  | ihanif/pashto-asr-base | [huggingface](https://huggingface.co/ihanif/pashto-asr-base) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/pashto-asr-base) | Pashto ASR baseline and model comparison |
9
  | ihanif/wav2vec2-xls-r-300m-pashto-lm | [huggingface](https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto-lm) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto-lm) | Pashto ASR baseline and model comparison |
10
  | ihanif/whisper-large-pashto | [huggingface](https://huggingface.co/ihanif/whisper-large-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/whisper-large-pashto) | Pashto ASR baseline and model comparison |
resources/papers/README.md CHANGED
@@ -4,15 +4,30 @@
4
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
 
7
  | Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu | [other](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | Pashto research reference for methods and benchmarking |
 
8
  | Deep Learning-Based Detection of One and Two-Column Textual Blocks in Camera-Captured Pashto Documents Images | [other](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | Pashto research reference for methods and benchmarking |
 
9
  | Enhancing Pashto NER Using Machine-Labeled Data and Transformer-Based Models | [other](https://www.semanticscholar.org/paper/be851ecf9197ef9bb8bf764abf4db0dda95cd9da) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/be851ecf9197ef9bb8bf764abf4db0dda95cd9da) | Pashto research reference for methods and benchmarking |
10
  | Enhancing Pashto Text Classification using Language Processing Techniques for Single And Multi-Label Analysis | [arxiv](http://arxiv.org/abs/2305.03201v1) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/2305.03201v1) | Pashto research reference for methods and benchmarking |
 
11
  | KNN and ANN-based Recognition of Handwritten Pashto Letters using Zoning Features | [arxiv](http://arxiv.org/abs/1904.03391v2) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/1904.03391v2) | Pashto research reference for methods and benchmarking |
 
12
  | Out-of-Vocabulary Pashto Spell Checker using Morphological Operations | [other](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | Pashto research reference for methods and benchmarking |
 
 
13
  | Pashto Shallow Parsing: A Deep Learning Approach | [other](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | Pashto research reference for methods and benchmarking |
 
 
14
  | POS tagging of low-resource Pashto language: annotated corpus and BERT-based model | [other](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | Pashto research reference for methods and benchmarking |
15
  | PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language | [other](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | [Paper title explicitly references low-resource Pashto language OCR benchmarking. (`Pashto`, `OCR`)](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | Pashto OCR research baseline and evaluation reference |
 
 
 
 
 
 
16
 
17
  ## Maintenance
18
  - Source of truth: [../catalog/resources.json](../catalog/resources.json)
 
4
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
7
+ | Benchmark Pashto Handwritten Character Dataset and Pashto Object Character Recognition (OCR) Using Deep Neural Network with Rule Activation Function | [openalex](https://doi.org/10.1155/2021/6669672) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1155/2021/6669672) | Pashto handwritten OCR benchmark and methodology reference |
8
  | Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu | [other](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | Pashto research reference for methods and benchmarking |
9
+ | Database development and automatic speech recognition of isolated Pashto spoken digits using MFCC and K-NN | [openalex](https://doi.org/10.1007/s10772-014-9267-z) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1007/s10772-014-9267-z) | Pashto ASR baseline method reference for digit recognition |
10
  | Deep Learning-Based Detection of One and Two-Column Textual Blocks in Camera-Captured Pashto Documents Images | [other](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | Pashto research reference for methods and benchmarking |
11
+ | Development of a New Image-to-text Conversion System for Pashto, Farsi and Traditional Chinese | [arxiv](http://arxiv.org/abs/2005.08650v1) | [Matched by Pashto marker in paper title from arXiv query results. (`pashto`)](http://arxiv.org/abs/2005.08650v1) | Pashto OCR method reference |
12
  | Enhancing Pashto NER Using Machine-Labeled Data and Transformer-Based Models | [other](https://www.semanticscholar.org/paper/be851ecf9197ef9bb8bf764abf4db0dda95cd9da) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/be851ecf9197ef9bb8bf764abf4db0dda95cd9da) | Pashto research reference for methods and benchmarking |
13
  | Enhancing Pashto Text Classification using Language Processing Techniques for Single And Multi-Label Analysis | [arxiv](http://arxiv.org/abs/2305.03201v1) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/2305.03201v1) | Pashto research reference for methods and benchmarking |
14
+ | From Scarcity to Scale: A Release-Level Analysis of the Pashto Common Voice Dataset | [arxiv](http://arxiv.org/abs/2602.14062v1) | [Matched by Pashto marker in paper title from arXiv query results. (`pashto`)](http://arxiv.org/abs/2602.14062v1) | ASR data quality and release trend reference |
15
  | KNN and ANN-based Recognition of Handwritten Pashto Letters using Zoning Features | [arxiv](http://arxiv.org/abs/1904.03391v2) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/1904.03391v2) | Pashto research reference for methods and benchmarking |
16
+ | KPTI: Katib's Pashto Text Imagebase and Deep Learning Benchmark | [openalex](https://doi.org/10.1109/icfhr.2016.0090) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icfhr.2016.0090) | Pashto OCR dataset and benchmarking reference |
17
  | Out-of-Vocabulary Pashto Spell Checker using Morphological Operations | [other](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | Pashto research reference for methods and benchmarking |
18
+ | Pashto isolated digits recognition using deep convolutional neural network | [openalex](https://doi.org/10.1016/j.heliyon.2020.e03372) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1016/j.heliyon.2020.e03372) | Pashto speech recognition research reference |
19
+ | Pashto offensive language detection: a benchmark dataset and monolingual Pashto BERT | [openalex](https://doi.org/10.7717/peerj-cs.1617) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.7717/peerj-cs.1617) | Pashto NLP toxicity detection benchmark and model reference |
20
  | Pashto Shallow Parsing: A Deep Learning Approach | [other](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | Pashto research reference for methods and benchmarking |
21
+ | PHTI: Pashto Handwritten Text Imagebase for Deep Learning Applications | [openalex](https://doi.org/10.1109/access.2022.3216881) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/access.2022.3216881) | Pashto OCR dataset and benchmark reference |
22
+ | Pioneer dataset and recognition of Handwritten Pashto characters using Convolution Neural Networks | [openalex](https://doi.org/10.1177/0020294020964826) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1177/0020294020964826) | Pashto handwritten character recognition reference |
23
  | POS tagging of low-resource Pashto language: annotated corpus and BERT-based model | [other](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | Pashto research reference for methods and benchmarking |
24
  | PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language | [other](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | [Paper title explicitly references low-resource Pashto language OCR benchmarking. (`Pashto`, `OCR`)](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | Pashto OCR research baseline and evaluation reference |
25
+ | Recognition of Pashto Handwritten Characters Based on Deep Learning | [openalex](https://doi.org/10.3390/s20205884) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.3390/s20205884) | Pashto OCR model reference for handwritten character recognition |
26
+ | Recognizable units in Pashto language for OCR | [openalex](https://doi.org/10.1109/icdar.2015.7333963) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icdar.2015.7333963) | Pashto OCR preprocessing and unit-design reference |
27
+ | Scale and rotation invariant OCR for Pashto cursive script using MDLSTM network | [openalex](https://doi.org/10.1109/icdar.2015.7333931) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icdar.2015.7333931) | Pashto OCR model architecture reference |
28
+ | Shape analysis of Pashto script and creation of image database for OCR | [openalex](https://doi.org/10.1109/icet.2009.5353160) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icet.2009.5353160) | Pashto OCR dataset design and feature reference |
29
+ | Speech translation for low-resource languages: the case of Pashto | [openalex](https://doi.org/10.21437/interspeech.2005-723) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.21437/interspeech.2005-723) | Pashto speech translation and low-resource MT reference |
30
+ | Tuning Traditional Language Processing Approaches for Pashto Text Classification | [arxiv](http://arxiv.org/abs/2305.03737v1) | [Matched by Pashto marker in paper title from arXiv query results. (`pashto`)](http://arxiv.org/abs/2305.03737v1) | Pashto text classification method reference |
31
 
32
  ## Maintenance
33
  - Source of truth: [../catalog/resources.json](../catalog/resources.json)
resources/projects/README.md CHANGED
@@ -5,6 +5,7 @@
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
7
  | afaqalinagra/PASHTO-ASR-MODEL | [huggingface](https://huggingface.co/spaces/afaqalinagra/PASHTO-ASR-MODEL) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/afaqalinagra/PASHTO-ASR-MODEL) | Interactive Pashto demo and quick qualitative validation |
 
8
  | Fazlullahmamond/Pashto-Typing | [github](https://github.com/Fazlullahmamond/Pashto-Typing) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Fazlullahmamond/Pashto-Typing) | Interactive Pashto demo and quick qualitative validation |
9
  | ihanif/wav2vec-pashto-asr | [huggingface](https://huggingface.co/spaces/ihanif/wav2vec-pashto-asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/wav2vec-pashto-asr) | Interactive Pashto demo and quick qualitative validation |
10
  | ihanif/wav2vec2-bert-pashto-asr | [huggingface](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | Interactive Pashto demo and quick qualitative validation |
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
7
  | afaqalinagra/PASHTO-ASR-MODEL | [huggingface](https://huggingface.co/spaces/afaqalinagra/PASHTO-ASR-MODEL) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/afaqalinagra/PASHTO-ASR-MODEL) | Interactive Pashto demo and quick qualitative validation |
8
+ | Aizazayyubi/pashto_asr | [huggingface](https://huggingface.co/spaces/Aizazayyubi/pashto_asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/Aizazayyubi/pashto_asr) | Interactive Pashto ASR demo for qualitative evaluation |
9
  | Fazlullahmamond/Pashto-Typing | [github](https://github.com/Fazlullahmamond/Pashto-Typing) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Fazlullahmamond/Pashto-Typing) | Interactive Pashto demo and quick qualitative validation |
10
  | ihanif/wav2vec-pashto-asr | [huggingface](https://huggingface.co/spaces/ihanif/wav2vec-pashto-asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/wav2vec-pashto-asr) | Interactive Pashto demo and quick qualitative validation |
11
  | ihanif/wav2vec2-bert-pashto-asr | [huggingface](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | Interactive Pashto demo and quick qualitative validation |
resources/schema/resource.schema.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "$schema": "https://json-schema.org/draft/2020-12/schema",
3
- "$id": "https://musawer1214.github.io/Pukhto_Pashto/resources/schema/resource.schema.json",
4
  "title": "Pashto Resource Catalog",
5
  "type": "object",
6
  "additionalProperties": false,
@@ -74,7 +74,13 @@
74
  "mozilla",
75
  "kaggle",
76
  "github",
 
77
  "arxiv",
 
 
 
 
 
78
  "meta",
79
  "other"
80
  ]
@@ -142,3 +148,4 @@
142
  }
143
  }
144
  }
 
 
1
  {
2
  "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "https://musawer1214.github.io/pashto-language-resources/resources/schema/resource.schema.json",
4
  "title": "Pashto Resource Catalog",
5
  "type": "object",
6
  "additionalProperties": false,
 
74
  "mozilla",
75
  "kaggle",
76
  "github",
77
+ "gitlab",
78
  "arxiv",
79
+ "openalex",
80
+ "crossref",
81
+ "zenodo",
82
+ "dataverse",
83
+ "datacite",
84
  "meta",
85
  "other"
86
  ]
 
148
  }
149
  }
150
  }
151
+
scripts/README.md CHANGED
@@ -7,7 +7,7 @@ Automation scripts for quality checks, resource catalog validation, and search i
7
  - `check_links.py`: ensure markdown links are clickable (optional online reachability check).
8
  - `validate_resource_catalog.py`: validate `resources/catalog/resources.json`.
9
  - `generate_resource_views.py`: generate `resources/*/README.md`, `resources/README.md`, and `docs/search/resources.json` from the catalog.
10
- - `sync_resources.py`: collect new candidate Pashto resources from Kaggle, Hugging Face (datasets/models/spaces), GitHub repositories, and paper endpoints into `resources/catalog/pending_candidates.json`.
11
  - `run_resource_cycle.py`: run the full repeatable resource cycle with one command.
12
 
13
  ## Usage
 
7
  - `check_links.py`: ensure markdown links are clickable (optional online reachability check).
8
  - `validate_resource_catalog.py`: validate `resources/catalog/resources.json`.
9
  - `generate_resource_views.py`: generate `resources/*/README.md`, `resources/README.md`, and `docs/search/resources.json` from the catalog.
10
+ - `sync_resources.py`: collect new candidate Pashto resources from Kaggle, Hugging Face (datasets/models/spaces), GitHub, GitLab, OpenAlex, Crossref, Zenodo, Dataverse, DataCite, arXiv, and Semantic Scholar into `resources/catalog/pending_candidates.json`.
11
  - `run_resource_cycle.py`: run the full repeatable resource cycle with one command.
12
 
13
  ## Usage
scripts/sync_resources.py CHANGED
@@ -21,6 +21,7 @@ import urllib.request
21
  import xml.etree.ElementTree as ET
22
  from datetime import datetime, timezone
23
  from email.utils import parsedate_to_datetime
 
24
  from http.client import IncompleteRead
25
  from pathlib import Path
26
  from typing import Any
@@ -74,6 +75,20 @@ def _is_low_signal_name(value: str) -> bool:
74
  return bool(LOW_SIGNAL_RE.search(value or ""))
75
 
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  def _parse_retry_after_seconds(retry_after: str | None) -> float | None:
78
  if not retry_after:
79
  return None
@@ -440,10 +455,8 @@ def fetch_github_pashto_repos(limit: int) -> list[dict[str, Any]]:
440
  continue
441
 
442
  html_url = item["html_url"]
443
- category = "project"
444
  topics = item.get("topics") or []
445
- if any(token in name_blob for token in ("toolkit", "library", "nlp", "asr", "tts", "ocr", "api", "code")):
446
- category = "code"
447
 
448
  rid = f"candidate-gh-{category}-{_slug(full_name)}"
449
  description = (item.get("description") or "").strip()
@@ -467,6 +480,433 @@ def fetch_github_pashto_repos(limit: int) -> list[dict[str, Any]]:
467
  return out
468
 
469
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
  def fetch_arxiv(limit: int) -> list[dict[str, Any]]:
471
  roots: list[ET.Element] = []
472
  errors: list[str] = []
@@ -651,6 +1091,12 @@ def main() -> int:
651
  ("huggingface-models", lambda: fetch_huggingface("models", args.limit)),
652
  ("huggingface-spaces", lambda: fetch_huggingface_spaces(args.limit)),
653
  ("github-repositories", lambda: fetch_github_pashto_repos(args.limit)),
 
 
 
 
 
 
654
  ("arxiv", lambda: fetch_arxiv(args.limit)),
655
  ("semantic-scholar", lambda: fetch_semantic_scholar(args.limit)),
656
  ]
 
21
  import xml.etree.ElementTree as ET
22
  from datetime import datetime, timezone
23
  from email.utils import parsedate_to_datetime
24
+ from html import unescape
25
  from http.client import IncompleteRead
26
  from pathlib import Path
27
  from typing import Any
 
75
  return bool(LOW_SIGNAL_RE.search(value or ""))
76
 
77
 
78
+ def _strip_html(value: str) -> str:
79
+ text = re.sub(r"<[^>]+>", " ", value or "")
80
+ text = unescape(text)
81
+ return re.sub(r"\s+", " ", text).strip()
82
+
83
+
84
+ def _classify_repo_category(name_blob: str) -> str:
85
+ lowered = (name_blob or "").casefold()
86
+ code_tokens = ("toolkit", "library", "nlp", "asr", "tts", "ocr", "api", "code", "cli", "sdk")
87
+ if any(token in lowered for token in code_tokens):
88
+ return "code"
89
+ return "project"
90
+
91
+
92
  def _parse_retry_after_seconds(retry_after: str | None) -> float | None:
93
  if not retry_after:
94
  return None
 
455
  continue
456
 
457
  html_url = item["html_url"]
 
458
  topics = item.get("topics") or []
459
+ category = _classify_repo_category(name_blob)
 
460
 
461
  rid = f"candidate-gh-{category}-{_slug(full_name)}"
462
  description = (item.get("description") or "").strip()
 
480
  return out
481
 
482
 
483
def fetch_gitlab_pashto_projects(limit: int) -> list[dict[str, Any]]:
    """Collect candidate Pashto-related projects from the GitLab projects search API.

    One search request is issued per entry in ``PASHTO_QUERY_TERMS``. Results are
    de-duplicated by project path, ordered by star count, filtered with
    ``_is_pashto_centric`` / ``_is_low_signal_name`` and converted to candidate
    records via ``_candidate``.

    Args:
        limit: Page size requested per search term and the maximum number of
            candidates returned.

    Returns:
        Candidate records with ``source="gitlab"`` and a category chosen by
        ``_classify_repo_category``.

    Raises:
        RuntimeError: If no project was collected and at least one query failed
            or returned an unexpected payload; the message joins the per-term
            errors.
    """
    combined: dict[str, dict[str, Any]] = {}
    errors: list[str] = []
    for term in PASHTO_QUERY_TERMS:
        query = urllib.parse.urlencode(
            {
                "search": term,
                "simple": "true",
                "order_by": "star_count",
                "sort": "desc",
                "per_page": str(limit),
            }
        )
        url = f"https://gitlab.com/api/v4/projects?{query}"
        try:
            payload = _fetch_json(url, timeout=30.0, source_name="gitlab-projects")
        except Exception as exc:  # noqa: BLE001
            errors.append(f"{term}: {exc}")
            continue
        if not isinstance(payload, list):
            # Iterating a non-list (e.g. an error object) would yield keys, and
            # ``item.get`` would then crash outside the try block above.
            errors.append(f"{term}: unexpected response type {type(payload).__name__}")
            continue
        for item in payload:
            if not isinstance(item, dict):
                continue
            full_name = (item.get("path_with_namespace") or item.get("name_with_namespace") or "").strip()
            if not full_name:
                continue
            combined[full_name] = item

    if not combined and errors:
        raise RuntimeError("; ".join(errors))

    out: list[dict[str, Any]] = []
    # Merged results from several terms lose the per-query ordering, so re-sort.
    sorted_items = sorted(
        combined.items(),
        key=lambda kv: kv[1].get("star_count") or 0,
        reverse=True,
    )
    for full_name, item in sorted_items:
        web_url = (item.get("web_url") or "").strip()
        if not web_url:
            continue

        description = (item.get("description") or "").strip()
        topics = item.get("topics") or []
        if not isinstance(topics, list):
            topics = []
        topics = [str(topic).strip() for topic in topics if str(topic).strip()]
        name_blob = " ".join([full_name, item.get("name") or "", description, " ".join(topics)])
        if not _is_pashto_centric(name_blob):
            continue
        if _is_low_signal_name(full_name):
            continue

        category = _classify_repo_category(name_blob)
        rid = f"candidate-gitlab-{category}-{_slug(full_name)}"
        summary = description or "Candidate Pashto-related GitLab repository."
        # A topic such as "pashto" would repeat a fixed tag; keep first occurrences only.
        tags = list(dict.fromkeys(["pashto", "candidate", category, "gitlab", *topics[:3]]))
        out.append(
            _candidate(
                rid=rid,
                title=full_name,
                url=web_url,
                category=category,
                source="gitlab",
                summary=summary[:240],
                evidence_text="Project metadata (name/description/topics) includes Pashto markers.",
                evidence_url=web_url,
                markers=["pashto"],
                tags=tags,
            )
        )
        if len(out) >= limit:
            break
    return out
553
+
554
+
555
def fetch_openalex_papers(limit: int) -> list[dict[str, Any]]:
    """Collect candidate Pashto papers from the OpenAlex works search endpoint.

    One request is issued per entry in ``PASHTO_QUERY_TERMS``; works are
    de-duplicated by their OpenAlex ``id``. A work is kept only when its title
    passes ``_is_pashto_centric`` and is not flagged by ``_is_low_signal_name``.

    Args:
        limit: Page size requested per search term and the maximum number of
            candidates returned.

    Returns:
        Candidate records with ``category="paper"`` and ``source="openalex"``.
        The link prefers the DOI, then the primary landing page, then the
        OpenAlex id.

    Raises:
        RuntimeError: If nothing was collected and at least one query failed or
            returned an unexpected payload.
    """
    combined: dict[str, dict[str, Any]] = {}
    errors: list[str] = []
    for term in PASHTO_QUERY_TERMS:
        query = urllib.parse.urlencode({"search": term, "per-page": str(limit)})
        url = f"https://api.openalex.org/works?{query}"
        try:
            payload = _fetch_json(url, timeout=30.0, source_name="openalex")
        except Exception as exc:  # noqa: BLE001
            errors.append(f"{term}: {exc}")
            continue
        if not isinstance(payload, dict):
            errors.append(f"{term}: unexpected response type {type(payload).__name__}")
            continue
        results = payload.get("results")
        if not isinstance(results, list):
            # Missing or null "results" is treated as an empty page.
            continue
        for item in results:
            if not isinstance(item, dict):
                continue
            work_id = (item.get("id") or "").strip()
            if not work_id:
                continue
            combined[work_id] = item

    if not combined and errors:
        raise RuntimeError("; ".join(errors))

    out: list[dict[str, Any]] = []
    # The candidate id is derived from the title only, so two distinct works with
    # the same title would otherwise produce the same id.
    seen_ids: set[str] = set()
    for item in combined.values():
        title = (item.get("display_name") or "").strip()
        if not title:
            continue
        if not _is_pashto_centric(title):
            continue
        if _is_low_signal_name(title):
            continue

        doi = (item.get("doi") or "").strip()
        if doi and not doi.startswith("http"):
            # Bare DOI: turn it into a resolvable link.
            doi = f"https://doi.org/{doi}"
        primary = item.get("primary_location") or {}
        landing = (primary.get("landing_page_url") or "").strip()
        paper_url = doi or landing or (item.get("id") or "").strip()
        if not paper_url:
            continue

        rid = f"candidate-openalex-{_slug(title)}"
        if rid in seen_ids:
            continue
        seen_ids.add(rid)
        out.append(
            _candidate(
                rid=rid,
                title=title,
                url=paper_url,
                category="paper",
                source="openalex",
                summary="Candidate paper returned from OpenAlex works search for Pashto.",
                evidence_text="Matched by explicit Pashto marker in title from OpenAlex works search.",
                evidence_url=paper_url,
                markers=["pashto"],
                tags=["pashto", "candidate", "paper", "openalex"],
            )
        )
        if len(out) >= limit:
            break
    return out
612
+
613
+
614
def fetch_crossref_papers(limit: int) -> list[dict[str, Any]]:
    """Collect candidate Pashto papers from the Crossref works API (title query).

    One request is issued per entry in ``PASHTO_QUERY_TERMS``; records are
    de-duplicated by DOI, falling back to the title when no DOI is present.
    A record is kept only when its title passes ``_is_pashto_centric`` and is
    not flagged by ``_is_low_signal_name``.

    Args:
        limit: Number of rows requested per search term and the maximum number
            of candidates returned.

    Returns:
        Candidate records with ``category="paper"`` and ``source="crossref"``.
        The summary is the markup-stripped abstract when available, truncated
        to 240 characters.

    Raises:
        RuntimeError: If nothing was collected and at least one query failed or
            returned an unexpected payload.
    """

    def first_title(record: dict[str, Any]) -> str:
        # The code reads "title" as a list and uses its first entry; anything
        # else (missing, empty, non-string entry) yields "".
        titles = record.get("title") or []
        if isinstance(titles, list) and titles and isinstance(titles[0], str):
            return titles[0].strip()
        return ""

    combined: dict[str, dict[str, Any]] = {}
    errors: list[str] = []
    for term in PASHTO_QUERY_TERMS:
        query = urllib.parse.urlencode({"query.title": term, "rows": str(limit)})
        url = f"https://api.crossref.org/works?{query}"
        try:
            payload = _fetch_json(url, timeout=30.0, source_name="crossref")
        except Exception as exc:  # noqa: BLE001
            errors.append(f"{term}: {exc}")
            continue
        if not isinstance(payload, dict):
            errors.append(f"{term}: unexpected response type {type(payload).__name__}")
            continue
        message = payload.get("message")
        items = message.get("items") if isinstance(message, dict) else None
        if not isinstance(items, list):
            # Missing or null "message"/"items" is treated as an empty page.
            continue
        for item in items:
            if not isinstance(item, dict):
                continue
            doi = (item.get("DOI") or "").strip()
            key = doi or first_title(item)
            if not key:
                continue
            combined[key] = item

    if not combined and errors:
        raise RuntimeError("; ".join(errors))

    out: list[dict[str, Any]] = []
    # The candidate id is derived from the title only, so records with different
    # DOIs but the same title would otherwise share one id.
    seen_ids: set[str] = set()
    for item in combined.values():
        title = first_title(item)
        if not title:
            continue
        if not _is_pashto_centric(title):
            continue
        if _is_low_signal_name(title):
            continue

        doi = (item.get("DOI") or "").strip()
        paper_url = (item.get("URL") or "").strip()
        if not paper_url and doi:
            paper_url = f"https://doi.org/{doi}"
        if not paper_url:
            continue

        abstract = _strip_html(item.get("abstract") or "")
        rid = f"candidate-crossref-{_slug(title)}"
        if rid in seen_ids:
            continue
        seen_ids.add(rid)
        out.append(
            _candidate(
                rid=rid,
                title=title,
                url=paper_url,
                category="paper",
                source="crossref",
                summary=(abstract or "Candidate paper returned from Crossref search for Pashto.")[:240],
                evidence_text="Matched by explicit Pashto marker in title from Crossref search.",
                evidence_url=paper_url,
                markers=["pashto"],
                tags=["pashto", "candidate", "paper", "crossref"],
            )
        )
        if len(out) >= limit:
            break
    return out
674
+
675
+
676
def fetch_zenodo_records(limit: int) -> list[dict[str, Any]]:
    """Collect candidate Pashto resources from the Zenodo records search API.

    One request is issued per entry in ``PASHTO_QUERY_TERMS`` (most recent
    first); records are de-duplicated by Zenodo record id. A record is kept
    when its title/description pass ``_is_pashto_centric`` and the title is not
    flagged by ``_is_low_signal_name``.

    Args:
        limit: Page size requested per search term and the maximum number of
            candidates returned.

    Returns:
        Candidate records with ``source="zenodo"``. The category is mapped from
        the record's ``resource_type.type`` (dataset, software -> code,
        publication -> paper, poster/presentation -> project); any other type
        defaults to ``"project"``.

    Raises:
        RuntimeError: If nothing was collected and at least one query failed or
            returned an unexpected payload.
    """
    combined: dict[str, dict[str, Any]] = {}
    errors: list[str] = []
    for term in PASHTO_QUERY_TERMS:
        query = urllib.parse.urlencode({"q": term, "size": str(limit), "sort": "mostrecent"})
        url = f"https://zenodo.org/api/records/?{query}"
        try:
            payload = _fetch_json(url, timeout=30.0, source_name="zenodo")
        except Exception as exc:  # noqa: BLE001
            errors.append(f"{term}: {exc}")
            continue
        if not isinstance(payload, dict):
            errors.append(f"{term}: unexpected response type {type(payload).__name__}")
            continue
        outer = payload.get("hits")
        hits = outer.get("hits") if isinstance(outer, dict) else None
        if not isinstance(hits, list):
            # Missing or null "hits" is treated as an empty page.
            continue
        for item in hits:
            if not isinstance(item, dict):
                continue
            record_id = str(item.get("id") or "").strip()
            if not record_id:
                continue
            combined[record_id] = item

    if not combined and errors:
        raise RuntimeError("; ".join(errors))

    category_map = {
        "dataset": "dataset",
        "software": "code",
        "publication": "paper",
        "poster": "project",
        "presentation": "project",
    }

    out: list[dict[str, Any]] = []
    # The candidate id is derived from category + title, so several records with
    # the same title would otherwise share one id.
    seen_ids: set[str] = set()
    for item in combined.values():
        metadata = item.get("metadata") or {}
        title = (metadata.get("title") or "").strip()
        description = _strip_html(metadata.get("description") or "")
        if not title:
            continue
        if not _is_pashto_centric(title, description):
            continue
        if _is_low_signal_name(title):
            continue

        links = item.get("links") or {}
        record_url = (links.get("self_html") or links.get("doi") or "").strip()
        if not record_url:
            doi = (metadata.get("doi") or "").strip()
            if doi:
                record_url = f"https://doi.org/{doi}"
        if not record_url:
            continue

        resource_type = metadata.get("resource_type")
        rtype = resource_type.get("type") if isinstance(resource_type, dict) else ""
        category = category_map.get(str(rtype or "").casefold(), "project")
        rid = f"candidate-zenodo-{category}-{_slug(title)}"
        if rid in seen_ids:
            continue
        seen_ids.add(rid)
        summary = description or "Candidate resource returned from Zenodo search for Pashto."
        out.append(
            _candidate(
                rid=rid,
                title=title,
                url=record_url,
                category=category,
                source="zenodo",
                summary=summary[:240],
                evidence_text="Zenodo metadata includes Pashto markers in title or description.",
                evidence_url=record_url,
                markers=["pashto"],
                tags=["pashto", "candidate", category, "zenodo"],
            )
        )
        if len(out) >= limit:
            break
    return out
746
+
747
+
748
def fetch_dataverse_datasets(limit: int) -> list[dict[str, Any]]:
    """Collect candidate Pashto datasets from the Harvard Dataverse search API.

    One request is issued per entry in ``PASHTO_QUERY_TERMS``, restricted to
    ``type=dataset``; items are de-duplicated by ``global_id``, then
    ``identifier``, then ``url``. An item is kept when its name/description
    pass ``_is_pashto_centric`` and the name is not flagged by
    ``_is_low_signal_name``.

    Args:
        limit: Page size requested per search term and the maximum number of
            candidates returned.

    Returns:
        Candidate records with ``category="dataset"`` and
        ``source="dataverse"``.

    Raises:
        RuntimeError: If nothing was collected and at least one query failed or
            returned an unexpected payload.
    """
    combined: dict[str, dict[str, Any]] = {}
    errors: list[str] = []
    base_url = "https://dataverse.harvard.edu"
    for term in PASHTO_QUERY_TERMS:
        query = urllib.parse.urlencode(
            {
                "q": term,
                "type": "dataset",
                "per_page": str(limit),
                "start": "0",
            }
        )
        url = f"{base_url}/api/search?{query}"
        try:
            payload = _fetch_json(url, timeout=30.0, source_name="dataverse")
        except Exception as exc:  # noqa: BLE001
            errors.append(f"{term}: {exc}")
            continue
        if not isinstance(payload, dict):
            errors.append(f"{term}: unexpected response type {type(payload).__name__}")
            continue
        data = payload.get("data")
        items = data.get("items") if isinstance(data, dict) else None
        if not isinstance(items, list):
            # Missing or null "data"/"items" is treated as an empty page.
            continue
        for item in items:
            if not isinstance(item, dict):
                continue
            key = str(item.get("global_id") or item.get("identifier") or item.get("url") or "").strip()
            if not key:
                continue
            combined[key] = item

    if not combined and errors:
        raise RuntimeError("; ".join(errors))

    out: list[dict[str, Any]] = []
    # The candidate id is derived from the title only, so datasets with the same
    # title would otherwise share one id.
    seen_ids: set[str] = set()
    for item in combined.values():
        title = (item.get("name") or "").strip()
        # Strip markup like the Crossref/Zenodo/DataCite fetchers do, so tags do
        # not leak into the Pashto check or the summary.
        description = _strip_html(item.get("description") or "")
        if not title:
            continue
        if not _is_pashto_centric(title, description):
            continue
        if _is_low_signal_name(title):
            continue

        record_url = (item.get("url") or "").strip()
        if record_url and record_url.startswith("/"):
            # Site-relative link: make it absolute.
            record_url = f"{base_url}{record_url}"
        if not record_url:
            global_id = (item.get("global_id") or "").strip()
            if global_id:
                escaped_id = urllib.parse.quote(global_id, safe=":/")
                record_url = f"{base_url}/dataset.xhtml?persistentId={escaped_id}"
        if not record_url:
            continue

        rid = f"candidate-dataverse-dataset-{_slug(title)}"
        if rid in seen_ids:
            continue
        seen_ids.add(rid)
        out.append(
            _candidate(
                rid=rid,
                title=title,
                url=record_url,
                category="dataset",
                source="dataverse",
                summary=(description or "Candidate dataset returned from Dataverse search for Pashto.")[:240],
                evidence_text="Dataverse metadata includes Pashto markers in dataset title or description.",
                evidence_url=record_url,
                markers=["pashto"],
                tags=["pashto", "candidate", "dataset", "dataverse"],
            )
        )
        if len(out) >= limit:
            break
    return out
816
+
817
+
818
def fetch_datacite_records(limit: int) -> list[dict[str, Any]]:
    """Collect candidate Pashto resources from the DataCite DOI search API.

    One request is issued per entry in ``PASHTO_QUERY_TERMS``; records are
    de-duplicated by their DataCite ``id``. A record is kept when its first
    title plus its descriptions pass ``_is_pashto_centric`` and the title is
    not flagged by ``_is_low_signal_name``.

    Args:
        limit: Page size requested per search term and the maximum number of
            candidates returned.

    Returns:
        Candidate records with ``source="datacite"``. The category is derived
        from ``types.resourceTypeGeneral``: dataset/collection -> dataset,
        software -> code, journalarticle/conferencepaper/preprint/text ->
        paper, anything else -> project.

    Raises:
        RuntimeError: If nothing was collected and at least one query failed or
            returned an unexpected payload.
    """
    combined: dict[str, dict[str, Any]] = {}
    errors: list[str] = []
    for term in PASHTO_QUERY_TERMS:
        query = urllib.parse.urlencode(
            {
                "query": term,
                "page[size]": str(limit),
            }
        )
        url = f"https://api.datacite.org/dois?{query}"
        try:
            payload = _fetch_json(url, timeout=30.0, source_name="datacite")
        except Exception as exc:  # noqa: BLE001
            errors.append(f"{term}: {exc}")
            continue
        if not isinstance(payload, dict):
            errors.append(f"{term}: unexpected response type {type(payload).__name__}")
            continue
        records = payload.get("data")
        if not isinstance(records, list):
            # Missing or null "data" is treated as an empty page.
            continue
        for item in records:
            if not isinstance(item, dict):
                continue
            record_id = (item.get("id") or "").strip()
            if not record_id:
                continue
            combined[record_id] = item

    if not combined and errors:
        raise RuntimeError("; ".join(errors))

    dataset_types = {"dataset", "collection"}
    software_types = {"software"}
    paper_types = {"journalarticle", "conferencepaper", "preprint", "text"}

    out: list[dict[str, Any]] = []
    # The candidate id is derived from category + title, so several DOIs with
    # the same title would otherwise share one id.
    seen_ids: set[str] = set()
    for item in combined.values():
        attributes = item.get("attributes") or {}

        # Only the first entry of "titles" is used as the display title.
        titles = attributes.get("titles") or []
        title = ""
        if isinstance(titles, list) and titles:
            first = titles[0] or {}
            if isinstance(first, dict):
                title = (first.get("title") or "").strip()
        if not title:
            continue

        # Join every non-empty description block into one markup-free blob.
        description_items = attributes.get("descriptions") or []
        descriptions: list[str] = []
        if isinstance(description_items, list):
            for block in description_items:
                if isinstance(block, dict):
                    value = (block.get("description") or "").strip()
                    if value:
                        descriptions.append(_strip_html(value))
        description_blob = " ".join(descriptions).strip()
        if not _is_pashto_centric(title, description_blob):
            continue
        if _is_low_signal_name(title):
            continue

        doi = (attributes.get("doi") or item.get("id") or "").strip()
        record_url = (attributes.get("url") or "").strip()
        if not record_url and doi:
            record_url = f"https://doi.org/{doi}"
        if not record_url:
            continue

        types = attributes.get("types")
        raw_type = types.get("resourceTypeGeneral") if isinstance(types, dict) else ""
        general_type = str(raw_type or "").casefold()
        if general_type in dataset_types:
            category = "dataset"
        elif general_type in software_types:
            category = "code"
        elif general_type in paper_types:
            category = "paper"
        else:
            category = "project"

        rid = f"candidate-datacite-{category}-{_slug(title)}"
        if rid in seen_ids:
            continue
        seen_ids.add(rid)
        summary = description_blob or "Candidate record returned from DataCite DOI search for Pashto."
        out.append(
            _candidate(
                rid=rid,
                title=title,
                url=record_url,
                category=category,
                source="datacite",
                summary=summary[:240],
                evidence_text="DataCite metadata includes Pashto markers in title or description.",
                evidence_url=record_url,
                markers=["pashto"],
                tags=["pashto", "candidate", category, "datacite"],
            )
        )
        if len(out) >= limit:
            break
    return out
908
+
909
+
910
  def fetch_arxiv(limit: int) -> list[dict[str, Any]]:
911
  roots: list[ET.Element] = []
912
  errors: list[str] = []
 
1091
  ("huggingface-models", lambda: fetch_huggingface("models", args.limit)),
1092
  ("huggingface-spaces", lambda: fetch_huggingface_spaces(args.limit)),
1093
  ("github-repositories", lambda: fetch_github_pashto_repos(args.limit)),
1094
+ ("gitlab-projects", lambda: fetch_gitlab_pashto_projects(args.limit)),
1095
+ ("openalex", lambda: fetch_openalex_papers(args.limit)),
1096
+ ("crossref", lambda: fetch_crossref_papers(args.limit)),
1097
+ ("zenodo", lambda: fetch_zenodo_records(args.limit)),
1098
+ ("dataverse", lambda: fetch_dataverse_datasets(args.limit)),
1099
+ ("datacite", lambda: fetch_datacite_records(args.limit)),
1100
  ("arxiv", lambda: fetch_arxiv(args.limit)),
1101
  ("semantic-scholar", lambda: fetch_semantic_scholar(args.limit)),
1102
  ]
scripts/validate_resource_catalog.py CHANGED
@@ -17,7 +17,21 @@ from urllib.parse import urlparse
17
 
18
 
19
  ALLOWED_CATEGORIES = {"dataset", "model", "benchmark", "tool", "paper", "project", "code"}
20
- ALLOWED_SOURCES = {"huggingface", "mozilla", "kaggle", "github", "arxiv", "meta", "other"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  ALLOWED_STATUS = {"verified", "candidate"}
22
  RESOURCE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9._-]*$")
23
  STRICT_PASHTO_CATEGORIES = {"model", "paper", "tool", "code", "project"}
 
17
 
18
 
19
  ALLOWED_CATEGORIES = {"dataset", "model", "benchmark", "tool", "paper", "project", "code"}
20
# Accepted values for a resource's source, grouped by kind of provider.
# NOTE(review): appears to mirror the enum in
# resources/schema/resource.schema.json; confirm both lists stay in sync.
ALLOWED_SOURCES = {
    # model / dataset hubs
    "huggingface", "mozilla", "kaggle",
    # code hosting
    "github", "gitlab",
    # paper and DOI indexes, research data repositories
    "arxiv", "openalex", "crossref", "zenodo", "dataverse", "datacite",
    # remaining buckets
    "meta", "other",
}
35
  ALLOWED_STATUS = {"verified", "candidate"}
36
  RESOURCE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9._-]*$")
37
  STRICT_PASHTO_CATEGORIES = {"model", "paper", "tool", "code", "project"}