musaw commited on
Commit
9899fdf
·
1 Parent(s): 081627f

chore(resources): improve sync reliability and promote curated candidates

Browse files
docs/search/resources.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "generated_on": "2026-02-15T00:00:00Z",
3
- "count": 37,
4
  "resources": [
5
  {
6
  "id": "dataset-common-voice-ps-v24",
@@ -895,6 +895,991 @@
895
  "Pashto",
896
  "NLP"
897
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
898
  }
899
  ]
900
  }
 
1
  {
2
+ "generated_on": "2026-02-16T00:00:00Z",
3
+ "count": 77,
4
  "resources": [
5
  {
6
  "id": "dataset-common-voice-ps-v24",
 
895
  "Pashto",
896
  "NLP"
897
  ]
898
+ },
899
+ {
900
+ "id": "dataset-kaggle-drijaz-pashtoocr",
901
+ "title": "PashtoOCR (Kaggle)",
902
+ "url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
903
+ "category": "dataset",
904
+ "source": "kaggle",
905
+ "status": "verified",
906
+ "summary": "Synthetic OCR dataset focused on Pashto ligatures and text recognition tasks.",
907
+ "primary_use": "Pashto OCR dataset benchmarking and training",
908
+ "tasks": [
909
+ "ocr",
910
+ "nlp"
911
+ ],
912
+ "tags": [
913
+ "pashto",
914
+ "kaggle",
915
+ "ocr",
916
+ "dataset"
917
+ ],
918
+ "evidence_text": "Kaggle dataset title and subtitle explicitly identify a Pashto OCR dataset.",
919
+ "evidence_url": "https://www.kaggle.com/api/v1/datasets/view/drijaz/pashtoocr",
920
+ "markers": [
921
+ "Pashto",
922
+ "OCR"
923
+ ]
924
+ },
925
+ {
926
+ "id": "model-hf-zirak-ai-pashto-bert-v1",
927
+ "title": "zirak-ai/pashto-bert-v1",
928
+ "url": "https://huggingface.co/zirak-ai/pashto-bert-v1",
929
+ "category": "model",
930
+ "source": "huggingface",
931
+ "status": "verified",
932
+ "summary": "Pashto BERT model checkpoint for low-resource Pashto NLP experiments.",
933
+ "primary_use": "Pashto encoder baseline for NLP tasks",
934
+ "tasks": [
935
+ "nlp"
936
+ ],
937
+ "tags": [
938
+ "pashto",
939
+ "huggingface",
940
+ "bert",
941
+ "nlp"
942
+ ],
943
+ "evidence_text": "Hugging Face model ID and search tags explicitly include pashto marker.",
944
+ "evidence_url": "https://huggingface.co/zirak-ai/pashto-bert-v1",
945
+ "markers": [
946
+ "pashto"
947
+ ]
948
+ },
949
+ {
950
+ "id": "project-hf-space-ihanif-pashto-asr",
951
+ "title": "Pashto ASR Space",
952
+ "url": "https://huggingface.co/spaces/ihanif/pashto-asr",
953
+ "category": "project",
954
+ "source": "huggingface",
955
+ "status": "verified",
956
+ "summary": "Interactive Hugging Face Space for Pashto ASR inference demos.",
957
+ "primary_use": "Live Pashto speech-to-text demo project",
958
+ "tasks": [
959
+ "asr",
960
+ "demo"
961
+ ],
962
+ "tags": [
963
+ "pashto",
964
+ "project",
965
+ "huggingface-space",
966
+ "asr"
967
+ ],
968
+ "evidence_text": "Space ID includes pashto-asr and is returned by Hugging Face Pashto space search.",
969
+ "evidence_url": "https://huggingface.co/api/spaces/ihanif/pashto-asr",
970
+ "markers": [
971
+ "pashto",
972
+ "asr"
973
+ ]
974
+ },
975
+ {
976
+ "id": "code-github-mrychlik-worldly-ocr",
977
+ "title": "worldly-ocr",
978
+ "url": "https://github.com/mrychlik/worldly-ocr",
979
+ "category": "code",
980
+ "source": "github",
981
+ "status": "verified",
982
+ "summary": "Open OCR code project that explicitly includes Pashto among target languages.",
983
+ "primary_use": "Pashto OCR code reference and experimentation",
984
+ "tasks": [
985
+ "ocr",
986
+ "tooling"
987
+ ],
988
+ "tags": [
989
+ "pashto",
990
+ "code",
991
+ "github",
992
+ "ocr"
993
+ ],
994
+ "evidence_text": "Repository description explicitly says OCR for Pashto and Chinese.",
995
+ "evidence_url": "https://api.github.com/repos/mrychlik/worldly-ocr",
996
+ "markers": [
997
+ "Pashto",
998
+ "OCR"
999
+ ]
1000
+ },
1001
+ {
1002
+ "id": "paper-s2-psocr-lmm-pashto",
1003
+ "title": "PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language",
1004
+ "url": "https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f",
1005
+ "category": "paper",
1006
+ "source": "other",
1007
+ "status": "verified",
1008
+ "summary": "Research paper benchmarking multimodal OCR models on low-resource Pashto OCR tasks.",
1009
+ "primary_use": "Pashto OCR research baseline and evaluation reference",
1010
+ "tasks": [
1011
+ "ocr",
1012
+ "research"
1013
+ ],
1014
+ "tags": [
1015
+ "pashto",
1016
+ "paper",
1017
+ "ocr",
1018
+ "multimodal"
1019
+ ],
1020
+ "evidence_text": "Paper title explicitly references low-resource Pashto language OCR benchmarking.",
1021
+ "evidence_url": "https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f",
1022
+ "markers": [
1023
+ "Pashto",
1024
+ "OCR"
1025
+ ]
1026
+ },
1027
+ {
1028
+ "id": "dataset-hf-adnankhan769-english-to-pashto",
1029
+ "title": "English to Pashto Sentences Dataset",
1030
+ "url": "https://huggingface.co/datasets/adnankhan769/english_to_pashto_sentences_dataset",
1031
+ "category": "dataset",
1032
+ "source": "huggingface",
1033
+ "status": "verified",
1034
+ "summary": "Parallel English-Pashto sentence dataset for bilingual NLP and translation experiments.",
1035
+ "primary_use": "MT and bilingual sentence alignment baseline",
1036
+ "tasks": [
1037
+ "mt",
1038
+ "nlp"
1039
+ ],
1040
+ "tags": [
1041
+ "pashto",
1042
+ "dataset",
1043
+ "huggingface",
1044
+ "translation"
1045
+ ],
1046
+ "evidence_text": "Dataset ID explicitly states English-to-Pashto and includes Pashto-script sentence column.",
1047
+ "evidence_url": "https://huggingface.co/api/datasets/adnankhan769/english_to_pashto_sentences_dataset",
1048
+ "markers": [
1049
+ "Pashto"
1050
+ ]
1051
+ },
1052
+ {
1053
+ "id": "dataset-hf-saillab-alpaca-pashto-cleaned",
1054
+ "title": "alpaca-pashto-cleaned",
1055
+ "url": "https://huggingface.co/datasets/saillab/alpaca-pashto-cleaned",
1056
+ "category": "dataset",
1057
+ "source": "huggingface",
1058
+ "status": "verified",
1059
+ "summary": "Instruction-style Pashto text dataset suitable for LLM tuning and instruction-following research.",
1060
+ "primary_use": "Pashto instruction tuning and conversational NLP experiments",
1061
+ "tasks": [
1062
+ "nlp",
1063
+ "llm"
1064
+ ],
1065
+ "tags": [
1066
+ "pashto",
1067
+ "dataset",
1068
+ "huggingface",
1069
+ "instruction"
1070
+ ],
1071
+ "evidence_text": "Dataset metadata includes language:ps and dataset name includes Pashto.",
1072
+ "evidence_url": "https://huggingface.co/api/datasets/saillab/alpaca-pashto-cleaned",
1073
+ "markers": [
1074
+ "ps",
1075
+ "Pashto"
1076
+ ]
1077
+ },
1078
+ {
1079
+ "id": "model-hf-ihanif-whisper-base-pashto",
1080
+ "title": "Whisper Base Pashto",
1081
+ "url": "https://huggingface.co/ihanif/whisper-base-pashto",
1082
+ "category": "model",
1083
+ "source": "huggingface",
1084
+ "status": "verified",
1085
+ "summary": "Fine-tuned Whisper Base checkpoint for Pashto ASR with FLEURS ps_af evaluation metadata.",
1086
+ "primary_use": "Pashto ASR baseline and speed-accuracy comparison",
1087
+ "tasks": [
1088
+ "asr"
1089
+ ],
1090
+ "tags": [
1091
+ "pashto",
1092
+ "model",
1093
+ "huggingface",
1094
+ "asr"
1095
+ ],
1096
+ "evidence_text": "Model ID includes Pashto and card metadata references FLEURS config ps_af.",
1097
+ "evidence_url": "https://huggingface.co/api/models/ihanif/whisper-base-pashto",
1098
+ "markers": [
1099
+ "Pashto",
1100
+ "ps_af"
1101
+ ]
1102
+ },
1103
+ {
1104
+ "id": "project-hf-space-zamai-mistral-7b-pashto",
1105
+ "title": "ZamAI-Mistral-7B-Pashto Space",
1106
+ "url": "https://huggingface.co/spaces/tasal9/ZamAI-Mistral-7B-Pashto-space",
1107
+ "category": "project",
1108
+ "source": "huggingface",
1109
+ "status": "verified",
1110
+ "summary": "Gradio project space demonstrating a Pashto-adapted Mistral 7B interface.",
1111
+ "primary_use": "Interactive Pashto LLM project demo",
1112
+ "tasks": [
1113
+ "llm",
1114
+ "demo"
1115
+ ],
1116
+ "tags": [
1117
+ "pashto",
1118
+ "project",
1119
+ "huggingface-space",
1120
+ "llm"
1121
+ ],
1122
+ "evidence_text": "Space title and ID explicitly include Pashto and model card metadata exposes project details.",
1123
+ "evidence_url": "https://huggingface.co/api/spaces/tasal9/ZamAI-Mistral-7B-Pashto-space",
1124
+ "markers": [
1125
+ "Pashto"
1126
+ ]
1127
+ },
1128
+ {
1129
+ "id": "dataset-hf-adnankhan769-proper-dataset-english-2-pashto",
1130
+ "title": "adnankhan769/proper_dataset_english_2_pashto",
1131
+ "url": "https://huggingface.co/datasets/adnankhan769/proper_dataset_english_2_pashto",
1132
+ "category": "dataset",
1133
+ "source": "huggingface",
1134
+ "status": "verified",
1135
+ "summary": "Pashto bilingual/translation dataset discovered from huggingface for MT experimentation.",
1136
+ "primary_use": "Machine translation and bilingual corpus development",
1137
+ "tasks": [
1138
+ "mt"
1139
+ ],
1140
+ "tags": [
1141
+ "pashto",
1142
+ "dataset",
1143
+ "huggingface",
1144
+ "mt"
1145
+ ],
1146
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1147
+ "evidence_url": "https://huggingface.co/datasets/adnankhan769/proper_dataset_english_2_pashto",
1148
+ "markers": [
1149
+ "pashto"
1150
+ ]
1151
+ },
1152
+ {
1153
+ "id": "dataset-hf-ihanif-pashto-asr-wer",
1154
+ "title": "ihanif/pashto_asr_wer",
1155
+ "url": "https://huggingface.co/datasets/ihanif/pashto_asr_wer",
1156
+ "category": "dataset",
1157
+ "source": "huggingface",
1158
+ "status": "verified",
1159
+ "summary": "Pashto speech dataset discovered from huggingface for ASR training and evaluation.",
1160
+ "primary_use": "ASR training and evaluation data source",
1161
+ "tasks": [
1162
+ "asr"
1163
+ ],
1164
+ "tags": [
1165
+ "pashto",
1166
+ "dataset",
1167
+ "huggingface",
1168
+ "asr"
1169
+ ],
1170
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1171
+ "evidence_url": "https://huggingface.co/datasets/ihanif/pashto_asr_wer",
1172
+ "markers": [
1173
+ "pashto"
1174
+ ]
1175
+ },
1176
+ {
1177
+ "id": "dataset-hf-ihanif-pashto-speech-ds",
1178
+ "title": "ihanif/pashto_speech_ds",
1179
+ "url": "https://huggingface.co/datasets/ihanif/pashto_speech_ds",
1180
+ "category": "dataset",
1181
+ "source": "huggingface",
1182
+ "status": "verified",
1183
+ "summary": "Pashto speech dataset discovered from huggingface for ASR training and evaluation.",
1184
+ "primary_use": "ASR training and evaluation data source",
1185
+ "tasks": [
1186
+ "asr"
1187
+ ],
1188
+ "tags": [
1189
+ "pashto",
1190
+ "dataset",
1191
+ "huggingface",
1192
+ "asr"
1193
+ ],
1194
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1195
+ "evidence_url": "https://huggingface.co/datasets/ihanif/pashto_speech_ds",
1196
+ "markers": [
1197
+ "pashto"
1198
+ ]
1199
+ },
1200
+ {
1201
+ "id": "dataset-hf-ihanif-pashto-speech-parquet-10k",
1202
+ "title": "ihanif/pashto_speech_parquet_10k",
1203
+ "url": "https://huggingface.co/datasets/ihanif/pashto_speech_parquet_10k",
1204
+ "category": "dataset",
1205
+ "source": "huggingface",
1206
+ "status": "verified",
1207
+ "summary": "Pashto speech dataset discovered from huggingface for ASR training and evaluation.",
1208
+ "primary_use": "ASR training and evaluation data source",
1209
+ "tasks": [
1210
+ "asr"
1211
+ ],
1212
+ "tags": [
1213
+ "pashto",
1214
+ "dataset",
1215
+ "huggingface",
1216
+ "asr"
1217
+ ],
1218
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1219
+ "evidence_url": "https://huggingface.co/datasets/ihanif/pashto_speech_parquet_10k",
1220
+ "markers": [
1221
+ "pashto"
1222
+ ]
1223
+ },
1224
+ {
1225
+ "id": "dataset-hf-saillab-alpaca-pashto-taco",
1226
+ "title": "saillab/alpaca_pashto_taco",
1227
+ "url": "https://huggingface.co/datasets/saillab/alpaca_pashto_taco",
1228
+ "category": "dataset",
1229
+ "source": "huggingface",
1230
+ "status": "verified",
1231
+ "summary": "Pashto-focused dataset discovered from huggingface candidate sync.",
1232
+ "primary_use": "Instruction tuning and LLM adaptation data source",
1233
+ "tasks": [
1234
+ "llm"
1235
+ ],
1236
+ "tags": [
1237
+ "pashto",
1238
+ "dataset",
1239
+ "huggingface",
1240
+ "llm"
1241
+ ],
1242
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1243
+ "evidence_url": "https://huggingface.co/datasets/saillab/alpaca_pashto_taco",
1244
+ "markers": [
1245
+ "pashto"
1246
+ ]
1247
+ },
1248
+ {
1249
+ "id": "dataset-hf-sherwindesouza-pashto-common-voice-20",
1250
+ "title": "SherwinDesouza/pashto-common-voice-20",
1251
+ "url": "https://huggingface.co/datasets/SherwinDesouza/pashto-common-voice-20",
1252
+ "category": "dataset",
1253
+ "source": "huggingface",
1254
+ "status": "verified",
1255
+ "summary": "Pashto-focused dataset discovered from huggingface candidate sync.",
1256
+ "primary_use": "Pashto data source for NLP experimentation",
1257
+ "tasks": [
1258
+ "nlp"
1259
+ ],
1260
+ "tags": [
1261
+ "pashto",
1262
+ "dataset",
1263
+ "huggingface",
1264
+ "nlp"
1265
+ ],
1266
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1267
+ "evidence_url": "https://huggingface.co/datasets/SherwinDesouza/pashto-common-voice-20",
1268
+ "markers": [
1269
+ "pashto"
1270
+ ]
1271
+ },
1272
+ {
1273
+ "id": "dataset-hf-tasal9-zamai-pashto-dataset",
1274
+ "title": "tasal9/ZamAI_Pashto_Dataset",
1275
+ "url": "https://huggingface.co/datasets/tasal9/ZamAI_Pashto_Dataset",
1276
+ "category": "dataset",
1277
+ "source": "huggingface",
1278
+ "status": "verified",
1279
+ "summary": "Pashto-focused dataset discovered from huggingface candidate sync.",
1280
+ "primary_use": "Pashto data source for NLP experimentation",
1281
+ "tasks": [
1282
+ "nlp"
1283
+ ],
1284
+ "tags": [
1285
+ "pashto",
1286
+ "dataset",
1287
+ "huggingface",
1288
+ "nlp"
1289
+ ],
1290
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1291
+ "evidence_url": "https://huggingface.co/datasets/tasal9/ZamAI_Pashto_Dataset",
1292
+ "markers": [
1293
+ "pashto"
1294
+ ]
1295
+ },
1296
+ {
1297
+ "id": "dataset-kaggle-english-pashto-language-dataset-epld",
1298
+ "title": "English-Pashto Language Dataset (EPLD)",
1299
+ "url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
1300
+ "category": "dataset",
1301
+ "source": "kaggle",
1302
+ "status": "verified",
1303
+ "summary": "Pashto bilingual/translation dataset discovered from kaggle for MT experimentation.",
1304
+ "primary_use": "Machine translation and bilingual corpus development",
1305
+ "tasks": [
1306
+ "mt"
1307
+ ],
1308
+ "tags": [
1309
+ "pashto",
1310
+ "dataset",
1311
+ "kaggle",
1312
+ "mt"
1313
+ ],
1314
+ "evidence_text": "Kaggle dataset title/subtitle includes Pashto keyword.",
1315
+ "evidence_url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
1316
+ "markers": [
1317
+ "Pashto"
1318
+ ]
1319
+ },
1320
+ {
1321
+ "id": "dataset-kaggle-katib-s-pashto-text-imagebase-kpti",
1322
+ "title": "Katib's Pashto Text Imagebase (KPTI)",
1323
+ "url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
1324
+ "category": "dataset",
1325
+ "source": "kaggle",
1326
+ "status": "verified",
1327
+ "summary": "Pashto OCR-oriented dataset discovered from kaggle for document and script recognition work.",
1328
+ "primary_use": "OCR training and evaluation data source",
1329
+ "tasks": [
1330
+ "ocr"
1331
+ ],
1332
+ "tags": [
1333
+ "pashto",
1334
+ "dataset",
1335
+ "kaggle",
1336
+ "ocr"
1337
+ ],
1338
+ "evidence_text": "Kaggle dataset title/subtitle includes Pashto keyword.",
1339
+ "evidence_url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
1340
+ "markers": [
1341
+ "Pashto"
1342
+ ]
1343
+ },
1344
+ {
1345
+ "id": "dataset-kaggle-pashto-ocr",
1346
+ "title": "Pashto OCR",
1347
+ "url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
1348
+ "category": "dataset",
1349
+ "source": "kaggle",
1350
+ "status": "verified",
1351
+ "summary": "Pashto OCR-oriented dataset discovered from kaggle for document and script recognition work.",
1352
+ "primary_use": "OCR training and evaluation data source",
1353
+ "tasks": [
1354
+ "ocr"
1355
+ ],
1356
+ "tags": [
1357
+ "pashto",
1358
+ "dataset",
1359
+ "kaggle",
1360
+ "ocr"
1361
+ ],
1362
+ "evidence_text": "Kaggle dataset title/subtitle includes Pashto keyword.",
1363
+ "evidence_url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
1364
+ "markers": [
1365
+ "Pashto"
1366
+ ]
1367
+ },
1368
+ {
1369
+ "id": "dataset-kaggle-common-voice-24-0-pashto-speech-dataset",
1370
+ "title": "Common Voice 24.0: Pashto Speech Dataset",
1371
+ "url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
1372
+ "category": "dataset",
1373
+ "source": "kaggle",
1374
+ "status": "verified",
1375
+ "summary": "Pashto speech dataset discovered from kaggle for ASR training and evaluation.",
1376
+ "primary_use": "ASR training and evaluation data source",
1377
+ "tasks": [
1378
+ "asr"
1379
+ ],
1380
+ "tags": [
1381
+ "pashto",
1382
+ "dataset",
1383
+ "kaggle",
1384
+ "asr"
1385
+ ],
1386
+ "evidence_text": "Kaggle dataset title/subtitle includes Pashto keyword.",
1387
+ "evidence_url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
1388
+ "markers": [
1389
+ "Pashto"
1390
+ ]
1391
+ },
1392
+ {
1393
+ "id": "model-hf-ihanif-pashto-asr-base",
1394
+ "title": "ihanif/pashto-asr-base",
1395
+ "url": "https://huggingface.co/ihanif/pashto-asr-base",
1396
+ "category": "model",
1397
+ "source": "huggingface",
1398
+ "status": "verified",
1399
+ "summary": "Pashto ASR model checkpoint discovered from huggingface candidate sync.",
1400
+ "primary_use": "Pashto ASR baseline and model comparison",
1401
+ "tasks": [
1402
+ "asr"
1403
+ ],
1404
+ "tags": [
1405
+ "pashto",
1406
+ "model",
1407
+ "huggingface",
1408
+ "asr"
1409
+ ],
1410
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1411
+ "evidence_url": "https://huggingface.co/ihanif/pashto-asr-base",
1412
+ "markers": [
1413
+ "pashto"
1414
+ ]
1415
+ },
1416
+ {
1417
+ "id": "model-hf-ihanif-wav2vec2-xls-r-300m-pashto-lm",
1418
+ "title": "ihanif/wav2vec2-xls-r-300m-pashto-lm",
1419
+ "url": "https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto-lm",
1420
+ "category": "model",
1421
+ "source": "huggingface",
1422
+ "status": "verified",
1423
+ "summary": "Pashto ASR model checkpoint discovered from huggingface candidate sync.",
1424
+ "primary_use": "Pashto ASR baseline and model comparison",
1425
+ "tasks": [
1426
+ "asr"
1427
+ ],
1428
+ "tags": [
1429
+ "pashto",
1430
+ "model",
1431
+ "huggingface",
1432
+ "asr"
1433
+ ],
1434
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1435
+ "evidence_url": "https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto-lm",
1436
+ "markers": [
1437
+ "pashto"
1438
+ ]
1439
+ },
1440
+ {
1441
+ "id": "model-hf-ihanif-whisper-large-pashto",
1442
+ "title": "ihanif/whisper-large-pashto",
1443
+ "url": "https://huggingface.co/ihanif/whisper-large-pashto",
1444
+ "category": "model",
1445
+ "source": "huggingface",
1446
+ "status": "verified",
1447
+ "summary": "Pashto ASR model checkpoint discovered from huggingface candidate sync.",
1448
+ "primary_use": "Pashto ASR baseline and model comparison",
1449
+ "tasks": [
1450
+ "asr"
1451
+ ],
1452
+ "tags": [
1453
+ "pashto",
1454
+ "model",
1455
+ "huggingface",
1456
+ "asr"
1457
+ ],
1458
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1459
+ "evidence_url": "https://huggingface.co/ihanif/whisper-large-pashto",
1460
+ "markers": [
1461
+ "pashto"
1462
+ ]
1463
+ },
1464
+ {
1465
+ "id": "model-hf-ihanif-whisper-medium-pashto-3e-7",
1466
+ "title": "ihanif/whisper-medium-pashto-3e-7",
1467
+ "url": "https://huggingface.co/ihanif/whisper-medium-pashto-3e-7",
1468
+ "category": "model",
1469
+ "source": "huggingface",
1470
+ "status": "verified",
1471
+ "summary": "Pashto ASR model checkpoint discovered from huggingface candidate sync.",
1472
+ "primary_use": "Pashto ASR baseline and model comparison",
1473
+ "tasks": [
1474
+ "asr"
1475
+ ],
1476
+ "tags": [
1477
+ "pashto",
1478
+ "model",
1479
+ "huggingface",
1480
+ "asr"
1481
+ ],
1482
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1483
+ "evidence_url": "https://huggingface.co/ihanif/whisper-medium-pashto-3e-7",
1484
+ "markers": [
1485
+ "pashto"
1486
+ ]
1487
+ },
1488
+ {
1489
+ "id": "model-hf-ihanif-whisper-small-pashto",
1490
+ "title": "ihanif/whisper-small-pashto",
1491
+ "url": "https://huggingface.co/ihanif/whisper-small-pashto",
1492
+ "category": "model",
1493
+ "source": "huggingface",
1494
+ "status": "verified",
1495
+ "summary": "Pashto ASR model checkpoint discovered from huggingface candidate sync.",
1496
+ "primary_use": "Pashto ASR baseline and model comparison",
1497
+ "tasks": [
1498
+ "asr"
1499
+ ],
1500
+ "tags": [
1501
+ "pashto",
1502
+ "model",
1503
+ "huggingface",
1504
+ "asr"
1505
+ ],
1506
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1507
+ "evidence_url": "https://huggingface.co/ihanif/whisper-small-pashto",
1508
+ "markers": [
1509
+ "pashto"
1510
+ ]
1511
+ },
1512
+ {
1513
+ "id": "model-hf-ihanif-xls-r-1b-pashto",
1514
+ "title": "ihanif/xls-r-1b-pashto",
1515
+ "url": "https://huggingface.co/ihanif/xls-r-1b-pashto",
1516
+ "category": "model",
1517
+ "source": "huggingface",
1518
+ "status": "verified",
1519
+ "summary": "Pashto ASR model checkpoint discovered from huggingface candidate sync.",
1520
+ "primary_use": "Pashto ASR baseline and model comparison",
1521
+ "tasks": [
1522
+ "asr"
1523
+ ],
1524
+ "tags": [
1525
+ "pashto",
1526
+ "model",
1527
+ "huggingface",
1528
+ "asr"
1529
+ ],
1530
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1531
+ "evidence_url": "https://huggingface.co/ihanif/xls-r-1b-pashto",
1532
+ "markers": [
1533
+ "pashto"
1534
+ ]
1535
+ },
1536
+ {
1537
+ "id": "model-hf-ijazulhaq-bert-base-pashto-v1",
1538
+ "title": "ijazulhaq/bert-base-pashto-v1",
1539
+ "url": "https://huggingface.co/ijazulhaq/bert-base-pashto-v1",
1540
+ "category": "model",
1541
+ "source": "huggingface",
1542
+ "status": "verified",
1543
+ "summary": "Pashto NLP model checkpoint discovered from huggingface candidate sync.",
1544
+ "primary_use": "Pashto model baseline for downstream NLP tasks",
1545
+ "tasks": [
1546
+ "nlp"
1547
+ ],
1548
+ "tags": [
1549
+ "pashto",
1550
+ "model",
1551
+ "huggingface",
1552
+ "nlp"
1553
+ ],
1554
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1555
+ "evidence_url": "https://huggingface.co/ijazulhaq/bert-base-pashto-v1",
1556
+ "markers": [
1557
+ "pashto"
1558
+ ]
1559
+ },
1560
+ {
1561
+ "id": "project-hf-space-ihanif-wav2vec2-bert-pashto-asr",
1562
+ "title": "ihanif/wav2vec2-bert-pashto-asr",
1563
+ "url": "https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr",
1564
+ "category": "project",
1565
+ "source": "huggingface",
1566
+ "status": "verified",
1567
+ "summary": "Pashto-focused interactive project discovered from huggingface for demonstration and quick evaluation.",
1568
+ "primary_use": "Interactive Pashto demo and quick qualitative validation",
1569
+ "tasks": [
1570
+ "asr",
1571
+ "nlp",
1572
+ "demo"
1573
+ ],
1574
+ "tags": [
1575
+ "pashto",
1576
+ "project",
1577
+ "huggingface",
1578
+ "asr",
1579
+ "nlp",
1580
+ "demo"
1581
+ ],
1582
+ "evidence_text": "Matched by Pashto keyword in Hugging Face Spaces search.",
1583
+ "evidence_url": "https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr",
1584
+ "markers": [
1585
+ "pashto"
1586
+ ]
1587
+ },
1588
+ {
1589
+ "id": "project-hf-space-nasirkhansayyad-pashto-whisper-demo",
1590
+ "title": "nasirkhansayyad/pashto-whisper-demo",
1591
+ "url": "https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo",
1592
+ "category": "project",
1593
+ "source": "huggingface",
1594
+ "status": "verified",
1595
+ "summary": "Pashto-focused interactive project discovered from huggingface for demonstration and quick evaluation.",
1596
+ "primary_use": "Interactive Pashto demo and quick qualitative validation",
1597
+ "tasks": [
1598
+ "asr",
1599
+ "demo"
1600
+ ],
1601
+ "tags": [
1602
+ "pashto",
1603
+ "project",
1604
+ "huggingface",
1605
+ "asr",
1606
+ "demo"
1607
+ ],
1608
+ "evidence_text": "Matched by Pashto keyword in Hugging Face Spaces search.",
1609
+ "evidence_url": "https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo",
1610
+ "markers": [
1611
+ "pashto"
1612
+ ]
1613
+ },
1614
+ {
1615
+ "id": "project-hf-space-tasal9-zamai-phi3-mini-pashto-demo",
1616
+ "title": "tasal9/ZamAI-Phi3-Mini-Pashto-Demo",
1617
+ "url": "https://huggingface.co/spaces/tasal9/ZamAI-Phi3-Mini-Pashto-Demo",
1618
+ "category": "project",
1619
+ "source": "huggingface",
1620
+ "status": "verified",
1621
+ "summary": "Pashto-focused interactive project discovered from huggingface for demonstration and quick evaluation.",
1622
+ "primary_use": "Interactive Pashto demo and quick qualitative validation",
1623
+ "tasks": [
1624
+ "llm",
1625
+ "demo"
1626
+ ],
1627
+ "tags": [
1628
+ "pashto",
1629
+ "project",
1630
+ "huggingface",
1631
+ "llm",
1632
+ "demo"
1633
+ ],
1634
+ "evidence_text": "Matched by Pashto keyword in Hugging Face Spaces search.",
1635
+ "evidence_url": "https://huggingface.co/spaces/tasal9/ZamAI-Phi3-Mini-Pashto-Demo",
1636
+ "markers": [
1637
+ "pashto"
1638
+ ]
1639
+ },
1640
+ {
1641
+ "id": "project-hf-space-umar4321-pashto-to-english-urdu",
1642
+ "title": "Umar4321/Pashto-To-English-Urdu",
1643
+ "url": "https://huggingface.co/spaces/Umar4321/Pashto-To-English-Urdu",
1644
+ "category": "project",
1645
+ "source": "huggingface",
1646
+ "status": "verified",
1647
+ "summary": "Pashto-focused interactive project discovered from huggingface for demonstration and quick evaluation.",
1648
+ "primary_use": "Interactive Pashto demo and quick qualitative validation",
1649
+ "tasks": [
1650
+ "mt",
1651
+ "demo"
1652
+ ],
1653
+ "tags": [
1654
+ "pashto",
1655
+ "project",
1656
+ "huggingface",
1657
+ "mt",
1658
+ "demo"
1659
+ ],
1660
+ "evidence_text": "Matched by Pashto keyword in Hugging Face Spaces search.",
1661
+ "evidence_url": "https://huggingface.co/spaces/Umar4321/Pashto-To-English-Urdu",
1662
+ "markers": [
1663
+ "pashto"
1664
+ ]
1665
+ },
1666
+ {
1667
+ "id": "project-github-fazlullahmamond-pashto-typing",
1668
+ "title": "Fazlullahmamond/Pashto-Typing",
1669
+ "url": "https://github.com/Fazlullahmamond/Pashto-Typing",
1670
+ "category": "project",
1671
+ "source": "github",
1672
+ "status": "verified",
1673
+ "summary": "Pashto-focused interactive project discovered from github for demonstration and quick evaluation.",
1674
+ "primary_use": "Interactive Pashto demo and quick qualitative validation",
1675
+ "tasks": [
1676
+ "demo"
1677
+ ],
1678
+ "tags": [
1679
+ "pashto",
1680
+ "project",
1681
+ "github",
1682
+ "demo"
1683
+ ],
1684
+ "evidence_text": "Repository metadata (name/description/topics) includes Pashto markers.",
1685
+ "evidence_url": "https://github.com/Fazlullahmamond/Pashto-Typing",
1686
+ "markers": [
1687
+ "pashto"
1688
+ ]
1689
+ },
1690
+ {
1691
+ "id": "project-github-ihyacommunity-khushkhat-extension",
1692
+ "title": "IhyaCommunity/Khushkhat-Extension",
1693
+ "url": "https://github.com/IhyaCommunity/Khushkhat-Extension",
1694
+ "category": "project",
1695
+ "source": "github",
1696
+ "status": "verified",
1697
+ "summary": "Pashto-focused interactive project discovered from github for demonstration and quick evaluation.",
1698
+ "primary_use": "Interactive Pashto demo and quick qualitative validation",
1699
+ "tasks": [
1700
+ "demo"
1701
+ ],
1702
+ "tags": [
1703
+ "pashto",
1704
+ "project",
1705
+ "github",
1706
+ "demo"
1707
+ ],
1708
+ "evidence_text": "Repository metadata (name/description/topics) includes Pashto markers.",
1709
+ "evidence_url": "https://github.com/IhyaCommunity/Khushkhat-Extension",
1710
+ "markers": [
1711
+ "pashto"
1712
+ ]
1713
+ },
1714
+ {
1715
+ "id": "paper-s2-benchmarking-whisper-for-low-resource-speech-recognition-an-n-shot-evaluation-on-pashto-pu",
1716
+ "title": "Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu",
1717
+ "url": "https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693",
1718
+ "category": "paper",
1719
+ "source": "other",
1720
+ "status": "verified",
1721
+ "summary": "Pashto language technology paper discovered from other for research reference.",
1722
+ "primary_use": "Pashto research reference for methods and benchmarking",
1723
+ "tasks": [
1724
+ "asr",
1725
+ "mt"
1726
+ ],
1727
+ "tags": [
1728
+ "pashto",
1729
+ "paper",
1730
+ "other",
1731
+ "asr",
1732
+ "mt"
1733
+ ],
1734
+ "evidence_text": "Matched by Semantic Scholar query: pashto.",
1735
+ "evidence_url": "https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693",
1736
+ "markers": [
1737
+ "pashto"
1738
+ ]
1739
+ },
1740
+ {
1741
+ "id": "paper-s2-deep-learning-based-detection-of-one-and-two-column-textual-blocks-in-camera-captured-pash",
1742
+ "title": "Deep Learning-Based Detection of One and Two-Column Textual Blocks in Camera-Captured Pashto Documents Images",
1743
+ "url": "https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182",
1744
+ "category": "paper",
1745
+ "source": "other",
1746
+ "status": "verified",
1747
+ "summary": "Pashto language technology paper discovered from other for research reference.",
1748
+ "primary_use": "Pashto research reference for methods and benchmarking",
1749
+ "tasks": [
1750
+ "ocr"
1751
+ ],
1752
+ "tags": [
1753
+ "pashto",
1754
+ "paper",
1755
+ "other",
1756
+ "ocr"
1757
+ ],
1758
+ "evidence_text": "Matched by Semantic Scholar query: pashto.",
1759
+ "evidence_url": "https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182",
1760
+ "markers": [
1761
+ "pashto"
1762
+ ]
1763
+ },
1764
+ {
1765
+ "id": "paper-s2-out-of-vocabulary-pashto-spell-checker-using-morphological-operations",
1766
+ "title": "Out-of-Vocabulary Pashto Spell Checker using Morphological Operations",
1767
+ "url": "https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7",
1768
+ "category": "paper",
1769
+ "source": "other",
1770
+ "status": "verified",
1771
+ "summary": "Pashto language technology paper discovered from other for research reference.",
1772
+ "primary_use": "Pashto research reference for methods and benchmarking",
1773
+ "tasks": [
1774
+ "nlp"
1775
+ ],
1776
+ "tags": [
1777
+ "pashto",
1778
+ "paper",
1779
+ "other",
1780
+ "nlp"
1781
+ ],
1782
+ "evidence_text": "Matched by Semantic Scholar query: pashto.",
1783
+ "evidence_url": "https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7",
1784
+ "markers": [
1785
+ "pashto"
1786
+ ]
1787
+ },
1788
+ {
1789
+ "id": "paper-s2-pashto-shallow-parsing-a-deep-learning-approach",
1790
+ "title": "Pashto Shallow Parsing: A Deep Learning Approach",
1791
+ "url": "https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5",
1792
+ "category": "paper",
1793
+ "source": "other",
1794
+ "status": "verified",
1795
+ "summary": "Pashto language technology paper discovered from other for research reference.",
1796
+ "primary_use": "Pashto research reference for methods and benchmarking",
1797
+ "tasks": [
1798
+ "nlp"
1799
+ ],
1800
+ "tags": [
1801
+ "pashto",
1802
+ "paper",
1803
+ "other",
1804
+ "nlp"
1805
+ ],
1806
+ "evidence_text": "Matched by Semantic Scholar query: pashto.",
1807
+ "evidence_url": "https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5",
1808
+ "markers": [
1809
+ "pashto"
1810
+ ]
1811
+ },
1812
+ {
1813
+ "id": "paper-s2-pos-tagging-of-low-resource-pashto-language-annotated-corpus-and-bert-based-model",
1814
+ "title": "POS tagging of low-resource Pashto language: annotated corpus and BERT-based model",
1815
+ "url": "https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769",
1816
+ "category": "paper",
1817
+ "source": "other",
1818
+ "status": "verified",
1819
+ "summary": "Pashto language technology paper discovered from other for research reference.",
1820
+ "primary_use": "Pashto research reference for methods and benchmarking",
1821
+ "tasks": [
1822
+ "nlp"
1823
+ ],
1824
+ "tags": [
1825
+ "pashto",
1826
+ "paper",
1827
+ "other",
1828
+ "nlp"
1829
+ ],
1830
+ "evidence_text": "Matched by Semantic Scholar query: pashto.",
1831
+ "evidence_url": "https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769",
1832
+ "markers": [
1833
+ "pashto"
1834
+ ]
1835
+ },
1836
+ {
1837
+ "id": "paper-arxiv-enhancing-pashto-text-classification-using-language-processing-techniques-for-single-and-m",
1838
+ "title": "Enhancing Pashto Text Classification using Language Processing Techniques for Single And Multi-Label Analysis",
1839
+ "url": "http://arxiv.org/abs/2305.03201v1",
1840
+ "category": "paper",
1841
+ "source": "arxiv",
1842
+ "status": "verified",
1843
+ "summary": "Pashto language technology paper discovered from arxiv for research reference.",
1844
+ "primary_use": "Pashto research reference for methods and benchmarking",
1845
+ "tasks": [
1846
+ "nlp"
1847
+ ],
1848
+ "tags": [
1849
+ "pashto",
1850
+ "paper",
1851
+ "arxiv",
1852
+ "nlp"
1853
+ ],
1854
+ "evidence_text": "Matched by arXiv query: all:pashto.",
1855
+ "evidence_url": "http://arxiv.org/abs/2305.03201v1",
1856
+ "markers": [
1857
+ "pashto"
1858
+ ]
1859
+ },
1860
+ {
1861
+ "id": "paper-arxiv-knn-and-ann-based-recognition-of-handwritten-pashto-letters-using-zoning-features",
1862
+ "title": "KNN and ANN-based Recognition of Handwritten Pashto Letters using Zoning Features",
1863
+ "url": "http://arxiv.org/abs/1904.03391v2",
1864
+ "category": "paper",
1865
+ "source": "arxiv",
1866
+ "status": "verified",
1867
+ "summary": "Pashto language technology paper discovered from arxiv for research reference.",
1868
+ "primary_use": "Pashto research reference for methods and benchmarking",
1869
+ "tasks": [
1870
+ "ocr"
1871
+ ],
1872
+ "tags": [
1873
+ "pashto",
1874
+ "paper",
1875
+ "arxiv",
1876
+ "ocr"
1877
+ ],
1878
+ "evidence_text": "Matched by arXiv query: all:pashto.",
1879
+ "evidence_url": "http://arxiv.org/abs/1904.03391v2",
1880
+ "markers": [
1881
+ "pashto"
1882
+ ]
1883
  }
1884
  ]
1885
  }
resources/README.md CHANGED
@@ -3,13 +3,13 @@
3
  Structured, Pashto-focused resource tracking lives in this folder.
4
 
5
  ## Sections
6
- - Datasets (14): [datasets/README.md](datasets/README.md)
7
- - Models (9): [models/README.md](models/README.md)
8
  - Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
9
  - Tools (2): [tools/README.md](tools/README.md)
10
- - Papers (4): [papers/README.md](papers/README.md)
11
- - Projects (3): [projects/README.md](projects/README.md)
12
- - Code (1): [codes/README.md](codes/README.md)
13
 
14
  ## Machine-Readable Catalog
15
  - Canonical catalog: [catalog/resources.json](catalog/resources.json)
@@ -22,4 +22,4 @@ Structured, Pashto-focused resource tracking lives in this folder.
22
  - Run `python scripts/validate_resource_catalog.py` before opening a PR.
23
  - Run `python scripts/generate_resource_views.py` after catalog changes.
24
 
25
- Verified resource count: `37`
 
3
  Structured, Pashto-focused resource tracking lives in this folder.
4
 
5
  ## Sections
6
+ - Datasets (28): [datasets/README.md](datasets/README.md)
7
+ - Models (18): [models/README.md](models/README.md)
8
  - Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
9
  - Tools (2): [tools/README.md](tools/README.md)
10
+ - Papers (12): [papers/README.md](papers/README.md)
11
+ - Projects (11): [projects/README.md](projects/README.md)
12
+ - Code (2): [codes/README.md](codes/README.md)
13
 
14
  ## Machine-Readable Catalog
15
  - Canonical catalog: [catalog/resources.json](catalog/resources.json)
 
22
  - Run `python scripts/validate_resource_catalog.py` before opening a PR.
23
  - Run `python scripts/generate_resource_views.py` after catalog changes.
24
 
25
+ Verified resource count: `77`
resources/catalog/pending_candidates.json CHANGED
@@ -1,28 +1,52 @@
1
  {
2
- "generated_on": "2026-02-15T10:54:46.151446+00:00",
3
  "sources": [
4
  "kaggle-datasets",
5
  "huggingface-datasets",
6
  "huggingface-models",
7
  "huggingface-spaces",
8
  "github-repositories",
 
9
  "semantic-scholar"
10
  ],
11
- "candidate_count": 114,
12
  "candidates": [
13
  {
14
- "id": "candidate-s2-a-comparative-analysis-of-pashto-ghazals-and-english-sonnets-in-17th-century",
15
- "title": "A Comparative Analysis of Pashto Ghazals and English Sonnets in 17th Century",
16
- "url": "https://www.semanticscholar.org/paper/55b044485b2f134c69c9b9b6dfeaa7e71e704b3d",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  "category": "paper",
18
  "source": "other",
19
  "status": "candidate",
20
- "summary": "This study conducts a comparative analysis of Pashto ghazals and English sonnets in 17th century  to explore their unique structures, themes, and cultural significance. Utilizing descriptive and comparative methods, the study examines how t",
21
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
22
  "tasks": [],
23
  "pashto_evidence": {
24
  "evidence_text": "Matched by Semantic Scholar query: pashto.",
25
- "evidence_url": "https://www.semanticscholar.org/paper/55b044485b2f134c69c9b9b6dfeaa7e71e704b3d",
26
  "markers": [
27
  "pashto"
28
  ]
@@ -196,29 +220,6 @@
196
  "space"
197
  ]
198
  },
199
- {
200
- "id": "candidate-hf-dataset-adnankhan769-english-to-pashto-sentences-dataset",
201
- "title": "adnankhan769/english_to_pashto_sentences_dataset",
202
- "url": "https://huggingface.co/datasets/adnankhan769/english_to_pashto_sentences_dataset",
203
- "category": "dataset",
204
- "source": "huggingface",
205
- "status": "candidate",
206
- "summary": "Candidate dataset returned from Hugging Face search for Pashto.",
207
- "primary_use": "Needs maintainer review before promotion to verified catalog.",
208
- "tasks": [],
209
- "pashto_evidence": {
210
- "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
211
- "evidence_url": "https://huggingface.co/datasets/adnankhan769/english_to_pashto_sentences_dataset",
212
- "markers": [
213
- "pashto"
214
- ]
215
- },
216
- "tags": [
217
- "pashto",
218
- "candidate",
219
- "dataset"
220
- ]
221
- },
222
  {
223
  "id": "candidate-hf-dataset-adnankhan769-proper-dataset-english-2-pashto",
224
  "title": "adnankhan769/proper_dataset_english_2_pashto",
@@ -526,18 +527,41 @@
526
  ]
527
  },
528
  {
529
- "id": "candidate-s2-child-marriage-as-a-major-concern-in-pashto-poetry",
530
- "title": "Child Marriage As A Major Concern in Pashto Poetry",
531
- "url": "https://www.semanticscholar.org/paper/87fea719c6b5e4a7c65ad552fffa6b2bffef2580",
532
  "category": "paper",
533
- "source": "other",
534
  "status": "candidate",
535
- "summary": "Child marriage is a major concern we read in Pashto poetry. They believe fewer financial resources left people in desperate need to survive, and it has caused both girls and boys to pay the price. Poor parents give away their girls to wealt",
536
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
537
  "tasks": [],
538
  "pashto_evidence": {
539
- "evidence_text": "Matched by Semantic Scholar query: pashto.",
540
- "evidence_url": "https://www.semanticscholar.org/paper/87fea719c6b5e4a7c65ad552fffa6b2bffef2580",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
541
  "markers": [
542
  "pashto"
543
  ]
@@ -596,18 +620,18 @@
596
  ]
597
  },
598
  {
599
- "id": "candidate-s2-comparative-study-of-adjectives-in-pashto-and-dari-as-cognate-languages",
600
- "title": "Comparative Study of Adjectives in Pashto and Dari as Cognate Languages",
601
- "url": "https://www.semanticscholar.org/paper/558e9dd7d4027be391a39f5e5ef988cf05039dc7",
602
  "category": "paper",
603
  "source": "other",
604
  "status": "candidate",
605
- "summary": "In Pashto, adjectives align with nouns in terms of number, gender, and case, which distinguishthem from adjectives in Modern Persian-Dari. In both Old and Modern Persian, unlike in Pashto, adjectives are divided into two categories: attribu",
606
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
607
  "tasks": [],
608
  "pashto_evidence": {
609
  "evidence_text": "Matched by Semantic Scholar query: pashto.",
610
- "evidence_url": "https://www.semanticscholar.org/paper/558e9dd7d4027be391a39f5e5ef988cf05039dc7",
611
  "markers": [
612
  "pashto"
613
  ]
@@ -619,18 +643,41 @@
619
  ]
620
  },
621
  {
622
- "id": "candidate-s2-deep-learning-based-detection-of-one-and-two-column-textual-blocks-in-camera-cap",
623
- "title": "Deep Learning-Based Detection of One and Two-Column Textual Blocks in Camera-Captured Pashto Documents Images",
624
- "url": "https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182",
625
  "category": "paper",
626
  "source": "other",
627
  "status": "candidate",
628
- "summary": "The paper explores the layout analysis and classification task of Pashto document images, a field with limited research due to the language’s low-resource status. It uses Document Image Analysis (DIA) to detect one-column and two-column tex",
629
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
630
  "tasks": [],
631
  "pashto_evidence": {
632
  "evidence_text": "Matched by Semantic Scholar query: pashto.",
633
- "evidence_url": "https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
634
  "markers": [
635
  "pashto"
636
  ]
@@ -712,6 +759,52 @@
712
  "kaggle"
713
  ]
714
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
715
  {
716
  "id": "candidate-s2-evaluating-the-message-of-pashto-landay-according-to-the-audience",
717
  "title": "Evaluating the Message of Pashto Landay According to the Audience",
@@ -735,6 +828,29 @@
735
  "paper"
736
  ]
737
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
738
  {
739
  "id": "candidate-gh-project-fazlullahmamond-hadith-collection-pashto",
740
  "title": "Fazlullahmamond/hadith-collection-pashto",
@@ -813,18 +929,41 @@
813
  ]
814
  },
815
  {
816
- "id": "candidate-s2-gemination-in-pashto",
817
- "title": "Gemination in Pashto",
818
- "url": "https://www.semanticscholar.org/paper/ccf72dc1bcd0a0cd3a4b97cc7fe1830c37922c64",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
819
  "category": "paper",
820
  "source": "other",
821
  "status": "candidate",
822
- "summary": "The purpose of the present study was to analyze gemination in Pashto. For this purpose, first, data was collected generally from elder native speakers who speak the Yousafzai dialect. The collected data then was verified and discussed sever",
823
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
824
  "tasks": [],
825
  "pashto_evidence": {
826
  "evidence_text": "Matched by Semantic Scholar query: pashto.",
827
- "evidence_url": "https://www.semanticscholar.org/paper/ccf72dc1bcd0a0cd3a4b97cc7fe1830c37922c64",
828
  "markers": [
829
  "pashto"
830
  ]
@@ -909,30 +1048,6 @@
909
  "paper"
910
  ]
911
  },
912
- {
913
- "id": "candidate-hf-project-ihanif-pashto-asr",
914
- "title": "ihanif/pashto-asr",
915
- "url": "https://huggingface.co/spaces/ihanif/pashto-asr",
916
- "category": "project",
917
- "source": "huggingface",
918
- "status": "candidate",
919
- "summary": "Candidate project app returned from Hugging Face Spaces Pashto search.",
920
- "primary_use": "Needs maintainer review before promotion to verified catalog.",
921
- "tasks": [],
922
- "pashto_evidence": {
923
- "evidence_text": "Matched by Pashto keyword in Hugging Face Spaces search.",
924
- "evidence_url": "https://huggingface.co/spaces/ihanif/pashto-asr",
925
- "markers": [
926
- "pashto"
927
- ]
928
- },
929
- "tags": [
930
- "pashto",
931
- "candidate",
932
- "project",
933
- "space"
934
- ]
935
- },
936
  {
937
  "id": "candidate-hf-model-ihanif-pashto-asr-base",
938
  "title": "ihanif/pashto-asr-base",
@@ -1188,29 +1303,6 @@
1188
  "model"
1189
  ]
1190
  },
1191
- {
1192
- "id": "candidate-hf-model-ihanif-whisper-base-pashto",
1193
- "title": "ihanif/whisper-base-pashto",
1194
- "url": "https://huggingface.co/ihanif/whisper-base-pashto",
1195
- "category": "model",
1196
- "source": "huggingface",
1197
- "status": "candidate",
1198
- "summary": "Candidate model returned from Hugging Face search for Pashto.",
1199
- "primary_use": "Needs maintainer review before promotion to verified catalog.",
1200
- "tasks": [],
1201
- "pashto_evidence": {
1202
- "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1203
- "evidence_url": "https://huggingface.co/ihanif/whisper-base-pashto",
1204
- "markers": [
1205
- "pashto"
1206
- ]
1207
- },
1208
- "tags": [
1209
- "pashto",
1210
- "candidate",
1211
- "model"
1212
- ]
1213
- },
1214
  {
1215
  "id": "candidate-hf-model-ihanif-whisper-large-pashto",
1216
  "title": "ihanif/whisper-large-pashto",
@@ -1447,6 +1539,29 @@
1447
  "space"
1448
  ]
1449
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1450
  {
1451
  "id": "candidate-hf-model-jawaria-wav2vec2-large-xls-r-300m-pashto-colab-final-1",
1452
  "title": "Jawaria/wav2vec2-large-xls-r-300m-pashto-colab-final-1",
@@ -1655,6 +1770,29 @@
1655
  "kaggle"
1656
  ]
1657
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1658
  {
1659
  "id": "candidate-hf-dataset-koochikoo25-pashto-concatenated",
1660
  "title": "koochikoo25/Pashto-Concatenated",
@@ -1724,29 +1862,6 @@
1724
  "model"
1725
  ]
1726
  },
1727
- {
1728
- "id": "candidate-s2-language-of-resistance-in-pashto-poetry-during-the-war-on-terror",
1729
- "title": "Language of Resistance in Pashto Poetry during the War on Terror",
1730
- "url": "https://www.semanticscholar.org/paper/23dbf301cdadbb3e1e309ed232baf5cfb2b6414b",
1731
- "category": "paper",
1732
- "source": "other",
1733
- "status": "candidate",
1734
- "summary": "The paper explores the compelling nature of Pashto poetry as a weapon of resistance in the War on Terror, how it has been used to reveal Pashtun identity, political protest, and cultural strength. With military activities dismantling the Pa",
1735
- "primary_use": "Needs maintainer review before promotion to verified catalog.",
1736
- "tasks": [],
1737
- "pashto_evidence": {
1738
- "evidence_text": "Matched by Semantic Scholar query: pashto.",
1739
- "evidence_url": "https://www.semanticscholar.org/paper/23dbf301cdadbb3e1e309ed232baf5cfb2b6414b",
1740
- "markers": [
1741
- "pashto"
1742
- ]
1743
- },
1744
- "tags": [
1745
- "pashto",
1746
- "candidate",
1747
- "paper"
1748
- ]
1749
- },
1750
  {
1751
  "id": "candidate-gh-project-lecramyajiv-fonts-arabic-extra",
1752
  "title": "lecramyajiv/fonts-arabic-extra",
@@ -1878,56 +1993,6 @@
1878
  "quran"
1879
  ]
1880
  },
1881
- {
1882
- "id": "candidate-gh-code-mrychlik-worldly-ocr",
1883
- "title": "mrychlik/worldly-ocr",
1884
- "url": "https://github.com/mrychlik/worldly-ocr",
1885
- "category": "code",
1886
- "source": "github",
1887
- "status": "candidate",
1888
- "summary": "Text-to-image conversion (OCR) for Pashto and Chinese, with a view towards comprehensive, multi-lingual OCR",
1889
- "primary_use": "Needs maintainer review before promotion to verified catalog.",
1890
- "tasks": [],
1891
- "pashto_evidence": {
1892
- "evidence_text": "Repository metadata (name/description/topics) includes Pashto markers.",
1893
- "evidence_url": "https://github.com/mrychlik/worldly-ocr",
1894
- "markers": [
1895
- "pashto"
1896
- ]
1897
- },
1898
- "tags": [
1899
- "pashto",
1900
- "candidate",
1901
- "code",
1902
- "github",
1903
- "multi-lingual-ocr",
1904
- "ocr",
1905
- "pashto"
1906
- ]
1907
- },
1908
- {
1909
- "id": "candidate-s2-multilingual-interplay-and-the-influence-of-the-official-languages-on-the-use-an",
1910
- "title": "Multilingual interplay and the influence of the official languages on the use and transmission of the regional language Pashto: a case study of a Pashtun family in Pakistan",
1911
- "url": "https://www.semanticscholar.org/paper/2b42be99fa7ad002efd3cf1d1c75834b69108a07",
1912
- "category": "paper",
1913
- "source": "other",
1914
- "status": "candidate",
1915
- "summary": "ABSTRACT The impact of English and Urdu in Pakistan on the intergenerational transmission and use of the regional language, Pashto, in the family domain is not well known. This paper, therefore, examines language use patterns in a middle-cl",
1916
- "primary_use": "Needs maintainer review before promotion to verified catalog.",
1917
- "tasks": [],
1918
- "pashto_evidence": {
1919
- "evidence_text": "Matched by Semantic Scholar query: pashto.",
1920
- "evidence_url": "https://www.semanticscholar.org/paper/2b42be99fa7ad002efd3cf1d1c75834b69108a07",
1921
- "markers": [
1922
- "pashto"
1923
- ]
1924
- },
1925
- "tags": [
1926
- "pashto",
1927
- "candidate",
1928
- "paper"
1929
- ]
1930
- },
1931
  {
1932
  "id": "candidate-gh-project-nanonulla-lorem",
1933
  "title": "NanoNulla/lorem",
@@ -2101,6 +2166,52 @@
2101
  "kaggle"
2102
  ]
2103
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2104
  {
2105
  "id": "candidate-kaggle-dataset-mahibullahmudaser-pashto-text-characters-sample",
2106
  "title": "Pashto text characters sample",
@@ -2174,42 +2285,41 @@
2174
  ]
2175
  },
2176
  {
2177
- "id": "candidate-kaggle-dataset-drijaz-pashtoocr",
2178
- "title": "PashtoOCR",
2179
- "url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
2180
- "category": "dataset",
2181
- "source": "kaggle",
2182
  "status": "candidate",
2183
- "summary": "A Synthetic Dataset for Optical Character Recognition (OCR) in Pashto",
2184
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
2185
  "tasks": [],
2186
  "pashto_evidence": {
2187
- "evidence_text": "Kaggle dataset title/subtitle includes Pashto keyword.",
2188
- "evidence_url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
2189
  "markers": [
2190
- "Pashto"
2191
  ]
2192
  },
2193
  "tags": [
2194
  "pashto",
2195
  "candidate",
2196
- "dataset",
2197
- "kaggle"
2198
  ]
2199
  },
2200
  {
2201
- "id": "candidate-s2-pos-tagging-of-low-resource-pashto-language-annotated-corpus-and-bert-based-mode",
2202
- "title": "POS tagging of low-resource Pashto language: annotated corpus and BERT-based model",
2203
- "url": "https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769",
2204
  "category": "paper",
2205
- "source": "other",
2206
  "status": "candidate",
2207
- "summary": "Candidate paper returned from Semantic Scholar search for Pashto.",
2208
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
2209
  "tasks": [],
2210
  "pashto_evidence": {
2211
- "evidence_text": "Matched by Semantic Scholar query: pashto.",
2212
- "evidence_url": "https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769",
2213
  "markers": [
2214
  "pashto"
2215
  ]
@@ -2221,18 +2331,18 @@
2221
  ]
2222
  },
2223
  {
2224
- "id": "candidate-s2-psocr-benchmarking-large-multimodal-models-for-optical-character-recognition-in-",
2225
- "title": "PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language",
2226
- "url": "https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f",
2227
  "category": "paper",
2228
  "source": "other",
2229
  "status": "candidate",
2230
- "summary": "This paper evaluates the performance of Large Multimodal Models (LMMs) on Optical Character Recognition (OCR) in the low-resource Pashto language. Natural Language Processing (NLP) in Pashto faces several challenges due to the cursive natur",
2231
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
2232
  "tasks": [],
2233
  "pashto_evidence": {
2234
  "evidence_text": "Matched by Semantic Scholar query: pashto.",
2235
- "evidence_url": "https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f",
2236
  "markers": [
2237
  "pashto"
2238
  ]
@@ -2244,18 +2354,18 @@
2244
  ]
2245
  },
2246
  {
2247
- "id": "candidate-hf-dataset-saillab-alpaca-pashto-cleaned",
2248
- "title": "saillab/alpaca-pashto-cleaned",
2249
- "url": "https://huggingface.co/datasets/saillab/alpaca-pashto-cleaned",
2250
- "category": "dataset",
2251
- "source": "huggingface",
2252
  "status": "candidate",
2253
- "summary": "Candidate dataset returned from Hugging Face search for Pashto.",
2254
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
2255
  "tasks": [],
2256
  "pashto_evidence": {
2257
- "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
2258
- "evidence_url": "https://huggingface.co/datasets/saillab/alpaca-pashto-cleaned",
2259
  "markers": [
2260
  "pashto"
2261
  ]
@@ -2263,7 +2373,7 @@
2263
  "tags": [
2264
  "pashto",
2265
  "candidate",
2266
- "dataset"
2267
  ]
2268
  },
2269
  {
@@ -2289,6 +2399,29 @@
2289
  "dataset"
2290
  ]
2291
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2292
  {
2293
  "id": "candidate-gh-project-shahzamanpatan-pashto-baran",
2294
  "title": "ShahZamanPatan/Pashto-Baran",
@@ -2366,6 +2499,29 @@
2366
  "brahui"
2367
  ]
2368
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2369
  {
2370
  "id": "candidate-s2-switching-selves-online-pashto-english-bilingualism-identity-and-expression-in-p",
2371
  "title": "SWITCHING SELVES ONLINE:PASHTO-ENGLISH BILINGUALISM,IDENTITY, AND EXPRESSION IN PAKISTAN’S DIGITAL DISCOURSE",
@@ -2459,30 +2615,6 @@
2459
  "dataset"
2460
  ]
2461
  },
2462
- {
2463
- "id": "candidate-hf-project-tasal9-zamai-mistral-7b-pashto-space",
2464
- "title": "tasal9/ZamAI-Mistral-7B-Pashto-space",
2465
- "url": "https://huggingface.co/spaces/tasal9/ZamAI-Mistral-7B-Pashto-space",
2466
- "category": "project",
2467
- "source": "huggingface",
2468
- "status": "candidate",
2469
- "summary": "Candidate project app returned from Hugging Face Spaces Pashto search.",
2470
- "primary_use": "Needs maintainer review before promotion to verified catalog.",
2471
- "tasks": [],
2472
- "pashto_evidence": {
2473
- "evidence_text": "Matched by Pashto keyword in Hugging Face Spaces search.",
2474
- "evidence_url": "https://huggingface.co/spaces/tasal9/ZamAI-Mistral-7B-Pashto-space",
2475
- "markers": [
2476
- "pashto"
2477
- ]
2478
- },
2479
- "tags": [
2480
- "pashto",
2481
- "candidate",
2482
- "project",
2483
- "space"
2484
- ]
2485
- },
2486
  {
2487
  "id": "candidate-hf-project-tasal9-zamai-mt5-pashto-demo",
2488
  "title": "tasal9/ZamAI-mt5-Pashto-Demo",
@@ -2577,29 +2709,6 @@
2577
  "paper"
2578
  ]
2579
  },
2580
- {
2581
- "id": "candidate-s2-the-role-of-early-literary-biographies-tazkiri-in-the-ancient-history-of-pashto-",
2582
- "title": "The Role of Early Literary Biographies (Tazkiri) in the Ancient History of Pashto Literature",
2583
- "url": "https://www.semanticscholar.org/paper/4938170077d3430c2e3f9fadc161ed7b79242917",
2584
- "category": "paper",
2585
- "source": "other",
2586
- "status": "candidate",
2587
- "summary": "The role of early literary biographies in the ancient history of Pashto literature is a significant and fundamental subject, as these biographies have transmitted to us the thoughts, styles, and contributions of early poets, writers, and Su",
2588
- "primary_use": "Needs maintainer review before promotion to verified catalog.",
2589
- "tasks": [],
2590
- "pashto_evidence": {
2591
- "evidence_text": "Matched by Semantic Scholar query: pashto.",
2592
- "evidence_url": "https://www.semanticscholar.org/paper/4938170077d3430c2e3f9fadc161ed7b79242917",
2593
- "markers": [
2594
- "pashto"
2595
- ]
2596
- },
2597
- "tags": [
2598
- "pashto",
2599
- "candidate",
2600
- "paper"
2601
- ]
2602
- },
2603
  {
2604
  "id": "candidate-s2-the-roshani-movement-literary-services-and-the-contribution-of-this-movement-in-",
2605
  "title": "The Roshani Movement literary services and the contribution of this Movement in the development of Pashto Literature",
@@ -2624,18 +2733,18 @@
2624
  ]
2625
  },
2626
  {
2627
- "id": "candidate-s2-the-themes-of-insomnia-in-pashto-tappa-a-cultural-and-literary-representation",
2628
- "title": "The Themes of Insomnia in Pashto Tappa: A Cultural and Literary Representation",
2629
- "url": "https://www.semanticscholar.org/paper/95bd6f1b23e3331f8e5faedb87b6bb9c1c574c4b",
2630
  "category": "paper",
2631
- "source": "other",
2632
  "status": "candidate",
2633
- "summary": "This study explores the motif of insomnia in Pashto tappa as a culturally embedded literary symptom that reflects emotional, social, and existential tensions in Pashtun life. Drawing on a qualitative content analysis of a purposive corpus o",
2634
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
2635
  "tasks": [],
2636
  "pashto_evidence": {
2637
- "evidence_text": "Matched by Semantic Scholar query: pashto.",
2638
- "evidence_url": "https://www.semanticscholar.org/paper/95bd6f1b23e3331f8e5faedb87b6bb9c1c574c4b",
2639
  "markers": [
2640
  "pashto"
2641
  ]
@@ -2671,18 +2780,18 @@
2671
  ]
2672
  },
2673
  {
2674
- "id": "candidate-gh-project-wikis-on-git-ps-wikipedia-org",
2675
- "title": "wikis-on-git/ps.wikipedia.org",
2676
- "url": "https://github.com/wikis-on-git/ps.wikipedia.org",
2677
- "category": "project",
2678
- "source": "github",
2679
  "status": "candidate",
2680
- "summary": "Wikipedia in Pashto (پښتو)",
2681
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
2682
  "tasks": [],
2683
  "pashto_evidence": {
2684
- "evidence_text": "Repository metadata (name/description/topics) includes Pashto markers.",
2685
- "evidence_url": "https://github.com/wikis-on-git/ps.wikipedia.org",
2686
  "markers": [
2687
  "pashto"
2688
  ]
@@ -2690,26 +2799,22 @@
2690
  "tags": [
2691
  "pashto",
2692
  "candidate",
2693
- "project",
2694
- "github",
2695
- "mediawiki",
2696
- "pashto",
2697
- "wikipedia"
2698
  ]
2699
  },
2700
  {
2701
- "id": "candidate-hf-model-zirak-ai-pashto-bert-v1",
2702
- "title": "zirak-ai/pashto-bert-v1",
2703
- "url": "https://huggingface.co/zirak-ai/pashto-bert-v1",
2704
- "category": "model",
2705
- "source": "huggingface",
2706
  "status": "candidate",
2707
- "summary": "Candidate model returned from Hugging Face search for Pashto.",
2708
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
2709
  "tasks": [],
2710
  "pashto_evidence": {
2711
- "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
2712
- "evidence_url": "https://huggingface.co/zirak-ai/pashto-bert-v1",
2713
  "markers": [
2714
  "pashto"
2715
  ]
@@ -2717,11 +2822,12 @@
2717
  "tags": [
2718
  "pashto",
2719
  "candidate",
2720
- "model"
 
 
 
 
2721
  ]
2722
  }
2723
- ],
2724
- "errors": [
2725
- "arxiv: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)>"
2726
  ]
2727
  }
 
1
  {
2
+ "generated_on": "2026-02-16T08:53:03.539791+00:00",
3
  "sources": [
4
  "kaggle-datasets",
5
  "huggingface-datasets",
6
  "huggingface-models",
7
  "huggingface-spaces",
8
  "github-repositories",
9
+ "arxiv",
10
  "semantic-scholar"
11
  ],
12
+ "candidate_count": 119,
13
  "candidates": [
14
  {
15
+ "id": "candidate-arxiv-a-breadth-first-catalog-of-text-processing-speech-processing-and-multimodal-rese",
16
+ "title": "A Breadth-First Catalog of Text Processing, Speech Processing and Multimodal Research in South Asian Languages",
17
+ "url": "http://arxiv.org/abs/2501.00029v1",
18
+ "category": "paper",
19
+ "source": "arxiv",
20
+ "status": "candidate",
21
+ "summary": "We review the recent literature (January 2022- October 2024) in South Asian languages on text-based language processing, multimodal models, and speech processing, and provide a spotlight analysis focused on 21 low-resource South Asian langu",
22
+ "primary_use": "Needs maintainer review before promotion to verified catalog.",
23
+ "tasks": [],
24
+ "pashto_evidence": {
25
+ "evidence_text": "Matched by arXiv query: all:pashto.",
26
+ "evidence_url": "http://arxiv.org/abs/2501.00029v1",
27
+ "markers": [
28
+ "pashto"
29
+ ]
30
+ },
31
+ "tags": [
32
+ "pashto",
33
+ "candidate",
34
+ "paper"
35
+ ]
36
+ },
37
+ {
38
+ "id": "candidate-s2-a-lexical-analysis-of-pashto-language",
39
+ "title": "A Lexical Analysis of Pashto Language",
40
+ "url": "https://www.semanticscholar.org/paper/6a1422eaca906a6657aa667b30dcb5575d25f8f8",
41
  "category": "paper",
42
  "source": "other",
43
  "status": "candidate",
44
+ "summary": "Language changes over time. Apart from many other reasons, some words become dormant and remain no more in use. In this research, an attempt has been made to show language change in Pashto language. For this purpose, images of different cul",
45
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
46
  "tasks": [],
47
  "pashto_evidence": {
48
  "evidence_text": "Matched by Semantic Scholar query: pashto.",
49
+ "evidence_url": "https://www.semanticscholar.org/paper/6a1422eaca906a6657aa667b30dcb5575d25f8f8",
50
  "markers": [
51
  "pashto"
52
  ]
 
220
  "space"
221
  ]
222
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  {
224
  "id": "candidate-hf-dataset-adnankhan769-proper-dataset-english-2-pashto",
225
  "title": "adnankhan769/proper_dataset_english_2_pashto",
 
527
  ]
528
  },
529
  {
530
+ "id": "candidate-arxiv-bitext-mining-for-low-resource-languages-via-contrastive-learning",
531
+ "title": "Bitext Mining for Low-Resource Languages via Contrastive Learning",
532
+ "url": "http://arxiv.org/abs/2208.11194v1",
533
  "category": "paper",
534
+ "source": "arxiv",
535
  "status": "candidate",
536
+ "summary": "Mining high-quality bitexts for low-resource languages is challenging. This paper shows that sentence representation of language models fine-tuned with multiple negatives ranking loss, a contrastive objective, helps retrieve clean bitexts.",
537
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
538
  "tasks": [],
539
  "pashto_evidence": {
540
+ "evidence_text": "Matched by arXiv query: all:pashto.",
541
+ "evidence_url": "http://arxiv.org/abs/2208.11194v1",
542
+ "markers": [
543
+ "pashto"
544
+ ]
545
+ },
546
+ "tags": [
547
+ "pashto",
548
+ "candidate",
549
+ "paper"
550
+ ]
551
+ },
552
+ {
553
+ "id": "candidate-arxiv-cer-hv-a-cer-based-human-in-the-loop-framework-for-cleaning-datasets-applied-to-",
554
+ "title": "CER-HV: A CER-Based Human-in-the-Loop Framework for Cleaning Datasets Applied to Arabic-Script HTR",
555
+ "url": "http://arxiv.org/abs/2601.16713v2",
556
+ "category": "paper",
557
+ "source": "arxiv",
558
+ "status": "candidate",
559
+ "summary": "Handwritten text recognition (HTR) for Arabic-script languages still lags behind Latin-script HTR, despite recent advances in model architectures, datasets, and benchmarks. We show that data quality is a significant limiting factor in many",
560
+ "primary_use": "Needs maintainer review before promotion to verified catalog.",
561
+ "tasks": [],
562
+ "pashto_evidence": {
563
+ "evidence_text": "Matched by arXiv query: all:pashto.",
564
+ "evidence_url": "http://arxiv.org/abs/2601.16713v2",
565
  "markers": [
566
  "pashto"
567
  ]
 
620
  ]
621
  },
622
  {
623
+ "id": "candidate-s2-deep-learning-based-detection-of-one-and-two-column-textual-blocks-in-camera-cap",
624
+ "title": "Deep Learning-Based Detection of One and Two-Column Textual Blocks in Camera-Captured Pashto Documents Images",
625
+ "url": "https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182",
626
  "category": "paper",
627
  "source": "other",
628
  "status": "candidate",
629
+ "summary": "The paper explores the layout analysis and classification task of Pashto document images, a field with limited research due to the language’s low-resource status. It uses Document Image Analysis (DIA) to detect one-column and two-column tex",
630
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
631
  "tasks": [],
632
  "pashto_evidence": {
633
  "evidence_text": "Matched by Semantic Scholar query: pashto.",
634
+ "evidence_url": "https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182",
635
  "markers": [
636
  "pashto"
637
  ]
 
643
  ]
644
  },
645
  {
646
+ "id": "candidate-s2-deictic-field-time-of-action-in-the-semantics-of-the-pashto-language-the-time-fi",
647
+ "title": "DEICTIC FIELD “TIME OF ACTION” IN THE SEMANTICS OF THE PASHTO LANGUAGE, THE “TIME” FIELD: BACKGROUND OF THE PROBLEM",
648
+ "url": "https://www.semanticscholar.org/paper/3358d828c2ff07a45d614fd1d81cf44d5c55cad8",
649
  "category": "paper",
650
  "source": "other",
651
  "status": "candidate",
652
+ "summary": "The article examines the semantic modeling of the category of time in language through the lens of deictic field theory, with a focus on Pashto adverbs. It outlines four major approaches to modeling semantic fields - phenomenological, lexic",
653
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
654
  "tasks": [],
655
  "pashto_evidence": {
656
  "evidence_text": "Matched by Semantic Scholar query: pashto.",
657
+ "evidence_url": "https://www.semanticscholar.org/paper/3358d828c2ff07a45d614fd1d81cf44d5c55cad8",
658
+ "markers": [
659
+ "pashto"
660
+ ]
661
+ },
662
+ "tags": [
663
+ "pashto",
664
+ "candidate",
665
+ "paper"
666
+ ]
667
+ },
668
+ {
669
+ "id": "candidate-arxiv-development-of-a-new-image-to-text-conversion-system-for-pashto-farsi-and-tradit",
670
+ "title": "Development of a New Image-to-text Conversion System for Pashto, Farsi and Traditional Chinese",
671
+ "url": "http://arxiv.org/abs/2005.08650v1",
672
+ "category": "paper",
673
+ "source": "arxiv",
674
+ "status": "candidate",
675
+ "summary": "We report upon the results of a research and prototype building project \\emph{Worldly~OCR} dedicated to developing new, more accurate image-to-text conversion software for several languages and writing systems. These include the cursive scr",
676
+ "primary_use": "Needs maintainer review before promotion to verified catalog.",
677
+ "tasks": [],
678
+ "pashto_evidence": {
679
+ "evidence_text": "Matched by arXiv query: all:pashto.",
680
+ "evidence_url": "http://arxiv.org/abs/2005.08650v1",
681
  "markers": [
682
  "pashto"
683
  ]
 
759
  "kaggle"
760
  ]
761
  },
762
+ {
763
+ "id": "candidate-arxiv-enhancing-ner-performance-in-low-resource-pakistani-languages-using-cross-lingua",
764
+ "title": "Enhancing NER Performance in Low-Resource Pakistani Languages using Cross-Lingual Data Augmentation",
765
+ "url": "http://arxiv.org/abs/2504.08792v1",
766
+ "category": "paper",
767
+ "source": "arxiv",
768
+ "status": "candidate",
769
+ "summary": "Named Entity Recognition (NER), a fundamental task in Natural Language Processing (NLP), has shown significant advancements for high-resource languages. However, due to a lack of annotated datasets and limited representation in Pre-trained",
770
+ "primary_use": "Needs maintainer review before promotion to verified catalog.",
771
+ "tasks": [],
772
+ "pashto_evidence": {
773
+ "evidence_text": "Matched by arXiv query: all:pashto.",
774
+ "evidence_url": "http://arxiv.org/abs/2504.08792v1",
775
+ "markers": [
776
+ "pashto"
777
+ ]
778
+ },
779
+ "tags": [
780
+ "pashto",
781
+ "candidate",
782
+ "paper"
783
+ ]
784
+ },
785
+ {
786
+ "id": "candidate-arxiv-enhancing-pashto-text-classification-using-language-processing-techniques-for-si",
787
+ "title": "Enhancing Pashto Text Classification using Language Processing Techniques for Single And Multi-Label Analysis",
788
+ "url": "http://arxiv.org/abs/2305.03201v1",
789
+ "category": "paper",
790
+ "source": "arxiv",
791
+ "status": "candidate",
792
+ "summary": "Text classification has become a crucial task in various fields, leading to a significant amount of research on developing automated text classification systems for national and international languages. However, there is a growing need for",
793
+ "primary_use": "Needs maintainer review before promotion to verified catalog.",
794
+ "tasks": [],
795
+ "pashto_evidence": {
796
+ "evidence_text": "Matched by arXiv query: all:pashto.",
797
+ "evidence_url": "http://arxiv.org/abs/2305.03201v1",
798
+ "markers": [
799
+ "pashto"
800
+ ]
801
+ },
802
+ "tags": [
803
+ "pashto",
804
+ "candidate",
805
+ "paper"
806
+ ]
807
+ },
808
  {
809
  "id": "candidate-s2-evaluating-the-message-of-pashto-landay-according-to-the-audience",
810
  "title": "Evaluating the Message of Pashto Landay According to the Audience",
 
828
  "paper"
829
  ]
830
  },
831
+ {
832
+ "id": "candidate-s2-exploring-the-impacts-of-emotion-through-language-learning-on-pashto-speakers-yo",
833
+ "title": "Exploring the Impacts of Emotion through Language Learning on Pashto Speakers Young Adulthood in District Peshawar",
834
+ "url": "https://www.semanticscholar.org/paper/4549649112553aabccfac8b918c7e98cdbdd0f09",
835
+ "category": "paper",
836
+ "source": "other",
837
+ "status": "candidate",
838
+ "summary": "The current study explores the emotional experiences of Pashto speakers learning a second language, with a focus on how emotions are expressed, understood, and influenced by cultural and linguistic factors. While language learning is often",
839
+ "primary_use": "Needs maintainer review before promotion to verified catalog.",
840
+ "tasks": [],
841
+ "pashto_evidence": {
842
+ "evidence_text": "Matched by Semantic Scholar query: pashto.",
843
+ "evidence_url": "https://www.semanticscholar.org/paper/4549649112553aabccfac8b918c7e98cdbdd0f09",
844
+ "markers": [
845
+ "pashto"
846
+ ]
847
+ },
848
+ "tags": [
849
+ "pashto",
850
+ "candidate",
851
+ "paper"
852
+ ]
853
+ },
854
  {
855
  "id": "candidate-gh-project-fazlullahmamond-hadith-collection-pashto",
856
  "title": "Fazlullahmamond/hadith-collection-pashto",
 
929
  ]
930
  },
931
  {
932
+ "id": "candidate-arxiv-framing-political-bias-in-multilingual-llms-across-pakistani-languages",
933
+ "title": "Framing Political Bias in Multilingual LLMs Across Pakistani Languages",
934
+ "url": "http://arxiv.org/abs/2506.00068v3",
935
+ "category": "paper",
936
+ "source": "arxiv",
937
+ "status": "candidate",
938
+ "summary": "Large Language Models (LLMs) increasingly shape public discourse, yet most evaluations of political and economic bias have focused on high-resource, Western languages and contexts. This leaves critical blind spots in low-resource, multiling",
939
+ "primary_use": "Needs maintainer review before promotion to verified catalog.",
940
+ "tasks": [],
941
+ "pashto_evidence": {
942
+ "evidence_text": "Matched by arXiv query: all:pashto.",
943
+ "evidence_url": "http://arxiv.org/abs/2506.00068v3",
944
+ "markers": [
945
+ "pashto"
946
+ ]
947
+ },
948
+ "tags": [
949
+ "pashto",
950
+ "candidate",
951
+ "paper"
952
+ ]
953
+ },
954
+ {
955
+ "id": "candidate-s2-gender-classification-from-pashto-handwritten-text-images",
956
+ "title": "Gender Classification From Pashto Handwritten Text Images",
957
+ "url": "https://www.semanticscholar.org/paper/2d70fffa9224d71f67ad3c1943b8a71b18164eeb",
958
  "category": "paper",
959
  "source": "other",
960
  "status": "candidate",
961
+ "summary": "Computer vision (CV) is a subfield of computer science that enables machines to perceive, interpret, and understand visual data. It combines image processing, analysis, and machine learning to extract meaningful insights from images and vid",
962
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
963
  "tasks": [],
964
  "pashto_evidence": {
965
  "evidence_text": "Matched by Semantic Scholar query: pashto.",
966
+ "evidence_url": "https://www.semanticscholar.org/paper/2d70fffa9224d71f67ad3c1943b8a71b18164eeb",
967
  "markers": [
968
  "pashto"
969
  ]
 
1048
  "paper"
1049
  ]
1050
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1051
  {
1052
  "id": "candidate-hf-model-ihanif-pashto-asr-base",
1053
  "title": "ihanif/pashto-asr-base",
 
1303
  "model"
1304
  ]
1305
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1306
  {
1307
  "id": "candidate-hf-model-ihanif-whisper-large-pashto",
1308
  "title": "ihanif/whisper-large-pashto",
 
1539
  "space"
1540
  ]
1541
  },
1542
+ {
1543
+ "id": "candidate-arxiv-improving-machine-translation-with-phrase-pair-injection-and-corpus-filtering",
1544
+ "title": "Improving Machine Translation with Phrase Pair Injection and Corpus Filtering",
1545
+ "url": "http://arxiv.org/abs/2301.08008v1",
1546
+ "category": "paper",
1547
+ "source": "arxiv",
1548
+ "status": "candidate",
1549
+ "summary": "In this paper, we show that the combination of Phrase Pair Injection and Corpus Filtering boosts the performance of Neural Machine Translation (NMT) systems. We extract parallel phrases and sentences from the pseudo-parallel corpus and augm",
1550
+ "primary_use": "Needs maintainer review before promotion to verified catalog.",
1551
+ "tasks": [],
1552
+ "pashto_evidence": {
1553
+ "evidence_text": "Matched by arXiv query: all:pashto.",
1554
+ "evidence_url": "http://arxiv.org/abs/2301.08008v1",
1555
+ "markers": [
1556
+ "pashto"
1557
+ ]
1558
+ },
1559
+ "tags": [
1560
+ "pashto",
1561
+ "candidate",
1562
+ "paper"
1563
+ ]
1564
+ },
1565
  {
1566
  "id": "candidate-hf-model-jawaria-wav2vec2-large-xls-r-300m-pashto-colab-final-1",
1567
  "title": "Jawaria/wav2vec2-large-xls-r-300m-pashto-colab-final-1",
 
1770
  "kaggle"
1771
  ]
1772
  },
1773
+ {
1774
+ "id": "candidate-arxiv-knn-and-ann-based-recognition-of-handwritten-pashto-letters-using-zoning-feature",
1775
+ "title": "KNN and ANN-based Recognition of Handwritten Pashto Letters using Zoning Features",
1776
+ "url": "http://arxiv.org/abs/1904.03391v2",
1777
+ "category": "paper",
1778
+ "source": "arxiv",
1779
+ "status": "candidate",
1780
+ "summary": "This paper presents a recognition system for handwritten Pashto letters. However, handwritten character recognition is a challenging task. These letters not only differ in shape and style but also vary among individuals. The recognition bec",
1781
+ "primary_use": "Needs maintainer review before promotion to verified catalog.",
1782
+ "tasks": [],
1783
+ "pashto_evidence": {
1784
+ "evidence_text": "Matched by arXiv query: all:pashto.",
1785
+ "evidence_url": "http://arxiv.org/abs/1904.03391v2",
1786
+ "markers": [
1787
+ "pashto"
1788
+ ]
1789
+ },
1790
+ "tags": [
1791
+ "pashto",
1792
+ "candidate",
1793
+ "paper"
1794
+ ]
1795
+ },
1796
  {
1797
  "id": "candidate-hf-dataset-koochikoo25-pashto-concatenated",
1798
  "title": "koochikoo25/Pashto-Concatenated",
 
1862
  "model"
1863
  ]
1864
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1865
  {
1866
  "id": "candidate-gh-project-lecramyajiv-fonts-arabic-extra",
1867
  "title": "lecramyajiv/fonts-arabic-extra",
 
1993
  "quran"
1994
  ]
1995
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1996
  {
1997
  "id": "candidate-gh-project-nanonulla-lorem",
1998
  "title": "NanoNulla/lorem",
 
2166
  "kaggle"
2167
  ]
2168
  },
2169
+ {
2170
+ "id": "candidate-s2-pashto-preverbs-iii-compound-verbs-with-preverb",
2171
+ "title": "Pashto preverbs, III. Compound verbs with preverb",
2172
+ "url": "https://www.semanticscholar.org/paper/53eeae3a973d6bb72839e9304be13a0362c92242",
2173
+ "category": "paper",
2174
+ "source": "other",
2175
+ "status": "candidate",
2176
+ "summary": "Abstract This article, the third in a series, focuses on the “living” preverbs used in the verbal system of contemporary Pashto. The verbs treated here belong to the “compound verbs with preverb” class or to the “mixed verbs with preverb” c",
2177
+ "primary_use": "Needs maintainer review before promotion to verified catalog.",
2178
+ "tasks": [],
2179
+ "pashto_evidence": {
2180
+ "evidence_text": "Matched by Semantic Scholar query: pashto.",
2181
+ "evidence_url": "https://www.semanticscholar.org/paper/53eeae3a973d6bb72839e9304be13a0362c92242",
2182
+ "markers": [
2183
+ "pashto"
2184
+ ]
2185
+ },
2186
+ "tags": [
2187
+ "pashto",
2188
+ "candidate",
2189
+ "paper"
2190
+ ]
2191
+ },
2192
+ {
2193
+ "id": "candidate-s2-pashto-shallow-parsing-a-deep-learning-approach",
2194
+ "title": "Pashto Shallow Parsing: A Deep Learning Approach",
2195
+ "url": "https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5",
2196
+ "category": "paper",
2197
+ "source": "other",
2198
+ "status": "candidate",
2199
+ "summary": "This paper presents the first deep learning-based shallow parsing system for the Pashto language, addressing the significant lack of syntactic tools for this low-resource and morphologically rich language. A comprehensive corpus of over 15,",
2200
+ "primary_use": "Needs maintainer review before promotion to verified catalog.",
2201
+ "tasks": [],
2202
+ "pashto_evidence": {
2203
+ "evidence_text": "Matched by Semantic Scholar query: pashto.",
2204
+ "evidence_url": "https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5",
2205
+ "markers": [
2206
+ "pashto"
2207
+ ]
2208
+ },
2209
+ "tags": [
2210
+ "pashto",
2211
+ "candidate",
2212
+ "paper"
2213
+ ]
2214
+ },
2215
  {
2216
  "id": "candidate-kaggle-dataset-mahibullahmudaser-pashto-text-characters-sample",
2217
  "title": "Pashto text characters sample",
 
2285
  ]
2286
  },
2287
  {
2288
+ "id": "candidate-s2-pos-tagging-of-low-resource-pashto-language-annotated-corpus-and-bert-based-mode",
2289
+ "title": "POS tagging of low-resource Pashto language: annotated corpus and BERT-based model",
2290
+ "url": "https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769",
2291
+ "category": "paper",
2292
+ "source": "other",
2293
  "status": "candidate",
2294
+ "summary": "Candidate paper returned from Semantic Scholar search for Pashto.",
2295
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
2296
  "tasks": [],
2297
  "pashto_evidence": {
2298
+ "evidence_text": "Matched by Semantic Scholar query: pashto.",
2299
+ "evidence_url": "https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769",
2300
  "markers": [
2301
+ "pashto"
2302
  ]
2303
  },
2304
  "tags": [
2305
  "pashto",
2306
  "candidate",
2307
+ "paper"
 
2308
  ]
2309
  },
2310
  {
2311
+ "id": "candidate-arxiv-psocr-benchmarking-large-multimodal-models-for-optical-character-recognition-in-",
2312
+ "title": "PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language",
2313
+ "url": "http://arxiv.org/abs/2505.10055v2",
2314
  "category": "paper",
2315
+ "source": "arxiv",
2316
  "status": "candidate",
2317
+ "summary": "This paper evaluates the performance of Large Multimodal Models (LMMs) on Optical Character Recognition (OCR) in the low-resource Pashto language. Natural Language Processing (NLP) in Pashto faces several challenges due to the cursive natur",
2318
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
2319
  "tasks": [],
2320
  "pashto_evidence": {
2321
+ "evidence_text": "Matched by arXiv query: all:pashto.",
2322
+ "evidence_url": "http://arxiv.org/abs/2505.10055v2",
2323
  "markers": [
2324
  "pashto"
2325
  ]
 
2331
  ]
2332
  },
2333
  {
2334
+ "id": "candidate-s2-resolution-of-ellipses-in-wh-constructions-in-pashto-language",
2335
+ "title": "Resolution of Ellipses in WH-constructions in Pashto Language",
2336
+ "url": "https://www.semanticscholar.org/paper/b9d84d79be0e90e026bbd596276697eeca5d9474",
2337
  "category": "paper",
2338
  "source": "other",
2339
  "status": "candidate",
2340
+ "summary": "The Pashto language has a question structure consisting of a WH-word and an answer to the question, this is called WH-structure. The resolution of ellipsis occurs in most cases in both written and spoken language in its WH construction. In",
2341
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
2342
  "tasks": [],
2343
  "pashto_evidence": {
2344
  "evidence_text": "Matched by Semantic Scholar query: pashto.",
2345
+ "evidence_url": "https://www.semanticscholar.org/paper/b9d84d79be0e90e026bbd596276697eeca5d9474",
2346
  "markers": [
2347
  "pashto"
2348
  ]
 
2354
  ]
2355
  },
2356
  {
2357
+ "id": "candidate-s2-resolving-the-dual-y-orthographic-variation-in-pashto-an-interdisciplinary-appro",
2358
+ "title": "Resolving the Dual Yā Orthographic Variation in Pashto: An Interdisciplinary Approach Integrating Linguistic, Technological, and Educational Perspectives in Afghanistan and Pakistan",
2359
+ "url": "https://www.semanticscholar.org/paper/3741ccd390216a00431606d85f6c21a174244ccb",
2360
+ "category": "paper",
2361
+ "source": "other",
2362
  "status": "candidate",
2363
+ "summary": "Pashto, a major language in Afghanistan and Pakistan, faces persistent orthographic inconsistencies regarding the dual graphemes Yā (\"ی\", U+06CC and \"ې\", U+06D0). These graphemes represent distinct phonological and morphological functions b",
2364
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
2365
  "tasks": [],
2366
  "pashto_evidence": {
2367
+ "evidence_text": "Matched by Semantic Scholar query: pashto.",
2368
+ "evidence_url": "https://www.semanticscholar.org/paper/3741ccd390216a00431606d85f6c21a174244ccb",
2369
  "markers": [
2370
  "pashto"
2371
  ]
 
2373
  "tags": [
2374
  "pashto",
2375
  "candidate",
2376
+ "paper"
2377
  ]
2378
  },
2379
  {
 
2399
  "dataset"
2400
  ]
2401
  },
2402
+ {
2403
+ "id": "candidate-arxiv-score-combination-for-improved-parallel-corpus-filtering-for-low-resource-condit",
2404
+ "title": "Score Combination for Improved Parallel Corpus Filtering for Low Resource Conditions",
2405
+ "url": "http://arxiv.org/abs/2011.07933v1",
2406
+ "category": "paper",
2407
+ "source": "arxiv",
2408
+ "status": "candidate",
2409
+ "summary": "This paper describes our submission to the WMT20 sentence filtering task. We combine scores from (1) a custom LASER built for each source language, (2) a classifier built to distinguish positive and negative pairs by semantic alignment, and",
2410
+ "primary_use": "Needs maintainer review before promotion to verified catalog.",
2411
+ "tasks": [],
2412
+ "pashto_evidence": {
2413
+ "evidence_text": "Matched by arXiv query: all:pashto.",
2414
+ "evidence_url": "http://arxiv.org/abs/2011.07933v1",
2415
+ "markers": [
2416
+ "pashto"
2417
+ ]
2418
+ },
2419
+ "tags": [
2420
+ "pashto",
2421
+ "candidate",
2422
+ "paper"
2423
+ ]
2424
+ },
2425
  {
2426
  "id": "candidate-gh-project-shahzamanpatan-pashto-baran",
2427
  "title": "ShahZamanPatan/Pashto-Baran",
 
2499
  "brahui"
2500
  ]
2501
  },
2502
+ {
2503
+ "id": "candidate-arxiv-speech-to-speech-translation-pipelines-for-conversations-in-low-resource-languag",
2504
+ "title": "Speech-to-Speech Translation Pipelines for Conversations in Low-Resource Languages",
2505
+ "url": "http://arxiv.org/abs/2506.01406v1",
2506
+ "category": "paper",
2507
+ "source": "arxiv",
2508
+ "status": "candidate",
2509
+ "summary": "The popularity of automatic speech-to-speech translation for human conversations is growing, but the quality varies significantly depending on the language pair. In a context of community interpreting for low-resource languages, namely Turk",
2510
+ "primary_use": "Needs maintainer review before promotion to verified catalog.",
2511
+ "tasks": [],
2512
+ "pashto_evidence": {
2513
+ "evidence_text": "Matched by arXiv query: all:pashto.",
2514
+ "evidence_url": "http://arxiv.org/abs/2506.01406v1",
2515
+ "markers": [
2516
+ "pashto"
2517
+ ]
2518
+ },
2519
+ "tags": [
2520
+ "pashto",
2521
+ "candidate",
2522
+ "paper"
2523
+ ]
2524
+ },
2525
  {
2526
  "id": "candidate-s2-switching-selves-online-pashto-english-bilingualism-identity-and-expression-in-p",
2527
  "title": "SWITCHING SELVES ONLINE:PASHTO-ENGLISH BILINGUALISM,IDENTITY, AND EXPRESSION IN PAKISTAN’S DIGITAL DISCOURSE",
 
2615
  "dataset"
2616
  ]
2617
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2618
  {
2619
  "id": "candidate-hf-project-tasal9-zamai-mt5-pashto-demo",
2620
  "title": "tasal9/ZamAI-mt5-Pashto-Demo",
 
2709
  "paper"
2710
  ]
2711
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2712
  {
2713
  "id": "candidate-s2-the-roshani-movement-literary-services-and-the-contribution-of-this-movement-in-",
2714
  "title": "The Roshani Movement literary services and the contribution of this Movement in the development of Pashto Literature",
 
2733
  ]
2734
  },
2735
  {
2736
+ "id": "candidate-arxiv-tuning-traditional-language-processing-approaches-for-pashto-text-classification",
2737
+ "title": "Tuning Traditional Language Processing Approaches for Pashto Text Classification",
2738
+ "url": "http://arxiv.org/abs/2305.03737v1",
2739
  "category": "paper",
2740
+ "source": "arxiv",
2741
  "status": "candidate",
2742
+ "summary": "Today text classification becomes critical task for concerned individuals for numerous purposes. Hence, several researches have been conducted to develop automatic text classification for national and international languages. However, the n",
2743
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
2744
  "tasks": [],
2745
  "pashto_evidence": {
2746
+ "evidence_text": "Matched by arXiv query: all:pashto.",
2747
+ "evidence_url": "http://arxiv.org/abs/2305.03737v1",
2748
  "markers": [
2749
  "pashto"
2750
  ]
 
2780
  ]
2781
  },
2782
  {
2783
+ "id": "candidate-arxiv-using-of-heterogeneous-corpora-for-training-of-an-asr-system",
2784
+ "title": "Using of heterogeneous corpora for training of an ASR system",
2785
+ "url": "http://arxiv.org/abs/1706.00321v1",
2786
+ "category": "paper",
2787
+ "source": "arxiv",
2788
  "status": "candidate",
2789
+ "summary": "The paper summarizes the development of the LVCSR system built as a part of the Pashto speech-translation system at the SCALE (Summer Camp for Applied Language Exploration) 2015 workshop on \"Speech-to-text-translation for low-resource langu",
2790
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
2791
  "tasks": [],
2792
  "pashto_evidence": {
2793
+ "evidence_text": "Matched by arXiv query: all:pashto.",
2794
+ "evidence_url": "http://arxiv.org/abs/1706.00321v1",
2795
  "markers": [
2796
  "pashto"
2797
  ]
 
2799
  "tags": [
2800
  "pashto",
2801
  "candidate",
2802
+ "paper"
 
 
 
 
2803
  ]
2804
  },
2805
  {
2806
+ "id": "candidate-gh-project-wikis-on-git-ps-wikipedia-org",
2807
+ "title": "wikis-on-git/ps.wikipedia.org",
2808
+ "url": "https://github.com/wikis-on-git/ps.wikipedia.org",
2809
+ "category": "project",
2810
+ "source": "github",
2811
  "status": "candidate",
2812
+ "summary": "Wikipedia in Pashto (پښتو)",
2813
  "primary_use": "Needs maintainer review before promotion to verified catalog.",
2814
  "tasks": [],
2815
  "pashto_evidence": {
2816
+ "evidence_text": "Repository metadata (name/description/topics) includes Pashto markers.",
2817
+ "evidence_url": "https://github.com/wikis-on-git/ps.wikipedia.org",
2818
  "markers": [
2819
  "pashto"
2820
  ]
 
2822
  "tags": [
2823
  "pashto",
2824
  "candidate",
2825
+ "project",
2826
+ "github",
2827
+ "mediawiki",
2828
+ "pashto",
2829
+ "wikipedia"
2830
  ]
2831
  }
 
 
 
2832
  ]
2833
  }
resources/catalog/resources.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "version": "1.0.0",
3
- "updated_on": "2026-02-15",
4
  "resources": [
5
  {
6
  "id": "dataset-common-voice-ps-v24",
@@ -977,6 +977,1075 @@
977
  "github",
978
  "nlp"
979
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
980
  }
981
  ]
982
  }
 
1
  {
2
  "version": "1.0.0",
3
+ "updated_on": "2026-02-16",
4
  "resources": [
5
  {
6
  "id": "dataset-common-voice-ps-v24",
 
977
  "github",
978
  "nlp"
979
  ]
980
+ },
981
+ {
982
+ "id": "dataset-kaggle-drijaz-pashtoocr",
983
+ "title": "PashtoOCR (Kaggle)",
984
+ "url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
985
+ "category": "dataset",
986
+ "source": "kaggle",
987
+ "status": "verified",
988
+ "summary": "Synthetic OCR dataset focused on Pashto ligatures and text recognition tasks.",
989
+ "primary_use": "Pashto OCR dataset benchmarking and training",
990
+ "license": "MIT",
991
+ "tasks": [
992
+ "ocr",
993
+ "nlp"
994
+ ],
995
+ "pashto_evidence": {
996
+ "evidence_text": "Kaggle dataset title and subtitle explicitly identify a Pashto OCR dataset.",
997
+ "evidence_url": "https://www.kaggle.com/api/v1/datasets/view/drijaz/pashtoocr",
998
+ "markers": [
999
+ "Pashto",
1000
+ "OCR"
1001
+ ]
1002
+ },
1003
+ "tags": [
1004
+ "pashto",
1005
+ "kaggle",
1006
+ "ocr",
1007
+ "dataset"
1008
+ ]
1009
+ },
1010
+ {
1011
+ "id": "model-hf-zirak-ai-pashto-bert-v1",
1012
+ "title": "zirak-ai/pashto-bert-v1",
1013
+ "url": "https://huggingface.co/zirak-ai/pashto-bert-v1",
1014
+ "category": "model",
1015
+ "source": "huggingface",
1016
+ "status": "verified",
1017
+ "summary": "Pashto BERT model checkpoint for low-resource Pashto NLP experiments.",
1018
+ "primary_use": "Pashto encoder baseline for NLP tasks",
1019
+ "tasks": [
1020
+ "nlp"
1021
+ ],
1022
+ "pashto_evidence": {
1023
+ "evidence_text": "Hugging Face model ID and search tags explicitly include pashto marker.",
1024
+ "evidence_url": "https://huggingface.co/zirak-ai/pashto-bert-v1",
1025
+ "markers": [
1026
+ "pashto"
1027
+ ]
1028
+ },
1029
+ "tags": [
1030
+ "pashto",
1031
+ "huggingface",
1032
+ "bert",
1033
+ "nlp"
1034
+ ]
1035
+ },
1036
+ {
1037
+ "id": "project-hf-space-ihanif-pashto-asr",
1038
+ "title": "Pashto ASR Space",
1039
+ "url": "https://huggingface.co/spaces/ihanif/pashto-asr",
1040
+ "category": "project",
1041
+ "source": "huggingface",
1042
+ "status": "verified",
1043
+ "summary": "Interactive Hugging Face Space for Pashto ASR inference demos.",
1044
+ "primary_use": "Live Pashto speech-to-text demo project",
1045
+ "tasks": [
1046
+ "asr",
1047
+ "demo"
1048
+ ],
1049
+ "pashto_evidence": {
1050
+ "evidence_text": "Space ID includes pashto-asr and is returned by Hugging Face Pashto space search.",
1051
+ "evidence_url": "https://huggingface.co/api/spaces/ihanif/pashto-asr",
1052
+ "markers": [
1053
+ "pashto",
1054
+ "asr"
1055
+ ]
1056
+ },
1057
+ "tags": [
1058
+ "pashto",
1059
+ "project",
1060
+ "huggingface-space",
1061
+ "asr"
1062
+ ]
1063
+ },
1064
+ {
1065
+ "id": "code-github-mrychlik-worldly-ocr",
1066
+ "title": "worldly-ocr",
1067
+ "url": "https://github.com/mrychlik/worldly-ocr",
1068
+ "category": "code",
1069
+ "source": "github",
1070
+ "status": "verified",
1071
+ "summary": "Open OCR code project that explicitly includes Pashto among target languages.",
1072
+ "primary_use": "Pashto OCR code reference and experimentation",
1073
+ "tasks": [
1074
+ "ocr",
1075
+ "tooling"
1076
+ ],
1077
+ "pashto_evidence": {
1078
+ "evidence_text": "Repository description explicitly says OCR for Pashto and Chinese.",
1079
+ "evidence_url": "https://api.github.com/repos/mrychlik/worldly-ocr",
1080
+ "markers": [
1081
+ "Pashto",
1082
+ "OCR"
1083
+ ]
1084
+ },
1085
+ "tags": [
1086
+ "pashto",
1087
+ "code",
1088
+ "github",
1089
+ "ocr"
1090
+ ]
1091
+ },
1092
+ {
1093
+ "id": "paper-s2-psocr-lmm-pashto",
1094
+ "title": "PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language",
1095
+ "url": "https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f",
1096
+ "category": "paper",
1097
+ "source": "other",
1098
+ "status": "verified",
1099
+ "summary": "Research paper benchmarking multimodal OCR models on low-resource Pashto OCR tasks.",
1100
+ "primary_use": "Pashto OCR research baseline and evaluation reference",
1101
+ "tasks": [
1102
+ "ocr",
1103
+ "research"
1104
+ ],
1105
+ "pashto_evidence": {
1106
+ "evidence_text": "Paper title explicitly references low-resource Pashto language OCR benchmarking.",
1107
+ "evidence_url": "https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f",
1108
+ "markers": [
1109
+ "Pashto",
1110
+ "OCR"
1111
+ ]
1112
+ },
1113
+ "tags": [
1114
+ "pashto",
1115
+ "paper",
1116
+ "ocr",
1117
+ "multimodal"
1118
+ ]
1119
+ },
1120
+ {
1121
+ "id": "dataset-hf-adnankhan769-english-to-pashto",
1122
+ "title": "English to Pashto Sentences Dataset",
1123
+ "url": "https://huggingface.co/datasets/adnankhan769/english_to_pashto_sentences_dataset",
1124
+ "category": "dataset",
1125
+ "source": "huggingface",
1126
+ "status": "verified",
1127
+ "summary": "Parallel English-Pashto sentence dataset for bilingual NLP and translation experiments.",
1128
+ "primary_use": "MT and bilingual sentence alignment baseline",
1129
+ "license": "cc-by-sa-4.0",
1130
+ "tasks": [
1131
+ "mt",
1132
+ "nlp"
1133
+ ],
1134
+ "pashto_evidence": {
1135
+ "evidence_text": "Dataset ID explicitly states English-to-Pashto and includes Pashto-script sentence column.",
1136
+ "evidence_url": "https://huggingface.co/api/datasets/adnankhan769/english_to_pashto_sentences_dataset",
1137
+ "markers": [
1138
+ "Pashto"
1139
+ ]
1140
+ },
1141
+ "tags": [
1142
+ "pashto",
1143
+ "dataset",
1144
+ "huggingface",
1145
+ "translation"
1146
+ ]
1147
+ },
1148
+ {
1149
+ "id": "dataset-hf-saillab-alpaca-pashto-cleaned",
1150
+ "title": "alpaca-pashto-cleaned",
1151
+ "url": "https://huggingface.co/datasets/saillab/alpaca-pashto-cleaned",
1152
+ "category": "dataset",
1153
+ "source": "huggingface",
1154
+ "status": "verified",
1155
+ "summary": "Instruction-style Pashto text dataset suitable for LLM tuning and instruction-following research.",
1156
+ "primary_use": "Pashto instruction tuning and conversational NLP experiments",
1157
+ "tasks": [
1158
+ "nlp",
1159
+ "llm"
1160
+ ],
1161
+ "pashto_evidence": {
1162
+ "evidence_text": "Dataset metadata includes language:ps and dataset name includes Pashto.",
1163
+ "evidence_url": "https://huggingface.co/api/datasets/saillab/alpaca-pashto-cleaned",
1164
+ "markers": [
1165
+ "ps",
1166
+ "Pashto"
1167
+ ]
1168
+ },
1169
+ "tags": [
1170
+ "pashto",
1171
+ "dataset",
1172
+ "huggingface",
1173
+ "instruction"
1174
+ ]
1175
+ },
1176
+ {
1177
+ "id": "model-hf-ihanif-whisper-base-pashto",
1178
+ "title": "Whisper Base Pashto",
1179
+ "url": "https://huggingface.co/ihanif/whisper-base-pashto",
1180
+ "category": "model",
1181
+ "source": "huggingface",
1182
+ "status": "verified",
1183
+ "summary": "Fine-tuned Whisper Base checkpoint for Pashto ASR with FLEURS ps_af evaluation metadata.",
1184
+ "primary_use": "Pashto ASR baseline and speed-accuracy comparison",
1185
+ "license": "apache-2.0",
1186
+ "tasks": [
1187
+ "asr"
1188
+ ],
1189
+ "pashto_evidence": {
1190
+ "evidence_text": "Model ID includes Pashto and card metadata references FLEURS config ps_af.",
1191
+ "evidence_url": "https://huggingface.co/api/models/ihanif/whisper-base-pashto",
1192
+ "markers": [
1193
+ "Pashto",
1194
+ "ps_af"
1195
+ ]
1196
+ },
1197
+ "tags": [
1198
+ "pashto",
1199
+ "model",
1200
+ "huggingface",
1201
+ "asr"
1202
+ ]
1203
+ },
1204
+ {
1205
+ "id": "project-hf-space-zamai-mistral-7b-pashto",
1206
+ "title": "ZamAI-Mistral-7B-Pashto Space",
1207
+ "url": "https://huggingface.co/spaces/tasal9/ZamAI-Mistral-7B-Pashto-space",
1208
+ "category": "project",
1209
+ "source": "huggingface",
1210
+ "status": "verified",
1211
+ "summary": "Gradio project space demonstrating a Pashto-adapted Mistral 7B interface.",
1212
+ "primary_use": "Interactive Pashto LLM project demo",
1213
+ "license": "apache-2.0",
1214
+ "tasks": [
1215
+ "llm",
1216
+ "demo"
1217
+ ],
1218
+ "pashto_evidence": {
1219
+ "evidence_text": "Space title and ID explicitly include Pashto and model card metadata exposes project details.",
1220
+ "evidence_url": "https://huggingface.co/api/spaces/tasal9/ZamAI-Mistral-7B-Pashto-space",
1221
+ "markers": [
1222
+ "Pashto"
1223
+ ]
1224
+ },
1225
+ "tags": [
1226
+ "pashto",
1227
+ "project",
1228
+ "huggingface-space",
1229
+ "llm"
1230
+ ]
1231
+ },
1232
+ {
1233
+ "id": "dataset-hf-adnankhan769-proper-dataset-english-2-pashto",
1234
+ "title": "adnankhan769/proper_dataset_english_2_pashto",
1235
+ "url": "https://huggingface.co/datasets/adnankhan769/proper_dataset_english_2_pashto",
1236
+ "category": "dataset",
1237
+ "source": "huggingface",
1238
+ "status": "verified",
1239
+ "summary": "Pashto bilingual/translation dataset discovered from huggingface for MT experimentation.",
1240
+ "primary_use": "Machine translation and bilingual corpus development",
1241
+ "tasks": [
1242
+ "mt"
1243
+ ],
1244
+ "pashto_evidence": {
1245
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1246
+ "evidence_url": "https://huggingface.co/datasets/adnankhan769/proper_dataset_english_2_pashto",
1247
+ "markers": [
1248
+ "pashto"
1249
+ ]
1250
+ },
1251
+ "tags": [
1252
+ "pashto",
1253
+ "dataset",
1254
+ "huggingface",
1255
+ "mt"
1256
+ ]
1257
+ },
1258
+ {
1259
+ "id": "dataset-hf-ihanif-pashto-asr-wer",
1260
+ "title": "ihanif/pashto_asr_wer",
1261
+ "url": "https://huggingface.co/datasets/ihanif/pashto_asr_wer",
1262
+ "category": "dataset",
1263
+ "source": "huggingface",
1264
+ "status": "verified",
1265
+ "summary": "Pashto speech dataset discovered from huggingface for ASR training and evaluation.",
1266
+ "primary_use": "ASR training and evaluation data source",
1267
+ "tasks": [
1268
+ "asr"
1269
+ ],
1270
+ "pashto_evidence": {
1271
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1272
+ "evidence_url": "https://huggingface.co/datasets/ihanif/pashto_asr_wer",
1273
+ "markers": [
1274
+ "pashto"
1275
+ ]
1276
+ },
1277
+ "tags": [
1278
+ "pashto",
1279
+ "dataset",
1280
+ "huggingface",
1281
+ "asr"
1282
+ ]
1283
+ },
1284
+ {
1285
+ "id": "dataset-hf-ihanif-pashto-speech-ds",
1286
+ "title": "ihanif/pashto_speech_ds",
1287
+ "url": "https://huggingface.co/datasets/ihanif/pashto_speech_ds",
1288
+ "category": "dataset",
1289
+ "source": "huggingface",
1290
+ "status": "verified",
1291
+ "summary": "Pashto speech dataset discovered from huggingface for ASR training and evaluation.",
1292
+ "primary_use": "ASR training and evaluation data source",
1293
+ "tasks": [
1294
+ "asr"
1295
+ ],
1296
+ "pashto_evidence": {
1297
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1298
+ "evidence_url": "https://huggingface.co/datasets/ihanif/pashto_speech_ds",
1299
+ "markers": [
1300
+ "pashto"
1301
+ ]
1302
+ },
1303
+ "tags": [
1304
+ "pashto",
1305
+ "dataset",
1306
+ "huggingface",
1307
+ "asr"
1308
+ ]
1309
+ },
1310
+ {
1311
+ "id": "dataset-hf-ihanif-pashto-speech-parquet-10k",
1312
+ "title": "ihanif/pashto_speech_parquet_10k",
1313
+ "url": "https://huggingface.co/datasets/ihanif/pashto_speech_parquet_10k",
1314
+ "category": "dataset",
1315
+ "source": "huggingface",
1316
+ "status": "verified",
1317
+ "summary": "Pashto speech dataset discovered from huggingface for ASR training and evaluation.",
1318
+ "primary_use": "ASR training and evaluation data source",
1319
+ "tasks": [
1320
+ "asr"
1321
+ ],
1322
+ "pashto_evidence": {
1323
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1324
+ "evidence_url": "https://huggingface.co/datasets/ihanif/pashto_speech_parquet_10k",
1325
+ "markers": [
1326
+ "pashto"
1327
+ ]
1328
+ },
1329
+ "tags": [
1330
+ "pashto",
1331
+ "dataset",
1332
+ "huggingface",
1333
+ "asr"
1334
+ ]
1335
+ },
1336
+ {
1337
+ "id": "dataset-hf-saillab-alpaca-pashto-taco",
1338
+ "title": "saillab/alpaca_pashto_taco",
1339
+ "url": "https://huggingface.co/datasets/saillab/alpaca_pashto_taco",
1340
+ "category": "dataset",
1341
+ "source": "huggingface",
1342
+ "status": "verified",
1343
+ "summary": "Pashto-focused dataset discovered from huggingface candidate sync.",
1344
+ "primary_use": "Instruction tuning and LLM adaptation data source",
1345
+ "tasks": [
1346
+ "llm"
1347
+ ],
1348
+ "pashto_evidence": {
1349
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1350
+ "evidence_url": "https://huggingface.co/datasets/saillab/alpaca_pashto_taco",
1351
+ "markers": [
1352
+ "pashto"
1353
+ ]
1354
+ },
1355
+ "tags": [
1356
+ "pashto",
1357
+ "dataset",
1358
+ "huggingface",
1359
+ "llm"
1360
+ ]
1361
+ },
1362
+ {
1363
+ "id": "dataset-hf-sherwindesouza-pashto-common-voice-20",
1364
+ "title": "SherwinDesouza/pashto-common-voice-20",
1365
+ "url": "https://huggingface.co/datasets/SherwinDesouza/pashto-common-voice-20",
1366
+ "category": "dataset",
1367
+ "source": "huggingface",
1368
+ "status": "verified",
1369
+ "summary": "Pashto-focused dataset discovered from huggingface candidate sync.",
1370
+ "primary_use": "Pashto data source for NLP experimentation",
1371
+ "tasks": [
1372
+ "nlp"
1373
+ ],
1374
+ "pashto_evidence": {
1375
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1376
+ "evidence_url": "https://huggingface.co/datasets/SherwinDesouza/pashto-common-voice-20",
1377
+ "markers": [
1378
+ "pashto"
1379
+ ]
1380
+ },
1381
+ "tags": [
1382
+ "pashto",
1383
+ "dataset",
1384
+ "huggingface",
1385
+ "nlp"
1386
+ ]
1387
+ },
1388
+ {
1389
+ "id": "dataset-hf-tasal9-zamai-pashto-dataset",
1390
+ "title": "tasal9/ZamAI_Pashto_Dataset",
1391
+ "url": "https://huggingface.co/datasets/tasal9/ZamAI_Pashto_Dataset",
1392
+ "category": "dataset",
1393
+ "source": "huggingface",
1394
+ "status": "verified",
1395
+ "summary": "Pashto-focused dataset discovered from huggingface candidate sync.",
1396
+ "primary_use": "Pashto data source for NLP experimentation",
1397
+ "tasks": [
1398
+ "nlp"
1399
+ ],
1400
+ "pashto_evidence": {
1401
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1402
+ "evidence_url": "https://huggingface.co/datasets/tasal9/ZamAI_Pashto_Dataset",
1403
+ "markers": [
1404
+ "pashto"
1405
+ ]
1406
+ },
1407
+ "tags": [
1408
+ "pashto",
1409
+ "dataset",
1410
+ "huggingface",
1411
+ "nlp"
1412
+ ]
1413
+ },
1414
+ {
1415
+ "id": "dataset-kaggle-english-pashto-language-dataset-epld",
1416
+ "title": "English-Pashto Language Dataset (EPLD)",
1417
+ "url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
1418
+ "category": "dataset",
1419
+ "source": "kaggle",
1420
+ "status": "verified",
1421
+ "summary": "Pashto bilingual/translation dataset discovered from kaggle for MT experimentation.",
1422
+ "primary_use": "Machine translation and bilingual corpus development",
1423
+ "tasks": [
1424
+ "mt"
1425
+ ],
1426
+ "pashto_evidence": {
1427
+ "evidence_text": "Kaggle dataset title/subtitle includes Pashto keyword.",
1428
+ "evidence_url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
1429
+ "markers": [
1430
+ "Pashto"
1431
+ ]
1432
+ },
1433
+ "tags": [
1434
+ "pashto",
1435
+ "dataset",
1436
+ "kaggle",
1437
+ "mt"
1438
+ ]
1439
+ },
1440
+ {
1441
+ "id": "dataset-kaggle-katib-s-pashto-text-imagebase-kpti",
1442
+ "title": "Katib's Pashto Text Imagebase (KPTI)",
1443
+ "url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
1444
+ "category": "dataset",
1445
+ "source": "kaggle",
1446
+ "status": "verified",
1447
+ "summary": "Pashto OCR-oriented dataset discovered from kaggle for document and script recognition work.",
1448
+ "primary_use": "OCR training and evaluation data source",
1449
+ "tasks": [
1450
+ "ocr"
1451
+ ],
1452
+ "pashto_evidence": {
1453
+ "evidence_text": "Kaggle dataset title/subtitle includes Pashto keyword.",
1454
+ "evidence_url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
1455
+ "markers": [
1456
+ "Pashto"
1457
+ ]
1458
+ },
1459
+ "tags": [
1460
+ "pashto",
1461
+ "dataset",
1462
+ "kaggle",
1463
+ "ocr"
1464
+ ]
1465
+ },
1466
+ {
1467
+ "id": "dataset-kaggle-pashto-ocr",
1468
+ "title": "Pashto OCR",
1469
+ "url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
1470
+ "category": "dataset",
1471
+ "source": "kaggle",
1472
+ "status": "verified",
1473
+ "summary": "Pashto OCR-oriented dataset discovered from kaggle for document and script recognition work.",
1474
+ "primary_use": "OCR training and evaluation data source",
1475
+ "tasks": [
1476
+ "ocr"
1477
+ ],
1478
+ "pashto_evidence": {
1479
+ "evidence_text": "Kaggle dataset title/subtitle includes Pashto keyword.",
1480
+ "evidence_url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
1481
+ "markers": [
1482
+ "Pashto"
1483
+ ]
1484
+ },
1485
+ "tags": [
1486
+ "pashto",
1487
+ "dataset",
1488
+ "kaggle",
1489
+ "ocr"
1490
+ ]
1491
+ },
1492
+ {
1493
+ "id": "dataset-kaggle-common-voice-24-0-pashto-speech-dataset",
1494
+ "title": "Common Voice 24.0: Pashto Speech Dataset",
1495
+ "url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
1496
+ "category": "dataset",
1497
+ "source": "kaggle",
1498
+ "status": "verified",
1499
+ "summary": "Pashto speech dataset discovered from kaggle for ASR training and evaluation.",
1500
+ "primary_use": "ASR training and evaluation data source",
1501
+ "tasks": [
1502
+ "asr"
1503
+ ],
1504
+ "pashto_evidence": {
1505
+ "evidence_text": "Kaggle dataset title/subtitle includes Pashto keyword.",
1506
+ "evidence_url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
1507
+ "markers": [
1508
+ "Pashto"
1509
+ ]
1510
+ },
1511
+ "tags": [
1512
+ "pashto",
1513
+ "dataset",
1514
+ "kaggle",
1515
+ "asr"
1516
+ ]
1517
+ },
1518
+ {
1519
+ "id": "model-hf-ihanif-pashto-asr-base",
1520
+ "title": "ihanif/pashto-asr-base",
1521
+ "url": "https://huggingface.co/ihanif/pashto-asr-base",
1522
+ "category": "model",
1523
+ "source": "huggingface",
1524
+ "status": "verified",
1525
+ "summary": "Pashto ASR model checkpoint discovered from huggingface candidate sync.",
1526
+ "primary_use": "Pashto ASR baseline and model comparison",
1527
+ "tasks": [
1528
+ "asr"
1529
+ ],
1530
+ "pashto_evidence": {
1531
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1532
+ "evidence_url": "https://huggingface.co/ihanif/pashto-asr-base",
1533
+ "markers": [
1534
+ "pashto"
1535
+ ]
1536
+ },
1537
+ "tags": [
1538
+ "pashto",
1539
+ "model",
1540
+ "huggingface",
1541
+ "asr"
1542
+ ]
1543
+ },
1544
+ {
1545
+ "id": "model-hf-ihanif-wav2vec2-xls-r-300m-pashto-lm",
1546
+ "title": "ihanif/wav2vec2-xls-r-300m-pashto-lm",
1547
+ "url": "https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto-lm",
1548
+ "category": "model",
1549
+ "source": "huggingface",
1550
+ "status": "verified",
1551
+ "summary": "Pashto ASR model checkpoint discovered from huggingface candidate sync.",
1552
+ "primary_use": "Pashto ASR baseline and model comparison",
1553
+ "tasks": [
1554
+ "asr"
1555
+ ],
1556
+ "pashto_evidence": {
1557
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1558
+ "evidence_url": "https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto-lm",
1559
+ "markers": [
1560
+ "pashto"
1561
+ ]
1562
+ },
1563
+ "tags": [
1564
+ "pashto",
1565
+ "model",
1566
+ "huggingface",
1567
+ "asr"
1568
+ ]
1569
+ },
1570
+ {
1571
+ "id": "model-hf-ihanif-whisper-large-pashto",
1572
+ "title": "ihanif/whisper-large-pashto",
1573
+ "url": "https://huggingface.co/ihanif/whisper-large-pashto",
1574
+ "category": "model",
1575
+ "source": "huggingface",
1576
+ "status": "verified",
1577
+ "summary": "Pashto ASR model checkpoint discovered from huggingface candidate sync.",
1578
+ "primary_use": "Pashto ASR baseline and model comparison",
1579
+ "tasks": [
1580
+ "asr"
1581
+ ],
1582
+ "pashto_evidence": {
1583
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1584
+ "evidence_url": "https://huggingface.co/ihanif/whisper-large-pashto",
1585
+ "markers": [
1586
+ "pashto"
1587
+ ]
1588
+ },
1589
+ "tags": [
1590
+ "pashto",
1591
+ "model",
1592
+ "huggingface",
1593
+ "asr"
1594
+ ]
1595
+ },
1596
+ {
1597
+ "id": "model-hf-ihanif-whisper-medium-pashto-3e-7",
1598
+ "title": "ihanif/whisper-medium-pashto-3e-7",
1599
+ "url": "https://huggingface.co/ihanif/whisper-medium-pashto-3e-7",
1600
+ "category": "model",
1601
+ "source": "huggingface",
1602
+ "status": "verified",
1603
+ "summary": "Pashto ASR model checkpoint discovered from huggingface candidate sync.",
1604
+ "primary_use": "Pashto ASR baseline and model comparison",
1605
+ "tasks": [
1606
+ "asr"
1607
+ ],
1608
+ "pashto_evidence": {
1609
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1610
+ "evidence_url": "https://huggingface.co/ihanif/whisper-medium-pashto-3e-7",
1611
+ "markers": [
1612
+ "pashto"
1613
+ ]
1614
+ },
1615
+ "tags": [
1616
+ "pashto",
1617
+ "model",
1618
+ "huggingface",
1619
+ "asr"
1620
+ ]
1621
+ },
1622
+ {
1623
+ "id": "model-hf-ihanif-whisper-small-pashto",
1624
+ "title": "ihanif/whisper-small-pashto",
1625
+ "url": "https://huggingface.co/ihanif/whisper-small-pashto",
1626
+ "category": "model",
1627
+ "source": "huggingface",
1628
+ "status": "verified",
1629
+ "summary": "Pashto ASR model checkpoint discovered from huggingface candidate sync.",
1630
+ "primary_use": "Pashto ASR baseline and model comparison",
1631
+ "tasks": [
1632
+ "asr"
1633
+ ],
1634
+ "pashto_evidence": {
1635
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1636
+ "evidence_url": "https://huggingface.co/ihanif/whisper-small-pashto",
1637
+ "markers": [
1638
+ "pashto"
1639
+ ]
1640
+ },
1641
+ "tags": [
1642
+ "pashto",
1643
+ "model",
1644
+ "huggingface",
1645
+ "asr"
1646
+ ]
1647
+ },
1648
+ {
1649
+ "id": "model-hf-ihanif-xls-r-1b-pashto",
1650
+ "title": "ihanif/xls-r-1b-pashto",
1651
+ "url": "https://huggingface.co/ihanif/xls-r-1b-pashto",
1652
+ "category": "model",
1653
+ "source": "huggingface",
1654
+ "status": "verified",
1655
+ "summary": "Pashto ASR model checkpoint discovered from huggingface candidate sync.",
1656
+ "primary_use": "Pashto ASR baseline and model comparison",
1657
+ "tasks": [
1658
+ "asr"
1659
+ ],
1660
+ "pashto_evidence": {
1661
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1662
+ "evidence_url": "https://huggingface.co/ihanif/xls-r-1b-pashto",
1663
+ "markers": [
1664
+ "pashto"
1665
+ ]
1666
+ },
1667
+ "tags": [
1668
+ "pashto",
1669
+ "model",
1670
+ "huggingface",
1671
+ "asr"
1672
+ ]
1673
+ },
1674
+ {
1675
+ "id": "model-hf-ijazulhaq-bert-base-pashto-v1",
1676
+ "title": "ijazulhaq/bert-base-pashto-v1",
1677
+ "url": "https://huggingface.co/ijazulhaq/bert-base-pashto-v1",
1678
+ "category": "model",
1679
+ "source": "huggingface",
1680
+ "status": "verified",
1681
+ "summary": "Pashto NLP model checkpoint discovered from huggingface candidate sync.",
1682
+ "primary_use": "Pashto model baseline for downstream NLP tasks",
1683
+ "tasks": [
1684
+ "nlp"
1685
+ ],
1686
+ "pashto_evidence": {
1687
+ "evidence_text": "Matched by Pashto keyword in Hugging Face search results.",
1688
+ "evidence_url": "https://huggingface.co/ijazulhaq/bert-base-pashto-v1",
1689
+ "markers": [
1690
+ "pashto"
1691
+ ]
1692
+ },
1693
+ "tags": [
1694
+ "pashto",
1695
+ "model",
1696
+ "huggingface",
1697
+ "nlp"
1698
+ ]
1699
+ },
1700
+ {
1701
+ "id": "project-hf-space-ihanif-wav2vec2-bert-pashto-asr",
1702
+ "title": "ihanif/wav2vec2-bert-pashto-asr",
1703
+ "url": "https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr",
1704
+ "category": "project",
1705
+ "source": "huggingface",
1706
+ "status": "verified",
1707
+ "summary": "Pashto-focused interactive project discovered from huggingface for demonstration and quick evaluation.",
1708
+ "primary_use": "Interactive Pashto demo and quick qualitative validation",
1709
+ "tasks": [
1710
+ "asr",
1711
+ "nlp",
1712
+ "demo"
1713
+ ],
1714
+ "pashto_evidence": {
1715
+ "evidence_text": "Matched by Pashto keyword in Hugging Face Spaces search.",
1716
+ "evidence_url": "https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr",
1717
+ "markers": [
1718
+ "pashto"
1719
+ ]
1720
+ },
1721
+ "tags": [
1722
+ "pashto",
1723
+ "project",
1724
+ "huggingface",
1725
+ "asr",
1726
+ "nlp",
1727
+ "demo"
1728
+ ]
1729
+ },
1730
+ {
1731
+ "id": "project-hf-space-nasirkhansayyad-pashto-whisper-demo",
1732
+ "title": "nasirkhansayyad/pashto-whisper-demo",
1733
+ "url": "https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo",
1734
+ "category": "project",
1735
+ "source": "huggingface",
1736
+ "status": "verified",
1737
+ "summary": "Pashto-focused interactive project discovered from huggingface for demonstration and quick evaluation.",
1738
+ "primary_use": "Interactive Pashto demo and quick qualitative validation",
1739
+ "tasks": [
1740
+ "asr",
1741
+ "demo"
1742
+ ],
1743
+ "pashto_evidence": {
1744
+ "evidence_text": "Matched by Pashto keyword in Hugging Face Spaces search.",
1745
+ "evidence_url": "https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo",
1746
+ "markers": [
1747
+ "pashto"
1748
+ ]
1749
+ },
1750
+ "tags": [
1751
+ "pashto",
1752
+ "project",
1753
+ "huggingface",
1754
+ "asr",
1755
+ "demo"
1756
+ ]
1757
+ },
1758
+ {
1759
+ "id": "project-hf-space-tasal9-zamai-phi3-mini-pashto-demo",
1760
+ "title": "tasal9/ZamAI-Phi3-Mini-Pashto-Demo",
1761
+ "url": "https://huggingface.co/spaces/tasal9/ZamAI-Phi3-Mini-Pashto-Demo",
1762
+ "category": "project",
1763
+ "source": "huggingface",
1764
+ "status": "verified",
1765
+ "summary": "Pashto-focused interactive project discovered from huggingface for demonstration and quick evaluation.",
1766
+ "primary_use": "Interactive Pashto demo and quick qualitative validation",
1767
+ "tasks": [
1768
+ "llm",
1769
+ "demo"
1770
+ ],
1771
+ "pashto_evidence": {
1772
+ "evidence_text": "Matched by Pashto keyword in Hugging Face Spaces search.",
1773
+ "evidence_url": "https://huggingface.co/spaces/tasal9/ZamAI-Phi3-Mini-Pashto-Demo",
1774
+ "markers": [
1775
+ "pashto"
1776
+ ]
1777
+ },
1778
+ "tags": [
1779
+ "pashto",
1780
+ "project",
1781
+ "huggingface",
1782
+ "llm",
1783
+ "demo"
1784
+ ]
1785
+ },
1786
+ {
1787
+ "id": "project-hf-space-umar4321-pashto-to-english-urdu",
1788
+ "title": "Umar4321/Pashto-To-English-Urdu",
1789
+ "url": "https://huggingface.co/spaces/Umar4321/Pashto-To-English-Urdu",
1790
+ "category": "project",
1791
+ "source": "huggingface",
1792
+ "status": "verified",
1793
+ "summary": "Pashto-focused interactive project discovered from huggingface for demonstration and quick evaluation.",
1794
+ "primary_use": "Interactive Pashto demo and quick qualitative validation",
1795
+ "tasks": [
1796
+ "mt",
1797
+ "demo"
1798
+ ],
1799
+ "pashto_evidence": {
1800
+ "evidence_text": "Matched by Pashto keyword in Hugging Face Spaces search.",
1801
+ "evidence_url": "https://huggingface.co/spaces/Umar4321/Pashto-To-English-Urdu",
1802
+ "markers": [
1803
+ "pashto"
1804
+ ]
1805
+ },
1806
+ "tags": [
1807
+ "pashto",
1808
+ "project",
1809
+ "huggingface",
1810
+ "mt",
1811
+ "demo"
1812
+ ]
1813
+ },
1814
+ {
1815
+ "id": "project-github-fazlullahmamond-pashto-typing",
1816
+ "title": "Fazlullahmamond/Pashto-Typing",
1817
+ "url": "https://github.com/Fazlullahmamond/Pashto-Typing",
1818
+ "category": "project",
1819
+ "source": "github",
1820
+ "status": "verified",
1821
+ "summary": "Pashto-focused interactive project discovered from github for demonstration and quick evaluation.",
1822
+ "primary_use": "Interactive Pashto demo and quick qualitative validation",
1823
+ "tasks": [
1824
+ "demo"
1825
+ ],
1826
+ "pashto_evidence": {
1827
+ "evidence_text": "Repository metadata (name/description/topics) includes Pashto markers.",
1828
+ "evidence_url": "https://github.com/Fazlullahmamond/Pashto-Typing",
1829
+ "markers": [
1830
+ "pashto"
1831
+ ]
1832
+ },
1833
+ "tags": [
1834
+ "pashto",
1835
+ "project",
1836
+ "github",
1837
+ "demo"
1838
+ ]
1839
+ },
1840
+ {
1841
+ "id": "project-github-ihyacommunity-khushkhat-extension",
1842
+ "title": "IhyaCommunity/Khushkhat-Extension",
1843
+ "url": "https://github.com/IhyaCommunity/Khushkhat-Extension",
1844
+ "category": "project",
1845
+ "source": "github",
1846
+ "status": "verified",
1847
+ "summary": "Pashto-focused interactive project discovered from github for demonstration and quick evaluation.",
1848
+ "primary_use": "Interactive Pashto demo and quick qualitative validation",
1849
+ "tasks": [
1850
+ "demo"
1851
+ ],
1852
+ "pashto_evidence": {
1853
+ "evidence_text": "Repository metadata (name/description/topics) includes Pashto markers.",
1854
+ "evidence_url": "https://github.com/IhyaCommunity/Khushkhat-Extension",
1855
+ "markers": [
1856
+ "pashto"
1857
+ ]
1858
+ },
1859
+ "tags": [
1860
+ "pashto",
1861
+ "project",
1862
+ "github",
1863
+ "demo"
1864
+ ]
1865
+ },
1866
+ {
1867
+ "id": "paper-s2-benchmarking-whisper-for-low-resource-speech-recognition-an-n-shot-evaluation-on-pashto-pu",
1868
+ "title": "Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu",
1869
+ "url": "https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693",
1870
+ "category": "paper",
1871
+ "source": "other",
1872
+ "status": "verified",
1873
+ "summary": "Pashto language technology paper discovered from other for research reference.",
1874
+ "primary_use": "Pashto research reference for methods and benchmarking",
1875
+ "tasks": [
1876
+ "asr",
1877
+ "mt"
1878
+ ],
1879
+ "pashto_evidence": {
1880
+ "evidence_text": "Matched by Semantic Scholar query: pashto.",
1881
+ "evidence_url": "https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693",
1882
+ "markers": [
1883
+ "pashto"
1884
+ ]
1885
+ },
1886
+ "tags": [
1887
+ "pashto",
1888
+ "paper",
1889
+ "other",
1890
+ "asr",
1891
+ "mt"
1892
+ ]
1893
+ },
1894
+ {
1895
+ "id": "paper-s2-deep-learning-based-detection-of-one-and-two-column-textual-blocks-in-camera-captured-pash",
1896
+ "title": "Deep Learning-Based Detection of One and Two-Column Textual Blocks in Camera-Captured Pashto Documents Images",
1897
+ "url": "https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182",
1898
+ "category": "paper",
1899
+ "source": "other",
1900
+ "status": "verified",
1901
+ "summary": "Pashto language technology paper discovered from other for research reference.",
1902
+ "primary_use": "Pashto research reference for methods and benchmarking",
1903
+ "tasks": [
1904
+ "ocr"
1905
+ ],
1906
+ "pashto_evidence": {
1907
+ "evidence_text": "Matched by Semantic Scholar query: pashto.",
1908
+ "evidence_url": "https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182",
1909
+ "markers": [
1910
+ "pashto"
1911
+ ]
1912
+ },
1913
+ "tags": [
1914
+ "pashto",
1915
+ "paper",
1916
+ "other",
1917
+ "ocr"
1918
+ ]
1919
+ },
1920
+ {
1921
+ "id": "paper-s2-out-of-vocabulary-pashto-spell-checker-using-morphological-operations",
1922
+ "title": "Out-of-Vocabulary Pashto Spell Checker using Morphological Operations",
1923
+ "url": "https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7",
1924
+ "category": "paper",
1925
+ "source": "other",
1926
+ "status": "verified",
1927
+ "summary": "Pashto language technology paper discovered from other for research reference.",
1928
+ "primary_use": "Pashto research reference for methods and benchmarking",
1929
+ "tasks": [
1930
+ "nlp"
1931
+ ],
1932
+ "pashto_evidence": {
1933
+ "evidence_text": "Matched by Semantic Scholar query: pashto.",
1934
+ "evidence_url": "https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7",
1935
+ "markers": [
1936
+ "pashto"
1937
+ ]
1938
+ },
1939
+ "tags": [
1940
+ "pashto",
1941
+ "paper",
1942
+ "other",
1943
+ "nlp"
1944
+ ]
1945
+ },
1946
+ {
1947
+ "id": "paper-s2-pashto-shallow-parsing-a-deep-learning-approach",
1948
+ "title": "Pashto Shallow Parsing: A Deep Learning Approach",
1949
+ "url": "https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5",
1950
+ "category": "paper",
1951
+ "source": "other",
1952
+ "status": "verified",
1953
+ "summary": "Pashto language technology paper discovered from other for research reference.",
1954
+ "primary_use": "Pashto research reference for methods and benchmarking",
1955
+ "tasks": [
1956
+ "nlp"
1957
+ ],
1958
+ "pashto_evidence": {
1959
+ "evidence_text": "Matched by Semantic Scholar query: pashto.",
1960
+ "evidence_url": "https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5",
1961
+ "markers": [
1962
+ "pashto"
1963
+ ]
1964
+ },
1965
+ "tags": [
1966
+ "pashto",
1967
+ "paper",
1968
+ "other",
1969
+ "nlp"
1970
+ ]
1971
+ },
1972
+ {
1973
+ "id": "paper-s2-pos-tagging-of-low-resource-pashto-language-annotated-corpus-and-bert-based-model",
1974
+ "title": "POS tagging of low-resource Pashto language: annotated corpus and BERT-based model",
1975
+ "url": "https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769",
1976
+ "category": "paper",
1977
+ "source": "other",
1978
+ "status": "verified",
1979
+ "summary": "Pashto language technology paper discovered from other for research reference.",
1980
+ "primary_use": "Pashto research reference for methods and benchmarking",
1981
+ "tasks": [
1982
+ "nlp"
1983
+ ],
1984
+ "pashto_evidence": {
1985
+ "evidence_text": "Matched by Semantic Scholar query: pashto.",
1986
+ "evidence_url": "https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769",
1987
+ "markers": [
1988
+ "pashto"
1989
+ ]
1990
+ },
1991
+ "tags": [
1992
+ "pashto",
1993
+ "paper",
1994
+ "other",
1995
+ "nlp"
1996
+ ]
1997
+ },
1998
+ {
1999
+ "id": "paper-arxiv-enhancing-pashto-text-classification-using-language-processing-techniques-for-single-and-m",
2000
+ "title": "Enhancing Pashto Text Classification using Language Processing Techniques for Single And Multi-Label Analysis",
2001
+ "url": "http://arxiv.org/abs/2305.03201v1",
2002
+ "category": "paper",
2003
+ "source": "arxiv",
2004
+ "status": "verified",
2005
+ "summary": "Pashto language technology paper discovered from arxiv for research reference.",
2006
+ "primary_use": "Pashto research reference for methods and benchmarking",
2007
+ "tasks": [
2008
+ "nlp"
2009
+ ],
2010
+ "pashto_evidence": {
2011
+ "evidence_text": "Matched by arXiv query: all:pashto.",
2012
+ "evidence_url": "http://arxiv.org/abs/2305.03201v1",
2013
+ "markers": [
2014
+ "pashto"
2015
+ ]
2016
+ },
2017
+ "tags": [
2018
+ "pashto",
2019
+ "paper",
2020
+ "arxiv",
2021
+ "nlp"
2022
+ ]
2023
+ },
2024
+ {
2025
+ "id": "paper-arxiv-knn-and-ann-based-recognition-of-handwritten-pashto-letters-using-zoning-features",
2026
+ "title": "KNN and ANN-based Recognition of Handwritten Pashto Letters using Zoning Features",
2027
+ "url": "http://arxiv.org/abs/1904.03391v2",
2028
+ "category": "paper",
2029
+ "source": "arxiv",
2030
+ "status": "verified",
2031
+ "summary": "Pashto language technology paper discovered from arxiv for research reference.",
2032
+ "primary_use": "Pashto research reference for methods and benchmarking",
2033
+ "tasks": [
2034
+ "ocr"
2035
+ ],
2036
+ "pashto_evidence": {
2037
+ "evidence_text": "Matched by arXiv query: all:pashto.",
2038
+ "evidence_url": "http://arxiv.org/abs/1904.03391v2",
2039
+ "markers": [
2040
+ "pashto"
2041
+ ]
2042
+ },
2043
+ "tags": [
2044
+ "pashto",
2045
+ "paper",
2046
+ "arxiv",
2047
+ "ocr"
2048
+ ]
2049
  }
2050
  ]
2051
  }
resources/datasets/README.md CHANGED
@@ -5,16 +5,30 @@
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
7
  | 99 Hours Pashto Spontaneous Dialogue Smartphone Speech Dataset | [huggingface](https://huggingface.co/datasets/Nexdata/99_Hours_Pashto_Spontaneous_Dialogue_Smartphone_speech_dataset) | [Dataset title explicitly includes Pashto and API metadata marks audio and text modalities. (`Pashto`)](https://huggingface.co/datasets/Nexdata/99_Hours_Pashto_Spontaneous_Dialogue_Smartphone_speech_dataset) | Spontaneous speech ASR training and robustness evaluation |
 
 
8
  | Belebele | [huggingface](https://huggingface.co/datasets/facebook/belebele) | [Dataset includes pbt_Arab subset. (`pbt_Arab`)](https://huggingface.co/datasets/facebook/belebele) | Comprehension and multilingual NLP benchmark |
 
9
  | Common Voice Scripted Speech 24.0 - Pashto | [mozilla](https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14) | [Official dataset page is for Pashto. (`Pashto`)](https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14) | ASR training and evaluation |
 
 
10
  | Google FLEURS | [huggingface](https://huggingface.co/datasets/google/fleurs) | [Dataset config includes ps_af. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark and external evaluation |
 
 
 
 
11
  | OPUS-100 | [huggingface](https://huggingface.co/datasets/Helsinki-NLP/opus-100) | [Dataset viewer includes en-ps split. (`en-ps`)](https://huggingface.co/datasets/Helsinki-NLP/opus-100/viewer/en-ps) | Machine translation training and evaluation |
12
  | OSCAR Corpus | [huggingface](https://huggingface.co/datasets/oscar-corpus/oscar) | [Dataset includes unshuffled_deduplicated_ps split. (`unshuffled_deduplicated_ps`)](https://huggingface.co/datasets/oscar-corpus/oscar) | Language modeling and lexicon expansion |
13
  | Pashto English Bilingual Sentiment Corpus | [kaggle](https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus) | [Kaggle dataset title and description identify the corpus as Pashto-English sentiment data. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/farhadkhan66/pashto-translated-corpus) | Sentiment analysis and bilingual NLP experiments |
14
  | Pashto Isolated Words Speech Dataset | [kaggle](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | [Dataset title explicitly states Pashto speech dataset. (`Pashto`)](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | Keyword spotting and constrained ASR experiments |
 
15
  | Pashto Wikipedia Corpus | [huggingface](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | [Dataset metadata includes language:ps and the title specifies Pashto corpus. (`ps`, `Pashto`)](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | Pashto text corpus for NLP baselines |
16
  | Pashto Word Embeddings | [kaggle](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | [Dataset description states pretrained Pashto embeddings. (`Pashto`)](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | Lexical semantics and lightweight NLP baselines |
 
17
  | POLD - Pashto Offensive Language Dataset | [kaggle](https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset) | [Kaggle title and description explicitly state Pashto offensive language benchmark dataset. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/drijaz/pold-pashto-offensive-language-dataset) | Pashto toxicity and moderation NLP benchmarks |
 
 
 
18
  | Urdu-Pashto Lexicon Dataset | [kaggle](https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset) | [Kaggle metadata describes 7,601 Urdu entries with Pashto translations. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/shafeeqgigyani/urdu-pashto-lexicon-dataset) | Lexicon and translation lexeme mapping |
19
  | Wikimedia Wikipedia | [huggingface](https://huggingface.co/datasets/wikimedia/wikipedia) | [Dataset includes 20231101.ps subset. (`20231101.ps`)](https://huggingface.co/datasets/wikimedia/wikipedia) | Terminology and balanced text corpus |
20
  | Zirak-AI PashtoOCR | [huggingface](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | [Dataset tags include language:ps and the dataset name is PashtoOCR. (`ps`, `PashtoOCR`)](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | OCR and text extraction benchmarking |
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
7
  | 99 Hours Pashto Spontaneous Dialogue Smartphone Speech Dataset | [huggingface](https://huggingface.co/datasets/Nexdata/99_Hours_Pashto_Spontaneous_Dialogue_Smartphone_speech_dataset) | [Dataset title explicitly includes Pashto and API metadata marks audio and text modalities. (`Pashto`)](https://huggingface.co/datasets/Nexdata/99_Hours_Pashto_Spontaneous_Dialogue_Smartphone_speech_dataset) | Spontaneous speech ASR training and robustness evaluation |
8
+ | adnankhan769/proper_dataset_english_2_pashto | [huggingface](https://huggingface.co/datasets/adnankhan769/proper_dataset_english_2_pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/adnankhan769/proper_dataset_english_2_pashto) | Machine translation and bilingual corpus development |
9
+ | alpaca-pashto-cleaned | [huggingface](https://huggingface.co/datasets/saillab/alpaca-pashto-cleaned) | [Dataset metadata includes language:ps and dataset name includes Pashto. (`ps`, `Pashto`)](https://huggingface.co/api/datasets/saillab/alpaca-pashto-cleaned) | Pashto instruction tuning and conversational NLP experiments |
10
  | Belebele | [huggingface](https://huggingface.co/datasets/facebook/belebele) | [Dataset includes pbt_Arab subset. (`pbt_Arab`)](https://huggingface.co/datasets/facebook/belebele) | Comprehension and multilingual NLP benchmark |
11
+ | Common Voice 24.0: Pashto Speech Dataset | [kaggle](https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto) | ASR training and evaluation data source |
12
  | Common Voice Scripted Speech 24.0 - Pashto | [mozilla](https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14) | [Official dataset page is for Pashto. (`Pashto`)](https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14) | ASR training and evaluation |
13
+ | English to Pashto Sentences Dataset | [huggingface](https://huggingface.co/datasets/adnankhan769/english_to_pashto_sentences_dataset) | [Dataset ID explicitly states English-to-Pashto and includes Pashto-script sentence column. (`Pashto`)](https://huggingface.co/api/datasets/adnankhan769/english_to_pashto_sentences_dataset) | MT and bilingual sentence alignment baseline |
14
+ | English-Pashto Language Dataset (EPLD) | [kaggle](https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld) | Machine translation and bilingual corpus development |
15
  | Google FLEURS | [huggingface](https://huggingface.co/datasets/google/fleurs) | [Dataset config includes ps_af. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark and external evaluation |
16
+ | ihanif/pashto_asr_wer | [huggingface](https://huggingface.co/datasets/ihanif/pashto_asr_wer) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_asr_wer) | ASR training and evaluation data source |
17
+ | ihanif/pashto_speech_ds | [huggingface](https://huggingface.co/datasets/ihanif/pashto_speech_ds) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_speech_ds) | ASR training and evaluation data source |
18
+ | ihanif/pashto_speech_parquet_10k | [huggingface](https://huggingface.co/datasets/ihanif/pashto_speech_parquet_10k) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_speech_parquet_10k) | ASR training and evaluation data source |
19
+ | Katib's Pashto Text Imagebase (KPTI) | [kaggle](https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti) | OCR training and evaluation data source |
20
  | OPUS-100 | [huggingface](https://huggingface.co/datasets/Helsinki-NLP/opus-100) | [Dataset viewer includes en-ps split. (`en-ps`)](https://huggingface.co/datasets/Helsinki-NLP/opus-100/viewer/en-ps) | Machine translation training and evaluation |
21
  | OSCAR Corpus | [huggingface](https://huggingface.co/datasets/oscar-corpus/oscar) | [Dataset includes unshuffled_deduplicated_ps split. (`unshuffled_deduplicated_ps`)](https://huggingface.co/datasets/oscar-corpus/oscar) | Language modeling and lexicon expansion |
22
  | Pashto English Bilingual Sentiment Corpus | [kaggle](https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus) | [Kaggle dataset title and description identify the corpus as Pashto-English sentiment data. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/farhadkhan66/pashto-translated-corpus) | Sentiment analysis and bilingual NLP experiments |
23
  | Pashto Isolated Words Speech Dataset | [kaggle](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | [Dataset title explicitly states Pashto speech dataset. (`Pashto`)](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | Keyword spotting and constrained ASR experiments |
24
+ | Pashto OCR | [kaggle](https://www.kaggle.com/datasets/hassanamin/pashto-ocr) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/hassanamin/pashto-ocr) | OCR training and evaluation data source |
25
  | Pashto Wikipedia Corpus | [huggingface](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | [Dataset metadata includes language:ps and the title specifies Pashto corpus. (`ps`, `Pashto`)](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | Pashto text corpus for NLP baselines |
26
  | Pashto Word Embeddings | [kaggle](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | [Dataset description states pretrained Pashto embeddings. (`Pashto`)](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | Lexical semantics and lightweight NLP baselines |
27
+ | PashtoOCR (Kaggle) | [kaggle](https://www.kaggle.com/datasets/drijaz/pashtoocr) | [Kaggle dataset title and subtitle explicitly identify a Pashto OCR dataset. (`Pashto`, `OCR`)](https://www.kaggle.com/api/v1/datasets/view/drijaz/pashtoocr) | Pashto OCR dataset benchmarking and training |
28
  | POLD - Pashto Offensive Language Dataset | [kaggle](https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset) | [Kaggle title and description explicitly state Pashto offensive language benchmark dataset. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/drijaz/pold-pashto-offensive-language-dataset) | Pashto toxicity and moderation NLP benchmarks |
29
+ | saillab/alpaca_pashto_taco | [huggingface](https://huggingface.co/datasets/saillab/alpaca_pashto_taco) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/saillab/alpaca_pashto_taco) | Instruction tuning and LLM adaptation data source |
30
+ | SherwinDesouza/pashto-common-voice-20 | [huggingface](https://huggingface.co/datasets/SherwinDesouza/pashto-common-voice-20) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/SherwinDesouza/pashto-common-voice-20) | Pashto data source for NLP experimentation |
31
+ | tasal9/ZamAI_Pashto_Dataset | [huggingface](https://huggingface.co/datasets/tasal9/ZamAI_Pashto_Dataset) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/tasal9/ZamAI_Pashto_Dataset) | Pashto data source for NLP experimentation |
32
  | Urdu-Pashto Lexicon Dataset | [kaggle](https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset) | [Kaggle metadata describes 7,601 Urdu entries with Pashto translations. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/shafeeqgigyani/urdu-pashto-lexicon-dataset) | Lexicon and translation lexeme mapping |
33
  | Wikimedia Wikipedia | [huggingface](https://huggingface.co/datasets/wikimedia/wikipedia) | [Dataset includes 20231101.ps subset. (`20231101.ps`)](https://huggingface.co/datasets/wikimedia/wikipedia) | Terminology and balanced text corpus |
34
  | Zirak-AI PashtoOCR | [huggingface](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | [Dataset tags include language:ps and the dataset name is PashtoOCR. (`ps`, `PashtoOCR`)](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | OCR and text extraction benchmarking |
resources/models/README.md CHANGED
@@ -4,6 +4,13 @@
4
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
 
 
 
 
 
 
 
7
  | MMS 1B All | [huggingface](https://huggingface.co/facebook/mms-1b-all) | [MMS coverage table includes pus with ASR support. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | ASR transfer baseline |
8
  | MMS TTS | [huggingface](https://huggingface.co/facebook/mms-tts) | [MMS coverage table includes pus with TTS support. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | TTS baseline and transfer |
9
  | NLLB-200 Distilled 600M | [huggingface](https://huggingface.co/facebook/nllb-200-distilled-600M) | [Model special token map includes pbt_Arab. (`pbt_Arab`)](https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json) | Pashto translation baseline |
 
4
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
7
+ | ihanif/pashto-asr-base | [huggingface](https://huggingface.co/ihanif/pashto-asr-base) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/pashto-asr-base) | Pashto ASR baseline and model comparison |
8
+ | ihanif/wav2vec2-xls-r-300m-pashto-lm | [huggingface](https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto-lm) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto-lm) | Pashto ASR baseline and model comparison |
9
+ | ihanif/whisper-large-pashto | [huggingface](https://huggingface.co/ihanif/whisper-large-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/whisper-large-pashto) | Pashto ASR baseline and model comparison |
10
+ | ihanif/whisper-medium-pashto-3e-7 | [huggingface](https://huggingface.co/ihanif/whisper-medium-pashto-3e-7) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/whisper-medium-pashto-3e-7) | Pashto ASR baseline and model comparison |
11
+ | ihanif/whisper-small-pashto | [huggingface](https://huggingface.co/ihanif/whisper-small-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/whisper-small-pashto) | Pashto ASR baseline and model comparison |
12
+ | ihanif/xls-r-1b-pashto | [huggingface](https://huggingface.co/ihanif/xls-r-1b-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/xls-r-1b-pashto) | Pashto ASR baseline and model comparison |
13
+ | ijazulhaq/bert-base-pashto-v1 | [huggingface](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | Pashto model baseline for downstream NLP tasks |
14
  | MMS 1B All | [huggingface](https://huggingface.co/facebook/mms-1b-all) | [MMS coverage table includes pus with ASR support. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | ASR transfer baseline |
15
  | MMS TTS | [huggingface](https://huggingface.co/facebook/mms-tts) | [MMS coverage table includes pus with TTS support. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | TTS baseline and transfer |
16
  | NLLB-200 Distilled 600M | [huggingface](https://huggingface.co/facebook/nllb-200-distilled-600M) | [Model special token map includes pbt_Arab. (`pbt_Arab`)](https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json) | Pashto translation baseline |
resources/papers/README.md CHANGED
@@ -4,8 +4,16 @@
4
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
 
 
 
7
  | FLEURS: Few-shot Learning Evaluation of Universal Representations of Speech | [arxiv](https://arxiv.org/abs/2205.12446) | [Dataset implementation includes ps_af language code. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark methodology reference |
 
8
  | No Language Left Behind | [arxiv](https://arxiv.org/abs/2207.04672) | [Model usage in repo references pbt_Arab token support. (`pbt_Arab`)](https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json) | MT research reference |
 
 
 
 
9
  | Robust Speech Recognition via Large-Scale Weak Supervision | [arxiv](https://arxiv.org/abs/2212.04356) | [Paired with tokenizer language map containing ps. (`ps`)](https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py) | ASR methodology reference |
10
  | Scaling Speech Technology to 1,000+ Languages | [arxiv](https://arxiv.org/abs/2305.13516) | [Coverage table marks pus support in MMS release. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | ASR and TTS transfer reference |
11
 
 
4
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
7
+ | Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu | [other](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | Pashto research reference for methods and benchmarking |
8
+ | Deep Learning-Based Detection of One and Two-Column Textual Blocks in Camera-Captured Pashto Documents Images | [other](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | Pashto research reference for methods and benchmarking |
9
+ | Enhancing Pashto Text Classification using Language Processing Techniques for Single And Multi-Label Analysis | [arxiv](http://arxiv.org/abs/2305.03201v1) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/2305.03201v1) | Pashto research reference for methods and benchmarking |
10
  | FLEURS: Few-shot Learning Evaluation of Universal Representations of Speech | [arxiv](https://arxiv.org/abs/2205.12446) | [Dataset implementation includes ps_af language code. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark methodology reference |
11
+ | KNN and ANN-based Recognition of Handwritten Pashto Letters using Zoning Features | [arxiv](http://arxiv.org/abs/1904.03391v2) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/1904.03391v2) | Pashto research reference for methods and benchmarking |
12
  | No Language Left Behind | [arxiv](https://arxiv.org/abs/2207.04672) | [Model usage in repo references pbt_Arab token support. (`pbt_Arab`)](https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json) | MT research reference |
13
+ | Out-of-Vocabulary Pashto Spell Checker using Morphological Operations | [other](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | Pashto research reference for methods and benchmarking |
14
+ | Pashto Shallow Parsing: A Deep Learning Approach | [other](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | Pashto research reference for methods and benchmarking |
15
+ | POS tagging of low-resource Pashto language: annotated corpus and BERT-based model | [other](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | Pashto research reference for methods and benchmarking |
16
+ | PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language | [other](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | [Paper title explicitly references low-resource Pashto language OCR benchmarking. (`Pashto`, `OCR`)](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | Pashto OCR research baseline and evaluation reference |
17
  | Robust Speech Recognition via Large-Scale Weak Supervision | [arxiv](https://arxiv.org/abs/2212.04356) | [Paired with tokenizer language map containing ps. (`ps`)](https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py) | ASR methodology reference |
18
  | Scaling Speech Technology to 1,000+ Languages | [arxiv](https://arxiv.org/abs/2305.13516) | [Coverage table marks pus support in MMS release. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | ASR and TTS transfer reference |
19
 
resources/projects/README.md CHANGED
@@ -4,9 +4,17 @@
4
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
 
 
 
 
 
7
  | Pashto ASR V3 Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr-v3) | [Space card title is Pashto ASR V3 and short description states Pashto ASR. (`Pashto`, `ASR`)](https://huggingface.co/api/spaces/ihanif/pashto-asr-v3) | Project demo for Pashto ASR user testing |
8
  | Pashto to English Dictionary Space | [huggingface](https://huggingface.co/spaces/EngrAamirBangash/Pashto2English-Dictionary) | [Space metadata title states Pashto to English Dictionary. (`Pashto`)](https://huggingface.co/api/spaces/EngrAamirBangash/Pashto2English-Dictionary) | Interactive bilingual lookup project |
9
  | Pashto Translator Space | [huggingface](https://huggingface.co/spaces/Umar4321/Pashto-Translator) | [Space title is Pashto Translator and description states Pashto to English and Urdu translation. (`Pashto`)](https://huggingface.co/api/spaces/Umar4321/Pashto-Translator) | Interactive translation project demo |
 
 
 
10
 
11
  ## Maintenance
12
  - Source of truth: [../catalog/resources.json](../catalog/resources.json)
 
4
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
7
+ | Fazlullahmamond/Pashto-Typing | [github](https://github.com/Fazlullahmamond/Pashto-Typing) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Fazlullahmamond/Pashto-Typing) | Interactive Pashto demo and quick qualitative validation |
8
+ | ihanif/wav2vec2-bert-pashto-asr | [huggingface](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | Interactive Pashto demo and quick qualitative validation |
9
+ | IhyaCommunity/Khushkhat-Extension | [github](https://github.com/IhyaCommunity/Khushkhat-Extension) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/IhyaCommunity/Khushkhat-Extension) | Interactive Pashto demo and quick qualitative validation |
10
+ | nasirkhansayyad/pashto-whisper-demo | [huggingface](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | Interactive Pashto demo and quick qualitative validation |
11
+ | Pashto ASR Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr) | [Space ID includes pashto-asr and is returned by Hugging Face Pashto space search. (`pashto`, `asr`)](https://huggingface.co/api/spaces/ihanif/pashto-asr) | Live Pashto speech-to-text demo project |
12
  | Pashto ASR V3 Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr-v3) | [Space card title is Pashto ASR V3 and short description states Pashto ASR. (`Pashto`, `ASR`)](https://huggingface.co/api/spaces/ihanif/pashto-asr-v3) | Project demo for Pashto ASR user testing |
13
  | Pashto to English Dictionary Space | [huggingface](https://huggingface.co/spaces/EngrAamirBangash/Pashto2English-Dictionary) | [Space metadata title states Pashto to English Dictionary. (`Pashto`)](https://huggingface.co/api/spaces/EngrAamirBangash/Pashto2English-Dictionary) | Interactive bilingual lookup project |
14
  | Pashto Translator Space | [huggingface](https://huggingface.co/spaces/Umar4321/Pashto-Translator) | [Space title is Pashto Translator and description states Pashto to English and Urdu translation. (`Pashto`)](https://huggingface.co/api/spaces/Umar4321/Pashto-Translator) | Interactive translation project demo |
15
+ | tasal9/ZamAI-Phi3-Mini-Pashto-Demo | [huggingface](https://huggingface.co/spaces/tasal9/ZamAI-Phi3-Mini-Pashto-Demo) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/tasal9/ZamAI-Phi3-Mini-Pashto-Demo) | Interactive Pashto demo and quick qualitative validation |
16
+ | Umar4321/Pashto-To-English-Urdu | [huggingface](https://huggingface.co/spaces/Umar4321/Pashto-To-English-Urdu) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/Umar4321/Pashto-To-English-Urdu) | Interactive Pashto demo and quick qualitative validation |
17
+ | ZamAI-Mistral-7B-Pashto Space | [huggingface](https://huggingface.co/spaces/tasal9/ZamAI-Mistral-7B-Pashto-space) | [Space title and ID explicitly include Pashto and model card metadata exposes project details. (`Pashto`)](https://huggingface.co/api/spaces/tasal9/ZamAI-Mistral-7B-Pashto-space) | Interactive Pashto LLM project demo |
18
 
19
  ## Maintenance
20
  - Source of truth: [../catalog/resources.json](../catalog/resources.json)
scripts/sync_resources.py CHANGED
@@ -13,15 +13,23 @@ from __future__ import annotations
13
  import argparse
14
  import json
15
  import re
 
 
 
16
  import urllib.parse
17
  import urllib.request
18
  import xml.etree.ElementTree as ET
19
  from datetime import datetime, timezone
 
 
20
  from pathlib import Path
21
  from typing import Any
 
22
 
23
 
24
  USER_AGENT = "pashto-resource-sync/1.0"
 
 
25
 
26
 
27
  def _slug(value: str) -> str:
@@ -31,16 +39,130 @@ def _slug(value: str) -> str:
31
  return value[:80] if value else "resource"
32
 
33
 
34
- def _fetch_json(url: str, timeout: float = 20.0) -> Any:
35
- req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
36
- with urllib.request.urlopen(req, timeout=timeout) as response:
37
- return json.loads(response.read().decode("utf-8"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
 
40
- def _fetch_text(url: str, timeout: float = 20.0) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
42
- with urllib.request.urlopen(req, timeout=timeout) as response:
43
- return response.read().decode("utf-8", errors="replace")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
 
46
  def _candidate(
@@ -81,7 +203,7 @@ def fetch_huggingface(kind: str, limit: int) -> list[dict[str, Any]]:
81
 
82
  query = urllib.parse.urlencode({"search": "pashto", "limit": str(limit)})
83
  url = f"https://huggingface.co/api/{kind}?{query}"
84
- payload = _fetch_json(url)
85
 
86
  category = "dataset" if kind == "datasets" else "model"
87
  out: list[dict[str, Any]] = []
@@ -111,7 +233,7 @@ def fetch_huggingface(kind: str, limit: int) -> list[dict[str, Any]]:
111
  def fetch_huggingface_spaces(limit: int) -> list[dict[str, Any]]:
112
  query = urllib.parse.urlencode({"search": "pashto", "limit": str(limit)})
113
  url = f"https://huggingface.co/api/spaces?{query}"
114
- payload = _fetch_json(url)
115
 
116
  out: list[dict[str, Any]] = []
117
  for item in payload:
@@ -142,7 +264,7 @@ def fetch_kaggle_datasets(limit: int) -> list[dict[str, Any]]:
142
  # Public Kaggle dataset listing endpoint (no auth needed for list responses).
143
  query = urllib.parse.urlencode({"search": "pashto", "page": "1"})
144
  url = f"https://www.kaggle.com/api/v1/datasets/list?{query}"
145
- payload = _fetch_json(url)
146
 
147
  out: list[dict[str, Any]] = []
148
  for item in payload:
@@ -191,7 +313,11 @@ def fetch_github_pashto_repos(limit: int) -> list[dict[str, Any]]:
191
  {"q": query_text, "sort": "stars", "order": "desc", "per_page": str(limit)}
192
  )
193
  url = f"https://api.github.com/search/repositories?{query}"
194
- payload = _fetch_json(url)
 
 
 
 
195
  for item in payload.get("items", []):
196
  full_name = item.get("full_name")
197
  html_url = item.get("html_url")
@@ -244,8 +370,21 @@ def fetch_arxiv(limit: int) -> list[dict[str, Any]]:
244
  query = urllib.parse.urlencode(
245
  {"search_query": "all:pashto", "start": "0", "max_results": str(limit)}
246
  )
247
- url = f"http://export.arxiv.org/api/query?{query}"
248
- xml_text = _fetch_text(url)
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  root = ET.fromstring(xml_text)
250
  ns = {"atom": "http://www.w3.org/2005/Atom"}
251
 
@@ -281,7 +420,11 @@ def fetch_semantic_scholar(limit: int) -> list[dict[str, Any]]:
281
  {"query": "pashto", "limit": str(limit), "fields": fields}
282
  )
283
  url = f"https://api.semanticscholar.org/graph/v1/paper/search?{query}"
284
- payload = _fetch_json(url)
 
 
 
 
285
 
286
  out: list[dict[str, Any]] = []
287
  for item in payload.get("data", []):
 
13
  import argparse
14
  import json
15
  import re
16
+ import socket
17
+ import ssl
18
+ import time
19
  import urllib.parse
20
  import urllib.request
21
  import xml.etree.ElementTree as ET
22
  from datetime import datetime, timezone
23
+ from email.utils import parsedate_to_datetime
24
+ from http.client import IncompleteRead
25
  from pathlib import Path
26
  from typing import Any
27
+ from urllib.error import HTTPError, URLError
28
 
29
 
30
  USER_AGENT = "pashto-resource-sync/1.0"  # sent as the User-Agent request header by _fetch_bytes
31
+ MAX_FETCH_RETRIES = 4  # total attempts per URL (first try included), i.e. at most 3 retries
32
+ RETRYABLE_HTTP_CODES = {429, 500, 502, 503, 504}  # HTTP statuses that _fetch_bytes retries instead of raising at once
33
 
34
 
35
  def _slug(value: str) -> str:
 
39
  return value[:80] if value else "resource"
40
 
41
 
42
+ def _parse_retry_after_seconds(retry_after: str | None) -> float | None:
43
+ if not retry_after:
44
+ return None
45
+
46
+ retry_after = retry_after.strip()
47
+ if not retry_after:
48
+ return None
49
+
50
+ if retry_after.isdigit():
51
+ return float(retry_after)
52
+
53
+ try:
54
+ retry_at = parsedate_to_datetime(retry_after)
55
+ except (TypeError, ValueError):
56
+ return None
57
+
58
+ now = datetime.now(timezone.utc)
59
+ if retry_at.tzinfo is None:
60
+ retry_at = retry_at.replace(tzinfo=timezone.utc)
61
+ return max(0.0, (retry_at - now).total_seconds())
62
+
63
+
64
def _is_ssl_cert_error(exc: BaseException) -> bool:
    """Report whether ``exc`` represents a TLS certificate verification failure.

    Matches a bare ``ssl.SSLCertVerificationError``, a ``URLError`` whose
    ``reason`` is one, or any exception whose text contains the marker
    ``CERTIFICATE_VERIFY_FAILED``.
    """
    cert_error = ssl.SSLCertVerificationError
    if isinstance(exc, cert_error):
        return True
    if isinstance(exc, URLError) and isinstance(exc.reason, cert_error):
        return True
    # Last resort: look for the marker in the exception's string form.
    return "CERTIFICATE_VERIFY_FAILED" in str(exc)
72
+
73
+
74
def _retryable_network_error(exc: BaseException) -> bool:
    """Decide whether a failed request is worth retrying.

    Certificate verification failures are never retried. Timeouts,
    truncated reads and connection resets are retried, and so is every
    other ``URLError`` regardless of its ``reason``. Anything else is
    treated as non-retryable.
    """
    if _is_ssl_cert_error(exc):
        return False
    transient_types = (
        TimeoutError,
        socket.timeout,
        IncompleteRead,
        ConnectionResetError,
        URLError,
    )
    return isinstance(exc, transient_types)
85
 
86
 
87
def _retry_delay(attempt: int, retry_after: str | None = None) -> float:
    """Return how many seconds to wait before the next attempt.

    Args:
        attempt: 1-based number of the attempt that just failed.
        retry_after: Optional raw ``Retry-After`` header value.

    Returns:
        The parsed ``Retry-After`` delay clamped to the range 0-60 seconds
        when one is available; otherwise ``2 ** (attempt - 1)`` seconds,
        capped at 30.
    """
    server_hint = _parse_retry_after_seconds(retry_after)
    if server_hint is None:
        # No usable hint from the server: exponential backoff, capped.
        return min(2 ** (attempt - 1), 30.0)
    return min(max(server_hint, 0.0), 60.0)
92
+
93
+
94
def _fetch_bytes(
    url: str,
    *,
    timeout: float = 20.0,
    ssl_context: ssl.SSLContext | None = None,
    source_name: str = "remote",
) -> bytes:
    """Download ``url`` and return the raw response body, retrying on failure.

    Up to ``MAX_FETCH_RETRIES`` attempts are made in total. An attempt is
    retried when it fails with an HTTP status in ``RETRYABLE_HTTP_CODES`` or
    with an error accepted by ``_retryable_network_error``; the wait between
    attempts comes from ``_retry_delay``.

    Args:
        url: Address to fetch.
        timeout: Timeout in seconds passed to ``urlopen``.
        ssl_context: Optional TLS context passed to ``urlopen``.
        source_name: Label used in retry log lines and error messages.

    Returns:
        The response body as bytes.

    Raises:
        HTTPError: For a non-retryable status, or once attempts run out.
        Exception: Any other error that is non-retryable or outlives the
            attempt budget is re-raised unchanged.
        RuntimeError: Only when ``MAX_FETCH_RETRIES`` allows no attempt at all.
    """
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})

    for attempt in range(1, MAX_FETCH_RETRIES + 1):
        attempts_left = attempt < MAX_FETCH_RETRIES
        try:
            with urllib.request.urlopen(req, timeout=timeout, context=ssl_context) as response:
                return response.read()
        except HTTPError as exc:
            if not (attempts_left and exc.code in RETRYABLE_HTTP_CODES):
                raise
            headers = exc.headers
            retry_after = headers.get("Retry-After") if headers is not None else None
            delay = _retry_delay(attempt, retry_after)
            print(
                f"[retry] {source_name} HTTP {exc.code} from {url}; "
                f"retrying in {delay:.1f}s ({attempt}/{MAX_FETCH_RETRIES})"
            )
            # HTTPError wraps the error response; release it now so the
            # connection is not held open while we sleep.
            if exc.fp is not None:
                exc.close()
            time.sleep(delay)
            continue
        except Exception as exc:  # noqa: BLE001
            if not (attempts_left and _retryable_network_error(exc)):
                raise
            delay = _retry_delay(attempt)
            print(
                f"[retry] {source_name} network error from {url}: {exc}; "
                f"retrying in {delay:.1f}s ({attempt}/{MAX_FETCH_RETRIES})"
            )
            time.sleep(delay)
            continue

    # The final attempt always returns or re-raises, so this line is only
    # reached when the loop body never ran (MAX_FETCH_RETRIES < 1).
    raise RuntimeError(f"{source_name} fetch failed unexpectedly for {url}")
134
+
135
+
136
def _fetch_json(
    url: str,
    *,
    timeout: float = 20.0,
    ssl_context: ssl.SSLContext | None = None,
    source_name: str = "remote",
) -> Any:
    """Fetch ``url`` via ``_fetch_bytes`` and parse the body as JSON.

    The body is decoded as strict UTF-8 before parsing. All keyword
    arguments are forwarded unchanged to ``_fetch_bytes``.
    """
    raw = _fetch_bytes(
        url,
        timeout=timeout,
        ssl_context=ssl_context,
        source_name=source_name,
    )
    text = raw.decode("utf-8")
    return json.loads(text)
150
+
151
+
152
def _fetch_text(
    url: str,
    *,
    timeout: float = 20.0,
    ssl_context: ssl.SSLContext | None = None,
    source_name: str = "remote",
) -> str:
    """Fetch ``url`` via ``_fetch_bytes`` and return the body as text.

    The body is decoded as UTF-8; undecodable bytes are replaced rather
    than raising. All keyword arguments are forwarded unchanged to
    ``_fetch_bytes``.
    """
    raw = _fetch_bytes(
        url,
        timeout=timeout,
        ssl_context=ssl_context,
        source_name=source_name,
    )
    return raw.decode("utf-8", errors="replace")
166
 
167
 
168
  def _candidate(
 
203
 
204
  query = urllib.parse.urlencode({"search": "pashto", "limit": str(limit)})
205
  url = f"https://huggingface.co/api/{kind}?{query}"
206
+ payload = _fetch_json(url, source_name=f"huggingface-{kind}")
207
 
208
  category = "dataset" if kind == "datasets" else "model"
209
  out: list[dict[str, Any]] = []
 
233
  def fetch_huggingface_spaces(limit: int) -> list[dict[str, Any]]:
234
  query = urllib.parse.urlencode({"search": "pashto", "limit": str(limit)})
235
  url = f"https://huggingface.co/api/spaces?{query}"
236
+ payload = _fetch_json(url, source_name="huggingface-spaces")
237
 
238
  out: list[dict[str, Any]] = []
239
  for item in payload:
 
264
  # Public Kaggle dataset listing endpoint (no auth needed for list responses).
265
  query = urllib.parse.urlencode({"search": "pashto", "page": "1"})
266
  url = f"https://www.kaggle.com/api/v1/datasets/list?{query}"
267
+ payload = _fetch_json(url, source_name="kaggle-datasets")
268
 
269
  out: list[dict[str, Any]] = []
270
  for item in payload:
 
313
  {"q": query_text, "sort": "stars", "order": "desc", "per_page": str(limit)}
314
  )
315
  url = f"https://api.github.com/search/repositories?{query}"
316
+ payload = _fetch_json(
317
+ url,
318
+ timeout=30.0,
319
+ source_name="github-repositories",
320
+ )
321
  for item in payload.get("items", []):
322
  full_name = item.get("full_name")
323
  html_url = item.get("html_url")
 
370
  query = urllib.parse.urlencode(
371
  {"search_query": "all:pashto", "start": "0", "max_results": str(limit)}
372
  )
373
+ url = f"https://export.arxiv.org/api/query?{query}"
374
+ try:
375
+ xml_text = _fetch_text(url, timeout=30.0, source_name="arxiv")
376
+ except Exception as exc: # noqa: BLE001
377
+ if not _is_ssl_cert_error(exc):
378
+ raise
379
+ # arXiv occasionally fails cert chain validation in some runner images.
380
+ insecure_context = ssl._create_unverified_context()
381
+ print("[warn] arxiv SSL verification failed; retrying with unverified TLS context")
382
+ xml_text = _fetch_text(
383
+ url,
384
+ timeout=30.0,
385
+ ssl_context=insecure_context,
386
+ source_name="arxiv",
387
+ )
388
  root = ET.fromstring(xml_text)
389
  ns = {"atom": "http://www.w3.org/2005/Atom"}
390
 
 
420
  {"query": "pashto", "limit": str(limit), "fields": fields}
421
  )
422
  url = f"https://api.semanticscholar.org/graph/v1/paper/search?{query}"
423
+ payload = _fetch_json(
424
+ url,
425
+ timeout=30.0,
426
+ source_name="semantic-scholar",
427
+ )
428
 
429
  out: list[dict[str, Any]] = []
430
  for item in payload.get("data", []):