diff --git "a/data/Indo-European.json" "b/data/Indo-European.json" --- "a/data/Indo-European.json" +++ "b/data/Indo-European.json" @@ -134,111 +134,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": true - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -256,111 +151,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": true - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -406,134 +196,29 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": true - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Albanian, Arb\u00ebresh\u00eb", - "iso_1_code": "sq", - "iso_3_code": "aae", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"sq\")", - "original_lang_name": "albanian", - "original_lang_code": "sqi", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - } - }, - "children": [], - "node_i": "3924", - "scripts": [], - "own_tokenizer": true + } + }, + "children": [ + { + "name": "Albanian, Arb\u00ebresh\u00eb", + "iso_1_code": "sq", + "iso_3_code": "aae", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"sq\")", + "original_lang_name": "albanian", + "original_lang_code": "sqi", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [], + "node_i": "3924", + "scripts": [], + "own_tokenizer": true }, { "name": "Albanian, Arvanitika", @@ -603,111 +288,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Latn": { - "full_object": "SpaCyTokenizer(\"en\")", - "original_lang_name": "english", - "original_lang_code": "eng", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -786,101 +366,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -898,111 +383,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -1020,111 +400,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -1249,111 +524,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -1424,101 +594,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -1536,169 +611,64 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Latn": { - "full_object": "SpaCyTokenizer(\"pl\")", - "original_lang_name": "polish", - "original_lang_code": "pol", + } + }, + "children": [ + { + "name": "Belarusian", + "iso_1_code": "be", + "iso_3_code": "bel", + "tokenizers": { + "Cyrl": { + "full_object": "StanzaTokenizer(\"be\")", + "original_lang_name": "belarusian", + "original_lang_code": "bel", + "scripts": [ + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "3943", "scripts": [ - "Latn" + "Cyrl" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "own_tokenizer": true }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", + { + "name": "Ruthenian", + "iso_1_code": null, + "iso_3_code": "rsk", + "tokenizers": {}, + "children": [], + "node_i": "3944", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Rusyn", + "iso_1_code": null, + "iso_3_code": "rue", + "tokenizers": { + "Cyrl": { + "full_object": "SpaCyTokenizer(\"ru\")", + "original_lang_name": "russian", + "original_lang_code": "rus", + "scripts": [ + "Cyrl" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "3945", "scripts": [ - "Armn" + "Cyrl" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Belarusian", - "iso_1_code": "be", - "iso_3_code": "bel", - "tokenizers": { - "Cyrl": { - "full_object": "StanzaTokenizer(\"be\")", - "original_lang_name": "belarusian", - "original_lang_code": "bel", - "scripts": [ - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "3943", - "scripts": [ - "Cyrl" - ], - "own_tokenizer": true - }, - { - "name": "Ruthenian", - "iso_1_code": null, - "iso_3_code": "rsk", - "tokenizers": {}, - "children": [], - "node_i": "3944", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Rusyn", - "iso_1_code": null, - "iso_3_code": "rue", - "tokenizers": { - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "3945", - "scripts": [ - "Cyrl" - ], - "own_tokenizer": false + "own_tokenizer": false }, { "name": "Russian", @@ -1775,101 +745,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": true - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -1887,111 +762,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Latn": { - "full_object": "SpaCyTokenizer(\"sr\")", - "original_lang_name": "serbocroatian", - "original_lang_code": "hbs", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -2094,101 +864,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -2377,245 +1052,113 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" + } + }, + "children": [ + { + "name": "Czech-Slovak", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"cs\")", + "original_lang_name": "czech", + "original_lang_code": "ces", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Czech", + "iso_1_code": "cs", + "iso_3_code": "ces", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"cs\")", + "original_lang_name": "czech", + "original_lang_code": "ces", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "3963", + "scripts": [ + "Latn" + ], + "own_tokenizer": true + }, + { + "name": "Knaanic", + "iso_1_code": null, + "iso_3_code": "czk", + "tokenizers": {}, + "children": [], + "node_i": "3964", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Slovak", + "iso_1_code": "sk", + "iso_3_code": "slk", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"sk\")", + "original_lang_name": "slovak", + "original_lang_code": "slk", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "3965", + "scripts": [ + "Latn" + ], + "own_tokenizer": true + } ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "node_i": "3962", + "scripts": [], + "own_tokenizer": false }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ { - "name": "Czech-Slovak", + "name": "Lechitic", "iso_1_code": null, "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"cs\")", - "original_lang_name": "czech", - "original_lang_code": "ces", + "full_object": "SpaCyTokenizer(\"pl\")", + "original_lang_name": "polish", + "original_lang_code": "pol", "scripts": [ "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ { - "name": "Czech", - "iso_1_code": "cs", - "iso_3_code": "ces", + "name": "Kashubian", + "iso_1_code": null, + "iso_3_code": "csb", "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"cs\")", - "original_lang_name": "czech", - "original_lang_code": "ces", + "full_object": "SpaCyTokenizer(\"pl\")", + "original_lang_name": "polish", + "original_lang_code": "pol", "scripts": [ "Latn" ], @@ -2624,31 +1167,54 @@ } }, "children": [], - "node_i": "3963", + "node_i": "3967", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Polish", + "iso_1_code": "pl", + "iso_3_code": "pol", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"pl\")", + "original_lang_name": "polish", + "original_lang_code": "pol", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "3968", "scripts": [ "Latn" ], "own_tokenizer": true }, { - "name": "Knaanic", + "name": "Polabian", "iso_1_code": null, - "iso_3_code": "czk", + "iso_3_code": "pox", "tokenizers": {}, "children": [], - "node_i": "3964", + "node_i": "3969", "scripts": [], "own_tokenizer": false }, { - "name": "Slovak", - "iso_1_code": "sk", - "iso_3_code": "slk", + "name": "Silesian", + "iso_1_code": null, + "iso_3_code": "szl", "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"sk\")", - "original_lang_name": "slovak", - "original_lang_code": "slk", + "full_object": "SpaCyTokenizer(\"pl\")", + "original_lang_name": "polish", + "original_lang_code": "pol", "scripts": [ "Latn" ], @@ -2657,171 +1223,43 @@ } }, "children": [], - "node_i": "3965", + "node_i": "3970", "scripts": [ "Latn" ], - "own_tokenizer": true + "own_tokenizer": false } ], - "node_i": "3962", + "node_i": "3966", "scripts": [], "own_tokenizer": false }, { - "name": "Lechitic", + "name": "Sorbian", "iso_1_code": null, "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"pl\")", - "original_lang_name": "polish", - "original_lang_code": "pol", + "full_object": "SpaCyTokenizer(\"hsb\")", + "original_lang_name": "upper_sorbian", + "original_lang_code": "hsb", "scripts": [ "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ { - "name": "Kashubian", + "name": "Sorbian, Lower", "iso_1_code": null, - "iso_3_code": "csb", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"pl\")", - "original_lang_name": "polish", - "original_lang_code": "pol", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "3967", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Polish", - "iso_1_code": "pl", - "iso_3_code": "pol", + "iso_3_code": "dsb", "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"pl\")", - "original_lang_name": "polish", - "original_lang_code": "pol", + "full_object": "SpaCyTokenizer(\"dsb\")", + "original_lang_name": "lower_sorbian", + "original_lang_code": "dsb", "scripts": [ "Latn" ], @@ -2830,31 +1268,21 @@ } }, "children": [], - "node_i": "3968", + "node_i": "3972", "scripts": [ "Latn" ], "own_tokenizer": true }, { - "name": "Polabian", - "iso_1_code": null, - "iso_3_code": "pox", - "tokenizers": {}, - "children": [], - "node_i": "3969", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Silesian", + "name": "Sorbian, Upper", "iso_1_code": null, - "iso_3_code": "szl", + "iso_3_code": "hsb", "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"pl\")", - "original_lang_name": "polish", - "original_lang_code": "pol", + "full_object": "SpaCyTokenizer(\"hsb\")", + "original_lang_name": "upper_sorbian", + "original_lang_code": "hsb", "scripts": [ "Latn" ], @@ -2863,761 +1291,490 @@ } }, "children": [], - "node_i": "3970", + "node_i": "3973", "scripts": [ "Latn" ], - "own_tokenizer": false + "own_tokenizer": true } ], - "node_i": "3966", + "node_i": "3971", "scripts": [], "own_tokenizer": false - }, + } + ], + "node_i": "3961", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "3941", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "3930", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Celtic", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"cy\")", + "original_lang_name": "welsh", + "original_lang_code": "cym", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Insular", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"cy\")", + "original_lang_name": "welsh", + "original_lang_code": "cym", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Brythonic", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"cy\")", + "original_lang_name": "welsh", + "original_lang_code": "cym", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } + }, + "children": [ { - "name": "Sorbian", - "iso_1_code": null, - "iso_3_code": null, + "name": "Breton", + "iso_1_code": "br", + "iso_3_code": "bre", "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"hsb\")", - "original_lang_name": "upper_sorbian", - "original_lang_code": "hsb", + "full_object": "StanzaTokenizer(\"cy\")", + "original_lang_name": "welsh", + "original_lang_code": "cym", "scripts": [ "Latn" ], - "class_name": "SpaCyTokenizer", + "class_name": "StanzaTokenizer", "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", + } + }, + "children": [], + "node_i": "3977", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Cornish", + "iso_1_code": "kw", + "iso_3_code": "cor", + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"cy\")", + "original_lang_name": "welsh", + "original_lang_code": "cym", "scripts": [ - "Latn", - "Gujr" + "Latn" ], - "class_name": "SpaCyTokenizer", + "class_name": "StanzaTokenizer", "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", + } + }, + "children": [], + "node_i": "3978", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Welsh", + "iso_1_code": "cy", + "iso_3_code": "cym", + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"cy\")", + "original_lang_name": "welsh", + "original_lang_code": "cym", "scripts": [ - "Latn", - "Guru" + "Latn" ], - "class_name": "IndicNLPTokenizer", + "class_name": "StanzaTokenizer", "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", + } + }, + "children": [], + "node_i": "3979", + "scripts": [ + "Latn" + ], + "own_tokenizer": true + } + ], + "node_i": "3976", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Goidelic", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ga\")", + "original_lang_name": "irish", + "original_lang_code": "gle", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Scottish Gaelic", + "iso_1_code": "gd", + "iso_3_code": "gla", + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"gd\")", + "original_lang_name": "gaelic", + "original_lang_code": "gla", "scripts": [ - "Latn", - "Beng" + "Latn" ], - "class_name": "SpaCyTokenizer", + "class_name": "StanzaTokenizer", "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", + } + }, + "children": [], + "node_i": "3981", + "scripts": [ + "Latn" + ], + "own_tokenizer": true + }, + { + "name": "Irish", + "iso_1_code": "ga", + "iso_3_code": "gle", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ga\")", + "original_lang_name": "irish", + "original_lang_code": "gle", "scripts": [ - "Sinh" + "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false } }, - "children": [ - { - "name": "Sorbian, Lower", - "iso_1_code": null, - "iso_3_code": "dsb", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"dsb\")", - "original_lang_name": "lower_sorbian", - "original_lang_code": "dsb", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "3972", - "scripts": [ - "Latn" - ], - "own_tokenizer": true - }, - { - "name": "Sorbian, Upper", - "iso_1_code": null, - "iso_3_code": "hsb", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"hsb\")", - "original_lang_name": "upper_sorbian", - "original_lang_code": "hsb", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "3973", + "children": [], + "node_i": "3982", + "scripts": [ + "Latn" + ], + "own_tokenizer": true + }, + { + "name": "Manx", + "iso_1_code": "gv", + "iso_3_code": "glv", + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"gv\")", + "original_lang_name": "manx", + "original_lang_code": "glv", "scripts": [ "Latn" ], - "own_tokenizer": true + "class_name": "StanzaTokenizer", + "macrolanguage": false } + }, + "children": [], + "node_i": "3983", + "scripts": [ + "Latn" ], - "node_i": "3971", - "scripts": [], - "own_tokenizer": false + "own_tokenizer": true } ], - "node_i": "3961", + "node_i": "3980", "scripts": [], "own_tokenizer": false } ], - "node_i": "3941", + "node_i": "3975", "scripts": [], "own_tokenizer": false } ], - "node_i": "3930", + "node_i": "3974", "scripts": [], "own_tokenizer": false }, { - "name": "Celtic", + "name": "Germanic", "iso_1_code": null, "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "StanzaTokenizer(\"cy\")", - "original_lang_name": "welsh", - "original_lang_code": "cym", + "full_object": "SpaCyTokenizer(\"en\")", + "original_lang_name": "english", + "original_lang_code": "eng", "scripts": [ "Latn" ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], "class_name": "SpaCyTokenizer", "macrolanguage": false } }, "children": [ { - "name": "Insular", + "name": "North", "iso_1_code": null, "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "StanzaTokenizer(\"cy\")", - "original_lang_name": "welsh", - "original_lang_code": "cym", + "full_object": "SpaCyTokenizer(\"sv\")", + "original_lang_name": "swedish", + "original_lang_code": "swe", "scripts": [ "Latn" ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], "class_name": "SpaCyTokenizer", "macrolanguage": false } }, "children": [ { - "name": "Brythonic", + "name": "East Scandinavian", "iso_1_code": null, "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "StanzaTokenizer(\"cy\")", - "original_lang_name": "welsh", - "original_lang_code": "cym", + "full_object": "SpaCyTokenizer(\"sv\")", + "original_lang_name": "swedish", + "original_lang_code": "swe", "scripts": [ "Latn" ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], "class_name": "SpaCyTokenizer", "macrolanguage": false } }, "children": [ { - "name": "Breton", - "iso_1_code": "br", - "iso_3_code": "bre", - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"cy\")", - "original_lang_name": "welsh", - "original_lang_code": "cym", - "scripts": [ - "Latn" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - } - }, + "name": "\u00d6vdalian", + "iso_1_code": null, + "iso_3_code": "ovd", + "tokenizers": {}, "children": [], - "node_i": "3977", - "scripts": [ - "Latn" - ], + "node_i": "3987", + "scripts": [], "own_tokenizer": false }, { - "name": "Cornish", - "iso_1_code": "kw", - "iso_3_code": "cor", + "name": "Danish-Swedish", + "iso_1_code": null, + "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "StanzaTokenizer(\"cy\")", - "original_lang_name": "welsh", - "original_lang_code": "cym", + "full_object": "SpaCyTokenizer(\"sv\")", + "original_lang_name": "swedish", + "original_lang_code": "swe", "scripts": [ "Latn" ], - "class_name": "StanzaTokenizer", + "class_name": "SpaCyTokenizer", "macrolanguage": false } }, - "children": [], - "node_i": "3978", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Welsh", - "iso_1_code": "cy", - "iso_3_code": "cym", - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"cy\")", - "original_lang_name": "welsh", - "original_lang_code": "cym", - "scripts": [ - "Latn" + "children": [ + { + "name": "Danish-Bokmal", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ + { + "name": "Norwegian", + "iso_1_code": "no", + "iso_3_code": "nor", + "tokenizers": {}, + "children": [], + "node_i": "3990", + "scripts": [], + "own_tokenizer": true + } ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "3979", - "scripts": [ - "Latn" - ], - "own_tokenizer": true - } - ], - "node_i": "3976", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Goidelic", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"ga\")", - "original_lang_name": "irish", - "original_lang_code": "gle", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" + "node_i": "3989", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Danish-Riksmal", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"da\")", + "original_lang_name": "danish", + "original_lang_code": "dan", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Danish", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"da\")", + "original_lang_name": "danish", + "original_lang_code": "dan", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Danish", + "iso_1_code": "da", + "iso_3_code": "dan", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"da\")", + "original_lang_name": "danish", + "original_lang_code": "dan", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "3993", + "scripts": [ + "Latn" + ], + "own_tokenizer": true + } + ], + "node_i": "3992", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "3991", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Swedish", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"sv\")", + "original_lang_name": "swedish", + "original_lang_code": "swe", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Swedish", + "iso_1_code": "sv", + "iso_3_code": "swe", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"sv\")", + "original_lang_name": "swedish", + "original_lang_code": "swe", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "3995", + "scripts": [ + "Latn" + ], + "own_tokenizer": true + } + ], + "node_i": "3994", + "scripts": [], + "own_tokenizer": false + } ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", + "node_i": "3988", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "3986", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "West Scandinavian", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"is\")", + "original_lang_name": "icelandic", + "original_lang_code": "isl", "scripts": [ - "Sinh" + "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false @@ -3625,37 +1782,37 @@ }, "children": [ { - "name": "Scottish Gaelic", - "iso_1_code": "gd", - "iso_3_code": "gla", + "name": "Faroese", + "iso_1_code": "fo", + "iso_3_code": "fao", "tokenizers": { "Latn": { - "full_object": "StanzaTokenizer(\"gd\")", - "original_lang_name": "gaelic", - "original_lang_code": "gla", + "full_object": "SpaCyTokenizer(\"fo\")", + "original_lang_name": "faroese", + "original_lang_code": "fao", "scripts": [ "Latn" ], - "class_name": "StanzaTokenizer", + "class_name": "SpaCyTokenizer", "macrolanguage": false } }, "children": [], - "node_i": "3981", + "node_i": "3997", "scripts": [ "Latn" ], "own_tokenizer": true }, { - "name": "Irish", - "iso_1_code": "ga", - "iso_3_code": "gle", + "name": "Icelandic", + "iso_1_code": "is", + "iso_3_code": "isl", "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"ga\")", - "original_lang_name": "irish", - "original_lang_code": "gle", + "full_object": "SpaCyTokenizer(\"is\")", + "original_lang_name": "icelandic", + "original_lang_code": "isl", "scripts": [ "Latn" ], @@ -3664,410 +1821,229 @@ } }, "children": [], - "node_i": "3982", + "node_i": "3998", "scripts": [ "Latn" ], "own_tokenizer": true }, { - "name": "Manx", - "iso_1_code": "gv", - "iso_3_code": "glv", - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"gv\")", - "original_lang_name": "manx", - "original_lang_code": "glv", - "scripts": [ - "Latn" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - } - }, + "name": "Norn", + "iso_1_code": null, + "iso_3_code": "nrn", + "tokenizers": {}, "children": [], - "node_i": "3983", - "scripts": [ - "Latn" - ], - "own_tokenizer": true + "node_i": "3999", + "scripts": [], + "own_tokenizer": false } ], - "node_i": "3980", + "node_i": "3996", "scripts": [], "own_tokenizer": false } ], - "node_i": "3975", + "node_i": "3985", "scripts": [], "own_tokenizer": false - } - ], - "node_i": "3974", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Germanic", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"en\")", - "original_lang_name": "english", - "original_lang_code": "eng", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ { - "name": "North", + "name": "West", "iso_1_code": null, "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"sv\")", - "original_lang_name": "swedish", - "original_lang_code": "swe", + "full_object": "SpaCyTokenizer(\"en\")", + "original_lang_name": "english", + "original_lang_code": "eng", "scripts": [ "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ { - "name": "East Scandinavian", + "name": "English", "iso_1_code": null, "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"sv\")", - "original_lang_name": "swedish", - "original_lang_code": "swe", + "full_object": "SpaCyTokenizer(\"en\")", + "original_lang_name": "english", + "original_lang_code": "eng", "scripts": [ "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", + } + }, + "children": [ + { + "name": "English", + "iso_1_code": "en", + "iso_3_code": "eng", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"en\")", + "original_lang_name": "english", + "original_lang_code": "eng", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4002", "scripts": [ - "Cyrl" + "Latn" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "own_tokenizer": true }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", + { + "name": "Scots", + "iso_1_code": null, + "iso_3_code": "sco", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"en\")", + "original_lang_name": "english", + "original_lang_code": "eng", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4003", "scripts": [ - "Latn", - "Deva" + "Latn" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "own_tokenizer": false }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", + { + "name": "Yola", + "iso_1_code": null, + "iso_3_code": "yol", + "tokenizers": {}, + "children": [], + "node_i": "4004", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4001", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Frisian", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"en\")", + "original_lang_name": "english", + "original_lang_code": "eng", "scripts": [ - "Latn", - "Gujr" + "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", + } + }, + "children": [ + { + "name": "Frisian, Northern", + "iso_1_code": null, + "iso_3_code": "frr", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"en\")", + "original_lang_name": "english", + "original_lang_code": "eng", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4006", "scripts": [ - "Latn", - "Beng" + "Latn" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "own_tokenizer": false }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", + { + "name": "Frisian", + "iso_1_code": "fy", + "iso_3_code": "fry", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"en\")", + "original_lang_name": "english", + "original_lang_code": "eng", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4007", "scripts": [ - "Latn", - "Orya" + "Latn" ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true + "own_tokenizer": false }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", + { + "name": "Saterfriesisch", + "iso_1_code": null, + "iso_3_code": "stq", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"en\")", + "original_lang_name": "english", + "original_lang_code": "eng", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4008", "scripts": [ - "Arab" + "Latn" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", + "own_tokenizer": false + } + ], + "node_i": "4005", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "High German", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"de\")", + "original_lang_name": "german", + "original_lang_code": "deu", "scripts": [ - "Sinh" + "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false @@ -4075,527 +2051,163 @@ }, "children": [ { - "name": "\u00d6vdalian", - "iso_1_code": null, - "iso_3_code": "ovd", - "tokenizers": {}, - "children": [], - "node_i": "3987", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Danish-Swedish", + "name": "German", "iso_1_code": null, "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"sv\")", - "original_lang_name": "swedish", - "original_lang_code": "swe", + "full_object": "SpaCyTokenizer(\"de\")", + "original_lang_name": "german", + "original_lang_code": "deu", "scripts": [ "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", + } + }, + "children": [ + { + "name": "Hunsrik", + "iso_1_code": null, + "iso_3_code": "hrx", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"de\")", + "original_lang_name": "german", + "original_lang_code": "deu", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4011", "scripts": [ - "Grek" + "Latn" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "own_tokenizer": false }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ { - "name": "Danish-Bokmal", + "name": "Middle German", "iso_1_code": null, "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"sv\")", - "original_lang_name": "swedish", - "original_lang_code": "swe", + "full_object": "SpaCyTokenizer(\"de\")", + "original_lang_name": "german", + "original_lang_code": "deu", "scripts": [ "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ { - "name": "Norwegian", - "iso_1_code": "no", - "iso_3_code": "nor", - "tokenizers": {}, - "children": [], - "node_i": "3990", + "name": "East Middle German", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"de\")", + "original_lang_name": "german", + "original_lang_code": "deu", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "German, Standard", + "iso_1_code": "de", + "iso_3_code": "deu", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"de\")", + "original_lang_name": "german", + "original_lang_code": "deu", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4014", + "scripts": [ + "Latn" + ], + "own_tokenizer": true + }, + { + "name": "Silesian, Lower", + "iso_1_code": null, + "iso_3_code": "sli", + "tokenizers": {}, + "children": [], + "node_i": "4015", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Saxon, Upper", + "iso_1_code": null, + "iso_3_code": "sxu", + "tokenizers": {}, + "children": [], + "node_i": "4016", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Wymysorys", + "iso_1_code": null, + "iso_3_code": "wym", + "tokenizers": {}, + "children": [], + "node_i": "4017", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4013", "scripts": [], - "own_tokenizer": true - } - ], - "node_i": "3989", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Danish-Riksmal", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"da\")", - "original_lang_name": "danish", - "original_lang_code": "dan", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true + "own_tokenizer": false }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ { - "name": "Danish", + "name": "West Middle German", "iso_1_code": null, "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"da\")", - "original_lang_name": "danish", - "original_lang_code": "dan", + "full_object": "SpaCyTokenizer(\"lb\")", + "original_lang_name": "luxembourgish", + "original_lang_code": "ltz", "scripts": [ "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ { - "name": "Danish", - "iso_1_code": "da", - "iso_3_code": "dan", + "name": "Ripuarian", + "iso_1_code": null, + "iso_3_code": "ksh", "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"da\")", - "original_lang_name": "danish", - "original_lang_code": "dan", + "full_object": "SpaCyTokenizer(\"lb\")", + "original_lang_name": "luxembourgish", + "original_lang_code": "ltz", "scripts": [ "Latn" ], @@ -4604,138 +2216,124 @@ } }, "children": [], - "node_i": "3993", + "node_i": "4019", "scripts": [ "Latn" ], - "own_tokenizer": true - } - ], - "node_i": "3992", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "3991", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Swedish", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"sv\")", - "original_lang_name": "swedish", - "original_lang_code": "swe", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" + "own_tokenizer": false + }, + { + "name": "German, Pennsylvania", + "iso_1_code": null, + "iso_3_code": "pdc", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"lb\")", + "original_lang_name": "luxembourgish", + "original_lang_code": "ltz", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4020", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Palatinate Franconian", + "iso_1_code": null, + "iso_3_code": "pfl", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"lb\")", + "original_lang_name": "luxembourgish", + "original_lang_code": "ltz", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4021", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Moselle Franconian", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"lb\")", + "original_lang_name": "luxembourgish", + "original_lang_code": "ltz", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Luxembourgish", + "iso_1_code": "lb", + "iso_3_code": "ltz", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"lb\")", + "original_lang_name": "luxembourgish", + "original_lang_code": "ltz", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4023", + "scripts": [ + "Latn" + ], + "own_tokenizer": true + } + ], + "node_i": "4022", + "scripts": [], + "own_tokenizer": false + } ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", + "node_i": "4018", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4012", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Upper German", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"de\")", + "original_lang_name": "german", + "original_lang_code": "deu", "scripts": [ - "Sinh" + "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false @@ -4743,14 +2341,24 @@ }, "children": [ { - "name": "Swedish", - "iso_1_code": "sv", - "iso_3_code": "swe", + "name": "Eastern Franconian", + "iso_1_code": null, + "iso_3_code": "vmf", + "tokenizers": {}, + "children": [], + "node_i": "4025", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Alemannic", + "iso_1_code": null, + "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"sv\")", - "original_lang_name": "swedish", - "original_lang_code": "swe", + "full_object": "SpaCyTokenizer(\"de\")", + "original_lang_name": "german", + "original_lang_code": "deu", "scripts": [ "Latn" ], @@ -4758,144 +2366,212 @@ "macrolanguage": false } }, - "children": [], - "node_i": "3995", - "scripts": [ - "Latn" - ], - "own_tokenizer": true - } - ], - "node_i": "3994", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "3988", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "3986", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "West Scandinavian", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"is\")", - "original_lang_name": "icelandic", - "original_lang_code": "isl", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" + "children": [ + { + "name": "German, Colonia Tovar", + "iso_1_code": null, + "iso_3_code": "gct", + "tokenizers": {}, + "children": [], + "node_i": "4027", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "German, Swiss", + "iso_1_code": null, + "iso_3_code": "gsw", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"de\")", + "original_lang_name": "german", + "original_lang_code": "deu", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4028", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Swabian", + "iso_1_code": null, + "iso_3_code": "swg", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"de\")", + "original_lang_name": "german", + "original_lang_code": "deu", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4029", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Walser", + "iso_1_code": null, + "iso_3_code": "wae", + "tokenizers": {}, + "children": [], + "node_i": "4030", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4026", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Bavarian-Austrian", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"de\")", + "original_lang_name": "german", + "original_lang_code": "deu", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Bavarian", + "iso_1_code": null, + "iso_3_code": "bar", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"de\")", + "original_lang_name": "german", + "original_lang_code": "deu", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4032", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Cimbrian", + "iso_1_code": null, + "iso_3_code": "cim", + "tokenizers": {}, + "children": [], + "node_i": "4033", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Hutterisch", + "iso_1_code": null, + "iso_3_code": "geh", + "tokenizers": {}, + "children": [], + "node_i": "4034", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "M\u00f2cheno", + "iso_1_code": null, + "iso_3_code": "mhn", + "tokenizers": {}, + "children": [], + "node_i": "4035", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4031", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4024", + "scripts": [], + "own_tokenizer": false + } ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "node_i": "4010", + "scripts": [], + "own_tokenizer": false }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" + { + "name": "Yiddish", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ + { + "name": "Yiddish, Eastern", + "iso_1_code": "yi", + "iso_3_code": "ydd", + "tokenizers": {}, + "children": [], + "node_i": "4037", + "scripts": [ + "Hebr" + ], + "own_tokenizer": false + }, + { + "name": "Yiddish, Western", + "iso_1_code": "yi", + "iso_3_code": "yih", + "tokenizers": {}, + "children": [], + "node_i": "4038", + "scripts": [], + "own_tokenizer": false + } ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", + "node_i": "4036", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4009", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Low Saxon-Low Franconian", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"nl\")", + "original_lang_name": "dutch", + "original_lang_code": "nld", "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" + "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false @@ -4903,14 +2579,14 @@ }, "children": [ { - "name": "Faroese", - "iso_1_code": "fo", - "iso_3_code": "fao", + "name": "Low Franconian", + "iso_1_code": null, + "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"fo\")", - "original_lang_name": "faroese", - "original_lang_code": "fao", + "full_object": "SpaCyTokenizer(\"nl\")", + "original_lang_name": "dutch", + "original_lang_code": "nld", "scripts": [ "Latn" ], @@ -4918,22 +2594,136 @@ "macrolanguage": false } }, - "children": [], - "node_i": "3997", - "scripts": [ - "Latn" + "children": [ + { + "name": "Afrikaans", + "iso_1_code": "af", + "iso_3_code": "afr", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"af\")", + "original_lang_name": "afrikaans", + "original_lang_code": "afr", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4041", + "scripts": [ + "Latn" + ], + "own_tokenizer": true + }, + { + "name": "Limburgish", + "iso_1_code": "li", + "iso_3_code": "lim", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"nl\")", + "original_lang_name": "dutch", + "original_lang_code": "nld", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4042", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Dutch", + "iso_1_code": "nl", + "iso_3_code": "nld", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"nl\")", + "original_lang_name": "dutch", + "original_lang_code": "nld", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4043", + "scripts": [ + "Latn" + ], + "own_tokenizer": true + }, + { + "name": "West Flemish", + "iso_1_code": null, + "iso_3_code": "vls", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"nl\")", + "original_lang_name": "dutch", + "original_lang_code": "nld", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4044", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Zeeuws", + "iso_1_code": null, + "iso_3_code": "zea", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"nl\")", + "original_lang_name": "dutch", + "original_lang_code": "nld", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4045", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + } ], - "own_tokenizer": true + "node_i": "4040", + "scripts": [], + "own_tokenizer": false }, { - "name": "Icelandic", - "iso_1_code": "is", - "iso_3_code": "isl", + "name": "Low Saxon", + "iso_1_code": null, + "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"is\")", - "original_lang_name": "icelandic", - "original_lang_code": "isl", + "full_object": "SpaCyTokenizer(\"nl\")", + "original_lang_name": "dutch", + "original_lang_code": "nld", "scripts": [ "Latn" ], @@ -4941,68 +2731,198 @@ "macrolanguage": false } }, - "children": [], - "node_i": "3998", - "scripts": [ - "Latn" - ], - "own_tokenizer": true - }, - { - "name": "Norn", - "iso_1_code": null, - "iso_3_code": "nrn", - "tokenizers": {}, - "children": [], - "node_i": "3999", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "3996", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "3985", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "West", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"en\")", - "original_lang_name": "english", - "original_lang_code": "eng", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, + "children": [ + { + "name": "Achterhoeks", + "iso_1_code": null, + "iso_3_code": "act", + "tokenizers": {}, + "children": [], + "node_i": "4047", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Drents", + "iso_1_code": null, + "iso_3_code": "drt", + "tokenizers": {}, + "children": [], + "node_i": "4048", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Saxon, East Frisian Low", + "iso_1_code": null, + "iso_3_code": "frs", + "tokenizers": {}, + "children": [], + "node_i": "4049", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Gronings", + "iso_1_code": null, + "iso_3_code": "gos", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"nl\")", + "original_lang_name": "dutch", + "original_lang_code": "nld", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4050", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Saxon, Low", + "iso_1_code": null, + "iso_3_code": "nds", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"nl\")", + "original_lang_name": "dutch", + "original_lang_code": "nld", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4051", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Plautdietsch", + "iso_1_code": null, + "iso_3_code": "pdt", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"nl\")", + "original_lang_name": "dutch", + "original_lang_code": "nld", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4052", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Sallands", + "iso_1_code": null, + "iso_3_code": "sdz", + "tokenizers": {}, + "children": [], + "node_i": "4053", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Stellingwerfs", + "iso_1_code": null, + "iso_3_code": "stl", + "tokenizers": {}, + "children": [], + "node_i": "4054", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Twents", + "iso_1_code": null, + "iso_3_code": "twd", + "tokenizers": {}, + "children": [], + "node_i": "4055", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Veluws", + "iso_1_code": null, + "iso_3_code": "vel", + "tokenizers": {}, + "children": [], + "node_i": "4056", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Westphalien", + "iso_1_code": null, + "iso_3_code": "wep", + "tokenizers": {}, + "children": [], + "node_i": "4057", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4046", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4039", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4000", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "3984", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Greek", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Grek": { + "full_object": "SpaCyTokenizer(\"el\")", + "original_lang_name": "greek", + "original_lang_code": "ell", + "scripts": [ + "Grek" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Attic", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { "Grek": { "full_object": "SpaCyTokenizer(\"el\")", "original_lang_name": "greek", @@ -5012,22 +2932,274 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false + } + }, + "children": [ + { + "name": "Cappadocian Greek", + "iso_1_code": null, + "iso_3_code": "cpg", + "tokenizers": {}, + "children": [], + "node_i": "4060", + "scripts": [], + "own_tokenizer": false }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", + { + "name": "Greek", + "iso_1_code": "el", + "iso_3_code": "ell", + "tokenizers": { + "Grek": { + "full_object": "SpaCyTokenizer(\"el\")", + "original_lang_name": "greek", + "original_lang_code": "ell", + "scripts": [ + "Grek" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4061", "scripts": [ - "Latn", - "Deva" + "Grek" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "own_tokenizer": true }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", + { + "name": "Greek, Ancient", + "iso_1_code": null, + "iso_3_code": "grc", + "tokenizers": { + "Grek": { + "full_object": "SpaCyTokenizer(\"grc\")", + "original_lang_name": "ancient_greek", + "original_lang_code": "grc", + "scripts": [ + "Grek" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4062", + "scripts": [ + "Grek" + ], + "own_tokenizer": true + }, + { + "name": "Pontic", + "iso_1_code": null, + "iso_3_code": "pnt", + "tokenizers": { + "Grek": { + "full_object": "SpaCyTokenizer(\"el\")", + "original_lang_name": "greek", + "original_lang_code": "ell", + "scripts": [ + "Grek" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4063", + "scripts": [ + "Grek" + ], + "own_tokenizer": false + }, + { + "name": "Yevanic", + "iso_1_code": null, + "iso_3_code": "yej", + "tokenizers": {}, + "children": [], + "node_i": "4064", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4059", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Doric", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ + { + "name": "Tsakonian", + "iso_1_code": null, + "iso_3_code": "tsd", + "tokenizers": {}, + "children": [], + "node_i": "4066", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4065", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4058", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Indo-Iranian", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"hi\")", + "original_lang_name": "hindi", + "original_lang_code": "hin", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Deva": { + "full_object": "SpaCyTokenizer(\"hi\")", + "original_lang_name": "hindi", + "original_lang_code": "hin", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Gujr": { + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", + "scripts": [ + "Latn", + "Gujr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Guru": { + "full_object": "IndicNLPTokenizer(\"pa\")", + "original_lang_name": "punjabi", + "original_lang_code": "pan", + "scripts": [ + "Latn", + "Guru" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": false + }, + "Beng": { + "full_object": "SpaCyTokenizer(\"bn\")", + "original_lang_name": "bengali", + "original_lang_code": "ben", + "scripts": [ + "Latn", + "Beng" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Orya": { + "full_object": "IndicNLPTokenizer(\"or\")", + "original_lang_name": "oriya", + "original_lang_code": "ori", + "scripts": [ + "Latn", + "Orya" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": true + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Sinh": { + "full_object": "SpaCyTokenizer(\"si\")", + "original_lang_name": "sinhala", + "original_lang_code": "sin", + "scripts": [ + "Sinh" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Grek": { + "full_object": "SpaCyTokenizer(\"el\")", + "original_lang_name": "greek", + "original_lang_code": "ell", + "scripts": [ + "Grek" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Indo-Aryan", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"hi\")", + "original_lang_name": "hindi", + "original_lang_code": "hin", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Deva": { + "full_object": "SpaCyTokenizer(\"hi\")", + "original_lang_name": "hindi", + "original_lang_code": "hin", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Gujr": { + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", "scripts": [ "Latn", "Gujr" @@ -5069,14 +3241,15 @@ "macrolanguage": true }, "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", + "full_object": "SpaCyTokenizer(\"ur\")", + "original_lang_name": "urdu", + "original_lang_code": "urd", "scripts": [ + "Latn", "Arab" ], "class_name": "SpaCyTokenizer", - "macrolanguage": true + "macrolanguage": false }, "Sinh": { "full_object": "SpaCyTokenizer(\"si\")", @@ -5087,58 +3260,39 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Grek": { + "full_object": "SpaCyTokenizer(\"el\")", + "original_lang_name": "greek", + "original_lang_code": "ell", + "scripts": [ + "Grek" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false } }, "children": [ { - "name": "English", - "iso_1_code": null, - "iso_3_code": null, + "name": "Sanskrit", + "iso_1_code": "sa", + "iso_3_code": "san", "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"en\")", - "original_lang_name": "english", - "original_lang_code": "eng", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", + "full_object": "SpaCyTokenizer(\"sa\")", + "original_lang_name": "sanskrit", + "original_lang_code": "san", "scripts": [ "Latn", "Deva" @@ -5146,188 +3300,52 @@ "class_name": "SpaCyTokenizer", "macrolanguage": false }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", + "Deva": { + "full_object": "SpaCyTokenizer(\"sa\")", + "original_lang_name": "sanskrit", + "original_lang_code": "san", "scripts": [ "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" + "Deva" ], "class_name": "SpaCyTokenizer", "macrolanguage": false } }, - "children": [ - { - "name": "English", - "iso_1_code": "en", - "iso_3_code": "eng", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"en\")", - "original_lang_name": "english", - "original_lang_code": "eng", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4002", - "scripts": [ - "Latn" - ], - "own_tokenizer": true - }, - { - "name": "Scots", - "iso_1_code": null, - "iso_3_code": "sco", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"en\")", - "original_lang_name": "english", - "original_lang_code": "eng", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4003", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Yola", - "iso_1_code": null, - "iso_3_code": "yol", - "tokenizers": {}, - "children": [], - "node_i": "4004", - "scripts": [], - "own_tokenizer": false - } + "children": [], + "node_i": "4069", + "scripts": [ + "Deva", + "Latn" ], - "node_i": "4001", - "scripts": [], - "own_tokenizer": false + "own_tokenizer": true }, { - "name": "Frisian", + "name": "Intermediate Divisions", "iso_1_code": null, "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"en\")", - "original_lang_name": "english", - "original_lang_code": "eng", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", + "full_object": "SpaCyTokenizer(\"ne\")", + "original_lang_name": "nepali", + "original_lang_code": "nep", "scripts": [ - "Grek" + "Latn", + "Deva" ], "class_name": "SpaCyTokenizer", - "macrolanguage": false + "macrolanguage": true }, "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", + "full_object": "SpaCyTokenizer(\"ne\")", + "original_lang_name": "nepali", + "original_lang_code": "nep", "scripts": [ "Latn", "Deva" ], "class_name": "SpaCyTokenizer", - "macrolanguage": false + "macrolanguage": true }, "Gujr": { "full_object": "SpaCyTokenizer(\"gu\")", @@ -5351,44 +3369,34 @@ "class_name": "IndicNLPTokenizer", "macrolanguage": false }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", + "Arab": { + "full_object": "SpaCyTokenizer(\"ur\")", + "original_lang_name": "urdu", + "original_lang_code": "urd", "scripts": [ "Latn", - "Beng" + "Arab" ], "class_name": "SpaCyTokenizer", "macrolanguage": false }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", "scripts": [ "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" + "Cyrl" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true + "class_name": "StanzaTokenizer", + "macrolanguage": false }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", + "Grek": { + "full_object": "SpaCyTokenizer(\"el\")", + "original_lang_name": "greek", + "original_lang_code": "ell", "scripts": [ - "Sinh" + "Grek" ], "class_name": "SpaCyTokenizer", "macrolanguage": false @@ -5396,14712 +3404,1557 @@ }, "children": [ { - "name": "Frisian, Northern", - "iso_1_code": null, - "iso_3_code": "frr", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"en\")", - "original_lang_name": "english", - "original_lang_code": "eng", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4006", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Frisian", - "iso_1_code": "fy", - "iso_3_code": "fry", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"en\")", - "original_lang_name": "english", - "original_lang_code": "eng", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4007", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Saterfriesisch", - "iso_1_code": null, - "iso_3_code": "stq", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"en\")", - "original_lang_name": "english", - "original_lang_code": "eng", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4008", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - } - ], - "node_i": "4005", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "High German", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"de\")", - "original_lang_name": "german", - "original_lang_code": "deu", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "German", + "name": "Eastern", "iso_1_code": null, "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"de\")", - "original_lang_name": "german", - "original_lang_code": "deu", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", + "full_object": "SpaCyTokenizer(\"ne\")", + "original_lang_name": "nepali", + "original_lang_code": "nep", "scripts": [ "Latn", "Deva" ], "class_name": "SpaCyTokenizer", - "macrolanguage": false + "macrolanguage": true }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", + "Deva": { + "full_object": "SpaCyTokenizer(\"ne\")", + "original_lang_name": "nepali", + "original_lang_code": "nep", "scripts": [ "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" + "Deva" ], "class_name": "SpaCyTokenizer", "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ { - "name": "Hunsrik", - "iso_1_code": null, - "iso_3_code": "hrx", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"de\")", - "original_lang_name": "german", - "original_lang_code": "deu", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4011", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Middle German", + "name": "East Central", "iso_1_code": null, "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"de\")", - "original_lang_name": "german", - "original_lang_code": "deu", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", + "full_object": "SpaCyTokenizer(\"ne\")", + "original_lang_name": "nepali", + "original_lang_code": "nep", "scripts": [ "Latn", "Deva" ], "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", "macrolanguage": true }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", + "Deva": { + "full_object": "SpaCyTokenizer(\"ne\")", + "original_lang_name": "nepali", + "original_lang_code": "nep", "scripts": [ - "Arab" + "Latn", + "Deva" ], "class_name": "SpaCyTokenizer", "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ { - "name": "East Middle German", + "name": "Awadhi", "iso_1_code": null, - "iso_3_code": null, + "iso_3_code": "awa", "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"de\")", - "original_lang_name": "german", - "original_lang_code": "deu", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", + "Deva": { + "full_object": "SpaCyTokenizer(\"ne\")", + "original_lang_name": "nepali", + "original_lang_code": "nep", "scripts": [ - "Cyrl" + "Latn", + "Deva" ], "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", + "macrolanguage": true + } + }, + "children": [], + "node_i": "4073", + "scripts": [ + "Deva" + ], + "own_tokenizer": false + }, + { + "name": "Bagheli", + "iso_1_code": null, + "iso_3_code": "bfy", + "tokenizers": {}, + "children": [], + "node_i": "4074", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Fiji Hindi", + "iso_1_code": null, + "iso_3_code": "hif", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ne\")", + "original_lang_name": "nepali", + "original_lang_code": "nep", "scripts": [ - "Grek" + "Latn", + "Deva" ], "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, + "macrolanguage": true + } + }, + "children": [], + "node_i": "4075", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Chhattisgarhi", + "iso_1_code": null, + "iso_3_code": "hne", + "tokenizers": { "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", + "full_object": "SpaCyTokenizer(\"ne\")", + "original_lang_name": "nepali", + "original_lang_code": "nep", "scripts": [ "Latn", "Deva" ], "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", + "macrolanguage": true + } + }, + "children": [], + "node_i": "4076", + "scripts": [ + "Deva" + ], + "own_tokenizer": false + }, + { + "name": "Kamar", + "iso_1_code": null, + "iso_3_code": "keq", + "tokenizers": {}, + "children": [], + "node_i": "4077", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Surgujia", + "iso_1_code": null, + "iso_3_code": "sgj", + "tokenizers": {}, + "children": [], + "node_i": "4078", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4072", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Eastern Pahari", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ne\")", + "original_lang_name": "nepali", + "original_lang_code": "nep", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Deva": { + "full_object": "SpaCyTokenizer(\"ne\")", + "original_lang_name": "nepali", + "original_lang_code": "nep", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [ + { + "name": "Dotyali", + "iso_1_code": "ne", + "iso_3_code": "dty", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ne\")", + "original_lang_name": "nepali", + "original_lang_code": "nep", "scripts": [ "Latn", - "Beng" + "Deva" ], "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", "macrolanguage": true }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", + "Deva": { + "full_object": "SpaCyTokenizer(\"ne\")", + "original_lang_name": "nepali", + "original_lang_code": "nep", "scripts": [ - "Arab" + "Latn", + "Deva" ], "class_name": "SpaCyTokenizer", "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, - "children": [ - { - "name": "German, Standard", - "iso_1_code": "de", - "iso_3_code": "deu", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"de\")", - "original_lang_name": "german", - "original_lang_code": "deu", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4014", - "scripts": [ - "Latn" - ], - "own_tokenizer": true - }, - { - "name": "Silesian, Lower", - "iso_1_code": null, - "iso_3_code": "sli", - "tokenizers": {}, - "children": [], - "node_i": "4015", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Saxon, Upper", - "iso_1_code": null, - "iso_3_code": "sxu", - "tokenizers": {}, - "children": [], - "node_i": "4016", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Wymysorys", - "iso_1_code": null, - "iso_3_code": "wym", - "tokenizers": {}, - "children": [], - "node_i": "4017", - "scripts": [], - "own_tokenizer": false - } + "children": [], + "node_i": "4080", + "scripts": [ + "Deva" ], - "node_i": "4013", + "own_tokenizer": true + }, + { + "name": "Jumli", + "iso_1_code": null, + "iso_3_code": "jml", + "tokenizers": {}, + "children": [], + "node_i": "4081", "scripts": [], "own_tokenizer": false }, { - "name": "West Middle German", - "iso_1_code": null, - "iso_3_code": null, + "name": "Nepali", + "iso_1_code": "ne", + "iso_3_code": "npi", "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"lb\")", - "original_lang_name": "luxembourgish", - "original_lang_code": "ltz", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", + "full_object": "SpaCyTokenizer(\"ne\")", + "original_lang_name": "nepali", + "original_lang_code": "nep", "scripts": [ "Latn", "Deva" ], "class_name": "SpaCyTokenizer", - "macrolanguage": false + "macrolanguage": true }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", + "Deva": { + "full_object": "SpaCyTokenizer(\"ne\")", + "original_lang_name": "nepali", + "original_lang_code": "nep", "scripts": [ "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" + "Deva" ], "class_name": "SpaCyTokenizer", "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, - "children": [ - { - "name": "Ripuarian", - "iso_1_code": null, - "iso_3_code": "ksh", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"lb\")", - "original_lang_name": "luxembourgish", - "original_lang_code": "ltz", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4019", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "German, Pennsylvania", - "iso_1_code": null, - "iso_3_code": "pdc", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"lb\")", - "original_lang_name": "luxembourgish", - "original_lang_code": "ltz", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4020", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Palatinate Franconian", - "iso_1_code": null, - "iso_3_code": "pfl", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"lb\")", - "original_lang_name": "luxembourgish", - "original_lang_code": "ltz", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4021", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Moselle Franconian", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"lb\")", - "original_lang_name": "luxembourgish", - "original_lang_code": "ltz", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Luxembourgish", - "iso_1_code": "lb", - "iso_3_code": "ltz", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"lb\")", - "original_lang_name": "luxembourgish", - "original_lang_code": "ltz", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4023", - "scripts": [ - "Latn" - ], - "own_tokenizer": true - } - ], - "node_i": "4022", - "scripts": [], - "own_tokenizer": false - } + "children": [], + "node_i": "4082", + "scripts": [ + "Latn", + "Deva" ], - "node_i": "4018", - "scripts": [], - "own_tokenizer": false + "own_tokenizer": true } ], - "node_i": "4012", + "node_i": "4079", "scripts": [], "own_tokenizer": false - }, - { - "name": "Upper German", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"de\")", - "original_lang_name": "german", - "original_lang_code": "deu", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true + } + ], + "node_i": "4071", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Western", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", + "scripts": [ + "Latn", + "Gujr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Gujr": { + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", + "scripts": [ + "Latn", + "Gujr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Guru": { + "full_object": "IndicNLPTokenizer(\"pa\")", + "original_lang_name": "punjabi", + "original_lang_code": "pan", + "scripts": [ + "Latn", + "Guru" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": false + }, + "Deva": { + "full_object": "SpaCyTokenizer(\"ne\")", + "original_lang_name": "nepali", + "original_lang_code": "nep", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ur\")", + "original_lang_name": "urdu", + "original_lang_code": "urd", + "scripts": [ + "Latn", + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Grek": { + "full_object": "SpaCyTokenizer(\"el\")", + "original_lang_name": "greek", + "original_lang_code": "ell", + "scripts": [ + "Grek" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Dawoodi", + "iso_1_code": null, + "iso_3_code": "dmk", + "tokenizers": {}, + "children": [], + "node_i": "4084", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Parya", + "iso_1_code": null, + "iso_3_code": "paq", + "tokenizers": {}, + "children": [], + "node_i": "4085", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Powari", + "iso_1_code": null, + "iso_3_code": "pwr", + "tokenizers": {}, + "children": [], + "node_i": "4086", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Bhil", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ + { + "name": "Bareli, Pauri", + "iso_1_code": null, + "iso_3_code": "bfb", + "tokenizers": {}, + "children": [], + "node_i": "4088", + "scripts": [], + "own_tokenizer": false }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true + { + "name": "Bareli, Rathwi", + "iso_1_code": null, + "iso_3_code": "bgd", + "tokenizers": {}, + "children": [], + "node_i": "4089", + "scripts": [], + "own_tokenizer": false }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ { - "name": "Eastern Franconian", + "name": "Bauria", "iso_1_code": null, - "iso_3_code": "vmf", + "iso_3_code": "bge", "tokenizers": {}, "children": [], - "node_i": "4025", + "node_i": "4090", "scripts": [], "own_tokenizer": false }, { - "name": "Alemannic", + "name": "Bhili", "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { + "iso_3_code": "bhb", + "tokenizers": {}, + "children": [], + "node_i": "4091", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Bhilali", + "iso_1_code": null, + "iso_3_code": "bhi", + "tokenizers": {}, + "children": [], + "node_i": "4092", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Bareli, Palya", + "iso_1_code": null, + "iso_3_code": "bpx", + "tokenizers": {}, + "children": [], + "node_i": "4093", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Chodri", + "iso_1_code": null, + "iso_3_code": "cdi", + "tokenizers": {}, + "children": [], + "node_i": "4094", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Dhodia", + "iso_1_code": null, + "iso_3_code": "dho", + "tokenizers": {}, + "children": [], + "node_i": "4095", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Dubli", + "iso_1_code": null, + "iso_3_code": "dub", + "tokenizers": {}, + "children": [], + "node_i": "4096", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Dungra Bhil", + "iso_1_code": null, + "iso_3_code": "duh", + "tokenizers": {}, + "children": [], + "node_i": "4097", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Garasia, Adiwasi", + "iso_1_code": null, + "iso_3_code": "gas", + "tokenizers": {}, + "children": [], + "node_i": "4098", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Gamit", + "iso_1_code": null, + "iso_3_code": "gbl", + "tokenizers": {}, + "children": [], + "node_i": "4099", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Garasia, Rajput", + "iso_1_code": null, + "iso_3_code": "gra", + "tokenizers": {}, + "children": [], + "node_i": "4100", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Mawchi", + "iso_1_code": null, + "iso_3_code": "mke", + "tokenizers": {}, + "children": [], + "node_i": "4101", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Nahali", + "iso_1_code": null, + "iso_3_code": "nlx", + "tokenizers": {}, + "children": [], + "node_i": "4102", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Noiri", + "iso_1_code": null, + "iso_3_code": "noi", + "tokenizers": {}, + "children": [], + "node_i": "4103", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Pardhi", + "iso_1_code": null, + "iso_3_code": "pcl", + "tokenizers": {}, + "children": [], + "node_i": "4104", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Rathawi", + "iso_1_code": null, + "iso_3_code": "rtw", + "tokenizers": {}, + "children": [], + "node_i": "4105", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Wagdi", + "iso_1_code": null, + "iso_3_code": "wbr", + "tokenizers": {}, + "children": [], + "node_i": "4106", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4087", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Dom", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ + { + "name": "Domari", + "iso_1_code": null, + "iso_3_code": "rmt", + "tokenizers": {}, + "children": [], + "node_i": "4108", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4107", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Gujarati", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", + "scripts": [ + "Latn", + "Gujr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Gujr": { + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", + "scripts": [ + "Latn", + "Gujr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Aer", + "iso_1_code": null, + "iso_3_code": "aeq", + "tokenizers": {}, + "children": [], + "node_i": "4110", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Koli, Kachi", + "iso_1_code": null, + "iso_3_code": "gjk", + "tokenizers": {}, + "children": [], + "node_i": "4111", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Gujarati", + "iso_1_code": "gu", + "iso_3_code": "guj", + "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"de\")", - "original_lang_name": "german", - "original_lang_code": "deu", + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", "scripts": [ - "Latn" + "Latn", + "Gujr" ], "class_name": "SpaCyTokenizer", "macrolanguage": false }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", + "Gujr": { + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" + "Latn", + "Gujr" ], "class_name": "SpaCyTokenizer", "macrolanguage": false } }, + "children": [], + "node_i": "4112", + "scripts": [ + "Gujr", + "Latn" + ], + "own_tokenizer": true + }, + { + "name": "Jandavra", + "iso_1_code": null, + "iso_3_code": "jnd", + "tokenizers": {}, + "children": [], + "node_i": "4113", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Koli, Parkari", + "iso_1_code": null, + "iso_3_code": "kvx", + "tokenizers": {}, + "children": [], + "node_i": "4114", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Koli, Wadiyari", + "iso_1_code": null, + "iso_3_code": "kxp", + "tokenizers": {}, + "children": [], + "node_i": "4115", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Sourashtra", + "iso_1_code": null, + "iso_3_code": "saz", + "tokenizers": {}, + "children": [], + "node_i": "4116", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Vasavi", + "iso_1_code": null, + "iso_3_code": "vas", + "tokenizers": {}, + "children": [], + "node_i": "4117", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Vaghri", + "iso_1_code": null, + "iso_3_code": "vgr", + "tokenizers": {}, + "children": [], + "node_i": "4118", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4109", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Khandesi", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ + { + "name": "Ahirani", + "iso_1_code": null, + "iso_3_code": "ahr", + "tokenizers": {}, + "children": [], + "node_i": "4120", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Dangi", + "iso_1_code": null, + "iso_3_code": "dhn", + "tokenizers": {}, + "children": [], + "node_i": "4121", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Khandesi", + "iso_1_code": null, + "iso_3_code": "khn", + "tokenizers": {}, + "children": [], + "node_i": "4122", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4119", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Pahari", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ + { + "name": "Central Pahari", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, "children": [ { - "name": "German, Colonia Tovar", + "name": "Kumaoni", "iso_1_code": null, - "iso_3_code": "gct", + "iso_3_code": "kfy", "tokenizers": {}, "children": [], - "node_i": "4027", + "node_i": "4125", "scripts": [], "own_tokenizer": false - }, + } + ], + "node_i": "4124", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Garhwali", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ { - "name": "German, Swiss", + "name": "Garhwali", "iso_1_code": null, - "iso_3_code": "gsw", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"de\")", - "original_lang_name": "german", - "original_lang_code": "deu", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, + "iso_3_code": "gbm", + "tokenizers": {}, "children": [], - "node_i": "4028", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Swabian", - "iso_1_code": null, - "iso_3_code": "swg", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"de\")", - "original_lang_name": "german", - "original_lang_code": "deu", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4029", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Walser", - "iso_1_code": null, - "iso_3_code": "wae", - "tokenizers": {}, - "children": [], - "node_i": "4030", - "scripts": [], + "node_i": "4127", + "scripts": [], "own_tokenizer": false } ], - "node_i": "4026", + "node_i": "4126", "scripts": [], "own_tokenizer": false }, { - "name": "Bavarian-Austrian", + "name": "Western Pahari", "iso_1_code": null, "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"de\")", - "original_lang_name": "german", - "original_lang_code": "deu", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "tokenizers": {}, + "children": [ + { + "name": "Pahari, Mahasu", + "iso_1_code": null, + "iso_3_code": "bfz", + "tokenizers": {}, + "children": [], + "node_i": "4129", + "scripts": [], + "own_tokenizer": false }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + { + "name": "Bhadrawahi", + "iso_1_code": null, + "iso_3_code": "bhd", + "tokenizers": {}, + "children": [], + "node_i": "4130", + "scripts": [], + "own_tokenizer": false }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + { + "name": "Bhattiyali", + "iso_1_code": null, + "iso_3_code": "bht", + "tokenizers": {}, + "children": [], + "node_i": "4131", + "scripts": [], + "own_tokenizer": false }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + { + "name": "Chambeali", + "iso_1_code": null, + "iso_3_code": "cdh", + "tokenizers": {}, + "children": [], + "node_i": "4132", + "scripts": [], + "own_tokenizer": false }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + { + "name": "Churahi", + "iso_1_code": null, + "iso_3_code": "cdj", + "tokenizers": {}, + "children": [], + "node_i": "4133", + "scripts": [], + "own_tokenizer": false }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false + { + "name": "Dogri", + "iso_1_code": null, + "iso_3_code": "dgo", + "tokenizers": {}, + "children": [], + "node_i": "4134", + "scripts": [], + "own_tokenizer": false }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + { + "name": "Gaddi", + "iso_1_code": null, + "iso_3_code": "gbk", + "tokenizers": {}, + "children": [], + "node_i": "4135", + "scripts": [], + "own_tokenizer": false }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true + { + "name": "Hinduri", + "iso_1_code": null, + "iso_3_code": "hii", + "tokenizers": {}, + "children": [], + "node_i": "4136", + "scripts": [], + "own_tokenizer": false }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true + { + "name": "Khah", + "iso_1_code": null, + "iso_3_code": "hkh", + "tokenizers": {}, + "children": [], + "node_i": "4137", + "scripts": [], + "own_tokenizer": false }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ { - "name": "Bavarian", + "name": "Jaunsari", "iso_1_code": null, - "iso_3_code": "bar", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"de\")", - "original_lang_name": "german", - "original_lang_code": "deu", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, + "iso_3_code": "jns", + "tokenizers": {}, "children": [], - "node_i": "4032", - "scripts": [ - "Latn" - ], + "node_i": "4138", + "scripts": [], "own_tokenizer": false }, { - "name": "Cimbrian", + "name": "Bilaspuri", "iso_1_code": null, - "iso_3_code": "cim", + "iso_3_code": "kfs", "tokenizers": {}, "children": [], - "node_i": "4033", + "node_i": "4139", "scripts": [], "own_tokenizer": false }, { - "name": "Hutterisch", + "name": "Pahari, Kullu", "iso_1_code": null, - "iso_3_code": "geh", + "iso_3_code": "kfx", "tokenizers": {}, "children": [], - "node_i": "4034", + "node_i": "4140", "scripts": [], "own_tokenizer": false }, { - "name": "M\u00f2cheno", + "name": "Kinnauri, Pahari", "iso_1_code": null, - "iso_3_code": "mhn", + "iso_3_code": "kjo", "tokenizers": {}, "children": [], - "node_i": "4035", + "node_i": "4141", "scripts": [], "own_tokenizer": false - } + }, + { + "name": "Mandeali", + "iso_1_code": null, + "iso_3_code": "mjl", + "tokenizers": {}, + "children": [], + "node_i": "4142", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Pangwali", + "iso_1_code": null, + "iso_3_code": "pgg", + "tokenizers": {}, + "children": [], + "node_i": "4143", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Sirmauri", + "iso_1_code": null, + "iso_3_code": "srx", + "tokenizers": {}, + "children": [], + "node_i": "4144", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Kangri", + "iso_1_code": null, + "iso_3_code": "xnr", + "tokenizers": {}, + "children": [], + "node_i": "4145", + "scripts": [], + "own_tokenizer": false + } ], - "node_i": "4031", + "node_i": "4128", "scripts": [], "own_tokenizer": false } ], - "node_i": "4024", + "node_i": "4123", "scripts": [], "own_tokenizer": false - } - ], - "node_i": "4010", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Yiddish", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"de\")", - "original_lang_name": "german", - "original_lang_code": "deu", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Yiddish, Eastern", - "iso_1_code": "yi", - "iso_3_code": "ydd", - "tokenizers": {}, - "children": [], - "node_i": "4037", - "scripts": [ - "Hebr" - ], - "own_tokenizer": false }, { - "name": "Yiddish, Western", - "iso_1_code": "yi", - "iso_3_code": "yih", - "tokenizers": {}, - "children": [], - "node_i": "4038", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4036", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4009", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Low Saxon-Low Franconian", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"nl\")", - "original_lang_name": "dutch", - "original_lang_code": "nld", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Low Franconian", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"nl\")", - "original_lang_name": "dutch", - "original_lang_code": "nld", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Afrikaans", - "iso_1_code": "af", - "iso_3_code": "afr", + "name": "Panjabi", + "iso_1_code": null, + "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"af\")", - "original_lang_name": "afrikaans", - "original_lang_code": "afr", + "full_object": "IndicNLPTokenizer(\"pa\")", + "original_lang_name": "punjabi", + "original_lang_code": "pan", "scripts": [ - "Latn" + "Latn", + "Guru" ], - "class_name": "SpaCyTokenizer", + "class_name": "IndicNLPTokenizer", "macrolanguage": false - } - }, - "children": [], - "node_i": "4041", - "scripts": [ - "Latn" - ], - "own_tokenizer": true - }, - { - "name": "Limburgish", - "iso_1_code": "li", - "iso_3_code": "lim", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"nl\")", - "original_lang_name": "dutch", - "original_lang_code": "nld", + }, + "Guru": { + "full_object": "IndicNLPTokenizer(\"pa\")", + "original_lang_name": "punjabi", + "original_lang_code": "pan", "scripts": [ - "Latn" + "Latn", + "Guru" ], - "class_name": "SpaCyTokenizer", + "class_name": "IndicNLPTokenizer", "macrolanguage": false - } - }, - "children": [], - "node_i": "4042", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Dutch", - "iso_1_code": "nl", - "iso_3_code": "nld", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"nl\")", - "original_lang_name": "dutch", - "original_lang_code": "nld", + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ur\")", + "original_lang_name": "urdu", + "original_lang_code": "urd", "scripts": [ - "Latn" + "Latn", + "Arab" ], "class_name": "SpaCyTokenizer", "macrolanguage": false } }, - "children": [], - "node_i": "4043", - "scripts": [ - "Latn" - ], - "own_tokenizer": true - }, - { - "name": "West Flemish", - "iso_1_code": null, - "iso_3_code": "vls", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"nl\")", - "original_lang_name": "dutch", - "original_lang_code": "nld", + "children": [ + { + "name": "Punjabi, Eastern", + "iso_1_code": "pa", + "iso_3_code": "pan", + "tokenizers": { + "Latn": { + "full_object": "IndicNLPTokenizer(\"pa\")", + "original_lang_name": "punjabi", + "original_lang_code": "pan", + "scripts": [ + "Latn", + "Guru" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": false + }, + "Guru": { + "full_object": "IndicNLPTokenizer(\"pa\")", + "original_lang_name": "punjabi", + "original_lang_code": "pan", + "scripts": [ + "Latn", + "Guru" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4147", "scripts": [ - "Latn" + "Latn", + "Guru" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "own_tokenizer": true + }, + { + "name": "Western Panjabi", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ur\")", + "original_lang_name": "urdu", + "original_lang_code": "urd", + "scripts": [ + "Latn", + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Hindko, Southern", + "iso_1_code": null, + "iso_3_code": "hnd", + "tokenizers": {}, + "children": [], + "node_i": "4149", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Hindko, Northern", + "iso_1_code": null, + "iso_3_code": "hno", + "tokenizers": {}, + "children": [], + "node_i": "4150", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Inku", + "iso_1_code": null, + "iso_3_code": "jat", + "tokenizers": {}, + "children": [], + "node_i": "4151", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Pahari-Potwari", + "iso_1_code": null, + "iso_3_code": "phr", + "tokenizers": {}, + "children": [], + "node_i": "4152", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Punjabi, Western", + "iso_1_code": null, + "iso_3_code": "pnb", + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ur\")", + "original_lang_name": "urdu", + "original_lang_code": "urd", + "scripts": [ + "Latn", + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4153", + "scripts": [ + "Arab" + ], + "own_tokenizer": false + }, + { + "name": "Saraiki", + "iso_1_code": null, + "iso_3_code": "skr", + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ur\")", + "original_lang_name": "urdu", + "original_lang_code": "urd", + "scripts": [ + "Latn", + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4154", + "scripts": [ + "Arab" + ], + "own_tokenizer": false + }, + { + "name": "Khetrani", + "iso_1_code": null, + "iso_3_code": "xhe", + "tokenizers": {}, + "children": [], + "node_i": "4155", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4148", + "scripts": [], + "own_tokenizer": false } - }, - "children": [], - "node_i": "4044", - "scripts": [ - "Latn" ], + "node_i": "4146", + "scripts": [], "own_tokenizer": false }, { - "name": "Zeeuws", + "name": "Rajasthani", "iso_1_code": null, - "iso_3_code": "zea", + "iso_3_code": null, "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"nl\")", - "original_lang_name": "dutch", - "original_lang_code": "nld", + "Deva": { + "full_object": "SpaCyTokenizer(\"ne\")", + "original_lang_name": "nepali", + "original_lang_code": "nep", "scripts": [ - "Latn" + "Latn", + "Deva" ], "class_name": "SpaCyTokenizer", - "macrolanguage": false + "macrolanguage": true } }, - "children": [], - "node_i": "4045", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - } - ], - "node_i": "4040", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Low Saxon", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"nl\")", - "original_lang_name": "dutch", - "original_lang_code": "nld", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" + "children": [ + { + "name": "Gujari", + "iso_1_code": null, + "iso_3_code": "gju", + "tokenizers": {}, + "children": [], + "node_i": "4157", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Marwari", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ + { + "name": "Dhundari", + "iso_1_code": null, + "iso_3_code": "dhd", + "tokenizers": {}, + "children": [], + "node_i": "4159", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Godwari", + "iso_1_code": null, + "iso_3_code": "gdx", + "tokenizers": {}, + "children": [], + "node_i": "4160", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Goaria", + "iso_1_code": null, + "iso_3_code": "gig", + "tokenizers": {}, + "children": [], + "node_i": "4161", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Jogi", + "iso_1_code": null, + "iso_3_code": "jog", + "tokenizers": {}, + "children": [], + "node_i": "4162", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Loarki", + "iso_1_code": null, + "iso_3_code": "lrk", + "tokenizers": {}, + "children": [], + "node_i": "4163", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Dhatki", + "iso_1_code": null, + "iso_3_code": "mki", + "tokenizers": {}, + "children": [], + "node_i": "4164", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Mewari", + "iso_1_code": null, + "iso_3_code": "mtr", + "tokenizers": {}, + "children": [], + "node_i": "4165", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Marwari", + "iso_1_code": null, + "iso_3_code": "mve", + "tokenizers": {}, + "children": [], + "node_i": "4166", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Marwari", + "iso_1_code": null, + "iso_3_code": "rwr", + "tokenizers": {}, + "children": [], + "node_i": "4167", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Shekhawati", + "iso_1_code": null, + "iso_3_code": "swv", + "tokenizers": {}, + "children": [], + "node_i": "4168", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Merwari", + "iso_1_code": null, + "iso_3_code": "wry", + "tokenizers": {}, + "children": [], + "node_i": "4169", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4158", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Unclassified", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Deva": { + "full_object": "SpaCyTokenizer(\"ne\")", + "original_lang_name": "nepali", + "original_lang_code": "nep", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [ + { + "name": "Bagri", + "iso_1_code": null, + "iso_3_code": "bgq", + "tokenizers": {}, + "children": [], + "node_i": "4171", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Lohar, Gade", + "iso_1_code": null, + "iso_3_code": "gda", + "tokenizers": {}, + "children": [], + "node_i": "4172", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Gurgula", + "iso_1_code": null, + "iso_3_code": "ggg", + "tokenizers": {}, + "children": [], + "node_i": "4173", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Haroti", + "iso_1_code": null, + "iso_3_code": "hoj", + "tokenizers": {}, + "children": [], + "node_i": "4174", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Lambadi", + "iso_1_code": null, + "iso_3_code": "lmn", + "tokenizers": {}, + "children": [], + "node_i": "4175", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Malvi", + "iso_1_code": null, + "iso_3_code": "mup", + "tokenizers": { + "Deva": { + "full_object": "SpaCyTokenizer(\"ne\")", + "original_lang_name": "nepali", + "original_lang_code": "nep", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [], + "node_i": "4176", + "scripts": [ + "Deva" + ], + "own_tokenizer": false + }, + { + "name": "Nimadi", + "iso_1_code": null, + "iso_3_code": "noe", + "tokenizers": {}, + "children": [], + "node_i": "4177", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4170", + "scripts": [], + "own_tokenizer": false + } ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Achterhoeks", - "iso_1_code": null, - "iso_3_code": "act", - "tokenizers": {}, - "children": [], - "node_i": "4047", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Drents", - "iso_1_code": null, - "iso_3_code": "drt", - "tokenizers": {}, - "children": [], - "node_i": "4048", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Saxon, East Frisian Low", - "iso_1_code": null, - "iso_3_code": "frs", - "tokenizers": {}, - "children": [], - "node_i": "4049", - "scripts": [], - "own_tokenizer": false + "node_i": "4156", + "scripts": [], + "own_tokenizer": false }, { - "name": "Gronings", + "name": "Romani", "iso_1_code": null, - "iso_3_code": "gos", + "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"nl\")", - "original_lang_name": "dutch", - "original_lang_code": "nld", + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", "scripts": [ - "Latn" + "Latn", + "Gujr" ], "class_name": "SpaCyTokenizer", "macrolanguage": false - } - }, - "children": [], - "node_i": "4050", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Saxon, Low", - "iso_1_code": null, - "iso_3_code": "nds", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"nl\")", - "original_lang_name": "dutch", - "original_lang_code": "nld", + }, + "Gujr": { + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", "scripts": [ - "Latn" + "Latn", + "Gujr" ], "class_name": "SpaCyTokenizer", "macrolanguage": false - } - }, - "children": [], - "node_i": "4051", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Plautdietsch", - "iso_1_code": null, - "iso_3_code": "pdt", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"nl\")", - "original_lang_name": "dutch", - "original_lang_code": "nld", + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4052", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Sallands", - "iso_1_code": null, - "iso_3_code": "sdz", - "tokenizers": {}, - "children": [], - "node_i": "4053", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Stellingwerfs", - "iso_1_code": null, - "iso_3_code": "stl", - "tokenizers": {}, - "children": [], - "node_i": "4054", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Twents", - "iso_1_code": null, - "iso_3_code": "twd", - "tokenizers": {}, - "children": [], - "node_i": "4055", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Veluws", - "iso_1_code": null, - "iso_3_code": "vel", - "tokenizers": {}, - "children": [], - "node_i": "4056", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Westphalien", - "iso_1_code": null, - "iso_3_code": "wep", - "tokenizers": {}, - "children": [], - "node_i": "4057", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4046", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4039", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4000", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "3984", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Greek", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Latn": { - "full_object": "SpaCyTokenizer(\"en\")", - "original_lang_name": "english", - "original_lang_code": "eng", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Attic", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Latn": { - "full_object": "SpaCyTokenizer(\"en\")", - "original_lang_name": "english", - "original_lang_code": "eng", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Cappadocian Greek", - "iso_1_code": null, - "iso_3_code": "cpg", - "tokenizers": {}, - "children": [], - "node_i": "4060", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Greek", - "iso_1_code": "el", - "iso_3_code": "ell", - "tokenizers": { - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4061", - "scripts": [ - "Grek" - ], - "own_tokenizer": true - }, - { - "name": "Greek, Ancient", - "iso_1_code": null, - "iso_3_code": "grc", - "tokenizers": { - "Grek": { - "full_object": "SpaCyTokenizer(\"grc\")", - "original_lang_name": "ancient_greek", - "original_lang_code": "grc", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4062", - "scripts": [ - "Grek" - ], - "own_tokenizer": true - }, - { - "name": "Pontic", - "iso_1_code": null, - "iso_3_code": "pnt", - "tokenizers": { - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4063", - "scripts": [ - "Grek" - ], - "own_tokenizer": false - }, - { - "name": "Yevanic", - "iso_1_code": null, - "iso_3_code": "yej", - "tokenizers": {}, - "children": [], - "node_i": "4064", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4059", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Doric", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Latn": { - "full_object": "SpaCyTokenizer(\"en\")", - "original_lang_name": "english", - "original_lang_code": "eng", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ru\")", - "original_lang_name": "russian", - "original_lang_code": "rus", - "scripts": [ - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Tsakonian", - "iso_1_code": null, - "iso_3_code": "tsd", - "tokenizers": {}, - "children": [], - "node_i": "4066", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4065", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4058", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Indo-Iranian", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Indo-Aryan", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Sanskrit", - "iso_1_code": "sa", - "iso_3_code": "san", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"sa\")", - "original_lang_name": "sanskrit", - "original_lang_code": "san", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"sa\")", - "original_lang_name": "sanskrit", - "original_lang_code": "san", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4069", - "scripts": [ - "Deva", - "Latn" - ], - "own_tokenizer": true - }, - { - "name": "Intermediate Divisions", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Eastern", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "East Central", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Awadhi", - "iso_1_code": null, - "iso_3_code": "awa", - "tokenizers": { - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - } - }, - "children": [], - "node_i": "4073", - "scripts": [ - "Deva" - ], - "own_tokenizer": false - }, - { - "name": "Bagheli", - "iso_1_code": null, - "iso_3_code": "bfy", - "tokenizers": {}, - "children": [], - "node_i": "4074", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Fiji Hindi", - "iso_1_code": null, - "iso_3_code": "hif", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - } - }, - "children": [], - "node_i": "4075", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Chhattisgarhi", - "iso_1_code": null, - "iso_3_code": "hne", - "tokenizers": { - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - } - }, - "children": [], - "node_i": "4076", - "scripts": [ - "Deva" - ], - "own_tokenizer": false - }, - { - "name": "Kamar", - "iso_1_code": null, - "iso_3_code": "keq", - "tokenizers": {}, - "children": [], - "node_i": "4077", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Surgujia", - "iso_1_code": null, - "iso_3_code": "sgj", - "tokenizers": {}, - "children": [], - "node_i": "4078", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4072", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Eastern Pahari", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Dotyali", - "iso_1_code": "ne", - "iso_3_code": "dty", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - } - }, - "children": [], - "node_i": "4080", - "scripts": [ - "Deva" - ], - "own_tokenizer": true - }, - { - "name": "Jumli", - "iso_1_code": null, - "iso_3_code": "jml", - "tokenizers": {}, - "children": [], - "node_i": "4081", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Nepali", - "iso_1_code": "ne", - "iso_3_code": "npi", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - } - }, - "children": [], - "node_i": "4082", - "scripts": [ - "Latn", - "Deva" - ], - "own_tokenizer": true - } - ], - "node_i": "4079", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4071", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Western", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Dawoodi", - "iso_1_code": null, - "iso_3_code": "dmk", - "tokenizers": {}, - "children": [], - "node_i": "4084", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Parya", - "iso_1_code": null, - "iso_3_code": "paq", - "tokenizers": {}, - "children": [], - "node_i": "4085", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Powari", - "iso_1_code": null, - "iso_3_code": "pwr", - "tokenizers": {}, - "children": [], - "node_i": "4086", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Bhil", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Bareli, Pauri", - "iso_1_code": null, - "iso_3_code": "bfb", - "tokenizers": {}, - "children": [], - "node_i": "4088", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Bareli, Rathwi", - "iso_1_code": null, - "iso_3_code": "bgd", - "tokenizers": {}, - "children": [], - "node_i": "4089", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Bauria", - "iso_1_code": null, - "iso_3_code": "bge", - "tokenizers": {}, - "children": [], - "node_i": "4090", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Bhili", - "iso_1_code": null, - "iso_3_code": "bhb", - "tokenizers": {}, - "children": [], - "node_i": "4091", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Bhilali", - "iso_1_code": null, - "iso_3_code": "bhi", - "tokenizers": {}, - "children": [], - "node_i": "4092", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Bareli, Palya", - "iso_1_code": null, - "iso_3_code": "bpx", - "tokenizers": {}, - "children": [], - "node_i": "4093", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Chodri", - "iso_1_code": null, - "iso_3_code": "cdi", - "tokenizers": {}, - "children": [], - "node_i": "4094", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Dhodia", - "iso_1_code": null, - "iso_3_code": "dho", - "tokenizers": {}, - "children": [], - "node_i": "4095", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Dubli", - "iso_1_code": null, - "iso_3_code": "dub", - "tokenizers": {}, - "children": [], - "node_i": "4096", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Dungra Bhil", - "iso_1_code": null, - "iso_3_code": "duh", - "tokenizers": {}, - "children": [], - "node_i": "4097", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Garasia, Adiwasi", - "iso_1_code": null, - "iso_3_code": "gas", - "tokenizers": {}, - "children": [], - "node_i": "4098", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Gamit", - "iso_1_code": null, - "iso_3_code": "gbl", - "tokenizers": {}, - "children": [], - "node_i": "4099", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Garasia, Rajput", - "iso_1_code": null, - "iso_3_code": "gra", - "tokenizers": {}, - "children": [], - "node_i": "4100", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Mawchi", - "iso_1_code": null, - "iso_3_code": "mke", - "tokenizers": {}, - "children": [], - "node_i": "4101", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Nahali", - "iso_1_code": null, - "iso_3_code": "nlx", - "tokenizers": {}, - "children": [], - "node_i": "4102", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Noiri", - "iso_1_code": null, - "iso_3_code": "noi", - "tokenizers": {}, - "children": [], - "node_i": "4103", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Pardhi", - "iso_1_code": null, - "iso_3_code": "pcl", - "tokenizers": {}, - "children": [], - "node_i": "4104", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Rathawi", - "iso_1_code": null, - "iso_3_code": "rtw", - "tokenizers": {}, - "children": [], - "node_i": "4105", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Wagdi", - "iso_1_code": null, - "iso_3_code": "wbr", - "tokenizers": {}, - "children": [], - "node_i": "4106", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4087", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Dom", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Domari", - "iso_1_code": null, - "iso_3_code": "rmt", - "tokenizers": {}, - "children": [], - "node_i": "4108", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4107", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Gujarati", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Aer", - "iso_1_code": null, - "iso_3_code": "aeq", - "tokenizers": {}, - "children": [], - "node_i": "4110", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Koli, Kachi", - "iso_1_code": null, - "iso_3_code": "gjk", - "tokenizers": {}, - "children": [], - "node_i": "4111", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Gujarati", - "iso_1_code": "gu", - "iso_3_code": "guj", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4112", - "scripts": [ - "Gujr", - "Latn" - ], - "own_tokenizer": true - }, - { - "name": "Jandavra", - "iso_1_code": null, - "iso_3_code": "jnd", - "tokenizers": {}, - "children": [], - "node_i": "4113", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Koli, Parkari", - "iso_1_code": null, - "iso_3_code": "kvx", - "tokenizers": {}, - "children": [], - "node_i": "4114", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Koli, Wadiyari", - "iso_1_code": null, - "iso_3_code": "kxp", - "tokenizers": {}, - "children": [], - "node_i": "4115", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Sourashtra", - "iso_1_code": null, - "iso_3_code": "saz", - "tokenizers": {}, - "children": [], - "node_i": "4116", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Vasavi", - "iso_1_code": null, - "iso_3_code": "vas", - "tokenizers": {}, - "children": [], - "node_i": "4117", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Vaghri", - "iso_1_code": null, - "iso_3_code": "vgr", - "tokenizers": {}, - "children": [], - "node_i": "4118", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4109", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Khandesi", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Ahirani", - "iso_1_code": null, - "iso_3_code": "ahr", - "tokenizers": {}, - "children": [], - "node_i": "4120", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Dangi", - "iso_1_code": null, - "iso_3_code": "dhn", - "tokenizers": {}, - "children": [], - "node_i": "4121", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Khandesi", - "iso_1_code": null, - "iso_3_code": "khn", - "tokenizers": {}, - "children": [], - "node_i": "4122", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4119", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Pahari", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Central Pahari", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Kumaoni", - "iso_1_code": null, - "iso_3_code": "kfy", - "tokenizers": {}, - "children": [], - "node_i": "4125", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4124", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Garhwali", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Garhwali", - "iso_1_code": null, - "iso_3_code": "gbm", - "tokenizers": {}, - "children": [], - "node_i": "4127", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4126", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Western Pahari", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Pahari, Mahasu", - "iso_1_code": null, - "iso_3_code": "bfz", - "tokenizers": {}, - "children": [], - "node_i": "4129", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Bhadrawahi", - "iso_1_code": null, - "iso_3_code": "bhd", - "tokenizers": {}, - "children": [], - "node_i": "4130", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Bhattiyali", - "iso_1_code": null, - "iso_3_code": "bht", - "tokenizers": {}, - "children": [], - "node_i": "4131", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Chambeali", - "iso_1_code": null, - "iso_3_code": "cdh", - "tokenizers": {}, - "children": [], - "node_i": "4132", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Churahi", - "iso_1_code": null, - "iso_3_code": "cdj", - "tokenizers": {}, - "children": [], - "node_i": "4133", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Dogri", - "iso_1_code": null, - "iso_3_code": "dgo", - "tokenizers": {}, - "children": [], - "node_i": "4134", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Gaddi", - "iso_1_code": null, - "iso_3_code": "gbk", - "tokenizers": {}, - "children": [], - "node_i": "4135", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Hinduri", - "iso_1_code": null, - "iso_3_code": "hii", - "tokenizers": {}, - "children": [], - "node_i": "4136", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Khah", - "iso_1_code": null, - "iso_3_code": "hkh", - "tokenizers": {}, - "children": [], - "node_i": "4137", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Jaunsari", - "iso_1_code": null, - "iso_3_code": "jns", - "tokenizers": {}, - "children": [], - "node_i": "4138", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Bilaspuri", - "iso_1_code": null, - "iso_3_code": "kfs", - "tokenizers": {}, - "children": [], - "node_i": "4139", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Pahari, Kullu", - "iso_1_code": null, - "iso_3_code": "kfx", - "tokenizers": {}, - "children": [], - "node_i": "4140", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Kinnauri, Pahari", - "iso_1_code": null, - "iso_3_code": "kjo", - "tokenizers": {}, - "children": [], - "node_i": "4141", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Mandeali", - "iso_1_code": null, - "iso_3_code": "mjl", - "tokenizers": {}, - "children": [], - "node_i": "4142", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Pangwali", - "iso_1_code": null, - "iso_3_code": "pgg", - "tokenizers": {}, - "children": [], - "node_i": "4143", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Sirmauri", - "iso_1_code": null, - "iso_3_code": "srx", - "tokenizers": {}, - "children": [], - "node_i": "4144", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Kangri", - "iso_1_code": null, - "iso_3_code": "xnr", - "tokenizers": {}, - "children": [], - "node_i": "4145", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4128", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4123", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Panjabi", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Punjabi, Eastern", - "iso_1_code": "pa", - "iso_3_code": "pan", - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4147", - "scripts": [ - "Latn", - "Guru" - ], - "own_tokenizer": true - }, - { - "name": "Western Panjabi", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Hindko, Southern", - "iso_1_code": null, - "iso_3_code": "hnd", - "tokenizers": {}, - "children": [], - "node_i": "4149", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Hindko, Northern", - "iso_1_code": null, - "iso_3_code": "hno", - "tokenizers": {}, - "children": [], - "node_i": "4150", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Inku", - "iso_1_code": null, - "iso_3_code": "jat", - "tokenizers": {}, - "children": [], - "node_i": "4151", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Pahari-Potwari", - "iso_1_code": null, - "iso_3_code": "phr", - "tokenizers": {}, - "children": [], - "node_i": "4152", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Punjabi, Western", - "iso_1_code": null, - "iso_3_code": "pnb", - "tokenizers": { - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4153", - "scripts": [ - "Arab" - ], - "own_tokenizer": false - }, - { - "name": "Saraiki", - "iso_1_code": null, - "iso_3_code": "skr", - "tokenizers": { - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4154", - "scripts": [ - "Arab" - ], - "own_tokenizer": false - }, - { - "name": "Khetrani", - "iso_1_code": null, - "iso_3_code": "xhe", - "tokenizers": {}, - "children": [], - "node_i": "4155", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4148", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4146", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Rajasthani", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Gujari", - "iso_1_code": null, - "iso_3_code": "gju", - "tokenizers": {}, - "children": [], - "node_i": "4157", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Marwari", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Dhundari", - "iso_1_code": null, - "iso_3_code": "dhd", - "tokenizers": {}, - "children": [], - "node_i": "4159", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Godwari", - "iso_1_code": null, - "iso_3_code": "gdx", - "tokenizers": {}, - "children": [], - "node_i": "4160", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Goaria", - "iso_1_code": null, - "iso_3_code": "gig", - "tokenizers": {}, - "children": [], - "node_i": "4161", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Jogi", - "iso_1_code": null, - "iso_3_code": "jog", - "tokenizers": {}, - "children": [], - "node_i": "4162", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Loarki", - "iso_1_code": null, - "iso_3_code": "lrk", - "tokenizers": {}, - "children": [], - "node_i": "4163", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Dhatki", - "iso_1_code": null, - "iso_3_code": "mki", - "tokenizers": {}, - "children": [], - "node_i": "4164", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Mewari", - "iso_1_code": null, - "iso_3_code": "mtr", - "tokenizers": {}, - "children": [], - "node_i": "4165", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Marwari", - "iso_1_code": null, - "iso_3_code": "mve", - "tokenizers": {}, - "children": [], - "node_i": "4166", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Marwari", - "iso_1_code": null, - "iso_3_code": "rwr", - "tokenizers": {}, - "children": [], - "node_i": "4167", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Shekhawati", - "iso_1_code": null, - "iso_3_code": "swv", - "tokenizers": {}, - "children": [], - "node_i": "4168", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Merwari", - "iso_1_code": null, - "iso_3_code": "wry", - "tokenizers": {}, - "children": [], - "node_i": "4169", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4158", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Unclassified", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Bagri", - "iso_1_code": null, - "iso_3_code": "bgq", - "tokenizers": {}, - "children": [], - "node_i": "4171", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Lohar, Gade", - "iso_1_code": null, - "iso_3_code": "gda", - "tokenizers": {}, - "children": [], - "node_i": "4172", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Gurgula", - "iso_1_code": null, - "iso_3_code": "ggg", - "tokenizers": {}, - "children": [], - "node_i": "4173", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Haroti", - "iso_1_code": null, - "iso_3_code": "hoj", - "tokenizers": {}, - "children": [], - "node_i": "4174", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Lambadi", - "iso_1_code": null, - "iso_3_code": "lmn", - "tokenizers": {}, - "children": [], - "node_i": "4175", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Malvi", - "iso_1_code": null, - "iso_3_code": "mup", - "tokenizers": { - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - } - }, - "children": [], - "node_i": "4176", - "scripts": [ - "Deva" - ], - "own_tokenizer": false - }, - { - "name": "Nimadi", - "iso_1_code": null, - "iso_3_code": "noe", - "tokenizers": {}, - "children": [], - "node_i": "4177", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4170", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4156", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Romani", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Balkan", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Romani, Balkan", - "iso_1_code": null, - "iso_3_code": "rmn", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4180", - "scripts": [ - "Latn", - "Cyrl", - "Grek" - ], - "own_tokenizer": false - } - ], - "node_i": "4179", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Northern", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Romani, Carpathian", - "iso_1_code": null, - "iso_3_code": "rmc", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4182", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Romani, Kalo Finnish", - "iso_1_code": null, - "iso_3_code": "rmf", - "tokenizers": {}, - "children": [], - "node_i": "4183", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Romani, Baltic", - "iso_1_code": null, - "iso_3_code": "rml", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4184", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Romani, Sinte", - "iso_1_code": null, - "iso_3_code": "rmo", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4185", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Romani, Welsh", - "iso_1_code": null, - "iso_3_code": "rmw", - "tokenizers": {}, - "children": [], - "node_i": "4186", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4181", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Vlax", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Romani, Vlax", - "iso_1_code": null, - "iso_3_code": "rmy", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4188", - "scripts": [ - "Latn", - "Cyrl" - ], - "own_tokenizer": false - } - ], - "node_i": "4187", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4178", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Unclassified", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"ne\")", - "original_lang_name": "nepali", - "original_lang_code": "nep", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Sonha", - "iso_1_code": null, - "iso_3_code": "soi", - "tokenizers": {}, - "children": [], - "node_i": "4190", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Mewati", - "iso_1_code": null, - "iso_3_code": "wtm", - "tokenizers": {}, - "children": [], - "node_i": "4191", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4189", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4083", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4070", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Outer Languages", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Eastern", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Bengali-Assamese", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Assamese", - "iso_1_code": "as", - "iso_3_code": "asm", - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"as\")", - "original_lang_name": "assamese", - "original_lang_code": "asm", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "IndicNLPTokenizer(\"as\")", - "original_lang_name": "assamese", - "original_lang_code": "asm", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4195", - "scripts": [ - "Beng", - "Latn" - ], - "own_tokenizer": true - }, - { - "name": "Bengali", - "iso_1_code": "bn", - "iso_3_code": "ben", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4196", - "scripts": [ - "Beng", - "Latn" - ], - "own_tokenizer": true - }, - { - "name": "Bishnupuriya", - "iso_1_code": null, - "iso_3_code": "bpy", - "tokenizers": { - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4197", - "scripts": [ - "Beng" - ], - "own_tokenizer": false - }, - { - "name": "Chakma", - "iso_1_code": null, - "iso_3_code": "ccp", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4198", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Chittagonian", - "iso_1_code": null, - "iso_3_code": "ctg", - "tokenizers": {}, - "children": [], - "node_i": "4199", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Hajong", - "iso_1_code": null, - "iso_3_code": "haj", - "tokenizers": {}, - "children": [], - "node_i": "4200", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Halbi", - "iso_1_code": null, - "iso_3_code": "hlb", - "tokenizers": {}, - "children": [], - "node_i": "4201", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Kurmukar", - "iso_1_code": null, - "iso_3_code": "kfv", - "tokenizers": {}, - "children": [], - "node_i": "4202", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Kharia Thar", - "iso_1_code": null, - "iso_3_code": "ksy", - "tokenizers": {}, - "children": [], - "node_i": "4203", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Kewat", - "iso_1_code": null, - "iso_3_code": "kyv", - "tokenizers": {}, - "children": [], - "node_i": "4204", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Lodhi", - "iso_1_code": null, - "iso_3_code": "lbm", - "tokenizers": {}, - "children": [], - "node_i": "4205", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Mal Paharia", - "iso_1_code": null, - "iso_3_code": "mkb", - "tokenizers": {}, - "children": [], - "node_i": "4206", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Nahari", - "iso_1_code": null, - "iso_3_code": "nhh", - "tokenizers": {}, - "children": [], - "node_i": "4207", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Rohingya", - "iso_1_code": null, - "iso_3_code": "rhg", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4208", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Rajbanshi", - "iso_1_code": null, - "iso_3_code": "rjs", - "tokenizers": { - "Deva": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4209", - "scripts": [ - "Deva" - ], - "own_tokenizer": false - }, - { - "name": "Rangpuri", - "iso_1_code": null, - "iso_3_code": "rkt", - "tokenizers": {}, - "children": [], - "node_i": "4210", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Sylheti", - "iso_1_code": null, - "iso_3_code": "syl", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4211", - "scripts": [ - "Latn", - "Beng" - ], - "own_tokenizer": false - }, - { - "name": "Tangchangya", - "iso_1_code": null, - "iso_3_code": "tnv", - "tokenizers": {}, - "children": [], - "node_i": "4212", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Mirgan", - "iso_1_code": null, - "iso_3_code": "zrg", - "tokenizers": {}, - "children": [], - "node_i": "4213", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4194", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Bihari", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Bhojpuri", - "iso_1_code": null, - "iso_3_code": "bho", - "tokenizers": { - "Deva": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4215", - "scripts": [ - "Deva" - ], - "own_tokenizer": false - }, - { - "name": "Hindustani, Sarnami", - "iso_1_code": null, - "iso_3_code": "hns", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4216", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Kudmali", - "iso_1_code": null, - "iso_3_code": "kyw", - "tokenizers": {}, - "children": [], - "node_i": "4217", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Magahi", - "iso_1_code": null, - "iso_3_code": "mag", - "tokenizers": { - "Deva": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4218", - "scripts": [ - "Deva" - ], - "own_tokenizer": false - }, - { - "name": "Maithili", - "iso_1_code": null, - "iso_3_code": "mai", - "tokenizers": { - "Deva": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4219", - "scripts": [ - "Deva" - ], - "own_tokenizer": false - }, - { - "name": "Majhi", - "iso_1_code": null, - "iso_3_code": "mjz", - "tokenizers": {}, - "children": [], - "node_i": "4220", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Sadri", - "iso_1_code": null, - "iso_3_code": "sck", - "tokenizers": { - "Deva": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4221", - "scripts": [ - "Deva" - ], - "own_tokenizer": false - }, - { - "name": "Sadri, Oraon", - "iso_1_code": null, - "iso_3_code": "sdr", - "tokenizers": {}, - "children": [], - "node_i": "4222", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Surjapuri", - "iso_1_code": null, - "iso_3_code": "sjp", - "tokenizers": {}, - "children": [], - "node_i": "4223", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Musasa", - "iso_1_code": null, - "iso_3_code": "smm", - "tokenizers": {}, - "children": [], - "node_i": "4224", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Panchpargania", - "iso_1_code": null, - "iso_3_code": "tdb", - "tokenizers": {}, - "children": [], - "node_i": "4225", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Bajjika", - "iso_1_code": null, - "iso_3_code": "vjk", - "tokenizers": {}, - "children": [], - "node_i": "4226", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4214", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Oriya", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Bodo Parja", - "iso_1_code": null, - "iso_3_code": "bdv", - "tokenizers": {}, - "children": [], - "node_i": "4228", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Bhatri", - "iso_1_code": null, - "iso_3_code": "bgw", - "tokenizers": {}, - "children": [], - "node_i": "4229", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Bhunjia", - "iso_1_code": null, - "iso_3_code": "bhu", - "tokenizers": {}, - "children": [], - "node_i": "4230", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Desiya", - "iso_1_code": null, - "iso_3_code": "dso", - "tokenizers": {}, - "children": [], - "node_i": "4231", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Kupia", - "iso_1_code": null, - "iso_3_code": "key", - "tokenizers": {}, - "children": [], - "node_i": "4232", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Oriya, Adivasi", - "iso_1_code": null, - "iso_3_code": "ort", - "tokenizers": {}, - "children": [], - "node_i": "4233", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Odia", - "iso_1_code": "or", - "iso_3_code": "ory", - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - } - }, - "children": [], - "node_i": "4234", - "scripts": [ - "Latn", - "Orya" - ], - "own_tokenizer": true - }, - { - "name": "Reli", - "iso_1_code": null, - "iso_3_code": "rei", - "tokenizers": {}, - "children": [], - "node_i": "4235", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Sambalpuri", - "iso_1_code": "or", - "iso_3_code": "spv", - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - } - }, - "children": [], - "node_i": "4236", - "scripts": [], - "own_tokenizer": true - } - ], - "node_i": "4227", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Unclassified", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Angika", - "iso_1_code": null, - "iso_3_code": "anp", - "tokenizers": { - "Deva": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4238", - "scripts": [ - "Deva" - ], - "own_tokenizer": false - }, - { - "name": "Bote", - "iso_1_code": null, - "iso_3_code": "bmj", - "tokenizers": {}, - "children": [], - "node_i": "4239", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Buksa", - "iso_1_code": null, - "iso_3_code": "tkb", - "tokenizers": {}, - "children": [], - "node_i": "4240", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4237", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4193", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Northwestern", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Dardic", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Chitral", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Khowar", - "iso_1_code": null, - "iso_3_code": "khw", - "tokenizers": {}, - "children": [], - "node_i": "4244", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Kalasha", - "iso_1_code": null, - "iso_3_code": "kls", - "tokenizers": {}, - "children": [], - "node_i": "4245", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4243", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Kashmiri", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Kashmiri", - "iso_1_code": "ks", - "iso_3_code": "kas", - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4247", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "own_tokenizer": false - } - ], - "node_i": "4246", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Kohistani", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Bateri", - "iso_1_code": null, - "iso_3_code": "btv", - "tokenizers": {}, - "children": [], - "node_i": "4249", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Chilisso", - "iso_1_code": null, - "iso_3_code": "clh", - "tokenizers": {}, - "children": [], - "node_i": "4250", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Gawri", - "iso_1_code": null, - "iso_3_code": "gwc", - "tokenizers": {}, - "children": [], - "node_i": "4251", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Gowro", - "iso_1_code": null, - "iso_3_code": "gwf", - "tokenizers": {}, - "children": [], - "node_i": "4252", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Kohistani, Indus", - "iso_1_code": null, - "iso_3_code": "mvy", - "tokenizers": {}, - "children": [], - "node_i": "4253", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Mankiyali", - "iso_1_code": null, - "iso_3_code": "nlm", - "tokenizers": {}, - "children": [], - "node_i": "4254", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Tirahi", - "iso_1_code": null, - "iso_3_code": "tra", - "tokenizers": {}, - "children": [], - "node_i": "4255", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Torwali", - "iso_1_code": null, - "iso_3_code": "trw", - "tokenizers": {}, - "children": [], - "node_i": "4256", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Degano", - "iso_1_code": null, - "iso_3_code": "wsv", - "tokenizers": {}, - "children": [], - "node_i": "4257", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4248", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Kunar", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Dameli", - "iso_1_code": null, - "iso_3_code": "dml", - "tokenizers": {}, - "children": [], - "node_i": "4259", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Gawar-Bati", - "iso_1_code": null, - "iso_3_code": "gwt", - "tokenizers": {}, - "children": [], - "node_i": "4260", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Grangali", - "iso_1_code": null, - "iso_3_code": "nli", - "tokenizers": {}, - "children": [], - "node_i": "4261", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Shumashti", - "iso_1_code": null, - "iso_3_code": "sts", - "tokenizers": {}, - "children": [], - "node_i": "4262", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4258", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Pashai", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Pashai, Northeast", - "iso_1_code": null, - "iso_3_code": "aee", - "tokenizers": {}, - "children": [], - "node_i": "4264", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Pashai, Northwest", - "iso_1_code": null, - "iso_3_code": "glh", - "tokenizers": {}, - "children": [], - "node_i": "4265", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Pashai, Southwest", - "iso_1_code": null, - "iso_3_code": "psh", - "tokenizers": {}, - "children": [], - "node_i": "4266", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Pashai, Southeast", - "iso_1_code": null, - "iso_3_code": "psi", - "tokenizers": {}, - "children": [], - "node_i": "4267", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4263", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Shina", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Brokskat", - "iso_1_code": null, - "iso_3_code": "bkk", - "tokenizers": {}, - "children": [], - "node_i": "4269", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Palula", - "iso_1_code": null, - "iso_3_code": "phl", - "tokenizers": {}, - "children": [], - "node_i": "4270", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Shina, Kohistani", - "iso_1_code": null, - "iso_3_code": "plk", - "tokenizers": {}, - "children": [], - "node_i": "4271", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Shina", - "iso_1_code": null, - "iso_3_code": "scl", - "tokenizers": {}, - "children": [], - "node_i": "4272", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Savi", - "iso_1_code": null, - "iso_3_code": "sdg", - "tokenizers": {}, - "children": [], - "node_i": "4273", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Kundal Shahi", - "iso_1_code": null, - "iso_3_code": "shd", - "tokenizers": {}, - "children": [], - "node_i": "4274", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Ushojo", - "iso_1_code": null, - "iso_3_code": "ush", - "tokenizers": {}, - "children": [], - "node_i": "4275", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Kalkoti", - "iso_1_code": null, - "iso_3_code": "xka", - "tokenizers": {}, - "children": [], - "node_i": "4276", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4268", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4242", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Sindhi", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Jadgali", - "iso_1_code": null, - "iso_3_code": "jdg", - "tokenizers": {}, - "children": [], - "node_i": "4278", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Kacchi", - "iso_1_code": null, - "iso_3_code": "kfr", - "tokenizers": {}, - "children": [], - "node_i": "4279", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Lasi", - "iso_1_code": null, - "iso_3_code": "lss", - "tokenizers": {}, - "children": [], - "node_i": "4280", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Luwati", - "iso_1_code": null, - "iso_3_code": "luv", - "tokenizers": {}, - "children": [], - "node_i": "4281", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Sindhi Bhil", - "iso_1_code": null, - "iso_3_code": "sbn", - "tokenizers": {}, - "children": [], - "node_i": "4282", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Sindhi", - "iso_1_code": "sd", - "iso_3_code": "snd", - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4283", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "own_tokenizer": true - } - ], - "node_i": "4277", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4241", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Southern", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Marathi", - "iso_1_code": "mr", - "iso_3_code": "mar", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4285", - "scripts": [ - "Deva", - "Latn" - ], - "own_tokenizer": true - }, - { - "name": "Konkani", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"kok\")", - "original_lang_name": "konkani", - "original_lang_code": "kok", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "IndicNLPTokenizer(\"kok\")", - "original_lang_name": "konkani", - "original_lang_code": "kok", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Konkani, Goan", - "iso_1_code": null, - "iso_3_code": "gom", - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"kok\")", - "original_lang_name": "konkani", - "original_lang_code": "kok", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "IndicNLPTokenizer(\"kok\")", - "original_lang_name": "konkani", - "original_lang_code": "kok", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - } - }, - "children": [], - "node_i": "4287", - "scripts": [ - "Latn", - "Deva" - ], - "own_tokenizer": true - }, - { - "name": "Kukna", - "iso_1_code": null, - "iso_3_code": "kex", - "tokenizers": { - "Deva": { - "full_object": "IndicNLPTokenizer(\"kok\")", - "original_lang_name": "konkani", - "original_lang_code": "kok", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - } - }, - "children": [], - "node_i": "4288", - "scripts": [ - "Deva" - ], - "own_tokenizer": false - }, - { - "name": "Katkari", - "iso_1_code": null, - "iso_3_code": "kfu", - "tokenizers": {}, - "children": [], - "node_i": "4289", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Konkani", - "iso_1_code": null, - "iso_3_code": "knn", - "tokenizers": { - "Latn": { - "full_object": "IndicNLPTokenizer(\"kok\")", - "original_lang_name": "konkani", - "original_lang_code": "kok", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "IndicNLPTokenizer(\"kok\")", - "original_lang_name": "konkani", - "original_lang_code": "kok", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - } - }, - "children": [], - "node_i": "4290", - "scripts": [], - "own_tokenizer": true - }, - { - "name": "Phudagi", - "iso_1_code": null, - "iso_3_code": "phd", - "tokenizers": {}, - "children": [], - "node_i": "4291", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Samvedi", - "iso_1_code": null, - "iso_3_code": "smv", - "tokenizers": {}, - "children": [], - "node_i": "4292", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Varli", - "iso_1_code": null, - "iso_3_code": "vav", - "tokenizers": {}, - "children": [], - "node_i": "4293", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4286", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Sinhalese-Maldivian", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Latn": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Maldivian", - "iso_1_code": "dv", - "iso_3_code": "div", - "tokenizers": {}, - "children": [], - "node_i": "4295", - "scripts": [ - "Thaa" - ], - "own_tokenizer": false - }, - { - "name": "Sinhala", - "iso_1_code": "si", - "iso_3_code": "sin", - "tokenizers": { - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4296", - "scripts": [ - "Sinh" - ], - "own_tokenizer": true - }, - { - "name": "Veddah", - "iso_1_code": null, - "iso_3_code": "ved", - "tokenizers": {}, - "children": [], - "node_i": "4297", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4294", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Unclassified", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"mr\")", - "original_lang_name": "marathi", - "original_lang_code": "mar", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "IndicNLPTokenizer(\"sd\")", - "original_lang_name": "sindhi", - "original_lang_code": "snd", - "scripts": [ - "Latn", - "Arab", - "Deva" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Bhalay", - "iso_1_code": null, - "iso_3_code": "bhx", - "tokenizers": {}, - "children": [], - "node_i": "4299", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Deccan", - "iso_1_code": null, - "iso_3_code": "dcc", - "tokenizers": {}, - "children": [], - "node_i": "4300", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Gowlan", - "iso_1_code": null, - "iso_3_code": "goj", - "tokenizers": {}, - "children": [], - "node_i": "4301", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Varhadi-Nagpuri", - "iso_1_code": null, - "iso_3_code": "vah", - "tokenizers": {}, - "children": [], - "node_i": "4302", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4298", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4284", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4192", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Tharu", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Tharu, Rana", - "iso_1_code": null, - "iso_3_code": "thr", - "tokenizers": {}, - "children": [], - "node_i": "4304", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Tharu, Kathariya", - "iso_1_code": null, - "iso_3_code": "tkt", - "tokenizers": {}, - "children": [], - "node_i": "4305", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Eastern Tharu", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Tharu, Central", - "iso_1_code": null, - "iso_3_code": "the", - "tokenizers": {}, - "children": [], - "node_i": "4307", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Tharu, Dangaura", - "iso_1_code": null, - "iso_3_code": "thl", - "tokenizers": { - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4308", - "scripts": [ - "Deva" - ], - "own_tokenizer": false - }, - { - "name": "Tharu, Mid-Eastern", - "iso_1_code": null, - "iso_3_code": "thq", - "tokenizers": {}, - "children": [], - "node_i": "4309", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4306", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4303", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Unclassified", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Andh", - "iso_1_code": null, - "iso_3_code": "anr", - "tokenizers": {}, - "children": [], - "node_i": "4311", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Bazigar", - "iso_1_code": null, - "iso_3_code": "bfr", - "tokenizers": {}, - "children": [], - "node_i": "4312", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Chinali", - "iso_1_code": null, - "iso_3_code": "cih", - "tokenizers": {}, - "children": [], - "node_i": "4313", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Danuwar", - "iso_1_code": null, - "iso_3_code": "dhw", - "tokenizers": {}, - "children": [], - "node_i": "4314", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Darai", - "iso_1_code": null, - "iso_3_code": "dry", - "tokenizers": {}, - "children": [], - "node_i": "4315", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Dewas Rai", - "iso_1_code": null, - "iso_3_code": "dwz", - "tokenizers": {}, - "children": [], - "node_i": "4316", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Kanjari", - "iso_1_code": null, - "iso_3_code": "kft", - "tokenizers": {}, - "children": [], - "node_i": "4317", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Kumal", - "iso_1_code": null, - "iso_3_code": "kra", - "tokenizers": {}, - "children": [], - "node_i": "4318", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Lohar, Lahul", - "iso_1_code": null, - "iso_3_code": "lhl", - "tokenizers": {}, - "children": [], - "node_i": "4319", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Memoni", - "iso_1_code": null, - "iso_3_code": "mby", - "tokenizers": {}, - "children": [], - "node_i": "4320", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Oadki", - "iso_1_code": null, - "iso_3_code": "odk", - "tokenizers": {}, - "children": [], - "node_i": "4321", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Pali", - "iso_1_code": "pi", - "iso_3_code": "pli", - "tokenizers": {}, - "children": [], - "node_i": "4322", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Vaagri Booli", - "iso_1_code": null, - "iso_3_code": "vaa", - "tokenizers": {}, - "children": [], - "node_i": "4323", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4310", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Western Hindi", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Bundeli", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Bundeli", - "iso_1_code": null, - "iso_3_code": "bns", - "tokenizers": {}, - "children": [], - "node_i": "4326", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4325", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Hindustani", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Hindi", - "iso_1_code": "hi", - "iso_3_code": "hin", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4328", - "scripts": [ - "Latn", - "Deva" - ], - "own_tokenizer": true - }, - { - "name": "Urdu", - "iso_1_code": "ur", - "iso_3_code": "urd", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4329", - "scripts": [ - "Latn", - "Arab" - ], - "own_tokenizer": true - }, - { - "name": "Sansi", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Kabutra", - "iso_1_code": null, - "iso_3_code": "kbu", - "tokenizers": {}, - "children": [], - "node_i": "4331", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Sansi", - "iso_1_code": null, - "iso_3_code": "ssi", - "tokenizers": {}, - "children": [], - "node_i": "4332", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4330", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4327", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Unclassified", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"ur\")", - "original_lang_name": "urdu", - "original_lang_code": "urd", - "scripts": [ - "Latn", - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Haryanvi", - "iso_1_code": null, - "iso_3_code": "bgc", - "tokenizers": {}, - "children": [], - "node_i": "4334", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Bhaya", - "iso_1_code": null, - "iso_3_code": "bhe", - "tokenizers": {}, - "children": [], - "node_i": "4335", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Kanauji", - "iso_1_code": null, - "iso_3_code": "bjj", - "tokenizers": {}, - "children": [], - "node_i": "4336", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Braj Bhasha", - "iso_1_code": null, - "iso_3_code": "bra", - "tokenizers": {}, - "children": [], - "node_i": "4337", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Ghera", - "iso_1_code": null, - "iso_3_code": "ghr", - "tokenizers": {}, - "children": [], - "node_i": "4338", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Gowli", - "iso_1_code": null, - "iso_3_code": "gok", - "tokenizers": {}, - "children": [], - "node_i": "4339", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4333", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4324", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4068", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Iranian", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Avestan", - "iso_1_code": "ae", - "iso_3_code": "ave", - "tokenizers": {}, - "children": [], - "node_i": "4341", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Eastern", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Northeastern", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Ossetic", - "iso_1_code": "os", - "iso_3_code": "oss", - "tokenizers": { - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4344", - "scripts": [ - "Cyrl" - ], - "own_tokenizer": false - }, - { - "name": "Yagnobi", - "iso_1_code": null, - "iso_3_code": "yai", - "tokenizers": {}, - "children": [], - "node_i": "4345", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Yassic", - "iso_1_code": null, - "iso_3_code": "ysc", - "tokenizers": {}, - "children": [], - "node_i": "4346", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4343", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Southeastern", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Pamir", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Ishkashimi", - "iso_1_code": null, - "iso_3_code": "isk", - "tokenizers": {}, - "children": [], - "node_i": "4349", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Munji", - "iso_1_code": null, - "iso_3_code": "mnj", - "tokenizers": {}, - "children": [], - "node_i": "4350", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Sanglechi", - "iso_1_code": null, - "iso_3_code": "sgy", - "tokenizers": {}, - "children": [], - "node_i": "4351", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Wakhi", - "iso_1_code": null, - "iso_3_code": "wbl", - "tokenizers": {}, - "children": [], - "node_i": "4352", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Yadgha", - "iso_1_code": null, - "iso_3_code": "ydg", - "tokenizers": {}, - "children": [], - "node_i": "4353", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Shugni-Yazgulami", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Shughni", - "iso_1_code": null, - "iso_3_code": "sgh", - "tokenizers": { - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4355", - "scripts": [ - "Cyrl" - ], - "own_tokenizer": false - }, - { - "name": "Sarikoli", - "iso_1_code": null, - "iso_3_code": "srh", - "tokenizers": {}, - "children": [], - "node_i": "4356", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Yazghulami", - "iso_1_code": null, - "iso_3_code": "yah", - "tokenizers": {}, - "children": [], - "node_i": "4357", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4354", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4348", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Pashto", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Pashto, Southern", - "iso_1_code": "ps", - "iso_3_code": "pbt", - "tokenizers": { - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - } - }, - "children": [], - "node_i": "4359", - "scripts": [ - "Arab" - ], - "own_tokenizer": false - }, - { - "name": "Pashto, Northern", - "iso_1_code": "ps", - "iso_3_code": "pbu", - "tokenizers": {}, - "children": [], - "node_i": "4360", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Pashto, Central", - "iso_1_code": "ps", - "iso_3_code": "pst", - "tokenizers": {}, - "children": [], - "node_i": "4361", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Waneci", - "iso_1_code": null, - "iso_3_code": "wne", - "tokenizers": {}, - "children": [], - "node_i": "4362", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4358", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4347", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4342", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Western", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Northwestern", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Balochi", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Balochi, Southern", - "iso_1_code": null, - "iso_3_code": "bcc", - "tokenizers": { - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - } - }, - "children": [], - "node_i": "4366", - "scripts": [ - "Arab" - ], - "own_tokenizer": false - }, - { - "name": "Balochi, Western", - "iso_1_code": null, - "iso_3_code": "bgn", - "tokenizers": {}, - "children": [], - "node_i": "4367", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Balochi, Eastern", - "iso_1_code": null, - "iso_3_code": "bgp", - "tokenizers": {}, - "children": [], - "node_i": "4368", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Bashkardi", - "iso_1_code": null, - "iso_3_code": "bsg", - "tokenizers": {}, - "children": [], - "node_i": "4369", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Koroshi", - "iso_1_code": null, - "iso_3_code": "ktl", - "tokenizers": {}, - "children": [], - "node_i": "4370", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4365", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Caspian", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Gilaki", - "iso_1_code": null, - "iso_3_code": "glk", - "tokenizers": { - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - } - }, - "children": [], - "node_i": "4372", - "scripts": [ - "Arab" - ], - "own_tokenizer": false - }, - { - "name": "Mazandarani", - "iso_1_code": null, - "iso_3_code": "mzn", - "tokenizers": { - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - } - }, - "children": [], - "node_i": "4373", - "scripts": [ - "Arab" - ], - "own_tokenizer": false - }, - { - "name": "Shahmirzadi", - "iso_1_code": null, - "iso_3_code": "srz", - "tokenizers": {}, - "children": [], - "node_i": "4374", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4371", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Central Iran", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Ashtiani", - "iso_1_code": null, - "iso_3_code": "atn", - "tokenizers": {}, - "children": [], - "node_i": "4376", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Dari, Zoroastrian", - "iso_1_code": null, - "iso_3_code": "gbz", - "tokenizers": {}, - "children": [], - "node_i": "4377", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Gazi", - "iso_1_code": null, - "iso_3_code": "gzi", - "tokenizers": {}, - "children": [], - "node_i": "4378", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Khunsari", - "iso_1_code": null, - "iso_3_code": "kfm", - "tokenizers": {}, - "children": [], - "node_i": "4379", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Natanzi", - "iso_1_code": null, - "iso_3_code": "ntz", - "tokenizers": {}, - "children": [], - "node_i": "4380", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Nayini", - "iso_1_code": null, - "iso_3_code": "nyq", - "tokenizers": {}, - "children": [], - "node_i": "4381", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Parsi-Dari", - "iso_1_code": null, - "iso_3_code": "prd", - "tokenizers": {}, - "children": [], - "node_i": "4382", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Sivandi", - "iso_1_code": null, - "iso_3_code": "siy", - "tokenizers": {}, - "children": [], - "node_i": "4383", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Soi", - "iso_1_code": null, - "iso_3_code": "soj", - "tokenizers": {}, - "children": [], - "node_i": "4384", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Vafsi", - "iso_1_code": null, - "iso_3_code": "vaf", - "tokenizers": {}, - "children": [], - "node_i": "4385", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4375", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Kurdish", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Grek": { + "full_object": "SpaCyTokenizer(\"el\")", + "original_lang_name": "greek", + "original_lang_code": "ell", + "scripts": [ + "Grek" ], "class_name": "SpaCyTokenizer", "macrolanguage": false @@ -20109,42 +4962,30 @@ }, "children": [ { - "name": "Kurdish, Central", - "iso_1_code": "ku", - "iso_3_code": "ckb", + "name": "Balkan", + "iso_1_code": null, + "iso_3_code": null, "tokenizers": { - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", + "Latn": { + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", "scripts": [ - "Arab" + "Latn", + "Gujr" ], "class_name": "SpaCyTokenizer", - "macrolanguage": true - } - }, - "children": [], - "node_i": "4387", - "scripts": [ - "Arab" - ], - "own_tokenizer": false - }, - { - "name": "Kurdish, Northern", - "iso_1_code": "ku", - "iso_3_code": "kmr", - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", + "macrolanguage": false + }, + "Gujr": { + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", "scripts": [ "Latn", - "Cyrl" + "Gujr" ], - "class_name": "StanzaTokenizer", + "class_name": "SpaCyTokenizer", "macrolanguage": false }, "Cyrl": { @@ -20155,457 +4996,466 @@ "Latn", "Cyrl" ], - "class_name": "StanzaTokenizer", - "macrolanguage": false + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Grek": { + "full_object": "SpaCyTokenizer(\"el\")", + "original_lang_name": "greek", + "original_lang_code": "ell", + "scripts": [ + "Grek" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Romani, Balkan", + "iso_1_code": null, + "iso_3_code": "rmn", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", + "scripts": [ + "Latn", + "Gujr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Grek": { + "full_object": "SpaCyTokenizer(\"el\")", + "original_lang_name": "greek", + "original_lang_code": "ell", + "scripts": [ + "Grek" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4180", + "scripts": [ + "Latn", + "Cyrl", + "Grek" + ], + "own_tokenizer": false } - }, - "children": [], - "node_i": "4388", - "scripts": [ - "Latn", - "Cyrl" ], - "own_tokenizer": true + "node_i": "4179", + "scripts": [], + "own_tokenizer": false }, { - "name": "Laki", + "name": "Northern", "iso_1_code": null, - "iso_3_code": "lki", + "iso_3_code": null, "tokenizers": { - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", + "Latn": { + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", "scripts": [ - "Arab" + "Latn", + "Gujr" ], "class_name": "SpaCyTokenizer", - "macrolanguage": true - } - }, - "children": [], - "node_i": "4389", - "scripts": [ - "Arab" - ], - "own_tokenizer": false - }, - { - "name": "Kurdish, Southern", - "iso_1_code": "ku", - "iso_3_code": "sdh", - "tokenizers": { - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", + "macrolanguage": false + }, + "Gujr": { + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", "scripts": [ - "Arab" + "Latn", + "Gujr" ], "class_name": "SpaCyTokenizer", - "macrolanguage": true + "macrolanguage": false } }, - "children": [], - "node_i": "4390", - "scripts": [ - "Arab" - ], - "own_tokenizer": false - } - ], - "node_i": "4386", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Ormuri-Parachi", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" + "children": [ + { + "name": "Romani, Carpathian", + "iso_1_code": null, + "iso_3_code": "rmc", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", + "scripts": [ + "Latn", + "Gujr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4182", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Romani, Kalo Finnish", + "iso_1_code": null, + "iso_3_code": "rmf", + "tokenizers": {}, + "children": [], + "node_i": "4183", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Romani, Baltic", + "iso_1_code": null, + "iso_3_code": "rml", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", + "scripts": [ + "Latn", + "Gujr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4184", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Romani, Sinte", + "iso_1_code": null, + "iso_3_code": "rmo", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", + "scripts": [ + "Latn", + "Gujr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4185", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Romani, Welsh", + "iso_1_code": null, + "iso_3_code": "rmw", + "tokenizers": {}, + "children": [], + "node_i": "4186", + "scripts": [], + "own_tokenizer": false + } ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Ormuri", - "iso_1_code": null, - "iso_3_code": "oru", - "tokenizers": {}, - "children": [], - "node_i": "4392", + "node_i": "4181", "scripts": [], "own_tokenizer": false }, { - "name": "Parachi", + "name": "Vlax", "iso_1_code": null, - "iso_3_code": "prc", - "tokenizers": {}, - "children": [], - "node_i": "4393", + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", + "scripts": [ + "Latn", + "Gujr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Gujr": { + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", + "scripts": [ + "Latn", + "Gujr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Romani, Vlax", + "iso_1_code": null, + "iso_3_code": "rmy", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"gu\")", + "original_lang_name": "gujarati", + "original_lang_code": "guj", + "scripts": [ + "Latn", + "Gujr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4188", + "scripts": [ + "Latn", + "Cyrl" + ], + "own_tokenizer": false + } + ], + "node_i": "4187", "scripts": [], "own_tokenizer": false } ], - "node_i": "4391", + "node_i": "4178", "scripts": [], "own_tokenizer": false }, { - "name": "Semnani", + "name": "Unclassified", "iso_1_code": null, "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, + "tokenizers": {}, "children": [ { - "name": "Lasgerdi", - "iso_1_code": null, - "iso_3_code": "lsa", - "tokenizers": {}, - "children": [], - "node_i": "4395", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Sangisari", - "iso_1_code": null, - "iso_3_code": "sgr", - "tokenizers": {}, - "children": [], - "node_i": "4396", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Semnani", + "name": "Sonha", "iso_1_code": null, - "iso_3_code": "smy", + "iso_3_code": "soi", "tokenizers": {}, "children": [], - "node_i": "4397", + "node_i": "4190", "scripts": [], "own_tokenizer": false }, { - "name": "Sorkhei", + "name": "Mewati", "iso_1_code": null, - "iso_3_code": "sqo", + "iso_3_code": "wtm", "tokenizers": {}, "children": [], - "node_i": "4398", + "node_i": "4191", "scripts": [], "own_tokenizer": false } ], - "node_i": "4394", + "node_i": "4189", "scripts": [], "own_tokenizer": false + } + ], + "node_i": "4083", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4070", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Outer Languages", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"bn\")", + "original_lang_name": "bengali", + "original_lang_code": "ben", + "scripts": [ + "Latn", + "Beng" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Beng": { + "full_object": "SpaCyTokenizer(\"bn\")", + "original_lang_name": "bengali", + "original_lang_code": "ben", + "scripts": [ + "Latn", + "Beng" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Orya": { + "full_object": "IndicNLPTokenizer(\"or\")", + "original_lang_name": "oriya", + "original_lang_code": "ori", + "scripts": [ + "Latn", + "Orya" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": true + }, + "Deva": { + "full_object": "SpaCyTokenizer(\"mr\")", + "original_lang_name": "marathi", + "original_lang_code": "mar", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "IndicNLPTokenizer(\"sd\")", + "original_lang_name": "sindhi", + "original_lang_code": "snd", + "scripts": [ + "Latn", + "Deva", + "Arab" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": false + }, + "Sinh": { + "full_object": "SpaCyTokenizer(\"si\")", + "original_lang_name": "sinhala", + "original_lang_code": "sin", + "scripts": [ + "Sinh" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Eastern", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"bn\")", + "original_lang_name": "bengali", + "original_lang_code": "ben", + "scripts": [ + "Latn", + "Beng" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Beng": { + "full_object": "SpaCyTokenizer(\"bn\")", + "original_lang_name": "bengali", + "original_lang_code": "ben", + "scripts": [ + "Latn", + "Beng" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Orya": { + "full_object": "IndicNLPTokenizer(\"or\")", + "original_lang_name": "oriya", + "original_lang_code": "ori", + "scripts": [ + "Latn", + "Orya" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": true }, + "Deva": { + "full_object": "SpaCyTokenizer(\"mr\")", + "original_lang_name": "marathi", + "original_lang_code": "mar", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ { - "name": "Talysh", + "name": "Bengali-Assamese", "iso_1_code": null, "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", + "full_object": "SpaCyTokenizer(\"bn\")", + "original_lang_name": "bengali", + "original_lang_code": "ben", "scripts": [ "Latn", - "Gujr" + "Beng" ], "class_name": "SpaCyTokenizer", "macrolanguage": false }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, "Beng": { "full_object": "SpaCyTokenizer(\"bn\")", "original_lang_name": "bengali", @@ -20617,485 +5467,382 @@ "class_name": "SpaCyTokenizer", "macrolanguage": false }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", + "Deva": { + "full_object": "SpaCyTokenizer(\"mr\")", + "original_lang_name": "marathi", + "original_lang_code": "mar", "scripts": [ "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" + "Deva" ], "class_name": "SpaCyTokenizer", "macrolanguage": false } }, - "children": [ - { - "name": "Alviri-Vidari", - "iso_1_code": null, - "iso_3_code": "avd", - "tokenizers": {}, - "children": [], - "node_i": "4400", - "scripts": [], - "own_tokenizer": false - }, + "children": [ { - "name": "Eshtehardi", - "iso_1_code": null, - "iso_3_code": "esh", - "tokenizers": {}, + "name": "Assamese", + "iso_1_code": "as", + "iso_3_code": "asm", + "tokenizers": { + "Latn": { + "full_object": "IndicNLPTokenizer(\"as\")", + "original_lang_name": "assamese", + "original_lang_code": "asm", + "scripts": [ + "Latn", + "Beng" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": false + }, + "Beng": { + "full_object": "IndicNLPTokenizer(\"as\")", + "original_lang_name": "assamese", + "original_lang_code": "asm", + "scripts": [ + "Latn", + "Beng" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": false + } + }, "children": [], - "node_i": "4401", - "scripts": [], - "own_tokenizer": false + "node_i": "4195", + "scripts": [ + "Beng", + "Latn" + ], + "own_tokenizer": true }, { - "name": "Gozarkhani", - "iso_1_code": null, - "iso_3_code": "goz", - "tokenizers": {}, + "name": "Bengali", + "iso_1_code": "bn", + "iso_3_code": "ben", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"bn\")", + "original_lang_name": "bengali", + "original_lang_code": "ben", + "scripts": [ + "Latn", + "Beng" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Beng": { + "full_object": "SpaCyTokenizer(\"bn\")", + "original_lang_name": "bengali", + "original_lang_code": "ben", + "scripts": [ + "Latn", + "Beng" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, "children": [], - "node_i": "4402", - "scripts": [], - "own_tokenizer": false + "node_i": "4196", + "scripts": [ + "Beng", + "Latn" + ], + "own_tokenizer": true }, { - "name": "Harzani", + "name": "Bishnupuriya", "iso_1_code": null, - "iso_3_code": "hrz", - "tokenizers": {}, + "iso_3_code": "bpy", + "tokenizers": { + "Beng": { + "full_object": "SpaCyTokenizer(\"bn\")", + "original_lang_name": "bengali", + "original_lang_code": "ben", + "scripts": [ + "Latn", + "Beng" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, "children": [], - "node_i": "4403", - "scripts": [], + "node_i": "4197", + "scripts": [ + "Beng" + ], "own_tokenizer": false }, { - "name": "Karingani", + "name": "Chakma", "iso_1_code": null, - "iso_3_code": "kgn", - "tokenizers": {}, + "iso_3_code": "ccp", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"bn\")", + "original_lang_name": "bengali", + "original_lang_code": "ben", + "scripts": [ + "Latn", + "Beng" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, "children": [], - "node_i": "4404", - "scripts": [], + "node_i": "4198", + "scripts": [ + "Latn" + ], "own_tokenizer": false }, { - "name": "Koresh-e Rostam", + "name": "Chittagonian", "iso_1_code": null, - "iso_3_code": "okh", + "iso_3_code": "ctg", "tokenizers": {}, "children": [], - "node_i": "4405", + "node_i": "4199", "scripts": [], "own_tokenizer": false }, { - "name": "Razajerdi", + "name": "Hajong", "iso_1_code": null, - "iso_3_code": "rat", + "iso_3_code": "haj", "tokenizers": {}, "children": [], - "node_i": "4406", + "node_i": "4200", "scripts": [], "own_tokenizer": false }, { - "name": "Rudbari", + "name": "Halbi", "iso_1_code": null, - "iso_3_code": "rdb", + "iso_3_code": "hlb", "tokenizers": {}, "children": [], - "node_i": "4407", + "node_i": "4201", "scripts": [], "own_tokenizer": false }, { - "name": "Shahrudi", + "name": "Kurmukar", "iso_1_code": null, - "iso_3_code": "shm", + "iso_3_code": "kfv", "tokenizers": {}, "children": [], - "node_i": "4408", + "node_i": "4202", "scripts": [], "own_tokenizer": false }, { - "name": "Takestani", + "name": "Kharia Thar", "iso_1_code": null, - "iso_3_code": "tks", + "iso_3_code": "ksy", "tokenizers": {}, "children": [], - "node_i": "4409", + "node_i": "4203", "scripts": [], "own_tokenizer": false }, { - "name": "Talysh", - "iso_1_code": null, - "iso_3_code": "tly", - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4410", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Taromi, Upper", + "name": "Kewat", "iso_1_code": null, - "iso_3_code": "tov", + "iso_3_code": "kyv", "tokenizers": {}, "children": [], - "node_i": "4411", + "node_i": "4204", "scripts": [], "own_tokenizer": false }, { - "name": "Maraghei", + "name": "Lodhi", "iso_1_code": null, - "iso_3_code": "vmh", + "iso_3_code": "lbm", "tokenizers": {}, "children": [], - "node_i": "4412", + "node_i": "4205", "scripts": [], "own_tokenizer": false }, { - "name": "Kho\u2019ini", + "name": "Mal Paharia", "iso_1_code": null, - "iso_3_code": "xkc", + "iso_3_code": "mkb", "tokenizers": {}, "children": [], - "node_i": "4413", + "node_i": "4206", "scripts": [], "own_tokenizer": false }, { - "name": "Kajali", + "name": "Nahari", "iso_1_code": null, - "iso_3_code": "xkj", + "iso_3_code": "nhh", "tokenizers": {}, "children": [], - "node_i": "4414", + "node_i": "4207", "scripts": [], "own_tokenizer": false }, { - "name": "Kabatei", + "name": "Rohingya", "iso_1_code": null, - "iso_3_code": "xkp", - "tokenizers": {}, - "children": [], - "node_i": "4415", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4399", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Unclassified", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", + "iso_3_code": "rhg", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"bn\")", + "original_lang_name": "bengali", + "original_lang_code": "ben", + "scripts": [ + "Latn", + "Beng" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4208", "scripts": [ - "Arab" + "Latn" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true + "own_tokenizer": false }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", + { + "name": "Rajbanshi", + "iso_1_code": null, + "iso_3_code": "rjs", + "tokenizers": { + "Deva": { + "full_object": "SpaCyTokenizer(\"mr\")", + "original_lang_name": "marathi", + "original_lang_code": "mar", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4209", "scripts": [ - "Latn", "Deva" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "own_tokenizer": false }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false + { + "name": "Rangpuri", + "iso_1_code": null, + "iso_3_code": "rkt", + "tokenizers": {}, + "children": [], + "node_i": "4210", + "scripts": [], + "own_tokenizer": false }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", + { + "name": "Sylheti", + "iso_1_code": null, + "iso_3_code": "syl", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"bn\")", + "original_lang_name": "bengali", + "original_lang_code": "ben", + "scripts": [ + "Latn", + "Beng" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Beng": { + "full_object": "SpaCyTokenizer(\"bn\")", + "original_lang_name": "bengali", + "original_lang_code": "ben", + "scripts": [ + "Latn", + "Beng" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4211", "scripts": [ "Latn", "Beng" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "own_tokenizer": false }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + { + "name": "Tangchangya", + "iso_1_code": null, + "iso_3_code": "tnv", + "tokenizers": {}, + "children": [], + "node_i": "4212", + "scripts": [], + "own_tokenizer": false }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ { - "name": "Dezfuli", + "name": "Mirgan", "iso_1_code": null, - "iso_3_code": "def", + "iso_3_code": "zrg", "tokenizers": {}, "children": [], - "node_i": "4417", + "node_i": "4213", "scripts": [], "own_tokenizer": false } ], - "node_i": "4416", + "node_i": "4194", "scripts": [], "own_tokenizer": false }, { - "name": "Zaza-Gorani", + "name": "Bihari", "iso_1_code": null, "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", + "original_lang_name": "bengali", + "original_lang_code": "ben", "scripts": [ - "Sinh" + "Latn", + "Beng" ], "class_name": "SpaCyTokenizer", "macrolanguage": false }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", + "Beng": { + "full_object": "SpaCyTokenizer(\"bn\")", + "original_lang_name": "bengali", + "original_lang_code": "ben", "scripts": [ - "Armn" + "Latn", + "Beng" ], "class_name": "SpaCyTokenizer", "macrolanguage": false }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", + "Deva": { + "full_object": "SpaCyTokenizer(\"mr\")", + "original_lang_name": "marathi", + "original_lang_code": "mar", "scripts": [ - "Grek" + "Latn", + "Deva" ], "class_name": "SpaCyTokenizer", "macrolanguage": false @@ -21103,320 +5850,215 @@ }, "children": [ { - "name": "Bajelani", - "iso_1_code": null, - "iso_3_code": "bjm", - "tokenizers": {}, - "children": [], - "node_i": "4419", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Zazaki, Southern", + "name": "Bhojpuri", "iso_1_code": null, - "iso_3_code": "diq", + "iso_3_code": "bho", "tokenizers": { - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", + "Deva": { + "full_object": "SpaCyTokenizer(\"mr\")", + "original_lang_name": "marathi", + "original_lang_code": "mar", "scripts": [ "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4420", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Gurani", - "iso_1_code": null, - "iso_3_code": "hac", - "tokenizers": { - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" + "Deva" ], "class_name": "SpaCyTokenizer", - "macrolanguage": true + "macrolanguage": false } }, "children": [], - "node_i": "4421", + "node_i": "4215", "scripts": [ - "Arab" + "Deva" ], "own_tokenizer": false }, { - "name": "Zazaki, Northern", + "name": "Hindustani, Sarnami", "iso_1_code": null, - "iso_3_code": "kiu", + "iso_3_code": "hns", "tokenizers": { "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", + "full_object": "SpaCyTokenizer(\"bn\")", + "original_lang_name": "bengali", + "original_lang_code": "ben", "scripts": [ "Latn", - "Cyrl" + "Beng" ], - "class_name": "StanzaTokenizer", + "class_name": "SpaCyTokenizer", "macrolanguage": false } }, "children": [], - "node_i": "4422", + "node_i": "4216", "scripts": [ "Latn" ], "own_tokenizer": false }, { - "name": "Shabak", + "name": "Kudmali", "iso_1_code": null, - "iso_3_code": "sdb", + "iso_3_code": "kyw", "tokenizers": {}, "children": [], - "node_i": "4423", + "node_i": "4217", "scripts": [], "own_tokenizer": false }, { - "name": "Sarli", + "name": "Magahi", "iso_1_code": null, - "iso_3_code": "sdf", - "tokenizers": {}, - "children": [], - "node_i": "4424", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4418", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4364", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Southwestern", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Fars", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", + "iso_3_code": "mag", + "tokenizers": { + "Deva": { + "full_object": "SpaCyTokenizer(\"mr\")", + "original_lang_name": "marathi", + "original_lang_code": "mar", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4218", "scripts": [ - "Arab" + "Deva" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true + "own_tokenizer": false }, - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", + { + "name": "Maithili", + "iso_1_code": null, + "iso_3_code": "mai", + "tokenizers": { + "Deva": { + "full_object": "SpaCyTokenizer(\"mr\")", + "original_lang_name": "marathi", + "original_lang_code": "mar", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4219", "scripts": [ - "Latn", - "Cyrl" + "Deva" ], - "class_name": "StanzaTokenizer", - "macrolanguage": false + "own_tokenizer": false }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false + { + "name": "Majhi", + "iso_1_code": null, + "iso_3_code": "mjz", + "tokenizers": {}, + "children": [], + "node_i": "4220", + "scripts": [], + "own_tokenizer": false }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", + { + "name": "Sadri", + "iso_1_code": null, + "iso_3_code": "sck", + "tokenizers": { + "Deva": { + "full_object": "SpaCyTokenizer(\"mr\")", + "original_lang_name": "marathi", + "original_lang_code": "mar", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4221", "scripts": [ - "Latn", "Deva" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "own_tokenizer": false }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + { + "name": "Sadri, Oraon", + "iso_1_code": null, + "iso_3_code": "sdr", + "tokenizers": {}, + "children": [], + "node_i": "4222", + "scripts": [], + "own_tokenizer": false }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false + { + "name": "Surjapuri", + "iso_1_code": null, + "iso_3_code": "sjp", + "tokenizers": {}, + "children": [], + "node_i": "4223", + "scripts": [], + "own_tokenizer": false }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", + { + "name": "Musasa", + "iso_1_code": null, + "iso_3_code": "smm", + "tokenizers": {}, + "children": [], + "node_i": "4224", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Panchpargania", + "iso_1_code": null, + "iso_3_code": "tdb", + "tokenizers": {}, + "children": [], + "node_i": "4225", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Bajjika", + "iso_1_code": null, + "iso_3_code": "vjk", + "tokenizers": {}, + "children": [], + "node_i": "4226", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4214", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Oriya", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "IndicNLPTokenizer(\"or\")", + "original_lang_name": "oriya", + "original_lang_code": "ori", "scripts": [ "Latn", - "Beng" + "Orya" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "class_name": "IndicNLPTokenizer", + "macrolanguage": true }, "Orya": { "full_object": "IndicNLPTokenizer(\"or\")", @@ -21426,35 +6068,167 @@ "Latn", "Orya" ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "class_name": "IndicNLPTokenizer", + "macrolanguage": true + } + }, + "children": [ + { + "name": "Bodo Parja", + "iso_1_code": null, + "iso_3_code": "bdv", + "tokenizers": {}, + "children": [], + "node_i": "4228", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Bhatri", + "iso_1_code": null, + "iso_3_code": "bgw", + "tokenizers": {}, + "children": [], + "node_i": "4229", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Bhunjia", + "iso_1_code": null, + "iso_3_code": "bhu", + "tokenizers": {}, + "children": [], + "node_i": "4230", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Desiya", + "iso_1_code": null, + "iso_3_code": "dso", + "tokenizers": {}, + "children": [], + "node_i": "4231", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Kupia", + "iso_1_code": null, + "iso_3_code": "key", + "tokenizers": {}, + "children": [], + "node_i": "4232", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Oriya, Adivasi", + "iso_1_code": null, + "iso_3_code": "ort", + "tokenizers": {}, + "children": [], + "node_i": "4233", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Odia", + "iso_1_code": "or", + "iso_3_code": "ory", + "tokenizers": { + "Latn": { + "full_object": "IndicNLPTokenizer(\"or\")", + "original_lang_name": "oriya", + "original_lang_code": "ori", + "scripts": [ + "Latn", + "Orya" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": true + }, + "Orya": { + "full_object": "IndicNLPTokenizer(\"or\")", + "original_lang_name": "oriya", + "original_lang_code": "ori", + "scripts": [ + "Latn", + "Orya" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": true + } + }, + "children": [], + "node_i": "4234", + "scripts": [ + "Latn", + "Orya" + ], + "own_tokenizer": true }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + { + "name": "Reli", + "iso_1_code": null, + "iso_3_code": "rei", + "tokenizers": {}, + "children": [], + "node_i": "4235", + "scripts": [], + "own_tokenizer": false }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", + { + "name": "Sambalpuri", + "iso_1_code": "or", + "iso_3_code": "spv", + "tokenizers": { + "Latn": { + "full_object": "IndicNLPTokenizer(\"or\")", + "original_lang_name": "oriya", + "original_lang_code": "ori", + "scripts": [ + "Latn", + "Orya" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": true + }, + "Orya": { + "full_object": "IndicNLPTokenizer(\"or\")", + "original_lang_name": "oriya", + "original_lang_code": "ori", + "scripts": [ + "Latn", + "Orya" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": true + } + }, + "children": [], + "node_i": "4236", + "scripts": [], + "own_tokenizer": true + } + ], + "node_i": "4227", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Unclassified", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Deva": { + "full_object": "SpaCyTokenizer(\"mr\")", + "original_lang_name": "marathi", + "original_lang_code": "mar", "scripts": [ - "Grek" + "Latn", + "Deva" ], "class_name": "SpaCyTokenizer", "macrolanguage": false @@ -21462,559 +6236,975 @@ }, "children": [ { - "name": "Fars, Southwestern", + "name": "Angika", "iso_1_code": null, - "iso_3_code": "fay", + "iso_3_code": "anp", + "tokenizers": { + "Deva": { + "full_object": "SpaCyTokenizer(\"mr\")", + "original_lang_name": "marathi", + "original_lang_code": "mar", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4238", + "scripts": [ + "Deva" + ], + "own_tokenizer": false + }, + { + "name": "Bote", + "iso_1_code": null, + "iso_3_code": "bmj", "tokenizers": {}, "children": [], - "node_i": "4427", + "node_i": "4239", "scripts": [], "own_tokenizer": false }, { - "name": "Lari", + "name": "Buksa", "iso_1_code": null, - "iso_3_code": "lrl", + "iso_3_code": "tkb", "tokenizers": {}, "children": [], - "node_i": "4428", + "node_i": "4240", "scripts": [], "own_tokenizer": false } ], - "node_i": "4426", + "node_i": "4237", "scripts": [], "own_tokenizer": false + } + ], + "node_i": "4193", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Northwestern", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "IndicNLPTokenizer(\"sd\")", + "original_lang_name": "sindhi", + "original_lang_code": "snd", + "scripts": [ + "Latn", + "Deva", + "Arab" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": false + }, + "Deva": { + "full_object": "IndicNLPTokenizer(\"sd\")", + "original_lang_name": "sindhi", + "original_lang_code": "snd", + "scripts": [ + "Latn", + "Deva", + "Arab" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": false }, + "Arab": { + "full_object": "IndicNLPTokenizer(\"sd\")", + "original_lang_name": "sindhi", + "original_lang_code": "snd", + "scripts": [ + "Latn", + "Deva", + "Arab" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": false + } + }, + "children": [ { - "name": "Luri", + "name": "Dardic", "iso_1_code": null, "iso_3_code": null, "tokenizers": { - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", + "full_object": "IndicNLPTokenizer(\"sd\")", + "original_lang_name": "sindhi", + "original_lang_code": "snd", "scripts": [ "Latn", - "Beng" + "Deva", + "Arab" ], - "class_name": "SpaCyTokenizer", + "class_name": "IndicNLPTokenizer", "macrolanguage": false }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", + "Deva": { + "full_object": "IndicNLPTokenizer(\"sd\")", + "original_lang_name": "sindhi", + "original_lang_code": "snd", "scripts": [ "Latn", - "Orya" + "Deva", + "Arab" ], "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", "macrolanguage": false }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", + "Arab": { + "full_object": "IndicNLPTokenizer(\"sd\")", + "original_lang_name": "sindhi", + "original_lang_code": "snd", "scripts": [ - "Grek" + "Latn", + "Deva", + "Arab" ], - "class_name": "SpaCyTokenizer", + "class_name": "IndicNLPTokenizer", "macrolanguage": false } }, "children": [ { - "name": "Bakhti\u00e2ri", + "name": "Chitral", "iso_1_code": null, - "iso_3_code": "bqi", + "iso_3_code": null, "tokenizers": {}, - "children": [], - "node_i": "4430", + "children": [ + { + "name": "Khowar", + "iso_1_code": null, + "iso_3_code": "khw", + "tokenizers": {}, + "children": [], + "node_i": "4244", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Kalasha", + "iso_1_code": null, + "iso_3_code": "kls", + "tokenizers": {}, + "children": [], + "node_i": "4245", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4243", "scripts": [], "own_tokenizer": false }, { - "name": "Luri, Northern", + "name": "Kashmiri", "iso_1_code": null, - "iso_3_code": "lrc", + "iso_3_code": null, "tokenizers": { + "Latn": { + "full_object": "IndicNLPTokenizer(\"sd\")", + "original_lang_name": "sindhi", + "original_lang_code": "snd", + "scripts": [ + "Latn", + "Deva", + "Arab" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": false + }, + "Deva": { + "full_object": "IndicNLPTokenizer(\"sd\")", + "original_lang_name": "sindhi", + "original_lang_code": "snd", + "scripts": [ + "Latn", + "Deva", + "Arab" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": false + }, "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", + "full_object": "IndicNLPTokenizer(\"sd\")", + "original_lang_name": "sindhi", + "original_lang_code": "snd", "scripts": [ + "Latn", + "Deva", "Arab" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true + "class_name": "IndicNLPTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Kashmiri", + "iso_1_code": "ks", + "iso_3_code": "kas", + "tokenizers": { + "Latn": { + "full_object": "IndicNLPTokenizer(\"sd\")", + "original_lang_name": "sindhi", + "original_lang_code": "snd", + "scripts": [ + "Latn", + "Deva", + "Arab" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": false + }, + "Deva": { + "full_object": "IndicNLPTokenizer(\"sd\")", + "original_lang_name": "sindhi", + "original_lang_code": "snd", + "scripts": [ + "Latn", + "Deva", + "Arab" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "IndicNLPTokenizer(\"sd\")", + "original_lang_name": "sindhi", + "original_lang_code": "snd", + "scripts": [ + "Latn", + "Deva", + "Arab" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4247", + "scripts": [ + "Latn", + "Arab", + "Deva" + ], + "own_tokenizer": false + } + ], + "node_i": "4246", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Kohistani", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ + { + "name": "Bateri", + "iso_1_code": null, + "iso_3_code": "btv", + "tokenizers": {}, + "children": [], + "node_i": "4249", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Chilisso", + "iso_1_code": null, + "iso_3_code": "clh", + "tokenizers": {}, + "children": [], + "node_i": "4250", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Gawri", + "iso_1_code": null, + "iso_3_code": "gwc", + "tokenizers": {}, + "children": [], + "node_i": "4251", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Gowro", + "iso_1_code": null, + "iso_3_code": "gwf", + "tokenizers": {}, + "children": [], + "node_i": "4252", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Kohistani, Indus", + "iso_1_code": null, + "iso_3_code": "mvy", + "tokenizers": {}, + "children": [], + "node_i": "4253", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Mankiyali", + "iso_1_code": null, + "iso_3_code": "nlm", + "tokenizers": {}, + "children": [], + "node_i": "4254", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Tirahi", + "iso_1_code": null, + "iso_3_code": "tra", + "tokenizers": {}, + "children": [], + "node_i": "4255", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Torwali", + "iso_1_code": null, + "iso_3_code": "trw", + "tokenizers": {}, + "children": [], + "node_i": "4256", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Degano", + "iso_1_code": null, + "iso_3_code": "wsv", + "tokenizers": {}, + "children": [], + "node_i": "4257", + "scripts": [], + "own_tokenizer": false } - }, - "children": [], - "node_i": "4431", - "scripts": [ - "Arab" ], + "node_i": "4248", + "scripts": [], "own_tokenizer": false }, { - "name": "Luri, Southern", + "name": "Kunar", "iso_1_code": null, - "iso_3_code": "luz", + "iso_3_code": null, "tokenizers": {}, - "children": [], - "node_i": "4432", + "children": [ + { + "name": "Dameli", + "iso_1_code": null, + "iso_3_code": "dml", + "tokenizers": {}, + "children": [], + "node_i": "4259", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Gawar-Bati", + "iso_1_code": null, + "iso_3_code": "gwt", + "tokenizers": {}, + "children": [], + "node_i": "4260", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Grangali", + "iso_1_code": null, + "iso_3_code": "nli", + "tokenizers": {}, + "children": [], + "node_i": "4261", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Shumashti", + "iso_1_code": null, + "iso_3_code": "sts", + "tokenizers": {}, + "children": [], + "node_i": "4262", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4258", "scripts": [], "own_tokenizer": false }, { - "name": "Kumzari", + "name": "Pashai", "iso_1_code": null, - "iso_3_code": "zum", + "iso_3_code": null, "tokenizers": {}, - "children": [], - "node_i": "4433", + "children": [ + { + "name": "Pashai, Northeast", + "iso_1_code": null, + "iso_3_code": "aee", + "tokenizers": {}, + "children": [], + "node_i": "4264", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Pashai, Northwest", + "iso_1_code": null, + "iso_3_code": "glh", + "tokenizers": {}, + "children": [], + "node_i": "4265", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Pashai, Southwest", + "iso_1_code": null, + "iso_3_code": "psh", + "tokenizers": {}, + "children": [], + "node_i": "4266", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Pashai, Southeast", + "iso_1_code": null, + "iso_3_code": "psi", + "tokenizers": {}, + "children": [], + "node_i": "4267", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4263", "scripts": [], "own_tokenizer": false - } - ], - "node_i": "4429", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Persian", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" + { + "name": "Shina", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ + { + "name": "Brokskat", + "iso_1_code": null, + "iso_3_code": "bkk", + "tokenizers": {}, + "children": [], + "node_i": "4269", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Palula", + "iso_1_code": null, + "iso_3_code": "phl", + "tokenizers": {}, + "children": [], + "node_i": "4270", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Shina, Kohistani", + "iso_1_code": null, + "iso_3_code": "plk", + "tokenizers": {}, + "children": [], + "node_i": "4271", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Shina", + "iso_1_code": null, + "iso_3_code": "scl", + "tokenizers": {}, + "children": [], + "node_i": "4272", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Savi", + "iso_1_code": null, + "iso_3_code": "sdg", + "tokenizers": {}, + "children": [], + "node_i": "4273", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Kundal Shahi", + "iso_1_code": null, + "iso_3_code": "shd", + "tokenizers": {}, + "children": [], + "node_i": "4274", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Ushojo", + "iso_1_code": null, + "iso_3_code": "ush", + "tokenizers": {}, + "children": [], + "node_i": "4275", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Kalkoti", + "iso_1_code": null, + "iso_3_code": "xka", + "tokenizers": {}, + "children": [], + "node_i": "4276", + "scripts": [], + "own_tokenizer": false + } ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", + "node_i": "4268", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4242", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Sindhi", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "IndicNLPTokenizer(\"sd\")", + "original_lang_name": "sindhi", + "original_lang_code": "snd", "scripts": [ "Latn", - "Guru" + "Deva", + "Arab" ], "class_name": "IndicNLPTokenizer", "macrolanguage": false }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", + "Deva": { + "full_object": "IndicNLPTokenizer(\"sd\")", + "original_lang_name": "sindhi", + "original_lang_code": "snd", "scripts": [ "Latn", - "Beng" + "Deva", + "Arab" ], - "class_name": "SpaCyTokenizer", + "class_name": "IndicNLPTokenizer", "macrolanguage": false }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", + "Arab": { + "full_object": "IndicNLPTokenizer(\"sd\")", + "original_lang_name": "sindhi", + "original_lang_code": "snd", "scripts": [ "Latn", - "Orya" + "Deva", + "Arab" ], "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", "macrolanguage": false } }, "children": [ { - "name": "Aimaq", + "name": "Jadgali", "iso_1_code": null, - "iso_3_code": "aiq", + "iso_3_code": "jdg", "tokenizers": {}, "children": [], - "node_i": "4435", + "node_i": "4278", "scripts": [], "own_tokenizer": false }, { - "name": "Bukharic", + "name": "Kacchi", "iso_1_code": null, - "iso_3_code": "bhh", + "iso_3_code": "kfr", "tokenizers": {}, "children": [], - "node_i": "4436", + "node_i": "4279", "scripts": [], "own_tokenizer": false }, { - "name": "Dehwari", + "name": "Lasi", "iso_1_code": null, - "iso_3_code": "deh", + "iso_3_code": "lss", "tokenizers": {}, "children": [], - "node_i": "4437", + "node_i": "4280", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Luwati", + "iso_1_code": null, + "iso_3_code": "luv", + "tokenizers": {}, + "children": [], + "node_i": "4281", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Sindhi Bhil", + "iso_1_code": null, + "iso_3_code": "sbn", + "tokenizers": {}, + "children": [], + "node_i": "4282", "scripts": [], "own_tokenizer": false }, { - "name": "Hazaragi", + "name": "Sindhi", + "iso_1_code": "sd", + "iso_3_code": "snd", + "tokenizers": { + "Latn": { + "full_object": "IndicNLPTokenizer(\"sd\")", + "original_lang_name": "sindhi", + "original_lang_code": "snd", + "scripts": [ + "Latn", + "Deva", + "Arab" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": false + }, + "Deva": { + "full_object": "IndicNLPTokenizer(\"sd\")", + "original_lang_name": "sindhi", + "original_lang_code": "snd", + "scripts": [ + "Latn", + "Deva", + "Arab" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "IndicNLPTokenizer(\"sd\")", + "original_lang_name": "sindhi", + "original_lang_code": "snd", + "scripts": [ + "Latn", + "Deva", + "Arab" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4283", + "scripts": [ + "Latn", + "Arab", + "Deva" + ], + "own_tokenizer": true + } + ], + "node_i": "4277", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4241", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Southern", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"mr\")", + "original_lang_name": "marathi", + "original_lang_code": "mar", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Deva": { + "full_object": "SpaCyTokenizer(\"mr\")", + "original_lang_name": "marathi", + "original_lang_code": "mar", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Sinh": { + "full_object": "SpaCyTokenizer(\"si\")", + "original_lang_name": "sinhala", + "original_lang_code": "sin", + "scripts": [ + "Sinh" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Marathi", + "iso_1_code": "mr", + "iso_3_code": "mar", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"mr\")", + "original_lang_name": "marathi", + "original_lang_code": "mar", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Deva": { + "full_object": "SpaCyTokenizer(\"mr\")", + "original_lang_name": "marathi", + "original_lang_code": "mar", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4285", + "scripts": [ + "Deva", + "Latn" + ], + "own_tokenizer": true + }, + { + "name": "Konkani", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "IndicNLPTokenizer(\"kok\")", + "original_lang_name": "konkani", + "original_lang_code": "kok", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": true + }, + "Deva": { + "full_object": "IndicNLPTokenizer(\"kok\")", + "original_lang_name": "konkani", + "original_lang_code": "kok", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": true + } + }, + "children": [ + { + "name": "Konkani, Goan", "iso_1_code": null, - "iso_3_code": "haz", - "tokenizers": {}, + "iso_3_code": "gom", + "tokenizers": { + "Latn": { + "full_object": "IndicNLPTokenizer(\"kok\")", + "original_lang_name": "konkani", + "original_lang_code": "kok", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": true + }, + "Deva": { + "full_object": "IndicNLPTokenizer(\"kok\")", + "original_lang_name": "konkani", + "original_lang_code": "kok", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "IndicNLPTokenizer", + "macrolanguage": true + } + }, "children": [], - "node_i": "4438", - "scripts": [], - "own_tokenizer": false + "node_i": "4287", + "scripts": [ + "Latn", + "Deva" + ], + "own_tokenizer": true }, { - "name": "Dzhidi", + "name": "Kukna", "iso_1_code": null, - "iso_3_code": "jpr", - "tokenizers": {}, - "children": [], - "node_i": "4439", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Persian, Iranian", - "iso_1_code": "fa", - "iso_3_code": "pes", + "iso_3_code": "kex", "tokenizers": { - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", + "Deva": { + "full_object": "IndicNLPTokenizer(\"kok\")", + "original_lang_name": "konkani", + "original_lang_code": "kok", "scripts": [ - "Arab" + "Latn", + "Deva" ], - "class_name": "SpaCyTokenizer", + "class_name": "IndicNLPTokenizer", "macrolanguage": true } }, "children": [], - "node_i": "4440", - "scripts": [], - "own_tokenizer": true + "node_i": "4288", + "scripts": [ + "Deva" + ], + "own_tokenizer": false }, { - "name": "Pahlavani", + "name": "Katkari", "iso_1_code": null, - "iso_3_code": "phv", + "iso_3_code": "kfu", "tokenizers": {}, "children": [], - "node_i": "4441", + "node_i": "4289", "scripts": [], "own_tokenizer": false }, { - "name": "Dari", - "iso_1_code": "fa", - "iso_3_code": "prs", + "name": "Konkani", + "iso_1_code": null, + "iso_3_code": "knn", "tokenizers": { - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", + "Latn": { + "full_object": "IndicNLPTokenizer(\"kok\")", + "original_lang_name": "konkani", + "original_lang_code": "kok", "scripts": [ - "Arab" + "Latn", + "Deva" ], - "class_name": "SpaCyTokenizer", + "class_name": "IndicNLPTokenizer", + "macrolanguage": true + }, + "Deva": { + "full_object": "IndicNLPTokenizer(\"kok\")", + "original_lang_name": "konkani", + "original_lang_code": "kok", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "IndicNLPTokenizer", "macrolanguage": true } }, "children": [], - "node_i": "4442", + "node_i": "4290", "scripts": [], "own_tokenizer": true }, { - "name": "Tajik", - "iso_1_code": "tg", - "iso_3_code": "tgk", - "tokenizers": { - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - } - }, + "name": "Phudagi", + "iso_1_code": null, + "iso_3_code": "phd", + "tokenizers": {}, "children": [], - "node_i": "4443", - "scripts": [ - "Cyrl" - ], + "node_i": "4291", + "scripts": [], "own_tokenizer": false - } - ], - "node_i": "4434", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Tat", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Latn": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true + { + "name": "Samvedi", + "iso_1_code": null, + "iso_3_code": "smv", + "tokenizers": {}, + "children": [], + "node_i": "4292", + "scripts": [], + "own_tokenizer": false }, + { + "name": "Varli", + "iso_1_code": null, + "iso_3_code": "vav", + "tokenizers": {}, + "children": [], + "node_i": "4293", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4286", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Sinhalese-Maldivian", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { "Sinh": { "full_object": "SpaCyTokenizer(\"si\")", "original_lang_name": "sinhala", @@ -22024,626 +7214,710 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", + } + }, + "children": [ + { + "name": "Maldivian", + "iso_1_code": "dv", + "iso_3_code": "div", + "tokenizers": {}, + "children": [], + "node_i": "4295", "scripts": [ - "Armn" + "Thaa" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "own_tokenizer": false }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", + { + "name": "Sinhala", + "iso_1_code": "si", + "iso_3_code": "sin", + "tokenizers": { + "Sinh": { + "full_object": "SpaCyTokenizer(\"si\")", + "original_lang_name": "sinhala", + "original_lang_code": "sin", + "scripts": [ + "Sinh" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4296", "scripts": [ - "Grek" + "Sinh" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "own_tokenizer": true + }, + { + "name": "Veddah", + "iso_1_code": null, + "iso_3_code": "ved", + "tokenizers": {}, + "children": [], + "node_i": "4297", + "scripts": [], + "own_tokenizer": false } - }, + ], + "node_i": "4294", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Unclassified", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, "children": [ { - "name": "Judeo-Tat", + "name": "Bhalay", "iso_1_code": null, - "iso_3_code": "jdt", + "iso_3_code": "bhx", "tokenizers": {}, "children": [], - "node_i": "4445", + "node_i": "4299", "scripts": [], "own_tokenizer": false }, { - "name": "Tat, Muslim", + "name": "Deccan", "iso_1_code": null, - "iso_3_code": "ttt", + "iso_3_code": "dcc", "tokenizers": {}, "children": [], - "node_i": "4446", + "node_i": "4300", "scripts": [], "own_tokenizer": false - } - ], - "node_i": "4444", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4425", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4363", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4340", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Nuristani", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" + }, + { + "name": "Gowlan", + "iso_1_code": null, + "iso_3_code": "goj", + "tokenizers": {}, + "children": [], + "node_i": "4301", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Varhadi-Nagpuri", + "iso_1_code": null, + "iso_3_code": "vah", + "tokenizers": {}, + "children": [], + "node_i": "4302", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4298", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4284", + "scripts": [], + "own_tokenizer": false + } ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Ashkun", - "iso_1_code": null, - "iso_3_code": "ask", - "tokenizers": {}, - "children": [], - "node_i": "4448", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Kateviri", - "iso_1_code": null, - "iso_3_code": "bsh", - "tokenizers": {}, - "children": [], - "node_i": "4449", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Prasuni", - "iso_1_code": null, - "iso_3_code": "prn", - "tokenizers": {}, - "children": [], - "node_i": "4450", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Tregami", - "iso_1_code": null, - "iso_3_code": "trm", - "tokenizers": {}, - "children": [], - "node_i": "4451", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Waigali", - "iso_1_code": null, - "iso_3_code": "wbk", - "tokenizers": {}, - "children": [], - "node_i": "4452", + "node_i": "4192", "scripts": [], "own_tokenizer": false }, { - "name": "Komviri", + "name": "Tharu", "iso_1_code": null, - "iso_3_code": "xvi", - "tokenizers": {}, - "children": [], - "node_i": "4453", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4447", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Unclassified", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "StanzaTokenizer(\"kmr\")", - "original_lang_name": "northern_kurdish", - "original_lang_code": "kmr", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "StanzaTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"hi\")", + "original_lang_name": "hindi", + "original_lang_code": "hin", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Deva": { + "full_object": "SpaCyTokenizer(\"hi\")", + "original_lang_name": "hindi", + "original_lang_code": "hin", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Tharu, Rana", + "iso_1_code": null, + "iso_3_code": "thr", + "tokenizers": {}, + "children": [], + "node_i": "4304", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Tharu, Kathariya", + "iso_1_code": null, + "iso_3_code": "tkt", + "tokenizers": {}, + "children": [], + "node_i": "4305", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Eastern Tharu", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"hi\")", + "original_lang_name": "hindi", + "original_lang_code": "hin", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Deva": { + "full_object": "SpaCyTokenizer(\"hi\")", + "original_lang_name": "hindi", + "original_lang_code": "hin", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Tharu, Central", + "iso_1_code": null, + "iso_3_code": "the", + "tokenizers": {}, + "children": [], + "node_i": "4307", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Tharu, Dangaura", + "iso_1_code": null, + "iso_3_code": "thl", + "tokenizers": { + "Deva": { + "full_object": "SpaCyTokenizer(\"hi\")", + "original_lang_name": "hindi", + "original_lang_code": "hin", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4308", + "scripts": [ + "Deva" + ], + "own_tokenizer": false + }, + { + "name": "Tharu, Mid-Eastern", + "iso_1_code": null, + "iso_3_code": "thq", + "tokenizers": {}, + "children": [], + "node_i": "4309", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4306", + "scripts": [], + "own_tokenizer": false + } ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "node_i": "4303", + "scripts": [], + "own_tokenizer": false }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ { - "name": "Badeshi", + "name": "Unclassified", "iso_1_code": null, - "iso_3_code": "bdz", + "iso_3_code": null, "tokenizers": {}, - "children": [], - "node_i": "4455", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4454", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4067", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Italic", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"es\")", - "original_lang_name": "spanish", - "original_lang_code": "spa", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" + "children": [ + { + "name": "Andh", + "iso_1_code": null, + "iso_3_code": "anr", + "tokenizers": {}, + "children": [], + "node_i": "4311", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Bazigar", + "iso_1_code": null, + "iso_3_code": "bfr", + "tokenizers": {}, + "children": [], + "node_i": "4312", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Chinali", + "iso_1_code": null, + "iso_3_code": "cih", + "tokenizers": {}, + "children": [], + "node_i": "4313", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Danuwar", + "iso_1_code": null, + "iso_3_code": "dhw", + "tokenizers": {}, + "children": [], + "node_i": "4314", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Darai", + "iso_1_code": null, + "iso_3_code": "dry", + "tokenizers": {}, + "children": [], + "node_i": "4315", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Dewas Rai", + "iso_1_code": null, + "iso_3_code": "dwz", + "tokenizers": {}, + "children": [], + "node_i": "4316", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Kanjari", + "iso_1_code": null, + "iso_3_code": "kft", + "tokenizers": {}, + "children": [], + "node_i": "4317", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Kumal", + "iso_1_code": null, + "iso_3_code": "kra", + "tokenizers": {}, + "children": [], + "node_i": "4318", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Lohar, Lahul", + "iso_1_code": null, + "iso_3_code": "lhl", + "tokenizers": {}, + "children": [], + "node_i": "4319", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Memoni", + "iso_1_code": null, + "iso_3_code": "mby", + "tokenizers": {}, + "children": [], + "node_i": "4320", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Oadki", + "iso_1_code": null, + "iso_3_code": "odk", + "tokenizers": {}, + "children": [], + "node_i": "4321", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Pali", + "iso_1_code": "pi", + "iso_3_code": "pli", + "tokenizers": {}, + "children": [], + "node_i": "4322", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Vaagri Booli", + "iso_1_code": null, + "iso_3_code": "vaa", + "tokenizers": {}, + "children": [], + "node_i": "4323", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4310", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Western Hindi", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"hi\")", + "original_lang_name": "hindi", + "original_lang_code": "hin", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Deva": { + "full_object": "SpaCyTokenizer(\"hi\")", + "original_lang_name": "hindi", + "original_lang_code": "hin", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ur\")", + "original_lang_name": "urdu", + "original_lang_code": "urd", + "scripts": [ + "Latn", + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Bundeli", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ + { + "name": "Bundeli", + "iso_1_code": null, + "iso_3_code": "bns", + "tokenizers": {}, + "children": [], + "node_i": "4326", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4325", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Hindustani", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"hi\")", + "original_lang_name": "hindi", + "original_lang_code": "hin", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Deva": { + "full_object": "SpaCyTokenizer(\"hi\")", + "original_lang_name": "hindi", + "original_lang_code": "hin", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ur\")", + "original_lang_name": "urdu", + "original_lang_code": "urd", + "scripts": [ + "Latn", + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Hindi", + "iso_1_code": "hi", + "iso_3_code": "hin", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"hi\")", + "original_lang_name": "hindi", + "original_lang_code": "hin", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Deva": { + "full_object": "SpaCyTokenizer(\"hi\")", + "original_lang_name": "hindi", + "original_lang_code": "hin", + "scripts": [ + "Latn", + "Deva" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4328", + "scripts": [ + "Latn", + "Deva" + ], + "own_tokenizer": true + }, + { + "name": "Urdu", + "iso_1_code": "ur", + "iso_3_code": "urd", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ur\")", + "original_lang_name": "urdu", + "original_lang_code": "urd", + "scripts": [ + "Latn", + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ur\")", + "original_lang_name": "urdu", + "original_lang_code": "urd", + "scripts": [ + "Latn", + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4329", + "scripts": [ + "Latn", + "Arab" + ], + "own_tokenizer": true + }, + { + "name": "Sansi", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ + { + "name": "Kabutra", + "iso_1_code": null, + "iso_3_code": "kbu", + "tokenizers": {}, + "children": [], + "node_i": "4331", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Sansi", + "iso_1_code": null, + "iso_3_code": "ssi", + "tokenizers": {}, + "children": [], + "node_i": "4332", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4330", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4327", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Unclassified", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ + { + "name": "Haryanvi", + "iso_1_code": null, + "iso_3_code": "bgc", + "tokenizers": {}, + "children": [], + "node_i": "4334", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Bhaya", + "iso_1_code": null, + "iso_3_code": "bhe", + "tokenizers": {}, + "children": [], + "node_i": "4335", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Kanauji", + "iso_1_code": null, + "iso_3_code": "bjj", + "tokenizers": {}, + "children": [], + "node_i": "4336", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Braj Bhasha", + "iso_1_code": null, + "iso_3_code": "bra", + "tokenizers": {}, + "children": [], + "node_i": "4337", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Ghera", + "iso_1_code": null, + "iso_3_code": "ghr", + "tokenizers": {}, + "children": [], + "node_i": "4338", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Gowli", + "iso_1_code": null, + "iso_3_code": "gok", + "tokenizers": {}, + "children": [], + "node_i": "4339", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4333", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4324", + "scripts": [], + "own_tokenizer": false + } ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true + "node_i": "4068", + "scripts": [], + "own_tokenizer": false }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ { - "name": "Latino-Faliscan", + "name": "Iranian", "iso_1_code": null, "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"la\")", - "original_lang_name": "latin", - "original_lang_code": "lat", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", "scripts": [ "Latn", "Cyrl" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", + "class_name": "StanzaTokenizer", "macrolanguage": false }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", "scripts": [ "Latn", - "Beng" + "Cyrl" ], - "class_name": "SpaCyTokenizer", + "class_name": "StanzaTokenizer", "macrolanguage": false }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, "Arab": { "full_object": "SpaCyTokenizer(\"fa\")", "original_lang_name": "persian", @@ -22651,397 +7925,1951 @@ "scripts": [ "Arab" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [ + { + "name": "Avestan", + "iso_1_code": "ae", + "iso_3_code": "ave", + "tokenizers": {}, + "children": [], + "node_i": "4341", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Eastern", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [ + { + "name": "Northeastern", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Ossetic", + "iso_1_code": "os", + "iso_3_code": "oss", + "tokenizers": { + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4344", + "scripts": [ + "Cyrl" + ], + "own_tokenizer": false + }, + { + "name": "Yagnobi", + "iso_1_code": null, + "iso_3_code": "yai", + "tokenizers": {}, + "children": [], + "node_i": "4345", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Yassic", + "iso_1_code": null, + "iso_3_code": "ysc", + "tokenizers": {}, + "children": [], + "node_i": "4346", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4343", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Southeastern", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [ + { + "name": "Pamir", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Ishkashimi", + "iso_1_code": null, + "iso_3_code": "isk", + "tokenizers": {}, + "children": [], + "node_i": "4349", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Munji", + "iso_1_code": null, + "iso_3_code": "mnj", + "tokenizers": {}, + "children": [], + "node_i": "4350", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Sanglechi", + "iso_1_code": null, + "iso_3_code": "sgy", + "tokenizers": {}, + "children": [], + "node_i": "4351", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Wakhi", + "iso_1_code": null, + "iso_3_code": "wbl", + "tokenizers": {}, + "children": [], + "node_i": "4352", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Yadgha", + "iso_1_code": null, + "iso_3_code": "ydg", + "tokenizers": {}, + "children": [], + "node_i": "4353", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Shugni-Yazgulami", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Shughni", + "iso_1_code": null, + "iso_3_code": "sgh", + "tokenizers": { + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4355", + "scripts": [ + "Cyrl" + ], + "own_tokenizer": false + }, + { + "name": "Sarikoli", + "iso_1_code": null, + "iso_3_code": "srh", + "tokenizers": {}, + "children": [], + "node_i": "4356", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Yazghulami", + "iso_1_code": null, + "iso_3_code": "yah", + "tokenizers": {}, + "children": [], + "node_i": "4357", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4354", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4348", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Pashto", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [ + { + "name": "Pashto, Southern", + "iso_1_code": "ps", + "iso_3_code": "pbt", + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [], + "node_i": "4359", + "scripts": [ + "Arab" + ], + "own_tokenizer": false + }, + { + "name": "Pashto, Northern", + "iso_1_code": "ps", + "iso_3_code": "pbu", + "tokenizers": {}, + "children": [], + "node_i": "4360", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Pashto, Central", + "iso_1_code": "ps", + "iso_3_code": "pst", + "tokenizers": {}, + "children": [], + "node_i": "4361", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Waneci", + "iso_1_code": null, + "iso_3_code": "wne", + "tokenizers": {}, + "children": [], + "node_i": "4362", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4358", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4347", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4342", + "scripts": [], + "own_tokenizer": false }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ { - "name": "Latin", - "iso_1_code": "la", - "iso_3_code": "lat", + "name": "Western", + "iso_1_code": null, + "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"la\")", - "original_lang_name": "latin", - "original_lang_code": "lat", + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", "scripts": [ - "Latn" + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [ + { + "name": "Northwestern", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [ + { + "name": "Balochi", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [ + { + "name": "Balochi, Southern", + "iso_1_code": null, + "iso_3_code": "bcc", + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [], + "node_i": "4366", + "scripts": [ + "Arab" + ], + "own_tokenizer": false + }, + { + "name": "Balochi, Western", + "iso_1_code": null, + "iso_3_code": "bgn", + "tokenizers": {}, + "children": [], + "node_i": "4367", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Balochi, Eastern", + "iso_1_code": null, + "iso_3_code": "bgp", + "tokenizers": {}, + "children": [], + "node_i": "4368", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Bashkardi", + "iso_1_code": null, + "iso_3_code": "bsg", + "tokenizers": {}, + "children": [], + "node_i": "4369", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Koroshi", + "iso_1_code": null, + "iso_3_code": "ktl", + "tokenizers": {}, + "children": [], + "node_i": "4370", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4365", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Caspian", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [ + { + "name": "Gilaki", + "iso_1_code": null, + "iso_3_code": "glk", + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [], + "node_i": "4372", + "scripts": [ + "Arab" + ], + "own_tokenizer": false + }, + { + "name": "Mazandarani", + "iso_1_code": null, + "iso_3_code": "mzn", + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [], + "node_i": "4373", + "scripts": [ + "Arab" + ], + "own_tokenizer": false + }, + { + "name": "Shahmirzadi", + "iso_1_code": null, + "iso_3_code": "srz", + "tokenizers": {}, + "children": [], + "node_i": "4374", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4371", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Central Iran", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ + { + "name": "Ashtiani", + "iso_1_code": null, + "iso_3_code": "atn", + "tokenizers": {}, + "children": [], + "node_i": "4376", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Dari, Zoroastrian", + "iso_1_code": null, + "iso_3_code": "gbz", + "tokenizers": {}, + "children": [], + "node_i": "4377", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Gazi", + "iso_1_code": null, + "iso_3_code": "gzi", + "tokenizers": {}, + "children": [], + "node_i": "4378", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Khunsari", + "iso_1_code": null, + "iso_3_code": "kfm", + "tokenizers": {}, + "children": [], + "node_i": "4379", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Natanzi", + "iso_1_code": null, + "iso_3_code": "ntz", + "tokenizers": {}, + "children": [], + "node_i": "4380", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Nayini", + "iso_1_code": null, + "iso_3_code": "nyq", + "tokenizers": {}, + "children": [], + "node_i": "4381", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Parsi-Dari", + "iso_1_code": null, + "iso_3_code": "prd", + "tokenizers": {}, + "children": [], + "node_i": "4382", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Sivandi", + "iso_1_code": null, + "iso_3_code": "siy", + "tokenizers": {}, + "children": [], + "node_i": "4383", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Soi", + "iso_1_code": null, + "iso_3_code": "soj", + "tokenizers": {}, + "children": [], + "node_i": "4384", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Vafsi", + "iso_1_code": null, + "iso_3_code": "vaf", + "tokenizers": {}, + "children": [], + "node_i": "4385", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4375", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Kurdish", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [ + { + "name": "Kurdish, Central", + "iso_1_code": "ku", + "iso_3_code": "ckb", + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [], + "node_i": "4387", + "scripts": [ + "Arab" + ], + "own_tokenizer": false + }, + { + "name": "Kurdish, Northern", + "iso_1_code": "ku", + "iso_3_code": "kmr", + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4388", + "scripts": [ + "Latn", + "Cyrl" + ], + "own_tokenizer": true + }, + { + "name": "Laki", + "iso_1_code": null, + "iso_3_code": "lki", + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [], + "node_i": "4389", + "scripts": [ + "Arab" + ], + "own_tokenizer": false + }, + { + "name": "Kurdish, Southern", + "iso_1_code": "ku", + "iso_3_code": "sdh", + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [], + "node_i": "4390", + "scripts": [ + "Arab" + ], + "own_tokenizer": false + } + ], + "node_i": "4386", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Ormuri-Parachi", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ + { + "name": "Ormuri", + "iso_1_code": null, + "iso_3_code": "oru", + "tokenizers": {}, + "children": [], + "node_i": "4392", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Parachi", + "iso_1_code": null, + "iso_3_code": "prc", + "tokenizers": {}, + "children": [], + "node_i": "4393", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4391", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Semnani", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ + { + "name": "Lasgerdi", + "iso_1_code": null, + "iso_3_code": "lsa", + "tokenizers": {}, + "children": [], + "node_i": "4395", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Sangisari", + "iso_1_code": null, + "iso_3_code": "sgr", + "tokenizers": {}, + "children": [], + "node_i": "4396", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Semnani", + "iso_1_code": null, + "iso_3_code": "smy", + "tokenizers": {}, + "children": [], + "node_i": "4397", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Sorkhei", + "iso_1_code": null, + "iso_3_code": "sqo", + "tokenizers": {}, + "children": [], + "node_i": "4398", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4394", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Talysh", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Alviri-Vidari", + "iso_1_code": null, + "iso_3_code": "avd", + "tokenizers": {}, + "children": [], + "node_i": "4400", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Eshtehardi", + "iso_1_code": null, + "iso_3_code": "esh", + "tokenizers": {}, + "children": [], + "node_i": "4401", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Gozarkhani", + "iso_1_code": null, + "iso_3_code": "goz", + "tokenizers": {}, + "children": [], + "node_i": "4402", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Harzani", + "iso_1_code": null, + "iso_3_code": "hrz", + "tokenizers": {}, + "children": [], + "node_i": "4403", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Karingani", + "iso_1_code": null, + "iso_3_code": "kgn", + "tokenizers": {}, + "children": [], + "node_i": "4404", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Koresh-e Rostam", + "iso_1_code": null, + "iso_3_code": "okh", + "tokenizers": {}, + "children": [], + "node_i": "4405", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Razajerdi", + "iso_1_code": null, + "iso_3_code": "rat", + "tokenizers": {}, + "children": [], + "node_i": "4406", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Rudbari", + "iso_1_code": null, + "iso_3_code": "rdb", + "tokenizers": {}, + "children": [], + "node_i": "4407", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Shahrudi", + "iso_1_code": null, + "iso_3_code": "shm", + "tokenizers": {}, + "children": [], + "node_i": "4408", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Takestani", + "iso_1_code": null, + "iso_3_code": "tks", + "tokenizers": {}, + "children": [], + "node_i": "4409", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Talysh", + "iso_1_code": null, + "iso_3_code": "tly", + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4410", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Taromi, Upper", + "iso_1_code": null, + "iso_3_code": "tov", + "tokenizers": {}, + "children": [], + "node_i": "4411", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Maraghei", + "iso_1_code": null, + "iso_3_code": "vmh", + "tokenizers": {}, + "children": [], + "node_i": "4412", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Kho\u2019ini", + "iso_1_code": null, + "iso_3_code": "xkc", + "tokenizers": {}, + "children": [], + "node_i": "4413", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Kajali", + "iso_1_code": null, + "iso_3_code": "xkj", + "tokenizers": {}, + "children": [], + "node_i": "4414", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Kabatei", + "iso_1_code": null, + "iso_3_code": "xkp", + "tokenizers": {}, + "children": [], + "node_i": "4415", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4399", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Unclassified", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ + { + "name": "Dezfuli", + "iso_1_code": null, + "iso_3_code": "def", + "tokenizers": {}, + "children": [], + "node_i": "4417", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4416", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Zaza-Gorani", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [ + { + "name": "Bajelani", + "iso_1_code": null, + "iso_3_code": "bjm", + "tokenizers": {}, + "children": [], + "node_i": "4419", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Zazaki, Southern", + "iso_1_code": null, + "iso_3_code": "diq", + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4420", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Gurani", + "iso_1_code": null, + "iso_3_code": "hac", + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [], + "node_i": "4421", + "scripts": [ + "Arab" + ], + "own_tokenizer": false + }, + { + "name": "Zazaki, Northern", + "iso_1_code": null, + "iso_3_code": "kiu", + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4422", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Shabak", + "iso_1_code": null, + "iso_3_code": "sdb", + "tokenizers": {}, + "children": [], + "node_i": "4423", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Sarli", + "iso_1_code": null, + "iso_3_code": "sdf", + "tokenizers": {}, + "children": [], + "node_i": "4424", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4418", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4364", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Southwestern", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Fars", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ + { + "name": "Fars, Southwestern", + "iso_1_code": null, + "iso_3_code": "fay", + "tokenizers": {}, + "children": [], + "node_i": "4427", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Lari", + "iso_1_code": null, + "iso_3_code": "lrl", + "tokenizers": {}, + "children": [], + "node_i": "4428", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4426", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Luri", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [ + { + "name": "Bakhti\u00e2ri", + "iso_1_code": null, + "iso_3_code": "bqi", + "tokenizers": {}, + "children": [], + "node_i": "4430", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Luri, Northern", + "iso_1_code": null, + "iso_3_code": "lrc", + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [], + "node_i": "4431", + "scripts": [ + "Arab" + ], + "own_tokenizer": false + }, + { + "name": "Luri, Southern", + "iso_1_code": null, + "iso_3_code": "luz", + "tokenizers": {}, + "children": [], + "node_i": "4432", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Kumzari", + "iso_1_code": null, + "iso_3_code": "zum", + "tokenizers": {}, + "children": [], + "node_i": "4433", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4429", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Persian", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Aimaq", + "iso_1_code": null, + "iso_3_code": "aiq", + "tokenizers": {}, + "children": [], + "node_i": "4435", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Bukharic", + "iso_1_code": null, + "iso_3_code": "bhh", + "tokenizers": {}, + "children": [], + "node_i": "4436", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Dehwari", + "iso_1_code": null, + "iso_3_code": "deh", + "tokenizers": {}, + "children": [], + "node_i": "4437", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Hazaragi", + "iso_1_code": null, + "iso_3_code": "haz", + "tokenizers": {}, + "children": [], + "node_i": "4438", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Dzhidi", + "iso_1_code": null, + "iso_3_code": "jpr", + "tokenizers": {}, + "children": [], + "node_i": "4439", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Persian, Iranian", + "iso_1_code": "fa", + "iso_3_code": "pes", + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [], + "node_i": "4440", + "scripts": [], + "own_tokenizer": true + }, + { + "name": "Pahlavani", + "iso_1_code": null, + "iso_3_code": "phv", + "tokenizers": {}, + "children": [], + "node_i": "4441", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Dari", + "iso_1_code": "fa", + "iso_3_code": "prs", + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"fa\")", + "original_lang_name": "persian", + "original_lang_code": "fas", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } + }, + "children": [], + "node_i": "4442", + "scripts": [], + "own_tokenizer": true + }, + { + "name": "Tajik", + "iso_1_code": "tg", + "iso_3_code": "tgk", + "tokenizers": { + "Cyrl": { + "full_object": "StanzaTokenizer(\"kmr\")", + "original_lang_name": "northern_kurdish", + "original_lang_code": "kmr", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4443", + "scripts": [ + "Cyrl" + ], + "own_tokenizer": false + } + ], + "node_i": "4434", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Tat", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, + "children": [ + { + "name": "Judeo-Tat", + "iso_1_code": null, + "iso_3_code": "jdt", + "tokenizers": {}, + "children": [], + "node_i": "4445", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Tat, Muslim", + "iso_1_code": null, + "iso_3_code": "ttt", + "tokenizers": {}, + "children": [], + "node_i": "4446", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4444", + "scripts": [], + "own_tokenizer": false + } ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "node_i": "4425", + "scripts": [], + "own_tokenizer": false } - }, - "children": [], - "node_i": "4458", - "scripts": [ - "Latn" ], - "own_tokenizer": true + "node_i": "4363", + "scripts": [], + "own_tokenizer": false } ], - "node_i": "4457", + "node_i": "4340", "scripts": [], "own_tokenizer": false }, { - "name": "Romance", + "name": "Nuristani", "iso_1_code": null, "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"es\")", - "original_lang_name": "spanish", - "original_lang_code": "spa", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + "tokenizers": {}, + "children": [ + { + "name": "Ashkun", + "iso_1_code": null, + "iso_3_code": "ask", + "tokenizers": {}, + "children": [], + "node_i": "4448", + "scripts": [], + "own_tokenizer": false }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false + { + "name": "Kateviri", + "iso_1_code": null, + "iso_3_code": "bsh", + "tokenizers": {}, + "children": [], + "node_i": "4449", + "scripts": [], + "own_tokenizer": false }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + { + "name": "Prasuni", + "iso_1_code": null, + "iso_3_code": "prn", + "tokenizers": {}, + "children": [], + "node_i": "4450", + "scripts": [], + "own_tokenizer": false }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true + { + "name": "Tregami", + "iso_1_code": null, + "iso_3_code": "trm", + "tokenizers": {}, + "children": [], + "node_i": "4451", + "scripts": [], + "own_tokenizer": false }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true + { + "name": "Waigali", + "iso_1_code": null, + "iso_3_code": "wbk", + "tokenizers": {}, + "children": [], + "node_i": "4452", + "scripts": [], + "own_tokenizer": false }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false + { + "name": "Komviri", + "iso_1_code": null, + "iso_3_code": "xvi", + "tokenizers": {}, + "children": [], + "node_i": "4453", + "scripts": [], + "own_tokenizer": false } - }, + ], + "node_i": "4447", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Unclassified", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": {}, "children": [ { - "name": "Eastern", + "name": "Badeshi", "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", + "iso_3_code": "bdz", + "tokenizers": {}, + "children": [], + "node_i": "4455", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4454", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4067", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Italic", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"es\")", + "original_lang_name": "spanish", + "original_lang_code": "spa", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "SpaCyTokenizer(\"ro\")", + "original_lang_name": "romanian", + "original_lang_code": "ron", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Latino-Faliscan", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"la\")", + "original_lang_name": "latin", + "original_lang_code": "lat", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Latin", + "iso_1_code": "la", + "iso_3_code": "lat", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"la\")", + "original_lang_name": "latin", + "original_lang_code": "lat", "scripts": [ - "Sinh" + "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false } }, - "children": [ - { - "name": "Romanian", - "iso_1_code": "ro", - "iso_3_code": "ron", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4461", - "scripts": [ - "Latn", - "Cyrl" - ], - "own_tokenizer": true - }, - { - "name": "Romanian, Istro", - "iso_1_code": null, - "iso_3_code": "ruo", - "tokenizers": {}, - "children": [], - "node_i": "4462", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Aromanian", - "iso_1_code": null, - "iso_3_code": "rup", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4463", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - }, - { - "name": "Romanian, Megleno", - "iso_1_code": null, - "iso_3_code": "ruq", - "tokenizers": {}, - "children": [], - "node_i": "4464", - "scripts": [], - "own_tokenizer": false - } + "children": [], + "node_i": "4458", + "scripts": [ + "Latn" ], - "node_i": "4460", - "scripts": [], - "own_tokenizer": false + "own_tokenizer": true + } + ], + "node_i": "4457", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Romance", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"es\")", + "original_lang_name": "spanish", + "original_lang_code": "spa", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false }, + "Cyrl": { + "full_object": "SpaCyTokenizer(\"ro\")", + "original_lang_name": "romanian", + "original_lang_code": "ron", + "scripts": [ + "Latn", + "Cyrl" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ { - "name": "Italo-Western", + "name": "Eastern", "iso_1_code": null, "iso_3_code": null, "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"es\")", - "original_lang_name": "spanish", - "original_lang_code": "spa", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { "full_object": "SpaCyTokenizer(\"ro\")", "original_lang_name": "romanian", "original_lang_code": "ron", @@ -23050,99 +9878,15 @@ "Cyrl" ], "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", + "macrolanguage": false + }, + "Cyrl": { + "full_object": "SpaCyTokenizer(\"ro\")", + "original_lang_name": "romanian", + "original_lang_code": "ron", "scripts": [ - "Sinh" + "Latn", + "Cyrl" ], "class_name": "SpaCyTokenizer", "macrolanguage": false @@ -23150,21 +9894,11 @@ }, "children": [ { - "name": "Italo-Dalmatian", - "iso_1_code": null, - "iso_3_code": null, + "name": "Romanian", + "iso_1_code": "ro", + "iso_3_code": "ron", "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"it\")", - "original_lang_name": "italian", - "original_lang_code": "ita", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { "full_object": "SpaCyTokenizer(\"ro\")", "original_lang_name": "romanian", "original_lang_code": "ron", @@ -23175,97 +9909,103 @@ "class_name": "SpaCyTokenizer", "macrolanguage": false }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", + "Cyrl": { + "full_object": "SpaCyTokenizer(\"ro\")", + "original_lang_name": "romanian", + "original_lang_code": "ron", "scripts": [ "Latn", - "Gujr" + "Cyrl" ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", + } + }, + "children": [], + "node_i": "4461", + "scripts": [ + "Latn", + "Cyrl" + ], + "own_tokenizer": true + }, + { + "name": "Romanian, Istro", + "iso_1_code": null, + "iso_3_code": "ruo", + "tokenizers": {}, + "children": [], + "node_i": "4462", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Aromanian", + "iso_1_code": null, + "iso_3_code": "rup", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ro\")", + "original_lang_name": "romanian", + "original_lang_code": "ron", "scripts": [ "Latn", - "Beng" + "Cyrl" ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", + } + }, + "children": [], + "node_i": "4463", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Romanian, Megleno", + "iso_1_code": null, + "iso_3_code": "ruq", + "tokenizers": {}, + "children": [], + "node_i": "4464", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4460", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Italo-Western", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"es\")", + "original_lang_name": "spanish", + "original_lang_code": "spa", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Italo-Dalmatian", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"it\")", + "original_lang_name": "italian", + "original_lang_code": "ita", "scripts": [ - "Sinh" + "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false @@ -23276,82 +10016,26 @@ "name": "Dalmatian", "iso_1_code": null, "iso_3_code": "dlm", - "tokenizers": {}, - "children": [], - "node_i": "4467", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Istriot", - "iso_1_code": null, - "iso_3_code": "ist", - "tokenizers": {}, - "children": [], - "node_i": "4468", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Italian", - "iso_1_code": "it", - "iso_3_code": "ita", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"it\")", - "original_lang_name": "italian", - "original_lang_code": "ita", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4469", - "scripts": [ - "Latn" - ], - "own_tokenizer": true - }, - { - "name": "Judeo-Italian", - "iso_1_code": null, - "iso_3_code": "itk", - "tokenizers": {}, - "children": [], - "node_i": "4470", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Napoletano", - "iso_1_code": null, - "iso_3_code": "nap", - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"it\")", - "original_lang_name": "italian", - "original_lang_code": "ita", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4471", - "scripts": [ - "Latn" - ], + "tokenizers": {}, + "children": [], + "node_i": "4467", + "scripts": [], "own_tokenizer": false }, { - "name": "Sicilian", + "name": "Istriot", "iso_1_code": null, - "iso_3_code": "scn", + "iso_3_code": "ist", + "tokenizers": {}, + "children": [], + "node_i": "4468", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Italian", + "iso_1_code": "it", + "iso_3_code": "ita", "tokenizers": { "Latn": { "full_object": "SpaCyTokenizer(\"it\")", @@ -23365,503 +10049,135 @@ } }, "children": [], - "node_i": "4472", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - } - ], - "node_i": "4466", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Western", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"es\")", - "original_lang_name": "spanish", - "original_lang_code": "spa", + "node_i": "4469", "scripts": [ "Latn" ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true + "own_tokenizer": true }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true + { + "name": "Judeo-Italian", + "iso_1_code": null, + "iso_3_code": "itk", + "tokenizers": {}, + "children": [], + "node_i": "4470", + "scripts": [], + "own_tokenizer": false }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ { - "name": "Gallo-Iberian", + "name": "Napoletano", "iso_1_code": null, - "iso_3_code": null, + "iso_3_code": "nap", "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"es\")", - "original_lang_name": "spanish", - "original_lang_code": "spa", + "full_object": "SpaCyTokenizer(\"it\")", + "original_lang_name": "italian", + "original_lang_code": "ita", "scripts": [ "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", + } + }, + "children": [], + "node_i": "4471", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + }, + { + "name": "Sicilian", + "iso_1_code": null, + "iso_3_code": "scn", + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"it\")", + "original_lang_name": "italian", + "original_lang_code": "ita", "scripts": [ - "Sinh" + "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false } }, - "children": [ - { - "name": "Gallo-Romance", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"fr\")", - "original_lang_name": "french", - "original_lang_code": "fra", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Gallo-Italian", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"lij\")", - "original_lang_name": "ligurian", - "original_lang_code": "lij", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", + "children": [], + "node_i": "4472", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + } + ], + "node_i": "4466", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Western", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"es\")", + "original_lang_name": "spanish", + "original_lang_code": "spa", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Gallo-Iberian", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"es\")", + "original_lang_name": "spanish", + "original_lang_code": "spa", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Gallo-Romance", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"fr\")", + "original_lang_name": "french", + "original_lang_code": "fra", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Gallo-Italian", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"lij\")", + "original_lang_name": "ligurian", + "original_lang_code": "lij", "scripts": [ - "Sinh" + "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false @@ -23999,112 +10315,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -24122,112 +10332,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -24245,112 +10349,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -24461,123 +10459,17 @@ "scripts": [], "own_tokenizer": false }, - { - "name": "Southeastern", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"fr\")", - "original_lang_name": "french", - "original_lang_code": "fra", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", + { + "name": "Southeastern", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"fr\")", + "original_lang_name": "french", + "original_lang_code": "fra", "scripts": [ - "Sinh" + "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false @@ -24631,112 +10523,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -24823,247 +10609,35 @@ "node_i": "4475", "scripts": [], "own_tokenizer": false - }, - { - "name": "Ibero-Romance", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"es\")", - "original_lang_name": "spanish", - "original_lang_code": "spa", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "East Iberian", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"ca\")", - "original_lang_name": "catalan", - "original_lang_code": "cat", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", + }, + { + "name": "Ibero-Romance", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"es\")", + "original_lang_name": "spanish", + "original_lang_code": "spa", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "East Iberian", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ca\")", + "original_lang_name": "catalan", + "original_lang_code": "cat", "scripts": [ - "Sinh" + "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false @@ -25112,112 +10686,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -25252,253 +10720,41 @@ "children": [], "node_i": "4502", "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4500", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "West Iberian", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"es\")", - "original_lang_name": "spanish", - "original_lang_code": "spa", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [ - { - "name": "Asturo-Leonese", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"es\")", - "original_lang_name": "spanish", - "original_lang_code": "spa", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", + "own_tokenizer": false + } + ], + "node_i": "4500", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "West Iberian", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"es\")", + "original_lang_name": "spanish", + "original_lang_code": "spa", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [ + { + "name": "Asturo-Leonese", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"es\")", + "original_lang_name": "spanish", + "original_lang_code": "spa", "scripts": [ - "Sinh" + "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false @@ -25570,112 +10826,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -25778,112 +10928,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -25962,134 +11006,28 @@ "node_i": "4503", "scripts": [], "own_tokenizer": false - } - ], - "node_i": "4497", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4474", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Pyrenean-Mozarabic", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"es\")", - "original_lang_name": "spanish", - "original_lang_code": "spa", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", + } + ], + "node_i": "4497", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4474", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Pyrenean-Mozarabic", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"es\")", + "original_lang_name": "spanish", + "original_lang_code": "spa", "scripts": [ - "Sinh" + "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false @@ -26110,112 +11048,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -26225,160 +11057,54 @@ "iso_3_code": "arg", "tokenizers": { "Latn": { - "full_object": "SpaCyTokenizer(\"es\")", - "original_lang_name": "spanish", - "original_lang_code": "spa", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - } - }, - "children": [], - "node_i": "4519", - "scripts": [ - "Latn" - ], - "own_tokenizer": false - } - ], - "node_i": "4518", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4517", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4473", - "scripts": [], - "own_tokenizer": false - } - ], - "node_i": "4465", - "scripts": [], - "own_tokenizer": false - }, - { - "name": "Southern", - "iso_1_code": null, - "iso_3_code": null, - "tokenizers": { - "Latn": { - "full_object": "SpaCyTokenizer(\"es\")", - "original_lang_name": "spanish", - "original_lang_code": "spa", - "scripts": [ - "Latn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" + "full_object": "SpaCyTokenizer(\"es\")", + "original_lang_name": "spanish", + "original_lang_code": "spa", + "scripts": [ + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } + }, + "children": [], + "node_i": "4519", + "scripts": [ + "Latn" + ], + "own_tokenizer": false + } + ], + "node_i": "4518", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4517", + "scripts": [], + "own_tokenizer": false + } ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", + "node_i": "4473", + "scripts": [], + "own_tokenizer": false + } + ], + "node_i": "4465", + "scripts": [], + "own_tokenizer": false + }, + { + "name": "Southern", + "iso_1_code": null, + "iso_3_code": null, + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"es\")", + "original_lang_name": "spanish", + "original_lang_code": "spa", "scripts": [ - "Sinh" + "Latn" ], "class_name": "SpaCyTokenizer", "macrolanguage": false @@ -26399,112 +11125,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [ @@ -26550,112 +11170,6 @@ ], "class_name": "SpaCyTokenizer", "macrolanguage": false - }, - "Cyrl": { - "full_object": "SpaCyTokenizer(\"ro\")", - "original_lang_name": "romanian", - "original_lang_code": "ron", - "scripts": [ - "Latn", - "Cyrl" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Armn": { - "full_object": "SpaCyTokenizer(\"hy\")", - "original_lang_name": "armenian", - "original_lang_code": "hye", - "scripts": [ - "Armn" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Grek": { - "full_object": "SpaCyTokenizer(\"el\")", - "original_lang_name": "greek", - "original_lang_code": "ell", - "scripts": [ - "Grek" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Deva": { - "full_object": "SpaCyTokenizer(\"hi\")", - "original_lang_name": "hindi", - "original_lang_code": "hin", - "scripts": [ - "Latn", - "Deva" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Gujr": { - "full_object": "SpaCyTokenizer(\"gu\")", - "original_lang_name": "gujarati", - "original_lang_code": "guj", - "scripts": [ - "Latn", - "Gujr" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Guru": { - "full_object": "IndicNLPTokenizer(\"pa\")", - "original_lang_name": "punjabi", - "original_lang_code": "pan", - "scripts": [ - "Latn", - "Guru" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": false - }, - "Beng": { - "full_object": "SpaCyTokenizer(\"bn\")", - "original_lang_name": "bengali", - "original_lang_code": "ben", - "scripts": [ - "Latn", - "Beng" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false - }, - "Orya": { - "full_object": "IndicNLPTokenizer(\"or\")", - "original_lang_name": "oriya", - "original_lang_code": "ori", - "scripts": [ - "Latn", - "Orya" - ], - "class_name": "IndicNLPTokenizer", - "macrolanguage": true - }, - "Arab": { - "full_object": "SpaCyTokenizer(\"fa\")", - "original_lang_name": "persian", - "original_lang_code": "fas", - "scripts": [ - "Arab" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": true - }, - "Sinh": { - "full_object": "SpaCyTokenizer(\"si\")", - "original_lang_name": "sinhala", - "original_lang_code": "sin", - "scripts": [ - "Sinh" - ], - "class_name": "SpaCyTokenizer", - "macrolanguage": false } }, "children": [