# Data-mixture configuration, written as plain Python literals.
#
# NOTE(review): the number attached to each dataset key looks like a
# per-dataset weight/multiplier — confirm its exact meaning against the code
# that consumes this file. Values are kept exactly as written, including the
# mix of int (2) and float (0.5) literals.

prefix = "munin-open"
tokenizer_name = "common-pile/comma-v0.1-2t"

# Subsets wired in as sources["dyna"]["train"]; every entry carries 1.0.
# NOTE(review): "skat" precedes "retspraksis", which breaks the otherwise
# alphabetical order. The order is left untouched because dict insertion
# order may be observable to the consumer.
dyna_train = {
    "adl": 1.0,
    "ai-aktindsigt": 1.0,
    "botxt": 1.0,
    "cellar": 1.0,
    "dannet": 1.0,
    "danske-taler": 1.0,
    "domsdatabasen": 1.0,
    "enevaeldens_nyheder": 1.0,
    "ep": 1.0,
    "eur-lex-sum-da": 1.0,
    "fm-udgivelser": 1.0,
    "ft": 1.0,
    "grundtvig": 1.0,
    "gutenberg": 1.0,
    "health_hovedstaden": 1.0,
    "hest": 1.0,
    "historical-danish-handwriting": 1.0,
    "memo": 1.0,
    "miljoeportalen": 1.0,
    "naat": 1.0,
    "ncc_books": 1.0,
    "ncc_maalfrid": 1.0,
    "ncc_newspaper": 1.0,
    "ncc_parliament": 1.0,
    "nota": 1.0,
    "opensubtitles": 1.0,
    "relig": 1.0,
    "retsinformationdk": 1.0,
    "skat": 1.0,
    "retspraksis": 1.0,
    "spont": 1.0,
    "tv2r": 1.0,
    "wiki-comments": 1.0,
    "wikibooks": 1.0,
    "wikipedia": 1.0,
    "wikisource": 1.0,
}

# Subsets wired in as sources["dyna"]["test"].
dyna_test = {
    "depbank": 1.0,
    "jvj": 1.0,
    "nordjyllandnews": 1.0,
    "synne": 1.0,
}

# Subsets wired in as sources["cp"]["train"]; values range from 0.1 to 2.
cp_train = {
    "arxiv_papers": 0.5,
    "cccc": 0.3,
    "data_provenance_initiative": 2,
    "doab": 2,
    "foodista": 2,
    "libretexts": 2,
    "news": 2,
    "oercommons": 2,
    "peS2o": 0.1,
    "pressbooks": 2,
    "public_domain_review": 2,
    "python_enhancement_proposals": 2,
    "stackexchange": 0.25,
    "stackv2_edu": 0.1,
    "wikimedia": 0.4,
}

# One entry per data source. Each "uri" contains a "{key}" placeholder.
# NOTE(review): presumably "{key}" is filled with the subset names from the
# "train"/"test" dicts, and "shards"/"shard_index" select one slice out of
# "shards" — verify against the loader.
sources = {
    "dyna": {
        "uri": "hf://datasets/danish-foundation-models/danish-dynaword/data/{key}/*.parquet",
        "format": "parquet",
        "shards": 1,
        "shard_index": 0,
        "train": dyna_train,
        "test": dyna_test,
    },
    "cp": {
        "uri": "hf://datasets/common-pile/comma_v0.1_training_dataset/{key}/*.jsonl.gz",
        "format": "json",
        "shards": 16,
        "shard_index": 2,
        "train": cp_train,
        # No held-out subsets are listed for this source.
        "test": {},
    },
}