# Data-mix configuration: a run prefix, a tokenizer identifier, per-dataset
# weight tables, and a `sources` table that ties each weight table to a
# dataset URI template.
# NOTE(review): how these values are consumed is not visible in this file;
# comments about meaning below are assumptions to confirm against the loader.

prefix = "munin-open"
tokenizer_name = "common-pile/comma-v0.1-2t"

# Weights for the train split of the "dyna" source, keyed by subset name.
# Presumably each key is substituted for `{key}` in the source URI — TODO confirm.
dyna_train = {
    "adl": 1.0,
    "ai-aktindsigt": 1.0,
    "botxt": 1.0,
    "cellar": 1.0,
    "dannet": 1.0,
    "danske-taler": 1.0,
    "domsdatabasen": 1.0,
    "enevaeldens_nyheder": 1.0,
    "ep": 1.0,
    "eur-lex-sum-da": 1.0,
    "fm-udgivelser": 1.0,
    "ft": 1.0,
    "grundtvig": 1.0,
    "gutenberg": 1.0,
    "health_hovedstaden": 1.0,
    "hest": 1.0,
    "historical-danish-handwriting": 1.0,
    "memo": 1.0,
    "miljoeportalen": 1.0,
    "naat": 1.0,
    "ncc_books": 1.0,
    "ncc_maalfrid": 1.0,
    "ncc_newspaper": 1.0,
    "ncc_parliament": 1.0,
    "nota": 1.0,
    "opensubtitles": 1.0,
    "relig": 1.0,
    "retsinformationdk": 1.0,
    "skat": 1.0,
    "retspraksis": 1.0,
    "spont": 1.0,
    "tv2r": 1.0,
    "wiki-comments": 1.0,
    "wikibooks": 1.0,
    "wikipedia": 1.0,
    "wikisource": 1.0,
}

# Weights for the test split of the "dyna" source; these keys do not appear
# in `dyna_train`.
dyna_test = {
    "depbank": 1.0,
    "jvj": 1.0,
    "nordjyllandnews": 1.0,
    "synne": 1.0,
}

# Weights for the train split of the "cp" source.
# NOTE(review): values mix ints (2) and floats (0.5, 0.1, ...). The literal
# types are kept exactly as written in case the consumer distinguishes them;
# confirm whether that distinction is intentional.
cp_train = {
    "arxiv_papers": 0.5,
    "cccc": 0.3,
    "data_provenance_initiative": 2,
    "doab": 2,
    "foodista": 2,
    "libretexts": 2,
    "news": 2,
    "oercommons": 2,
    "peS2o": 0.1,
    "pressbooks": 2,
    "public_domain_review": 2,
    "python_enhancement_proposals": 2,
    "stackexchange": 0.25,
    "stackv2_edu": 0.1,
    "wikimedia": 0.4,
}

# One entry per data source. Each entry carries a URI template containing a
# `{key}` placeholder, a file format name, shard settings, and the train/test
# weight tables defined above.
# NOTE(review): "shards"/"shard_index" presumably select one slice out of N
# (here slice 2 of 16 for "cp") — verify against the loader.
sources = {
    "dyna": {
        "uri": "hf://datasets/danish-foundation-models/danish-dynaword/data/{key}/*.parquet",
        "format": "parquet",
        "shards": 1,
        "shard_index": 0,
        "train": dyna_train,
        "test": dyna_test,
    },
    "cp": {
        "uri": "hf://datasets/common-pile/comma_v0.1_training_dataset/{key}/*.jsonl.gz",
        "format": "json",
        "shards": 16,
        "shard_index": 2,
        "train": cp_train,
        # No test subsets are configured for this source.
        "test": {},
    },
}