Text Generation
Safetensors
Danish
English
llama
peter-sk's picture
Super-squash branch 'main' using huggingface_hub
255c557
# NOTE(review): the settings in this file use Python-style dict literals
# ("key": value) and bare-name references between settings, which strict TOML
# does not allow -- confirm which loader actually reads this file before
# reformatting anything.

# Name prefix for this configuration.
# NOTE(review): presumably used to name run outputs/artifacts -- confirm
# against the consumer.
prefix = "munin-open"
# Tokenizer identifier.
# NOTE(review): looks like a Hugging Face Hub repo id -- confirm how it is
# resolved.
tokenizer_name = "common-pile/comma-v0.1-2t"
# Training subsets for the "dyna" source: maps a subset key to a number.
# Every subset listed here carries the same value, 1.0.
# NOTE(review): each key presumably fills the `{key}` placeholder of the
# source's "uri", and the number is presumably a sampling weight / fraction --
# confirm both against the loader.
# NOTE(review): entries are alphabetical except "retspraksis" (listed after
# "skat"); left as-is because the mapping order may matter to the consumer.
dyna_train = {
"adl": 1.0,
"ai-aktindsigt": 1.0,
"botxt": 1.0,
"cellar": 1.0,
"dannet": 1.0,
"danske-taler": 1.0,
"domsdatabasen": 1.0,
"enevaeldens_nyheder": 1.0,
"ep": 1.0,
"eur-lex-sum-da": 1.0,
"fm-udgivelser": 1.0,
"ft": 1.0,
"grundtvig": 1.0,
"gutenberg": 1.0,
"health_hovedstaden": 1.0,
"hest": 1.0,
"historical-danish-handwriting": 1.0,
"memo": 1.0,
"miljoeportalen": 1.0,
"naat": 1.0,
"ncc_books": 1.0,
"ncc_maalfrid": 1.0,
"ncc_newspaper": 1.0,
"ncc_parliament": 1.0,
"nota": 1.0,
"opensubtitles": 1.0,
"relig": 1.0,
"retsinformationdk": 1.0,
"skat": 1.0,
"retspraksis": 1.0,
"spont": 1.0,
"tv2r": 1.0,
"wiki-comments": 1.0,
"wikibooks": 1.0,
"wikipedia": 1.0,
"wikisource": 1.0,
}
# Test subsets for the "dyna" source: maps a subset key to a number.
# All four entries carry the value 1.0, and none of these keys appears in the
# "dyna" training mapping.
# NOTE(review): the number is presumably a sampling weight / fraction --
# confirm against the loader.
dyna_test = {
"depbank": 1.0,
"jvj": 1.0,
"nordjyllandnews": 1.0,
"synne": 1.0,
}
# Training subsets for the "cp" source: maps a subset key to a number.
# Values range from 0.1 to 2. Whole-number values are written as integers (2)
# while the rest are floats; the literal types are kept exactly as written.
# NOTE(review): the number is presumably a per-subset sampling weight (below 1
# = down-sampled, above 1 = repeated) -- confirm against the loader, and
# whether it distinguishes int from float.
cp_train = {
"arxiv_papers": 0.5,
"cccc": 0.3,
"data_provenance_initiative": 2,
"doab": 2,
"foodista": 2,
"libretexts": 2,
"news": 2,
"oercommons": 2,
"peS2o": 0.1,
"pressbooks": 2,
"public_domain_review": 2,
"python_enhancement_proposals": 2,
"stackexchange": 0.25,
"stackv2_edu": 0.1,
"wikimedia": 0.4,
}
# Data-source descriptors, keyed by a short source name ("dyna", "cp").
# Each descriptor has the same six fields:
#   uri         - path pattern containing a `{key}` placeholder and a file glob
#   format      - file-format name ("parquet" or "json")
#   shards      - integer shard count
#   shard_index - integer shard selector
#   train       - subset mapping used for training
#   test        - subset mapping used for testing (empty for "cp")
# NOTE(review): `{key}` is presumably replaced by each key of the train/test
# mappings, and `shard_index` presumably picks one of `shards` partitions
# (0-based) -- confirm both against the loader.
sources = {
# "dyna": parquet files, a single shard, with both train and test subsets.
"dyna": {
"uri": "hf://datasets/danish-foundation-models/danish-dynaword/data/{key}/*.parquet",
"format": "parquet",
"shards": 1,
"shard_index": 0,
"train": dyna_train,
"test": dyna_test,
},
# "cp": gzip-compressed JSON-lines files declared as format "json";
# shards = 16 with shard_index = 2, and no test subsets.
"cp": {
"uri": "hf://datasets/common-pile/comma_v0.1_training_dataset/{key}/*.jsonl.gz",
"format": "json",
"shards": 16,
"shard_index": 2,
"train": cp_train,
"test": {},
},
}