dummy_dataset / validation /dataset_info.json

push

d42a80a over 4 years ago

6.51 kB

	{
	"builder_name": "oscar",
	"citation": "@inproceedings{ortiz-suarez-etal-2020-monolingual,\n title = \"A Monolingual Approach to Contextualized Word Embeddings for Mid-Resource Languages\",\n author = \"Ortiz Su{'a}rez, Pedro Javier and\n Romary, Laurent and\n Sagot, Benoit\",\n booktitle = \"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2020\",\n address = \"Online\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/2020.acl-main.156\",\n pages = \"1703--1714\",\n abstract = \"We use the multilingual OSCAR corpus, extracted from Common Crawl via language classification, filtering and cleaning, to train monolingual contextualized word embeddings (ELMo) for five mid-resource languages. We then compare the performance of OSCAR-based and Wikipedia-based ELMo embeddings for these languages on the part-of-speech tagging and parsing tasks. We show that, despite the noise in the Common-Crawl-based OSCAR data, embeddings trained on OSCAR perform much better than monolingual embeddings trained on Wikipedia. They actually equal or improve the current state of the art in tagging and parsing for all five languages. In particular, they also improve over multilingual Wikipedia-based contextual embeddings (multilingual BERT), which almost always constitutes the previous state of the art, thereby showing that the benefit of a larger, more diverse corpus surpasses the cross-lingual benefit of multilingual embedding architectures.\",\n}\n\n@inproceedings{OrtizSuarezSagotRomary2019,\n author = {Pedro Javier {Ortiz Su{'a}rez} and Benoit Sagot and Laurent Romary},\n title = {Asynchronous pipelines for processing huge corpora on medium to low resource infrastructures},\n series = {Proceedings of the Workshop on Challenges in the Management of Large Corpora (CMLC-7) 2019. Cardiff, 22nd July 2019},\n editor = {Piotr Ba\u0144ski and Adrien Barbaresi and Hanno Biber and Evelyn Breiteneder and Simon Clematide and Marc Kupietz and Harald L{\"u}ngen and Caroline Iliadi},\n publisher = {Leibniz-Institut f{\"u}r Deutsche Sprache},\n address = {Mannheim},\n doi = {10.14618/ids-pub-9021},\n url = {http://nbn-resolving.de/urn:nbn:de:bsz:mh39-90215},\n pages = {9 -- 16},\n year = {2019},\n abstract = {Common Crawl is a considerably large, heterogeneous multilingual corpus comprised of crawled documents from the internet, surpassing 20TB of data and distributed as a set of more than 50 thousand plain text files where each contains many documents written in a wide variety of languages. Even though each document has a metadata block associated to it, this data lacks any information about the language in which each document is written, making it extremely difficult to use Common Crawl for monolingual applications. We propose a general, highly parallel, multithreaded pipeline to clean and classify Common Crawl by language; we specifically design it so that it runs efficiently on medium to low resource infrastructures where I/O speeds are the main constraint. We develop the pipeline so that it can be easily reapplied to any kind of heterogeneous corpus and so that it can be parameterised to a wide range of infrastructures. We also distribute a 6.3TB version of Common Crawl, filtered, classified by language, shuffled at line level in order to avoid copyright issues, and ready to be used for NLP applications.},\n language = {en}\n}\n",
	"config_name": "unshuffled_deduplicated_als",
	"dataset_size": 2915912,
	"description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.",
	"download_checksums": {
	"https://s3.amazonaws.com/datasets.huggingface.co/oscar/1.0/unshuffled/deduplicated/als/als_sha256.txt": {
	"num_bytes": 82,
	"checksum": "75c1fb069c40aec1c0a50b4b0bb076a8d1cbcaa71d9592318930a48ea0f6a813"
	},
	"https://s3.amazonaws.com/datasets.huggingface.co/oscar/1.0/unshuffled/deduplicated/als/als_dedup.txt.gz": {
	"num_bytes": 1263212,
	"checksum": "7e5eff6f004740a493ca703e10172449ae2a27ae507fb255f75c99c939a30ed1"
	}
	},
	"download_size": 1263294,
	"features": {
	"attention_mask": {
	"feature": {
	"dtype": "int8",
	"id": null,
	"_type": "Value"
	},
	"length": -1,
	"id": null,
	"_type": "Sequence"
	},
	"input_ids": {
	"feature": {
	"dtype": "int32",
	"id": null,
	"_type": "Value"
	},
	"length": -1,
	"id": null,
	"_type": "Sequence"
	},
	"labels": {
	"feature": {
	"dtype": "int64",
	"id": null,
	"_type": "Value"
	},
	"length": -1,
	"id": null,
	"_type": "Sequence"
	}
	},
	"homepage": "https://oscar-corpus.com",
	"license": "\n These data are released under this licensing scheme\n We do not own any of the text from which these data has been extracted.\n We license the actual packaging of these data under the Creative Commons CC0 license (\"no rights reserved\") http://creativecommons.org/publicdomain/zero/1.0/\n To the extent possible under law, Inria has waived all copyright and related or neighboring rights to OSCAR\n This work is published from: France.\n\n Should you consider that our data contains material that is owned by you and should therefore not be reproduced here, please:\n * Clearly identify yourself, with detailed contact data such as an address, telephone number or email address at which you can be contacted.\n * Clearly identify the copyrighted work claimed to be infringed.\n * Clearly identify the material that is claimed to be infringing and information reasonably sufficient to allow us to locate the material.\n\n We will comply to legitimate requests by removing the affected sources from the next release of the corpus. ",
	"post_processed": null,
	"post_processing_size": null,
	"size_in_bytes": 4179206,
	"splits": {
	"train": {
	"name": "train",
	"num_bytes": 2915912,
	"num_examples": 4518,
	"dataset_name": "oscar"
	}
	},
	"supervised_keys": null,
	"task_templates": null,
	"version": {
	"version_str": "1.0.0",
	"description": null,
	"major": 1,
	"minor": 0,
	"patch": 0
	}
	}