#!/bin/bash
#
# Download word vector files for all supported languages. Run as:
#   ./download_vectors.sh WORDVEC_DIR
# where WORDVEC_DIR is the target directory to store the word vector data.

# check arguments
: ${1?"Usage: $0 WORDVEC_DIR"}
WORDVEC_DIR=$1

# constants and functions
CONLL17_URL="https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-1989/word-embeddings-conll17.tar"
CONLL17_TAR="word-embeddings-conll17.tar"
FASTTEXT_BASE_URL="https://dl.fbaipublicfiles.com/fasttext/vectors-wiki"

# TODO: some fasttext vectors are now at
#   https://fasttext.cc/docs/en/pretrained-vectors.html
# There are also vectors for Welsh, Icelandic, Thai, and Sanskrit at
#   https://fasttext.cc/docs/en/crawl-vectors.html
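# Sketch (defined but not invoked anywhere below): the crawl vectors mentioned
# above follow the cc.<code>.300.vec.gz naming scheme under vectors-crawl; the
# helper below mirrors prepare_fasttext_vec for them. The base URL, language
# codes, and output layout are assumptions based on the fasttext docs above.
CRAWL_BASE_URL="https://dl.fbaipublicfiles.com/fasttext/vectors-crawl"
function prepare_crawl_vec() {
    lang=$1
    code=$2
    cwd=$(pwd)
    mkdir -p $lang
    cd $lang
    wget "${CRAWL_BASE_URL}/cc.${code}.300.vec.gz" -O ${code}.vectors.gz
    gunzip ${code}.vectors.gz     # leaves ${code}.vectors
    xz ${code}.vectors            # recompress as .xz to match the other languages
    cd $cwd
}
# e.g. prepare_crawl_vec Welsh cy ; prepare_crawl_vec Icelandic is
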
# We get the Armenian word vectors from here:
#   https://github.com/ispras-texterra/word-embeddings-eval-hy
#   https://arxiv.org/ftp/arxiv/papers/1906/1906.03134.pdf
# In particular, the glove model (dogfooding):
#   https://at.ispras.ru/owncloud/index.php/s/pUUiS1l1jGKNax3/download
# These vectors improved F1 by about 1 on various tasks for Armenian
# and had much better coverage of Western Armenian.
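
# Sketch (defined but not invoked): one way to fetch the Armenian glove vectors
# from the owncloud link above into the same <Language>/<code>.vectors.xz layout
# used for the fasttext languages. Whether the download is a plain-text .vec
# file or an archive is an assumption; inspect it before compressing.
ARMENIAN_GLOVE_URL="https://at.ispras.ru/owncloud/index.php/s/pUUiS1l1jGKNax3/download"
function prepare_armenian_vec() {
    cwd=$(pwd)
    mkdir -p Armenian
    cd Armenian
    wget $ARMENIAN_GLOVE_URL -O hy.vectors
    xz hy.vectors    # assumes a plain-text vector file; unpack first if it is an archive
    cd $cwd
}
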
# For Erzya, we use word vectors available here:
#   https://github.com/mokha/semantics
# @incollection{Alnajjar_2021,
#   doi = {10.31885/9789515150257.24},
#   url = {https://doi.org/10.31885%2F9789515150257.24},
#   year = 2021,
#   month = {mar},
#   publisher = {University of Helsinki},
#   pages = {275--288},
#   author = {Khalid Alnajjar},
#   title = {When Word Embeddings Become Endangered},
#   booktitle = {Multilingual Facilitation}
# }
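
# Sketch (defined but not invoked): the Erzya vectors live in the repository
# above rather than behind one stable file URL, so one option is to clone the
# repository and pick the vector file by hand; the clone location and the
# "myv" file name below are assumptions.
function prepare_erzya_vec() {
    mkdir -p Erzya
    git clone https://github.com/mokha/semantics Erzya/semantics-repo
    # copy the chosen vector file from the clone to Erzya/myv.vectors,
    # then compress it:  xz Erzya/myv.vectors
}

# Parallel arrays: FASTTEXT_CODE is the code used in the fasttext file names,
# LOCAL_CODE is the code used for the stored .vectors.xz file (they differ for
# Kurmanji and North Sami).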
declare -a FASTTEXT_LANG=("Afrikaans" "Breton" "Buryat" "Chinese" "Faroese" "Gothic" "Kurmanji" "North_Sami" "Serbian" "Upper_Sorbian")
declare -a FASTTEXT_CODE=("af" "br" "bxr" "zh" "fo" "got" "ku" "se" "sr" "hsb")
declare -a LOCAL_CODE=("af" "br" "bxr" "zh" "fo" "got" "kmr" "sme" "sr" "hsb")

color_green='\033[32;1m'
color_clear='\033[0m' # No Color
function msg() {
    echo -e "${color_green}$@${color_clear}"
}

function prepare_fasttext_vec() {
    lang=$1
    ftcode=$2
    code=$3

    cwd=$(pwd)
    mkdir -p $lang
    cd $lang

    msg "=== Downloading fasttext vector file for ${lang}..."
    url="${FASTTEXT_BASE_URL}/wiki.${ftcode}.vec"
    fname="${code}.vectors"
    wget $url -O $fname

    msg "=== Compressing file ${fname}..."
    xz $fname

    cd $cwd
}
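# For example, prepare_fasttext_vec Serbian sr sr downloads wiki.sr.vec into
# Serbian/ and leaves it compressed as Serbian/sr.vectors.xz.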

# do the actual work
mkdir -p $WORDVEC_DIR
cd $WORDVEC_DIR

msg "Downloading CONLL17 word vectors. This may take a while..."
wget $CONLL17_URL -O $CONLL17_TAR
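# If the download is interrupted, it can usually be resumed instead of
# restarted (sketch; depends on the server honoring range requests, and the
# URL's basename already matches $CONLL17_TAR):
#   wget -c $CONLL17_URL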
| msg "Extracting CONLL17 word vector files..." | |
| tar -xvf $CONLL17_TAR | |
| rm $CONLL17_TAR | |
| msg "Preparing fasttext vectors for the rest of the languages." | |
| for (( i=0; i<${#FASTTEXT_LANG[*]}; ++i)); do | |
| prepare_fasttext_vec ${FASTTEXT_LANG[$i]} ${FASTTEXT_CODE[$i]} ${LOCAL_CODE[$i]} | |
| done | |

# handle Old French by reusing the French vectors
mkdir -p Old_French
ln -s ../French/fr.vectors.xz Old_French/fro.vectors.xz

msg "All done."