#!/bin/bash
#
# Download word vector files for all supported languages. Run as:
#   ./download_vectors.sh WORDVEC_DIR
# where WORDVEC_DIR is the target directory to store the word vector data.

# check arguments
: ${1?"Usage: $0 WORDVEC_DIR"}
WORDVEC_DIR=$1

# constants and functions
CONLL17_URL="https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-1989/word-embeddings-conll17.tar"
CONLL17_TAR="word-embeddings-conll17.tar"
FASTTEXT_BASE_URL="https://dl.fbaipublicfiles.com/fasttext/vectors-wiki"

# TODO: some fasttext vectors are now at
#   https://fasttext.cc/docs/en/pretrained-vectors.html
# There are also vectors for Welsh, Icelandic, Thai, and Sanskrit at
#   https://fasttext.cc/docs/en/crawl-vectors.html
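# Sketch (defined but not invoked anywhere below): the crawl vectors mentioned
# above follow the cc.<code>.300.vec.gz naming scheme under vectors-crawl; the
# helper below mirrors prepare_fasttext_vec for them. The base URL, language
# codes, and output layout are assumptions based on the fasttext docs above.
CRAWL_BASE_URL="https://dl.fbaipublicfiles.com/fasttext/vectors-crawl"
function prepare_crawl_vec() {
    lang=$1
    code=$2
    cwd=$(pwd)
    mkdir -p $lang
    cd $lang
    wget "${CRAWL_BASE_URL}/cc.${code}.300.vec.gz" -O ${code}.vectors.gz
    gunzip ${code}.vectors.gz     # leaves ${code}.vectors
    xz ${code}.vectors            # recompress as .xz to match the other languages
    cd $cwd
}
# e.g. prepare_crawl_vec Welsh cy ; prepare_crawl_vec Icelandic is
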
# We get the Armenian word vectors from here:
#   https://github.com/ispras-texterra/word-embeddings-eval-hy
#   https://arxiv.org/ftp/arxiv/papers/1906/1906.03134.pdf
# In particular, the glove model (dogfooding):
#   https://at.ispras.ru/owncloud/index.php/s/pUUiS1l1jGKNax3/download
# These vectors improved F1 by about 1 on various tasks for Armenian
# and had much better coverage of Western Armenian.
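
# Sketch (defined but not invoked): one way to fetch the Armenian glove vectors
# from the owncloud link above into the same <Language>/<code>.vectors.xz layout
# used for the fasttext languages. Whether the download is a plain-text .vec
# file or an archive is an assumption; inspect it before compressing.
ARMENIAN_GLOVE_URL="https://at.ispras.ru/owncloud/index.php/s/pUUiS1l1jGKNax3/download"
function prepare_armenian_vec() {
    cwd=$(pwd)
    mkdir -p Armenian
    cd Armenian
    wget $ARMENIAN_GLOVE_URL -O hy.vectors
    xz hy.vectors    # assumes a plain-text vector file; unpack first if it is an archive
    cd $cwd
}
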
# For Erzya, we use word vectors available here:
#   https://github.com/mokha/semantics
# @incollection{Alnajjar_2021,
#   doi = {10.31885/9789515150257.24},
#   url = {https://doi.org/10.31885%2F9789515150257.24},
#   year = 2021,
#   month = {mar},
#   publisher = {University of Helsinki},
#   pages = {275--288},
#   author = {Khalid Alnajjar},
#   title = {When Word Embeddings Become Endangered},
#   booktitle = {Multilingual Facilitation}
# }
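
# Sketch (defined but not invoked): the Erzya vectors live in the repository
# above rather than behind one stable file URL, so one option is to clone the
# repository and pick the vector file by hand; the clone location and the
# "myv" file name below are assumptions.
function prepare_erzya_vec() {
    mkdir -p Erzya
    git clone https://github.com/mokha/semantics Erzya/semantics-repo
    # copy the chosen vector file from the clone to Erzya/myv.vectors,
    # then compress it:  xz Erzya/myv.vectors
}

# Parallel arrays: FASTTEXT_CODE is the code used in the fasttext file names,
# LOCAL_CODE is the code used for the stored .vectors.xz file (they differ for
# Kurmanji and North Sami).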
declare -a FASTTEXT_LANG=("Afrikaans" "Breton" "Buryat" "Chinese" "Faroese" "Gothic" "Kurmanji" "North_Sami" "Serbian" "Upper_Sorbian")
declare -a FASTTEXT_CODE=("af" "br" "bxr" "zh" "fo" "got" "ku" "se" "sr" "hsb")
declare -a LOCAL_CODE=("af" "br" "bxr" "zh" "fo" "got" "kmr" "sme" "sr" "hsb")

color_green='\033[32;1m'
color_clear='\033[0m' # No Color
function msg() {
    echo -e "${color_green}$@${color_clear}"
}

function prepare_fasttext_vec() {
    lang=$1
    ftcode=$2
    code=$3

    cwd=$(pwd)
    mkdir -p $lang
    cd $lang

    msg "=== Downloading fasttext vector file for ${lang}..."
    url="${FASTTEXT_BASE_URL}/wiki.${ftcode}.vec"
    fname="${code}.vectors"
    wget $url -O $fname

    msg "=== Compressing file ${fname}..."
    xz $fname

    cd $cwd
}
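# For example, prepare_fasttext_vec Serbian sr sr downloads wiki.sr.vec into
# Serbian/ and leaves it compressed as Serbian/sr.vectors.xz.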

# do the actual work
mkdir -p $WORDVEC_DIR
cd $WORDVEC_DIR

msg "Downloading CONLL17 word vectors. This may take a while..."
wget $CONLL17_URL -O $CONLL17_TAR
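# If the download is interrupted, it can usually be resumed instead of
# restarted (sketch; depends on the server honoring range requests, and the
# URL's basename already matches $CONLL17_TAR):
#   wget -c $CONLL17_URL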
| msg "Extracting CONLL17 word vector files..." | |
| tar -xvf $CONLL17_TAR | |
| rm $CONLL17_TAR | |
| msg "Preparing fasttext vectors for the rest of the languages." | |
| for (( i=0; i<${#FASTTEXT_LANG[*]}; ++i)); do | |
| prepare_fasttext_vec ${FASTTEXT_LANG[$i]} ${FASTTEXT_CODE[$i]} ${LOCAL_CODE[$i]} | |
| done | |

# handle Old French by reusing the French vectors
mkdir -p Old_French
ln -s ../French/fr.vectors.xz Old_French/fro.vectors.xz

msg "All done."