# stanza-digphil/stanza/utils/default_paths.py
# Author: Albin Thörn Cleland
# Commit: Clean initial commit with LFS
# Revision: 19b8775
import os
def get_default_paths():
    """
    Build the mapping of data directory names to filesystem paths.

    The root for the per-task data directories is read from the
    DATA_ROOT environment variable, falling back to "data" when it is
    not set.  Every other entry can be overridden individually by an
    environment variable with the same name as its key; otherwise the
    default listed below is used.

    Returns a dict from path name (such as "NER_DATA_DIR") to path
    string.  The dict always includes "DATA_ROOT" itself.
    """
    data_root = os.environ.get("DATA_ROOT", "data")

    # Per-task directories: the default is the data root plus this suffix
    root_relative = (
        ("TOKENIZE_DATA_DIR", "/tokenize"),
        ("MWT_DATA_DIR", "/mwt"),
        ("LEMMA_DATA_DIR", "/lemma"),
        ("POS_DATA_DIR", "/pos"),
        ("DEPPARSE_DATA_DIR", "/depparse"),
        ("ETE_DATA_DIR", "/ete"),
        ("NER_DATA_DIR", "/ner"),
        ("CHARLM_DATA_DIR", "/charlm"),
        ("SENTIMENT_DATA_DIR", "/sentiment"),
        ("CONSTITUENCY_DATA_DIR", "/constituency"),
        ("COREF_DATA_DIR", "/coref"),
        ("LEMMA_CLASSIFIER_DATA_DIR", "/lemma_classifier"),
    )

    # Locations whose defaults do not depend on the data root
    fixed_locations = (
        # Set directories to store external word vector data
        ("WORDVEC_DIR", "extern_data/wordvec"),

        # TODO: not sure what other people actually have
        # TODO: also, could make this automatically update to the latest
        ("UDBASE", "extern_data/ud2/ud-treebanks-v2.11"),
        ("UDBASE_GIT", "extern_data/ud2/git"),

        ("NERBASE", "extern_data/ner"),
        ("CONSTITUENCY_BASE", "extern_data/constituency"),
        ("SENTIMENT_BASE", "extern_data/sentiment"),
        ("COREF_BASE", "extern_data/coref"),

        # there's a stanford github, stanfordnlp/handparsed-treebank,
        # with some data for different languages
        ("HANDPARSED_DIR", "extern_data/handparsed-treebank"),

        # directory with the contents of https://nlp.stanford.edu/projects/stanza/bio/
        # on the cluster, for example, /u/nlp/software/stanza/bio_ud
        ("BIO_UD_DIR", "extern_data/bio"),

        # data root for other general input files, such as VI_VLSP
        ("STANZA_EXTERN_DIR", "extern_data"),
    )

    fallbacks = [(name, data_root + suffix) for name, suffix in root_relative]
    fallbacks.extend(fixed_locations)

    # An environment variable named after the key takes precedence over the fallback
    resolved = {"DATA_ROOT": data_root}
    resolved.update((name, os.environ.get(name, fallback)) for name, fallback in fallbacks)
    return resolved