import os def get_default_paths(): """ Gets base paths for the data directories If DATA_ROOT is set in the environment, use that as the root otherwise use "./data" individual paths can also be set in the environment """ DATA_ROOT = os.environ.get("DATA_ROOT", "data") defaults = { "TOKENIZE_DATA_DIR": DATA_ROOT + "/tokenize", "MWT_DATA_DIR": DATA_ROOT + "/mwt", "LEMMA_DATA_DIR": DATA_ROOT + "/lemma", "POS_DATA_DIR": DATA_ROOT + "/pos", "DEPPARSE_DATA_DIR": DATA_ROOT + "/depparse", "ETE_DATA_DIR": DATA_ROOT + "/ete", "NER_DATA_DIR": DATA_ROOT + "/ner", "CHARLM_DATA_DIR": DATA_ROOT + "/charlm", "SENTIMENT_DATA_DIR": DATA_ROOT + "/sentiment", "CONSTITUENCY_DATA_DIR": DATA_ROOT + "/constituency", "COREF_DATA_DIR": DATA_ROOT + "/coref", "LEMMA_CLASSIFIER_DATA_DIR": DATA_ROOT + "/lemma_classifier", # Set directories to store external word vector data "WORDVEC_DIR": "extern_data/wordvec", # TODO: not sure what other people actually have # TODO: also, could make this automatically update to the latest "UDBASE": "extern_data/ud2/ud-treebanks-v2.11", "UDBASE_GIT": "extern_data/ud2/git", "NERBASE": "extern_data/ner", "CONSTITUENCY_BASE": "extern_data/constituency", "SENTIMENT_BASE": "extern_data/sentiment", "COREF_BASE": "extern_data/coref", # there's a stanford github, stanfordnlp/handparsed-treebank, # with some data for different languages "HANDPARSED_DIR": "extern_data/handparsed-treebank", # directory with the contents of https://nlp.stanford.edu/projects/stanza/bio/ # on the cluster, for example, /u/nlp/software/stanza/bio_ud "BIO_UD_DIR": "extern_data/bio", # data root for other general input files, such as VI_VLSP "STANZA_EXTERN_DIR": "extern_data", } paths = { "DATA_ROOT" : DATA_ROOT } for k, v in defaults.items(): paths[k] = os.environ.get(k, v) return paths