| #!/bin/bash
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Root of the downloaded-data tree, relative to the directory the script is
# launched from.
TOPDIR='./data'
# Directory the script was started in (TOPDIR is resolved relative to it).
RUNDIR=${PWD}

# Create every download directory in one call. `mkdir -p` also creates the
# missing parents (${TOPDIR} and ${TOPDIR}/raw_data) and succeeds when a
# directory already exists, so no directory change is needed and re-running
# the script is safe. Abort if the tree cannot be created, since nothing
# after this point can work without it.
mkdir -p \
  "${TOPDIR}/raw_data/pretrained_embeddings" \
  "${TOPDIR}/raw_data/unlabeled_data" \
  "${TOPDIR}/raw_data/chunk" \
  || { echo "error: cannot create data directories under ${TOPDIR}" >&2; exit 1; }
|
|
|
# Download the GloVe 6B archive and unpack it inside
# ${TOPDIR}/raw_data/pretrained_embeddings.
echo "Preparing GloVe embeddings"
# The work runs in a subshell so the directory change cannot leak into the
# rest of the script, and the steps are chained with && so a failed cd never
# lets curl/unzip run in the wrong directory.
#   curl -f : fail on HTTP errors instead of saving the error page as the zip
#   unzip -o: overwrite without prompting, so a re-run does not hang
(
  cd "${TOPDIR}/raw_data/pretrained_embeddings" \
    && curl -fOL http://nlp.stanford.edu/data/glove.6B.zip \
    && unzip -o glove.6B.zip
) || { echo "error: failed to prepare GloVe embeddings" >&2; exit 1; }
echo
|
|
|
# Download the 1-billion-word benchmark tarball and extract it inside
# ${TOPDIR}/raw_data/unlabeled_data.
echo "Preparing lm1b corpus"
# The work runs in a subshell so the directory change stays local, and the
# steps are chained with && so the download and extraction only happen once
# the cd into the target directory has succeeded.
#   curl -f : fail on HTTP errors instead of saving the error page as the tarball
(
  cd "${TOPDIR}/raw_data/unlabeled_data" \
    && curl -fOL http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz \
    && tar xzf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
) || { echo "error: failed to prepare lm1b corpus" >&2; exit 1; }
echo
|
|
|
# Download the CoNLL-2000 chunking train/test files and decompress them inside
# ${TOPDIR}/raw_data/chunk.
echo "Preparing chunking corpus"
# The work runs in a subshell so the directory change stays local, and the
# steps are chained with && so nothing runs unless the cd succeeded.
# Both files are fetched over https (the original mixed https and http for the
# same host). The archives are named explicitly instead of `gunzip *`, which
# would hit unrelated files if run in the wrong directory and trips over the
# already-decompressed files on a re-run.
#   curl -f  : fail on HTTP errors instead of saving the error page
#   gunzip -f: overwrite existing output without prompting on re-runs
(
  cd "${TOPDIR}/raw_data/chunk" \
    && curl -fOL https://www.clips.uantwerpen.be/conll2000/chunking/train.txt.gz \
    && curl -fOL https://www.clips.uantwerpen.be/conll2000/chunking/test.txt.gz \
    && gunzip -f train.txt.gz test.txt.gz
) || { echo "error: failed to prepare chunking corpus" >&2; exit 1; }
echo
|
|
|
# Final status line, printed once all fetch steps above have run.
printf '%s\n' "Done with data fetching!"
|
|
|
|
|