Spaces:
Runtime error
Runtime error
Create create_corpora.sh
Browse files- server/create_corpora.sh +30 -0
server/create_corpora.sh
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
#
# Build a processed corpus for every (model, corpus) pair by invoking
# data_processing/create_corpus.py once per combination.
#
# WARNING: Do not call this as an absolute path.
# Every path below is relative to the current working directory (SCRIPT_DIR is
# "./"), not to the location of this file, so run the script from the directory
# that contains raw_data/ and data_processing/.

SCRIPT_DIR="./"
WOZ_NAME="woz"
WIKI_NAME="wiki"

# Lists are kept as arrays so iteration never depends on word splitting.
CORPORA=("$WOZ_NAME" "$WIKI_NAME")
# MODELS=(bert-base-cased gpt2 distilgpt2 roberta-base distilroberta-base distilbert-base-uncased)
MODELS=(gpt2 distilgpt2)

# Drop the trailing slash from SCRIPT_DIR before joining, so the resulting
# paths read "./corpora" rather than "././/corpora".
BASE_DIR="${SCRIPT_DIR%/}"
OUT_DIR="$BASE_DIR/corpora"
RAW_TEXT_DIR="$BASE_DIR/raw_data"
PYTHON_SCRIPT="$BASE_DIR/data_processing/create_corpus.py"

# Download the raw corpus text files (URLs hardcoded for now).
# Currently disabled: the .txt files are expected to already be in RAW_TEXT_DIR.
# mkdir -p "$RAW_TEXT_DIR"
# WOZURL="https://ibm.box.com/shared/static/uchx6xdvb1ghhrv3ztxk9dvyvfxy31ce.txt"
# WIKIURL="https://ibm.box.com/shared/static/3rfbn3v3h6wpjalwob1pl0geppzx9746.txt"
#
# wget -O "$RAW_TEXT_DIR/$WOZ_NAME.txt" -L "$WOZURL"
# wget -O "$RAW_TEXT_DIR/$WIKI_NAME.txt" -L "$WIKIURL"

# Create the corpus: one create_corpus.py run per model/corpus combination.
mkdir -p "$OUT_DIR"
for model in "${MODELS[@]}"; do
  for corpus in "${CORPORA[@]}"; do
    CORPUSFILE="$RAW_TEXT_DIR/$corpus.txt"
    # Arguments: input text file (-f), output directory (-o), model name (-m),
    # corpus name (-n). Every expansion is quoted so a value containing spaces
    # or glob characters is passed through as a single argument.
    # NOTE(review): --force presumably overwrites existing output; confirm in
    # create_corpus.py.
    python "$PYTHON_SCRIPT" -f "$CORPUSFILE" -o "$OUT_DIR" -m "$model" -n "$corpus" --force
  done
done
|