Student0809
/

interactSpeech

Model card Files Files and versions

interactSpeech / docs /transformers /examples /legacy /token-classification /run.sh

Student0809's picture

Add files using upload-large-folder tool

0e9a03e verified 9 months ago

1.5 kB

	## The relevant files are currently on a shared Google
	## drive at https://drive.google.com/drive/folders/1kC0I2UGl2ltrluI9NqDjaQJGw5iliw_J
	## Monitor for changes and eventually migrate to use the `datasets` library
	#
	# Each pipeline below downloads one raw split, drops lines that start with
	# "#", keeps tab-separated columns 2 and 3, and rewrites the separating tab
	# as a single space. `-f` makes curl emit nothing on an HTTP error instead of
	# streaming the server's error page into the data file.
	curl -fL 'https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P' \
		| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
	curl -fL 'https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm' \
		| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
	curl -fL 'https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH' \
		| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp

	# A failed download leaves an empty file behind (the redirection always
	# creates it), so stop here rather than feeding empty data to later steps.
	for raw_split in train.txt.tmp dev.txt.tmp test.txt.tmp; do
		if [ ! -s "$raw_split" ]; then
			printf 'error: %s is empty; download or filtering failed\n' "$raw_split" >&2
			exit 1
		fi
	done

	# Maximum sequence length and model name; both are passed to
	# scripts/preprocess.py here and are exported so later commands can read them.
	export MAX_LENGTH=128
	export BERT_MODEL=bert-base-multilingual-cased

	# Run the preprocessing script over each raw split; its stdout becomes the
	# final split file. Expansions are quoted so the values are always passed as
	# exactly one argument each.
	python3 scripts/preprocess.py train.txt.tmp "$BERT_MODEL" "$MAX_LENGTH" > train.txt
	python3 scripts/preprocess.py dev.txt.tmp "$BERT_MODEL" "$MAX_LENGTH" > dev.txt
	python3 scripts/preprocess.py test.txt.tmp "$BERT_MODEL" "$MAX_LENGTH" > test.txt

	# Build labels.txt: take the second space-separated column of every line in
	# all three splits, drop empty lines, and keep each distinct value once
	# (sorted). `sort -u` is equivalent to `sort | uniq`.
	cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$" | sort -u > labels.txt
	# Settings for the run_ner.py invocation below, exported so child processes
	# can also see them.
	export OUTPUT_DIR=germeval-model
	export BATCH_SIZE=32
	export NUM_EPOCHS=3
	export SAVE_STEPS=750
	export SEED=1

	# Run run_ner.py with training, evaluation and prediction all enabled, using
	# the current directory as the data directory and ./labels.txt as the label
	# list. BERT_MODEL and MAX_LENGTH are expected to be set earlier in this
	# script. Every expansion is quoted so each flag receives exactly one value.
	python3 run_ner.py \
		--task_type NER \
		--data_dir . \
		--labels ./labels.txt \
		--model_name_or_path "$BERT_MODEL" \
		--output_dir "$OUTPUT_DIR" \
		--max_seq_length "$MAX_LENGTH" \
		--num_train_epochs "$NUM_EPOCHS" \
		--per_gpu_train_batch_size "$BATCH_SIZE" \
		--save_steps "$SAVE_STEPS" \
		--seed "$SEED" \
		--do_train \
		--do_eval \
		--do_predict