#!/bin/bash

# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Download the Spoken Wikipedia corpus for English.
## Note that the corpus is also available for some other languages.
## @InProceedings{KHN16.518,
##  author = {Arne K{\"o}hn and Florian Stegen and Timo Baumann},
##  title = {Mining the Spoken Wikipedia for Speech Data and Beyond},
##  booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)},
##  year = {2016},
##  month = {may},
##  date = {23-28},
##  location = {Portorož, Slovenia},
##  editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
##  publisher = {European Language Resources Association (ELRA)},
##  address = {Paris, France},
##  isbn = {978-2-9517408-9-1},
##  islrn = {684-927-624-257-3/},
##  language = {english}
## }

## Download the corpus archive into the current directory.
## wget treats every non-option argument as a URL, so no destination argument
## is given here; the file is saved under its remote name, en-with-audio.tar.
wget https://corpora.uni-hamburg.de/hzsk/de/islandora/object/file:swc-2.0_en-with-audio/datastream/TAR/en-with-audio.tar || {
  echo "ERROR: failed to download en-with-audio.tar" >&2
  exit 1
}

## Unpack the archive; nothing below can work without the extracted data.
tar -xvf en-with-audio.tar || {
  echo "ERROR: failed to extract en-with-audio.tar" >&2
  exit 1
}

##  We get a folder `english` with 1339 subfolders; each subfolder corresponds to a Wikipedia article. Example:
##  ├── Universal_suffrage
##  │   ├── aligned.swc
##  │   ├── audiometa.txt
##  │   ├── audio.ogg
##  │   ├── info.json
##  │   ├── wiki.html
##  │   ├── wiki.txt
##  │   └── wiki.xml

##  We will use two files: audio.ogg and wiki.txt

## Some folders have multiple .ogg files; this is handled by preprocess.py. Example:
##  |── Universe
##  │   ├── aligned.swc
##  │   ├── audio1.ogg
##  │   ├── audio2.ogg
##  │   ├── audio3.ogg
##  │   ├── audio4.ogg
##  │   ├── audiometa.txt
##  │   ├── info.json
##  │   ├── wiki.html
##  │   ├── wiki.txt
##  │   └── wiki.xml

## Some rare folders are incomplete, these will be skipped during preprocessing.

## Rename some folders with special symbols because they cause problems for ffmpeg when concatenating multiple .ogg files
## Flat list of (current path, new path) pairs; each pair is moved in order.
renamed_dirs=(
  "english/The_Hitchhiker%27s_Guide_to_the_Galaxy" "english/The_Hitchhikers_guide_to_the_Galaxy"
  "english/SummerSlam_(2003)" "english/SummerSlam_2003"
  "english/Over_the_Edge_(1999)" "english/Over_the_Edge_1999"
  "english/Lost_(TV_series)" "english/Lost_TV_series"
  "english/S._A._Andr%c3%a9e%27s_Arctic_Balloon_Expedition_of_1897" "english/S_A_Andres_Arctic_Balloon_Expedition_of_1897"
)
for ((idx = 0; idx < ${#renamed_dirs[@]}; idx += 2)); do
  mv "${renamed_dirs[idx]}" "${renamed_dirs[idx + 1]}"
done

## Path to the NeMo repository, e.g. /home/user/NeMo.
## Either export NEMO_PATH before running this script or put the path after
## the ":-" below.
NEMO_PATH="${NEMO_PATH:-}"
if [[ -z "${NEMO_PATH}" ]]; then
  ## Without this check every NeMo script below would be looked up under "/".
  echo "ERROR: NEMO_PATH is not set; point it to the NeMo repository, e.g. /home/user/NeMo" >&2
  exit 1
fi

INPUT_DIR="english"
OUTPUT_DIR="${INPUT_DIR}_result"

## Start from a clean state: remove results and prepared data of earlier runs.
## The ":?" guards abort instead of expanding to an empty path.
rm -rf -- "${OUTPUT_DIR:?}"
rm -rf -- "${INPUT_DIR:?}_prepared"
mkdir -p "${INPUT_DIR}_prepared/audio" "${INPUT_DIR}_prepared/text"

## Run NeMo's preprocess.py on the extracted corpus, writing into ${INPUT_DIR}_prepared.
python "${NEMO_PATH}/scripts/dataset_processing/spoken_wikipedia/preprocess.py" \
  --input_folder "${INPUT_DIR}" \
  --destination_folder "${INPUT_DIR}_prepared"

## Now we have ${INPUT_DIR}_prepared folder with the following structure:
##  ├── audio
##  |   ├── 1.ogg
##  |   ├── 2.ogg
##  |   ...
##  └── text
##      ├── 1.txt     
##      ├── 2.txt     
##      ...

MODEL_FOR_SEGMENTATION="QuartzNet15x5Base-En"
MODEL_FOR_RECOGNITION="stt_en_conformer_ctc_large"
## We set this threshold as very permissive; later we will use other metrics for filtering.
THRESHOLD=-10

## Run NeMo's CTC segmentation script on the prepared data.
## THRESHOLD is what gets passed as --MIN_SCORE; no MIN_SCORE variable is
## defined in this script.
"${NEMO_PATH}/tools/ctc_segmentation/run_segmentation.sh" \
  --SCRIPTS_DIR="${NEMO_PATH}/tools/ctc_segmentation/scripts" \
  --MODEL_NAME_OR_PATH="${MODEL_FOR_SEGMENTATION}" \
  --DATA_DIR="${INPUT_DIR}_prepared" \
  --OUTPUT_DIR="${OUTPUT_DIR}" \
  --MIN_SCORE="${THRESHOLD}"

# Thresholds and settings forwarded to run_filter.sh
CER_THRESHOLD=20
WER_THRESHOLD=30
CER_EDGE_THRESHOLD=30
LEN_DIFF_RATIO_THRESHOLD=0.15
EDGE_LEN=25
BATCH_SIZE=1

# Run NeMo's filtering script on the manifest produced by segmentation.
# All expansions are quoted so that paths containing whitespace or glob
# characters reach the script as single arguments.
"${NEMO_PATH}/tools/ctc_segmentation/run_filter.sh" \
  --SCRIPTS_DIR="${NEMO_PATH}/tools/ctc_segmentation/scripts" \
  --MODEL_NAME_OR_PATH="${MODEL_FOR_RECOGNITION}" \
  --BATCH_SIZE="${BATCH_SIZE}" \
  --MANIFEST="${OUTPUT_DIR}/manifests/manifest.json" \
  --INPUT_AUDIO_DIR="${INPUT_DIR}_prepared/audio/" \
  --EDGE_LEN="${EDGE_LEN}" \
  --CER_THRESHOLD="${CER_THRESHOLD}" \
  --WER_THRESHOLD="${WER_THRESHOLD}" \
  --CER_EDGE_THRESHOLD="${CER_EDGE_THRESHOLD}" \
  --LEN_DIFF_RATIO_THRESHOLD="${LEN_DIFF_RATIO_THRESHOLD}"

# Score the filtered manifest twice with speech_to_text_eval.py:
# first with use_cer=True, then with use_cer=False.
# NOTE(review): presumably these report character- and word-level error rates
# respectively — confirm against speech_to_text_eval.py.
for use_cer in True False; do
  python "${NEMO_PATH}/examples/asr/speech_to_text_eval.py" \
    dataset_manifest="${OUTPUT_DIR}/manifests/manifest_transcribed_metrics_filtered.json" \
    use_cer="${use_cer}" \
    only_score_manifest=True
done