{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "54c0a618-750f-4bf0-8cdb-c2dda158c433",
"metadata": {},
"outputs": [],
"source": [
"import argparse\n",
"import json\n",
"import os\n",
"import editdistance"
]
},
{
"cell_type": "markdown",
"id": "658bb863-f147-444e-8b14-466e1999d15f",
"metadata": {},
"source": [
"# Speech -> Text"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "7e4d5e19-e526-4b33-aa03-0a4cc68abd90",
"metadata": {},
"outputs": [],
"source": [
"def calculate_WER(recognized_text_list, groundtruth_text_list):\n",
" word_num = 0.0\n",
" scores = 0.0\n",
" for recognized_text, groundtruth_text in zip(recognized_text_list, groundtruth_text_list):\n",
" if len(recognized_text) > 1000:\n",
" print(recognized_text)\n",
" continue\n",
" recognized_word_list = recognized_text.split()\n",
" groundtruth_word_list = groundtruth_text.split()\n",
" current_word_num = len(groundtruth_word_list)\n",
" word_num += current_word_num\n",
" # Compute Levenstein's distance\n",
" current_score = editdistance.eval(recognized_word_list, groundtruth_word_list)\n",
" scores += current_score\n",
" WER = scores / word_num\n",
" return WER, scores, word_num\n",
"\n",
"\n",
"def evaluate_asr(prediction_list, ground_truth_list):\n",
" wer, scores_wer, word_num_wer = calculate_WER(prediction_list, ground_truth_list)\n",
" print(f'wer: {wer}, scores_wer: {scores_wer}, word_num_wer: {word_num_wer}')"
]
},
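{
"cell_type": "markdown",
"id": "9a1f3c2e-5b6d-4e8f-a0c1-2d3e4f5a6b7c",
"metadata": {},
"source": [
"What `calculate_WER` computes, written out: the word-level Levenshtein distance between each hypothesis and its reference, summed over all utterances and divided by the total number of reference words,\n",
"\n",
"$$\\mathrm{WER} = \\frac{\\sum_u d_{\\mathrm{Lev}}(\\mathrm{hyp}_u, \\mathrm{ref}_u)}{\\sum_u |\\mathrm{ref}_u|} = \\frac{S + D + I}{N},$$\n",
"\n",
"where $S$, $D$, $I$ count word substitutions, deletions, and insertions, and $N$ is the total reference word count. Since insertions are unbounded, WER can exceed 1.0."
]
},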
{
"cell_type": "code",
"execution_count": 3,
"id": "05f4a95c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WER (demo): 0.4375 | word errors: 7.0 | total words: 16.0\n"
]
}
],
"source": [
"\n",
"gt_0 = \"Hello. We are AIDAS laboratory.\"\n",
"gt_1 = \"Hello. Let's build an omni model diffusion foundation model.\"\n",
"gt_2 = \"Pretty intense.\"\n",
"\n",
"pred_0 = \"hello, we are AIDAS laboratory.\"\n",
"pred_1 = \"hello let's build an omni model diffusion foundation model\"\n",
"pred_2 = \"pretty intense\"\n",
"\n",
"groundtruth_text_list = [gt_0, gt_1, gt_2]\n",
"recognized_text_list = [pred_0, pred_1, pred_2]\n",
"\n",
"wer, errors, words = calculate_WER(recognized_text_list, groundtruth_text_list)\n",
"print(f\"WER (demo): {wer:.4f} | word errors: {errors} | total words: {words}\")"
]
},
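{
"cell_type": "markdown",
"id": "b2c3d4e5-f6a7-4b8c-9d0e-1f2a3b4c5d6e",
"metadata": {},
"source": [
"All 7 demo errors above are casing/punctuation mismatches (`\"Hello.\"` vs. `\"hello,\"`, `\"model.\"` vs. `\"model\"`, ...): `str.split()` keeps punctuation attached to the neighboring word, so each mismatch counts as a full substitution. That is exactly the artifact the text normalizer in the next section removes before scoring."
]
},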
{
"cell_type": "markdown",
"id": "3635f492-2ae2-4ef4-9321-36d08aa6645e",
"metadata": {},
"source": [
"# Text -> Speech (with normalizer)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1ac74c9a",
"metadata": {},
"outputs": [],
"source": [
"# Environment & deps check (safe to run multiple times)\n",
"import sys, os, importlib\n",
"from pathlib import Path\n",
"\n",
"\n",
"# optional: ensure packages (comment out if you manage env separately)\n",
"try:\n",
" import editdistance # used by calculate_WER\n",
"except Exception:\n",
" print(\"Installing editdistance...\")\n",
" %pip -q install editdistance\n",
"\n",
"try:\n",
" import more_itertools # required by english.py normalizer\n",
"except Exception:\n",
" print(\"Installing more-itertools...\")\n",
" %pip -q install more-itertools\n",
"\n",
"# local modules\n",
"from whisper_asr.whisper_asr import load_whisper_model, EN_ASR_WER\n",
"from whisper_asr.normalizers.english import EnglishTextNormalizer # EMOVA-style normalizer\n"
]
},
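{
"cell_type": "markdown",
"id": "c3d4e5f6-a7b8-4c9d-8e1f-2a3b4c5d6e7f",
"metadata": {},
"source": [
"Quick sanity check of the normalizer on its own. This is a minimal sketch assuming the `EnglishTextNormalizer` import above succeeded and that, like Whisper's reference normalizer, an instance is called directly on a string; the expected lines are taken from the normalized output printed by `EN_ASR_WER` below."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4e5f6a7-b8c9-4d0e-a1f2-3b4c5d6e7f80",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical standalone demo: lowercasing, punctuation removal, and\n",
"# contraction expansion (\"Let's\" -> \"let us\", \"It's\" -> \"it is\").\n",
"normalizer = EnglishTextNormalizer()\n",
"for text in [\n",
"    \"Hello. Let's build an omni model diffusion foundation model.\",\n",
"    \"It's pretty intense.\",\n",
"]:\n",
"    print(normalizer(text))\n",
"# expected:\n",
"# hello let us build an omni model diffusion foundation model\n",
"# it is pretty intense"
]
},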
{
"cell_type": "code",
"execution_count": 5,
"id": "4ffd26a0",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Device set to use cuda\n",
"Using `chunk_length_s` is very experimental with seq2seq models. The results will not necessarily be entirely accurate and will have caveats. More information: https://github.com/huggingface/transformers/pull/20104. Ignore this warning with pipeline(..., ignore_warning=True). To use Whisper for long-form transcription, use rather the model's `generate` method directly as the model relies on it's own chunking mechanism (cf. Whisper original paper, section 3.8. Long-form Transcription).\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"whisper model loaded!\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/1 [00:00<?, ?it/s]Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.\n",
"Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.\n",
"100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:02<00:00, 2.81s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"groundtruth text:Hello. We are AIDAS laboratory.\n",
"recognized text: Hello, we are IDAS Laboratory.\n",
"groundtruth text:Hello. Let's build an omni model diffusion foundation model.\n",
"recognized text: Hello! Let's build an Omnimodal Diffusion Foundation model.\n",
"groundtruth text:It's pretty intense.\n",
"recognized text: It's pretty intense.\n",
"Computation Time: 2.8128 s\n",
"groundtruth:Hello. We are AIDAS laboratory.\n",
"recognized: Hello, we are IDAS Laboratory.\n",
"groundtruth:Hello. Let's build an omni model diffusion foundation model.\n",
"recognized: Hello! Let's build an Omnimodal Diffusion Foundation model.\n",
"groundtruth:It's pretty intense.\n",
"recognized: It's pretty intense.\n",
"Word count: 17\n",
"Word error: 9\n",
"utterance num:3\n",
"WER without Whisper text normalization: 0.5294 \n",
"normalized_groundtruth:hello we are aidas laboratory\n",
"normalized_recognized:hello we are idas laboratory\n",
"normalized_groundtruth:hello let us build an omni model diffusion foundation model\n",
"normalized_recognized:hello let us build an omnimodal diffusion foundation model\n",
"normalized_groundtruth:it is pretty intense\n",
"normalized_recognized:it is pretty intense\n",
"Word count: 19\n",
"Word error: 3\n",
"utterance num:3\n",
"WER with Whisper text normalization: 0.1579 \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# TTS β†’ ASR with normalization (EMOVA EnglishTextNormalizer)\n",
"import torch\n",
"from pathlib import Path\n",
"\n",
"# inputs\n",
"groundtruth_text_list = [\n",
" \"Hello. We are AIDAS laboratory.\",\n",
" \"Hello. Let's build an omni model diffusion foundation model.\",\n",
" \"It's pretty intense.\",\n",
"]\n",
"wav_file_list = [\n",
" \"./audio/AIDAS_team.wav\",\n",
" \"./audio/diffusion.wav\",\n",
" \"./audio/pretty_intense.wav\",\n",
"]\n",
"\n",
"# Load Whisper large-v3\n",
"model_id = \"openai/whisper-large-v3\"\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"pipe = load_whisper_model(model_id, device)\n",
"\n",
"# Run batch inference and print both raw and normalized WERs\n",
"EN_ASR_WER(pipe, wav_file_list, groundtruth_text_list, batch_size=3, print_verbose=True)"
]
}
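,
{
"cell_type": "markdown",
"id": "e5f6a7b8-c9d0-4e1f-b2a3-4c5d6e7f8091",
"metadata": {},
"source": [
"Cross-check: the normalized WER reported by `EN_ASR_WER` above can be reproduced with the plain `calculate_WER` from the first section. This is a sketch assuming the normalizer behaves as shown earlier; the recognized strings are copied verbatim from the transcription output above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6a7b8c9-d0e1-4f2a-83b4-c5d6e7f80912",
"metadata": {},
"outputs": [],
"source": [
"# Reproduce \"WER with Whisper text normalization\" by hand: normalize both\n",
"# sides, then score with the word-level edit distance from the first section.\n",
"recognized = [\n",
"    \"Hello, we are IDAS Laboratory.\",\n",
"    \"Hello! Let's build an Omnimodal Diffusion Foundation model.\",\n",
"    \"It's pretty intense.\",\n",
"]\n",
"normalizer = EnglishTextNormalizer()\n",
"wer, errors, words = calculate_WER(\n",
"    [normalizer(t) for t in recognized],\n",
"    [normalizer(t) for t in groundtruth_text_list],\n",
")\n",
"print(f\"normalized WER: {wer:.4f} ({errors:.0f} errors / {words:.0f} words)\")\n",
"# expected to match the EN_ASR_WER report above: 0.1579 (3 errors / 19 words)"
]
}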
],
"metadata": {
"kernelspec": {
"display_name": "diff",
"language": "python",
"name": "diff"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}