{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "54c0a618-750f-4bf0-8cdb-c2dda158c433",
"metadata": {},
"outputs": [],
"source": [
"import argparse\n",
"import json\n",
"import os\n",
"import editdistance"
]
},
{
"cell_type": "markdown",
"id": "658bb863-f147-444e-8b14-466e1999d15f",
"metadata": {},
"source": [
"# Speech -> Text"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "7e4d5e19-e526-4b33-aa03-0a4cc68abd90",
"metadata": {},
"outputs": [],
"source": [
"def calculate_WER(recognized_text_list, groundtruth_text_list):\n",
" word_num = 0.0\n",
" scores = 0.0\n",
" for recognized_text, groundtruth_text in zip(recognized_text_list, groundtruth_text_list):\n",
" if len(recognized_text) > 1000:\n",
" print(recognized_text)\n",
" continue\n",
" recognized_word_list = recognized_text.split()\n",
" groundtruth_word_list = groundtruth_text.split()\n",
" current_word_num = len(groundtruth_word_list)\n",
" word_num += current_word_num\n",
" # Compute Levenstein's distance\n",
" current_score = editdistance.eval(recognized_word_list, groundtruth_word_list)\n",
" scores += current_score\n",
" WER = scores / word_num\n",
" return WER, scores, word_num\n",
"\n",
"\n",
"def evaluate_asr(prediction_list, ground_truth_list):\n",
" wer, scores_wer, word_num_wer = calculate_WER(prediction_list, ground_truth_list)\n",
" print(f'wer: {wer}, scores_wer: {scores_wer}, word_num_wer: {word_num_wer}')"
]
},
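{
"cell_type": "markdown",
"id": "9a1f3c2e-5b6d-4e8f-a0c1-2d3e4f5a6b7c",
"metadata": {},
"source": [
"What `calculate_WER` computes, written out: the word-level Levenshtein distance between each hypothesis and its reference, summed over all utterances and divided by the total number of reference words,\n",
"\n",
"$$\\mathrm{WER} = \\frac{\\sum_u d_{\\mathrm{Lev}}(\\mathrm{hyp}_u, \\mathrm{ref}_u)}{\\sum_u |\\mathrm{ref}_u|} = \\frac{S + D + I}{N},$$\n",
"\n",
"where $S$, $D$, $I$ count word substitutions, deletions, and insertions, and $N$ is the total reference word count. Since insertions are unbounded, WER can exceed 1.0."
]
},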
{
"cell_type": "code",
"execution_count": 3,
"id": "05f4a95c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WER (demo): 0.4375 | word errors: 7.0 | total words: 16.0\n"
]
}
],
"source": [
"\n",
"gt_0 = \"Hello. We are AIDAS laboratory.\"\n",
"gt_1 = \"Hello. Let's build an omni model diffusion foundation model.\"\n",
"gt_2 = \"Pretty intense.\"\n",
"\n",
"pred_0 = \"hello, we are AIDAS laboratory.\"\n",
"pred_1 = \"hello let's build an omni model diffusion foundation model\"\n",
"pred_2 = \"pretty intense\"\n",
"\n",
"groundtruth_text_list = [gt_0, gt_1, gt_2]\n",
"recognized_text_list = [pred_0, pred_1, pred_2]\n",
"\n",
"wer, errors, words = calculate_WER(recognized_text_list, groundtruth_text_list)\n",
"print(f\"WER (demo): {wer:.4f} | word errors: {errors} | total words: {words}\")"
]
},
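{
"cell_type": "markdown",
"id": "b2c3d4e5-f6a7-4b8c-9d0e-1f2a3b4c5d6e",
"metadata": {},
"source": [
"All 7 demo errors above are casing/punctuation mismatches (`\"Hello.\"` vs. `\"hello,\"`, `\"model.\"` vs. `\"model\"`, ...): `str.split()` keeps punctuation attached to the neighboring word, so each mismatch counts as a full substitution. That is exactly the artifact the text normalizer in the next section removes before scoring."
]
},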
{
"cell_type": "markdown",
"id": "3635f492-2ae2-4ef4-9321-36d08aa6645e",
"metadata": {},
"source": [
"# Text -> Speech (with normalizer)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1ac74c9a",
"metadata": {},
"outputs": [],
"source": [
"# Environment & deps check (safe to run multiple times)\n",
"import sys, os, importlib\n",
"from pathlib import Path\n",
"\n",
"\n",
"# optional: ensure packages (comment out if you manage env separately)\n",
"try:\n",
" import editdistance # used by calculate_WER\n",
"except Exception:\n",
" print(\"Installing editdistance...\")\n",
" %pip -q install editdistance\n",
"\n",
"try:\n",
" import more_itertools # required by english.py normalizer\n",
"except Exception:\n",
" print(\"Installing more-itertools...\")\n",
" %pip -q install more-itertools\n",
"\n",
"# local modules\n",
"from whisper_asr.whisper_asr import load_whisper_model, EN_ASR_WER\n",
"from whisper_asr.normalizers.english import EnglishTextNormalizer # EMOVA-style normalizer\n"
]
},
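{
"cell_type": "markdown",
"id": "c3d4e5f6-a7b8-4c9d-8e1f-2a3b4c5d6e7f",
"metadata": {},
"source": [
"Quick sanity check of the normalizer on its own. This is a minimal sketch assuming the `EnglishTextNormalizer` import above succeeded and that, like Whisper's reference normalizer, an instance is called directly on a string; the expected lines are taken from the normalized output printed by `EN_ASR_WER` below."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4e5f6a7-b8c9-4d0e-a1f2-3b4c5d6e7f80",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical standalone demo: lowercasing, punctuation removal, and\n",
"# contraction expansion (\"Let's\" -> \"let us\", \"It's\" -> \"it is\").\n",
"normalizer = EnglishTextNormalizer()\n",
"for text in [\n",
"    \"Hello. Let's build an omni model diffusion foundation model.\",\n",
"    \"It's pretty intense.\",\n",
"]:\n",
"    print(normalizer(text))\n",
"# expected:\n",
"# hello let us build an omni model diffusion foundation model\n",
"# it is pretty intense"
]
},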
{
"cell_type": "code",
"execution_count": 5,
"id": "4ffd26a0",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Device set to use cuda\n",
"Using `chunk_length_s` is very experimental with seq2seq models. The results will not necessarily be entirely accurate and will have caveats. More information: https://github.com/huggingface/transformers/pull/20104. Ignore this warning with pipeline(..., ignore_warning=True). To use Whisper for long-form transcription, use rather the model's `generate` method directly as the model relies on it's own chunking mechanism (cf. Whisper original paper, section 3.8. Long-form Transcription).\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"whisper model loaded!\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/1 [00:00<?, ?it/s]Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.\n",
"Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.\n",
"100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:02<00:00, 2.81s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"groundtruth text:Hello. We are AIDAS laboratory.\n",
"recognized text: Hello, we are IDAS Laboratory.\n",
"groundtruth text:Hello. Let's build an omni model diffusion foundation model.\n",
"recognized text: Hello! Let's build an Omnimodal Diffusion Foundation model.\n",
"groundtruth text:It's pretty intense.\n",
"recognized text: It's pretty intense.\n",
"Computation Time: 2.8128 s\n",
"groundtruth:Hello. We are AIDAS laboratory.\n",
"recognized: Hello, we are IDAS Laboratory.\n",
"groundtruth:Hello. Let's build an omni model diffusion foundation model.\n",
"recognized: Hello! Let's build an Omnimodal Diffusion Foundation model.\n",
"groundtruth:It's pretty intense.\n",
"recognized: It's pretty intense.\n",
"Word count: 17\n",
"Word error: 9\n",
"utterance num:3\n",
"WER without Whisper text normalization: 0.5294 \n",
"normalized_groundtruth:hello we are aidas laboratory\n",
"normalized_recognized:hello we are idas laboratory\n",
"normalized_groundtruth:hello let us build an omni model diffusion foundation model\n",
"normalized_recognized:hello let us build an omnimodal diffusion foundation model\n",
"normalized_groundtruth:it is pretty intense\n",
"normalized_recognized:it is pretty intense\n",
"Word count: 19\n",
"Word error: 3\n",
"utterance num:3\n",
"WER with Whisper text normalization: 0.1579 \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# TTS β†’ ASR with normalization (EMOVA EnglishTextNormalizer)\n",
"import torch\n",
"from pathlib import Path\n",
"\n",
"# inputs\n",
"groundtruth_text_list = [\n",
" \"Hello. We are AIDAS laboratory.\",\n",
" \"Hello. Let's build an omni model diffusion foundation model.\",\n",
" \"It's pretty intense.\",\n",
"]\n",
"wav_file_list = [\n",
" \"./audio/AIDAS_team.wav\",\n",
" \"./audio/diffusion.wav\",\n",
" \"./audio/pretty_intense.wav\",\n",
"]\n",
"\n",
"# Load Whisper large-v3\n",
"model_id = \"openai/whisper-large-v3\"\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"pipe = load_whisper_model(model_id, device)\n",
"\n",
"# Run batch inference and print both raw and normalized WERs\n",
"EN_ASR_WER(pipe, wav_file_list, groundtruth_text_list, batch_size=3, print_verbose=True)"
]
}
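,
{
"cell_type": "markdown",
"id": "e5f6a7b8-c9d0-4e1f-b2a3-4c5d6e7f8091",
"metadata": {},
"source": [
"Cross-check: the normalized WER reported by `EN_ASR_WER` above can be reproduced with the plain `calculate_WER` from the first section. This is a sketch assuming the normalizer behaves as shown earlier; the recognized strings are copied verbatim from the transcription output above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6a7b8c9-d0e1-4f2a-83b4-c5d6e7f80912",
"metadata": {},
"outputs": [],
"source": [
"# Reproduce \"WER with Whisper text normalization\" by hand: normalize both\n",
"# sides, then score with the word-level edit distance from the first section.\n",
"recognized = [\n",
"    \"Hello, we are IDAS Laboratory.\",\n",
"    \"Hello! Let's build an Omnimodal Diffusion Foundation model.\",\n",
"    \"It's pretty intense.\",\n",
"]\n",
"normalizer = EnglishTextNormalizer()\n",
"wer, errors, words = calculate_WER(\n",
"    [normalizer(t) for t in recognized],\n",
"    [normalizer(t) for t in groundtruth_text_list],\n",
")\n",
"print(f\"normalized WER: {wer:.4f} ({errors:.0f} errors / {words:.0f} words)\")\n",
"# expected to match the EN_ASR_WER report above: 0.1579 (3 errors / 19 words)"
]
}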
],
"metadata": {
"kernelspec": {
"display_name": "diff",
"language": "python",
"name": "diff"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}