Spaces:
Sleeping
Sleeping
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "54c0a618-750f-4bf0-8cdb-c2dda158c433", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import argparse\n", | |
| "import json\n", | |
| "import os\n", | |
| "import editdistance" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "658bb863-f147-444e-8b14-466e1999d15f", | |
| "metadata": {}, | |
| "source": [ | |
| "# Speech -> Text" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "7e4d5e19-e526-4b33-aa03-0a4cc68abd90", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
def calculate_WER(recognized_text_list, groundtruth_text_list, max_chars=1000):
    """Compute the corpus-level word error rate (WER) over paired transcripts.

    Every hypothesis/reference pair is split on whitespace and compared with a
    word-level Levenshtein distance. The distances are summed over all pairs
    and divided by the total number of reference words.

    Args:
        recognized_text_list: Iterable of hypothesis (recognized) strings.
        groundtruth_text_list: Iterable of reference strings, paired with the
            hypotheses by position. Pairs are formed with ``zip``, so surplus
            items in the longer input are ignored.
        max_chars: A hypothesis longer than this many characters is printed
            and skipped; neither its errors nor its reference words are
            counted. Defaults to 1000, the previously hard-coded threshold.

    Returns:
        A ``(WER, scores, word_num)`` tuple: the error rate, the summed edit
        distance, and the number of reference words that were scored. When no
        reference words were scored, ``WER`` is ``0.0`` if there were no
        errors either and ``float("inf")`` otherwise, instead of raising
        ``ZeroDivisionError``.
    """
    word_num = 0.0
    scores = 0.0
    for recognized_text, groundtruth_text in zip(recognized_text_list, groundtruth_text_list):
        if len(recognized_text) > max_chars:
            # Over-long hypothesis: show it and leave it out of the statistics.
            print(recognized_text)
            continue
        recognized_word_list = recognized_text.split()
        groundtruth_word_list = groundtruth_text.split()
        word_num += len(groundtruth_word_list)
        # Word-level Levenshtein distance between hypothesis and reference.
        scores += editdistance.eval(recognized_word_list, groundtruth_word_list)

    if word_num:
        WER = scores / word_num
    elif scores:
        # Errors against an empty reference: the rate is unbounded.
        WER = float("inf")
    else:
        # Nothing was scored (empty input or every pair skipped).
        WER = 0.0
    return WER, scores, word_num
| "\n", | |
| "\n", | |
def evaluate_asr(prediction_list, ground_truth_list):
    """Print the word error rate of ASR predictions against their references.

    Delegates the scoring to ``calculate_WER`` and writes the error rate, the
    summed word-level edit distance and the reference word count to stdout on
    a single line. Nothing is returned.
    """
    wer_stats = calculate_WER(prediction_list, ground_truth_list)
    wer, scores_wer, word_num_wer = wer_stats
    print(f'wer: {wer}, scores_wer: {scores_wer}, word_num_wer: {word_num_wer}')
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "05f4a95c", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "WER (demo): 0.4375 | word errors: 7.0 | total words: 16.0\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "\n", | |
# Demo: score hand-written hypotheses against their references.
# Each reference (gt_*) is defined next to the hypothesis (pred_*) it is paired with.
gt_0 = "Hello. We are AIDAS laboratory."
pred_0 = "hello, we are AIDAS laboratory."

gt_1 = "Hello. Let's build an omni model diffusion foundation model."
pred_1 = "hello let's build an omni model diffusion foundation model"

gt_2 = "Pretty intense."
pred_2 = "pretty intense"

# Position i of one list is scored against position i of the other.
recognized_text_list = [pred_0, pred_1, pred_2]
groundtruth_text_list = [gt_0, gt_1, gt_2]

# No text normalization is applied here, so case and punctuation differences
# count as word errors.
wer, errors, words = calculate_WER(
    recognized_text_list,
    groundtruth_text_list,
)
print(f"WER (demo): {wer:.4f} | word errors: {errors} | total words: {words}")
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "3635f492-2ae2-4ef4-9321-36d08aa6645e", | |
| "metadata": {}, | |
| "source": [ | |
| "# Text -> Speech (with normalizer)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "1ac74c9a", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Environment & deps check (safe to run multiple times)\n", | |
| "import sys, os, importlib\n", | |
| "from pathlib import Path\n", | |
| "\n", | |
| "\n", | |
| "# optional: ensure packages (comment out if you manage env separately)\n", | |
| "try:\n", | |
| " import editdistance # used by calculate_WER\n", | |
| "except Exception:\n", | |
| " print(\"Installing editdistance...\")\n", | |
| " %pip -q install editdistance\n", | |
| "\n", | |
| "try:\n", | |
| " import more_itertools # required by english.py normalizer\n", | |
| "except Exception:\n", | |
| " print(\"Installing more-itertools...\")\n", | |
| " %pip -q install more-itertools\n", | |
| "\n", | |
| "# local modules\n", | |
| "from whisper_asr.whisper_asr import load_whisper_model, EN_ASR_WER\n", | |
| "from whisper_asr.normalizers.english import EnglishTextNormalizer # EMOVA-style normalizer\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "4ffd26a0", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "Device set to use cuda\n", | |
| "Using `chunk_length_s` is very experimental with seq2seq models. The results will not necessarily be entirely accurate and will have caveats. More information: https://github.com/huggingface/transformers/pull/20104. Ignore this warning with pipeline(..., ignore_warning=True). To use Whisper for long-form transcription, use rather the model's `generate` method directly as the model relies on it's own chunking mechanism (cf. Whisper original paper, section 3.8. Long-form Transcription).\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "whisper model loaded!\n" | |
| ] | |
| }, | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| " 0%| | 0/1 [00:00<?, ?it/s]Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.\n", | |
| "Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.\n", | |
| "100%|ββββββββββ| 1/1 [00:02<00:00, 2.81s/it]" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "groundtruth text:Hello. We are AIDAS laboratory.\n", | |
| "recognized text: Hello, we are IDAS Laboratory.\n", | |
| "groundtruth text:Hello. Let's build an omni model diffusion foundation model.\n", | |
| "recognized text: Hello! Let's build an Omnimodal Diffusion Foundation model.\n", | |
| "groundtruth text:It's pretty intense.\n", | |
| "recognized text: It's pretty intense.\n", | |
| "Computation Time: 2.8128 s\n", | |
| "groundtruth:Hello. We are AIDAS laboratory.\n", | |
| "recognized: Hello, we are IDAS Laboratory.\n", | |
| "groundtruth:Hello. Let's build an omni model diffusion foundation model.\n", | |
| "recognized: Hello! Let's build an Omnimodal Diffusion Foundation model.\n", | |
| "groundtruth:It's pretty intense.\n", | |
| "recognized: It's pretty intense.\n", | |
| "Word count: 17\n", | |
| "Word error: 9\n", | |
| "utterance num:3\n", | |
| "WER without Whisper text normalization: 0.5294 \n", | |
| "normalized_groundtruth:hello we are aidas laboratory\n", | |
| "normalized_recognized:hello we are idas laboratory\n", | |
| "normalized_groundtruth:hello let us build an omni model diffusion foundation model\n", | |
| "normalized_recognized:hello let us build an omnimodal diffusion foundation model\n", | |
| "normalized_groundtruth:it is pretty intense\n", | |
| "normalized_recognized:it is pretty intense\n", | |
| "Word count: 19\n", | |
| "Word error: 3\n", | |
| "utterance num:3\n", | |
| "WER with Whisper text normalization: 0.1579 \n" | |
| ] | |
| }, | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
# TTS → ASR with normalization (EMOVA EnglishTextNormalizer)
import torch
from pathlib import Path

# inputs: the reference transcript at position i belongs to the wav file at position i
groundtruth_text_list = [
    "Hello. We are AIDAS laboratory.",
    "Hello. Let's build an omni model diffusion foundation model.",
    "It's pretty intense.",
]
wav_file_list = [
    "./audio/AIDAS_team.wav",
    "./audio/diffusion.wav",
    "./audio/pretty_intense.wav",
]

# Fail fast on misaligned or missing inputs, before the model is loaded.
assert len(wav_file_list) == len(groundtruth_text_list), (
    "each wav file needs exactly one ground-truth transcript"
)
missing_wav_files = [wav for wav in wav_file_list if not Path(wav).is_file()]
assert not missing_wav_files, f"missing audio files: {missing_wav_files}"

# Load Whisper large-v3
model_id = "openai/whisper-large-v3"
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = load_whisper_model(model_id, device)

# Run batch inference and print both raw and normalized WERs
EN_ASR_WER(pipe, wav_file_list, groundtruth_text_list, batch_size=3, print_verbose=True)
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "dd157230-07a5-4b05-a8c1-2f7a49475cdd", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "310f5f23-43c6-40e1-a20c-09cd1ce287ad", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "01b93914-2624-4c1a-b893-ed2cd3b944b7", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "diff", | |
| "language": "python", | |
| "name": "diff" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.11.11" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } | |