{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "54c0a618-750f-4bf0-8cdb-c2dda158c433", "metadata": {}, "outputs": [], "source": [ "import argparse\n", "import json\n", "import os\n", "import editdistance" ] }, { "cell_type": "markdown", "id": "658bb863-f147-444e-8b14-466e1999d15f", "metadata": {}, "source": [ "# Speech -> Text" ] }, { "cell_type": "code", "execution_count": 2, "id": "7e4d5e19-e526-4b33-aa03-0a4cc68abd90", "metadata": {}, "outputs": [], "source": [ "def calculate_WER(recognized_text_list, groundtruth_text_list):\n", " word_num = 0.0\n", " scores = 0.0\n", " for recognized_text, groundtruth_text in zip(recognized_text_list, groundtruth_text_list):\n", " if len(recognized_text) > 1000:\n", " print(recognized_text)\n", " continue\n", " recognized_word_list = recognized_text.split()\n", " groundtruth_word_list = groundtruth_text.split()\n", " current_word_num = len(groundtruth_word_list)\n", " word_num += current_word_num\n", " # Compute Levenstein's distance\n", " current_score = editdistance.eval(recognized_word_list, groundtruth_word_list)\n", " scores += current_score\n", " WER = scores / word_num\n", " return WER, scores, word_num\n", "\n", "\n", "def evaluate_asr(prediction_list, ground_truth_list):\n", " wer, scores_wer, word_num_wer = calculate_WER(prediction_list, ground_truth_list)\n", " print(f'wer: {wer}, scores_wer: {scores_wer}, word_num_wer: {word_num_wer}')" ] }, { "cell_type": "code", "execution_count": 3, "id": "05f4a95c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WER (demo): 0.4375 | word errors: 7.0 | total words: 16.0\n" ] } ], "source": [ "\n", "gt_0 = \"Hello. We are AIDAS laboratory.\"\n", "gt_1 = \"Hello. Let's build an omni model diffusion foundation model.\"\n", "gt_2 = \"Pretty intense.\"\n", "\n", "pred_0 = \"hello, we are AIDAS laboratory.\"\n", "pred_1 = \"hello let's build an omni model diffusion foundation model\"\n", "pred_2 = \"pretty intense\"\n", "\n", "groundtruth_text_list = [gt_0, gt_1, gt_2]\n", "recognized_text_list = [pred_0, pred_1, pred_2]\n", "\n", "wer, errors, words = calculate_WER(recognized_text_list, groundtruth_text_list)\n", "print(f\"WER (demo): {wer:.4f} | word errors: {errors} | total words: {words}\")" ] }, { "cell_type": "markdown", "id": "3635f492-2ae2-4ef4-9321-36d08aa6645e", "metadata": {}, "source": [ "# Text -> Speech (with normalizer)" ] }, { "cell_type": "code", "execution_count": 4, "id": "1ac74c9a", "metadata": {}, "outputs": [], "source": [ "# Environment & deps check (safe to run multiple times)\n", "import sys, os, importlib\n", "from pathlib import Path\n", "\n", "\n", "# optional: ensure packages (comment out if you manage env separately)\n", "try:\n", " import editdistance # used by calculate_WER\n", "except Exception:\n", " print(\"Installing editdistance...\")\n", " %pip -q install editdistance\n", "\n", "try:\n", " import more_itertools # required by english.py normalizer\n", "except Exception:\n", " print(\"Installing more-itertools...\")\n", " %pip -q install more-itertools\n", "\n", "# local modules\n", "from whisper_asr.whisper_asr import load_whisper_model, EN_ASR_WER\n", "from whisper_asr.normalizers.english import EnglishTextNormalizer # EMOVA-style normalizer\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "4ffd26a0", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Device set to use cuda\n", "Using `chunk_length_s` is very experimental with seq2seq models. The results will not necessarily be entirely accurate and will have caveats. More information: https://github.com/huggingface/transformers/pull/20104. Ignore this warning with pipeline(..., ignore_warning=True). To use Whisper for long-form transcription, use rather the model's `generate` method directly as the model relies on it's own chunking mechanism (cf. Whisper original paper, section 3.8. Long-form Transcription).\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "whisper model loaded!\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%| | 0/1 [00:00