{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "54c0a618-750f-4bf0-8cdb-c2dda158c433",
   "metadata": {},
   "outputs": [],
   "source": [
    "import argparse\n",
    "import json\n",
    "import os\n",
    "import editdistance"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "658bb863-f147-444e-8b14-466e1999d15f",
   "metadata": {},
   "source": [
    "# Speech -> Text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7e4d5e19-e526-4b33-aa03-0a4cc68abd90",
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_WER(recognized_text_list, groundtruth_text_list):\n",
    "    word_num = 0.0\n",
    "    scores = 0.0\n",
    "    for recognized_text, groundtruth_text in zip(recognized_text_list, groundtruth_text_list):\n",
    "        if len(recognized_text) > 1000:\n",
    "            print(recognized_text)\n",
    "            continue\n",
    "        recognized_word_list = recognized_text.split()\n",
    "        groundtruth_word_list = groundtruth_text.split()\n",
    "        current_word_num = len(groundtruth_word_list)\n",
    "        word_num += current_word_num\n",
    "        # Compute Levenstein's distance\n",
    "        current_score = editdistance.eval(recognized_word_list, groundtruth_word_list)\n",
    "        scores += current_score\n",
    "    WER = scores / word_num\n",
    "    return WER, scores, word_num\n",
    "\n",
    "\n",
    "def evaluate_asr(prediction_list, ground_truth_list):\n",
    "    wer, scores_wer, word_num_wer = calculate_WER(prediction_list, ground_truth_list)\n",
    "    print(f'wer: {wer}, scores_wer: {scores_wer}, word_num_wer: {word_num_wer}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "05f4a95c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WER (demo): 0.4375  | word errors: 7.0  | total words: 16.0\n"
     ]
    }
   ],
   "source": [
    "\n",
    "gt_0 = \"Hello. We are AIDAS laboratory.\"\n",
    "gt_1 = \"Hello. Let's build an omni model diffusion foundation model.\"\n",
    "gt_2 = \"Pretty intense.\"\n",
    "\n",
    "pred_0 = \"hello, we are AIDAS laboratory.\"\n",
    "pred_1 = \"hello let's build an omni model diffusion foundation model\"\n",
    "pred_2 = \"pretty intense\"\n",
    "\n",
    "groundtruth_text_list = [gt_0, gt_1, gt_2]\n",
    "recognized_text_list = [pred_0, pred_1, pred_2]\n",
    "\n",
    "wer, errors, words = calculate_WER(recognized_text_list, groundtruth_text_list)\n",
    "print(f\"WER (demo): {wer:.4f}  | word errors: {errors}  | total words: {words}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3635f492-2ae2-4ef4-9321-36d08aa6645e",
   "metadata": {},
   "source": [
    "# Text -> Speech (with normalizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "1ac74c9a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Environment & deps check (safe to run multiple times)\n",
    "import sys, os, importlib\n",
    "from pathlib import Path\n",
    "\n",
    "\n",
    "# optional: ensure packages (comment out if you manage env separately)\n",
    "try:\n",
    "    import editdistance  # used by calculate_WER\n",
    "except Exception:\n",
    "    print(\"Installing editdistance...\")\n",
    "    %pip -q install editdistance\n",
    "\n",
    "try:\n",
    "    import more_itertools  # required by english.py normalizer\n",
    "except Exception:\n",
    "    print(\"Installing more-itertools...\")\n",
    "    %pip -q install more-itertools\n",
    "\n",
    "# local modules\n",
    "from whisper_asr.whisper_asr import load_whisper_model, EN_ASR_WER\n",
    "from whisper_asr.normalizers.english import EnglishTextNormalizer  # EMOVA-style normalizer\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4ffd26a0",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Device set to use cuda\n",
      "Using `chunk_length_s` is very experimental with seq2seq models. The results will not necessarily be entirely accurate and will have caveats. More information: https://github.com/huggingface/transformers/pull/20104. Ignore this warning with pipeline(..., ignore_warning=True). To use Whisper for long-form transcription, use rather the model's `generate` method directly as the model relies on it's own chunking mechanism (cf. Whisper original paper, section 3.8. Long-form Transcription).\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "whisper model loaded!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/1 [00:00<?, ?it/s]Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.\n",
      "Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.\n",
      "100%|██████████| 1/1 [00:02<00:00,  2.81s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "groundtruth text:Hello. We are AIDAS laboratory.\n",
      "recognized text: Hello, we are IDAS Laboratory.\n",
      "groundtruth text:Hello. Let's build an omni model diffusion foundation model.\n",
      "recognized text: Hello! Let's build an Omnimodal Diffusion Foundation model.\n",
      "groundtruth text:It's pretty intense.\n",
      "recognized text: It's pretty intense.\n",
      "Computation Time: 2.8128 s\n",
      "groundtruth:Hello. We are AIDAS laboratory.\n",
      "recognized: Hello, we are IDAS Laboratory.\n",
      "groundtruth:Hello. Let's build an omni model diffusion foundation model.\n",
      "recognized: Hello! Let's build an Omnimodal Diffusion Foundation model.\n",
      "groundtruth:It's pretty intense.\n",
      "recognized: It's pretty intense.\n",
      "Word count: 17\n",
      "Word error: 9\n",
      "utterance num:3\n",
      "WER without Whisper text normalization: 0.5294 \n",
      "normalized_groundtruth:hello we are aidas laboratory\n",
      "normalized_recognized:hello we are idas laboratory\n",
      "normalized_groundtruth:hello let us build an omni model diffusion foundation model\n",
      "normalized_recognized:hello let us build an omnimodal diffusion foundation model\n",
      "normalized_groundtruth:it is pretty intense\n",
      "normalized_recognized:it is pretty intense\n",
      "Word count: 19\n",
      "Word error: 3\n",
      "utterance num:3\n",
      "WER with Whisper text normalization: 0.1579 \n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# TTS → ASR with normalization (EMOVA EnglishTextNormalizer)\n",
    "import torch\n",
    "from pathlib import Path\n",
    "\n",
    "# inputs\n",
    "groundtruth_text_list = [\n",
    "    \"Hello. We are AIDAS laboratory.\",\n",
    "    \"Hello. Let's build an omni model diffusion foundation model.\",\n",
    "    \"It's pretty intense.\",\n",
    "]\n",
    "wav_file_list = [\n",
    "    \"./audio/AIDAS_team.wav\",\n",
    "    \"./audio/diffusion.wav\",\n",
    "    \"./audio/pretty_intense.wav\",\n",
    "]\n",
    "\n",
    "# Load Whisper large-v3\n",
    "model_id = \"openai/whisper-large-v3\"\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "pipe = load_whisper_model(model_id, device)\n",
    "\n",
    "# Run batch inference and print both raw and normalized WERs\n",
    "EN_ASR_WER(pipe, wav_file_list, groundtruth_text_list, batch_size=3, print_verbose=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd157230-07a5-4b05-a8c1-2f7a49475cdd",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "310f5f23-43c6-40e1-a20c-09cd1ce287ad",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "01b93914-2624-4c1a-b893-ed2cd3b944b7",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "diff",
   "language": "python",
   "name": "diff"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}