{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# import libraries\n", "\n", "import glob\n", "import json\n", "import librosa\n", "import numpy as np\n", "from omegaconf import OmegaConf, open_dict\n", "import os\n", "import soundfile as sf\n", "import subprocess\n", "import tarfile\n", "import tqdm\n", "import wget\n", "\n", "import torch\n", "import pandas as pd\n", "\n", "df = pd.read_csv(\"/home/ubuntu/respair/Tsukasa_LITE_Qanary.csv\")\n", "\n", "import re\n", "\n", "\n", "\n", "# Replace any sequence containing \"HAHA\" with <|🤣|>\n", "df['text'] = df['text'].apply(lambda x: re.sub(r'\\S*HAHA\\S*', '<|🤣|>', x))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "notebookRunGroups": { "groupValue": "1" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Files with .wav: 461086\n", "Files with .ogg: 27643\n" ] } ], "source": [ "import os\n", "\n", "def check_and_fix_extension(filepath):\n", " \"\"\"Check if file exists, if not try .ogg extension\"\"\"\n", " if os.path.exists(filepath):\n", " return filepath\n", " \n", " if filepath.endswith('.wav'):\n", " ogg_path = filepath.replace('.wav', '.ogg')\n", " if os.path.exists(ogg_path):\n", " return ogg_path\n", " \n", " return filepath # Return original if neither exists\n", "\n", "# Apply the fix to all filenames\n", "df['filename'] = df['filename'].apply(check_and_fix_extension)\n", "\n", "# Show summary\n", "wav_count = df['filename'].str.endswith('.wav').sum()\n", "ogg_count = df['filename'].str.endswith('.ogg').sum()\n", "print(f\"Files with .wav: {wav_count}\")\n", "print(f\"Files with .ogg: {ogg_count}\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "notebookRunGroups": { "groupValue": "1" } }, "outputs": [ { "data": { "text/plain": [ "(27643, 4)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['filename'].str.contains(\".ogg\")].shape" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "df[df['text'].str.contains(\"🤣\")]\n", "df.to_csv(\"/home/ubuntu/respair/Tsukasa_LITE_Qanary.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Duration column found in CSV. Using provided durations.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Found 488729 entries in the CSV file.\n", "Processing entries with provided durations...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 488729/488729 [00:04<00:00, 102634.90it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Processing complete!\n", "Successfully processed: 488729/488729 entries\n", "Total duration: 766.35 hours\n", "Manifest created at: /home/ubuntu/NeMo/data/tsukasa_manifest.json\n" ] } ], "source": [ "import os\n", "import json\n", "import csv\n", "import librosa\n", "import numpy as np\n", "from tqdm import tqdm\n", "from multiprocessing import Pool, cpu_count\n", "from functools import partial\n", "\n", "\n", "def get_audio_duration(audio_path):\n", " \"\"\"Get duration of an audio file.\"\"\"\n", " try:\n", " duration = librosa.core.get_duration(path=audio_path)\n", " return duration\n", " except Exception as e:\n", " print(f\"Error processing {audio_path}: {e}\")\n", " return None\n", "\n", "\n", "def process_row_with_duration(row, lang=\"jp\"):\n", " \"\"\"Process a row that already has duration information.\"\"\"\n", " metadata = {\n", " \"audio_filepath\": row['filename'],\n", " \"duration\": float(row['duration']),\n", " \"text\": row['text'],\n", " \"lang\": lang,\n", " \"target_lang\": lang,\n", " \"source_lang\": lang,\n", " \"pnc\": \"False\"\n", " }\n", " return metadata\n", "\n", "\n", "def process_row_without_duration(row, lang=\"jp\"):\n", " \"\"\"Process a row and calculate duration.\"\"\"\n", " audio_path = row['filename']\n", " duration = get_audio_duration(audio_path)\n", " \n", " if duration is None:\n", " return None\n", " \n", " metadata = {\n", " \"audio_filepath\": audio_path,\n", " \"duration\": duration,\n", " \"text\": row['text'],\n", " \"lang\": lang,\n", " \"target_lang\": lang,\n", " \"source_lang\": lang,\n", " \"pnc\": \"False\"\n", " }\n", " return metadata\n", "\n", "\n", "def build_manifest_from_csv(csv_path, manifest_path, lang=\"jp\", n_jobs=None):\n", " \"\"\"\n", " Build a manifest file from a CSV dataset.\n", " \n", " Args:\n", " csv_path: Path to the CSV file containing filename and text columns\n", " manifest_path: Path where the manifest JSON file will be saved\n", " lang: Language code (default: \"jp\" for Japanese)\n", " n_jobs: Number of parallel jobs for duration calculation (default: CPU count - 1)\n", " \"\"\"\n", " if n_jobs is None:\n", " n_jobs = max(1, cpu_count() - 1)\n", " \n", " # Read the CSV file\n", " rows = []\n", " has_duration = False\n", " \n", " with open(csv_path, 'r', encoding='utf-8') as f:\n", " reader = csv.DictReader(f)\n", " \n", " # Check if duration column exists\n", " if 'duration' in reader.fieldnames:\n", " has_duration = True\n", " print(\"Duration column found in CSV. Using provided durations.\")\n", " else:\n", " print(f\"Duration column not found. Will calculate durations using {n_jobs} parallel workers.\")\n", " \n", " for row in reader:\n", " rows.append(row)\n", " \n", " print(f\"Found {len(rows)} entries in the CSV file.\")\n", " \n", " # Process rows\n", " tot_duration = 0\n", " successful_entries = 0\n", " \n", " # Create/clear the manifest file\n", " with open(manifest_path, 'w') as fout:\n", " pass\n", " \n", " if has_duration:\n", " # Process without parallel computation\n", " print(\"Processing entries with provided durations...\")\n", " with open(manifest_path, 'a') as fout:\n", " for row in tqdm(rows):\n", " metadata = process_row_with_duration(row, lang)\n", " if metadata:\n", " json.dump(metadata, fout)\n", " fout.write('\\n')\n", " tot_duration += metadata['duration']\n", " successful_entries += 1\n", " else:\n", " # Process with parallel duration calculation\n", " print(\"Calculating audio durations in parallel...\")\n", " \n", " # Split processing into chunks for better progress tracking\n", " chunk_size = 100\n", " chunks = [rows[i:i + chunk_size] for i in range(0, len(rows), chunk_size)]\n", " \n", " with open(manifest_path, 'a') as fout:\n", " for chunk in tqdm(chunks, desc=\"Processing chunks\"):\n", " # Use multiprocessing pool for duration calculation\n", " with Pool(n_jobs) as pool:\n", " process_func = partial(process_row_without_duration, lang=lang)\n", " results = pool.map(process_func, chunk)\n", " \n", " # Write results\n", " for metadata in results:\n", " if metadata:\n", " json.dump(metadata, fout)\n", " fout.write('\\n')\n", " tot_duration += metadata['duration']\n", " successful_entries += 1\n", " \n", " print(f\"\\nProcessing complete!\")\n", " print(f\"Successfully processed: {successful_entries}/{len(rows)} entries\")\n", " print(f\"Total duration: {np.round(tot_duration/3600, 2)} hours\")\n", " print(f\"Manifest created at: {manifest_path}\")\n", " \n", " return manifest_path, tot_duration\n", "\n", "\n", "def verify_manifest(manifest_path, sample_size=5):\n", " \"\"\"Verify the manifest by displaying a few sample entries.\"\"\"\n", " print(f\"\\nVerifying manifest: {manifest_path}\")\n", " print(f\"Sample entries (first {sample_size}):\")\n", " \n", " with open(manifest_path, 'r') as f:\n", " for i, line in enumerate(f):\n", " if i >= sample_size:\n", " break\n", " entry = json.loads(line)\n", " print(f\"\\nEntry {i+1}:\")\n", " print(f\" Audio: {entry['audio_filepath']}\")\n", " print(f\" Duration: {entry['duration']:.2f}s\")\n", " print(f\" Text: {entry['text'][:50]}{'...' if len(entry['text']) > 50 else ''}\")\n", "\n", "\n", "# Example usage\n", "if __name__ == \"__main__\":\n", " # Example 1: Process the provided CSV file\n", " csv_path = \"/home/ubuntu/respair/Tsukasa_LITE_Qanary.csv\"\n", " manifest_path = \"/home/ubuntu/NeMo/data/tsukasa_manifest.json\"\n", " \n", " # Build the manifest\n", " build_manifest_from_csv(\n", " csv_path=csv_path,\n", " manifest_path=manifest_path,\n", " lang=\"ja\", # Japanese\n", " n_jobs=None # Use all cores - 1\n", " )\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "BRANCH='r2.3.0'\n", "def wget_from_nemo(nemo_script_path, local_dir=\"scripts\"):\n", " os.makedirs(local_dir, exist_ok=True)\n", " script_url = f\"https://raw.githubusercontent.com/NVIDIA/NeMo/refs/heads/{BRANCH}/{nemo_script_path}\"\n", " script_path = os.path.basename(nemo_script_path)\n", " if not os.path.exists(f\"{local_dir}/{script_path}\"):\n", " !wget -P {local_dir}/ {script_url}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# wget_from_nemo(\"scripts/speech_recognition/canary/build_canary_2_special_tokenizer.py\")\n", "output_dir = \"tokenizers/spl_tokens\"\n", "!mkdir -p {output_dir}\n", "!python scripts/speech_recognition/canary/build_canary_2_special_tokenizer.py {output_dir}" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/bin/bash: /home/ubuntu/miniconda3/envs/respair/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n" ] } ], "source": [ "!sudo rm -r /home/ubuntu/NeMo/tokenizers/spl_tokens" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/bin/bash: /home/ubuntu/miniconda3/envs/respair/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n", "--2025-08-02 15:48:13-- https://raw.githubusercontent.com/NVIDIA/NeMo/refs/heads/r2.3.0/scripts/tokenizers/process_asr_text_tokenizer.py\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 17146 (17K) [text/plain]\n", "Saving to: ‘scripts/process_asr_text_tokenizer.py’\n", "\n", "process_asr_text_to 100%[===================>] 16.74K --.-KB/s in 0.003s \n", "\n", "2025-08-02 15:48:13 (6.02 MB/s) - ‘scripts/process_asr_text_tokenizer.py’ saved [17146/17146]\n", "\n" ] } ], "source": [ "wget_from_nemo('scripts/tokenizers/process_asr_text_tokenizer.py')\n", "LANG='jp'\n", "DATA='TSUKA'\n", "VOCAB_SIZE=1024\n", "OUT_DIR = f\"tokenizers/{LANG}_{DATA}_{VOCAB_SIZE}\"\n", "manifest_path = \"/home/ubuntu/NeMo/data/tsukasa_manifest.json\"\n", "train_text_path =\"/home/ubuntu/NeMo/data/tsukasa_manifest.lst\"\n", "with open(manifest_path, \"r\") as f:\n", " data = [json.loads(line.strip()) for line in f.readlines()]\n", "with open(train_text_path, \"w\") as f:\n", " for line in data:\n", " f.write(f\"{line['text']}\\n\")\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/bin/bash: /home/ubuntu/miniconda3/envs/respair/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n", "[NeMo I 2025-08-02 15:50:45 nemo_logging:393] Processing /home/ubuntu/NeMo/data/tsukasa_manifest.lst and store at tokenizers/jp_TSUKA_1024/tokenizer_spe_bpe_v1024\n", "sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=/home/ubuntu/NeMo/data/tsukasa_manifest.lst --model_prefix=tokenizers/jp_TSUKA_1024/tokenizer_spe_bpe_v1024/tokenizer --vocab_size=1024 --shuffle_input_sentence=true --hard_vocab_limit=false --model_type=bpe --character_coverage=1.0 --bos_id=-1 --eos_id=-1 --remove_extra_whitespaces=false\n", "sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : \n", "trainer_spec {\n", " input: /home/ubuntu/NeMo/data/tsukasa_manifest.lst\n", " input_format: \n", " model_prefix: tokenizers/jp_TSUKA_1024/tokenizer_spe_bpe_v1024/tokenizer\n", " model_type: BPE\n", " vocab_size: 1024\n", " self_test_sample_size: 0\n", " character_coverage: 1\n", " input_sentence_size: 0\n", " shuffle_input_sentence: 1\n", " seed_sentencepiece_size: 1000000\n", " shrinking_factor: 0.75\n", " max_sentence_length: 4192\n", " num_threads: 16\n", " num_sub_iterations: 2\n", " max_sentencepiece_length: 16\n", " split_by_unicode_script: 1\n", " split_by_number: 1\n", " split_by_whitespace: 1\n", " split_digits: 0\n", " pretokenization_delimiter: \n", " treat_whitespace_as_suffix: 0\n", " allow_whitespace_only_pieces: 0\n", " required_chars: \n", " byte_fallback: 0\n", " vocabulary_output_piece_score: 1\n", " train_extremely_large_corpus: 0\n", " seed_sentencepieces_file: \n", " hard_vocab_limit: 0\n", " use_all_vocab: 0\n", " unk_id: 0\n", " bos_id: -1\n", " eos_id: -1\n", " pad_id: -1\n", " unk_piece: \n", " bos_piece: \n", " eos_piece: \n", " pad_piece: \n", " unk_surface: ⁇ \n", " enable_differential_privacy: 0\n", " differential_privacy_noise_level: 0\n", " differential_privacy_clipping_threshold: 0\n", "}\n", "normalizer_spec {\n", " name: nmt_nfkc\n", " add_dummy_prefix: 1\n", " remove_extra_whitespaces: 0\n", " escape_whitespaces: 1\n", " normalization_rule_tsv: \n", "}\n", "denormalizer_spec {}\n", "trainer_interface.cc(353) LOG(INFO) SentenceIterator is not specified. Using MultiFileSentenceIterator.\n", "trainer_interface.cc(185) LOG(INFO) Loading corpus: /home/ubuntu/NeMo/data/tsukasa_manifest.lst\n", "trainer_interface.cc(409) LOG(INFO) Loaded all 488729 sentences\n", "trainer_interface.cc(425) LOG(INFO) Adding meta_piece: \n", "trainer_interface.cc(430) LOG(INFO) Normalizing sentences...\n", "trainer_interface.cc(539) LOG(INFO) all chars count=32921930\n", "trainer_interface.cc(560) LOG(INFO) Alphabet size=89\n", "trainer_interface.cc(561) LOG(INFO) Final character coverage=1\n", "trainer_interface.cc(592) LOG(INFO) Done! preprocessed 488729 sentences.\n", "trainer_interface.cc(598) LOG(INFO) Tokenizing input sentences with whitespace: 488729\n", "trainer_interface.cc(609) LOG(INFO) Done! 291302\n", "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=1034735 min_freq=1\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=271215 size=20 all=1868 active=1762 piece=ka\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=161716 size=40 all=2954 active=2848 piece=ʔte\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=83371 size=60 all=4380 active=4274 piece=▁desɯ\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=56419 size=80 all=6100 active=5994 piece=▁ɯ\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=39490 size=100 all=8433 active=8327 piece=▁dʑa\n", "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=39478 min_freq=1761\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=26896 size=120 all=10127 active=2649 piece=ː,\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=21363 size=140 all=12094 active=4616 piece=ɕo\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=17930 size=160 all=14308 active=6830 piece=ː.\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=15622 size=180 all=16373 active=8895 piece=▁naɽa\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=13828 size=200 all=18200 active=10722 piece=▁ze\n", "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=13777 min_freq=1551\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=11759 size=220 all=19976 active=2730 piece=▁ano\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=10250 size=240 all=21790 active=4544 piece=▁mitai\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=8631 size=260 all=23455 active=6209 piece=taɽi\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=7995 size=280 all=24911 active=7665 piece=▁tsɯka\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=7268 size=300 all=26692 active=9446 piece=▁sen\n", "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=7217 min_freq=977\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=6778 size=320 all=28308 active=2910 piece=toɯ\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=6354 size=340 all=29507 active=4109 piece=▁tsɯkɯ\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=5983 size=360 all=30874 active=5476 piece=ː—\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=5580 size=380 all=32629 active=7231 piece=▁kakɯ\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=5169 size=400 all=34217 active=8819 piece=▁ona\n", "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=5142 min_freq=650\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=4900 size=420 all=35396 active=2854 piece=ɽei\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=4645 size=440 all=36976 active=4434 piece=▁kao\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=4280 size=460 all=38553 active=6011 piece=seɴ\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=4053 size=480 all=39979 active=7437 piece=rɯɴ\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3842 size=500 all=41237 active=8695 piece=kenai\n", "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=3830 min_freq=492\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3611 size=520 all=42650 active=3382 piece=▁kawai\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3449 size=540 all=44120 active=4852 piece=▁toʔte\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3305 size=560 all=45261 active=5993 piece=▁hoɽa\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3170 size=580 all=46443 active=7175 piece=▁moʔte\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3026 size=600 all=47398 active=8130 piece=▁tsɯite\n", "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=3017 min_freq=390\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2908 size=620 all=48580 active=3527 piece=▁tsɯzɯ\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2779 size=640 all=49765 active=4712 piece=▁baɕo\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2635 size=660 all=51134 active=6081 piece=▁kɯtɕi\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2554 size=680 all=52065 active=7012 piece=▁wakaɽi\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2444 size=700 all=53397 active=8344 piece=waɽi\n", "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=2444 min_freq=319\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2381 size=720 all=54657 active=3818 piece=▁harɯ\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2286 size=740 all=55443 active=4604 piece=▁ikenai\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2164 size=760 all=57108 active=6269 piece=▁hadʑimete\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2102 size=780 all=58234 active=7395 piece=gaʔte\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2040 size=800 all=59522 active=8683 piece=▁ɕimai\n", "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=2037 min_freq=269\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=1964 size=820 all=60584 active=4013 piece=▁sɯkɯ\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=1888 size=840 all=61568 active=4997 piece=▁natsɯ\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=1834 size=860 all=62710 active=6139 piece=▁totemo\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=1768 size=880 all=63277 active=6706 piece=rɯi\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=1703 size=900 all=64460 active=7889 piece=eta\n", "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=1701 min_freq=233\n", "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=1640 size=920 all=65670 active=4333 piece=ʔkakɯ\n", "trainer_interface.cc(687) LOG(INFO) Saving model: tokenizers/jp_TSUKA_1024/tokenizer_spe_bpe_v1024/tokenizer.model\n", "trainer_interface.cc(699) LOG(INFO) Saving vocabs: tokenizers/jp_TSUKA_1024/tokenizer_spe_bpe_v1024/tokenizer.vocab\n", "Serialized tokenizer at location : tokenizers/jp_TSUKA_1024/tokenizer_spe_bpe_v1024\n" ] } ], "source": [ "\n", "!python scripts/process_asr_text_tokenizer.py \\\n", " --data_file={train_text_path} \\\n", " --vocab_size={VOCAB_SIZE} \\\n", " --data_root={OUT_DIR} \\\n", " --tokenizer=\"spe\" \\\n", " --spe_type=bpe \\\n", " --spe_character_coverage=1.0 \\\n", " --no_lower_case \\\n", " --log" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Train" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/bin/bash: /home/ubuntu/miniconda3/envs/respair/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n", "--2025-08-02 15:54:46-- https://raw.githubusercontent.com/NVIDIA/NeMo/refs/heads/r2.3.0/examples/asr/speech_multitask/speech_to_text_aed.py\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 3875 (3.8K) [text/plain]\n", "Saving to: ‘scripts/speech_to_text_aed.py’\n", "\n", "speech_to_text_aed. 100%[===================>] 3.78K --.-KB/s in 0s \n", "\n", "2025-08-02 15:54:46 (59.9 MB/s) - ‘scripts/speech_to_text_aed.py’ saved [3875/3875]\n", "\n", "/bin/bash: /home/ubuntu/miniconda3/envs/respair/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n", "--2025-08-02 15:54:47-- https://raw.githubusercontent.com/NVIDIA/NeMo/refs/heads/r2.3.0/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 12239 (12K) [text/plain]\n", "Saving to: ‘config/fast-conformer_aed.yaml’\n", "\n", "fast-conformer_aed. 100%[===================>] 11.95K --.-KB/s in 0.001s \n", "\n", "2025-08-02 15:54:47 (14.3 MB/s) - ‘config/fast-conformer_aed.yaml’ saved [12239/12239]\n", "\n" ] } ], "source": [ "wget_from_nemo('examples/asr/speech_multitask/speech_to_text_aed.py')\n", "wget_from_nemo('examples/asr/conf/speech_multitask/fast-conformer_aed.yaml', 'config')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "# %%bash\n", "\n", "# HYDRA_FULL_ERROR=1 python scripts/speech_to_text_aed.py \\\n", "# --config-path=\"/home/ubuntu/NeMo/config\" \\\n", "# --config-name=\"/home/ubuntu/NeMo/config/fast-conformer_aed.yaml\" \\\n", "# name=\"canary-small\" \\\n", "# model.prompt_format=\"canary2\" \\\n", "# model.train_ds.manifest_filepath=\"/home/ubuntu/NeMo/data/tsukasa_manifest.json\" \\\n", "# model.validation_ds.manifest_filepath=\"/home/ubuntu/NeMo/data/tsukasa_manifest.json\" \\\n", "# model.test_ds.manifest_filepath=\"/home/ubuntu/NeMo/data/tsukasa_manifest.json\" \\\n", "# model.tokenizer.langs.jp.dir=\"/home/ubuntu/NeMo/tokenizers/jp_TSUKA_1024/tokenizer_spe_bpe_v1024\" \\\n", "# model.tokenizer.langs.spl_tokens.dir=\"/home/ubuntu/NeMo/tokenizers/spl_tokens\" \\\n", "# spl_tokens.model_dir=\"/home/ubuntu/NeMo/tokenizers/spl_tokens\" \\\n", "# model.encoder.n_layers=17 \\\n", "# model.transf_decoder.config_dict.num_layers=4 \\\n", "# model.transf_decoder.config_dict.max_sequence_length=512 \\ \n", "# model.model_defaults.asr_enc_hidden=512 \\\n", "# model.model_defaults.lm_dec_hidden=1024 \\\n", "# exp_manager.exp_dir=\"canary_results\" \\\n", "# exp_manager.resume_ignore_no_checkpoint=true \\\n", "# trainer.max_steps=200_000 \\\n", "# trainer.log_every_n_steps=50\n", "\n", "!bash /home/ubuntu/NeMo/train_qanary.sh" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train: 487507 samples (99.7%)\n", "Val: 1222 samples (0.3%)\n" ] } ], "source": [ "import json\n", "import random\n", "from sklearn.model_selection import train_test_split\n", "\n", "# Read all entries\n", "with open(\"/home/ubuntu/NeMo/data/tsukasa_manifest.json\", 'r') as f:\n", " all_data = [json.loads(line) for line in f]\n", "\n", "# Split 90/10 for train/val (adjust ratio as needed)\n", "train_data, val_data = train_test_split(all_data, test_size=0.0025, random_state=42)\n", "\n", "# Write train manifest\n", "with open(\"/home/ubuntu/NeMo/data/tsukasa_train.json\", 'w') as f:\n", " for entry in train_data:\n", " json.dump(entry, f)\n", " f.write('\\n')\n", "\n", "# Write validation manifest \n", "with open(\"/home/ubuntu/NeMo/data/tsukasa_val.json\", 'w') as f:\n", " for entry in val_data:\n", " json.dump(entry, f)\n", " f.write('\\n')\n", "\n", "print(f\"Train: {len(train_data)} samples ({len(train_data)/len(all_data)*100:.1f}%)\")\n", "print(f\"Val: {len(val_data)} samples ({len(val_data)/len(all_data)*100:.1f}%)\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train: 794 samples (65.0%)\n", "Val: 428 samples (35.0%)\n" ] } ], "source": [ "import json\n", "import random\n", "from sklearn.model_selection import train_test_split\n", "\n", "# Read all entries\n", "with open(\"/home/ubuntu/NeMo/data/tsukasa_val.json\", 'r') as f:\n", " all_data = [json.loads(line) for line in f]\n", "\n", "# Split 90/10 for train/val (adjust ratio as needed)\n", "train_data, val_data = train_test_split(all_data, test_size=0.35, random_state=42)\n", "\n", "# Write train manifest\n", "with open(\"/home/ubuntu/NeMo/data/tsukasa_val.json\", 'w') as f:\n", " for entry in train_data:\n", " json.dump(entry, f)\n", " f.write('\\n')\n", "\n", "# Write validation manifest \n", "with open(\"/home/ubuntu/NeMo/data/tsukasa_test.json\", 'w') as f:\n", " for entry in val_data:\n", " json.dump(entry, f)\n", " f.write('\\n')\n", "\n", "print(f\"Train: {len(train_data)} samples ({len(train_data)/len(all_data)*100:.1f}%)\")\n", "print(f\"Val: {len(val_data)} samples ({len(val_data)/len(all_data)*100:.1f}%)\")" ] } ], "metadata": { "kernelspec": { "display_name": "respair", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.9" } }, "nbformat": 4, "nbformat_minor": 2 }