| """
|
| Used to transcribe all audio files in one folder into another folder.
|
| e.g.
|
| Directory structure:
|
| --pre_data_root
|
| ----SP_1
|
| ------01.wav
|
| ------02.wav
|
| ------......
|
| ----SP_2
|
| ------01.wav
|
| ------02.wav
|
| ------......
|
| Use
|
| python tools/whisper_asr.py --audio-dir pre_data_root/SP_1 --save-dir data/SP_1
|
| to transcribe the first speaker.
|
|
|
| Use
|
| python tools/whisper_asr.py --audio-dir pre_data_root/SP_2 --save-dir data/SP_2
|
| to transcribe the second speaker.
|
|
|
| Note: Be aware of your audio sample rate, which defaults to 44.1kHz.
|
| """
|
|
|
| import re
|
| from pathlib import Path
|
|
|
| import click
|
| import soundfile as sf
|
| from faster_whisper import WhisperModel
|
| from loguru import logger
|
| from pydub import AudioSegment
|
| from tqdm import tqdm
|
|
|
| from tools.file import AUDIO_EXTENSIONS, list_files
|
|
|
|
|
| @click.command()
|
| @click.option("--model-size", default="large-v3", help="Size of the Whisper model")
|
| @click.option(
|
| "--compute-type",
|
| default="float16",
|
| help="Computation Precision of the Whisper model [float16 / int8_float16 / int8]",
|
| )
|
| @click.option("--audio-dir", required=True, help="Directory containing audio files")
|
| @click.option(
|
| "--save-dir", required=True, help="Directory to save processed audio files"
|
| )
|
| @click.option(
|
| "--sample-rate",
|
| default=44100,
|
| type=int,
|
| help="Output sample rate, default to input sample rate",
|
| )
|
| @click.option("--device", default="cuda", help="Device to use [cuda / cpu]")
|
| @click.option("--language", default="auto", help="Language of the transcription")
|
| @click.option("--initial-prompt", default=None, help="Initial prompt for transcribing")
|
| def main(
|
| model_size,
|
| compute_type,
|
| audio_dir,
|
| save_dir,
|
| sample_rate,
|
| device,
|
| language,
|
| initial_prompt,
|
| ):
|
| logger.info("Loading / Downloading Faster Whisper model...")
|
|
|
| model = WhisperModel(
|
| model_size,
|
| device=device,
|
| compute_type=compute_type,
|
| download_root="faster_whisper",
|
| )
|
|
|
| logger.info("Model loaded.")
|
|
|
| save_path = Path(save_dir)
|
| save_path.mkdir(parents=True, exist_ok=True)
|
|
|
| audio_files = list_files(
|
| path=audio_dir, extensions=AUDIO_EXTENSIONS, recursive=True
|
| )
|
|
|
| for file_path in tqdm(audio_files, desc="Processing audio file"):
|
| file_stem = file_path.stem
|
| file_suffix = file_path.suffix
|
|
|
| rel_path = Path(file_path).relative_to(audio_dir)
|
| (save_path / rel_path.parent).mkdir(parents=True, exist_ok=True)
|
|
|
| audio = AudioSegment.from_file(file_path)
|
|
|
| segments, info = model.transcribe(
|
| file_path,
|
| beam_size=5,
|
| language=None if language == "auto" else language,
|
| initial_prompt=initial_prompt,
|
| )
|
|
|
| print(
|
| "Detected language '%s' with probability %f"
|
| % (info.language, info.language_probability)
|
| )
|
| print("Total len(ms): ", len(audio))
|
|
|
| whole_text = None
|
| for segment in segments:
|
| id, start, end, text = (
|
| segment.id,
|
| segment.start,
|
| segment.end,
|
| segment.text,
|
| )
|
| print("Segment %03d [%.2fs -> %.2fs] %s" % (id, start, end, text))
|
| if not whole_text:
|
| whole_text = text
|
| else:
|
| whole_text += ", " + text
|
|
|
| whole_text += "."
|
|
|
| audio_save_path = save_path / rel_path.parent / f"{file_stem}{file_suffix}"
|
| audio.export(audio_save_path, format=file_suffix[1:])
|
| print(f"Exported {audio_save_path}")
|
|
|
| transcript_save_path = save_path / rel_path.parent / f"{file_stem}.lab"
|
| with open(
|
| transcript_save_path,
|
| "w",
|
| encoding="utf-8",
|
| ) as f:
|
| f.write(whole_text)
|
|
|
|
|
| if __name__ == "__main__":
|
| main()
|
| exit(0)
|
|
|
| audio = AudioSegment.from_wav(
|
| r"D:\PythonProject\原神语音中文\胡桃\vo_hutao_draw_appear.wav"
|
| )
|
|
|
| model_size = "large-v3"
|
|
|
| model = WhisperModel(
|
| model_size,
|
| device="cuda",
|
| compute_type="float16",
|
| download_root="faster_whisper",
|
| )
|
|
|
| segments, info = model.transcribe(
|
| r"D:\PythonProject\原神语音中文\胡桃\vo_hutao_draw_appear.wav",
|
| beam_size=5,
|
| )
|
|
|
| print(
|
| "Detected language '%s' with probability %f"
|
| % (info.language, info.language_probability)
|
| )
|
| print("Total len(ms): ", len(audio))
|
|
|
| for i, segment in enumerate(segments):
|
| print(
|
| "Segment %03d [%.2fs -> %.2fs] %s"
|
| % (i, segment.start, segment.end, segment.text)
|
| )
|
| start_ms = int(segment.start * 1000)
|
| end_ms = int(segment.end * 1000)
|
| segment_audio = audio[start_ms:end_ms]
|
| segment_audio.export(f"segment_{i:03d}.wav", format="wav")
|
| print(f"Exported segment_{i:03d}.wav")
|
|
|
| print("All segments have been exported.")
|
|
|