piper-de-glados / 2_transcribe.py
systemofapwne's picture
Initial commit
072e103
#!/usr/bin/env python3
# Inspired by https://blog.networkchuck.com/posts/how-to-clone-a-voice/
# Enhanced by https://github.com/SYSTRAN/faster-whisper
import os
from faster_whisper import WhisperModel
# --- Settings -------------------------------------------------------------
# Language code of the recordings and the Whisper model used to transcribe
# them. A large model running on a GPU is strongly recommended.
LANG = "de"
WHISPER_MODEL = "large-v3"

# Directory holding the input .wav files, and the metadata file to create.
audio_dir = "./raw_good"
output_csv = "./metadata.csv"

# Load the model on the GPU with float16 weights.
# To run on the CPU instead (much slower), pass device="cpu" and
# compute_type="int8" here.
model = WhisperModel(
    WHISPER_MODEL,
    device="cuda",
    compute_type="float16",
)
# Collect the names of all .wav entries in the input directory, in
# alphabetical order.
audio_files = sorted(
    name for name in os.listdir(audio_dir) if name.endswith(".wav")
)
# Transcribe every clip and write one "<file id>|<transcription>" row per
# clip to the metadata file.
# The encoding is pinned to UTF-8 so that non-ASCII characters in the
# transcripts (e.g. German umlauts and ß) are written the same way on every
# platform instead of depending on the locale's default encoding.
with open(output_csv, "w", encoding="utf-8") as f:
    for audio_file in audio_files:
        # Full path to the audio file
        audio_path = os.path.join(audio_dir, audio_file)
        # Only the segments are used; the second return value is ignored.
        segments, _ = model.transcribe(audio_path, language=LANG, beam_size=5)
        # Join the trimmed text of all segments with single spaces. The
        # outer strip() drops leading/trailing blanks left by empty segments.
        transcription = " ".join(seg.text.strip() for seg in segments).strip()
        # The row id is the file name without its extension
        file_id = os.path.splitext(audio_file)[0]
        row = f"{file_id}|{transcription}"
        f.write(row + "\n")
        # Echo the row to the console so progress is visible
        print(row)

print(f"Transcriptions complete! Metadata saved to {output_csv}")