import pathlib import click import librosa import matplotlib.pyplot as plt import numpy as np import parselmouth as pm import tqdm from textgrid import TextGrid import distribution @click.command(help='Generate word-level pitch summary') @click.option('--wavs', required=True, help='Path to the segments directory') @click.option('--tg', required=True, help='Path to the TextGrids directory') def summary_pitch(wavs, tg): wavs = pathlib.Path(wavs) tg_dir = pathlib.Path(tg) del tg filelist = list(wavs.glob('*.wav')) pit_map = {} f0_min = 40. f0_max = 1100. voicing_thresh_vowel = 0.45 for wavfile in tqdm.tqdm(filelist): tg = TextGrid() tg.read(tg_dir / wavfile.with_suffix('.TextGrid').name) timestep = 0.01 f0 = pm.Sound(str(wavfile)).to_pitch_ac( time_step=timestep, voicing_threshold=voicing_thresh_vowel, pitch_floor=f0_min, pitch_ceiling=f0_max, ).selected_array['frequency'] pitch = 12. * np.log2(f0 / 440.) + 69. for word in tg[0]: if word.mark in ['AP', 'SP']: continue if word.maxTime - word.minTime < timestep: continue word_pit = pitch[int(word.minTime / timestep): int(word.maxTime / timestep)] word_pit = np.extract(word_pit >= 0, word_pit) if word_pit.shape[0] == 0: continue counts = np.bincount(word_pit.astype(np.int64)) midi = counts.argmax() if midi in pit_map: pit_map[midi] += 1 else: pit_map[midi] = 1 midi_keys = sorted(pit_map.keys()) midi_keys = list(range(midi_keys[0], midi_keys[-1] + 1)) distribution.draw_distribution( title='Pitch Distribution Summary', x_label='Pitch', y_label='Number of occurrences', items=[librosa.midi_to_note(k) for k in midi_keys], values=[pit_map.get(k, 0) for k in midi_keys] ) pitch_summary = wavs / 'pitch_distribution.jpg' plt.savefig(fname=pitch_summary, bbox_inches='tight', pad_inches=0.25) print(f'Pitch distribution summary saved to {pitch_summary}') if __name__ == '__main__': summary_pitch()