"""Find candidate voiced segments in an audio file via an intensity threshold.

The script loads ``AUDIO_PATH``, computes its intensity contour, marks frames
whose intensity exceeds a percentile-based threshold, merges consecutive
marked frames into ``(start, end)`` segments, and prints the segments along
with the subset that is long enough to be a possible "Madd" candidate.
"""

import numpy as np
import parselmouth

AUDIO_PATH = "sample_trim.wav"

# Spacing between intensity frames, in seconds.
INTENSITY_TIME_STEP = 0.01
# Frames whose intensity is above this percentile count as "voiced-ish".
VOICED_PERCENTILE = 60
# Segments shorter than this many seconds are discarded as tiny blips.
MIN_SEGMENT_SEC = 0.06
# Segments at least this long (seconds) are reported as long candidates.
LONG_SEGMENT_SEC = 0.18
# Maximum number of segments printed per listing.
MAX_PRINTED = 12


def _mask_to_segments(times, mask, min_duration):
    """Convert a per-frame boolean mask into a list of (start, end) times.

    Args:
        times: Sequence of frame times, parallel to ``mask``.
        mask: Sequence of truthy/falsy flags, one per frame.
        min_duration: Segments with ``end - start`` below this are dropped.

    Returns:
        A list of ``(start, end)`` tuples. ``start`` is the time of the first
        flagged frame of a run; ``end`` is the time of the first unflagged
        frame that follows it. A run still open when the data ends is closed
        at the last time stamp instead.
    """
    segments = []
    start = None  # None means "not currently inside a segment"
    for t, active in zip(times, mask):
        if active and start is None:
            start = t
        elif not active and start is not None:
            if t - start >= min_duration:
                segments.append((start, t))
            start = None

    # Close a run that extends to the end of the recording.
    if start is not None:
        end = times[-1]
        if end - start >= min_duration:
            segments.append((start, end))
    return segments


def _print_segments(segments, limit=MAX_PRINTED):
    """Print up to ``limit`` segments as numbered ``start -> end (dur)`` lines."""
    for i, (s, e) in enumerate(segments[:limit], 1):
        print(f"{i:02d}. {s:.2f} -> {e:.2f} (dur {e - s:.2f}s)")


def main():
    """Load the audio file, segment it by intensity, and print a report."""
    snd = parselmouth.Sound(AUDIO_PATH)
    duration = snd.get_total_duration()
    print("Audio duration (sec):", round(duration, 2))

    # Intensity (energy over time)
    intensity = snd.to_intensity(time_step=INTENSITY_TIME_STEP)
    times = intensity.xs()
    vals = intensity.values[0]

    # Simple segmentation: the threshold adapts to this recording because it
    # is a percentile of the recording's own intensity values.
    thr = np.percentile(vals, VOICED_PERCENTILE)
    voiced = vals > thr

    segments = _mask_to_segments(times, voiced, MIN_SEGMENT_SEC)

    print("Candidate voiced segments:", len(segments))
    _print_segments(segments)

    # Heuristic "madd-like" durations: a segment lasting at least
    # LONG_SEGMENT_SEC is treated as a suspiciously long vowel.
    longish = [(s, e) for (s, e) in segments if (e - s) >= LONG_SEGMENT_SEC]
    print("\nLong segments (possible Madd candidates):", len(longish))
    _print_segments(longish)


if __name__ == "__main__":
    main()