import parselmouth
import numpy as np

# Audio file that main() loads and analyses.
AUDIO_PATH = "sample_trim.wav"

# Slack for duration comparisons: frame times are floats (offset + k * step),
# so a span of exactly N frames can come out a hair below N * step and would
# otherwise be rejected by a strict ``>=`` check.
_DURATION_TOLERANCE = 1e-9


def _mask_to_segments(times, mask, frame_step, min_duration):
    """Collapse a per-frame boolean mask into ``(start, end)`` time segments.

    Args:
        times: Frame times in seconds, one per mask entry.
        mask: Boolean sequence; ``True`` marks frames above the threshold.
        frame_step: Spacing between consecutive frames, in seconds.
        min_duration: Segments shorter than this (seconds) are discarded.

    Returns:
        A list of ``(start, end)`` float tuples in chronological order.
        ``start`` is the time of the first ``True`` frame of a run and
        ``end`` is the time of the first ``False`` frame after it.
    """
    mask = np.asarray(mask, dtype=bool)
    if mask.size == 0:
        return []

    # Pad with False on both sides so every run of True frames has both a
    # rising (+1) and a falling (-1) edge, including runs touching the ends.
    padded = np.concatenate(([False], mask, [False])).astype(np.int8)
    edges = np.diff(padded)
    first_frames = np.flatnonzero(edges == 1)
    stop_frames = np.flatnonzero(edges == -1)  # index just past each run

    last_index = mask.size - 1
    segments = []
    for first, stop in zip(first_frames, stop_frames):
        start = float(times[first])
        if stop <= last_index:
            end = float(times[stop])
        else:
            # The run reaches the final frame, so there is no following
            # below-threshold frame to read the end time from. Extrapolate
            # one step so it is measured like every interior segment.
            end = float(times[last_index]) + frame_step
        if end - start >= min_duration - _DURATION_TOLERANCE:
            segments.append((start, end))
    return segments


def _print_segments(segments, limit):
    """Print at most *limit* segments as numbered ``start -> end (dur)`` lines."""
    for i, (s, e) in enumerate(segments[:limit], 1):
        print(f"{i:02d}. {s:.2f} -> {e:.2f}  (dur {e - s:.2f}s)")


def main(audio_path=None, *, time_step=0.01, threshold_percentile=60,
         min_segment=0.06, long_segment=0.18, max_listed=12):
    """Report loud ("voiced-ish") segments of an audio file and flag long ones.

    The intensity contour of the sound is thresholded at a percentile of its
    own values, consecutive above-threshold frames are merged into segments,
    and segments of at least ``long_segment`` seconds are listed separately
    as possible Madd (long vowel) candidates. Results are printed to stdout.

    Args:
        audio_path: File to analyse; defaults to the module-level AUDIO_PATH.
        time_step: Intensity frame spacing in seconds.
        threshold_percentile: Percentile of the intensity values used as the
            adaptive loudness threshold.
        min_segment: Minimum segment duration in seconds; shorter blips are
            ignored.
        long_segment: Duration in seconds from which a segment is reported
            as a long-segment candidate.
        max_listed: Maximum number of segments printed in each listing.
    """
    if audio_path is None:
        audio_path = AUDIO_PATH
    snd = parselmouth.Sound(audio_path)

    duration = snd.get_total_duration()
    print("Audio duration (sec):", round(duration, 2))

    # Intensity (energy over time)
    intensity = snd.to_intensity(time_step=time_step)
    times = intensity.xs()
    vals = intensity.values[0]

    if vals.size:
        # Adaptive threshold: frames louder than the chosen percentile of
        # this recording's own intensity values count as "voiced-ish".
        thr = np.percentile(vals, threshold_percentile)
        segments = _mask_to_segments(times, vals > thr, time_step, min_segment)
    else:
        # No intensity frames at all: nothing to threshold or segment.
        segments = []

    print("Candidate voiced segments:", len(segments))
    _print_segments(segments, max_listed)

    # Heuristic "madd-like" durations: long segments are suspiciously long vowels
    longish = [
        (s, e) for (s, e) in segments
        if (e - s) >= long_segment - _DURATION_TOLERANCE
    ]
    print("\nLong segments (possible Madd candidates):", len(longish))
    _print_segments(longish, max_listed)

# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()