using System;
using UnityEngine;

namespace OnDeviceAgent.Inference
{
    /// <summary>
    /// Captures audio from the default microphone, converts it to mono at
    /// <see cref="TargetSampleRate"/> Hz, keeps a rolling history of the most recent samples and
    /// publishes every new chunk through <see cref="SamplesReady"/>.
    /// </summary>
    public sealed class AudioInputService : MonoBehaviour
    {
        /// <summary>Sample rate, in Hz, of everything this service publishes and stores.</summary>
        public const int TargetSampleRate = 16000;

        [SerializeField] bool m_StartOnEnable = true;
        [SerializeField] int m_MicrophoneLoopSeconds = 10;
        [SerializeField] int m_HistorySeconds = 12;
        [SerializeField] float m_SilenceDetectionSeconds = 4f;
        [SerializeField] float m_SilenceEpsilon = 1e-6f;
        [SerializeField] float m_RestartCooldownSeconds = 5f;

        /// <summary>
        /// Raised for every published chunk with: the sample buffer, the number of valid samples
        /// at the start of that buffer, and the sequence number of the first sample.
        /// The buffer is an internal scratch array that is reused for later chunks, so
        /// subscribers must copy anything they want to keep beyond the callback.
        /// </summary>
        public event Action<float[], int, long> SamplesReady;

        /// <summary>
        /// While true, captured samples are neither stored nor published; the sample sequence
        /// counter still advances.
        /// </summary>
        public bool Mute { get; set; }

        AudioClip m_MicrophoneClip;
        int m_LastMicrophonePosition;

        // Holds down-mixed source-rate samples that the resampler has not fully consumed yet.
        readonly FloatRingBuffer m_ResampleBuffer = new FloatRingBuffer(TargetSampleRate);
        FloatRingBuffer m_History;

        float[] m_ReadBuffer = Array.Empty<float>();
        float[] m_OutputBuffer = Array.Empty<float>();

        // Fractional read position of the resampler, relative to the start of m_ResampleBuffer.
        float m_ResampleCursor;
        long m_NextSampleSequence;
        float m_SilentSeconds;
        float m_LastRestartTime = float.MinValue;

        // Largest absolute sample value seen while reading the current frame's microphone data.
        float m_FrameMaxAbs;

        /// <summary>Number of samples currently held in the history buffer.</summary>
        public int HistoryCount => m_History.Count;

        /// <summary>True while a microphone clip is held, i.e. capture has been started.</summary>
        public bool IsRunning => m_MicrophoneClip != null;

        /// <summary>Sequence number that the next captured sample will receive.</summary>
        public long NextSampleSequence => m_NextSampleSequence;

        void Awake()
        {
            // Guard against a zero/negative inspector value producing an unusable capacity.
            m_History = new FloatRingBuffer(TargetSampleRate * Mathf.Max(1, m_HistorySeconds));
        }

        void OnEnable()
        {
            if (m_StartOnEnable)
            {
                StartMicrophone();
            }
        }

        void Update()
        {
            if (m_MicrophoneClip != null)
            {
                ReadMicrophoneSamples();
            }
        }

        void OnDisable()
        {
            StopMicrophone();
        }

        /// <summary>
        /// Starts looping capture on the default microphone and resets all capture state
        /// (read position, resampler, history and sample sequence). Logs an error and leaves
        /// the service stopped when no device exists or the recording cannot be started.
        /// </summary>
        public void StartMicrophone()
        {
            if (Microphone.devices.Length == 0)
            {
                Debug.LogError("[Voice] No microphone device found.");
                return;
            }

            // A non-positive loop length is not a usable recording length; fall back to 1 second.
            var loopSeconds = Mathf.Max(1, m_MicrophoneLoopSeconds);
            m_MicrophoneClip = Microphone.Start(null, true, loopSeconds, TargetSampleRate);
            if (m_MicrophoneClip == null)
            {
                Debug.LogError("[Voice] Failed to start default microphone.");
                return;
            }

            m_LastMicrophonePosition = 0;
            m_ResampleCursor = 0f;
            m_ResampleBuffer.Clear();
            m_History.Clear();
            m_NextSampleSequence = 0;
            Debug.Log($"[Voice] Mic started: default device, {m_MicrophoneClip.frequency} Hz, {m_MicrophoneClip.channels} ch");
        }

        /// <summary>
        /// Stops recording on the default microphone and releases the recording clip.
        /// Safe to call when capture is not running.
        /// </summary>
        public void StopMicrophone()
        {
            if (Microphone.IsRecording(null))
            {
                Microphone.End(null);
            }

            // The clip is a runtime-created Unity object; dropping the reference alone does not
            // free it, and the silence watchdog can restart capture many times per session.
            if (m_MicrophoneClip != null)
            {
                Destroy(m_MicrophoneClip);
            }

            m_MicrophoneClip = null;
            m_LastMicrophonePosition = 0;
        }

        /// <summary>
        /// Copies the most recent <paramref name="sampleCount"/> history samples into
        /// <paramref name="destination"/> starting at <paramref name="destinationIndex"/>.
        /// </summary>
        public void CopyLatest(int sampleCount, float[] destination, int destinationIndex)
        {
            m_History.CopyLatest(sampleCount, destination, destinationIndex);
        }

        /// <summary>
        /// Reads everything the microphone wrote into the looping clip since the previous call,
        /// then updates the silence watchdog and restarts capture if the input stayed silent.
        /// </summary>
        void ReadMicrophoneSamples()
        {
            if (!Microphone.IsRecording(null))
            {
                TryRestart("[Voice] Mic stopped unexpectedly; restarting");
                return;
            }

            var clipSamples = m_MicrophoneClip.samples;
            var position = Microphone.GetPosition(null);
            if (clipSamples <= 0 || position < 0)
            {
                return;
            }

            // Normalise both positions into [0, clipSamples).
            if (position >= clipSamples)
            {
                position = 0;
            }

            if (m_LastMicrophonePosition >= clipSamples)
            {
                m_LastMicrophonePosition = 0;
            }

            if (position == m_LastMicrophonePosition)
            {
                return;
            }

            m_FrameMaxAbs = 0f;
            if (position > m_LastMicrophonePosition)
            {
                ReadMicrophoneRange(m_LastMicrophonePosition, position - m_LastMicrophonePosition);
            }
            else
            {
                // The write head wrapped around: read the tail of the clip, then its head.
                ReadMicrophoneRange(m_LastMicrophonePosition, clipSamples - m_LastMicrophonePosition);
                ReadMicrophoneRange(0, position);
            }

            m_LastMicrophonePosition = position;

            if (m_FrameMaxAbs >= m_SilenceEpsilon)
            {
                m_SilentSeconds = 0f;
                return;
            }

            // Only frames that delivered new (silent) data add to the silence timer.
            m_SilentSeconds += Time.unscaledDeltaTime;
            if (m_SilentSeconds >= m_SilenceDetectionSeconds)
            {
                TryRestart("[Voice] Mic silenced detected; restarting capture");
            }
        }

        /// <summary>
        /// Restarts capture unless the previous restart happened less than the configured
        /// cooldown ago, in which case the call does nothing.
        /// </summary>
        void TryRestart(string logMessage)
        {
            if (Time.unscaledTime - m_LastRestartTime < m_RestartCooldownSeconds)
            {
                return;
            }

            Debug.Log(logMessage);
            m_LastRestartTime = Time.unscaledTime;
            m_SilentSeconds = 0f;
            StopMicrophone();
            StartMicrophone();
        }

        /// <summary>
        /// Reads <paramref name="sampleFrames"/> frames starting at
        /// <paramref name="offsetSamples"/> from the clip, tracks the peak amplitude for the
        /// silence watchdog, converts to mono at the target rate when needed and publishes
        /// the result.
        /// </summary>
        void ReadMicrophoneRange(int offsetSamples, int sampleFrames)
        {
            var clipSamples = m_MicrophoneClip.samples;
            if (sampleFrames <= 0 || offsetSamples < 0 || offsetSamples >= clipSamples)
            {
                return;
            }

            sampleFrames = Mathf.Min(sampleFrames, clipSamples - offsetSamples);
            var channels = Mathf.Max(1, m_MicrophoneClip.channels);
            var sampleCount = sampleFrames * channels;
            if (sampleCount <= 0)
            {
                return;
            }

            // Sized exactly rather than grow-only: per Unity's AudioClip.GetData documentation
            // the number of samples read is determined by the length of the array passed in.
            if (m_ReadBuffer.Length != sampleCount)
            {
                m_ReadBuffer = new float[sampleCount];
            }

            m_MicrophoneClip.GetData(m_ReadBuffer, offsetSamples);

            for (var i = 0; i < sampleCount; i++)
            {
                var abs = Mathf.Abs(m_ReadBuffer[i]);
                if (abs > m_FrameMaxAbs)
                {
                    m_FrameMaxAbs = abs;
                }
            }

            var frequency = m_MicrophoneClip.frequency;
            var sameRate = frequency == TargetSampleRate;

            // Already mono at the target rate: publish the raw data without conversion.
            if (sameRate && channels == 1)
            {
                PublishSamples(m_ReadBuffer, sampleFrames);
                return;
            }

            // A few samples of slack cover rounding and samples carried over by the resampler.
            var outputCapacity = sameRate
                ? sampleFrames
                : Mathf.CeilToInt(sampleFrames * (TargetSampleRate / (float)frequency)) + 4;
            if (m_OutputBuffer.Length < outputCapacity)
            {
                m_OutputBuffer = new float[outputCapacity];
            }

            var outputCount = sameRate
                ? DownmixInto(m_ReadBuffer, sampleFrames, channels, m_OutputBuffer)
                : ResampleInto(m_ReadBuffer, sampleFrames, channels, frequency, m_OutputBuffer);
            if (outputCount > 0)
            {
                PublishSamples(m_OutputBuffer, outputCount);
            }
        }

        /// <summary>
        /// Advances the sample sequence by <paramref name="length"/> and, unless capture is
        /// muted or TTS playback is reported as speaking, appends the samples to the history
        /// and raises <see cref="SamplesReady"/>.
        /// </summary>
        void PublishSamples(float[] samples, int length)
        {
            var startSequence = m_NextSampleSequence;
            m_NextSampleSequence += length;

            // The sequence still advances above, so suppressed audio leaves a gap in the
            // numbering instead of shifting later samples.
            if (Mute || SupertonicTtsModule.IsSpeaking)
            {
                return;
            }

            m_History.AddRange(samples, length);
            SamplesReady?.Invoke(samples, length, startSequence);
        }

        /// <summary>
        /// Averages each interleaved frame of <paramref name="source"/> into one mono sample.
        /// </summary>
        /// <returns>The number of samples written, which equals <paramref name="sampleFrames"/>.</returns>
        static int DownmixInto(float[] source, int sampleFrames, int channels, float[] destination)
        {
            for (var frame = 0; frame < sampleFrames; frame++)
            {
                destination[frame] = Downmix(source, frame, channels);
            }

            return sampleFrames;
        }

        /// <summary>
        /// Down-mixes <paramref name="source"/> to mono and linearly resamples it from
        /// <paramref name="sourceRate"/> to <see cref="TargetSampleRate"/>. Source samples that
        /// cannot be consumed yet stay buffered for the next call.
        /// </summary>
        /// <returns>The number of samples written to <paramref name="destination"/>.</returns>
        int ResampleInto(float[] source, int sampleFrames, int channels, int sourceRate, float[] destination)
        {
            // NOTE(review): all frames are queued before any are consumed; confirm how
            // FloatRingBuffer.Add behaves if one call delivers more frames than its capacity.
            for (var frame = 0; frame < sampleFrames; frame++)
            {
                m_ResampleBuffer.Add(Downmix(source, frame, channels));
            }

            var outputCount = 0;
            var step = sourceRate / (float)TargetSampleRate;

            // Interpolating at the cursor needs the samples at floor(cursor) and floor(cursor) + 1.
            while (m_ResampleBuffer.Count >= Mathf.CeilToInt(m_ResampleCursor) + 2 && outputCount < destination.Length)
            {
                var i0 = Mathf.FloorToInt(m_ResampleCursor);
                var t = m_ResampleCursor - i0;
                destination[outputCount++] = Mathf.Lerp(m_ResampleBuffer[i0], m_ResampleBuffer[i0 + 1], t);
                m_ResampleCursor += step;

                // Discard fully consumed samples, always keeping two buffered. The cursor is
                // reduced only by what was actually removed: when the cursor has moved past
                // the buffered data, the remaining offset carries over to the next call rather
                // than being dropped, which would rewind the read position and emit duplicates.
                var consumed = Mathf.Min(Mathf.FloorToInt(m_ResampleCursor), m_ResampleBuffer.Count - 2);
                if (consumed > 0)
                {
                    m_ResampleBuffer.RemoveFromStart(consumed);
                    m_ResampleCursor -= consumed;
                }
            }

            return outputCount;
        }

        /// <summary>
        /// Returns the average of all channels of interleaved frame <paramref name="frame"/>.
        /// </summary>
        static float Downmix(float[] data, int frame, int channels)
        {
            var sum = 0f;
            var baseIndex = frame * channels;
            for (var channel = 0; channel < channels; channel++)
            {
                sum += data[baseIndex + channel];
            }

            return sum / channels;
        }
    }
}