| using System; |
| using UnityEngine; |
| namespace OnDeviceAgent.Inference |
| { |
|
|
/// <summary>
/// Polls the default microphone once per frame, converts the captured audio to mono at
/// <see cref="TargetSampleRate"/>, appends it to a rolling history buffer and raises
/// <see cref="SamplesReady"/> for every chunk.
/// </summary>
/// <remarks>
/// Capture is restarted automatically (subject to a cooldown) when the microphone stops
/// recording or when the raw input stays below the silence threshold for too long.
/// </remarks>
public sealed class AudioInputService : MonoBehaviour
{
    /// <summary>Sample rate, in Hz, requested from the microphone and used for published samples.</summary>
    public const int TargetSampleRate = 16000;

    // When true, OnEnable starts the microphone automatically.
    [SerializeField] bool m_StartOnEnable = true;

    // Length, in seconds, of the looping clip requested from Microphone.Start.
    [SerializeField] int m_MicrophoneLoopSeconds = 10;

    // Capacity of the history buffer, in seconds of audio at TargetSampleRate.
    [SerializeField] int m_HistorySeconds = 12;

    // Accumulated silent time after which a capture restart is attempted.
    [SerializeField] float m_SilenceDetectionSeconds = 4f;

    // A frame whose peak absolute raw sample is below this value counts as silent.
    [SerializeField] float m_SilenceEpsilon = 1e-6f;

    // Minimum time between two automatic restarts.
    [SerializeField] float m_RestartCooldownSeconds = 5f;

    /// <summary>
    /// Raised for every published chunk with (samples, length, startSequence). Only the first
    /// <c>length</c> entries of <c>samples</c> are valid. The array is an internal buffer that
    /// is reused by later reads, so subscribers must copy anything they need to keep.
    /// </summary>
    public event Action<float[], int, long> SamplesReady;

    /// <summary>
    /// When true, captured samples are neither stored in the history nor published;
    /// the sample sequence counter still advances.
    /// </summary>
    public bool Mute { get; set; }

    AudioClip m_MicrophoneClip;
    int m_LastMicrophonePosition;
    readonly FloatRingBuffer m_ResampleBuffer = new FloatRingBuffer(TargetSampleRate);
    FloatRingBuffer m_History;
    float[] m_ReadBuffer = Array.Empty<float>();
    float[] m_OutputBuffer = Array.Empty<float>();
    float m_ResampleCursor;
    long m_NextSampleSequence;
    float m_SilentSeconds;
    float m_LastRestartTime = float.MinValue;
    float m_FrameMaxAbs;

    /// <summary>Number of samples currently held in the history buffer.</summary>
    public int HistoryCount => m_History.Count;

    /// <summary>True while a microphone clip is held, i.e. between a successful start and the next stop.</summary>
    public bool IsRunning => m_MicrophoneClip != null;

    /// <summary>Sequence number that will be assigned to the next output sample.</summary>
    public long NextSampleSequence => m_NextSampleSequence;

    void Awake()
    {
        m_History = new FloatRingBuffer(TargetSampleRate * m_HistorySeconds);
    }

    void OnEnable()
    {
        if (m_StartOnEnable)
        {
            StartMicrophone();
        }
    }

    void Update()
    {
        if (m_MicrophoneClip != null)
        {
            ReadMicrophoneSamples();
        }
    }

    void OnDisable()
    {
        StopMicrophone();
    }

    /// <summary>
    /// Starts looping capture on the default microphone and resets all per-session state
    /// (read position, resampler, history, sequence counter, silence timer).
    /// Logs an error and returns without resetting state when no device exists or the start fails.
    /// </summary>
    public void StartMicrophone()
    {
        if (Microphone.devices.Length == 0)
        {
            Debug.LogError("[Voice] No microphone device found.");
            return;
        }

        m_MicrophoneClip = Microphone.Start(null, true, m_MicrophoneLoopSeconds, TargetSampleRate);
        if (m_MicrophoneClip == null)
        {
            Debug.LogError("[Voice] Failed to start default microphone.");
            return;
        }

        m_LastMicrophonePosition = 0;
        m_ResampleCursor = 0f;
        m_ResampleBuffer.Clear();
        m_History.Clear();
        m_NextSampleSequence = 0;

        // Silence accumulated during a previous session must not count against the new one,
        // otherwise a manual stop/start could trigger an immediate automatic restart.
        m_SilentSeconds = 0f;

        Debug.Log($"[Voice] Mic started: default device, {m_MicrophoneClip.frequency} Hz, {m_MicrophoneClip.channels} ch");
    }

    /// <summary>Ends recording on the default microphone (if it is recording) and drops the clip.</summary>
    public void StopMicrophone()
    {
        if (Microphone.IsRecording(null))
        {
            Microphone.End(null);
        }

        m_MicrophoneClip = null;
        m_LastMicrophonePosition = 0;
    }

    /// <summary>
    /// Copies the most recent samples from the history buffer; forwards directly to
    /// <c>FloatRingBuffer.CopyLatest</c>.
    /// </summary>
    /// <param name="sampleCount">Number of samples to copy.</param>
    /// <param name="destination">Array that receives the samples.</param>
    /// <param name="destinationIndex">Index in <paramref name="destination"/> at which writing starts.</param>
    public void CopyLatest(int sampleCount, float[] destination, int destinationIndex)
    {
        m_History.CopyLatest(sampleCount, destination, destinationIndex);
    }

    /// <summary>
    /// Reads everything the microphone wrote since the previous call (handling wrap-around of
    /// the looping clip), publishes it, and updates the silence watchdog.
    /// </summary>
    void ReadMicrophoneSamples()
    {
        if (!Microphone.IsRecording(null))
        {
            TryRestart("[Voice] Mic stopped unexpectedly; restarting");
            return;
        }

        // Remember which clip this read belongs to: SamplesReady subscribers run inside
        // ReadMicrophoneRange and may stop or restart capture, which swaps or nulls the field.
        var clip = m_MicrophoneClip;
        var clipSamples = clip.samples;
        var position = Microphone.GetPosition(null);
        if (clipSamples <= 0 || position < 0)
        {
            return;
        }

        if (position >= clipSamples)
        {
            position = 0;
        }

        if (m_LastMicrophonePosition >= clipSamples)
        {
            m_LastMicrophonePosition = 0;
        }

        // NOTE(review): when the write position does not move, the silence timer is not
        // advanced either, so a stalled-but-"recording" device is not detected here.
        if (position == m_LastMicrophonePosition)
        {
            return;
        }

        m_FrameMaxAbs = 0f;

        if (position > m_LastMicrophonePosition)
        {
            ReadMicrophoneRange(m_LastMicrophonePosition, position - m_LastMicrophonePosition);
        }
        else
        {
            // The write head wrapped: read the tail of the clip, then the head.
            ReadMicrophoneRange(m_LastMicrophonePosition, clipSamples - m_LastMicrophonePosition);
            if (!object.ReferenceEquals(m_MicrophoneClip, clip))
            {
                return;
            }

            ReadMicrophoneRange(0, position);
        }

        // Capture was stopped or restarted by a subscriber; the position and silence state
        // of this read no longer apply to the current session.
        if (!object.ReferenceEquals(m_MicrophoneClip, clip))
        {
            return;
        }

        m_LastMicrophonePosition = position;

        if (m_FrameMaxAbs >= m_SilenceEpsilon)
        {
            m_SilentSeconds = 0f;
        }
        else
        {
            m_SilentSeconds += Time.unscaledDeltaTime;
            if (m_SilentSeconds >= m_SilenceDetectionSeconds)
            {
                TryRestart("[Voice] Mic silenced detected; restarting capture");
            }
        }
    }

    /// <summary>
    /// Stops and restarts capture unless the previous restart happened less than
    /// <c>m_RestartCooldownSeconds</c> ago (unscaled time).
    /// </summary>
    /// <param name="logMessage">Message logged when the restart is actually performed.</param>
    void TryRestart(string logMessage)
    {
        if (Time.unscaledTime - m_LastRestartTime < m_RestartCooldownSeconds)
        {
            return;
        }

        Debug.Log(logMessage);
        m_LastRestartTime = Time.unscaledTime;
        m_SilentSeconds = 0f;
        StopMicrophone();
        StartMicrophone();
    }

    /// <summary>
    /// Reads one contiguous range of frames from the microphone clip, tracks the peak raw
    /// amplitude for silence detection, converts to mono at <see cref="TargetSampleRate"/>
    /// when necessary and publishes the result.
    /// </summary>
    /// <param name="offsetSamples">First frame to read, relative to the start of the clip.</param>
    /// <param name="sampleFrames">Number of frames to read; clamped to the end of the clip.</param>
    void ReadMicrophoneRange(int offsetSamples, int sampleFrames)
    {
        var clip = m_MicrophoneClip;
        var clipSamples = clip.samples;
        if (sampleFrames <= 0 || offsetSamples < 0 || offsetSamples >= clipSamples)
        {
            return;
        }

        sampleFrames = Mathf.Min(sampleFrames, clipSamples - offsetSamples);
        var channels = Mathf.Max(1, clip.channels);
        var sampleCount = sampleFrames * channels;
        if (sampleCount <= 0)
        {
            return;
        }

        // AudioClip.GetData(float[], int) takes the amount to read from the array length,
        // so the buffer has to match the requested size exactly.
        if (m_ReadBuffer.Length != sampleCount)
        {
            m_ReadBuffer = new float[sampleCount];
        }

        clip.GetData(m_ReadBuffer, offsetSamples);

        // Peak of the raw (pre-downmix) input, accumulated across all ranges read this frame.
        for (var i = 0; i < sampleCount; i++)
        {
            var abs = m_ReadBuffer[i] < 0f ? -m_ReadBuffer[i] : m_ReadBuffer[i];
            if (abs > m_FrameMaxAbs)
            {
                m_FrameMaxAbs = abs;
            }
        }

        var sourceRate = clip.frequency;
        var needsResample = sourceRate != TargetSampleRate;

        // Already mono at the target rate: publish the raw buffer as is.
        if (!needsResample && channels == 1)
        {
            PublishSamples(m_ReadBuffer, sampleFrames);
            return;
        }

        var outputCapacity = needsResample
            ? Mathf.CeilToInt(sampleFrames * (TargetSampleRate / (float)sourceRate)) + 4
            : sampleFrames;
        if (m_OutputBuffer.Length < outputCapacity)
        {
            m_OutputBuffer = new float[outputCapacity];
        }

        var outputCount = needsResample
            ? ResampleInto(m_ReadBuffer, sampleFrames, channels, sourceRate, m_OutputBuffer)
            : DownmixInto(m_ReadBuffer, sampleFrames, channels, m_OutputBuffer);

        if (outputCount > 0)
        {
            PublishSamples(m_OutputBuffer, outputCount);
        }
    }

    /// <summary>
    /// Assigns sequence numbers to a chunk and, unless capture is muted or TTS is speaking,
    /// appends it to the history and raises <see cref="SamplesReady"/>.
    /// </summary>
    /// <param name="samples">Buffer whose first <paramref name="length"/> entries are the chunk.</param>
    /// <param name="length">Number of valid samples in <paramref name="samples"/>.</param>
    void PublishSamples(float[] samples, int length)
    {
        // The sequence advances even for suppressed chunks, so sequence numbers keep
        // counting captured samples rather than published ones.
        var startSequence = m_NextSampleSequence;
        m_NextSampleSequence += length;

        if (Mute || SupertonicTtsModule.IsSpeaking)
        {
            return;
        }

        m_History.AddRange(samples, length);
        SamplesReady?.Invoke(samples, length, startSequence);
    }

    /// <summary>Averages each interleaved frame of <paramref name="source"/> into one mono sample.</summary>
    /// <returns>The number of samples written, equal to <paramref name="sampleFrames"/>.</returns>
    static int DownmixInto(float[] source, int sampleFrames, int channels, float[] destination)
    {
        for (var frame = 0; frame < sampleFrames; frame++)
        {
            destination[frame] = Downmix(source, frame, channels);
        }

        return sampleFrames;
    }

    /// <summary>
    /// Downmixes the source frames to mono, queues them in the resample buffer and emits
    /// linearly interpolated samples at <see cref="TargetSampleRate"/>. Source samples that
    /// cannot be consumed yet stay queued for the next call.
    /// </summary>
    /// <param name="source">Interleaved source samples.</param>
    /// <param name="sampleFrames">Number of frames in <paramref name="source"/>.</param>
    /// <param name="channels">Channel count of <paramref name="source"/>.</param>
    /// <param name="sourceRate">Sample rate of <paramref name="source"/>, in Hz.</param>
    /// <param name="destination">Receives the resampled mono output.</param>
    /// <returns>The number of samples written to <paramref name="destination"/>.</returns>
    int ResampleInto(float[] source, int sampleFrames, int channels, int sourceRate, float[] destination)
    {
        // NOTE(review): every source frame is queued before any is consumed, and the queue is
        // created with a capacity of TargetSampleRate samples. What FloatRingBuffer does when
        // a single large read exceeds that capacity is not visible here - confirm.
        for (var frame = 0; frame < sampleFrames; frame++)
        {
            m_ResampleBuffer.Add(Downmix(source, frame, channels));
        }

        var outputCount = 0;
        var step = sourceRate / (float)TargetSampleRate;

        // The loop condition guarantees both interpolation taps exist in the queue.
        while (m_ResampleBuffer.Count >= Mathf.CeilToInt(m_ResampleCursor) + 2 && outputCount < destination.Length)
        {
            var i0 = Mathf.FloorToInt(m_ResampleCursor);
            var t = m_ResampleCursor - i0;
            destination[outputCount++] = Mathf.Lerp(m_ResampleBuffer[i0], m_ResampleBuffer[i0 + 1], t);
            m_ResampleCursor += step;

            // Discard fully consumed samples but always keep two for interpolation. The
            // cursor is shifted by the number of samples actually removed; shifting it by
            // more would make it point at already-consumed audio and replay it.
            var removed = Mathf.Min(Mathf.FloorToInt(m_ResampleCursor), m_ResampleBuffer.Count - 2);
            if (removed > 0)
            {
                m_ResampleBuffer.RemoveFromStart(removed);
                m_ResampleCursor -= removed;
            }
        }

        return outputCount;
    }

    /// <summary>Returns the arithmetic mean of all channels of one interleaved frame.</summary>
    static float Downmix(float[] data, int frame, int channels)
    {
        var sum = 0f;
        var baseIndex = frame * channels;
        for (var channel = 0; channel < channels; channel++)
        {
            sum += data[baseIndex + channel];
        }

        return sum / channels;
    }
}
| } |
|
|