// com.sky.ondeviceagent/Runtime/Inference/Audio/AudioInputService.cs
// Sky-Kim - Initial commit (2e7837a), 8.75 kB
using System;
using UnityEngine;
namespace OnDeviceAgent.Inference
{
public sealed class AudioInputService : MonoBehaviour
{
/// <summary>Sample rate, in Hz, requested from the microphone and used for every published sample.</summary>
public const int TargetSampleRate = 16000;
// When true, OnEnable starts microphone capture automatically.
[SerializeField] bool m_StartOnEnable = true;
// Length, in seconds, passed to Microphone.Start for the looping recording clip.
[SerializeField] int m_MicrophoneLoopSeconds = 10;
// Multiplied by TargetSampleRate in Awake to size the history buffer.
[SerializeField] int m_HistorySeconds = 12;
// Accumulated silent time (unscaled seconds) after which a capture restart is attempted.
[SerializeField] float m_SilenceDetectionSeconds = 4f;
// A read whose peak absolute sample value is below this threshold counts as silent.
[SerializeField] float m_SilenceEpsilon = 1e-6f;
// Minimum unscaled time, in seconds, between two restart attempts.
[SerializeField] float m_RestartCooldownSeconds = 5f;
/// <summary>
/// Raised for each batch of captured samples. Arguments are the sample buffer, the number of
/// valid samples in it, and the sequence number of the first sample. The buffer passed is one
/// of this component's internal scratch arrays, which later reads overwrite.
/// </summary>
public event Action<float[], int, long> SamplesReady;
/// <summary>
/// While true, captured samples are neither stored in the history nor raised through
/// <see cref="SamplesReady"/>; the sample sequence counter still advances.
/// </summary>
public bool Mute { get; set; }
// Clip returned by Microphone.Start; null while capture is stopped.
AudioClip m_MicrophoneClip;
// Microphone write position (in sample frames) up to which the clip has already been read.
int m_LastMicrophonePosition;
// Holds downmixed source-rate samples that ResampleInto has not consumed yet.
// NOTE(review): the constructor argument is presumably the capacity in samples - confirm in FloatRingBuffer.
readonly FloatRingBuffer m_ResampleBuffer = new FloatRingBuffer(TargetSampleRate);
// Published samples; allocated in Awake, so it is null before Awake has run.
FloatRingBuffer m_History;
// Scratch array handed to AudioClip.GetData; reallocated whenever the requested sample count changes.
float[] m_ReadBuffer = Array.Empty<float>();
// Scratch array receiving downmixed / resampled output; only ever grows.
float[] m_OutputBuffer = Array.Empty<float>();
// Fractional read position into m_ResampleBuffer, carried over between ResampleInto calls.
float m_ResampleCursor;
// Sequence number that the next produced sample will receive; reset to 0 by StartMicrophone.
long m_NextSampleSequence;
// Unscaled time accumulated over consecutive reads that were classified as silent.
float m_SilentSeconds;
// Time.unscaledTime of the last restart; float.MinValue so the first restart passes the cooldown check.
float m_LastRestartTime = float.MinValue;
// Largest absolute sample value seen during the current ReadMicrophoneSamples pass.
float m_FrameMaxAbs;
/// <summary>Current <c>Count</c> of the history buffer.</summary>
public int HistoryCount => m_History.Count;
/// <summary>True while a microphone clip is held, i.e. after a successful start and before a stop.</summary>
public bool IsRunning => m_MicrophoneClip != null;
/// <summary>Sequence number that will be assigned to the next produced sample.</summary>
public long NextSampleSequence => m_NextSampleSequence;
/// <summary>
/// Unity callback: creates the history buffer, passing
/// <c>m_HistorySeconds * TargetSampleRate</c> to the <c>FloatRingBuffer</c> constructor.
/// </summary>
void Awake()
{
    var historySampleCount = m_HistorySeconds * TargetSampleRate;
    m_History = new FloatRingBuffer(historySampleCount);
}
/// <summary>Unity callback: starts capture when auto-start is enabled in the inspector.</summary>
void OnEnable()
{
    if (!m_StartOnEnable)
    {
        return;
    }

    StartMicrophone();
}
/// <summary>Unity callback: drains newly recorded samples once per frame while a clip is held.</summary>
void Update()
{
    if (m_MicrophoneClip == null)
    {
        return;
    }

    ReadMicrophoneSamples();
}
/// <summary>Unity callback: stops capture whenever the component is disabled.</summary>
void OnDisable() => StopMicrophone();
/// <summary>
/// Starts a looping recording on the default microphone at <see cref="TargetSampleRate"/> and
/// resets all per-session state (read position, resampler, history, sequence counter and
/// silence timer). Logs an error and leaves capture stopped when no device exists or the
/// recording cannot be started.
/// </summary>
public void StartMicrophone()
{
    if (Microphone.devices.Length == 0)
    {
        Debug.LogError("[Voice] No microphone device found.");
        return;
    }

    // A zero or negative inspector value cannot produce a usable looping clip, so fall back to
    // the shortest valid length instead of handing the bad value to Microphone.Start.
    var loopSeconds = Mathf.Max(1, m_MicrophoneLoopSeconds);

    // Passing null selects the default device; "true" makes the clip wrap around when full.
    m_MicrophoneClip = Microphone.Start(null, true, loopSeconds, TargetSampleRate);
    if (m_MicrophoneClip == null)
    {
        Debug.LogError("[Voice] Failed to start default microphone.");
        return;
    }

    m_LastMicrophonePosition = 0;
    m_ResampleCursor = 0f;
    m_ResampleBuffer.Clear();
    m_History.Clear();
    m_NextSampleSequence = 0;

    // A silent streak measured during a previous session must not count against the new one,
    // otherwise a manual stop/start could trigger an immediate silence restart.
    m_SilentSeconds = 0f;

    Debug.Log($"[Voice] Mic started: default device, {m_MicrophoneClip.frequency} Hz, {m_MicrophoneClip.channels} ch");
}
/// <summary>
/// Stops recording on the default microphone (if it is recording), releases the recording
/// clip and resets the read position. Safe to call when capture is already stopped.
/// </summary>
public void StopMicrophone()
{
    if (Microphone.IsRecording(null))
    {
        Microphone.End(null);
    }

    // The clip returned by Microphone.Start is a runtime-created Unity object. Dropping the
    // managed reference alone does not free it, so every restart would leave one more clip
    // alive. The clip is private to this component, so destroying it here is safe.
    if (m_MicrophoneClip != null)
    {
        Destroy(m_MicrophoneClip);
    }

    m_MicrophoneClip = null;
    m_LastMicrophonePosition = 0;
}
/// <summary>
/// Forwards to the history buffer's <c>CopyLatest</c> with the same arguments.
/// </summary>
/// <param name="sampleCount">Number of samples requested from the history.</param>
/// <param name="destination">Array that receives the samples.</param>
/// <param name="destinationIndex">Index in <paramref name="destination"/> passed through to the history buffer.</param>
public void CopyLatest(int sampleCount, float[] destination, int destinationIndex)
    => m_History.CopyLatest(sampleCount, destination, destinationIndex);
/// <summary>
/// Reads everything the microphone has written since the previous call, handling wrap-around
/// of the looping clip, then updates the silence timer and requests a restart when the input
/// has stayed silent for too long or recording has stopped on its own.
/// </summary>
void ReadMicrophoneSamples()
{
    if (!Microphone.IsRecording(null))
    {
        TryRestart("[Voice] Mic stopped unexpectedly; restarting");
        return;
    }

    int clipLength = m_MicrophoneClip.samples;
    int writeHead = Microphone.GetPosition(null);
    if (clipLength <= 0 || writeHead < 0)
    {
        return;
    }

    // Normalise out-of-range positions back to the start of the loop.
    if (writeHead >= clipLength)
    {
        writeHead = 0;
    }

    if (m_LastMicrophonePosition >= clipLength)
    {
        m_LastMicrophonePosition = 0;
    }

    // Nothing new has been recorded since the last read.
    if (writeHead == m_LastMicrophonePosition)
    {
        return;
    }

    m_FrameMaxAbs = 0f;

    bool wrappedAround = writeHead < m_LastMicrophonePosition;
    if (wrappedAround)
    {
        // Tail of the clip first, then the part written after the loop restarted.
        ReadMicrophoneRange(m_LastMicrophonePosition, clipLength - m_LastMicrophonePosition);
        ReadMicrophoneRange(0, writeHead);
    }
    else
    {
        ReadMicrophoneRange(m_LastMicrophonePosition, writeHead - m_LastMicrophonePosition);
    }

    m_LastMicrophonePosition = writeHead;

    bool heardSignal = m_FrameMaxAbs >= m_SilenceEpsilon;
    if (heardSignal)
    {
        m_SilentSeconds = 0f;
        return;
    }

    m_SilentSeconds += Time.unscaledDeltaTime;
    if (m_SilentSeconds >= m_SilenceDetectionSeconds)
    {
        TryRestart("[Voice] Mic silenced detected; restarting capture");
    }
}
/// <summary>
/// Restarts capture unless the previous restart happened less than
/// <c>m_RestartCooldownSeconds</c> (unscaled) ago.
/// </summary>
/// <param name="logMessage">Message logged when the restart actually takes place.</param>
void TryRestart(string logMessage)
{
    float now = Time.unscaledTime;
    float sinceLastRestart = now - m_LastRestartTime;
    if (sinceLastRestart < m_RestartCooldownSeconds)
    {
        return;
    }

    Debug.Log(logMessage);

    m_LastRestartTime = now;
    m_SilentSeconds = 0f;

    StopMicrophone();
    StartMicrophone();
}
/// <summary>
/// Reads a contiguous range of frames from the microphone clip, tracks the peak amplitude for
/// silence detection, converts the audio to mono at <see cref="TargetSampleRate"/> when needed
/// and publishes the result.
/// </summary>
/// <param name="offsetSamples">First frame to read; must lie inside the clip.</param>
/// <param name="sampleFrames">Number of frames to read; clamped to the end of the clip.</param>
void ReadMicrophoneRange(int offsetSamples, int sampleFrames)
{
    var clip = m_MicrophoneClip;
    var clipSamples = clip.samples;
    if (sampleFrames <= 0 || offsetSamples < 0 || offsetSamples >= clipSamples)
    {
        return;
    }

    sampleFrames = Mathf.Min(sampleFrames, clipSamples - offsetSamples);
    var channels = Mathf.Max(1, clip.channels);
    var sampleCount = sampleFrames * channels;
    if (sampleCount <= 0)
    {
        return;
    }

    // AudioClip.GetData reads as many samples as the array holds, so the scratch array must
    // match the requested count exactly rather than merely being large enough.
    if (m_ReadBuffer.Length != sampleCount)
    {
        m_ReadBuffer = new float[sampleCount];
    }

    // A failed read leaves stale (or zeroed) data in the scratch array; publishing it would
    // feed bogus audio downstream. Skipping the range leaves the peak at its current value,
    // so repeated failures are picked up by the caller's silence/restart handling.
    if (!clip.GetData(m_ReadBuffer, offsetSamples))
    {
        return;
    }

    for (var i = 0; i < sampleCount; i++)
    {
        var abs = m_ReadBuffer[i] < 0f ? -m_ReadBuffer[i] : m_ReadBuffer[i];
        if (abs > m_FrameMaxAbs)
        {
            m_FrameMaxAbs = abs;
        }
    }

    var sourceRate = clip.frequency;
    var needsResample = sourceRate != TargetSampleRate;

    // Fast path: the clip is already mono at the target rate, publish it untouched.
    if (!needsResample && channels == 1)
    {
        PublishSamples(m_ReadBuffer, sampleFrames);
        return;
    }

    // The resampler may emit samples carried over from a previous call, hence the small slack.
    var outputCapacity = needsResample
        ? Mathf.CeilToInt(sampleFrames * (TargetSampleRate / (float)sourceRate)) + 4
        : sampleFrames;
    if (m_OutputBuffer.Length < outputCapacity)
    {
        m_OutputBuffer = new float[outputCapacity];
    }

    var outputCount = needsResample
        ? ResampleInto(m_ReadBuffer, sampleFrames, channels, sourceRate, m_OutputBuffer)
        : DownmixInto(m_ReadBuffer, sampleFrames, channels, m_OutputBuffer);
    if (outputCount > 0)
    {
        PublishSamples(m_OutputBuffer, outputCount);
    }
}
/// <summary>
/// Assigns sequence numbers to a batch of samples and, unless output is suppressed, appends
/// the batch to the history and raises <see cref="SamplesReady"/>. Suppression applies while
/// <see cref="Mute"/> is set or <c>SupertonicTtsModule.IsSpeaking</c> is true; the sequence
/// counter advances either way.
/// </summary>
/// <param name="samples">Buffer holding the samples.</param>
/// <param name="length">Number of valid samples at the start of <paramref name="samples"/>.</param>
void PublishSamples(float[] samples, int length)
{
    bool suppressed = Mute || SupertonicTtsModule.IsSpeaking;

    long firstSequence = m_NextSampleSequence;
    m_NextSampleSequence = firstSequence + length;

    if (suppressed)
    {
        return;
    }

    m_History.AddRange(samples, length);
    SamplesReady?.Invoke(samples, length, firstSequence);
}
/// <summary>
/// Averages the channels of each interleaved frame in <paramref name="source"/> and writes one
/// mono sample per frame into <paramref name="destination"/>.
/// </summary>
/// <param name="source">Interleaved samples, <paramref name="channels"/> values per frame.</param>
/// <param name="sampleFrames">Number of frames to convert.</param>
/// <param name="channels">Channel count of <paramref name="source"/>.</param>
/// <param name="destination">Receives <paramref name="sampleFrames"/> mono samples.</param>
/// <returns><paramref name="sampleFrames"/>.</returns>
static int DownmixInto(float[] source, int sampleFrames, int channels, float[] destination)
{
    var frameIndex = 0;
    while (frameIndex < sampleFrames)
    {
        // Same arithmetic as the shared Downmix helper: sum the frame's channels, then divide.
        var frameStart = frameIndex * channels;
        var total = 0f;
        for (var c = 0; c < channels; c++)
        {
            total += source[frameStart + c];
        }

        destination[frameIndex] = total / channels;
        frameIndex++;
    }

    return sampleFrames;
}
/// <summary>
/// Downmixes the given frames to mono, queues them in the resample buffer and produces as many
/// linearly interpolated samples at <see cref="TargetSampleRate"/> as the queued data allows.
/// Unconsumed input and the fractional read position are kept for the next call.
/// </summary>
/// <param name="source">Interleaved input samples.</param>
/// <param name="sampleFrames">Number of frames in <paramref name="source"/>.</param>
/// <param name="channels">Channel count of <paramref name="source"/>.</param>
/// <param name="sourceRate">Sample rate of <paramref name="source"/>, in Hz.</param>
/// <param name="destination">Receives the resampled output; limits how many samples are produced.</param>
/// <returns>Number of samples written to <paramref name="destination"/>.</returns>
int ResampleInto(float[] source, int sampleFrames, int channels, int sourceRate, float[] destination)
{
    for (var frame = 0; frame < sampleFrames; frame++)
    {
        m_ResampleBuffer.Add(Downmix(source, frame, channels));
    }

    var outputCount = 0;

    // Number of source samples the cursor advances per output sample.
    var step = sourceRate / (float)TargetSampleRate;

    // Interpolation needs the samples on both sides of the cursor to be queued.
    while (outputCount < destination.Length
        && m_ResampleBuffer.Count >= Mathf.CeilToInt(m_ResampleCursor) + 2)
    {
        var i0 = Mathf.FloorToInt(m_ResampleCursor);
        var t = m_ResampleCursor - i0;
        destination[outputCount++] = Mathf.Lerp(m_ResampleBuffer[i0], m_ResampleBuffer[i0 + 1], t);

        m_ResampleCursor += step;

        // Discard the samples the cursor has moved past, always keeping two queued. The cursor
        // is shifted by the number of samples ACTUALLY removed: subtracting the full whole-sample
        // advance while removing fewer would rewind the cursor onto samples that were already
        // consumed, which makes the output drift whenever the queue runs low.
        var passed = Mathf.FloorToInt(m_ResampleCursor);
        if (passed > 0)
        {
            var removed = Mathf.Min(passed, m_ResampleBuffer.Count - 2);
            m_ResampleBuffer.RemoveFromStart(removed);
            m_ResampleCursor -= removed;
        }
    }

    return outputCount;
}
/// <summary>
/// Returns the arithmetic mean of all channel values of one interleaved frame.
/// </summary>
/// <param name="data">Interleaved samples, <paramref name="channels"/> values per frame.</param>
/// <param name="frame">Zero-based frame index.</param>
/// <param name="channels">Number of channels per frame.</param>
static float Downmix(float[] data, int frame, int channels)
{
    var first = frame * channels;
    var end = first + channels;

    var total = 0f;
    for (var index = first; index < end; index++)
    {
        total += data[index];
    }

    return total / channels;
}
}
}