using UnityEngine;
using System;
using System.Runtime.InteropServices;
using System.Threading.Tasks;
using Unity.InferenceEngine;
using UnityEngine.UI;

/// <summary>
/// Drives a push-to-listen speech pipeline: reads microphone audio (native or WebGL plugin),
/// runs 256-sample hops through a TEN-VAD voice-activity detector, buffers pre-speech audio,
/// and streams detected speech segments into an <c>IASRRunner</c> for transcription.
/// State machine: Initializing -> Ready -> Listening &lt;-&gt; Speaking -> (STTProcessing) -> Ready.
/// </summary>
public class ASRManager : MonoBehaviour
{
    private bool _platformMicrophoneSupported = true;

    /// <summary>Lifecycle states exposed to the UI via <see cref="OnStateChanged"/>.</summary>
    public enum State { Initializing, Ready, Listening, Speaking, STTProcessing, Error }

    private State _currentState = State.Initializing;

    /// <summary>Current pipeline state (read-only).</summary>
    public State currentState => _currentState;

    /// <summary>Inference backends selectable at runtime; mapped to Unity Inference Engine backends.</summary>
    public enum InferenceBackend { GPUCompute = 0, CPU = 1 }

    [SerializeField] private Slider m_VadProbabilitySlider;
    [SerializeField] private Image m_VadProbabilityFill;
    [SerializeField] private Text m_TextFps;
    [SerializeField] private MonoBehaviour m_AsrRunnerComponent;
    [SerializeField, Range(0f, 1f)] private float m_VadThreshold = 0.5f;
    [SerializeField, Min(1)] private int m_PrePostBufferFrames = 20;
    [SerializeField, Min(0.1f)] private float m_MaxAudioStreamSeconds = 10f;

    private IASRRunner _activeRunner;
    private float _currentVadProbability;
    private TenVADRunner _vad;
    private string _selectedMicrophone;
    private AudioClip _microphoneClip;
    private int _lastPosition = 0;               // last read sample position in the looping mic clip (non-WebGL)
    private int _consecutiveSilenceFrames = 0;   // VAD hops without voice while Speaking
    private float _currentRecordingTime = 0f;    // seconds accumulated in the current speech segment
    private bool _isListeningSession;            // true while the user has an active Listen() session
    private bool _webglMicrophoneInitialized;

#if UNITY_WEBGL && !UNITY_EDITOR
    private bool _webglMicPluginAvailable = true;
    private bool _webglMicPluginWarningLogged;
#endif

    private const int HOP_SIZE = 256;                       // samples per VAD hop
    private const int TARGET_SAMPLE_RATE = 16000;           // Hz expected by VAD and ASR
    private const float FPS_UPDATE_INTERVAL = 0.25f;
    private const float VAD_TEXT_UPDATE_INTERVAL = 0.05f;
    private const int MAX_CHUNKS_PER_FRAME = 24;            // caps per-frame VAD work to avoid frame spikes

#if UNITY_WEBGL && !UNITY_EDITOR
    [DllImport("__Internal")] private static extern int WebGLMic_Start(int sampleRate);
    [DllImport("__Internal")] private static extern void WebGLMic_Stop();
    [DllImport("__Internal")] private static extern int WebGLMic_GetSamples(float[] buffer, int maxSamples);
    [DllImport("__Internal")] private static extern int WebGLMic_IsRecording();
#endif

    private CircularBuffer _microphoneCircularBuffer;   // raw mic samples awaiting VAD processing
    private CircularBuffer _preSpeechCircularBuffer;    // rolling pre-roll replayed when speech starts
    private float[] _reusableReadBuffer;
    private float[] _reusableProcessChunk;
    private short[] _reusableShortChunk;
    private float _fpsElapsed;
    private int _fpsFrameCount;
    private float _nextVadTextUpdateTime;

    // Event for state changes. NOTE: generic type arguments restored — SetState invokes with a
    // State payload and OnFinalResultReceived invokes with the transcript string.
    public event Action<State> OnStateChanged;
    public event Action<string> OnSpeechTextReceived;

    /// <summary>Initializes buffers, the ASR runner, the VAD, and the microphone.</summary>
    private async void Start()
    {
        SetState(State.Initializing);
        try
        {
            InitializeBuffers();
            await InitializeASRRunner();
            _vad = new TenVADRunner((UIntPtr)HOP_SIZE, m_VadThreshold);
            UpdateVadProbabilityText();
            InitializeMicrophone();
            SetState(State.Ready);
        }
        catch (Exception e)
        {
            Debug.LogError($"[ASRManager] Initialization failed: {e.Message}\n{e.StackTrace}");
            SetState(State.Error);
        }
    }

    // Unity editor Reset message: auto-wire the runner reference when the component is added.
    private void Reset()
    {
        if (m_AsrRunnerComponent == null) m_AsrRunnerComponent = ResolveASRRunnerComponent();
    }

#if UNITY_EDITOR
    // Keep inspector values inside valid ranges and auto-wire the runner reference.
    private void OnValidate()
    {
        m_VadThreshold = Mathf.Clamp01(m_VadThreshold);
        m_PrePostBufferFrames = Mathf.Max(1, m_PrePostBufferFrames);
        m_MaxAudioStreamSeconds = Mathf.Max(0.1f, m_MaxAudioStreamSeconds);
        if (m_AsrRunnerComponent == null) m_AsrRunnerComponent = ResolveASRRunnerComponent();
    }
#endif

    private void Update()
    {
        UpdateFpsText();
        // Audio is only pumped while the mic should be live.
        if (_currentState == State.Listening || _currentState == State.Speaking)
        {
            ReadMicrophoneData();
            ProcessAudioChunks();
            CheckMicrophoneStatus();
        }
    }

    private void OnDestroy()
    {
        // Unsubscribe before disposing so a late final result cannot touch a dead object.
        if (_activeRunner != null)
        {
            _activeRunner.OnFinalResult -= OnFinalResultReceived;
        }
        if (!string.IsNullOrEmpty(_selectedMicrophone) && IsMicrophoneRecording(_selectedMicrophone))
        {
            EndMicrophone(_selectedMicrophone);
        }
        _vad?.Dispose();
        _activeRunner?.Dispose();
    }

    /// <summary>
    /// Requests a switch of the inference backend. Rejected while the runner is missing,
    /// already reinitializing, or mid-transcription. Any active listening session is cancelled.
    /// </summary>
    /// <returns>True when the reinitialization was started.</returns>
    public bool TrySetInferenceBackend(InferenceBackend backend)
    {
        if (_activeRunner == null || _isReinitializing) return false;
        if (_currentState == State.STTProcessing) return false;
        if (_currentState == State.Listening || _currentState == State.Speaking)
            StopListening(processCurrentSegment: false);
        _activeRunner.SetPreferredBackend(ToRunnerBackend(backend));
        _ = ReinitializeRunnerAsync();  // fire-and-forget; guarded by _isReinitializing
        return true;
    }

    private bool _isReinitializing;

    // Re-creates the runner on the newly selected backend, surfacing failures as State.Error.
    private async Task ReinitializeRunnerAsync()
    {
        if (_isReinitializing) return;
        _isReinitializing = true;
        SetState(State.Initializing);
        try
        {
            await _activeRunner.ReinitializeAsync();
            SetState(State.Ready);
        }
        catch (Exception e)
        {
            Debug.LogError($"[ASRManager] Reinitialization failed: {e.Message}\n{e.StackTrace}");
            SetState(State.Error);
        }
        finally
        {
            _isReinitializing = false;
        }
    }

    private static BackendType ToRunnerBackend(InferenceBackend backend)
    {
        return backend == InferenceBackend.CPU ? BackendType.CPU : BackendType.GPUCompute;
    }

    /// <summary>
    /// Starts a listening session. Idempotent while already listening/speaking.
    /// </summary>
    /// <returns>True when the microphone is (or was already) running and the state is Listening.</returns>
    public bool Listen()
    {
        switch (_currentState)
        {
            case State.Listening:
            case State.Speaking:
                return true;    // already in a session
            case State.STTProcessing:
            case State.Error:
            case State.Initializing:
                return false;   // not ready to accept audio
        }
        if (!_platformMicrophoneSupported)
        {
            return false;
        }
        _isListeningSession = true;
        StartMicrophone();
        if (_microphoneClip == null && !IsMicrophoneRecording(_selectedMicrophone)) return false;
        SetState(State.Listening);
        return true;
    }

    /// <summary>
    /// Ends the listening session. If currently Speaking, the in-flight segment is either
    /// finalized (<paramref name="processCurrentSegment"/> = true) or cancelled.
    /// </summary>
    /// <returns>True when a session was actually stopped.</returns>
    public bool StopListening(bool processCurrentSegment = false)
    {
        switch (_currentState)
        {
            case State.Ready:
            case State.STTProcessing:
            case State.Error:
            case State.Initializing:
                return false;   // nothing to stop
        }
        _isListeningSession = false;
        if (_currentState == State.Listening)
        {
            StopMicrophone();
            _consecutiveSilenceFrames = 0;
            _currentRecordingTime = 0f;
            _preSpeechCircularBuffer.Clear();
            _activeRunner.CancelSpeechSegment();
            SetState(State.Ready);
            return true;
        }
        if (_currentState == State.Speaking)
        {
            if (processCurrentSegment)
            {
                _activeRunner.EndSpeechSegment();
            }
            else
            {
                _activeRunner.CancelSpeechSegment();
            }
            _preSpeechCircularBuffer.Clear();
            StopMicrophone();
            SetState(State.Ready);
            _consecutiveSilenceFrames = 0;
            _currentRecordingTime = 0f;
            return true;
        }
        return false;
    }

    // Allocates all reusable audio buffers once so the per-frame path is allocation-free.
    private void InitializeBuffers()
    {
        var bufferFrames = Mathf.Max(1, m_PrePostBufferFrames);
        _microphoneCircularBuffer = new CircularBuffer(TARGET_SAMPLE_RATE * 2);   // 2 s of backlog
        _preSpeechCircularBuffer = new CircularBuffer(HOP_SIZE * bufferFrames);
        _reusableReadBuffer = new float[HOP_SIZE * 4];
        _reusableProcessChunk = new float[HOP_SIZE];
        _reusableShortChunk = new short[HOP_SIZE];
    }

    // Resolves, validates, and initializes the IASRRunner component.
    private async Task InitializeASRRunner()
    {
        if (m_AsrRunnerComponent == null) m_AsrRunnerComponent = ResolveASRRunnerComponent();
        if (m_AsrRunnerComponent == null)
        {
            // FIX: previously the message string was passed as the paramName argument (CA2208).
            throw new ArgumentNullException(nameof(m_AsrRunnerComponent), "ASR Runner Component is not assigned in the Inspector.");
        }
        _activeRunner = m_AsrRunnerComponent as IASRRunner;
        if (_activeRunner == null)
        {
            throw new InvalidCastException($"The component '{m_AsrRunnerComponent.GetType().Name}' must implement IASRRunner.");
        }
        _activeRunner.OnFinalResult += OnFinalResultReceived;
        await _activeRunner.Initialize();
    }

    // Finds any MonoBehaviour implementing IASRRunner, local GameObject first, then scene-wide.
    private MonoBehaviour ResolveASRRunnerComponent()
    {
        // Prefer a runner on the same GameObject to avoid cross-scene mismatches.
        var localBehaviours = GetComponents<MonoBehaviour>();
        foreach (var behaviour in localBehaviours)
        {
            if (behaviour is IASRRunner) return behaviour;
        }
#if UNITY_2023_1_OR_NEWER
        var allBehaviours = FindObjectsByType<MonoBehaviour>(FindObjectsInactive.Include, FindObjectsSortMode.None);
#else
        var allBehaviours = FindObjectsOfType<MonoBehaviour>(true);
#endif
        foreach (var behaviour in allBehaviours)
        {
            if (behaviour is IASRRunner) return behaviour;
        }
        return null;
    }

    // Selects the microphone device (or the WebGL plugin pseudo-device). Throws when unsupported/absent.
    private void InitializeMicrophone()
    {
        if (!_platformMicrophoneSupported)
            throw new NotSupportedException("Microphone API is not available on this platform.");
#if UNITY_WEBGL && !UNITY_EDITOR
        if (_webglMicrophoneInitialized) return;
        _selectedMicrophone = "WebGL Microphone";
        _webglMicrophoneInitialized = true;
        return;
#else
        string[] devices = Microphone.devices;
        if (devices.Length == 0) throw new InvalidOperationException("No microphone found.");
        _selectedMicrophone = devices[0];
#endif
    }

    // Starts capture: WebGL plugin on WebGL builds, a looping Microphone clip elsewhere.
    private void StartMicrophone()
    {
        if (!_platformMicrophoneSupported) return;
        if (IsMicrophoneRecording(_selectedMicrophone)) return;
        if (string.IsNullOrEmpty(_selectedMicrophone)) InitializeMicrophone();
        if (string.IsNullOrEmpty(_selectedMicrophone))
        {
            Debug.LogError("[ASRManager] No microphone is selected.");
            return;
        }
#if UNITY_WEBGL && !UNITY_EDITOR
        int startResult;
        try { startResult = WebGLMic_Start(TARGET_SAMPLE_RATE); }
        catch { startResult = 0; }  // plugin missing -> treat as failure
        if (startResult != 1)
        {
            Debug.LogError($"[ASRManager] Failed to start WebGL microphone. Result={startResult}");
            return;
        }
        // Placeholder clip only; WebGL samples arrive via WebGLMic_GetSamples, not the clip.
        if (_microphoneClip == null)
            _microphoneClip = AudioClip.Create("webgl-microphone", TARGET_SAMPLE_RATE, 1, TARGET_SAMPLE_RATE, false);
#else
        // Clip is one second longer than the max segment so the ring never laps an unread segment.
        _microphoneClip = Microphone.Start(_selectedMicrophone, true, Mathf.Max(1, Mathf.CeilToInt(m_MaxAudioStreamSeconds) + 1), TARGET_SAMPLE_RATE);
        _lastPosition = 0;
#endif
    }

    private void StopMicrophone()
    {
        if (_microphoneClip == null && !IsMicrophoneRecording(_selectedMicrophone)) return;
        if (IsMicrophoneRecording(_selectedMicrophone)) EndMicrophone(_selectedMicrophone);
        _microphoneClip = null;
    }

    // Drains new samples from the platform source into _microphoneCircularBuffer.
    private void ReadMicrophoneData()
    {
        if (_microphoneClip == null || !IsMicrophoneRecording(_selectedMicrophone)) return;
#if UNITY_WEBGL && !UNITY_EDITOR
        int sampleCount = 0;
        try
        {
            sampleCount = WebGLMic_GetSamples(_reusableReadBuffer, _reusableReadBuffer.Length);
        }
        catch (Exception ex)
        {
            // Plugin call can throw when the .jslib is absent; warn once, then stay silent.
            if (_webglMicPluginAvailable)
            {
                _webglMicPluginAvailable = false;
                if (!_webglMicPluginWarningLogged)
                {
                    Debug.LogWarning($"[ASRManager] WebGL microphone plugin is not available: {ex.Message}");
                    _webglMicPluginWarningLogged = true;
                }
            }
            return;
        }
        if (sampleCount <= 0) return;
        _microphoneCircularBuffer.Write(_reusableReadBuffer, sampleCount);
#else
        int currentPosition = Microphone.GetPosition(_selectedMicrophone);
        if (currentPosition == _lastPosition) return;
        // The clip is a ring; a smaller position means the writer wrapped around.
        int sampleCount = (currentPosition > _lastPosition)
            ? (currentPosition - _lastPosition)
            : (_microphoneClip.samples - _lastPosition + currentPosition);
        if (sampleCount > 0)
        {
            int remaining = sampleCount;
            int readPosition = _lastPosition;
            while (remaining > 0)
            {
                int readLength = Mathf.Min(remaining, _reusableReadBuffer.Length);
                // NOTE(review): GetData fills the whole buffer from readPosition; only the first
                // readLength samples are consumed. Assumes GetData wraps reads on a looping mic
                // clip near the clip end — confirm against the targeted Unity version.
                _microphoneClip.GetData(_reusableReadBuffer, readPosition);
                _microphoneCircularBuffer.Write(_reusableReadBuffer, readLength);
                remaining -= readLength;
                readPosition = (readPosition + readLength) % _microphoneClip.samples;
            }
        }
        _lastPosition = currentPosition;
#endif
    }

    // Runs buffered audio through the VAD in HOP_SIZE chunks, driving the Listening/Speaking transitions.
    private void ProcessAudioChunks()
    {
        var availableChunks = _microphoneCircularBuffer.Count / HOP_SIZE;
        if (availableChunks <= 0) return;
        // Budget caps VAD work per frame so a backlog cannot stall rendering.
        var frameChunkBudget = Mathf.Clamp(availableChunks, 1, Mathf.Max(1, MAX_CHUNKS_PER_FRAME));
        int chunksProcessed = 0;
        while (_microphoneCircularBuffer.Count >= HOP_SIZE && chunksProcessed < frameChunkBudget)
        {
            _microphoneCircularBuffer.Read(_reusableProcessChunk, HOP_SIZE);
            // VAD consumes 16-bit PCM; convert from [-1, 1] floats.
            for (int i = 0; i < HOP_SIZE; i++)
            {
                _reusableShortChunk[i] = (short)(_reusableProcessChunk[i] * 32767.0f);
            }
            _vad.Process(_reusableShortChunk, out float probability, out int flag);
            UpdateVadDebug(probability);
            bool voiceDetected = flag == 1;
            switch (_currentState)
            {
                case State.Listening:
                    // Keep a pre-roll so the start of the utterance is not clipped.
                    _preSpeechCircularBuffer.Write(_reusableProcessChunk, HOP_SIZE);
                    if (voiceDetected) { StartSpeech(); }
                    break;
                case State.Speaking:
                    ProcessSpeechChunk(_reusableProcessChunk);
                    _currentRecordingTime += (float)HOP_SIZE / TARGET_SAMPLE_RATE;
                    if (voiceDetected)
                    {
                        _consecutiveSilenceFrames = 0;
                    }
                    else
                    {
                        _consecutiveSilenceFrames++;
                        // End the segment after a sustained run of silent hops.
                        if (_consecutiveSilenceFrames >= Mathf.Max(1, m_PrePostBufferFrames))
                        {
                            EndSpeech();
                        }
                    }
                    // Hard cap on segment length regardless of continuing voice activity.
                    if (_currentRecordingTime >= Mathf.Max(0.1f, m_MaxAudioStreamSeconds))
                    {
                        EndSpeech();
                    }
                    break;
            }
            chunksProcessed++;
        }
    }

    // Transitions to Speaking and replays the buffered pre-roll into the runner.
    private void StartSpeech()
    {
        SetState(State.Speaking);
        _currentRecordingTime = 0f;
        _consecutiveSilenceFrames = 0;
        _activeRunner.StartSpeechSegment();
        // Process pre-speech buffer data
        int preSpeechDataLength = _preSpeechCircularBuffer.Count;
        while (preSpeechDataLength > 0)
        {
            int chunkLength = Mathf.Min(HOP_SIZE, preSpeechDataLength);
            _preSpeechCircularBuffer.Read(_reusableProcessChunk, chunkLength);
            ProcessSpeechChunk(_reusableProcessChunk, chunkLength);
            preSpeechDataLength -= chunkLength;
        }
    }

    // Forwards audio to the runner; length <= 0 (or out of range) means "the whole array".
    private void ProcessSpeechChunk(float[] audioChunk, int length = -1)
    {
        int sampleCount = (length <= 0 || length > audioChunk.Length) ? audioChunk.Length : length;
        _activeRunner.ProcessAudioChunk(audioChunk, sampleCount);
    }

    // Finalizes the current segment; stays Listening during a session, otherwise waits for STT.
    private void EndSpeech()
    {
        if (_currentState != State.Speaking) return;
        _preSpeechCircularBuffer.Clear();
        _activeRunner.EndSpeechSegment();
        _consecutiveSilenceFrames = 0;
        _currentRecordingTime = 0f;
        if (_isListeningSession)
        {
            // Keep microphone + VAD running continuously while inference consumes queued segments.
            SetState(State.Listening);
            return;
        }
        StopMicrophone();
        SetState(State.STTProcessing);
    }

    // Central state transition; fires OnStateChanged only on an actual change.
    private void SetState(State newState)
    {
        if (_currentState == newState) return;
        _currentState = newState;
        OnStateChanged?.Invoke(newState);
    }

    // Restarts capture if the device dropped out mid-session (e.g. browser permission revoked and regranted).
    private void CheckMicrophoneStatus()
    {
        if (!string.IsNullOrEmpty(_selectedMicrophone) && !IsMicrophoneRecording(_selectedMicrophone))
        {
            StartMicrophone();
        }
    }

    private static bool IsMicrophoneRecording(string deviceName)
    {
#if UNITY_WEBGL && !UNITY_EDITOR
        try { return WebGLMic_IsRecording() == 1; }
        catch { return false; }  // plugin missing -> not recording
#else
        return Microphone.IsRecording(deviceName);
#endif
    }

    private static void EndMicrophone(string deviceName)
    {
#if UNITY_WEBGL && !UNITY_EDITOR
        try { WebGLMic_Stop(); }
        catch
        {
            // Ignore if plugin call fails.
        }
#else
        Microphone.End(deviceName);
#endif
    }

    // Records the latest VAD probability and refreshes the debug UI.
    private void UpdateVadDebug(float probability)
    {
        _currentVadProbability = Mathf.Clamp01(probability);
        UpdateVadProbabilityText();
    }

    // Throttled slider update; fill turns green when the probability crosses the VAD threshold.
    private void UpdateVadProbabilityText()
    {
        if (m_VadProbabilitySlider == null) return;
        var interval = Mathf.Max(0.01f, VAD_TEXT_UPDATE_INTERVAL);
        var now = Time.unscaledTime;
        if (now < _nextVadTextUpdateTime) return;
        if (now < _nextVadTextUpdateTime) return;
        _nextVadTextUpdateTime = now + interval;
        m_VadProbabilitySlider.minValue = 0f;
        m_VadProbabilitySlider.maxValue = 1f;
        m_VadProbabilitySlider.wholeNumbers = false;
        m_VadProbabilitySlider.value = _currentVadProbability;
        var fill = ResolveVadProbabilityFillImage();
        if (fill == null) return;
        fill.color = _currentVadProbability >= m_VadThreshold ? new Color32(0, 255, 0, 255) : new Color32(255, 255, 255, 255);
    }

    // Lazily caches the slider's fill Image so the color tint works without manual wiring.
    private Image ResolveVadProbabilityFillImage()
    {
        if (m_VadProbabilityFill != null) return m_VadProbabilityFill;
        if (m_VadProbabilitySlider == null || m_VadProbabilitySlider.fillRect == null) return null;
        m_VadProbabilityFill = m_VadProbabilitySlider.fillRect.GetComponent<Image>();
        return m_VadProbabilityFill;
    }

    // Smoothed FPS readout, refreshed every FPS_UPDATE_INTERVAL seconds of unscaled time.
    private void UpdateFpsText()
    {
        if (m_TextFps == null) return;
        _fpsElapsed += Time.unscaledDeltaTime;
        _fpsFrameCount++;
        var interval = Mathf.Max(0.05f, FPS_UPDATE_INTERVAL);
        if (_fpsElapsed < interval) return;
        var fps = Mathf.RoundToInt(_fpsFrameCount / _fpsElapsed);
        m_TextFps.text = $"FPS: {fps}";
        _fpsElapsed = 0f;
        _fpsFrameCount = 0;
    }

    // Runner callback: restore session state, then publish the sanitized transcript (if non-blank).
    private void OnFinalResultReceived(string final)
    {
        if (_isListeningSession)
        {
            if (!IsMicrophoneRecording(_selectedMicrophone)) StartMicrophone();
            if (_currentState != State.Speaking) SetState(State.Listening);
        }
        else
        {
            SetState(State.Ready);
        }
        var cleaned = SanitizeTranscriptText(final);
        if (string.IsNullOrWhiteSpace(cleaned)) return;
        OnSpeechTextReceived?.Invoke(cleaned);
    }

    // Strips U+FFFD replacement characters that decoders emit for malformed byte sequences.
    private static string SanitizeTranscriptText(string text)
    {
        if (string.IsNullOrEmpty(text)) return string.Empty;
        if (text.IndexOf('\uFFFD') < 0) return text;
        return text.Replace("\uFFFD", string.Empty);
    }

    /// <summary>
    /// Fixed-capacity float ring buffer. Writes past capacity overwrite the oldest samples;
    /// reads throw when fewer than the requested samples are available.
    /// </summary>
    private class CircularBuffer
    {
        private readonly float[] _buffer;
        private int _head;   // next sample to read
        private int _tail;   // next slot to write
        private readonly int _capacity;

        /// <summary>Number of samples currently stored.</summary>
        public int Count { get; private set; }

        public CircularBuffer(int capacity)
        {
            _capacity = capacity;
            _buffer = new float[capacity];
            Clear();
        }

        /// <summary>Appends the first <paramref name="length"/> samples of <paramref name="data"/>.</summary>
        public void Write(float[] data, int length)
        {
            for (int i = 0; i < length; i++)
            {
                _buffer[_tail] = data[i];
                _tail = (_tail + 1) % _capacity;
                if (Count == _capacity)
                {
                    // Buffer is full: drop the oldest sample so the latest stream stays contiguous.
                    _head = (_head + 1) % _capacity;
                }
                else
                {
                    Count++;
                }
            }
        }

        /// <summary>Copies <paramref name="length"/> samples into <paramref name="destination"/>.</summary>
        /// <exception cref="InvalidOperationException">Fewer than <paramref name="length"/> samples stored.</exception>
        public void Read(float[] destination, int length)
        {
            if (length > Count) throw new InvalidOperationException("Not enough data to read.");
            for (int i = 0; i < length; i++)
            {
                destination[i] = _buffer[_head];
                _head = (_head + 1) % _capacity;
            }
            Count -= length;
        }

        /// <summary>Discards all stored samples.</summary>
        public void Clear()
        {
            _head = 0;
            _tail = 0;
            Count = 0;
        }
    }
}