Spaces:
Running
Running
# 🎙️ NuralVoiceSTT API Documentation

**Developed by Blink Digital**

Complete API documentation for integrating NuralVoiceSTT into your applications.

## 📡 API Endpoints

### WebSocket API (Real-time Streaming)

**Endpoint:** `wss://ashishkblink-NuralVoice.hf.space/ws/transcribe`

**Protocol:** WebSocket (WSS for secure connection)

**Best for:** Real-time audio streaming, live transcription, low-latency applications

---
## 🎯 Quick Start

### Prerequisites

- Node.js 14+ installed
- WebSocket library (`ws` package)
- Audio capture capability (microphone or audio file)

### Installation

```bash
npm install ws
```

---
## 💻 Node.js Examples

### Example 1: Real-time Microphone Streaming

```javascript
const WebSocket = require('ws');
const { spawn } = require('child_process');

// WebSocket URL
const WS_URL = 'wss://ashishkblink-NuralVoice.hf.space/ws/transcribe';

// Connect to WebSocket
const ws = new WebSocket(WS_URL);

ws.on('open', () => {
  console.log('✅ Connected to NuralVoiceSTT API');

  // Start recording from microphone using arecord (Linux) or sox (macOS/Linux)
  // For macOS, you might need: brew install sox
  const recorder = spawn('sox', [
    '-d',                   // Default audio device (microphone)
    '-t', 'raw',            // Raw audio format
    '-r', '16000',          // Sample rate: 16kHz
    '-c', '1',              // Channels: mono
    '-b', '16',             // Bit depth: 16-bit
    '-e', 'signed-integer', // Encoding
    '-'                     // Output to stdout
  ]);

  // Send audio chunks to WebSocket
  recorder.stdout.on('data', (chunk) => {
    if (ws.readyState === WebSocket.OPEN) {
      ws.send(chunk);
    }
  });

  recorder.on('error', (error) => {
    console.error('Recording error:', error);
  });

  // Stop recording after 10 seconds (example)
  setTimeout(() => {
    recorder.kill();
    ws.send(JSON.stringify({ action: 'stop' }));
  }, 10000);
});

ws.on('message', (data) => {
  try {
    const message = JSON.parse(data.toString());
    if (message.status === 'connected') {
      console.log('📡 Ready:', message.message);
    } else if (message.text) {
      if (message.is_final) {
        console.log('✅ Final:', message.text);
      } else if (message.is_partial) {
        console.log('⏳ Partial:', message.text);
      } else {
        console.log('📝 Text:', message.text);
      }
    } else if (message.error) {
      console.error('❌ Error:', message.error);
    }
  } catch (e) {
    console.error('Parse error:', e);
  }
});

ws.on('error', (error) => {
  console.error('WebSocket error:', error);
});

ws.on('close', () => {
  console.log('🔌 Disconnected from API');
});
```
### Example 2: Audio File Transcription

```javascript
const WebSocket = require('ws');
const fs = require('fs');

const WS_URL = 'wss://ashishkblink-NuralVoice.hf.space/ws/transcribe';
const AUDIO_FILE = 'audio.wav'; // Your audio file path

// Connect to WebSocket
const ws = new WebSocket(WS_URL);

let transcription = '';

ws.on('open', () => {
  console.log('✅ Connected to NuralVoiceSTT API');

  // Read audio file
  const audioBuffer = fs.readFileSync(AUDIO_FILE);

  // Convert to 16-bit PCM if needed
  // Note: This assumes the file is already in 16kHz, 16-bit, mono PCM format
  // You may need to convert your audio file first using ffmpeg:
  // ffmpeg -i input.mp3 -ar 16000 -ac 1 -f s16le output.raw

  // Send audio in chunks (4000 bytes = ~0.25 seconds at 16kHz)
  const chunkSize = 4000;
  let offset = 0;

  const sendChunk = () => {
    if (offset < audioBuffer.length && ws.readyState === WebSocket.OPEN) {
      const chunk = audioBuffer.slice(offset, offset + chunkSize);
      ws.send(chunk);
      offset += chunkSize;
      // Send next chunk after a small delay
      setTimeout(sendChunk, 100);
    } else {
      // All chunks sent, request final result
      ws.send(JSON.stringify({ action: 'stop' }));
    }
  };

  sendChunk();
});

ws.on('message', (data) => {
  try {
    const message = JSON.parse(data.toString());
    if (message.text) {
      if (message.is_final) {
        transcription += ' ' + message.text;
        console.log('✅ Final transcription:', transcription.trim());
      } else if (message.is_partial) {
        console.log('⏳ Partial:', message.text);
      }
    } else if (message.error) {
      console.error('❌ Error:', message.error);
    }
  } catch (e) {
    // Handle binary data or other formats
  }
});

ws.on('close', () => {
  console.log('\n📝 Complete Transcription:');
  console.log(transcription.trim());
});
```
### Example 3: Browser Audio Streaming (Node.js Server Proxy)

```javascript
// server.js - Node.js server that proxies browser audio to HF Space
const express = require('express');
const WebSocket = require('ws');
const http = require('http');
const cors = require('cors');

const app = express();
app.use(cors());
app.use(express.json());

const server = http.createServer(app);
const wss = new WebSocket.Server({ server, path: '/ws' });

const HF_WS_URL = 'wss://ashishkblink-NuralVoice.hf.space/ws/transcribe';

wss.on('connection', (clientWs) => {
  console.log('✅ Client connected');

  // Connect to HF Space WebSocket
  const hfWs = new WebSocket(HF_WS_URL);

  hfWs.on('open', () => {
    console.log('✅ Connected to HF Space');
    clientWs.send(JSON.stringify({
      type: 'status',
      message: 'Connected to STT service'
    }));
  });

  // Forward audio from client to HF Space
  clientWs.on('message', (data) => {
    if (hfWs.readyState === WebSocket.OPEN) {
      // If data is JSON, parse it
      try {
        const message = JSON.parse(data.toString());
        if (message.type === 'audio') {
          // Convert array to buffer
          const buffer = Buffer.from(message.data);
          hfWs.send(buffer);
        } else if (message.action === 'stop') {
          hfWs.send(JSON.stringify({ action: 'stop' }));
        }
      } catch (e) {
        // Binary data - send directly
        hfWs.send(data);
      }
    }
  });

  // Forward transcription from HF Space to client
  hfWs.on('message', (data) => {
    try {
      const message = JSON.parse(data.toString());
      clientWs.send(JSON.stringify({
        type: 'transcription',
        text: message.text || '',
        isFinal: message.is_final || false,
        isPartial: message.is_partial || false
      }));
    } catch (e) {
      // Handle non-JSON messages
    }
  });

  hfWs.on('error', (error) => {
    console.error('HF WebSocket error:', error);
    clientWs.send(JSON.stringify({
      type: 'error',
      message: error.message
    }));
  });

  clientWs.on('close', () => {
    hfWs.close();
    console.log('❌ Client disconnected');
  });
});

const PORT = process.env.PORT || 3001;
server.listen(PORT, () => {
  console.log(`🚀 Server running on port ${PORT}`);
  console.log(`📡 WebSocket: ws://localhost:${PORT}/ws`);
});
```
### Example 4: Complete Client-Server Application

```javascript
// client-example.js - Complete example with error handling
const WebSocket = require('ws');

class NuralVoiceClient {
  constructor(wsUrl = 'wss://ashishkblink-NuralVoice.hf.space/ws/transcribe') {
    this.wsUrl = wsUrl;
    this.ws = null;
    this.isConnected = false;
    this.transcription = '';
    this.onTranscription = null;
    this.onError = null;
  }

  connect() {
    return new Promise((resolve, reject) => {
      this.ws = new WebSocket(this.wsUrl);

      this.ws.on('open', () => {
        this.isConnected = true;
        console.log('✅ Connected to NuralVoiceSTT');
        resolve();
      });

      this.ws.on('message', (data) => {
        try {
          const message = JSON.parse(data.toString());
          if (message.status === 'connected') {
            console.log('📡 Ready:', message.message);
          } else if (message.text) {
            if (message.is_final) {
              this.transcription += ' ' + message.text;
              if (this.onTranscription) {
                this.onTranscription(message.text, true);
              }
            } else if (message.is_partial) {
              if (this.onTranscription) {
                this.onTranscription(message.text, false);
              }
            }
          } else if (message.error) {
            console.error('❌ Error:', message.error);
            if (this.onError) {
              this.onError(message.error);
            }
          }
        } catch (e) {
          console.error('Parse error:', e);
        }
      });

      this.ws.on('error', (error) => {
        this.isConnected = false;
        if (this.onError) {
          this.onError(error.message);
        }
        reject(error);
      });

      this.ws.on('close', () => {
        this.isConnected = false;
        console.log('🔌 Disconnected');
      });
    });
  }

  sendAudio(audioBuffer) {
    if (this.ws && this.isConnected && this.ws.readyState === WebSocket.OPEN) {
      this.ws.send(audioBuffer);
      return true;
    }
    return false;
  }

  stop() {
    if (this.ws && this.isConnected) {
      this.ws.send(JSON.stringify({ action: 'stop' }));
    }
  }

  close() {
    if (this.ws) {
      this.ws.close();
    }
  }

  getTranscription() {
    return this.transcription.trim();
  }
}

// Usage example
async function main() {
  const client = new NuralVoiceClient();

  client.onTranscription = (text, isFinal) => {
    if (isFinal) {
      console.log('✅ Final:', text);
    } else {
      console.log('⏳ Partial:', text);
    }
  };

  client.onError = (error) => {
    console.error('Error:', error);
  };

  try {
    await client.connect();

    // Send audio chunks (example)
    // In real usage, you'd get audio from microphone or file
    const audioChunk = Buffer.alloc(4000); // Example chunk
    client.sendAudio(audioChunk);

    // Stop after some time
    setTimeout(() => {
      client.stop();
      console.log('📝 Complete:', client.getTranscription());
      client.close();
    }, 5000);
  } catch (error) {
    console.error('Connection failed:', error);
  }
}

// Uncomment to run
// main();
```

---
| --- | |
| ## π API Protocol | |
| ### Connection | |
| 1. **Connect** to `wss://ashishkblink-NuralVoice.hf.space/ws/transcribe` | |
| 2. **Wait** for connection confirmation message | |
| 3. **Send** audio data as binary (16-bit PCM, 16kHz, mono) | |
| 4. **Receive** transcription results as JSON | |
| ### Audio Format Requirements | |
| - **Sample Rate:** 16,000 Hz (16kHz) | |
| - **Channels:** Mono (1 channel) | |
| - **Bit Depth:** 16-bit | |
| - **Encoding:** Signed integer PCM | |
| - **Format:** Raw binary data (no headers) | |
| ### Converting Audio Files | |
| Use `ffmpeg` to convert audio files to the required format: | |
| ```bash | |
| # Convert MP3 to required format | |
| ffmpeg -i input.mp3 -ar 16000 -ac 1 -f s16le output.raw | |
| # Convert WAV to required format | |
| ffmpeg -i input.wav -ar 16000 -ac 1 -f s16le output.raw | |
| # Record from microphone directly | |
| ffmpeg -f avfoundation -i ":0" -ar 16000 -ac 1 -f s16le output.raw | |
| ``` | |
| ### Message Format | |
| #### Client β Server (Send Audio) | |
| Send raw binary audio data (16-bit PCM): | |
| ```javascript | |
| ws.send(audioBuffer); // Buffer containing 16-bit PCM audio | |
| ``` | |
| Send stop command: | |
| ```javascript | |
| ws.send(JSON.stringify({ action: 'stop' })); | |
| ``` | |
| #### Server β Client (Receive Transcription) | |
| **Status Message:** | |
| ```json | |
| { | |
| "status": "connected", | |
| "message": "Ready to receive audio. Send 16-bit PCM mono audio at 16kHz sample rate.", | |
| "sample_rate": 16000 | |
| } | |
| ``` | |
| **Partial Transcription:** | |
| ```json | |
| { | |
| "text": "hello world", | |
| "is_final": false, | |
| "is_partial": true | |
| } | |
| ``` | |
| **Final Transcription:** | |
| ```json | |
| { | |
| "text": "hello world", | |
| "is_final": true, | |
| "words": [ | |
| { | |
| "word": "hello", | |
| "start": 0.5, | |
| "end": 1.2, | |
| "conf": 0.95 | |
| }, | |
| { | |
| "word": "world", | |
| "start": 1.3, | |
| "end": 2.0, | |
| "conf": 0.92 | |
| } | |
| ] | |
| } | |
| ``` | |
| **Error Message:** | |
| ```json | |
| { | |
| "error": "Error description", | |
| "status": "error" | |
| } | |
| ``` | |
| --- | |
## 🔧 Integration Examples

### Express.js Server

```javascript
const express = require('express');
const WebSocket = require('ws');
const http = require('http');

const app = express();
const server = http.createServer(app);

// WebSocket endpoint
const wss = new WebSocket.Server({ server, path: '/api/transcribe' });

wss.on('connection', (ws) => {
  const hfWs = new WebSocket('wss://ashishkblink-NuralVoice.hf.space/ws/transcribe');

  ws.on('message', (data) => {
    if (hfWs.readyState === WebSocket.OPEN) {
      hfWs.send(data);
    }
  });

  hfWs.on('message', (data) => {
    ws.send(data);
  });
});

server.listen(3000, () => {
  console.log('Server running on port 3000');
});
```

### React Integration

```javascript
// In your React component
import { useEffect, useRef, useState } from 'react';

function SpeechToText() {
  const [transcription, setTranscription] = useState('');
  const wsRef = useRef(null);

  useEffect(() => {
    const ws = new WebSocket('wss://ashishkblink-NuralVoice.hf.space/ws/transcribe');
    wsRef.current = ws;

    ws.onmessage = (event) => {
      const data = JSON.parse(event.data);
      if (data.text) {
        setTranscription(prev => prev + ' ' + data.text);
      }
    };

    return () => ws.close();
  }, []);

  const sendAudio = (audioBuffer) => {
    if (wsRef.current?.readyState === WebSocket.OPEN) {
      wsRef.current.send(audioBuffer);
    }
  };

  return <div>{transcription}</div>;
}
```

---
| --- | |
| ## β οΈ Important Notes | |
| 1. **Rate Limiting:** Be mindful of API usage. Don't send too many requests simultaneously. | |
| 2. **Connection Management:** Always close WebSocket connections when done to free resources. | |
| 3. **Error Handling:** Implement proper error handling for network issues and API errors. | |
| 4. **Audio Quality:** Better audio quality = better transcription accuracy. Use noise reduction when possible. | |
| 5. **Latency:** WebSocket provides low-latency streaming. For best results, send audio in small chunks (2000-4000 bytes). | |
| --- | |
| ## π Troubleshooting | |
| ### Connection Refused | |
| - Check if the Space is running | |
| - Verify the WebSocket URL is correct | |
| - Ensure you're using `wss://` (secure WebSocket) | |
| ### No Transcription | |
| - Verify audio format (16kHz, 16-bit, mono PCM) | |
| - Check if audio is being sent correctly | |
| - Ensure WebSocket connection is open | |
| ### Poor Accuracy | |
| - Use better quality audio | |
| - Reduce background noise | |
| - Speak clearly and at moderate pace | |
| --- | |
| ## π Support | |
| For issues or questions: | |
| - Check the [Space page](https://huggingface.co/spaces/ashishkblink/NuralVoice) | |
| - Review error messages in WebSocket responses | |
| - Ensure your audio format matches requirements | |
| --- | |
| **Developed by Blink Digital** | [Model Repository](https://huggingface.co/ashishkblink/NuralVoiceSTT) | |