Spaces:
Running
Running
# 🎙️ NuralVoiceSTT API Documentation

**Developed by Blink Digital**

Complete API documentation for integrating NuralVoiceSTT into your applications.

## 📡 API Endpoints

### WebSocket API (Real-time Streaming)

**Endpoint:** `wss://ashishkblink-NuralVoice.hf.space/ws/transcribe`

**Protocol:** WebSocket (WSS for secure connection)

**Best for:** Real-time audio streaming, live transcription, low-latency applications

---
## 🎯 Quick Start

### Prerequisites

- Node.js 14+ installed
- WebSocket library (`ws` package)
- Audio capture capability (microphone or audio file)

### Installation

```bash
npm install ws
```

---
## 💻 Node.js Examples

### Example 1: Real-time Microphone Streaming

```javascript
const WebSocket = require('ws');
const { spawn } = require('child_process');

// WebSocket URL
const WS_URL = 'wss://ashishkblink-NuralVoice.hf.space/ws/transcribe';

// Connect to WebSocket
const ws = new WebSocket(WS_URL);

ws.on('open', () => {
  console.log('✅ Connected to NuralVoiceSTT API');

  // Start recording from microphone using arecord (Linux) or sox (macOS/Linux)
  // For macOS, you might need: brew install sox
  const recorder = spawn('sox', [
    '-d',                   // Default audio device (microphone)
    '-t', 'raw',            // Raw audio format
    '-r', '16000',          // Sample rate: 16kHz
    '-c', '1',              // Channels: mono
    '-b', '16',             // Bit depth: 16-bit
    '-e', 'signed-integer', // Encoding
    '-'                     // Output to stdout
  ]);

  // Send audio chunks to WebSocket
  recorder.stdout.on('data', (chunk) => {
    if (ws.readyState === WebSocket.OPEN) {
      ws.send(chunk);
    }
  });

  recorder.on('error', (error) => {
    console.error('Recording error:', error);
  });

  // Stop recording after 10 seconds (example)
  setTimeout(() => {
    recorder.kill();
    ws.send(JSON.stringify({ action: 'stop' }));
  }, 10000);
});

ws.on('message', (data) => {
  try {
    const message = JSON.parse(data.toString());
    if (message.status === 'connected') {
      console.log('📡 Ready:', message.message);
    } else if (message.text) {
      if (message.is_final) {
        console.log('✅ Final:', message.text);
      } else if (message.is_partial) {
        console.log('⏳ Partial:', message.text);
      } else {
        console.log('📝 Text:', message.text);
      }
    } else if (message.error) {
      console.error('❌ Error:', message.error);
    }
  } catch (e) {
    console.error('Parse error:', e);
  }
});

ws.on('error', (error) => {
  console.error('WebSocket error:', error);
});

ws.on('close', () => {
  console.log('🔌 Disconnected from API');
});
```
### Example 2: Audio File Transcription

```javascript
const WebSocket = require('ws');
const fs = require('fs');

const WS_URL = 'wss://ashishkblink-NuralVoice.hf.space/ws/transcribe';
const AUDIO_FILE = 'audio.wav'; // Your audio file path

// Connect to WebSocket
const ws = new WebSocket(WS_URL);

let transcription = '';

ws.on('open', () => {
  console.log('✅ Connected to NuralVoiceSTT API');

  // Read audio file
  const audioBuffer = fs.readFileSync(AUDIO_FILE);

  // Convert to 16-bit PCM if needed
  // Note: This assumes the file is already in 16kHz, 16-bit, mono PCM format
  // You may need to convert your audio file first using ffmpeg:
  // ffmpeg -i input.mp3 -ar 16000 -ac 1 -f s16le output.raw

  // Send audio in chunks (4000 bytes = ~0.25 seconds at 16kHz)
  const chunkSize = 4000;
  let offset = 0;

  const sendChunk = () => {
    if (offset < audioBuffer.length && ws.readyState === WebSocket.OPEN) {
      const chunk = audioBuffer.slice(offset, offset + chunkSize);
      ws.send(chunk);
      offset += chunkSize;
      // Send next chunk after a small delay
      setTimeout(sendChunk, 100);
    } else {
      // All chunks sent, request final result
      ws.send(JSON.stringify({ action: 'stop' }));
    }
  };

  sendChunk();
});

ws.on('message', (data) => {
  try {
    const message = JSON.parse(data.toString());
    if (message.text) {
      if (message.is_final) {
        transcription += ' ' + message.text;
        console.log('✅ Final transcription:', transcription.trim());
      } else if (message.is_partial) {
        console.log('⏳ Partial:', message.text);
      }
    } else if (message.error) {
      console.error('❌ Error:', message.error);
    }
  } catch (e) {
    // Handle binary data or other formats
  }
});

ws.on('close', () => {
  console.log('\n📝 Complete Transcription:');
  console.log(transcription.trim());
});
```
### Example 3: Browser Audio Streaming (Node.js Server Proxy)

```javascript
// server.js - Node.js server that proxies browser audio to HF Space
const express = require('express');
const WebSocket = require('ws');
const http = require('http');
const cors = require('cors');

const app = express();
app.use(cors());
app.use(express.json());

const server = http.createServer(app);
const wss = new WebSocket.Server({ server, path: '/ws' });

const HF_WS_URL = 'wss://ashishkblink-NuralVoice.hf.space/ws/transcribe';

wss.on('connection', (clientWs) => {
  console.log('✅ Client connected');

  // Connect to HF Space WebSocket
  const hfWs = new WebSocket(HF_WS_URL);

  hfWs.on('open', () => {
    console.log('✅ Connected to HF Space');
    clientWs.send(JSON.stringify({
      type: 'status',
      message: 'Connected to STT service'
    }));
  });

  // Forward audio from client to HF Space
  clientWs.on('message', (data) => {
    if (hfWs.readyState === WebSocket.OPEN) {
      // If data is JSON, parse it
      try {
        const message = JSON.parse(data.toString());
        if (message.type === 'audio') {
          // Convert array to buffer
          const buffer = Buffer.from(message.data);
          hfWs.send(buffer);
        } else if (message.action === 'stop') {
          hfWs.send(JSON.stringify({ action: 'stop' }));
        }
      } catch (e) {
        // Binary data - send directly
        hfWs.send(data);
      }
    }
  });

  // Forward transcription from HF Space to client
  hfWs.on('message', (data) => {
    try {
      const message = JSON.parse(data.toString());
      clientWs.send(JSON.stringify({
        type: 'transcription',
        text: message.text || '',
        isFinal: message.is_final || false,
        isPartial: message.is_partial || false
      }));
    } catch (e) {
      // Handle non-JSON messages
    }
  });

  hfWs.on('error', (error) => {
    console.error('HF WebSocket error:', error);
    clientWs.send(JSON.stringify({
      type: 'error',
      message: error.message
    }));
  });

  clientWs.on('close', () => {
    hfWs.close();
    console.log('❌ Client disconnected');
  });
});

const PORT = process.env.PORT || 3001;
server.listen(PORT, () => {
  console.log(`🚀 Server running on port ${PORT}`);
  console.log(`📡 WebSocket: ws://localhost:${PORT}/ws`);
});
```
### Example 4: Complete Client-Server Application

```javascript
// client-example.js - Complete example with error handling
const WebSocket = require('ws');

class NuralVoiceClient {
  constructor(wsUrl = 'wss://ashishkblink-NuralVoice.hf.space/ws/transcribe') {
    this.wsUrl = wsUrl;
    this.ws = null;
    this.isConnected = false;
    this.transcription = '';
    this.onTranscription = null;
    this.onError = null;
  }

  connect() {
    return new Promise((resolve, reject) => {
      this.ws = new WebSocket(this.wsUrl);

      this.ws.on('open', () => {
        this.isConnected = true;
        console.log('✅ Connected to NuralVoiceSTT');
        resolve();
      });

      this.ws.on('message', (data) => {
        try {
          const message = JSON.parse(data.toString());
          if (message.status === 'connected') {
            console.log('📡 Ready:', message.message);
          } else if (message.text) {
            if (message.is_final) {
              this.transcription += ' ' + message.text;
              if (this.onTranscription) {
                this.onTranscription(message.text, true);
              }
            } else if (message.is_partial) {
              if (this.onTranscription) {
                this.onTranscription(message.text, false);
              }
            }
          } else if (message.error) {
            console.error('❌ Error:', message.error);
            if (this.onError) {
              this.onError(message.error);
            }
          }
        } catch (e) {
          console.error('Parse error:', e);
        }
      });

      this.ws.on('error', (error) => {
        this.isConnected = false;
        if (this.onError) {
          this.onError(error.message);
        }
        reject(error);
      });

      this.ws.on('close', () => {
        this.isConnected = false;
        console.log('🔌 Disconnected');
      });
    });
  }

  sendAudio(audioBuffer) {
    if (this.ws && this.isConnected && this.ws.readyState === WebSocket.OPEN) {
      this.ws.send(audioBuffer);
      return true;
    }
    return false;
  }

  stop() {
    if (this.ws && this.isConnected) {
      this.ws.send(JSON.stringify({ action: 'stop' }));
    }
  }

  close() {
    if (this.ws) {
      this.ws.close();
    }
  }

  getTranscription() {
    return this.transcription.trim();
  }
}

// Usage example
async function main() {
  const client = new NuralVoiceClient();

  client.onTranscription = (text, isFinal) => {
    if (isFinal) {
      console.log('✅ Final:', text);
    } else {
      console.log('⏳ Partial:', text);
    }
  };

  client.onError = (error) => {
    console.error('Error:', error);
  };

  try {
    await client.connect();

    // Send audio chunks (example)
    // In real usage, you'd get audio from microphone or file
    const audioChunk = Buffer.alloc(4000); // Example chunk
    client.sendAudio(audioChunk);

    // Stop after some time
    setTimeout(() => {
      client.stop();
      console.log('📝 Complete:', client.getTranscription());
      client.close();
    }, 5000);
  } catch (error) {
    console.error('Connection failed:', error);
  }
}

// Uncomment to run
// main();
```

---
| --- | |
| ## π API Protocol | |
| ### Connection | |
| 1. **Connect** to `wss://ashishkblink-NuralVoice.hf.space/ws/transcribe` | |
| 2. **Wait** for connection confirmation message | |
| 3. **Send** audio data as binary (16-bit PCM, 16kHz, mono) | |
| 4. **Receive** transcription results as JSON | |
| ### Audio Format Requirements | |
| - **Sample Rate:** 16,000 Hz (16kHz) | |
| - **Channels:** Mono (1 channel) | |
| - **Bit Depth:** 16-bit | |
| - **Encoding:** Signed integer PCM | |
| - **Format:** Raw binary data (no headers) | |
| ### Converting Audio Files | |
| Use `ffmpeg` to convert audio files to the required format: | |
| ```bash | |
| # Convert MP3 to required format | |
| ffmpeg -i input.mp3 -ar 16000 -ac 1 -f s16le output.raw | |
| # Convert WAV to required format | |
| ffmpeg -i input.wav -ar 16000 -ac 1 -f s16le output.raw | |
| # Record from microphone directly | |
| ffmpeg -f avfoundation -i ":0" -ar 16000 -ac 1 -f s16le output.raw | |
| ``` | |
| ### Message Format | |
| #### Client β Server (Send Audio) | |
| Send raw binary audio data (16-bit PCM): | |
| ```javascript | |
| ws.send(audioBuffer); // Buffer containing 16-bit PCM audio | |
| ``` | |
| Send stop command: | |
| ```javascript | |
| ws.send(JSON.stringify({ action: 'stop' })); | |
| ``` | |
| #### Server β Client (Receive Transcription) | |
| **Status Message:** | |
| ```json | |
| { | |
| "status": "connected", | |
| "message": "Ready to receive audio. Send 16-bit PCM mono audio at 16kHz sample rate.", | |
| "sample_rate": 16000 | |
| } | |
| ``` | |
| **Partial Transcription:** | |
| ```json | |
| { | |
| "text": "hello world", | |
| "is_final": false, | |
| "is_partial": true | |
| } | |
| ``` | |
| **Final Transcription:** | |
| ```json | |
| { | |
| "text": "hello world", | |
| "is_final": true, | |
| "words": [ | |
| { | |
| "word": "hello", | |
| "start": 0.5, | |
| "end": 1.2, | |
| "conf": 0.95 | |
| }, | |
| { | |
| "word": "world", | |
| "start": 1.3, | |
| "end": 2.0, | |
| "conf": 0.92 | |
| } | |
| ] | |
| } | |
| ``` | |
| **Error Message:** | |
| ```json | |
| { | |
| "error": "Error description", | |
| "status": "error" | |
| } | |
| ``` | |
| --- | |
## 🔧 Integration Examples

### Express.js Server

```javascript
const express = require('express');
const WebSocket = require('ws');
const http = require('http');

const app = express();
const server = http.createServer(app);

// WebSocket endpoint
const wss = new WebSocket.Server({ server, path: '/api/transcribe' });

wss.on('connection', (ws) => {
  const hfWs = new WebSocket('wss://ashishkblink-NuralVoice.hf.space/ws/transcribe');

  ws.on('message', (data) => {
    if (hfWs.readyState === WebSocket.OPEN) {
      hfWs.send(data);
    }
  });

  hfWs.on('message', (data) => {
    ws.send(data);
  });
});

server.listen(3000, () => {
  console.log('Server running on port 3000');
});
```

### React Integration

```javascript
// In your React component
import { useEffect, useRef, useState } from 'react';

function SpeechToText() {
  const [transcription, setTranscription] = useState('');
  const wsRef = useRef(null);

  useEffect(() => {
    const ws = new WebSocket('wss://ashishkblink-NuralVoice.hf.space/ws/transcribe');
    wsRef.current = ws;

    ws.onmessage = (event) => {
      const data = JSON.parse(event.data);
      if (data.text) {
        setTranscription(prev => prev + ' ' + data.text);
      }
    };

    return () => ws.close();
  }, []);

  const sendAudio = (audioBuffer) => {
    if (wsRef.current?.readyState === WebSocket.OPEN) {
      wsRef.current.send(audioBuffer);
    }
  };

  return <div>{transcription}</div>;
}
```

---
| --- | |
| ## β οΈ Important Notes | |
| 1. **Rate Limiting:** Be mindful of API usage. Don't send too many requests simultaneously. | |
| 2. **Connection Management:** Always close WebSocket connections when done to free resources. | |
| 3. **Error Handling:** Implement proper error handling for network issues and API errors. | |
| 4. **Audio Quality:** Better audio quality = better transcription accuracy. Use noise reduction when possible. | |
| 5. **Latency:** WebSocket provides low-latency streaming. For best results, send audio in small chunks (2000-4000 bytes). | |
| --- | |
| ## π Troubleshooting | |
| ### Connection Refused | |
| - Check if the Space is running | |
| - Verify the WebSocket URL is correct | |
| - Ensure you're using `wss://` (secure WebSocket) | |
| ### No Transcription | |
| - Verify audio format (16kHz, 16-bit, mono PCM) | |
| - Check if audio is being sent correctly | |
| - Ensure WebSocket connection is open | |
| ### Poor Accuracy | |
| - Use better quality audio | |
| - Reduce background noise | |
| - Speak clearly and at moderate pace | |
| --- | |
| ## π Support | |
| For issues or questions: | |
| - Check the [Space page](https://huggingface.co/spaces/ashishkblink/NuralVoice) | |
| - Review error messages in WebSocket responses | |
| - Ensure your audio format matches requirements | |
| --- | |
| **Developed by Blink Digital** | [Model Repository](https://huggingface.co/ashishkblink/NuralVoiceSTT) | |