# NuralVoiceSTT API Documentation
**Developed by Blink Digital**
Complete API documentation for integrating NuralVoiceSTT into your applications.
## API Endpoints
### WebSocket API (Real-time Streaming)
**Endpoint:** `wss://ashishkblink-NuralVoice.hf.space/ws/transcribe`
**Protocol:** WebSocket (WSS for secure connection)
**Best for:** Real-time audio streaming, live transcription, low-latency applications
---
## Quick Start
### Prerequisites
- Node.js 14+ installed
- WebSocket library (`ws` package)
- Audio capture capability (microphone or audio file)
### Installation
```bash
npm install ws
```
---
## Node.js Examples
### Example 1: Real-time Microphone Streaming
```javascript
const WebSocket = require('ws');
const { spawn } = require('child_process');
// WebSocket URL
const WS_URL = 'wss://ashishkblink-NuralVoice.hf.space/ws/transcribe';
// Connect to WebSocket
const ws = new WebSocket(WS_URL);
ws.on('open', () => {
  console.log('Connected to NuralVoiceSTT API');

  // Start recording from the microphone using arecord (Linux) or sox (macOS/Linux)
  // For macOS, you might need: brew install sox
  const recorder = spawn('sox', [
    '-d',                   // Default audio device (microphone)
    '-t', 'raw',            // Raw audio format
    '-r', '16000',          // Sample rate: 16 kHz
    '-c', '1',              // Channels: mono
    '-b', '16',             // Bit depth: 16-bit
    '-e', 'signed-integer', // Encoding
    '-'                     // Output to stdout
  ]);

  // Send audio chunks to the WebSocket as they arrive
  recorder.stdout.on('data', (chunk) => {
    if (ws.readyState === WebSocket.OPEN) {
      ws.send(chunk);
    }
  });

  recorder.on('error', (error) => {
    console.error('Recording error:', error);
  });

  // Stop recording after 10 seconds (example)
  setTimeout(() => {
    recorder.kill();
    ws.send(JSON.stringify({ action: 'stop' }));
  }, 10000);
});

ws.on('message', (data) => {
  try {
    const message = JSON.parse(data.toString());

    if (message.status === 'connected') {
      console.log('Ready:', message.message);
    } else if (message.text) {
      if (message.is_final) {
        console.log('Final:', message.text);
      } else if (message.is_partial) {
        console.log('Partial:', message.text);
      } else {
        console.log('Text:', message.text);
      }
    } else if (message.error) {
      console.error('Error:', message.error);
    }
  } catch (e) {
    console.error('Parse error:', e);
  }
});

ws.on('error', (error) => {
  console.error('WebSocket error:', error);
});

ws.on('close', () => {
  console.log('Disconnected from API');
});
```
### Example 2: Audio File Transcription
```javascript
const WebSocket = require('ws');
const fs = require('fs');
const WS_URL = 'wss://ashishkblink-NuralVoice.hf.space/ws/transcribe';
const AUDIO_FILE = 'audio.wav'; // Your audio file path
// Connect to WebSocket
const ws = new WebSocket(WS_URL);
let transcription = '';
ws.on('open', () => {
  console.log('Connected to NuralVoiceSTT API');

  // Read the audio file
  const audioBuffer = fs.readFileSync(AUDIO_FILE);

  // Note: this assumes the file already contains raw 16 kHz, 16-bit, mono PCM.
  // A .wav file also carries a container header (see the note after this example),
  // and other formats should be converted first using ffmpeg:
  // ffmpeg -i input.mp3 -ar 16000 -ac 1 -f s16le output.raw

  // Send audio in chunks (4000 bytes = 0.125 seconds at 16 kHz, 16-bit mono)
  const chunkSize = 4000;
  let offset = 0;

  const sendChunk = () => {
    if (offset < audioBuffer.length && ws.readyState === WebSocket.OPEN) {
      const chunk = audioBuffer.slice(offset, offset + chunkSize);
      ws.send(chunk);
      offset += chunkSize;
      // Send the next chunk after a small delay
      setTimeout(sendChunk, 100);
    } else {
      // All chunks sent, request the final result
      ws.send(JSON.stringify({ action: 'stop' }));
    }
  };

  sendChunk();
});

ws.on('message', (data) => {
  try {
    const message = JSON.parse(data.toString());

    if (message.text) {
      if (message.is_final) {
        transcription += ' ' + message.text;
        console.log('Final transcription:', transcription.trim());
      } else if (message.is_partial) {
        console.log('Partial:', message.text);
      }
    } else if (message.error) {
      console.error('Error:', message.error);
    }
  } catch (e) {
    // Ignore messages that are not valid JSON
  }
});

ws.on('close', () => {
  console.log('\nComplete Transcription:');
  console.log(transcription.trim());
});
```
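**Note on WAV files:** Example 2 streams the file byte-for-byte, but the API expects raw PCM with no container headers, and a `.wav` file starts with a RIFF header. For a canonical PCM WAV that header is typically 44 bytes (an assumption; files with extra metadata chunks need a proper WAV parser), so a minimal sketch for skipping it looks like this:

```javascript
const fs = require('fs');

// Return only the PCM sample data from a canonical PCM WAV file.
// Assumes a standard 44-byte RIFF/WAVE header; files with extra chunks
// (LIST, fact, ...) need a real WAV parser instead.
function loadPcmFromWav(path) {
  const HEADER_BYTES = 44; // typical PCM WAV header size (assumption)
  return fs.readFileSync(path).slice(HEADER_BYTES);
}

// Usage in Example 2:
// const audioBuffer = loadPcmFromWav(AUDIO_FILE);
```

The samples themselves still have to be 16 kHz, 16-bit, mono; if they are not, convert the file with ffmpeg first (see *Converting Audio Files* below).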
### Example 3: Browser Audio Streaming (Node.js Server Proxy)
```javascript
// server.js - Node.js server that proxies browser audio to HF Space
const express = require('express');
const WebSocket = require('ws');
const http = require('http');
const cors = require('cors');
const app = express();
app.use(cors());
app.use(express.json());
const server = http.createServer(app);
const wss = new WebSocket.Server({ server, path: '/ws' });
const HF_WS_URL = 'wss://ashishkblink-NuralVoice.hf.space/ws/transcribe';
wss.on('connection', (clientWs) => {
  console.log('Client connected');

  // Connect to HF Space WebSocket
  const hfWs = new WebSocket(HF_WS_URL);

  hfWs.on('open', () => {
    console.log('Connected to HF Space');
    clientWs.send(JSON.stringify({
      type: 'status',
      message: 'Connected to STT service'
    }));
  });

  // Forward audio from client to HF Space
  clientWs.on('message', (data) => {
    if (hfWs.readyState === WebSocket.OPEN) {
      // If data is JSON, parse it
      try {
        const message = JSON.parse(data.toString());
        if (message.type === 'audio') {
          // Convert array to buffer
          const buffer = Buffer.from(message.data);
          hfWs.send(buffer);
        } else if (message.action === 'stop') {
          hfWs.send(JSON.stringify({ action: 'stop' }));
        }
      } catch (e) {
        // Binary data - send directly
        hfWs.send(data);
      }
    }
  });

  // Forward transcription from HF Space to client
  hfWs.on('message', (data) => {
    try {
      const message = JSON.parse(data.toString());
      clientWs.send(JSON.stringify({
        type: 'transcription',
        text: message.text || '',
        isFinal: message.is_final || false,
        isPartial: message.is_partial || false
      }));
    } catch (e) {
      // Ignore non-JSON messages
    }
  });

  hfWs.on('error', (error) => {
    console.error('HF WebSocket error:', error);
    clientWs.send(JSON.stringify({
      type: 'error',
      message: error.message
    }));
  });

  clientWs.on('close', () => {
    hfWs.close();
    console.log('Client disconnected');
  });
});

const PORT = process.env.PORT || 3001;
server.listen(PORT, () => {
  console.log(`Server running on port ${PORT}`);
  console.log(`WebSocket: ws://localhost:${PORT}/ws`);
});
```
### Example 4: Complete Client-Server Application
```javascript
// client-example.js - Complete example with error handling
const WebSocket = require('ws');
class NuralVoiceClient {
  constructor(wsUrl = 'wss://ashishkblink-NuralVoice.hf.space/ws/transcribe') {
    this.wsUrl = wsUrl;
    this.ws = null;
    this.isConnected = false;
    this.transcription = '';
    this.onTranscription = null;
    this.onError = null;
  }

  connect() {
    return new Promise((resolve, reject) => {
      this.ws = new WebSocket(this.wsUrl);

      this.ws.on('open', () => {
        this.isConnected = true;
        console.log('Connected to NuralVoiceSTT');
        resolve();
      });

      this.ws.on('message', (data) => {
        try {
          const message = JSON.parse(data.toString());

          if (message.status === 'connected') {
            console.log('Ready:', message.message);
          } else if (message.text) {
            if (message.is_final) {
              this.transcription += ' ' + message.text;
              if (this.onTranscription) {
                this.onTranscription(message.text, true);
              }
            } else if (message.is_partial) {
              if (this.onTranscription) {
                this.onTranscription(message.text, false);
              }
            }
          } else if (message.error) {
            console.error('Error:', message.error);
            if (this.onError) {
              this.onError(message.error);
            }
          }
        } catch (e) {
          console.error('Parse error:', e);
        }
      });

      this.ws.on('error', (error) => {
        this.isConnected = false;
        if (this.onError) {
          this.onError(error.message);
        }
        reject(error);
      });

      this.ws.on('close', () => {
        this.isConnected = false;
        console.log('Disconnected');
      });
    });
  }

  sendAudio(audioBuffer) {
    if (this.ws && this.isConnected && this.ws.readyState === WebSocket.OPEN) {
      this.ws.send(audioBuffer);
      return true;
    }
    return false;
  }

  stop() {
    if (this.ws && this.isConnected) {
      this.ws.send(JSON.stringify({ action: 'stop' }));
    }
  }

  close() {
    if (this.ws) {
      this.ws.close();
    }
  }

  getTranscription() {
    return this.transcription.trim();
  }
}

// Usage example
async function main() {
  const client = new NuralVoiceClient();

  client.onTranscription = (text, isFinal) => {
    if (isFinal) {
      console.log('Final:', text);
    } else {
      console.log('Partial:', text);
    }
  };

  client.onError = (error) => {
    console.error('Error:', error);
  };

  try {
    await client.connect();

    // Send audio chunks (example)
    // In real usage, you'd get audio from a microphone or file
    const audioChunk = Buffer.alloc(4000); // Example chunk (silence)
    client.sendAudio(audioChunk);

    // Stop after some time
    setTimeout(() => {
      client.stop();
      console.log('Complete:', client.getTranscription());
      client.close();
    }, 5000);
  } catch (error) {
    console.error('Connection failed:', error);
  }
}
// Uncomment to run
// main();
```
---
## API Protocol
### Connection
1. **Connect** to `wss://ashishkblink-NuralVoice.hf.space/ws/transcribe`
2. **Wait** for connection confirmation message
3. **Send** audio data as binary (16-bit PCM, 16kHz, mono)
4. **Receive** transcription results as JSON (a minimal sketch of all four steps follows below)
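The four steps above, reduced to the smallest possible Node.js program (a sketch using the same `ws` package as the earlier examples; the one-second silent buffer is only a placeholder for real audio):

```javascript
const WebSocket = require('ws');

// Step 1: connect
const ws = new WebSocket('wss://ashishkblink-NuralVoice.hf.space/ws/transcribe');

ws.on('message', (data) => {
  const msg = JSON.parse(data.toString());

  if (msg.status === 'connected') {
    // Step 2: confirmation received.
    // Step 3: send binary 16-bit PCM (32,000 bytes = 1 second of silence here).
    ws.send(Buffer.alloc(32000));
    ws.send(JSON.stringify({ action: 'stop' }));
  } else {
    // Step 4: transcription results (or errors) arrive as JSON
    console.log(msg);
  }
});
```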
### Audio Format Requirements
- **Sample Rate:** 16,000 Hz (16kHz)
- **Channels:** Mono (1 channel)
- **Bit Depth:** 16-bit
- **Encoding:** Signed integer PCM
- **Format:** Raw binary data (no headers)
### Converting Audio Files
Use `ffmpeg` to convert audio files to the required format:
```bash
# Convert MP3 to required format
ffmpeg -i input.mp3 -ar 16000 -ac 1 -f s16le output.raw
# Convert WAV to required format
ffmpeg -i input.wav -ar 16000 -ac 1 -f s16le output.raw
# Record from the microphone directly (macOS, AVFoundation)
ffmpeg -f avfoundation -i ":0" -ar 16000 -ac 1 -f s16le output.raw
# Record from the microphone directly (Linux, ALSA)
ffmpeg -f alsa -i default -ar 16000 -ac 1 -f s16le output.raw
```
### Message Format
#### Client → Server (Send Audio)
Send raw binary audio data (16-bit PCM):
```javascript
ws.send(audioBuffer); // Buffer containing 16-bit PCM audio
```
Send stop command:
```javascript
ws.send(JSON.stringify({ action: 'stop' }));
```
#### Server → Client (Receive Transcription)
**Status Message:**
```json
{
"status": "connected",
"message": "Ready to receive audio. Send 16-bit PCM mono audio at 16kHz sample rate.",
"sample_rate": 16000
}
```
**Partial Transcription:**
```json
{
"text": "hello world",
"is_final": false,
"is_partial": true
}
```
**Final Transcription:**
```json
{
"text": "hello world",
"is_final": true,
"words": [
{
"word": "hello",
"start": 0.5,
"end": 1.2,
"conf": 0.95
},
{
"word": "world",
"start": 1.3,
"end": 2.0,
"conf": 0.92
}
]
}
```
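The word-level timings in the final message can be turned into a simple timestamped transcript. A small sketch using the field names shown above:

```javascript
// Render the `words` array of a final transcription as one line per word:
// [start - end] word (confidence)
function formatWordTimings(message) {
  if (!message.is_final || !Array.isArray(message.words)) return '';
  return message.words
    .map(w => `[${w.start.toFixed(2)}s - ${w.end.toFixed(2)}s] ${w.word} (conf ${w.conf})`)
    .join('\n');
}

// For the example message above this prints:
// [0.50s - 1.20s] hello (conf 0.95)
// [1.30s - 2.00s] world (conf 0.92)
```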
**Error Message:**
```json
{
"error": "Error description",
"status": "error"
}
```
---
## Integration Examples
### Express.js Server
```javascript
const express = require('express');
const WebSocket = require('ws');
const http = require('http');
const app = express();
const server = http.createServer(app);
// WebSocket endpoint
const wss = new WebSocket.Server({ server, path: '/api/transcribe' });
wss.on('connection', (ws) => {
  const hfWs = new WebSocket('wss://ashishkblink-NuralVoice.hf.space/ws/transcribe');

  // Forward client audio to the HF Space
  ws.on('message', (data) => {
    if (hfWs.readyState === WebSocket.OPEN) {
      hfWs.send(data);
    }
  });

  // Forward transcription results back to the client
  hfWs.on('message', (data) => {
    ws.send(data);
  });
});

server.listen(3000, () => {
  console.log('Server running on port 3000');
});
```
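With that proxy running, a browser page can connect to your own origin instead of the Space directly. A minimal client-side sketch (assumes the server above is listening on port 3000):

```javascript
// Browser-side usage of the Express proxy above
const ws = new WebSocket('ws://localhost:3000/api/transcribe');

ws.onmessage = (event) => {
  const message = JSON.parse(event.data);
  if (message.text) {
    console.log(message.is_final ? 'Final:' : 'Partial:', message.text);
  }
};

// Send raw 16-bit PCM chunks exactly as you would to the Space itself:
// ws.send(pcmChunk);
```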
### React Integration
```javascript
// In your React component
import { useEffect, useRef, useState } from 'react';
function SpeechToText() {
  const [transcription, setTranscription] = useState('');
  const wsRef = useRef(null);

  useEffect(() => {
    const ws = new WebSocket('wss://ashishkblink-NuralVoice.hf.space/ws/transcribe');
    wsRef.current = ws;

    ws.onmessage = (event) => {
      const data = JSON.parse(event.data);
      // Append only final results so partial updates are not duplicated
      if (data.text && data.is_final) {
        setTranscription(prev => prev + ' ' + data.text);
      }
    };

    // Close the connection when the component unmounts
    return () => ws.close();
  }, []);

  const sendAudio = (audioBuffer) => {
    if (wsRef.current?.readyState === WebSocket.OPEN) {
      wsRef.current.send(audioBuffer);
    }
  };

  return <div>{transcription}</div>;
}
```
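The component above leaves audio capture up to you. One way to feed `sendAudio` is the Web Audio API: request the microphone, read the Float32 samples, convert them to 16-bit PCM, and send each chunk. A rough sketch (it uses the deprecated but widely supported `ScriptProcessorNode` for brevity; an `AudioWorklet` is the modern replacement, and the requested 16 kHz sample rate is a hint the browser may not honor, so production code should resample explicitly):

```javascript
// Capture microphone audio in the browser and forward raw 16-bit PCM chunks.
// `sendAudio` is the function from the React component above.
async function startMicrophone(sendAudio) {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  const audioCtx = new AudioContext({ sampleRate: 16000 }); // request 16 kHz
  const source = audioCtx.createMediaStreamSource(stream);
  const processor = audioCtx.createScriptProcessor(4096, 1, 1); // mono in/out

  processor.onaudioprocess = (event) => {
    const samples = event.inputBuffer.getChannelData(0); // Float32Array, -1..1
    const pcm = new Int16Array(samples.length);
    for (let i = 0; i < samples.length; i++) {
      const s = Math.max(-1, Math.min(1, samples[i])); // clamp
      pcm[i] = s < 0 ? s * 0x8000 : s * 0x7fff;        // scale to int16
    }
    sendAudio(pcm.buffer); // one binary WebSocket frame of raw PCM
  };

  source.connect(processor);
  processor.connect(audioCtx.destination);
}
```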
---
## Important Notes
1. **Rate Limiting:** Be mindful of API usage. Avoid opening many concurrent connections or streaming audio faster than real time.
2. **Connection Management:** Always close WebSocket connections when done to free resources.
3. **Error Handling:** Implement proper error handling for network issues and API errors.
4. **Audio Quality:** Better audio quality = better transcription accuracy. Use noise reduction when possible.
5. **Latency:** WebSocket provides low-latency streaming. For best results, send audio in small chunks (2000-4000 bytes; see the quick calculation below).
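As a quick sanity check on those chunk sizes: the required format is 16,000 samples per second at 2 bytes per sample, mono, so the stream runs at 32,000 bytes per second and a 4,000-byte chunk covers 125 ms of audio:

```javascript
// Bytes per second of the required format: 16,000 samples/s * 2 bytes/sample * 1 channel
const BYTES_PER_SECOND = 16000 * 2 * 1; // 32,000

// Duration of one chunk in milliseconds
const chunkMs = (bytes) => (bytes / BYTES_PER_SECOND) * 1000;

console.log(chunkMs(2000)); // 62.5 ms
console.log(chunkMs(4000)); // 125 ms
```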
---
## Troubleshooting
### Connection Refused
- Check if the Space is running
- Verify the WebSocket URL is correct
- Ensure you're using `wss://` (secure WebSocket)
### No Transcription
- Verify audio format (16kHz, 16-bit, mono PCM)
- Check if audio is being sent correctly
- Ensure WebSocket connection is open
### Poor Accuracy
- Use better quality audio
- Reduce background noise
- Speak clearly and at moderate pace
---
## Support
For issues or questions:
- Check the [Space page](https://huggingface.co/spaces/ashishkblink/NuralVoice)
- Review error messages in WebSocket responses
- Ensure your audio format matches requirements
---
**Developed by Blink Digital** | [Model Repository](https://huggingface.co/ashishkblink/NuralVoiceSTT)