{% extends "base.html" %} {% block title %}TTSFM {{ _('docs.title') }}{% endblock %} {% block extra_css %} {% endblock %} {% block content %}

{{ _('docs.title') }}

{{ _('docs.subtitle') }}

{{ _('docs.overview_title') }}

{{ _('docs.overview_desc') }}

{{ _('docs.base_url') }} {{ request.url_root }}api/

{{ _('docs.key_features') }}

  • 🎤 {{ _('docs.feature_voices') }}
  • 🎵 {{ _('docs.feature_formats') }}
  • 🤖 {{ _('docs.feature_openai') }}
  • ✨ {{ _('docs.feature_auto_combine') }}
  • 📊 {{ _('docs.feature_validation') }}
  • 📈 {{ _('docs.feature_monitoring') }}
{{ _('docs.new_version') }} {{ _('docs.new_version_desc') }}

{{ _('docs.authentication_title') }}

{{ _('docs.authentication_desc') }}

Authorization: Bearer YOUR_API_KEY

{{ _('docs.text_validation_title') }}

{{ _('docs.text_validation_desc') }}

{{ _('docs.important') }} {{ _('docs.text_validation_warning') }}

{{ _('docs.validation_options') }}

  • max_length: {{ _('docs.max_length_option') }}
  • validate_length: {{ _('docs.validate_length_option') }}
  • preserve_words: {{ _('docs.preserve_words_option') }}

{{ _('docs.endpoints_title') }}

GET /api/voices

{{ _('docs.get_voices_desc') }}

{{ _('docs.response_example') }}
{
  "voices": [
    {
      "id": "alloy",
      "name": "Alloy",
      "description": "Alloy voice"
    },
    {
      "id": "echo",
      "name": "Echo", 
      "description": "Echo voice"
    }
  ],
  "count": 2
}

GET /api/formats

Get available audio formats for speech generation.

Available Formats

The API accepts several format identifiers, but internally they resolve to only two output formats:

  • mp3 - Returns actual MP3 format
  • All other formats (opus, aac, flac, wav, pcm) - Mapped to WAV format
Note: When you request opus, aac, flac, wav, or pcm, you'll receive WAV audio data.
{{ _('docs.response_example') }}
{
  "formats": [
    {
      "id": "mp3",
      "name": "MP3",
      "mime_type": "audio/mp3",
      "description": "MP3 audio format"
    },
    {
      "id": "opus", 
      "name": "Opus",
      "mime_type": "audio/wav",
      "description": "Returns WAV format"
    },
    {
      "id": "aac",
      "name": "AAC", 
      "mime_type": "audio/wav",
      "description": "Returns WAV format"
    },
    {
      "id": "flac",
      "name": "FLAC",
      "mime_type": "audio/wav", 
      "description": "Returns WAV format"
    },
    {
      "id": "wav",
      "name": "WAV",
      "mime_type": "audio/wav",
      "description": "WAV audio format"
    },
    {
      "id": "pcm",
      "name": "PCM",
      "mime_type": "audio/wav",
      "description": "Returns WAV format"
    }
  ],
  "count": 6
}

POST /api/validate-text

{{ _('docs.validate_text_desc') }}

{{ _('docs.request_body') }}
{
  "text": "Your text to validate",
  "max_length": 4096
}
{{ _('docs.response_example') }}
{
  "text_length": 5000,
  "max_length": 4096,
  "is_valid": false,
  "needs_splitting": true,
  "suggested_chunks": 2,
  "chunk_preview": [
    "First chunk preview...",
    "Second chunk preview..."
  ]
}

POST /api/generate

{{ _('docs.generate_speech_desc') }}

{{ _('docs.request_body') }}
{
  "text": "Hello, world!",
  "voice": "alloy",
  "format": "mp3",
  "instructions": "Speak cheerfully",
  "max_length": 4096,
  "validate_length": true
}
{{ _('docs.parameters') }}
  • text ({{ _('docs.required') }}): {{ _('docs.text_param') }}
  • voice ({{ _('docs.optional') }}): {{ _('docs.voice_param') }}
  • format ({{ _('docs.optional') }}): {{ _('docs.format_param') }}
  • instructions ({{ _('docs.optional') }}): {{ _('docs.instructions_param') }}
  • max_length ({{ _('docs.optional') }}): {{ _('docs.max_length_param') }}
  • validate_length ({{ _('docs.optional') }}): {{ _('docs.validate_length_param') }}
{{ _('docs.response') }}

{{ _('docs.response_audio') }}

{{ _('docs.python_package_title') }}

{{ _('docs.long_text_support') }}

{{ _('docs.long_text_desc') }}

from ttsfm import TTSClient, Voice, AudioFormat

# Create client
client = TTSClient()

# Generate speech from long text (automatically split into chunks, one response per chunk)
responses = client.generate_speech_long_text(
    text="Very long text that exceeds 4096 characters...",
    voice=Voice.ALLOY,
    response_format=AudioFormat.MP3,
    max_length=2000,
    preserve_words=True
)

# Save each chunk as a separate file
for i, response in enumerate(responses, 1):
    response.save_to_file(f"part_{i:03d}.mp3")
{{ _('docs.developer_features') }}
  • {{ _('docs.manual_splitting') }}
  • {{ _('docs.word_preservation') }}
  • {{ _('docs.separate_files') }}
  • {{ _('docs.cli_support') }}
{{ _('docs.note') }} {{ _('docs.auto_combine_note') }}

POST /api/generate-combined

{{ _('docs.combined_audio_desc') }}

{{ _('docs.request_body') }}
{
  "text": "Very long text that exceeds the limit...",
  "voice": "alloy",
  "format": "mp3",
  "instructions": "Optional voice instructions",
  "max_length": 4096,
  "preserve_words": true
}
{{ _('docs.response') }}

{{ _('docs.response_combined_audio') }}

{{ _('docs.response_headers') }}
  • X-Chunks-Combined: {{ _('docs.chunks_combined_header') }}
  • X-Original-Text-Length: {{ _('docs.original_text_length_header') }}
  • X-Audio-Size: {{ _('docs.audio_size_header') }}

POST /v1/audio/speech

{{ _('docs.openai_compatible_desc') }}

{{ _('docs.request_body') }}
{
  "model": "gpt-4o-mini-tts",
  "input": "Text of any length...",
  "voice": "alloy",
  "response_format": "mp3",
  "instructions": "Optional voice instructions",
  "speed": 1.0,
  "auto_combine": true,
  "max_length": 4096
}
{{ _('docs.enhanced_parameters') }}
  • auto_combine (boolean, default: true):
    • true: {{ _('docs.auto_combine_param') }}
    • false: {{ _('docs.auto_combine_false') }}
  • max_length (integer, default: 4096): {{ _('docs.max_length_chunk_param') }}
{{ _('docs.response_headers') }}
  • X-Auto-Combine: {{ _('docs.auto_combine_header') }}
  • X-Chunks-Combined: {{ _('docs.chunks_combined_response') }}
  • X-Original-Text-Length: {{ _('docs.original_text_response') }}
  • X-Audio-Format: {{ _('docs.audio_format_header') }}
  • X-Audio-Size: {{ _('docs.audio_size_response') }}
{{ _('docs.examples_title') }}
# {{ _('docs.short_text_comment') }}
curl -X POST {{ request.url_root }}v1/audio/speech \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-4o-mini-tts",
    "input": "Hello world!",
    "voice": "alloy"
  }'

# {{ _('docs.long_text_auto_comment') }}
curl -X POST {{ request.url_root }}v1/audio/speech \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-4o-mini-tts",
    "input": "Very long text...",
    "voice": "alloy",
    "auto_combine": true
  }'

# {{ _('docs.long_text_no_auto_comment') }}
curl -X POST {{ request.url_root }}v1/audio/speech \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-4o-mini-tts",
    "input": "Very long text...",
    "voice": "alloy",
    "auto_combine": false
  }'
{{ _('docs.audio_combination') }} {{ _('docs.audio_combination_desc') }}
{{ _('docs.use_cases') }}
  • {{ _('docs.use_case_articles') }}
  • {{ _('docs.use_case_audiobooks') }}
  • {{ _('docs.use_case_podcasts') }}
  • {{ _('docs.use_case_education') }}
{{ _('docs.example_usage') }}
# {{ _('docs.python_example_comment') }}
import requests

response = requests.post(
    "{{ request.url_root }}api/generate-combined",
    json={
        "text": "Your very long text content here...",
        "voice": "nova",
        "format": "mp3",
        "max_length": 2000
    }
)

if response.status_code == 200:
    with open("combined_audio.mp3", "wb") as f:
        f.write(response.content)

    chunks = response.headers.get('X-Chunks-Combined')
    print(f"Combined {chunks} chunks into single file")

WebSocket Streaming

Real-time audio streaming for an enhanced user experience. Receive audio chunks as they are generated instead of waiting for the complete file.

WebSocket streaming provides lower perceived latency and real-time progress tracking for TTS generation.

Connection

// JavaScript WebSocket client
const client = new WebSocketTTSClient({
    socketUrl: '{{ request.url_root[:-1] }}',
    debug: true
});

// Connection events
client.onConnect = () => console.log('Connected');
client.onDisconnect = () => console.log('Disconnected');

Streaming TTS Generation

// Generate speech with real-time streaming
const result = await client.generateSpeech('Hello, WebSocket world!', {
    voice: 'alloy',
    format: 'mp3',
    chunkSize: 1024,  // Characters per chunk
    
    // Progress callback
    onProgress: (progress) => {
        console.log(`Progress: ${progress.progress}%`);
        console.log(`Chunks: ${progress.chunksCompleted}/${progress.totalChunks}`);
    },
    
    // Receive audio chunks in real-time
    onChunk: (chunk) => {
        console.log(`Received chunk ${chunk.chunkIndex + 1}`);
        // Process or play audio chunk immediately
        processAudioChunk(chunk.audioData);
    },
    
    // Completion callback
    onComplete: (result) => {
        console.log('Streaming complete!');
        // result.audioData contains the complete audio
    }
});

WebSocket Events

Client → Server Events
Event Description Payload
generate_stream Start TTS generation {text, voice, format, chunk_size}
cancel_stream Cancel active stream {request_id}
Server → Client Events
Event Description Payload
stream_started Stream initiated {request_id, timestamp}
audio_chunk Audio chunk ready {request_id, chunk_index, audio_data, duration}
stream_progress Progress update {progress, chunks_completed, total_chunks}
stream_complete Generation complete {request_id, total_chunks, status}
stream_error Error occurred {request_id, error, timestamp}

Benefits

  • Real-time feedback: Users see progress as audio generates
  • Lower latency: First audio chunk arrives quickly
  • Cancellable: Stop generation mid-stream if needed
  • Efficient: Process chunks as they arrive

Example: Streaming Audio Player

// Create a streaming audio player
const audioChunks = [];
let isPlaying = false;

const streamingPlayer = await client.generateSpeech(longText, {
    voice: 'nova',
    format: 'mp3',
    
    onChunk: (chunk) => {
        // Store chunk
        audioChunks.push(chunk.audioData);
        
        // Start playing once at least three chunks are buffered
        if (!isPlaying && audioChunks.length >= 3) {
            startStreamingPlayback(audioChunks);
            isPlaying = true;
        }
    },
    
    onComplete: (result) => {
        // Ensure all chunks are played
        finishPlayback(result.audioData);
    }
});
Try It Out!

Experience WebSocket streaming in action at the WebSocket Demo or enable streaming mode in the Playground.

{% endblock %} {% block extra_js %} {% endblock %}