Speech Synthesis

Introduction

Speech synthesis (also known as text-to-speech or TTS) is the process of converting written text into spoken audio.

In VSDK, speech synthesis is powered by CSDK, which offers a wide range of voices across different languages, genders, and voice qualities (see Voice quality availability).

Voice Format

For <language>, refer to the table and use the value from the Vsdk-csdk Code column.
For <name>, use the lowercase version of the name shown in VDK-Studio.
For <quality>, you can find this information in VDK-Studio under Resources → Voice.

Engine	Format	Example
vsdk-csdk	`<language>,<name>,<quality>`	`enu,evan,embedded-pro`

SSML Support

VSDK also supports SSML (Speech Synthesis Markup Language), which gives you finer control over how the text is spoken—allowing adjustments such as:

Pronunciation
Pauses
Pitch
Rate
Emphasis

SSML is supported for embedded voices, but not for neural voices (if present in your configuration). Neural voices are more natural-sounding but behave as a black box and do not support markup-based control.

Audio Format

The audio data is a 16-bit signed PCM buffer in Little-Endian format.
It is always mono (1 channel), and the sample rate depends on the engine being used.

Engine	Sample Rate (Hz)
csdk	22050

Examples

This example demonstrates how to:

Retrieve available voices and select the first one for synthesis.
Send a POST request to the synthesis endpoint, including the voice_id and the text to be synthesized.
Receive a token in the response and use it to establish a WebSocket connection.
Stream synthesized audio through the WebSocket, continuously receiving messages that contain audio data.
Save the audio output to a result file while handling any events or errors returned by the service.

Python

PY

import asyncio
import base64
import json
import requests
import websockets


# Output file that receives the decoded audio bytes (created empty, then appended to).
result_raw_audio_file = r"C:\Users\vivoka\Music\tts.raw"
# Endpoint queried with GET to list the available voices.
voices_uri = "http://localhost:39806/v1/voice-synthesis/voices"
# Endpoint that receives the POSTed synthesis request.
synthesis_uri = "http://localhost:39806/v1/voice-synthesis/synthesize"
# Prefix expected at the start of each audio "data" payload; the remainder is base64.
audio_data_header = "data:audio/pcm;base64,"

async def main():
    """Run the whole example: pick a voice, request a synthesis, stream the audio."""
    # Start from an empty output file
    create_result_audio_file()

    # Ask the service which voices it offers
    voices_reply = requests.get(voices_uri)
    voices_reply.raise_for_status()
    available_voices = voices_reply.json()
    if not available_voices:
        print("No available voices")
        return
    print(f"Voices : {available_voices}")

    # Synthesize with the first voice listed
    first_voice_id = next(iter(available_voices.keys()))
    payload = {"text": "I want a coffee", "voice_id": first_voice_id}

    # The synthesis endpoint answers with the token used to build the WebSocket URL
    synthesis_reply = requests.post(synthesis_uri, json=payload)
    synthesis_reply.raise_for_status()
    token = synthesis_reply.json()["token"]

    # Open the WebSocket and consume messages until the stream ends
    async with websockets.connect(f"ws://localhost:39806/v1/ws/{token}") as websocket:
        await handle_message(websocket)

def create_result_audio_file():
    """Create the result audio file, emptying it if it already exists."""
    with open(result_raw_audio_file, "wb"):
        pass

async def handle_message(websocket):
    """Consume every message from the WebSocket and dispatch it by type.

    Each message is parsed as JSON and handled according to the key it holds:
      - "data": when the value starts with ``audio_data_header``, the base64
        remainder is decoded and appended to ``result_raw_audio_file``.
      - "event": printed as JSON.
      - "error": printed as JSON.

    A message that is not valid JSON is reported and skipped, so a single
    malformed message does not discard the rest of the audio stream.
    """
    async for message in websocket:
        try:
            body = json.loads(message)
        except json.JSONDecodeError as ex:
            print(f"Failed to parse message: {ex}")
            continue

        if "data" in body:
            data = body["data"]
            if data.startswith(audio_data_header):
                # Append result audio file
                with open(result_raw_audio_file, "ab") as file:
                    file.write(base64.b64decode(data[len(audio_data_header):]))
        elif "event" in body:
            print(f"Event received: {json.dumps(body['event'])}")
        elif "error" in body:
            print(f"Error received: {json.dumps(body['error'])}")
        else:
            print("Unknown message type received.")

# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

C#

using System.Text.Json;
using System.Net.WebSockets;
using System.Text;

// File that receives the decoded audio bytes (created empty, then appended to).
const string resultRawAudioFile = @"C:\Users\vivoka\Music\coffee.raw";
// Endpoint queried with GET to list the available voices.
const string voicesUri = "http://localhost:39806/v1/voice-synthesis/voices";
// Endpoint that receives the POSTed synthesis request.
const string synthesisUri = "http://localhost:39806/v1/voice-synthesis/synthesize";
// Prefix expected at the start of each audio "data" payload; the remainder is base64.
const string audioDataHeader = "data:audio/pcm;base64,";

// Example flow: list the voices, request a synthesis with the first one, then
// stream the resulting audio over a WebSocket into the result file.
using (var client = new HttpClient())
{
    // Create the result audio file (an existing file is overwritten)
    using (File.Create(resultRawAudioFile)) { }

    // Get information about the available voices
    var voicesResponse = await client.GetAsync(voicesUri);
    voicesResponse.EnsureSuccessStatusCode();
    var voicesResponseBody = await voicesResponse.Content.ReadAsStringAsync();
    Console.WriteLine($"Voices : {voicesResponseBody}");

    // Use the first available voice; the voice identifiers are the property names
    // of the returned JSON object
    string voiceId;
    using (var voicesDocument = JsonDocument.Parse(voicesResponseBody))
    {
        var voices = voicesDocument.RootElement.EnumerateObject();
        if (!voices.Any())
        {
            Console.WriteLine("No available voices");
            return;
        }

        voiceId = voices.First().Name;
    }

    var requestData = new { text = "I want a coffee", voice_id = voiceId };

    // Send a POST request to obtain the token
    var json = JsonSerializer.Serialize(requestData);
    var content = new StringContent(json, Encoding.UTF8, "application/json");
    var response = await client.PostAsync(synthesisUri, content);
    response.EnsureSuccessStatusCode();
    var responseBody = await response.Content.ReadAsStringAsync();

    // Extract the token from the response and build the WebSocket URL from it
    string token;
    using (var synthesisDocument = JsonDocument.Parse(responseBody))
    {
        token = synthesisDocument.RootElement.GetProperty("token").ToString();
    }

    var webSocketUrl = $"ws://localhost:39806/v1/ws/{token}";

    // Connect to the WebSocket and receive messages until it is closed
    using (var webSocket = new ClientWebSocket())
    {
        await webSocket.ConnectAsync(new Uri(webSocketUrl), CancellationToken.None);
        // ReceiveMessages is already asynchronous I/O, so it is awaited directly
        await ReceiveMessages(webSocket);
    }
}

// Reads from the WebSocket while it is open, reassembling fragmented messages and
// passing each complete message, decoded as UTF-8 text, to HandleMessage.
async Task ReceiveMessages(ClientWebSocket webSocket)
{
    var buffer = new byte[1024];

    // Fragments are accumulated as raw bytes and decoded only once the message is
    // complete: a multi-byte UTF-8 character can be split across two fragments, and
    // decoding each fragment on its own would corrupt it.
    using (var messageBytes = new MemoryStream())
    {
        while (webSocket.State == WebSocketState.Open)
        {
            var packet = await webSocket.ReceiveAsync(new ArraySegment<byte>(buffer), CancellationToken.None);
            if (packet.MessageType == WebSocketMessageType.Close)
            {
                // A close message was received: close our side as well and stop reading
                await webSocket.CloseAsync(WebSocketCloseStatus.NormalClosure, "Closing", CancellationToken.None);
                break;
            }

            messageBytes.Write(buffer, 0, packet.Count);
            if (packet.EndOfMessage)
            {
                HandleMessage(Encoding.UTF8.GetString(messageBytes.GetBuffer(), 0, (int)messageBytes.Length));

                // Reuse the same stream for the next message
                messageBytes.SetLength(0);
            }
        }
    }
}

// Parses one JSON message received from the WebSocket and dispatches it by type:
// "data" (audio) is base64-decoded and appended to the result file, while "event"
// and "error" are printed. A message that is not valid JSON is reported and ignored.
void HandleMessage(string message)
{
    try
    {
        using (var document = JsonDocument.Parse(message))
        {
            JsonElement root = document.RootElement;
            if (root.TryGetProperty("data", out JsonElement dataElement))
            {
                var data = dataElement.ToString();

                // Ordinal comparison: the header is a protocol token, not linguistic text,
                // so the match must not depend on the current culture
                if (data.StartsWith(audioDataHeader, StringComparison.Ordinal))
                {
                    var audioBytes = Convert.FromBase64String(data.Substring(audioDataHeader.Length));

                    // Append result audio file
                    using (var fileStream = new FileStream(resultRawAudioFile, FileMode.Append, FileAccess.Write, FileShare.None))
                    {
                        fileStream.Write(audioBytes, 0, audioBytes.Length);
                    }
                }
            }
            else if (root.TryGetProperty("event", out JsonElement eventElement))
            {
                Console.WriteLine($"Event received: {JsonSerializer.Serialize(eventElement)}");
            }
            else if (root.TryGetProperty("error", out JsonElement errorElement))
            {
                Console.WriteLine($"Error received: {JsonSerializer.Serialize(errorElement)}");
            }
            else
            {
                Console.WriteLine("Unknown message type received.");
            }
        }
    }
    catch (JsonException ex)
    {
        Console.WriteLine($"Failed to parse message: {ex.Message}");
    }
}