Introduction
Speech synthesis (also known as text-to-speech or TTS) is the process of converting written text into spoken audio.
In VSDK, speech synthesis is powered by CSDK, which offers a wide range of voices across different languages, genders, and voice qualities (see Voice quality availability).
For <language>, refer to the table and use the value from the Vsdk-csdk Code column.
For <name>, use the lowercase version of the name shown in VDK-Studio.
For <quality>, you can find this information in VDK-Studio under Resources → Voice.
| Engine | Format | Example |
|---|---|---|
| vsdk-csdk | <language>,<name>,<quality> | enu,evan,embedded-pro |
SSML Support
VSDK also supports SSML (Speech Synthesis Markup Language), which gives you finer control over how the text is spoken—allowing adjustments such as:
Pronunciation
Pauses
Pitch
Rate
Emphasis
SSML is supported for embedded voices, but not for neural voices (if present in your configuration). Neural voices are more natural-sounding but behave as a black box and do not support markup-based control.
The audio data is a 16-bit signed PCM buffer in Little-Endian format.
It is always mono (1 channel), and the sample rate depends on the engine being used.
| Engine | Sample Rate (Hz) |
|---|---|
| csdk | 22050 |
Examples
This example demonstrates how to:
Retrieve available voices and select the first one for synthesis.
Send a POST request to the synthesis endpoint, including the voice_id and the text to be synthesized.
Receive a token in the response and use it to establish a WebSocket connection.
Stream synthesized audio through the WebSocket, continuously receiving messages that contain audio data.
Save the audio output to a result file while handling any events or errors returned by the service.
Python
PY
import asyncio
import base64
import json
import requests
import websockets
# Output file for the raw PCM audio; it is emptied at start-up and appended to as audio chunks arrive.
result_raw_audio_file = r"C:\Users\vivoka\Music\tts.raw"
# REST endpoint queried for the available voices.
voices_uri = "http://localhost:39806/v1/voice-synthesis/voices"
# REST endpoint that receives the synthesis request (text + voice_id).
synthesis_uri = "http://localhost:39806/v1/voice-synthesis/synthesize"
# Prefix expected at the start of a websocket message's "data" field; the base64 audio follows it.
audio_data_header = "data:audio/pcm;base64,"
async def main():
    """Synthesize a sentence with the first available voice and save the audio.

    Steps: reset the result file, list the voices, post the synthesis
    request, then read the websocket identified by the returned token
    until it closes.
    """
    create_result_audio_file()

    # Ask the service which voices it exposes
    voices_reply = requests.get(voices_uri)
    voices_reply.raise_for_status()
    available_voices = voices_reply.json()
    if not available_voices:
        print("No available voices")
        return
    print(f"Voices : {available_voices}")

    # The first key of the voices response is used as the voice id
    first_voice_id = next(iter(available_voices.keys()))
    payload = {"text": "I want a coffee", "voice_id": first_voice_id}

    # Post the synthesis request; the response carries the websocket token
    synthesis_reply = requests.post(synthesis_uri, json=payload)
    synthesis_reply.raise_for_status()
    token = synthesis_reply.json()["token"]

    # Open the websocket for that token and consume its messages
    async with websockets.connect(f"ws://localhost:39806/v1/ws/{token}") as websocket:
        await handle_message(websocket)
def create_result_audio_file():
    """Create the result audio file, or empty it if it already exists."""
    # Opening in "wb" mode truncates the file; nothing is written here.
    with open(result_raw_audio_file, "wb"):
        pass
async def handle_message(websocket):
    """Consume every message from the websocket until it closes.

    Each message is parsed as JSON and dispatched on its first matching key:
      - "data":  if the value starts with ``audio_data_header``, the base64
                 payload after the header is decoded and appended to the
                 result audio file.
      - "event": the event is printed.
      - "error": the error is printed.
    Anything else is reported as an unknown message type.

    A message that is not valid JSON is reported and skipped; it does not
    stop the reception of the following messages.
    """
    async for message in websocket:
        # Parse per message so that a single malformed message cannot abort
        # the whole stream and truncate the audio file.
        try:
            body = json.loads(message)
        except json.JSONDecodeError as ex:
            print(f"Failed to parse message: {ex}")
            continue

        if "data" in body:
            data = body["data"]
            if data.startswith(audio_data_header):
                # Append the decoded audio chunk to the result file
                with open(result_raw_audio_file, "ab") as file:
                    file.write(base64.b64decode(data[len(audio_data_header):]))
        elif "event" in body:
            print(f"Event received: {json.dumps(body['event'])}")
        elif "error" in body:
            print(f"Error received: {json.dumps(body['error'])}")
        else:
            print("Unknown message type received.")
if __name__ == "__main__":
    # Script entry point: run the main() coroutine to completion.
    asyncio.run(main())
C#
C#
using System.Text.Json;
using System.Net.WebSockets;
using System.Text;
// Output file for the raw PCM audio; it is created empty at start-up and appended to as audio chunks arrive.
const string resultRawAudioFile = @"C:\Users\vivoka\Music\coffee.raw";
// REST endpoint queried for the available voices.
const string voicesUri = "http://localhost:39806/v1/voice-synthesis/voices";
// REST endpoint that receives the synthesis request (text + voice_id).
const string synthesisUri = "http://localhost:39806/v1/voice-synthesis/synthesize";
// Prefix expected at the start of a WebSocket message's "data" field; the base64 audio follows it.
const string audioDataHeader = "data:audio/pcm;base64,";
// End-to-end synthesis example: list the voices, request a synthesis with the
// first one, then stream the resulting audio from the WebSocket into the result file.
using (var client = new HttpClient())
{
    // Create (or truncate) the result audio file; audio chunks are appended to it later
    using (File.Create(resultRawAudioFile)) { }

    // Get information about the available voices
    using var voicesResponse = await client.GetAsync(voicesUri);
    voicesResponse.EnsureSuccessStatusCode();
    var voicesResponseBody = await voicesResponse.Content.ReadAsStringAsync();
    using var voicesDocument = JsonDocument.Parse(voicesResponseBody);
    Console.WriteLine($"Voices : {voicesResponseBody}");

    // The property names of the returned JSON object are used as voice ids.
    // They are materialized once so the object is not enumerated twice.
    var voiceIds = voicesDocument.RootElement.EnumerateObject().Select(voice => voice.Name).ToList();
    if (voiceIds.Count == 0)
    {
        Console.WriteLine("No available voices");
        return;
    }

    // Use the first available voice
    var requestData = new { text = "I want a coffee", voice_id = voiceIds[0] };

    // Send a POST request to start the synthesis and obtain the token
    var json = JsonSerializer.Serialize(requestData);
    using var content = new StringContent(json, Encoding.UTF8, "application/json");
    using var response = await client.PostAsync(synthesisUri, content);
    response.EnsureSuccessStatusCode();
    var responseBody = await response.Content.ReadAsStringAsync();

    // Extract the sample rate (when present) and the token from the response
    using var synthesisDocument = JsonDocument.Parse(responseBody);
    var jsonResponse = synthesisDocument.RootElement;
    if (jsonResponse.TryGetProperty("sample_rate", out var sampleRate))
    {
        // The result file is headerless PCM, so the sample rate is needed to play it back
        Console.WriteLine($"Sample rate : {sampleRate}");
    }

    var token = jsonResponse.GetProperty("token").ToString();
    var webSocketUrl = $"ws://localhost:39806/v1/ws/{token}";

    // Connect to the WebSocket and receive messages until it is closed
    using (var webSocket = new ClientWebSocket())
    {
        await webSocket.ConnectAsync(new Uri(webSocketUrl), CancellationToken.None);
        // Receiving is already asynchronous I/O: await it directly instead of wrapping it in Task.Run
        await ReceiveMessages(webSocket);
    }
}
// Reads the WebSocket until it is closed, reassembling fragmented messages and
// passing each complete message to HandleMessage as a UTF-8 decoded string.
// When a Close frame is received, the close handshake is completed and the loop ends.
async Task ReceiveMessages(ClientWebSocket webSocket)
{
    var buffer = new byte[1024];

    // Fragments are accumulated as raw bytes and decoded only once the message is
    // complete: a fragment boundary can fall in the middle of a multi-byte UTF-8
    // sequence, so decoding each fragment on its own would corrupt such characters.
    using var messageBytes = new MemoryStream();

    while (webSocket.State == WebSocketState.Open)
    {
        var packet = await webSocket.ReceiveAsync(new ArraySegment<byte>(buffer), CancellationToken.None);
        if (packet.MessageType == WebSocketMessageType.Close)
        {
            await webSocket.CloseAsync(WebSocketCloseStatus.NormalClosure, "Closing", CancellationToken.None);
            break;
        }

        messageBytes.Write(buffer, 0, packet.Count);
        if (!packet.EndOfMessage)
        {
            continue;
        }

        HandleMessage(Encoding.UTF8.GetString(messageBytes.GetBuffer(), 0, (int)messageBytes.Length));

        // Reset the accumulator for the next message (SetLength also rewinds the position)
        messageBytes.SetLength(0);
    }
}
// Parses one WebSocket message as JSON and dispatches on the first matching property:
//   "data"  - if the value starts with audioDataHeader, the base64 payload after the
//             header is decoded and appended to the result audio file;
//   "event" - the event is printed;
//   "error" - the error is printed.
// Anything else is reported as an unknown message type. A message that is not valid
// JSON is reported and otherwise ignored.
void HandleMessage(string message)
{
    try
    {
        using var document = JsonDocument.Parse(message);
        var root = document.RootElement;

        if (root.TryGetProperty("data", out var dataElement))
        {
            var data = dataElement.ToString();

            // Ordinal comparison: the header is a protocol token, not linguistic text
            if (data.StartsWith(audioDataHeader, StringComparison.Ordinal))
            {
                var audioBytes = Convert.FromBase64String(data.Substring(audioDataHeader.Length));

                // Append the decoded audio chunk to the result file
                using var fileStream = new FileStream(resultRawAudioFile, FileMode.Append, FileAccess.Write, FileShare.None);
                fileStream.Write(audioBytes, 0, audioBytes.Length);
            }
        }
        else if (root.TryGetProperty("event", out var eventElement))
        {
            Console.WriteLine($"Event received: {JsonSerializer.Serialize(eventElement)}");
        }
        else if (root.TryGetProperty("error", out var errorElement))
        {
            Console.WriteLine($"Error received: {JsonSerializer.Serialize(errorElement)}");
        }
        else
        {
            Console.WriteLine("Unknown message type received.");
        }
    }
    catch (JsonException ex)
    {
        Console.WriteLine($"Failed to parse message: {ex.Message}");
    }
}