Voice Synthesis

This example retrieves the available voices and selects the first one for synthesis. It then sends a POST request containing the voice_id and the text to synthesize. The response contains a token, which is used to connect to the WebSocket. Over this connection, the example continuously receives messages, most of which contain synthesized audio data that is appended to a result audio file; it also handles any events or errors that arrive.

C# Example

C#

using System.Text.Json;
using System.Net.WebSockets;
using System.Text;

// Path of the file that is created at startup and to which decoded audio chunks are appended
const string resultRawAudioFile = @"C:\Users\vivoka\Music\coffee.raw";
// Endpoint queried with GET to list the available voices
const string voicesUri = "http://localhost:39806/v1/voice-synthesis/voices";
// Endpoint that receives the POST synthesis request (text + voice_id)
const string synthesisUri = "http://localhost:39806/v1/voice-synthesis/synthesize";
// Prefix expected in front of the base64 payload of audio messages; it is stripped before decoding
const string audioDataHeader = "data:audio/pcm;base64,";

// Entry point of the example: list the voices, request a synthesis with the
// first one, then stream the messages of the resulting WebSocket session.
using (var client = new HttpClient())
{
    // Create (or truncate) the result audio file; audio chunks are appended to it later
    using (File.Create(resultRawAudioFile)) { }

    // Get information about the available voices
    using var voicesResponse = await client.GetAsync(voicesUri);
    voicesResponse.EnsureSuccessStatusCode();
    var voicesResponseBody = await voicesResponse.Content.ReadAsStringAsync();

    // JsonDocument is IDisposable, so it is scoped with a using declaration
    using var voicesDocument = JsonDocument.Parse(voicesResponseBody);
    var voicesResponseJson = voicesDocument.RootElement;
    Console.WriteLine($"Voices : {voicesResponseBody}");
    if (!voicesResponseJson.EnumerateObject().Any())
    {
        Console.WriteLine("No available voices");
        return;
    }

    // Use first available voice: the name of the first property of the response object is sent as voice_id
    var requestData = new { text = "I want a coffee", voice_id = voicesResponseJson.EnumerateObject().First().Name };

    // Send a POST method to request the token
    var json = JsonSerializer.Serialize(requestData);
    using var content = new StringContent(json, Encoding.UTF8, "application/json");
    using var response = await client.PostAsync(synthesisUri, content);
    response.EnsureSuccessStatusCode();
    var responseBody = await response.Content.ReadAsStringAsync();

    // Extract the sample rate and the session token from the response
    using var synthesisDocument = JsonDocument.Parse(responseBody);
    var jsonResponse = synthesisDocument.RootElement;
    var sampleRate = jsonResponse.GetProperty("sample_rate").ToString();
    var token = jsonResponse.GetProperty("token").ToString();

    // This example writes no header to the result file, so report the sample rate returned by the server
    Console.WriteLine($"Sample rate : {sampleRate}");

    // Build the WebSocket URL from the token
    var webSocketUrl = $"ws://localhost:39806/v1/ws/{token}";

    // Connect to the WebSocket and receive messages until the connection is closed
    using (var webSocket = new ClientWebSocket())
    {
        await webSocket.ConnectAsync(new Uri(webSocketUrl), CancellationToken.None);

        // ReceiveMessages is already asynchronous I/O: await it directly instead of wrapping it in Task.Run
        await ReceiveMessages(webSocket);
    }
}

// Reads messages from the WebSocket while it is open and passes every complete
// message to HandleMessage as a string. Stops when a Close frame is received
// (the close handshake is completed first) or when the socket leaves the Open state.
async Task ReceiveMessages(ClientWebSocket webSocket)
{
    var buffer = new byte[1024];

    // A message can span several receive calls. The raw bytes are accumulated and
    // decoded only once the message is complete, so a multi-byte UTF-8 character
    // split across two fragments is never decoded in halves.
    using var messageBytes = new MemoryStream();

    while (webSocket.State == WebSocketState.Open)
    {
        var packet = await webSocket.ReceiveAsync(new ArraySegment<byte>(buffer), CancellationToken.None);
        if (packet.MessageType == WebSocketMessageType.Close)
        {
            await webSocket.CloseAsync(WebSocketCloseStatus.NormalClosure, "Closing", CancellationToken.None);
            break;
        }

        messageBytes.Write(buffer, 0, packet.Count);
        if (!packet.EndOfMessage)
        {
            continue;
        }

        HandleMessage(Encoding.UTF8.GetString(messageBytes.ToArray()));

        // Reset the accumulator for the next message
        messageBytes.SetLength(0);
    }
}

// Handles one complete WebSocket message, expected to be a JSON object:
//   - "data":  when the value starts with audioDataHeader, the remainder is
//              base64-decoded and appended to the result audio file;
//   - "event": printed to the console;
//   - "error": printed to the console.
// Messages that cannot be parsed or decoded are reported on the console
// instead of interrupting the reception of the following messages.
void HandleMessage(string message)
{
    try
    {
        using var document = JsonDocument.Parse(message);
        JsonElement root = document.RootElement;

        // TryGetProperty throws on anything that is not a JSON object
        if (root.ValueKind != JsonValueKind.Object)
        {
            Console.WriteLine("Unknown message type received.");
            return;
        }

        if (root.TryGetProperty("data", out JsonElement dataElement))
        {
            var data = dataElement.ToString();

            // The header is a fixed token: compare it ordinally, not with culture rules
            if (data.StartsWith(audioDataHeader, StringComparison.Ordinal))
            {
                var audioBytes = Convert.FromBase64String(data.Substring(audioDataHeader.Length));

                // Append result audio file
                using var fileStream = new FileStream(resultRawAudioFile, FileMode.Append, FileAccess.Write, FileShare.None);
                fileStream.Write(audioBytes, 0, audioBytes.Length);
            }
        }
        else if (root.TryGetProperty("event", out JsonElement eventElement))
        {
            Console.WriteLine($"Event received: {JsonSerializer.Serialize(eventElement)}");
        }
        else if (root.TryGetProperty("error", out JsonElement errorElement))
        {
            Console.WriteLine($"Error received: {JsonSerializer.Serialize(errorElement)}");
        }
        else
        {
            Console.WriteLine("Unknown message type received.");
        }
    }
    catch (JsonException ex)
    {
        Console.WriteLine($"Failed to parse message: {ex.Message}");
    }
    catch (FormatException ex)
    {
        // Raised by Convert.FromBase64String when the payload is not valid base64
        Console.WriteLine($"Failed to decode audio data: {ex.Message}");
    }
}

Python Example

PY

import asyncio
import base64
import json
import requests
import websockets


# Path of the file that is created at startup and to which decoded audio chunks are appended
result_raw_audio_file = r"C:\Users\vivoka\Music\tts.raw"
# Endpoint queried with GET to list the available voices
voices_uri = "http://localhost:39806/v1/voice-synthesis/voices"
# Endpoint that receives the POST synthesis request (text + voice_id)
synthesis_uri = "http://localhost:39806/v1/voice-synthesis/synthesize"
# Prefix expected in front of the base64 payload of audio messages; it is stripped before decoding
audio_data_header = "data:audio/pcm;base64,"

async def main():
    """Run the synthesis example end to end.

    Creates the result audio file, fetches the available voices, requests a
    synthesis of a fixed sentence with the first voice, then connects to the
    WebSocket identified by the returned token and processes its messages.

    Raises:
        requests.HTTPError: if one of the HTTP requests returns an error status.
        requests.Timeout: if the service does not answer within the timeout.
    """
    # Without a timeout, requests waits indefinitely when the service does not answer
    request_timeout = 10  # seconds

    create_result_audio_file()

    # Get available voices
    voices_response = requests.get(voices_uri, timeout=request_timeout)
    voices_response.raise_for_status()
    voices_response_body = voices_response.json()
    if not voices_response_body:
        print("No available voices")
        return
    print(f"Voices : {voices_response_body}")

    # Use the first available voice for synthesis: the first key of the
    # response mapping is sent as voice_id
    request_data = {
        "text": "I want a coffee",
        "voice_id": next(iter(voices_response_body.keys()))
    }

    # Send synthesis request and get the token
    synthesis_response = requests.post(synthesis_uri, json=request_data, timeout=request_timeout)
    synthesis_response.raise_for_status()
    synthesis_response_body = synthesis_response.json()
    token = synthesis_response_body["token"]

    # Connect to websocket and start receiving messages
    web_socket_url = f"ws://localhost:39806/v1/ws/{token}"
    async with websockets.connect(web_socket_url) as websocket:
        await handle_message(websocket)

def create_result_audio_file():
    """Create the result audio file, emptying it if it already exists."""
    # Opening in "wb" mode creates or truncates the file; nothing is written here
    with open(result_raw_audio_file, "wb"):
        pass

async def handle_message(websocket):
    """Process every message received on the websocket until it is exhausted.

    Each message is parsed as JSON and dispatched on the key it contains:
      - "data": when the value starts with the audio data header, the rest is
        base64-decoded and appended to the result audio file;
      - "event" / "error": printed to the console;
      - anything else is reported as an unknown message type.

    A message that cannot be parsed or decoded is reported and skipped, so a
    single bad message does not stop the reception of the following ones.

    Args:
        websocket: asynchronous iterable yielding the received messages.
    """
    import binascii  # only needed for the base64 decoding error type

    async for message in websocket:
        try:
            body = json.loads(message)
        except json.JSONDecodeError as ex:
            print(f"Failed to parse message: {ex}")
            continue

        if "data" in body:
            data = body["data"]
            if data.startswith(audio_data_header):
                try:
                    audio = base64.b64decode(data[len(audio_data_header):])
                except binascii.Error as ex:
                    print(f"Failed to decode audio data: {ex}")
                    continue
                # Append result audio file
                with open(result_raw_audio_file, "ab") as file:
                    file.write(audio)
        elif "event" in body:
            print(f"Event received: {json.dumps(body['event'])}")
        elif "error" in body:
            print(f"Error received: {json.dumps(body['error'])}")
        else:
            print("Unknown message type received.")

# Run the example only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    asyncio.run(main())