Skip to main content
Skip table of contents

Speech Enhancement

Introduction

Speech enhancement allows you to improve the quality of audio captured from the microphone—reducing noise, removing artifacts, and enhancing clarity before sending it to ASR, saving it to file, or forwarding it elsewhere.

This makes it especially useful when you want to improve speech recognition accuracy.

You can configure your Speech Enhancer using VDK-Studio. There’s no single configuration that fits all use cases, but you can start with one of the available templates and choose the one that best matches your needs.

Barge-In (AEC)

Acoustic Echo Cancellation (AEC) is a technique used to eliminate the echo that can occur when a device plays audio (e.g., TTS output) and simultaneously captures audio through its microphone. Without AEC, the playback audio may be picked up by the microphone and misinterpreted as user input—especially problematic in interactive voice applications.

Barge-In relies on AEC to ensure the system doesn’t mistakenly detect its own voice as user input.

This feature is not yet available in VDK-Service on Linux and Windows, but it is natively implemented on Android.

This feature is already available in the Android API. To enable it, pass AudioSource.VOICE_COMMUNICATION when creating your AudioRecorder:

JAVA
import com.vivoka.vsdk.audio.producers.AudioRecorder;

// Record from the VOICE_COMMUNICATION audio source, which is the setting that enables barge-in (AEC) on Android.
AudioRecorder audioRecorder = new AudioRecorder(AudioSource.VOICE_COMMUNICATION);

Audio Format

Input: 16 kHz, 16-bit signed PCM, mono or stereo.

Output: 16 kHz, 16-bit signed PCM, mono.

Note that mono or stereo input is defined when configuring a Speech Enhancement technology in VDK-Studio.

Examples

This example demonstrates how to:

  1. Send a POST request to specify the audio enhancer to use.

  2. Extract the token from the response and use it to establish a WebSocket connection.

  3. Send the audio to be processed through the WebSocket.

  4. Receive processed audio data in incoming packets and save it to a result file.

  5. Log any errors returned by the service.

Python
PY
import asyncio
import base64
import json
import os
import requests
import websockets

# Input audio file that is streamed to the service chunk by chunk.
raw_audio_file = r"C:\Users\vivoka\Music\coffee.raw"
# Output file; every audio payload received from the service is appended to it.
result_raw_audio_file = r"C:\Users\vivoka\Music\coffee-result.raw"
# HTTP endpoint that is POSTed to in order to obtain a WebSocket token.
request_uri = "http://localhost:39806/v1/speech-enhancement/enhance"
# Prefix placed before the base64 payload in outgoing chunks and expected on incoming ones.
audio_data_header = "data:audio/pcm;base64,"
# JSON body of the POST request: selects which configured speech enhancer to use.
request_data = { "speech_enhancer": "enhancer-1" }

async def main():
    """Run one complete enhancement round trip.

    Empties the result file, requests a session token over HTTP, then
    streams the input audio and collects the returned messages concurrently
    over a single WebSocket connection.
    """
    create_result_audio_file()

    # Ask the service for a session token using the configured enhancer.
    http_reply = requests.post(request_uri, json=request_data)
    http_reply.raise_for_status()
    session_token = http_reply.json()["token"]

    # Sender and receiver share the same connection and run side by side.
    async with websockets.connect(f"ws://localhost:39806/v1/ws/{session_token}") as websocket:
        await asyncio.gather(
            asyncio.create_task(send_audio_data(websocket, raw_audio_file)),
            asyncio.create_task(handle_message(websocket)),
        )

def create_result_audio_file():
    """Create the result file, or empty it when it already exists."""
    # Opening in "wb" mode creates or truncates; nothing is written here.
    with open(result_raw_audio_file, "wb"):
        pass

async def send_audio_data(websocket, audio_file_path):
    """Stream a file over the WebSocket as base64-encoded JSON chunks.

    The file is read 1024 bytes at a time. Each chunk is sent as a JSON
    object whose "data" field is the audio header followed by the base64
    payload, and whose "last" flag is true only for the chunk that ends
    exactly at the end of the file.
    """
    total_bytes = os.path.getsize(audio_file_path)
    with open(audio_file_path, "rb") as source:
        # iter() with a sentinel stops at the first empty read (end of file).
        for raw_chunk in iter(lambda: source.read(1024), b""):
            payload = audio_data_header + base64.b64encode(raw_chunk).decode("utf-8")
            is_last = source.tell() == total_bytes
            await websocket.send(json.dumps({"data": payload, "last": is_last}))
            await asyncio.sleep(0.01)  # To prevent overwhelming the server

async def handle_message(websocket):
    """Consume messages from the WebSocket until the connection ends.

    Each message is expected to be a JSON object:
      * a "data" field starting with the audio header carries base64 audio,
        which is decoded and appended to the result file;
      * otherwise an "error" field is logged;
      * anything else is reported as an unknown message type.

    A message that is not valid JSON is reported and skipped, so a single
    malformed frame does not stop the receiver and drop the audio that
    follows it.
    """
    async for message in websocket:
        try:
            body = json.loads(message)
        except json.JSONDecodeError as ex:
            print(f"Failed to parse message: {ex}")
            continue

        if "data" in body:
            # A "data" field without the expected header is ignored.
            if body["data"].startswith(audio_data_header):
                with open(result_raw_audio_file, "ab") as f:
                    f.write(base64.b64decode(body["data"][len(audio_data_header):]))
        elif "error" in body:
            print(f"Error received: {json.dumps(body['error'])}")
        else:
            print("Unknown message type received.")

# Run the example only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())
C#
C#

using System.Text.Json;
using System.Net.WebSockets;
using System.Text;

// Input audio streamed to the service. Every byte of this file is sent as audio,
// so it must be headerless PCM in the documented input format (16 kHz, 16-bit
// signed); a .wav container would have its header transmitted as samples.
const string rawAudioFile = @"C:\Users\vivoka\Music\coffee.raw";
// Output file; every audio payload received from the service is appended to it.
const string resultRawAudioFile = @"C:\Users\vivoka\Music\coffee-result.raw";
// HTTP endpoint that is POSTed to in order to obtain a WebSocket token.
const string requestUri = "http://localhost:39806/v1/speech-enhancement/enhance";
// Prefix placed before the base64 payload in outgoing chunks and expected on incoming ones.
const string audioDataHeader = "data:audio/pcm;base64,";
// JSON body of the POST request: selects which configured speech enhancer to use.
var requestData = new { speech_enhancer = "enhancer-1" };

using (var client = new HttpClient())
{
    // Start from an empty result file so output from a previous run is discarded
    using (File.Create(resultRawAudioFile)) { }

    // Step 1: Send a POST request that selects the speech enhancer to use
    var json = JsonSerializer.Serialize(requestData);
    using var content = new StringContent(json, Encoding.UTF8, "application/json");
    using var response = await client.PostAsync(requestUri, content);
    response.EnsureSuccessStatusCode();
    var responseBody = await response.Content.ReadAsStringAsync();

    // Step 2: Extract the token from the response and build the WebSocket URL.
    // JsonDocument is disposable, so it is scoped with 'using' and the token is
    // copied out as a string before the document is released.
    string token;
    using (var responseDocument = JsonDocument.Parse(responseBody))
    {
        token = responseDocument.RootElement.GetProperty("token").ToString();
    }
    var webSocketUrl = $"ws://localhost:39806/v1/ws/{token}";

    // Step 3: Connect to the WebSocket and start sending/receiving audio data
    using (var webSocket = new ClientWebSocket())
    {
        await webSocket.ConnectAsync(new Uri(webSocketUrl), CancellationToken.None);

        // Sender and receiver run concurrently on the same connection
        var sending = Task.Run(() => SendAudioData(webSocket, rawAudioFile));
        var receiving = Task.Run(() => ReceiveMessages(webSocket));
        await Task.WhenAll(sending, receiving);
    }
}

// Streams the given file through the WebSocket in 1024-byte chunks. Each chunk is
// sent as one JSON text message whose "data" field is the audio header followed by
// the base64 payload.
async Task SendAudioData(ClientWebSocket webSocket, string audioFilePath)
{
    using var input = File.OpenRead(audioFilePath);
    var chunk = new byte[1024];

    while (true)
    {
        int count = await input.ReadAsync(chunk, 0, chunk.Length);
        if (count <= 0)
        {
            break;
        }

        // "last" is true only for the chunk that ends exactly at the end of the file
        bool isLast = input.Position == input.Length;
        string payload = audioDataHeader + Convert.ToBase64String(chunk, 0, count);
        string serialized = JsonSerializer.Serialize(new { data = payload, last = isLast });
        byte[] encoded = Encoding.UTF8.GetBytes(serialized);
        await webSocket.SendAsync(new ArraySegment<byte>(encoded), WebSocketMessageType.Text, true, CancellationToken.None);
    }
}

// Receives messages while the socket is open and passes each complete message to
// HandleMessage. A message may arrive in several fragments; the raw bytes are
// accumulated and decoded as UTF-8 only once the final fragment has been received,
// so a multi-byte character split across two fragments is decoded correctly.
async Task ReceiveMessages(ClientWebSocket webSocket)
{
    var receiveBuffer = new byte[1024];
    using (var pending = new MemoryStream())
    {
        while (webSocket.State == WebSocketState.Open)
        {
            var packet = await webSocket.ReceiveAsync(new ArraySegment<byte>(receiveBuffer), CancellationToken.None);
            if (packet.MessageType == WebSocketMessageType.Close)
            {
                // Acknowledge the close request from the server, then stop receiving
                await webSocket.CloseAsync(WebSocketCloseStatus.NormalClosure, "Closing", CancellationToken.None);
                break;
            }

            pending.Write(receiveBuffer, 0, packet.Count);
            if (packet.EndOfMessage)
            {
                HandleMessage(Encoding.UTF8.GetString(pending.GetBuffer(), 0, (int)pending.Length));

                // Reuse the same stream for the next message
                pending.SetLength(0);
            }
        }
    }
}

// Processes one complete JSON message from the service:
//   - a "data" property starting with the audio header carries base64 audio, which
//     is decoded and appended to the result file;
//   - otherwise an "error" property is logged;
//   - anything else is reported as an unknown message type.
// A message that is not valid JSON is logged and ignored.
void HandleMessage(string message)
{
    try
    {
        using (var document = JsonDocument.Parse(message))
        {
            JsonElement root = document.RootElement;
            if (root.TryGetProperty("data", out JsonElement dataElement))
            {
                var data = dataElement.ToString();

                // The header is a protocol prefix, so compare it ordinally rather
                // than with culture-sensitive rules. Other "data" values are ignored.
                if (data.StartsWith(audioDataHeader, StringComparison.Ordinal))
                {
                    var audioBytes = Convert.FromBase64String(data.Substring(audioDataHeader.Length));

                    // Append result audio file
                    using (var fileStream = new FileStream(resultRawAudioFile, FileMode.Append, FileAccess.Write, FileShare.None))
                    {
                        fileStream.Write(audioBytes, 0, audioBytes.Length);
                    }
                }
            }
            else if (root.TryGetProperty("error", out JsonElement errorElement))
            {
                Console.WriteLine($"Error received: {JsonSerializer.Serialize(errorElement)}");
            }
            else
            {
                Console.WriteLine("Unknown message type received.");
            }
        }
    }
    catch (JsonException ex)
    {
        Console.WriteLine($"Failed to parse message: {ex.Message}");
    }
}

JavaScript errors detected

Please note, these errors can depend on your browser setup.

If this problem persists, please contact our support.