Models
In VDK-Studio, you can create three types of models:

| Type | Description |
| --- | --- |
| Static models | Predefined vocabularies and grammars. They define a fixed set of valid phrases, their structure, and optionally custom phonemes. |
| Dynamic models | A special type of static model that includes slots: placeholders you can fill with vocabulary at runtime. The base model is compiled in the cloud, then updated on the device when slots are filled, which requires an additional on-device compilation. |
| Free-speech models | Large-vocabulary models designed for open-ended input (e.g., dictation). Unlike static or dynamic models, they are not limited to a defined set of phrases. They are implemented in the same way as static models. |
For Static and Free-speech models, send a request of the following form to the REST API:
```json
{
  "stop_at_first_result": true,
  "models": {
    "model-name": { }
  }
}
```
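For example, this request can be sent from Python with the requests library. The following is a minimal sketch, assuming the local service endpoint and the token response field used in the full examples below:

```python
import requests

# Start a recognition session for a static or free-speech model.
response = requests.post(
    "http://localhost:39806/v1/voice-recognition/recognize",
    json={"stop_at_first_result": True, "models": {"model-name": {}}},
)
response.raise_for_status()
token = response.json()["token"]  # used later to open the WebSocket session
```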
To provide slot values for a Dynamic model, send the following request:
```json
{
  "stop_at_first_result": true,
  "models": {
    "model-name": {
      "slots": {
        "slot-name": {
          "values": ["Coffee", "Cola", "Mojito", "Cup of tea"]
        }
      }
    }
  }
}
```
The parsed result will typically contain multiple hypotheses, each with an associated confidence score—the higher the score, the better the match.
In general, a confidence threshold between 4000 and 5000 is considered acceptable (for csdk), though this may vary depending on your specific use case.
If "stop_at_first_result": true, the process will stop at the first result, regardless of its confidence level.
To ensure you only stop when a result meets your desired confidence threshold, you have two options:

- Set "stop_at_first_result" to false and wait until a result with a satisfactory confidence level is returned (see the sketch after this list).
- Configure confidence thresholds directly in the config/vsdk.json file. For more information on adding configuration parameters, see the Configuration File page.
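For the first option, a filter along these lines can be used. This is a sketch only: the hypotheses and confidence field names are assumptions based on the result description above, not a verbatim schema, so adjust them to the actual result payload:

```python
CONFIDENCE_THRESHOLD = 5000  # roughly 4000-5000 is acceptable for csdk


def is_satisfactory(result):
    # Assumed shape: the result carries a list of hypotheses,
    # each with a confidence score (higher is better).
    return any(
        hypothesis.get("confidence", 0) >= CONFIDENCE_THRESHOLD
        for hypothesis in result.get("hypotheses", [])
    )
```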
The audio data is a 16-bit signed PCM buffer in little-endian format. It is always mono (1 channel), and the sample rate is 16 kHz.
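If your source audio is a WAV file rather than raw PCM, a minimal sketch like the following, using Python's standard wave module, can verify the format and extract the raw sample data (check_wav_format is a hypothetical helper, not part of the VDK API):

```python
import wave


def check_wav_format(path):
    # Verify the file matches the expected format before streaming it.
    with wave.open(path, "rb") as w:
        assert w.getnchannels() == 1, "audio must be mono"
        assert w.getsampwidth() == 2, "samples must be 16-bit signed"
        assert w.getframerate() == 16000, "sample rate must be 16 kHz"
        # WAV stores PCM samples little-endian, as the service expects.
        return w.readframes(w.getnframes())
```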
Examples
This example illustrates how to:
- Send a POST request to the VDK service to initiate a voice recognition session.
- Receive an authentication token in the response.
- Use the token to establish a WebSocket connection.
- Stream audio data from a file to the service.
- Handle incoming messages from the WebSocket, including:
  - Recognition results
  - System events
  - Error messages
The client streams audio while simultaneously listening for and printing all incoming events, results, and errors to the console.
Python
```python
import json
import base64
import os
import websocket
import requests


def main():
    raw_audio_file = r"C:\Users\vivoka\Music\coffee.raw"
    request_uri = "http://localhost:39806/v1/voice-recognition/recognize"

    # Step 1: start a recognition session and provide the slot values.
    request_data = {
        "models": {
            "VendingMachine": {
                "slots": {
                    "drink": {
                        "values": ["Coffee", "Cola", "Mojito", "Cup of tea"]
                    }
                }
            }
        }
    }
    response = requests.post(request_uri, json=request_data)
    response.raise_for_status()
    token = response.json()["token"]

    # Step 2: open the WebSocket using the returned token.
    web_socket_url = f"ws://localhost:39806/v1/ws/{token}"
    ws = websocket.WebSocketApp(web_socket_url, on_message=handle_message)

    # Step 3: stream the audio file once the connection is open.
    ws.on_open = lambda _: send_audio_data(ws, raw_audio_file)
    ws.run_forever()


def send_audio_data(ws, audio_file_path):
    # Read audio data from the file and send it through the WebSocket.
    file_size = os.path.getsize(audio_file_path)
    with open(audio_file_path, "rb") as f:
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            base64_chunk = base64.b64encode(chunk).decode("utf-8")
            audio_chunk = {
                "data": f"data:audio/pcm;base64,{base64_chunk}",
                "last": f.tell() == file_size,
            }
            ws.send(json.dumps(audio_chunk))


def handle_message(ws, message):
    try:
        data = json.loads(message)
        if "event" in data:
            print(f"Event received: {json.dumps(data['event'])}")
        elif "result" in data:
            print(f"Result received: {json.dumps(data['result'])}")
        elif "error" in data:
            print(f"Error received: {json.dumps(data['error'])}")
        else:
            print("Unknown message type received.")
    except json.JSONDecodeError as ex:
        print(f"Failed to parse message: {ex}")


if __name__ == "__main__":
    main()
```
C#
```csharp
using System.Text.Json;
using System.Net.WebSockets;
using System.Text;

const string rawAudioFile = @"C:\Users\vivoka\Music\coffee.raw";
const string requestUri = "http://localhost:39806/v1/voice-recognition/recognize";
const string audioDataHeader = "data:audio/pcm;base64,";

var requestData = new
{
    models = new
    {
        VendingMachine = new
        {
            slots = new
            {
                drink = new { values = new[] { "Coffee", "Cola", "Mojito", "Cup of tea" } }
            }
        }
    }
};

// Step 1: start a recognition session over the REST API.
using (var client = new HttpClient())
{
    var json = JsonSerializer.Serialize(requestData);
    var content = new StringContent(json, Encoding.UTF8, "application/json");
    var response = await client.PostAsync(requestUri, content);
    response.EnsureSuccessStatusCode();
    var responseBody = await response.Content.ReadAsStringAsync();

    // Step 2: extract the WebSocket token from the response.
    var jsonResponse = JsonDocument.Parse(responseBody).RootElement;
    var token = jsonResponse.GetProperty("token").GetString();
    var webSocketUrl = $"ws://localhost:39806/v1/ws/{token}";

    // Step 3: connect to the WebSocket and start sending audio data.
    using (var webSocket = new ClientWebSocket())
    {
        await webSocket.ConnectAsync(new Uri(webSocketUrl), CancellationToken.None);
        var sending = Task.Run(() => SendAudioData(webSocket, rawAudioFile));
        var receiving = Task.Run(() => ReceivePackets(webSocket));
        await Task.WhenAll(sending, receiving);
    }
}

async Task SendAudioData(ClientWebSocket webSocket, string audioFilePath)
{
    // Read audio data from the file and send it through the WebSocket.
    using (var fs = File.OpenRead(audioFilePath))
    {
        int bytesRead;
        var buffer = new byte[1024];
        while ((bytesRead = await fs.ReadAsync(buffer, 0, buffer.Length)) > 0)
        {
            var base64 = Convert.ToBase64String(new ArraySegment<byte>(buffer, 0, bytesRead));
            var audioChunk = new
            {
                data = $"{audioDataHeader}{base64}",
                last = fs.Position == fs.Length
            };
            var json = JsonSerializer.Serialize(audioChunk);
            var bytes = Encoding.UTF8.GetBytes(json);
            await webSocket.SendAsync(new ArraySegment<byte>(bytes), WebSocketMessageType.Text, true, CancellationToken.None);
        }
    }
}

async Task ReceivePackets(ClientWebSocket webSocket)
{
    // Accumulate frames until a complete message is received, then handle it.
    var message = "";
    var buffer = new byte[1024];
    while (webSocket.State == WebSocketState.Open)
    {
        var packet = await webSocket.ReceiveAsync(new ArraySegment<byte>(buffer), CancellationToken.None);
        if (packet.MessageType == WebSocketMessageType.Close)
        {
            await webSocket.CloseAsync(WebSocketCloseStatus.NormalClosure, "Closing", CancellationToken.None);
            break;
        }

        message += Encoding.UTF8.GetString(buffer, 0, packet.Count);
        if (packet.EndOfMessage)
        {
            HandleMessage(message);
            message = "";
        }
    }
}

void HandleMessage(string message)
{
    try
    {
        using (var document = JsonDocument.Parse(message))
        {
            var root = document.RootElement;
            if (root.TryGetProperty("event", out JsonElement eventElement))
                Console.WriteLine($"Event received: {JsonSerializer.Serialize(eventElement)}");
            else if (root.TryGetProperty("result", out JsonElement resultElement))
                Console.WriteLine($"Result received: {JsonSerializer.Serialize(resultElement)}");
            else if (root.TryGetProperty("error", out JsonElement errorElement))
                Console.WriteLine($"Error received: {JsonSerializer.Serialize(errorElement)}");
            else
                Console.WriteLine("Unknown message type received.");
        }
    }
    catch (JsonException ex)
    {
        Console.WriteLine($"Failed to parse message: {ex.Message}");
    }
}
```