Tropo is part of Cisco. Learn More

Manipulating Say with SSML

There are many cases where you need, or simply want, to control the pitch, volume, and intonation of your prompts and responses. To make this easy, Tropo natively supports a standard called the Speech Synthesis Markup Language (SSML).

SSML is an international standard from the W3C for controlling the pace, tone, pitch, and all-around sound of computer-generated voices. Here’s a script that says the same sentence twice: once at normal speed and then again at half speed:

require 'tropo-webapi-ruby'
require 'sinatra'

# Handles POSTs to /index.json by answering with a Tropo document that
# speaks one sentence twice: first unmodified, then wrapped in an SSML
# <prosody rate='-50%'> element.
post '/index.json' do
  ssml = "<speak>One potato, two potato, three potato, four. <prosody rate='-50%'>One potato, two potato, three potato, four.</prosody></speak>"

  generator = Tropo::Generator.new
  generator.say ssml

  # The generator's response is the value of the block, and therefore the
  # body Sinatra sends back.
  generator.response
end
const http = require('http');
const tropo_webapi = require('tropo-webapi');

// Answers every HTTP request with a Tropo JSON document that speaks one
// sentence twice: first unmodified, then wrapped in an SSML
// <prosody rate='-50%'> element.
const server = http.createServer(function (request, response) {

	// TropoWebAPI and TropoJSON are reached through the module object bound
	// above; referencing them as bare identifiers relies on names this
	// script never declares.
	const tropo = new tropo_webapi.TropoWebAPI();

	tropo.say("<speak>One potato, two potato, three potato, four. <prosody rate='-50%'>One potato, two potato, three potato, four.</prosody></speak>");

	// Label the body as JSON so the receiver does not have to guess.
	response.writeHead(200, { 'Content-Type': 'application/json' });
	response.end(tropo_webapi.TropoJSON(tropo));

}).listen(8000);
<?php

require('tropo.class.php');

// SSML prompt: the sentence once unmodified, then again inside a
// <prosody rate='-50%'> element.
$ssml = "<speak>One potato, two potato, three potato, four. <prosody rate='-50%'>One potato, two potato, three potato, four.</prosody></speak>";

$tropo = new Tropo();
$tropo->say($ssml);

// Render the accumulated Tropo document as JSON.
$tropo->RenderJson();

?>
from itty import *
from tropo import Tropo

# SSML prompt: the sentence once unmodified, then again inside a
# <prosody rate='-50%'> element.
SSML_PROMPT = "<speak>One potato, two potato, three potato, four. <prosody rate='-50%'>One potato, two potato, three potato, four.</prosody></speak>"


@post('/index.json')
def index(request):
    """Handle POSTs to /index.json with a Tropo document that says the SSML prompt."""
    tropo = Tropo()
    tropo.say(SSML_PROMPT)
    return tropo.RenderJson()


run_itty(server='wsgiref', host='0.0.0.0', port=8888)

{
   "tropo":[
      {
         "say":[
            {
               "value":"<speak>One potato, two potato, three potato, four. <prosody rate='-50%'>One potato, two potato, three potato, four.</prosody></speak>"
            }
         ]
      }
   ]
}

say-as

In addition to controlling pitch, volume and intonation, there are also times when you need to control how the Text to Speech engine interprets text, especially numbers. The SSML say-as element allows you to define whether the text should be interpreted as currency, digits, number, date, or time. While most of the options are self-explanatory, it may help to note that digits will interpret the text as individual digits instead of one complete number ('1234' will be read as 'one, two, three, four') while number will interpret the text as a complete value ('1234' will sound like 'one thousand two hundred thirty four'). Here's a code example demonstrating the use of say-as:

require 'tropo-webapi-ruby'
require 'sinatra'

# Handles POSTs to /index.json by answering with a Tropo document holding
# five prompts, one per SSML say-as interpretation shown below
# (currency, digits, number, date, time).
post '/index.json' do
  prompts = [
    "<?xml version='1.0'?><speak><say-as interpret-as='currency'>USD51.33</say-as></speak>",
    "<?xml version='1.0'?><speak><say-as interpret-as='digits'>12345678</say-as></speak>",
    "<?xml version='1.0'?><speak><say-as interpret-as='number'>1234.5678</say-as></speak>",
    "<?xml version='1.0'?><speak><say-as interpret-as='date'>20110205</say-as></speak>",
    "<?xml version='1.0'?><speak><say-as interpret-as='time'>12:00p</say-as></speak>"
  ]

  generator = Tropo::Generator.new

  # One say call per prompt, in the order listed above.
  prompts.each { |ssml| generator.say ssml }

  generator.response
end
const http = require('http');
const tropo_webapi = require('tropo-webapi');

// Answers every HTTP request with a Tropo JSON document holding five
// prompts, one per SSML say-as interpretation shown below
// (currency, digits, number, date, time).
const server = http.createServer(function (request, response) {

	// TropoWebAPI and TropoJSON are reached through the module object bound
	// above; referencing them as bare identifiers relies on names this
	// script never declares.
	const tropo = new tropo_webapi.TropoWebAPI();

	tropo.say("<?xml version='1.0'?><speak><say-as interpret-as='currency'>USD51.33</say-as></speak>");
	tropo.say("<?xml version='1.0'?><speak><say-as interpret-as='digits'>12345678</say-as></speak>");
	tropo.say("<?xml version='1.0'?><speak><say-as interpret-as='number'>1234.5678</say-as></speak>");
	tropo.say("<?xml version='1.0'?><speak><say-as interpret-as='date'>20110205</say-as></speak>");
	tropo.say("<?xml version='1.0'?><speak><say-as interpret-as='time'>12:00p</say-as></speak>");

	// Label the body as JSON so the receiver does not have to guess.
	response.writeHead(200, { 'Content-Type': 'application/json' });
	response.end(tropo_webapi.TropoJSON(tropo));

}).listen(8000);
<?php

require('tropo.class.php');

// One SSML prompt per say-as interpretation: currency, digits, number,
// date and time.
$prompts = array(
    "<?xml version='1.0'?><speak><say-as interpret-as='currency'>USD51.33</say-as></speak>",
    "<?xml version='1.0'?><speak><say-as interpret-as='digits'>12345678</say-as></speak>",
    "<?xml version='1.0'?><speak><say-as interpret-as='number'>1234.5678</say-as></speak>",
    "<?xml version='1.0'?><speak><say-as interpret-as='date'>20110205</say-as></speak>",
    "<?xml version='1.0'?><speak><say-as interpret-as='time'>12:00p</say-as></speak>",
);

$tropo = new Tropo();

// Queue the prompts in the order listed above.
foreach ($prompts as $ssml) {
    $tropo->say($ssml);
}

// Render the accumulated Tropo document as JSON.
$tropo->RenderJson();

?>
from itty import *
from tropo import Tropo

# One SSML prompt per say-as interpretation: currency, digits, number,
# date and time.
SAY_AS_PROMPTS = (
    "<?xml version='1.0'?><speak><say-as interpret-as='currency'>USD51.33</say-as></speak>",
    "<?xml version='1.0'?><speak><say-as interpret-as='digits'>12345678</say-as></speak>",
    "<?xml version='1.0'?><speak><say-as interpret-as='number'>1234.5678</say-as></speak>",
    "<?xml version='1.0'?><speak><say-as interpret-as='date'>20110205</say-as></speak>",
    "<?xml version='1.0'?><speak><say-as interpret-as='time'>12:00p</say-as></speak>",
)


@post('/index.json')
def index(request):
    """Handle POSTs to /index.json with a Tropo document that says each prompt in order."""
    tropo = Tropo()
    for ssml in SAY_AS_PROMPTS:
        tropo.say(ssml)
    return tropo.RenderJson()


run_itty(server='wsgiref', host='0.0.0.0', port=8888)

{
   "tropo":[
      {
         "say":[
            {"value":"<?xml version='1.0'?><speak><say-as interpret-as='currency'>USD51.33</say-as></speak>"}
         ]
      },
      {
         "say":[
            {"value":"<?xml version='1.0'?><speak><say-as interpret-as='digits'>12345678</say-as></speak>"}
         ]
      },
      {
         "say":[
            {"value":"<?xml version='1.0'?><speak><say-as interpret-as='number'>1234.5678</say-as></speak>"}
         ]
      },
      {
         "say":[
            {"value":"<?xml version='1.0'?><speak><say-as interpret-as='date'>20110205</say-as></speak>"}
         ]
      },
      {
         "say":[
            {"value":"<?xml version='1.0'?><speak><say-as interpret-as='time'>12:00p</say-as></speak>"}
         ]
      }
   ]
}

SSML Support

Tropo supports all of the Required elements of the SSML specification with one exception. SSML supports a tag called "voice" which allows a developer to specify the voice name. In Tropo, "voice" is instead a parameter of the say or ask verbs and should be set as documented. Attempting to set a voice within SSML will either cause the text to fail to render or cause it to be spoken in a voice other than the one intended.