Skip to main content

Voice to Action

Introduction

Voice-to-action systems enable robots to understand and execute spoken commands, providing an intuitive human-robot interface. This chapter covers speech-to-text (STT), natural language understanding (NLU), and mapping voice commands to robot actions.

Learning Objectives:

  • Implement speech recognition with Whisper and other STT models
  • Parse natural language commands with LLMs
  • Integrate voice control with ROS 2
  • Handle ambiguous commands and confirmations
  • Deploy voice interfaces on edge devices

Theory

Voice-to-Action Pipeline

Microphone (Audio) → Speech-to-Text (Whisper) → NLU (LLM) → Action Planning (Task planner) → Robot Execution (Controllers)

Key Components:

| Component | Technology | Purpose |
| --- | --- | --- |
| Audio Capture | PyAudio, ROS Audio | Record microphone input |
| Speech-to-Text | Whisper, Vosk, Google STT | Convert speech to text |
| NLU | GPT-4, Claude, Llama | Understand intent |
| Action Mapping | Rule-based or learned | Map commands to actions |
| Feedback | TTS (Text-to-Speech) | Confirm understanding |

STT Model Comparison

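| Model | Speed | Accuracy | Offline | Languages |
| --- | --- | --- | --- | --- |
| Whisper (Base) | Real-time | 95% | Yes | 99 |
| Whisper (Large) | 2x slower | 98% | Yes | 99 |
| Google STT | Fast | 97% | No | 125+ |
| Vosk | Very fast | 90% | Yes | 20 |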

Implementation

Whisper Speech Recognition

import whisper
import pyaudio
import numpy as np

class VoiceInterface:
    """Microphone capture plus Whisper transcription for spoken commands.

    Attributes:
        model: Object returned by ``whisper.load_model("base")``.
        RATE: Capture sample rate in Hz.
        CHUNK: Number of frames requested from the stream per read.
    """

    def __init__(self):
        # Load Whisper model
        self.model = whisper.load_model("base")  # or "small", "medium", "large"

        # Audio capture settings
        self.RATE = 16000
        self.CHUNK = 1024

    def listen(self, duration=5):
        """Capture audio from microphone.

        Args:
            duration: Approximate recording length in seconds. The actual
                length is rounded down to a whole number of chunks.

        Returns:
            A 1-D ``float32`` NumPy array of samples divided by 32768.0.
            The array is empty when ``duration`` is too short to read even
            one chunk.
        """
        audio = pyaudio.PyAudio()
        frames = []
        try:
            stream = audio.open(
                format=pyaudio.paInt16,
                channels=1,
                rate=self.RATE,
                input=True,
                frames_per_buffer=self.CHUNK,
            )
            try:
                print("Listening...")
                for _ in range(int(self.RATE / self.CHUNK * duration)):
                    data = stream.read(self.CHUNK)
                    frames.append(np.frombuffer(data, dtype=np.int16))
            finally:
                # Close the stream even if a read fails part-way through.
                stream.stop_stream()
                stream.close()
        finally:
            # Release the PyAudio instance; otherwise one is leaked on
            # every call.
            audio.terminate()

        # np.concatenate raises on an empty list, so handle "no chunks" here.
        if not frames:
            return np.zeros(0, dtype=np.float32)

        # Combine and normalize
        return np.concatenate(frames).astype(np.float32) / 32768.0

    def transcribe(self, audio_data):
        """Convert speech to text with Whisper.

        Args:
            audio_data: Audio to transcribe, e.g. the array returned by
                :meth:`listen`.

        Returns:
            The ``'text'`` field of the Whisper result, or ``""`` when
            ``audio_data`` is empty (the model is not invoked in that case).
        """
        if len(audio_data) == 0:
            return ""
        result = self.model.transcribe(audio_data, language='en')
        return result['text']

    def voice_loop(self):
        """Continuous voice command loop.

        Yields:
            Each non-blank transcription of a 3-second recording. The
            generator never finishes on its own; the consumer decides when
            to stop iterating.
        """
        while True:
            audio = self.listen(duration=3)
            text = self.transcribe(audio)

            if text.strip():
                print(f"You said: {text}")
                yield text

# Usage: feed every recognized phrase to the application's command handler.
voice = VoiceInterface()
recognized_commands = voice.voice_loop()
for command in recognized_commands:
    process_command(command)

ROS 2 Voice Command Node

import rclpy
from rclpy.node import Node
from std_msgs.msg import String
import whisper

class VoiceCommandNode(Node):
    """ROS 2 node that publishes Whisper transcriptions on ``/voice_command``.

    A timer fires ``listen_and_publish``, which records from the
    microphone, transcribes the recording, and publishes any non-blank
    text as a ``std_msgs/String`` message.
    """

    # Microphone capture settings used by capture_audio (16 kHz, mono, 16-bit).
    SAMPLE_RATE = 16000
    CHUNK = 1024

    def __init__(self):
        super().__init__('voice_command_node')

        # Publisher for recognized commands
        self.command_pub = self.create_publisher(String, '/voice_command', 10)

        # Load Whisper
        self.model = whisper.load_model("base")

        # Start listening. The callback itself blocks while recording, so
        # this period is a lower bound on the spacing between captures.
        self.timer = self.create_timer(3.0, self.listen_and_publish)
        self.get_logger().info("Voice command node ready")

    def capture_audio(self, duration=3):
        """Record from the default microphone.

        Args:
            duration: Approximate recording length in seconds.

        Returns:
            A 1-D ``float32`` NumPy array of samples divided by 32768.0,
            or an empty array when no chunk could be read.
        """
        # Local imports keep this node usable on its own.
        import numpy as np
        import pyaudio

        audio = pyaudio.PyAudio()
        frames = []
        try:
            stream = audio.open(
                format=pyaudio.paInt16,
                channels=1,
                rate=self.SAMPLE_RATE,
                input=True,
                frames_per_buffer=self.CHUNK,
            )
            try:
                for _ in range(int(self.SAMPLE_RATE / self.CHUNK * duration)):
                    data = stream.read(self.CHUNK)
                    frames.append(np.frombuffer(data, dtype=np.int16))
            finally:
                stream.stop_stream()
                stream.close()
        finally:
            # Always release the audio backend, even when recording fails.
            audio.terminate()

        if not frames:
            return np.zeros(0, dtype=np.float32)
        return np.concatenate(frames).astype(np.float32) / 32768.0

    def listen_and_publish(self):
        """Listen for voice command and publish to ROS topic."""
        audio = self.capture_audio(duration=3)
        if len(audio) == 0:
            # Nothing was recorded, so there is nothing to transcribe.
            return

        text = self.model.transcribe(audio)['text']

        if text.strip():
            msg = String()
            msg.data = text
            self.command_pub.publish(msg)
            self.get_logger().info(f"Published command: {text}")

def main():
    """Run the voice command node until interrupted, then clean up.

    Initializes rclpy, spins a ``VoiceCommandNode``, and on exit destroys
    the node and shuts rclpy down if it is still running.
    """
    rclpy.init()
    node = None
    try:
        node = VoiceCommandNode()
        rclpy.spin(node)
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the node; exit quietly.
        pass
    finally:
        if node is not None:
            node.destroy_node()
        # The context may already be shut down (e.g. by a signal handler),
        # and shutting down twice raises.
        if rclpy.ok():
            rclpy.shutdown()

Natural Language Understanding

Parse commands with LLM:

from transformers import pipeline

class CommandParser:
    """Turn free-form voice text into a structured action dict via an LLM."""

    def __init__(self, model="meta-llama/Llama-3-8B"):
        """Create the text-generation pipeline.

        Args:
            model: Model identifier passed to ``pipeline``.
                NOTE(review): confirm the default id resolves on the model
                hub you are using.
        """
        self.llm = pipeline("text-generation", model=model)

    def parse_command(self, voice_text: str):
        """Extract structured action from voice command.

        Args:
            voice_text: Transcribed command text.

        Returns:
            The first JSON object found in the model's reply as a dict,
            or ``None`` when the reply contains no valid JSON object.
        """
        prompt = f"""
Parse this robot command into JSON:
Command: "{voice_text}"

Output format:
{{
"action": "move" | "pick" | "place" | "navigate",
"object": "object name or null",
"location": "location name or null",
"parameters": {{}}
}}
"""

        # max_new_tokens bounds only the reply; max_length would also count
        # the prompt and leave little or no room for output.
        # return_full_text=False keeps the prompt (whose format template is
        # not valid JSON) out of the text handed to extract_json.
        outputs = self.llm(prompt, max_new_tokens=100, return_full_text=False)
        response = outputs[0]['generated_text']
        return self.extract_json(response)

    def extract_json(self, text):
        """Extract JSON from LLM response.

        Scans ``text`` for the first position where a complete, valid JSON
        object starts and decodes it. Invalid brace-delimited fragments are
        skipped rather than raising.

        Args:
            text: Raw model output.

        Returns:
            The decoded object, or ``None`` if ``text`` is empty or holds
            no valid JSON object.
        """
        import json

        if not text:
            return None

        decoder = json.JSONDecoder()
        start = text.find('{')
        while start != -1:
            try:
                parsed, _end = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                # Not a valid object at this brace; try the next one.
                start = text.find('{', start + 1)
            else:
                return parsed
        return None

# Example usage: parse one spoken sentence into a structured action.
parser = CommandParser()
spoken_request = "Pick up the red cup and place it on the table"
command = parser.parse_command(spoken_request)
# Output: {'action': 'pick', 'object': 'red cup', 'location': 'table'}

Advanced Features

Wake Word Detection

import pvporcupine

class WakeWordDetector:
    """Thin wrapper around a Porcupine wake-word engine."""

    def __init__(self, keywords=('robot', 'hey-robot'), access_key=None):
        """Create the Porcupine engine.

        Args:
            keywords: Keyword names handed to ``pvporcupine.create``.
                NOTE(review): confirm these names are available in the
                installed Porcupine version; custom keywords may need
                keyword files instead.
            access_key: Optional access key. It is forwarded only when
                given, so installations that do not accept this argument
                keep working.
        """
        options = {'keywords': list(keywords)}
        if access_key is not None:
            options['access_key'] = access_key
        self.porcupine = pvporcupine.create(**options)

    def is_wake_word(self, audio_frame):
        """Check if audio contains wake word.

        Args:
            audio_frame: One frame of audio in the format the engine's
                ``process`` method expects.

        Returns:
            ``True`` when ``process`` reports a keyword index of 0 or
            greater, ``False`` otherwise.
        """
        keyword_index = self.porcupine.process(audio_frame)
        return keyword_index >= 0

    def close(self):
        """Release the resources held by the Porcupine engine."""
        self.porcupine.delete()

# Only listen after hearing "Hey robot"
# NOTE(review): `audio` is not defined in this snippet and is passed to both
# the detector and the transcriber; confirm both accept the same format.
detector = WakeWordDetector()
wake_word_heard = detector.is_wake_word(audio)
if wake_word_heard:
    command = voice.transcribe(audio)

Command Confirmation

from gtts import gTTS
import os

class VoiceConfirmation:
    """Spoken confirmation prompts for voice commands."""

    def speak(self, text: str):
        """Text-to-speech feedback.

        Synthesizes ``text`` with gTTS, saves it as ``temp.mp3`` in the
        current directory, and plays it with the ``mpg321`` command.

        Args:
            text: Sentence to speak.
        """
        tts = gTTS(text=text, lang='en')
        tts.save("temp.mp3")
        os.system("mpg321 temp.mp3")

    def confirm_action(self, command):
        """Ask user to confirm before executing.

        Relies on a module-level ``voice`` object that provides
        ``listen(duration=...)`` and ``transcribe(audio)``.

        Args:
            command: Command text to read back to the user.

        Returns:
            ``True`` if the transcribed reply contains the word "yes",
            otherwise ``False``.
        """
        self.speak(f"Did you say: {command}?")

        # Listen for confirmation. listen() returns raw audio, so it must
        # be transcribed before it can be inspected as text.
        audio = voice.listen(duration=2)
        response = voice.transcribe(audio)
        return self._is_affirmative(response)

    @staticmethod
    def _is_affirmative(response):
        """Return True when ``response`` contains "yes" as a whole word."""
        import re

        # Whole-word match so that e.g. "yesterday" is not taken as consent,
        # while punctuation such as "Yes." is still accepted.
        words = re.findall(r"[a-z']+", response.lower())
        return "yes" in words

# Usage
# The original snippet used `confirmation` without ever creating it.
# NOTE(review): `robot` and `action` are expected to come from the
# surrounding application; they are not defined in this snippet.
confirmation = VoiceConfirmation()
if confirmation.confirm_action("pick up the cup"):
    robot.execute_action(action)

Summary

Voice-to-action systems combine speech recognition (Whisper, Vosk) with natural language understanding (LLMs) to enable intuitive robot control. ROS 2 integration allows voice commands to trigger navigation, manipulation, and other behaviors. Wake word detection and command confirmation improve usability and safety.

Further Reading