Vision Pipelines
Introduction
Vision pipelines process camera images to extract semantic information for robot decision-making. This chapter covers object detection, segmentation, depth estimation, and visual servoing for manipulation and navigation tasks.
Learning Objectives:
- Build object detection pipelines with YOLO and DETR
- Implement semantic segmentation for scene understanding
- Estimate depth from stereo or monocular cameras
- Use visual servoing for precise manipulation
- Optimize vision pipelines for real-time performance
Theory
Vision Pipeline Architecture
Camera → Preprocessing → Feature Extraction → Task-Specific Head → Output
(RGB) (Resize, Norm) (Backbone CNN) (Detection/Seg) (Results)
Common Tasks:
| Task | Input | Output | Use Case |
|---|---|---|---|
| Object Detection | RGB image | Bounding boxes + labels | "Where is the cup?" |
| Segmentation | RGB image | Per-pixel labels | "What region is the table?" |
| Depth Estimation | RGB or Stereo | Depth map | "How far is the object?" |
| Pose Estimation | RGB + Depth | 6D pose (x,y,z, roll,pitch,yaw) | "Grasp this object" |
| Visual Servoing | Image features | Control commands | "Center object in view" |
Model Architectures
Object Detection:
- YOLO: Fast, real-time (30-120 FPS)
- RT-DETR: Transformer-based, high accuracy
- EfficientDet: Efficient, mobile-friendly
Segmentation:
- DeepLabV3+: Semantic segmentation
- Mask R-CNN: Instance segmentation
- Segment Anything (SAM): Zero-shot segmentation
Depth:
- MiDaS: Monocular depth estimation
- Stereo matching: Classical or learned (PSMNet)
Implementation
Object Detection with YOLO
from ultralytics import YOLO
import cv2
class ObjectDetector:
def __init__(self, model_path="yolov8n.pt"):
self.model = YOLO(model_path)
def detect(self, image):
"""Run object detection on image"""
results = self.model(image, conf=0.5)
detections = []
for result in results:
for box in result.boxes:
detections.append({
'class': self.model.names[int(box.cls[0])],
'confidence': float(box.conf[0]),
'bbox': box.xyxy[0].tolist(), # [x1, y1, x2, y2]
})
return detections
# Usage
detector = ObjectDetector("yolov8n.pt")
image = cv2.imread("scene.jpg")
detections = detector.detect(image)
for det in detections:
print(f"Found {det['class']} at {det['bbox']} ({det['confidence']:.2f})")
ROS 2 Vision Node
import rclpy
from rclpy.node import Node
from sensor_msgs.msg import Image
from vision_msgs.msg import Detection2DArray, Detection2D, ObjectHypothesisWithPose
from cv_bridge import CvBridge
from ultralytics import YOLO
class VisionNode(Node):
def __init__(self):
super().__init__('vision_node')
# Subscriptions and publishers
self.image_sub = self.create_subscription(
Image, '/camera/image_raw', self.image_callback, 10
)
self.detection_pub = self.create_publisher(
Detection2DArray, '/detections', 10
)
# Load YOLO model
self.model = YOLO('yolov8n.pt')
self.bridge = CvBridge()
self.get_logger().info("Vision node ready")
def image_callback(self, msg):
"""Process incoming images"""
# Convert ROS image to OpenCV
cv_image = self.bridge.imgmsg_to_cv2(msg, "bgr8")
# Run detection
results = self.model(cv_image, conf=0.5)
# Publish detections
detection_msg = Detection2DArray()
detection_msg.header = msg.header
for result in results:
for box in result.boxes:
det = Detection2D()
det.bbox.center.x = (box.xyxy[0][0] + box.xyxy[0][2]) / 2
det.bbox.center.y = (box.xyxy[0][1] + box.xyxy[0][3]) / 2
det.bbox.size_x = box.xyxy[0][2] - box.xyxy[0][0]
det.bbox.size_y = box.xyxy[0][3] - box.xyxy[0][1]
hyp = ObjectHypothesisWithPose()
hyp.hypothesis.class_id = self.model.names[int(box.cls[0])]
hyp.hypothesis.score = float(box.conf[0])
det.results.append(hyp)
detection_msg.detections.append(det)
self.detection_pub.publish(detection_msg)
def main():
rclpy.init()
node = VisionNode()
rclpy.spin(node)
Depth Estimation
Monocular Depth with MiDaS:
import torch
from torchvision.transforms import Compose, Resize, Normalize
class DepthEstimator:
def __init__(self):
self.model = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
self.model.eval()
self.transform = Compose([
Resize((256, 256)),
Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
def estimate_depth(self, rgb_image):
"""Estimate depth from single RGB image"""
input_batch = self.transform(rgb_image).unsqueeze(0)
with torch.no_grad():
depth = self.model(input_batch)
depth = torch.nn.functional.interpolate(
depth.unsqueeze(1),
size=rgb_image.shape[:2],
mode="bicubic"
).squeeze()
return depth.cpu().numpy()
# Usage
depth_estimator = DepthEstimator()
depth_map = depth_estimator.estimate_depth(rgb_image)
Stereo Depth:
import cv2
class StereoDepth:
def __init__(self):
self.stereo = cv2.StereoBM_create(numDisparities=16*10, blockSize=15)
def compute_depth(self, left_image, right_image):
"""Compute depth from stereo pair"""
# Convert to grayscale
left_gray = cv2.cvtColor(left_image, cv2.COLOR_BGR2GRAY)
right_gray = cv2.cvtColor(right_image, cv2.COLOR_BGR2GRAY)
# Compute disparity
disparity = self.stereo.compute(left_gray, right_gray)
# Convert to depth (with camera parameters)
baseline = 0.12 # meters
focal_length = 700 # pixels
depth = (baseline * focal_length) / (disparity + 1e-6)
return depth
Visual Servoing
Image-Based Visual Servoing (IBVS):
class VisualServo:
def __init__(self, target_image_position):
"""
Args:
target_image_position: Desired object position in image (u, v)
"""
self.target = target_image_position
self.lambda_gain = 0.5 # Control gain
def compute_velocity(self, current_image_position):
"""
Compute camera velocity to center object
Returns:
velocity: (v_x, v_y, v_z, w_x, w_y, w_z)
"""
# Error in image space
error = self.target - current_image_position
# Interaction matrix (simplified)
L = np.array([
[-1, 0, error[0]],
[0, -1, error[1]]
])
# Compute velocity
velocity = -self.lambda_gain * np.linalg.pinv(L) @ error
return velocity
# Usage with object detector
servo = VisualServo(target_image_position=[320, 240]) # Image center
# In control loop:
detections = detector.detect(image)
if detections:
obj_center = get_bbox_center(detections[0]['bbox'])
vel = servo.compute_velocity(obj_center)
robot.set_velocity(vel)
Performance Optimization
Model Optimization
# Export YOLO to TensorRT for 3-5x speedup
from ultralytics import YOLO
model = YOLO("yolov8n.pt")
model.export(format="engine", half=True) # FP16
# Load optimized model
model_trt = YOLO("yolov8n.engine")
results = model_trt(image) # Much faster inference
Batched Inference
# Process multiple images at once
images = [img1, img2, img3, img4]
results = model(images, batch=4) # Process 4 images simultaneously
Image Preprocessing
# Resize to model input size for faster inference
import cv2
def preprocess_for_speed(image, target_size=(640, 640)):
"""Resize image to model input size"""
resized = cv2.resize(image, target_size)
return resized
# Inference at 640x640 is faster than 1920x1080
Summary
Vision pipelines combine object detection, segmentation, and depth estimation to provide semantic scene understanding for robots. YOLO and RT-DETR enable real-time object detection, while MiDaS and stereo matching provide depth information. Visual servoing uses image features to control robot motion. TensorRT optimization and batched inference improve performance for real-time applications.