引言

目标检测作为计算机视觉中的核心任务,广泛应用于安防监控、自动驾驶、工业质检等场景。传统的目标检测方法多使用水平边界框(HBB)进行目标定位。然而在一些特殊应用场景(如遥感图像、文本检测、PCB检测等)中,目标往往呈现任意角度的旋转,这种情况下传统的水平框就显得力不从心。

为了解决这个问题,旋转边界框(Oriented Bounding Box, OBB)被提出,用来更精确地拟合和定位具有方向性的目标。本文将详细介绍如何使用 YOLOv8-OBB 完成旋转目标检测的完整流程,包括:

1.数据集的制作与准备

2.模型的训练与预测

3.旋转框的裁剪与摆正

4.基于模型的自动标注技术

一、制作数据集

可参考博客:https://fakerth.blog.csdn.net/article/details/144069399

数据集目录结构:

dataset/
├── images/
│   ├── train/
│   ├── val/
│   └── test/
└── labels/
    ├── train/
    ├── val/
    └── test/

二、训练,预测

1.训练

import torch
from myultralytics.ultralytics import YOLO


def main():
    """Train a YOLOv8-OBB model on the custom rotated-box dataset."""
    # Alternative: build from YAML and transfer pretrained weights:
    # model = YOLO('yolov8-obb.yaml').load('pt/yolov8x-obb.pt')
    detector = YOLO("yolov8n-obb.pt")
    # Prefer the first GPU; fall back to CPU when CUDA is unavailable.
    train_device = 'cpu' if not torch.cuda.is_available() else 0
    detector.train(
        data='./your_dataset.yaml',
        epochs=5,
        imgsz=640,
        batch=16,
        workers=0,
        device=train_device,
    )


if __name__ == '__main__':
    main()

训练参数说明:

  • data: 数据集配置文件路径

  • epochs: 训练轮数

  • imgsz: 输入图像尺寸

  • batch: 批次大小

  • workers: 数据加载线程数

  • device: 训练设备(0表示第一个GPU)

./your_dataset.yaml:

path: D:/Wmrecognition/utils/dataset/
train: images/train
val: images/val
test: images/test
names:
  0: wm

训练后权重保存在 runs/obb/trainX/weights/best.pt

2.预测并将旋转框裁剪摆正

import math
import os

import cv2
import numpy as np
from ultralytics import YOLO
from PIL import Image, ImageDraw

model = YOLO('./runs/obb/train7/weights/best.pt')
results = model('./412.jpg', save=True)

image = cv2.imread('./412.jpg')

# Get the oriented-bounding-box results predicted by YOLO-OBB
obb = results[0].obb  # OBB results for the first (and only) input image

# Extract the rotated-box parameters as numpy arrays
xywhr = obb.xywhr.cpu().numpy()  # per box: [x_center, y_center, width, height, angle] — angle in radians (converted with math.degrees below)
xyxyxyxy = obb.xyxyxyxy.cpu().numpy()  # four corner points per box; presumably shape (N, 4, 2) — TODO confirm against ultralytics docs


def rotate_image(image, angle, center=None, scale=1.0):
    """Rotate `image` by `angle` degrees around `center`.

    Args:
        image: image as a numpy array of shape (H, W[, C]).
        angle: rotation angle in degrees (positive = counter-clockwise,
            per cv2.getRotationMatrix2D).
        center: (x, y) pivot point; defaults to the image center.
        scale: isotropic scale factor applied together with the rotation.

    Returns:
        The rotated image. The output canvas keeps the original size,
        so content rotated outside the frame is clipped.
    """
    (h, w) = image.shape[:2]
    if center is None:
        center = (w // 2, h // 2)  # default pivot: image center
    M = cv2.getRotationMatrix2D(center, angle, scale)
    rotated = cv2.warpAffine(image, M, (w, h))
    # Removed a leftover debug cv2.imshow("sadf", ...) call: it opened a
    # GUI window on every invocation and blocks in headless environments.
    return rotated


# Crop a rotated rectangle: rotate the whole image so the box becomes
# axis-aligned, then slice the axis-aligned region out.
def crop_rotated_box(image, box, angle):
    """Return the axis-aligned crop of a rotated box.

    Args:
        image: source image, numpy array (H, W[, C]).
        box: (x_center, y_center, width, height) of the rotated box.
        angle: box rotation in degrees; the image is rotated by this
            angle around the box center so the box becomes upright.

    Returns:
        The cropped region (may be smaller than w x h when the box
        extends past the image border).
    """
    x, y, w, h = box
    center = (x, y)

    # Rotate around the box center so the target ends up axis-aligned.
    rotated_image = rotate_image(image, angle, center=center)

    img_h, img_w = rotated_image.shape[:2]
    # Clamp to the image bounds: negative indices would wrap around in
    # numpy slicing and silently return a wrong (or empty) region.
    x1 = max(int(x - w / 2), 0)
    y1 = max(int(y - h / 2), 0)
    x2 = min(int(x + w / 2), img_w)
    y2 = min(int(y + h / 2), img_h)

    # Slice the now axis-aligned region out of the rotated image.
    cropped = rotated_image[y1:y2, x1:x2]
    return cropped


# Iterate over every predicted box, straighten it, and crop it out.

for box in xywhr:
    x, y, w, h, angle = box  # [x_center, y_center, width, height, angle]
    print(angle)
    # Optionally normalize the angle if needed:
    # angle = angle % 360
    angle = math.degrees(angle)  # xywhr angle is radians; cv2 rotation expects degrees
    # Crop the (now axis-aligned) target region
    cropped_image = crop_rotated_box(image, (x, y, w, h), angle)

    # Show the straightened crop; press any key to advance to the next box
    cv2.imshow(f"Cropped Image", cropped_image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

三、根据模型自动标注

为了便于模型训练或者使用XML格式工具查看标注结果,我们可以将模型预测的 OBB 转换为 VOC 格式的 xml 文件。这个脚本是单张图片生成XML格式文件,可根据需要修改成目录下所有文件。

import cv2
import numpy as np
from lxml import etree
from pathlib import Path
from typing import List, Dict


def create_rotated_voc_xml(
        img_path: Path,
        objects: List[Dict],
        output_dir: Path,
        database: str = "Unknown",
        segmented: int = 0
) -> None:
    """Write a VOC-style XML annotation file containing rotated boxes.

    Args:
        img_path: path of the annotated image (read to get its size).
        objects: one dict per object, with keys:
            - 'name': class name
            - 'cx': rotated-box center x
            - 'cy': rotated-box center y
            - 'w': rotated-box width
            - 'h': rotated-box height
            - 'angle': rotation angle (radians in this pipeline — the caller
              passes YOLO's xywhr angle; confirm against your annotation tool)
        output_dir: directory the ``<image-stem>.xml`` file is written into
            (created if it does not exist).
        database: value for the <source>/<database> node (default "Unknown").
        segmented: value for the <segmented> node (default 0).

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    # cv2.imread returns None instead of raising on a missing/unreadable
    # file; fail loudly instead of crashing later on img.shape.
    img = cv2.imread(str(img_path))
    if img is None:
        raise FileNotFoundError(f"Cannot read image: {img_path}")
    height, width = img.shape[:2]
    depth = img.shape[2] if len(img.shape) == 3 else 1

    # XML root node
    annotation = etree.Element("annotation")

    # Basic file information
    etree.SubElement(annotation, "folder").text = str(output_dir.name)
    etree.SubElement(annotation, "filename").text = img_path.name
    etree.SubElement(annotation, "path").text = str(img_path)

    # source node
    source = etree.SubElement(annotation, "source")
    etree.SubElement(source, "database").text = database

    # size node
    size = etree.SubElement(annotation, "size")
    etree.SubElement(size, "width").text = str(width)
    etree.SubElement(size, "height").text = str(height)
    etree.SubElement(size, "depth").text = str(depth)

    etree.SubElement(annotation, "segmented").text = str(segmented)

    # One <object> node per annotation
    for obj in objects:
        object_node = etree.SubElement(annotation, "object")
        etree.SubElement(object_node, "name").text = str(obj['name'])
        etree.SubElement(object_node, "pose").text = "Unspecified"
        etree.SubElement(object_node, "truncated").text = "0"
        etree.SubElement(object_node, "difficult").text = "0"

        # Rotated-box node
        robndbox = etree.SubElement(object_node, "robndbox")
        etree.SubElement(robndbox, "cx").text = f"{obj['cx']:.1f}"
        etree.SubElement(robndbox, "cy").text = f"{obj['cy']:.1f}"
        etree.SubElement(robndbox, "w").text = f"{obj['w']:.1f}"
        etree.SubElement(robndbox, "h").text = f"{obj['h']:.1f}"
        # The angle is in radians here, so one decimal place would quantize
        # orientation to ~5.7 degree steps; keep six decimals.
        etree.SubElement(robndbox, "angle").text = f"{obj['angle']:.6f}"

        # Empty extra node (kept for tool compatibility)
        etree.SubElement(object_node, "extra")

    # Pretty-printed XML with declaration
    xml_str = etree.tostring(
        annotation,
        pretty_print=True,
        encoding='utf-8',
        xml_declaration=True
    )

    # Make sure the target directory exists before writing.
    output_dir.mkdir(parents=True, exist_ok=True)
    xml_path = output_dir / f"{img_path.stem}.xml"
    xml_path.write_bytes(xml_str)



from ultralytics import YOLO
from pathlib import Path

import math

# 90 degrees in radians. Subtracting it from YOLO's xywhr angle (together
# with swapping w and h) converts to an equivalent rotated-box
# representation — presumably the convention expected by the XML
# annotation tool; verify against the tool you load the XML with.
bb = math.pi / 2

class OBBToVOCConverter:
    """Runs YOLOv8-OBB inference and exports detections as rotated-VOC XML."""

    def __init__(self, model_path: str):
        self.model = YOLO(model_path)

    def process_image(self, img_path: Path, output_dir: Path):
        """Detect oriented boxes in one image and write one VOC XML file."""
        detections = []
        # Run OBB inference and collect every detection.
        for result in self.model(img_path):
            print(result.obb)
            for obb_box in result.obb:
                cx, cy, w, h, angle = obb_box.xywhr[0].tolist()
                label = result.names[int(obb_box.cls)]
                # NOTE(review): (w, h, angle) -> (h, w, angle - pi/2) is an
                # equivalent rotated-box representation; presumably this
                # matches the annotation tool's convention — confirm.
                detections.append({
                    "name": label,
                    "cx": cx,
                    "cy": cy,
                    "w": h,
                    "h": w,
                    "angle": angle - bb,
                })

        # Write the collected detections as a VOC XML file.
        create_rotated_voc_xml(
            img_path=img_path,
            objects=detections,
            output_dir=output_dir,
        )


# Usage example: annotate a single image, writing the XML into Annotations/
converter = OBBToVOCConverter("best7300.pt")
converter.process_image(
    img_path=Path("img/shhn_20241030_czszls_241030103130_vol.jpg"),
    output_dir=Path("Annotations")
)

四、总结

本文详细介绍了使用 YOLOv8-OBB 实现旋转目标检测的完整流程。从数据集的构建,到模型训练与预测,再到旋转框的裁剪、摆正以及自动标注的实现,每一步都贴合实际场景的应用需求,提供了可复现的代码和清晰的逻辑。相较于传统的水平边界框(HBB),旋转边界框(OBB)能更加精准地拟合方向性目标(如文本、斜放物体、工业构件等),显著提升了检测精度和目标表达能力。通过 YOLOv8-OBB 提供的能力,我们可以实现:

  • 对于任意角度的目标检测更具鲁棒性;

  • 利用检测结果进行目标的旋转裁剪与方向校正;

  • 自动生成 VOC 旋转标注数据,显著降低人工标注成本;

  • 快速迭代模型并完成数据闭环,以支持复杂的工业、遥感和文本检测场景。

此外,本文所提供的自动标注流程也为构建半自动标注系统提供了良好基础,可以结合人工审核进一步提高标注效率和质量,助力小样本学习与增量学习场景。

Logo

GitCode 天启AI是一款由 GitCode 团队打造的智能助手,基于先进的LLM(大语言模型)与多智能体 Agent 技术构建,致力于为用户提供高效、智能、多模态的创作与开发支持。它不仅支持自然语言对话,还具备处理文件、生成 PPT、撰写分析报告、开发 Web 应用等多项能力,真正做到“一句话,让 AI 帮你完成复杂任务”。

更多推荐