
Data API

Datasets

modern_yolonas.data.coco.COCODetectionDataset

Bases: Dataset

COCO-format detection dataset.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| root | str \| Path | Path to image directory (e.g., coco/images/train2017). | required |
| ann_file | str \| Path | Path to annotation JSON (e.g., coco/annotations/instances_train2017.json). | required |
| transforms | Transform \| None | (image, targets) → (image, targets) callable. | None |
| input_size | int | Target input size (used by transforms). | 640 |
| cache_annotations | bool | Pre-load all annotation arrays into memory to avoid repeated pycocotools lookups. | True |
| ignore_empty_annotations | bool | Drop images that have no (non-crowd) annotations. | True |
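
Example: a minimal usage sketch (the paths below are illustrative, assuming a standard COCO-2017 download):

from modern_yolonas.data.coco import COCODetectionDataset

dataset = COCODetectionDataset(
    root="coco/images/train2017",
    ann_file="coco/annotations/instances_train2017.json",
)
print(len(dataset), dataset.class_names[:3])
image, targets = dataset[0]  # HWC uint8 BGR image, [N, 5] float32 targets
# targets rows are [class_id, x_center, y_center, w, h], normalized to [0, 1]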
Source code in src/modern_yolonas/data/coco.py
class COCODetectionDataset(Dataset):
    """COCO-format detection dataset.

    Args:
        root: Path to image directory (e.g., ``coco/images/train2017``).
        ann_file: Path to annotation JSON (e.g., ``coco/annotations/instances_train2017.json``).
        transforms: ``(image, targets) → (image, targets)`` callable.
        input_size: Target input size (used by transforms).
        cache_annotations: Pre-load all annotation arrays into memory to
            avoid repeated pycocotools lookups.
        ignore_empty_annotations: Drop images with no (non-crowd) annotations.
    """

    def __init__(
        self,
        root: str | Path,
        ann_file: str | Path,
        transforms: Transform | None = None,
        input_size: int = 640,
        cache_annotations: bool = True,
        ignore_empty_annotations: bool = True,
    ):
        from pycocotools.coco import COCO

        self.root = Path(root)
        self.transforms = transforms
        self.input_size = input_size

        self.coco = COCO(str(ann_file))
        self.ids = sorted(self.coco.imgs.keys())

        # Build contiguous class mapping using only categories present in annotations.
        # Excludes unused categories (e.g. a "background" id=0 that carries no annotations)
        # so that the resulting label indices are always 0-indexed and contiguous.
        ann_cat_ids = {ann["category_id"] for ann in self.coco.dataset.get("annotations", [])}
        cat_ids = sorted(c for c in self.coco.getCatIds() if c in ann_cat_ids)
        self.cat_id_to_label = {cat_id: i for i, cat_id in enumerate(cat_ids)}

        # Ordered list of class names aligned with label indices 0..N-1
        cats = self.coco.loadCats(cat_ids)
        self.class_names: list[str] = [c["name"] for c in cats]

        # Pre-load all annotation arrays into memory to avoid repeated pycocotools lookups
        self._label_cache: list[np.ndarray] | None = None
        if cache_annotations:
            self._label_cache = [self._parse_anns(img_id) for img_id in self.ids]

        # Drop images with no annotations (background-only samples waste training batches)
        if ignore_empty_annotations:
            labels = self._label_cache if self._label_cache is not None else [self._parse_anns(img_id) for img_id in self.ids]
            keep = [i for i, t in enumerate(labels) if len(t) > 0]
            self.ids = [self.ids[i] for i in keep]
            if self._label_cache is not None:
                self._label_cache = [self._label_cache[i] for i in keep]

    def __len__(self) -> int:
        return len(self.ids)

    def _parse_anns(self, img_id: int) -> np.ndarray:
        """Return ``[N, 5]`` float32 targets for *img_id* (no image needed)."""
        img_info = self.coco.loadImgs(img_id)[0]
        h_img, w_img = img_info["height"], img_info["width"]
        ann_ids = self.coco.getAnnIds(imgIds=img_id)
        anns = self.coco.loadAnns(ann_ids)
        targets = []
        for ann in anns:
            if ann.get("iscrowd", 0):
                continue
            x, y, bw, bh = ann["bbox"]
            cls = self.cat_id_to_label[ann["category_id"]]
            xc = (x + bw / 2) / w_img
            yc = (y + bh / 2) / h_img
            nw = bw / w_img
            nh = bh / h_img
            targets.append([cls, xc, yc, nw, nh])
        return np.array(targets, dtype=np.float32).reshape(-1, 5) if targets else np.zeros((0, 5), dtype=np.float32)

    def load_raw(self, index: int) -> tuple[np.ndarray, np.ndarray]:
        """Load image and targets without transforms."""
        img_id = self.ids[index]
        img_info = self.coco.loadImgs(img_id)[0]
        image = cv2.imread(str(self.root / img_info["file_name"]))

        if self._label_cache is not None:
            targets = self._label_cache[index]
        else:
            targets = self._parse_anns(img_id)

        return image, targets

    def __getitem__(self, index: int) -> tuple[np.ndarray, np.ndarray]:
        image, targets = self.load_raw(index)
        if self.transforms is not None:
            image, targets = self.transforms(image, targets)
        return image, targets

load_raw(index)

Load image and targets without transforms.

Source code in src/modern_yolonas/data/coco.py
def load_raw(self, index: int) -> tuple[np.ndarray, np.ndarray]:
    """Load image and targets without transforms."""
    img_id = self.ids[index]
    img_info = self.coco.loadImgs(img_id)[0]
    image = cv2.imread(str(self.root / img_info["file_name"]))

    if self._label_cache is not None:
        targets = self._label_cache[index]
    else:
        targets = self._parse_anns(img_id)

    return image, targets

modern_yolonas.data.yolo.YOLODetectionDataset

Bases: Dataset

YOLO/Ultralytics-format detection dataset. Expects images under root/images/{split} and one .txt label file per image under root/labels/{split}, each row being class x_center y_center width height (normalized). Class names are read from classes.txt or data.yaml/dataset.yaml at the dataset root when present.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| root | str \| Path | Dataset root directory. | required |
| split | str | Subdirectory under images/labels. | train |
| transforms | callable \| None | (image, targets) → (image, targets) callable. | None |
| input_size | int | Target input size (used by transforms). | 640 |
| cache_annotations | bool | Pre-load all label files into memory. | True |
| ignore_empty_annotations | bool | Drop images with no annotations. | True |

Source code in src/modern_yolonas/data/yolo.py
class YOLODetectionDataset(Dataset):
    """YOLO/Ultralytics-format detection dataset.

    Expects images under ``root/images/{split}`` and one ``.txt`` label file
    per image under ``root/labels/{split}``, each row being
    ``class x_center y_center width height`` (normalized). Class names are
    read from ``classes.txt`` or ``data.yaml``/``dataset.yaml`` at the
    dataset root when present.

    Args:
        root: Dataset root directory.
        split: Subdirectory under ``images``/``labels`` (e.g. ``train``).
        transforms: ``(image, targets) → (image, targets)`` callable.
        input_size: Target input size (used by transforms).
        cache_annotations: Pre-load all label files into memory.
        ignore_empty_annotations: Drop images with no annotations.
    """

    def __init__(
        self,
        root: str | Path,
        split: str = "train",
        transforms=None,
        input_size: int = 640,
        cache_annotations: bool = True,
        ignore_empty_annotations: bool = True,
    ):
        self.root = Path(root)
        self.transforms = transforms
        self.input_size = input_size

        img_dir = self.root / "images" / split
        label_dir = self.root / "labels" / split

        self.images = sorted(img_dir.glob("*.*"))
        self.images = [p for p in self.images if p.suffix.lower() in (".jpg", ".jpeg", ".png", ".bmp")]
        self.label_dir = label_dir

        # Try to load class names from classes.txt or data.yaml at dataset root
        self.class_names: list[str] | None = self._load_class_names()

        # Pre-load all annotation arrays into memory (avoids repeated disk I/O during training)
        self._label_cache: list[np.ndarray] | None = None
        if cache_annotations:
            self._label_cache = [self._read_label(p) for p in self.images]

        # Drop images with no annotations (background-only samples waste training batches)
        if ignore_empty_annotations:
            labels = self._label_cache if self._label_cache is not None else [self._read_label(p) for p in self.images]
            keep = [i for i, t in enumerate(labels) if len(t) > 0]
            self.images = [self.images[i] for i in keep]
            if self._label_cache is not None:
                self._label_cache = [self._label_cache[i] for i in keep]

    def _load_class_names(self) -> list[str] | None:
        """Return class names from ``classes.txt`` or ``data.yaml`` if present."""
        classes_txt = self.root / "classes.txt"
        if classes_txt.exists():
            names = [label.strip() for label in classes_txt.read_text().splitlines() if label.strip()]
            return names if names else None

        for yaml_name in ("data.yaml", "dataset.yaml"):
            yaml_path = self.root / yaml_name
            if yaml_path.exists():
                import yaml
                data = yaml.safe_load(yaml_path.read_text())
                names = data.get("names")
                if isinstance(names, list) and names:
                    return [str(n) for n in names]
                if isinstance(names, dict):
                    return [str(names[k]) for k in sorted(names)]

        return None

    def __len__(self) -> int:
        return len(self.images)

    @property
    def num_classes(self) -> int:
        """Number of classes inferred from cached labels (or scanning all label files)."""
        labels = self._label_cache if self._label_cache is not None else [self._read_label(p) for p in self.images]
        all_cls = np.concatenate([t[:, 0] for t in labels if len(t)]) if any(len(t) for t in labels) else np.array([])
        return int(all_cls.max()) + 1 if len(all_cls) else 0

    def _label_path(self, img_path: Path) -> Path:
        return self.label_dir / (img_path.stem + ".txt")

    def _read_label(self, img_path: Path) -> np.ndarray:
        label_path = self._label_path(img_path)
        if label_path.exists():
            data = np.loadtxt(str(label_path), ndmin=2).reshape(-1, 5)
        else:
            data = np.zeros((0, 5))
        return data.astype(np.float32)

    def load_raw(self, index: int) -> tuple[np.ndarray, np.ndarray]:
        """Load image and targets without transforms."""
        img_path = self.images[index]
        image = cv2.imread(str(img_path))
        targets = self._label_cache[index] if self._label_cache is not None else self._read_label(img_path)
        return image, targets

    def __getitem__(self, index: int) -> tuple[np.ndarray, np.ndarray]:
        image, targets = self.load_raw(index)
        if self.transforms is not None:
            image, targets = self.transforms(image, targets)
        return image, targets

num_classes property

Number of classes inferred from cached labels (or scanning all label files).

load_raw(index)

Load image and targets without transforms.

Source code in src/modern_yolonas/data/yolo.py
def load_raw(self, index: int) -> tuple[np.ndarray, np.ndarray]:
    """Load image and targets without transforms."""
    img_path = self.images[index]
    image = cv2.imread(str(img_path))
    targets = self._label_cache[index] if self._label_cache is not None else self._read_label(img_path)
    return image, targets
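
Example: a minimal usage sketch (the dataset path is illustrative; the images/{split} and labels/{split} layout is described above):

from modern_yolonas.data.yolo import YOLODetectionDataset

dataset = YOLODetectionDataset(root="datasets/my_dataset", split="train")
print(len(dataset), dataset.num_classes, dataset.class_names)
image, targets = dataset[0]  # HWC uint8 BGR image, [N, 5] float32 targets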

Transforms

modern_yolonas.data.transforms

Detection-aware data augmentations.

Each transform operates on (image, targets) where:

- image: HWC uint8 BGR numpy array
- targets: [N, 5] numpy array with [class_id, x_center, y_center, w, h] (normalized)

HSVAugment, HorizontalFlip, RandomAffine, RandomResizedCrop, RandomChannelSwap, and CenterCrop are backed by Albumentations (https://albumentations.ai; MIT license, v2+). Mosaic, Mixup, LetterboxResize, and Normalize use native implementations because they have no direct Albumentations equivalent.
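
A sketch of a typical training pipeline built from these transforms (hyperparameters and dataset paths are illustrative; the ordering follows the Mixup placement rule documented below):

from modern_yolonas.data.coco import COCODetectionDataset
from modern_yolonas.data.transforms import (
    Compose, HSVAugment, HorizontalFlip, LetterboxResize, Mixup, Normalize,
)

dataset = COCODetectionDataset(
    root="coco/images/train2017",
    ann_file="coco/annotations/instances_train2017.json",
)
dataset.transforms = Compose([
    HSVAugment(p=0.5),
    HorizontalFlip(p=0.5),
    LetterboxResize(target_size=640),
    Mixup(dataset, p=0.5),  # after LetterboxResize, before Normalize
    Normalize(),
])
image, targets = dataset[0]  # CHW float32 RGB in [0, 1], [N, 5] targets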

Compose

Chain multiple transforms sequentially.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| transforms | list | List of callables, each accepting (image, targets) and returning (image, targets). | required |
Source code in src/modern_yolonas/data/transforms.py
class Compose:
    """Chain multiple transforms sequentially.

    Args:
        transforms: List of callables, each accepting ``(image, targets)``
            and returning ``(image, targets)``.
    """

    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, image: np.ndarray, targets: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        for t in self.transforms:
            image, targets = t(image, targets)
        return image, targets

HSVAugment

Randomly adjust hue, saturation, and value via Albumentations.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| hgain | int | Max hue shift in degrees (Albumentations hue_shift_limit). Matches the super-gradients hgain recipe param. | 18 |
| sgain | int | Max saturation shift in absolute units (sat_shift_limit). | 30 |
| vgain | int | Max value shift in absolute units (val_shift_limit). | 30 |
| p | float | Probability of applying the transform. | 0.5 |
Source code in src/modern_yolonas/data/transforms.py
class HSVAugment:
    """Randomly adjust hue, saturation, and value via Albumentations.

    Args:
        hgain: Max hue shift in degrees (Albumentations ``hue_shift_limit``).
              Matches the super-gradients ``hgain`` recipe param. Default: 18.
        sgain: Max saturation shift in absolute units (``sat_shift_limit``).
              Default: 30.
        vgain: Max value shift in absolute units (``val_shift_limit``).
              Default: 30.
        p: Probability of applying the transform.
    """

    def __init__(self, hgain: int = 18, sgain: int = 30, vgain: int = 30, p: float = 0.5):
        self._aug = A.HueSaturationValue(
            hue_shift_limit=hgain,
            sat_shift_limit=sgain,
            val_shift_limit=vgain,
            p=p,
        )

    def __call__(self, image: np.ndarray, targets: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        # Albumentations expects RGB; our pipeline carries BGR images
        rgb = image[:, :, ::-1].copy()
        result = self._aug(image=rgb)
        return result["image"][:, :, ::-1].copy(), targets

HorizontalFlip

Randomly flip the image and bounding boxes horizontally via Albumentations.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| p | float | Probability of applying the flip. | 0.5 |
Source code in src/modern_yolonas/data/transforms.py
class HorizontalFlip:
    """Randomly flip the image and bounding boxes horizontally via Albumentations.

    Args:
        p: Probability of applying the flip.
    """

    def __init__(self, p: float = 0.5):
        self.p = p
        self._aug = A.Compose([A.HorizontalFlip(p=1.0)], bbox_params=_BBOX_PARAMS)

    def __call__(self, image: np.ndarray, targets: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        if random.random() >= self.p:
            return image, targets
        bboxes, labels = _to_albu(targets)
        r = self._aug(image=image, bboxes=bboxes, class_labels=labels)
        return r["image"], _from_albu(r["bboxes"], r["class_labels"], targets.dtype)

RandomAffine

Apply random rotation, scale, translation, and shear via Albumentations.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| degrees | float | Maximum rotation in degrees. | 0.0 |
| translate | float | Maximum translation as a fraction of image size. | 0.25 |
| scale | tuple[float, float] | Scale range (min, max). | (0.5, 1.5) |
| shear | float | Maximum shear in degrees. | 0.0 |
Source code in src/modern_yolonas/data/transforms.py
class RandomAffine:
    """Apply random rotation, scale, translation, and shear via Albumentations.

    Args:
        degrees: Maximum rotation in degrees.
        translate: Maximum translation as a fraction of image size.
        scale: Scale range ``(min, max)``.
        shear: Maximum shear in degrees.
    """

    def __init__(
        self,
        degrees: float = 0.0,
        translate: float = 0.25,
        scale: tuple[float, float] = (0.5, 1.5),
        shear: float = 0.0,
    ):
        self._aug = A.Compose(
            [
                A.Affine(
                    scale=scale,
                    translate_percent={"x": (-translate, translate), "y": (-translate, translate)},
                    rotate=(-degrees, degrees),
                    shear=(-shear, shear),
                    border_mode=cv2.BORDER_CONSTANT,
                    fill=114,
                    p=1.0,
                )
            ],
            bbox_params=_BBOX_PARAMS,
        )

    def __call__(self, image: np.ndarray, targets: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        bboxes, labels = _to_albu(targets)
        r = self._aug(image=image, bboxes=bboxes, class_labels=labels)
        return r["image"], _from_albu(r["bboxes"], r["class_labels"], targets.dtype)

RandomResizedCrop

Randomly crop a region of the image and resize it to size via Albumentations.

Mirrors torchvision.transforms.RandomResizedCrop but is bounding-box aware. Boxes whose width or height falls below the min_width / min_height pixel thresholds after cropping are automatically discarded by _BBOX_PARAMS.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| size | int | Output square side length in pixels. | 640 |
| scale | tuple[float, float] | Range of fraction of the original image area to crop; matches the torchvision default. | (0.08, 1.0) |
| ratio | tuple[float, float] | Range of aspect ratio of the crop; matches the torchvision default. | (0.75, 1.333) |
| interpolation | int | OpenCV interpolation flag. | cv2.INTER_LINEAR |
| p | float | Probability of applying the transform. | 1.0 |
Source code in src/modern_yolonas/data/transforms.py
class RandomResizedCrop:
    """Randomly crop a region of the image and resize it to ``size`` via Albumentations.

    Mirrors ``torchvision.transforms.RandomResizedCrop`` but is bounding-box
    aware.  Boxes whose width or height falls below the ``min_width`` /
    ``min_height`` pixel thresholds after cropping are automatically
    discarded by ``_BBOX_PARAMS``.

    Args:
        size: Output square side length in pixels.
        scale: Range of fraction of the original image area to crop.
              Default ``(0.08, 1.0)`` matches the torchvision default.
        ratio: Range of aspect ratio of the crop.
              Default ``(0.75, 1.333)`` matches the torchvision default.
        interpolation: OpenCV interpolation flag (default ``cv2.INTER_LINEAR``).
        p: Probability of applying the transform.
    """

    def __init__(
        self,
        size: int = 640,
        scale: tuple[float, float] = (0.08, 1.0),
        ratio: tuple[float, float] = (0.75, 1.333),
        interpolation: int = cv2.INTER_LINEAR,
        p: float = 1.0,
    ):
        self._aug = A.Compose(
            [
                A.RandomResizedCrop(
                    size=(size, size),
                    scale=scale,
                    ratio=ratio,
                    interpolation=interpolation,
                    p=1.0,
                )
            ],
            bbox_params=_BBOX_PARAMS,
        )
        self.p = p

    def __call__(self, image: np.ndarray, targets: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        if random.random() >= self.p:
            return image, targets
        bboxes, labels = _to_albu(targets)
        r = self._aug(image=image, bboxes=bboxes, class_labels=labels)
        return r["image"], _from_albu(r["bboxes"], r["class_labels"], targets.dtype)

RandomChannelSwap

Randomly permute the image channel order (e.g., BGR ↔ RGB) via Albumentations ChannelShuffle.

Adds photometric variety without touching bounding boxes.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| p | float | Probability of swapping channels. | 0.5 |
Source code in src/modern_yolonas/data/transforms.py
class RandomChannelSwap:
    """Randomly swap BGR channel order to RGB (and vice-versa) via Albumentations.

    Adds photometric variety without touching bounding boxes.

    Args:
        p: Probability of swapping channels.
    """

    def __init__(self, p: float = 0.5):
        self._aug = A.ChannelShuffle(p=p)

    def __call__(self, image: np.ndarray, targets: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        return self._aug(image=image)["image"], targets

CenterCrop

Crop the center of the image to size × size pixels.

Bounding boxes that fall outside the cropped region are discarded; those that overlap are clipped to the new canvas by _BBOX_PARAMS.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| size | int | Output square side length in pixels. | 640 |
Source code in src/modern_yolonas/data/transforms.py
class CenterCrop:
    """Crop the center of the image to ``size`` × ``size`` pixels.

    Bounding boxes that fall outside the cropped region are discarded;
    those that overlap are clipped to the new canvas by ``_BBOX_PARAMS``.

    Args:
        size: Output square side length in pixels.
    """

    def __init__(self, size: int = 640):
        self._aug = A.Compose(
            [A.CenterCrop(height=size, width=size, p=1.0)],
            bbox_params=_BBOX_PARAMS,
        )

    def __call__(self, image: np.ndarray, targets: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        bboxes, labels = _to_albu(targets)
        r = self._aug(image=image, bboxes=bboxes, class_labels=labels)
        return r["image"], _from_albu(r["bboxes"], r["class_labels"], targets.dtype)

Mosaic

4-image mosaic augmentation. Stitches the sample at index together with three randomly chosen samples onto a 2s × 2s canvas (s = input_size), then crops a random s × s window. Unlike the other transforms, it is called with a dataset index rather than an (image, targets) pair.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| dataset | BaseDetectionDataset | Dataset exposing a load_raw(index) method. | required |
| input_size | int | Output square side length in pixels. | 640 |

Source code in src/modern_yolonas/data/transforms.py
class Mosaic:
    """4-image mosaic augmentation.

    Stitches the sample at ``index`` together with three randomly chosen
    samples onto a ``2s × 2s`` canvas (``s = input_size``), then crops a
    random ``s × s`` window. Called with a dataset index rather than an
    ``(image, targets)`` pair.

    Args:
        dataset: Dataset exposing a ``load_raw(index)`` method.
        input_size: Output square side length in pixels.
    """

    def __init__(self, dataset: BaseDetectionDataset, input_size: int = 640):
        self.dataset = dataset
        self.input_size = input_size

    def __call__(self, index: int) -> tuple[np.ndarray, np.ndarray]:
        s = self.input_size
        yc, xc = (int(random.uniform(s * 0.5, s * 1.5)) for _ in range(2))

        indices = [index] + [random.randint(0, len(self.dataset) - 1) for _ in range(3)]
        mosaic_img = np.full((s * 2, s * 2, 3), 114, dtype=np.uint8)
        all_targets = []

        for i, idx in enumerate(indices):
            img, targets = self.dataset.load_raw(idx)
            h, w = img.shape[:2]

            if i == 0:
                x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc
                x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h
            elif i == 1:
                x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc
                x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
            elif i == 2:
                x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h)
                x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)
            else:
                x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h)
                x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)

            mosaic_img[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b]
            pad_w = x1a - x1b
            pad_h = y1a - y1b

            if len(targets):
                targets = targets.copy()
                # Convert to pixel coords, offset, then back to normalized
                targets[:, 1] = (targets[:, 1] * w + pad_w) / (s * 2)
                targets[:, 2] = (targets[:, 2] * h + pad_h) / (s * 2)
                targets[:, 3] = targets[:, 3] * w / (s * 2)
                targets[:, 4] = targets[:, 4] * h / (s * 2)
                all_targets.append(targets)

        targets = np.concatenate(all_targets, 0) if all_targets else np.zeros((0, 5))

        # Crop to input_size
        crop_x = int(random.uniform(0, s))
        crop_y = int(random.uniform(0, s))
        mosaic_img = mosaic_img[crop_y : crop_y + s, crop_x : crop_x + s]

        if len(targets):
            targets = targets.copy()
            # Renormalize from the 2s canvas to the s x s crop
            targets[:, 1] = targets[:, 1] * 2 - crop_x / s
            targets[:, 2] = targets[:, 2] * 2 - crop_y / s
            targets[:, 3] = targets[:, 3] * 2
            targets[:, 4] = targets[:, 4] * 2

            # Filter out-of-bounds and degenerate boxes
            valid = (
                (targets[:, 1] > 0) & (targets[:, 1] < 1)
                & (targets[:, 2] > 0) & (targets[:, 2] < 1)
                & (targets[:, 3] > 0.002) & (targets[:, 4] > 0.002)
            )
            targets = targets[valid]

        return mosaic_img, targets
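
Because Mosaic takes a dataset index rather than an (image, targets) pair, it cannot be placed inside Compose; it is typically applied at the dataset level, before the per-image pipeline. A minimal sketch (the probability gate below is illustrative, not part of the library):

import random

mosaic = Mosaic(dataset, input_size=640)  # dataset from the sections above
index = 0                                 # sample index, e.g. from __getitem__
if random.random() < 0.5:
    image, targets = mosaic(index)        # stitched s x s BGR image, merged targets
else:
    image, targets = dataset.load_raw(index)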

Mixup

Mixup augmentation for detection.

Should be placed in the pipeline after LetterboxResize and before Normalize, so both images are already square uint8.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| dataset | BaseDetectionDataset | Dataset exposing a load_raw(index) method. | required |
| p | float | Per-sample probability of applying mixup. | 0.5 |
| alpha | float | Beta distribution alpha parameter. | 1.5 |
| beta | float | Beta distribution beta parameter. | 1.5 |
Source code in src/modern_yolonas/data/transforms.py
class Mixup:
    """Mixup augmentation for detection.

    Should be placed in the pipeline **after** ``LetterboxResize`` and
    **before** ``Normalize``, so both images are already square uint8.

    Args:
        dataset: Dataset exposing a ``load_raw(index)`` method.
        p: Per-sample probability of applying mixup.
        alpha: Beta distribution ``alpha`` parameter.
        beta: Beta distribution ``beta`` parameter.
    """

    def __init__(self, dataset: BaseDetectionDataset, p: float = 0.5, alpha: float = 1.5, beta: float = 1.5):
        self.dataset = dataset
        self.p = p
        self.alpha = alpha
        self.beta = beta

    def __call__(self, image: np.ndarray, targets: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        if random.random() >= self.p:
            return image, targets

        idx2 = random.randint(0, len(self.dataset) - 1)
        img2, targets2 = self.dataset.load_raw(idx2)

        # Letterbox the second image to match the (already resized) first image
        target_h, target_w = image.shape[:2]
        h2, w2 = img2.shape[:2]
        if (h2, w2) != (target_h, target_w):
            scale = min(target_h / h2, target_w / w2)
            new_h, new_w = int(round(h2 * scale)), int(round(w2 * scale))
            img2 = cv2.resize(img2, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
            top  = (target_h - new_h) // 2
            left = (target_w - new_w) // 2
            padded = np.full((target_h, target_w, 3), 114, dtype=np.uint8)
            padded[top:top + new_h, left:left + new_w] = img2
            img2 = padded
            if len(targets2):
                targets2 = targets2.copy()
                targets2[:, 1] = (targets2[:, 1] * new_w + left) / target_w
                targets2[:, 2] = (targets2[:, 2] * new_h + top)  / target_h
                targets2[:, 3] = targets2[:, 3] * new_w / target_w
                targets2[:, 4] = targets2[:, 4] * new_h / target_h

        r = np.random.beta(self.alpha, self.beta)
        mixed = (image.astype(np.float32) * r + img2.astype(np.float32) * (1 - r)).astype(np.uint8)

        if len(targets) and len(targets2):
            combined = np.concatenate([targets, targets2], 0)
        elif len(targets):
            combined = targets
        else:
            combined = targets2

        return mixed, combined
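
A minimal standalone sketch (assumes dataset is one of the datasets above; p=1.0 forces the blend for demonstration):

letterbox = LetterboxResize(target_size=640)
mixup = Mixup(dataset, p=1.0)
image, targets = letterbox(*dataset.load_raw(0))
mixed, combined = mixup(image, targets)  # blended uint8 image, concatenated targets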

LetterboxResize

Resize with aspect ratio preservation and center padding.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| target_size | int | Output square dimension. | 640 |
| pad_value | int | Pixel value for padding (YOLO convention). | 114 |
Source code in src/modern_yolonas/data/transforms.py
class LetterboxResize:
    """Resize with aspect ratio preservation and center padding.

    Args:
        target_size: Output square dimension.
        pad_value: Pixel value for padding (default 114, matching YOLO convention).
    """

    def __init__(self, target_size: int = 640, pad_value: int = 114):
        self.target_size = target_size
        self.pad_value = pad_value

    def __call__(self, image: np.ndarray, targets: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        h, w = image.shape[:2]
        scale = self.target_size / max(h, w)
        new_h, new_w = int(round(h * scale)), int(round(w * scale))
        image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

        pad_h = self.target_size - new_h
        pad_w = self.target_size - new_w
        top = pad_h // 2
        left = pad_w // 2

        padded = np.full((self.target_size, self.target_size, 3), self.pad_value, dtype=np.uint8)
        padded[top : top + new_h, left : left + new_w] = image

        if len(targets):
            targets = targets.copy()
            # Adjust for padding (targets are normalized)
            targets[:, 1] = (targets[:, 1] * new_w + left) / self.target_size
            targets[:, 2] = (targets[:, 2] * new_h + top) / self.target_size
            targets[:, 3] = targets[:, 3] * new_w / self.target_size
            targets[:, 4] = targets[:, 4] * new_h / self.target_size

        return padded, targets
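
A quick numeric check of the coordinate math (the input values are illustrative):

import numpy as np

lb = LetterboxResize(target_size=640)
image = np.zeros((720, 1280, 3), dtype=np.uint8)           # a 720p frame
targets = np.array([[0, 0.5, 0.5, 0.5, 0.5]], np.float32)  # one centered box
out, tgts = lb(image, targets)
# scale = 640 / 1280 = 0.5 -> resized to 640 x 360, padded 140 px top/bottom
# x_center and width are unchanged; height becomes 0.5 * 360 / 640 = 0.28125
print(out.shape, tgts)  # (640, 640, 3) [[0. 0.5 0.5 0.5 0.28125]]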

Normalize

Convert HWC uint8 BGR to CHW float32 RGB in [0, 1].

Source code in src/modern_yolonas/data/transforms.py
class Normalize:
    """Convert HWC uint8 to CHW float32 [0,1] tensor."""

    def __call__(self, image: np.ndarray, targets: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        image = image[:, :, ::-1].copy()  # BGR → RGB
        image = image.transpose(2, 0, 1).astype(np.float32) / 255.0
        return image, targets
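
After Normalize the image is a CHW float32 numpy array, ready to be stacked into a batch; the torch conversion below is a sketch, not part of the transform:

import numpy as np
import torch

img = np.random.randint(0, 256, (640, 640, 3), dtype=np.uint8)  # BGR frame
chw, targets = Normalize()(img, np.zeros((0, 5), dtype=np.float32))
batch = torch.from_numpy(chw).unsqueeze(0)  # [1, 3, 640, 640], RGB in [0, 1]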