Source code for torchvision_customizer.blocks.conv_block

"""Convolutional block implementation for building CNN architectures."""

from typing import Dict, Literal, Optional, Tuple

import torch
import torch.nn as nn
from torch import Tensor

from torchvision_customizer.layers import get_activation



[docs]
class ConvBlock(nn.Module):
    """A configurable convolutional building block for neural networks.

    This block combines a convolutional layer with optional batch normalization,
    activation functions, dropout, and pooling operations. It serves as a
    fundamental building unit for constructing custom CNN architectures.

    Attributes:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int | Tuple[int, int]): Size of the convolutional kernel.
        stride (int | Tuple[int, int]): Stride of the convolutional layer.
        padding (int | Tuple[int, int]): Padding of the convolutional layer.
        activation (str | None): Type of activation function to use.
        use_batchnorm (bool): Whether to use batch normalization.
        dropout_rate (float): Dropout probability (0 to 1).
        pooling_type (str | None): Type of pooling operation.
        pooling_kernel_size (int | Tuple[int, int]): Kernel size for pooling.
        pooling_stride (int | Tuple[int, int]): Stride for pooling.

    Example:
        >>> # Create a basic convolutional block
        >>> block = ConvBlock(
        ...     in_channels=3,
        ...     out_channels=64,
        ...     kernel_size=3,
        ...     activation='relu',
        ...     use_batchnorm=True,
        ...     dropout_rate=0.1,
        ...     pooling_type='max'
        ... )
        >>> input_tensor = torch.randn(4, 3, 224, 224)
        >>> output = block(input_tensor)
        >>> print(output.shape)
        torch.Size([4, 64, 111, 111])
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int | Tuple[int, int] = 3,
        stride: int | Tuple[int, int] = 1,
        padding: int | Tuple[int, int] = 1,
        activation: Literal["relu", "leaky_relu", "gelu", "elu", "selu", "sigmoid", "tanh", "prelu", "silu"] | None = "relu",
        use_batchnorm: bool = True,
        dropout_rate: float = 0.0,
        pooling_type: Literal["max", "avg", "adaptive_avg"] | None = None,
        pooling_kernel_size: int | Tuple[int, int] = 2,
        pooling_stride: int | Tuple[int, int] = 2,
        dilation: int | Tuple[int, int] = 1,
        groups: int = 1,
        bias: bool = True,
    ) -> None:
        """Initialize a ConvBlock.

        Args:
            in_channels: Number of channels in the input image. Must be positive.
            out_channels: Number of channels produced by the convolution. Must be positive.
            kernel_size: Size of the convolutional kernel. Default is 3.
                Can be int (square kernel) or tuple (height, width).
            stride: Stride of the convolution. Default is 1.
                Can be int or tuple for different dimensions.
            padding: Padding added to input. Default is 1.
                Can be int or tuple for different dimensions.
            activation: Type of activation function to apply after convolution.
                Options: 'relu', 'leaky_relu', 'gelu', 'elu', 'selu', 'sigmoid', 
                'tanh', 'prelu', 'silu'.
                If None, no activation is applied. Default is 'relu'.
            use_batchnorm: Whether to apply batch normalization after convolution.
                Default is True.
            dropout_rate: Dropout probability (between 0 and 1). Default is 0.0 (no dropout).
            pooling_type: Type of pooling to apply after the block.
                Options: 'max', 'avg', 'adaptive_avg'.
                If None, no pooling is applied. Default is None.
            pooling_kernel_size: Kernel size for pooling operation. Default is 2.
            pooling_stride: Stride for pooling operation. Default is 2.
            dilation: Spacing between kernel elements. Default is 1.
            groups: Number of groups for grouped convolution. Default is 1.
            bias: Whether to use bias in convolutional layer. Default is True.
                Usually set to False when using batch normalization.

        Raises:
            ValueError: If any parameter is invalid (e.g., negative channels, invalid activation).
        """
        super().__init__()

        # Validate inputs
        if in_channels <= 0:
            raise ValueError(f"in_channels must be positive, got {in_channels}")
        if out_channels <= 0:
            raise ValueError(f"out_channels must be positive, got {out_channels}")
        if not 0.0 <= dropout_rate <= 1.0:
            raise ValueError(f"dropout_rate must be between 0 and 1, got {dropout_rate}")

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
        self.stride = stride if isinstance(stride, tuple) else (stride, stride)
        self.padding = padding if isinstance(padding, tuple) else (padding, padding)
        self.activation_type = activation
        self.use_batchnorm = use_batchnorm
        self.dropout_rate = dropout_rate
        self.pooling_type = pooling_type
        self.pooling_kernel_size = pooling_kernel_size if isinstance(pooling_kernel_size, tuple) else (pooling_kernel_size, pooling_kernel_size)
        self.pooling_stride = pooling_stride if isinstance(pooling_stride, tuple) else (pooling_stride, pooling_stride)

        # Build convolutional layer
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=self.padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )

        # Build batch normalization layer
        self.bn: nn.Module | None = None
        if use_batchnorm:
            self.bn = nn.BatchNorm2d(out_channels)

        # Build activation layer
        self.activation: nn.Module | None = None
        if activation is not None:
            self.activation = self._get_activation(activation)

        # Build dropout layer
        self.dropout: nn.Module | None = None
        if dropout_rate > 0.0:
            self.dropout = nn.Dropout2d(p=dropout_rate)

        # Build pooling layer
        self.pooling: nn.Module | None = None
        if pooling_type is not None:
            self.pooling = self._get_pooling(
                pooling_type,
                self.pooling_kernel_size,
                self.pooling_stride,
            )

    def _get_activation(self, activation: str) -> nn.Module:
        """Get activation function module by name.

        Uses the activation factory from torchvision_customizer.layers
        to create the activation function.

        Args:
            activation: Name of the activation function.
                Supported: 'relu', 'leaky_relu', 'gelu', 'elu', 'selu',
                'sigmoid', 'tanh', 'prelu', 'silu'

        Returns:
            Activation function module.

        Raises:
            ValueError: If activation name is not recognized.
        """
        try:
            return get_activation(activation)
        except ValueError as e:
            raise ValueError(
                f"Unknown activation function: {activation}. {str(e)}"
            ) from e

    def _get_pooling(
        self,
        pooling_type: str,
        kernel_size: Tuple[int, int],
        stride: Tuple[int, int],
    ) -> nn.Module:
        """Get pooling layer module by type.

        Args:
            pooling_type: Type of pooling ('max', 'avg', 'adaptive_avg').
            kernel_size: Kernel size for pooling.
            stride: Stride for pooling.

        Returns:
            Pooling layer module.

        Raises:
            ValueError: If pooling type is not recognized.
        """
        if pooling_type == "max":
            return nn.MaxPool2d(kernel_size=kernel_size, stride=stride)
        elif pooling_type == "avg":
            return nn.AvgPool2d(kernel_size=kernel_size, stride=stride)
        elif pooling_type == "adaptive_avg":
            return nn.AdaptiveAvgPool2d(output_size=kernel_size)
        else:
            raise ValueError(
                f"Unknown pooling type: {pooling_type}. "
                f"Available options: ['max', 'avg', 'adaptive_avg']"
            )


[docs]
    def forward(self, x: Tensor) -> Tensor:
        """Forward pass through the convolutional block.

        Args:
            x: Input tensor of shape (batch_size, in_channels, height, width).

        Returns:
            Output tensor of shape (batch_size, out_channels, height', width'),
            where height' and width' depend on kernel_size, stride, padding,
            and pooling operations.
        """
        # Convolutional layer
        x = self.conv(x)

        # Batch normalization
        if self.bn is not None:
            x = self.bn(x)

        # Activation function
        if self.activation is not None:
            x = self.activation(x)

        # Dropout
        if self.dropout is not None:
            x = self.dropout(x)

        # Pooling
        if self.pooling is not None:
            x = self.pooling(x)

        return x


    @property
    def get_output_channels(self) -> int:
        """Get the number of output channels.

        Returns:
            Number of output channels produced by this block.
        """
        return self.out_channels


[docs]
    def calculate_output_shape(
        self,
        input_height: int,
        input_width: int,
    ) -> Tuple[int, int]:
        """Calculate output spatial dimensions for given input size.

        This method computes the output height and width after applying
        convolution, and if applicable, pooling operations.

        Args:
            input_height: Height of the input feature map.
            input_width: Width of the input feature map.

        Returns:
            Tuple of (output_height, output_width).

        Example:
            >>> block = ConvBlock(3, 64, kernel_size=3, padding=1, stride=1, pooling_type='max')
            >>> height, width = block.calculate_output_shape(224, 224)
            >>> print(height, width)
            112 112
        """
        # Calculate output size after convolution
        # Formula: out = (in + 2*padding - dilation*(kernel_size-1) - 1) / stride + 1
        conv_height = (
            input_height
            + 2 * self.padding[0]
            - (self.kernel_size[0] - 1)
            - 1
        ) // self.stride[0] + 1
        conv_width = (
            input_width
            + 2 * self.padding[1]
            - (self.kernel_size[1] - 1)
            - 1
        ) // self.stride[1] + 1

        # Calculate output size after pooling (if applicable)
        if self.pooling is not None:
            if self.pooling_type == "adaptive_avg":
                # Adaptive pooling outputs fixed size
                pool_height, pool_width = self.pooling_kernel_size
            else:
                # Regular pooling calculation
                pool_height = (conv_height - self.pooling_kernel_size[0]) // self.pooling_stride[0] + 1
                pool_width = (conv_width - self.pooling_kernel_size[1]) // self.pooling_stride[1] + 1
            return pool_height, pool_width

        return conv_height, conv_width


    def __repr__(self) -> str:
        """String representation of the ConvBlock."""
        config = (
            f"ConvBlock("
            f"in_channels={self.in_channels}, "
            f"out_channels={self.out_channels}, "
            f"kernel_size={self.kernel_size}, "
            f"stride={self.stride}, "
            f"activation={self.activation_type}, "
            f"use_batchnorm={self.use_batchnorm}, "
            f"dropout_rate={self.dropout_rate}, "
            f"pooling_type={self.pooling_type}"
            f")"
        )
        return config