import math
from typing import Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from .. import base
__all__ = ["Dropout", "Dropout2d", "DropConnectLinear"]
# [docs]
class Dropout(base.MemoryModule):
    def __init__(self, p=0.5, step_mode="s"):
        r"""
        .. _Dropout.__init__-cn:
        .. _Dropout.__init__-en:

        Dropout whose zeroed positions stay fixed during one simulation.

        This layer is almost the same as ``torch.nn.Dropout``. The difference
        is that the dropout mask is sampled once, at the first training
        forward pass, and is then reused for every following step. According
        to the original documentation of this layer, a new mask is only drawn
        after ``reset()`` has been called and a new simulation is started.

        .. admonition:: Tip
            :class: tip

            This kind of Dropout is first described in `Enabling Spike-based Backpropagation for Training Deep Neural
            Network Architectures <https://arxiv.org/abs/1903.06379>`_:

            There is a subtle difference in the way dropout is applied in SNNs compared to ANNs. In ANNs, each epoch of
            training has several iterations of mini-batches. In each iteration, randomly selected units (with dropout ratio of :math:`p`)
            are disconnected from the network while weighting by its posterior probability (:math:`1-p`). However, in SNNs, each
            iteration has more than one forward propagation depending on the time length of the spike train. We back-propagate
            the output error and modify the network parameters only at the last time step. For dropout to be effective in
            our training method, it has to be ensured that the set of connected units within an iteration of mini-batch
            data is not changed, such that the neural network is constituted by the same random subset of units during
            each forward propagation within a single iteration. On the other hand, if the units are randomly connected at
            each time-step, the effect of dropout will be averaged out over the entire forward propagation time within an
            iteration. Then, the dropout effect would fade-out once the output error is propagated backward and the parameters
            are updated at the last time step. Therefore, we need to keep the set of randomly connected units for the entire
            time window within an iteration.

        :param p: probability of an element to be zeroed; must satisfy ``0 <= p < 1``
        :type p: float
        :param step_mode: the step mode, which can be `s` (single-step) or `m` (multi-step)
        :type step_mode: str
        :return: None
        :rtype: None
        """
        super().__init__()
        self.step_mode = step_mode
        assert 0 <= p < 1
        # The mask is a registered memory that starts out as ``None``; it is
        # filled in lazily by the first training forward pass.
        self.register_memory("mask", None)
        self.p = p

    def extra_repr(self):
        """Return the extra text shown in the module's ``repr``."""
        return f"p={self.p}"

    def create_mask(self, x: Tensor):
        """Sample a new dropout mask shaped like ``x`` and store it in ``self.mask``.

        The mask is obtained by applying ``F.dropout`` (always in training
        mode) to a tensor of ones, so kept positions carry the rescaling
        factor that ``F.dropout`` applies and dropped positions are zero.

        :param x: tensor whose shape, dtype and device the mask copies
        :type x: Tensor
        """
        ones = torch.ones_like(x.data)
        self.mask = F.dropout(ones, self.p, training=True)

    def single_step_forward(self, x: Tensor):
        """Apply the stored mask to one time step.

        In evaluation mode the input is returned unchanged. In training mode
        the mask is created on first use and then multiplied with ``x``.

        :param x: input of a single time step
        :type x: Tensor
        :return: the masked input (training) or ``x`` itself (evaluation)
        :rtype: Tensor
        """
        if not self.training:
            return x
        if self.mask is None:
            self.create_mask(x)
        return x * self.mask

    def multi_step_forward(self, x_seq: Tensor):
        """Apply the stored mask to a whole sequence.

        The mask is created from the first element ``x_seq[0]`` and is
        broadcast over the leading dimension of ``x_seq``, so every step of
        the sequence shares the same mask.

        :param x_seq: input sequence, indexed by time step along dimension 0
        :type x_seq: Tensor
        :return: the masked sequence (training) or ``x_seq`` itself (evaluation)
        :rtype: Tensor
        """
        if not self.training:
            return x_seq
        if self.mask is None:
            self.create_mask(x_seq[0])
        return x_seq * self.mask
# [docs]
class Dropout2d(Dropout):
    def __init__(self, p=0.2, step_mode="s"):
        r"""
        .. _Dropout2d.__init__-cn:
        .. _Dropout2d.__init__-en:

        Channel-wise dropout whose mask stays fixed during one simulation.

        This layer is almost the same as ``torch.nn.Dropout2d``. The
        difference is that the mask is sampled at the first training forward
        pass and then reused for the following steps. According to the
        original documentation of this layer, a new mask is only drawn after
        ``reset()`` has been called and a new simulation is started.

        For more information about Dropout in SNN, refer to :ref:`layer.Dropout <Dropout.__init__-en>`.

        :param p: probability of an element to be zeroed
        :type p: float
        :param step_mode: the step mode, which can be `s` (single-step) or `m` (multi-step)
        :type step_mode: str
        :return: None
        :rtype: None
        """
        super().__init__(p, step_mode)

    def create_mask(self, x: Tensor):
        """Sample a new mask shaped like ``x`` with ``F.dropout2d`` and store it.

        Overrides ``Dropout.create_mask``; only the sampling function differs.

        :param x: tensor whose shape, dtype and device the mask copies
        :type x: Tensor
        """
        ones = torch.ones_like(x.data)
        self.mask = F.dropout2d(ones, self.p, training=True)
# [docs]
class DropConnectLinear(base.MemoryModule):
    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        p: float = 0.5,
        samples_num: int = 1024,
        invariant: bool = False,
        # NOTE: this default instance is created once, when the class is
        # defined, and is shared by every layer that relies on the default.
        activation: Optional[nn.Module] = nn.ReLU(),
        step_mode="s",
    ) -> None:
        r"""
        .. _DropConnectLinear.__init__-cn:
        .. _DropConnectLinear.__init__-en:

        Linear layer with DropConnect regularization.

        DropConnect, which is proposed by `Regularization of Neural Networks using DropConnect <http://proceedings.mlr.press/v28/wan13.pdf>`_,
        is similar to Dropout, but it drops the connections of a linear layer
        with probability ``p`` rather than zeroing elements of the input
        tensor.

        .. admonition:: Note
            :class: note

            During inference, every element of the output tensor is sampled
            from a Gaussian distribution, passed through the activation layer
            and averaged over the number of samples ``samples_num``.
            See `Algorithm 2` in `Regularization of Neural Networks using DropConnect <http://proceedings.mlr.press/v28/wan13.pdf>`_
            for more details. Because the activation is applied in the middle
            of this procedure, ``activation`` is a member of this module.

        :param in_features: size of each input sample
        :type in_features: int
        :param out_features: size of each output sample
        :type out_features: int
        :param bias: If set to ``False``, the layer will not learn an additive bias. Default: ``True``
        :type bias: bool
        :param p: probability of a connection to be zeroed. Default: 0.5
        :type p: float
        :param samples_num: number of samples drawn from the Gaussian during inference. Default: 1024
        :type samples_num: int
        :param invariant: If set to ``True``, the connections are dropped at the first forward pass and the dropped
            connections remain unchanged until ``reset()`` is called, which restores the fully-connected status.
            The connections are then re-dropped at the first forward pass after ``reset()``. If set to
            ``False``, the connections are re-dropped at every forward pass. See :ref:`layer.Dropout <Dropout.__init__-en>`
            for more information to understand this parameter. Default: ``False``
        :type invariant: bool
        :param activation: the activation layer after the linear layer, or ``None`` for no activation
        :type activation: Optional[nn.Module]
        :param step_mode: the step mode, which can be `s` (single-step) or `m` (multi-step)
        :type step_mode: str
        :return: None
        :rtype: None
        """
        super().__init__()
        self.step_mode = step_mode

        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(Tensor(out_features, in_features))
        if bias:
            self.bias = nn.Parameter(Tensor(out_features))
        else:
            self.register_parameter("bias", None)
        self.reset_parameters()

        self.p = p  # probability of zeroing a connection
        # Per-sample dropped copies of the parameters; ``None`` means
        # "not dropped yet".
        self.register_memory("dropped_w", None)
        if self.bias is not None:
            self.register_memory("dropped_b", None)

        self.samples_num = samples_num
        self.invariant = invariant
        self.activation = activation

    def reset_parameters(self) -> None:
        r"""
        .. _DropConnectLinear.reset_parameters-cn:
        .. _DropConnectLinear.reset_parameters-en:

        Initialize the learnable parameters of this module.

        The weight is initialized with ``kaiming_uniform_(a=sqrt(5))``; the
        bias, if present, is drawn uniformly from
        ``[-1/sqrt(fan_in), 1/sqrt(fan_in)]``.

        :return: None
        :rtype: None
        """
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            nn.init.uniform_(self.bias, -bound, bound)

    def reset(self):
        r"""
        .. _DropConnectLinear.reset-cn:
        .. _DropConnectLinear.reset-en:

        Reset the linear layer to fully-connected status. If
        ``self.activation`` also has a ``reset`` method, it is called as well.

        :return: None
        :rtype: None
        """
        super().reset()
        if hasattr(self.activation, "reset"):
            self.activation.reset()

    def drop(self, batch_size: int):
        """Sample new connection masks and store the dropped parameters.

        An independent mask is drawn for every sample of the batch, so
        ``self.dropped_w`` has shape ``[batch_size, out_features, in_features]``
        and ``self.dropped_b`` (only when a bias exists) has shape
        ``[batch_size, out_features]``. Each connection is kept when its
        uniform random draw is greater than ``self.p``.

        :param batch_size: number of samples in the current batch
        :type batch_size: int
        """
        # Draw the random numbers directly at the target shape instead of
        # first materializing ``batch_size`` repeated copies of the parameter.
        keep_w = (
            torch.rand(
                (batch_size, *self.weight.shape),
                dtype=self.weight.dtype,
                device=self.weight.device,
            )
            > self.p
        )
        self.dropped_w = self.weight * keep_w

        if self.bias is not None:
            keep_b = (
                torch.rand(
                    (batch_size, *self.bias.shape),
                    dtype=self.bias.dtype,
                    device=self.bias.device,
                )
                > self.p
            )
            self.dropped_b = self.bias * keep_b

    def single_step_forward(self, input: Tensor) -> Tensor:
        """Run one forward step.

        Training: multiply the input by the per-sample dropped weight (and
        add the dropped bias), then apply ``activation``. The connections are
        re-dropped on every call unless ``invariant`` is ``True``, in which
        case they are only dropped while ``self.dropped_w`` is ``None``.

        Evaluation: approximate each pre-activation by a Gaussian with mean
        ``(1 - p) * (W x + b)`` and variance ``p (1 - p) * (W^2 x^2 + b^2)``,
        draw ``samples_num`` samples, apply ``activation`` and average over
        the samples. The samples are drawn without gradient tracking.

        :param input: input with shape ``[batch_size, in_features]``
        :type input: Tensor
        :return: output with shape ``[batch_size, out_features]``
        :rtype: Tensor
        """
        if self.training:
            if not self.invariant or self.dropped_w is None:
                self.drop(input.shape[0])

            ret = torch.bmm(self.dropped_w, input.unsqueeze(-1)).squeeze(-1)
            if self.bias is not None:
                ret = ret + self.dropped_b

            if self.activation is None:
                return ret
            return self.activation(ret)

        # shape = [batch_size, out_features]
        mu = (1 - self.p) * F.linear(input, self.weight, self.bias)
        bias_sq = None if self.bias is None else self.bias.square()
        sigma2 = (
            self.p
            * (1 - self.p)
            * F.linear(input.square(), self.weight.square(), bias_sq)
        )

        # ``torch.distributions.Normal`` rejects a scale of exactly 0, which
        # happens e.g. for an all-zero input row without bias or for p == 0.
        # ``torch.normal`` is what ``Normal.sample`` calls internally and it
        # accepts std == 0, so the degenerate case simply yields ``mu``.
        sample_shape = (self.samples_num, *mu.shape)
        with torch.no_grad():
            samples = torch.normal(
                mu.expand(sample_shape), sigma2.sqrt().expand(sample_shape)
            )

        if self.activation is None:
            ret = samples
        else:
            ret = self.activation(samples)
        return ret.mean(dim=0)

    def extra_repr(self) -> str:
        """Return the extra text shown in the module's ``repr``."""
        return f"in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}, p={self.p}, invariant={self.invariant}"