# Source code for spikingjelly.activation_based.monitor

import os
import time
import re
import datetime
import threading
from typing import Callable, Union, Optional

import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter


def _unpack_len1_tuple(x: Union[tuple, torch.Tensor]):
    if isinstance(x, tuple) and len(x) == 1:
        return x[0]
    else:
        return x



class BaseMonitor:
    def __init__(self):
        r"""
        Base monitor class. It maintains the hook handles, the list of monitored
        layer names, the recorded data, and the enabled/disabled state.

        Attributes set here:

        * ``hooks``: handles of the registered hooks (each must offer ``remove()``)
        * ``monitored_layers``: names of the monitored modules
        * ``records``: every recorded item, in recording order
        * ``name_records_index``: maps a layer name to the positions of its items
          inside ``records``

        :return: ``None``
        :rtype: None
        """
        self.hooks = []
        self.monitored_layers = []
        self.records = []
        self.name_records_index = {}
        self._enable = True

    def __getitem__(self, i):
        """
        Look up recorded data by position or by layer name.

        :param i: an ``int`` or ``slice`` indexing ``self.records`` directly, or a
            layer name (``str``) selecting all records made for that layer
        :return: a single record for an ``int``, a list of records for a ``slice``
            or a ``str``
        :raises ValueError: if ``i`` is neither ``int``, ``slice`` nor ``str``
        :raises KeyError: if ``i`` is a ``str`` that is not a monitored layer name
        """
        if isinstance(i, (int, slice)):
            return self.records[i]
        if isinstance(i, str):
            return [self.records[index] for index in self.name_records_index[i]]
        raise ValueError(i)

    def clear_recorded_data(self):
        """
        Drop all recorded data. The per-layer index lists are emptied in place, so
        the layer names stay registered.
        """
        self.records.clear()
        for v in self.name_records_index.values():
            v.clear()

    def enable(self):
        """Turn recording on."""
        self._enable = True

    def disable(self):
        """Turn recording off; the hooks stay registered."""
        self._enable = False

    def is_enable(self):
        """
        :return: ``True`` if the monitor is currently recording
        :rtype: bool
        """
        return self._enable

    def remove_hooks(self):
        """Call ``remove()`` on every stored hook handle."""
        for hook in self.hooks:
            hook.remove()

    def __del__(self):
        # If ``__init__`` never ran (e.g. a subclass raised before calling it)
        # there is nothing to clean up, and touching the attributes would raise.
        if not hasattr(self, "hooks"):
            return
        self.disable()
        self.clear_recorded_data()
        self.remove_hooks()




class OutputMonitor(BaseMonitor):
    def __init__(
        self,
        net: nn.Module,
        instance: Optional[Union[type, tuple[type, ...]]] = None,
        function_on_output: Callable = lambda x: x,
    ):
        """
        Applies ``function_on_output`` on outputs of all modules whose instances are
        ``instance`` in ``net``, and records the data into ``self.records``, which is a
        ``list``. Call ``self.enable()`` or ``self.disable()`` to enable or disable the
        monitor. Call ``self.clear_recorded_data()`` to clear the recorded data.

        Refer to the :doc:`Monitor Tutorial <../tutorials/en/monitor>` for more details.

        :param net: a network
        :type net: nn.Module

        :param instance: the instance of modules to be monitored. If ``None``, it will
            be regarded as ``type(net)``
        :type instance: Optional[Union[type, tuple[type, ...]]]

        :param function_on_output: the function that applies on the monitored modules' outputs
        :type function_on_output: Callable

        ----

        * **Example**

        .. code-block:: python

            import torch
            import torch.nn as nn
            from spikingjelly.activation_based import monitor, neuron, functional, layer


            class Net(nn.Module):
                def __init__(self):
                    super().__init__()
                    self.fc1 = layer.Linear(8, 4)
                    self.sn1 = neuron.IFNode()
                    self.fc2 = layer.Linear(4, 2)
                    self.sn2 = neuron.IFNode()
                    functional.set_step_mode(self, "m")

                def forward(self, x_seq: torch.Tensor):
                    x_seq = self.fc1(x_seq)
                    x_seq = self.sn1(x_seq)
                    x_seq = self.fc2(x_seq)
                    x_seq = self.sn2(x_seq)
                    return x_seq


            net = Net()
            for param in net.parameters():
                param.data.abs_()

            mtor = monitor.OutputMonitor(net, instance=neuron.IFNode)

            with torch.no_grad():
                y = net(torch.rand([1, 8]))
                print(f"mtor.records={mtor.records}")
                # mtor.records=[tensor([[0., 0., 0., 1.]]), tensor([[0., 0.]])]
                print(f"mtor[0]={mtor[0]}")
                # mtor[0]=tensor([[0., 0., 0., 1.]])
                print(f"mtor.monitored_layers={mtor.monitored_layers}")
                # mtor.monitored_layers=['sn1', 'sn2']
                print(f"mtor['sn1']={mtor['sn1']}")
                # mtor['sn1']=[tensor([[0., 0., 0., 1.]])]
        """
        super().__init__()
        self.function_on_output = function_on_output
        if instance is None:
            instance = type(net)
        for name, m in net.named_modules():
            if isinstance(m, instance):
                self.monitored_layers.append(name)
                self.name_records_index[name] = []
                self.hooks.append(m.register_forward_hook(self.create_hook(name)))

    def create_hook(self, name):
        """
        Build the forward hook that records the output of the module called ``name``.

        :param name: the monitored module's name, used as key of ``self.name_records_index``
        :type name: str
        :return: a function with the forward-hook signature ``hook(module, input, output)``
        :rtype: Callable
        """

        def hook(m, x, y):
            if self.is_enable():
                # Evaluate the user function first: if it raises, neither the
                # record list nor the per-layer index is touched, so they stay
                # consistent with each other.
                value = self.function_on_output(_unpack_len1_tuple(y))
                self.name_records_index[name].append(len(self.records))
                self.records.append(value)

        return hook





class InputMonitor(BaseMonitor):
    def __init__(
        self,
        net: nn.Module,
        instance: Optional[Union[type, tuple[type, ...]]] = None,
        function_on_input: Callable = lambda x: x,
    ):
        """
        Applies ``function_on_input`` on inputs of all modules whose instances are
        ``instance`` in ``net``, and records the data into ``self.records``, which is a
        ``list``. Call ``self.enable()`` or ``self.disable()`` to enable or disable the
        monitor. Call ``self.clear_recorded_data()`` to clear the recorded data.

        Refer to the :doc:`Monitor Tutorial <../tutorials/en/monitor>` for more details.

        :param net: a network
        :type net: nn.Module

        :param instance: the instance of modules to be monitored. If ``None``, it will
            be regarded as ``type(net)``
        :type instance: Optional[Union[type, tuple[type, ...]]]

        :param function_on_input: the function that applies on the monitored modules' inputs
        :type function_on_input: Callable

        ----

        * **Example**

        .. code-block:: python

            import torch
            import torch.nn as nn
            from spikingjelly.activation_based import monitor, neuron, functional, layer


            class Net(nn.Module):
                def __init__(self):
                    super().__init__()
                    self.fc1 = layer.Linear(8, 4)
                    self.sn1 = neuron.IFNode()
                    self.fc2 = layer.Linear(4, 2)
                    self.sn2 = neuron.IFNode()
                    functional.set_step_mode(self, "m")

                def forward(self, x_seq: torch.Tensor):
                    x_seq = self.fc1(x_seq)
                    x_seq = self.sn1(x_seq)
                    x_seq = self.fc2(x_seq)
                    x_seq = self.sn2(x_seq)
                    return x_seq


            net = Net()
            for param in net.parameters():
                param.data.abs_()

            mtor = monitor.InputMonitor(net, instance=neuron.IFNode)

            with torch.no_grad():
                y = net(torch.rand([1, 8]))
                print(f"mtor.records={mtor.records}")
                # mtor.records=[tensor([[1.0165, 1.1934, 0.9347, 0.9539]]), tensor([[0.9115, 0.9508]])]
                print(f"mtor[0]={mtor[0]}")
                # mtor[0]=tensor([[1.0165, 1.1934, 0.9347, 0.9539]])
                print(f"mtor.monitored_layers={mtor.monitored_layers}")
                # mtor.monitored_layers=['sn1', 'sn2']
                print(f"mtor['sn1']={mtor['sn1']}")
                # mtor['sn1']=[tensor([[1.0165, 1.1934, 0.9347, 0.9539]])]
        """
        super().__init__()
        self.function_on_input = function_on_input
        if instance is None:
            instance = type(net)
        for name, m in net.named_modules():
            if isinstance(m, instance):
                self.monitored_layers.append(name)
                self.name_records_index[name] = []
                self.hooks.append(m.register_forward_hook(self.create_hook(name)))

    def create_hook(self, name):
        """
        Build the forward hook that records the input of the module called ``name``.

        :param name: the monitored module's name, used as key of ``self.name_records_index``
        :type name: str
        :return: a function with the forward-hook signature ``hook(module, input, output)``
        :rtype: Callable
        """

        def hook(m, x, y):
            if self.is_enable():
                # Evaluate the user function first: if it raises, neither the
                # record list nor the per-layer index is touched, so they stay
                # consistent with each other.
                value = self.function_on_input(_unpack_len1_tuple(x))
                self.name_records_index[name].append(len(self.records))
                self.records.append(value)

        return hook





class AttributeMonitor(BaseMonitor):
    def __init__(
        self,
        attribute_name: str,
        pre_forward: bool,
        net: nn.Module,
        instance: Optional[Union[type, tuple[type, ...]]] = None,
        function_on_attribute: Callable = lambda x: x,
    ):
        """
        Applies ``function_on_attribute`` on ``m.attribute_name`` of each monitored
        module ``m`` whose instance is ``instance`` in ``net``, and records the data
        into ``self.records``, which is a ``list``. Call ``self.enable()`` or ``self.disable()``
        to enable or disable the monitor. Call ``self.clear_recorded_data()`` to
        clear the recorded data.

        Refer to the :doc:`Monitor Tutorial <../tutorials/en/monitor>` for more details.

        :param attribute_name: the monitored attribute's name
        :type attribute_name: str

        :param pre_forward: If ``True``, recording the attribute before forward,
            otherwise recording the attribute after forward
        :type pre_forward: bool

        :param net: a network
        :type net: nn.Module

        :param instance: the instance of modules to be monitored. If ``None``, it will
            be regarded as ``type(net)``
        :type instance: Optional[Union[type, tuple[type, ...]]]

        :param function_on_attribute: the function that applies on each
            monitored module's attribute
        :type function_on_attribute: Callable

        ----

        * **Example**

        .. code-block:: python

            import torch
            import torch.nn as nn
            from spikingjelly.activation_based import monitor, neuron, functional, layer


            class Net(nn.Module):
                def __init__(self):
                    super().__init__()
                    self.fc1 = layer.Linear(8, 4)
                    self.sn1 = neuron.IFNode()
                    self.fc2 = layer.Linear(4, 2)
                    self.sn2 = neuron.IFNode()
                    functional.set_step_mode(self, "m")

                def forward(self, x_seq: torch.Tensor):
                    x_seq = self.fc1(x_seq)
                    x_seq = self.sn1(x_seq)
                    x_seq = self.fc2(x_seq)
                    x_seq = self.sn2(x_seq)
                    return x_seq


            net = Net()
            for param in net.parameters():
                param.data.abs_()

            mtor = monitor.AttributeMonitor("v", False, net, instance=neuron.IFNode)

            with torch.no_grad():
                y = net(torch.rand([1, 8]))
                print(f"mtor.records={mtor.records}")
                # mtor.records=[tensor([0.0000, 0.6854, 0.0000, 0.7968]), tensor([0.4472, 0.0000])]
                print(f"mtor[0]={mtor[0]}")
                # mtor[0]=tensor([0.0000, 0.6854, 0.0000, 0.7968])
                print(f"mtor.monitored_layers={mtor.monitored_layers}")
                # mtor.monitored_layers=['sn1', 'sn2']
                print(f"mtor['sn1']={mtor['sn1']}")
                # mtor['sn1']=[tensor([0.0000, 0.6854, 0.0000, 0.7968])]
        """
        super().__init__()
        self.attribute_name = attribute_name
        self.function_on_attribute = function_on_attribute
        if instance is None:
            instance = type(net)

        for name, m in net.named_modules():
            if isinstance(m, instance):
                self.monitored_layers.append(name)
                self.name_records_index[name] = []
                if pre_forward:
                    self.hooks.append(
                        m.register_forward_pre_hook(self.create_hook(name))
                    )
                else:
                    self.hooks.append(m.register_forward_hook(self.create_hook(name)))

    def create_hook(self, name):
        """
        Build the hook that records ``self.attribute_name`` of the module called ``name``.

        The returned function is registered either as a forward pre-hook or as a
        forward hook, depending on ``pre_forward``.

        :param name: the monitored module's name, used as key of ``self.name_records_index``
        :type name: str
        :return: a function usable as forward pre-hook and as forward hook
        :rtype: Callable
        """

        # A forward pre-hook is called as ``hook(module, input)`` while a forward
        # hook is called as ``hook(module, input, output)``. Only the module is
        # needed here, so accept either arity.
        def hook(m, *args):
            if self.is_enable():
                # ``getattr`` follows the regular lookup (instance attributes,
                # properties) and still falls back to the module's
                # ``__getattr__``; calling ``__getattr__`` directly skips the
                # regular lookup.
                value = self.function_on_attribute(getattr(m, self.attribute_name))
                self.name_records_index[name].append(len(self.records))
                self.records.append(value)

        return hook





class GradInputMonitor(BaseMonitor):
    def __init__(
        self,
        net: nn.Module,
        instance: Optional[Union[type, tuple[type, ...]]] = None,
        function_on_grad_input: Callable = lambda x: x,
    ):
        r"""
        Applies ``function_on_grad_input`` on grad of inputs of all modules whose
        instances are ``instance`` in ``net``, and records the data into ``self.records``,
        which is a ``list``.
        Call ``self.enable()`` or ``self.disable()`` to enable or disable the monitor.
        Call ``self.clear_recorded_data()`` to clear the recorded data.

        Refer to the :doc:`Monitor Tutorial <../tutorials/en/monitor>` for more details.

        .. note::

            Denote the input and output of the monitored module as :math:`X` and :math:`Y`,
            and the loss is :math:`L`, then ``GradInputMonitor`` will record the gradient
            of input, which is :math:`\frac{\partial L}{\partial X}`.

        :param net: a network
        :type net: nn.Module

        :param instance: the instance of modules to be monitored. If ``None``, it will
            be regarded as ``type(net)``
        :type instance: Optional[Union[type, tuple[type, ...]]]

        :param function_on_grad_input: the function that applies on the grad of
            monitored modules' inputs
        :type function_on_grad_input: Callable

        ----

        * **Example**

        .. code-block:: python

            import torch
            import torch.nn as nn
            from spikingjelly.activation_based import monitor, neuron, functional, layer


            class Net(nn.Module):
                def __init__(self):
                    super().__init__()
                    self.fc1 = layer.Linear(8, 4)
                    self.sn1 = neuron.IFNode()
                    self.fc2 = layer.Linear(4, 2)
                    self.sn2 = neuron.IFNode()
                    functional.set_step_mode(self, "m")

                def forward(self, x_seq: torch.Tensor):
                    x_seq = self.fc1(x_seq)
                    x_seq = self.sn1(x_seq)
                    x_seq = self.fc2(x_seq)
                    x_seq = self.sn2(x_seq)
                    return x_seq


            net = Net()
            for param in net.parameters():
                param.data.abs_()

            mtor = monitor.GradInputMonitor(net, instance=neuron.IFNode)

            # The hooks are backward hooks, so a backward pass is required
            # before anything is recorded.
            net(torch.rand([1, 8])).sum().backward()
            print(f"mtor.records={mtor.records}")
            print(f"mtor[0]={mtor[0]}")
            print(f"mtor.monitored_layers={mtor.monitored_layers}")
            # mtor.monitored_layers=['sn1', 'sn2']
            print(f"mtor['sn1']={mtor['sn1']}")
        """
        super().__init__()
        self.function_on_grad_input = function_on_grad_input
        if instance is None:
            instance = type(net)

        for name, m in net.named_modules():
            if isinstance(m, instance):
                self.monitored_layers.append(name)
                self.name_records_index[name] = []
                # Feature-detect the full backward hook instead of comparing
                # version objects, so the legacy fallback stays reachable on
                # releases that do not provide it.
                if hasattr(m, "register_full_backward_hook"):
                    handle = m.register_full_backward_hook(self.create_hook(name))
                else:
                    handle = m.register_backward_hook(self.create_hook(name))
                self.hooks.append(handle)

    def create_hook(self, name):
        """
        Build the backward hook that records the input gradient of the module called ``name``.

        :param name: the monitored module's name, used as key of ``self.name_records_index``
        :type name: str
        :return: a function with the backward-hook signature
            ``hook(module, grad_input, grad_output)``
        :rtype: Callable
        """

        def hook(m, grad_input, grad_output):
            if self.is_enable():
                # Evaluate the user function first: if it raises, neither the
                # record list nor the per-layer index is touched, so they stay
                # consistent with each other.
                value = self.function_on_grad_input(_unpack_len1_tuple(grad_input))
                self.name_records_index[name].append(len(self.records))
                self.records.append(value)

        return hook





class GradOutputMonitor(BaseMonitor):
    def __init__(
        self,
        net: nn.Module,
        instance: Optional[Union[type, tuple[type, ...]]] = None,
        function_on_grad_output: Callable = lambda x: x,
    ):
        r"""
        Applies ``function_on_grad_output`` on grad of outputs of all modules whose instances
        are ``instance`` in ``net``, and records the data into ``self.records``, which is a ``list``.
        Call ``self.enable()`` or ``self.disable()`` to enable or disable the monitor.
        Call ``self.clear_recorded_data()`` to clear the recorded data.

        Refer to the :doc:`Monitor Tutorial <../tutorials/en/monitor>` for more details.

        .. note::

            Denote the input and output of the monitored module as :math:`X` and :math:`Y`,
            and the loss is :math:`L`, then ``GradOutputMonitor`` will record the gradient
            of output, which is :math:`\frac{\partial L}{\partial Y}`.

        :param net: a network
        :type net: nn.Module

        :param instance: the instance of modules to be monitored. If ``None``, it will
            be regarded as ``type(net)``
        :type instance: Optional[Union[type, tuple[type, ...]]]

        :param function_on_grad_output: the function that applies on the grad of
            monitored modules' outputs
        :type function_on_grad_output: Callable

        ----

        * **Example**

        .. code-block:: python

            import torch
            import torch.nn as nn
            from spikingjelly.activation_based import monitor, neuron, functional, layer


            class Net(nn.Module):
                def __init__(self):
                    super().__init__()
                    self.fc1 = layer.Linear(8, 4)
                    self.sn1 = neuron.IFNode()
                    self.fc2 = layer.Linear(4, 2)
                    self.sn2 = neuron.IFNode()
                    functional.set_step_mode(self, "m")

                def forward(self, x_seq: torch.Tensor):
                    x_seq = self.fc1(x_seq)
                    x_seq = self.sn1(x_seq)
                    x_seq = self.fc2(x_seq)
                    x_seq = self.sn2(x_seq)
                    return x_seq


            net = Net()
            for param in net.parameters():
                param.data.abs_()

            mtor = monitor.GradOutputMonitor(net, instance=neuron.IFNode)

            net(torch.rand([1, 8])).sum().backward()
            print(f"mtor.records={mtor.records}")
            # mtor.records=[tensor([[1., 1.]]), tensor([[0.1372, 0.1081, 0.0880, 0.1089]])]
            print(f"mtor[0]={mtor[0]}")
            # mtor[0]=tensor([[1., 1.]])
            print(f"mtor.monitored_layers={mtor.monitored_layers}")
            # mtor.monitored_layers=['sn1', 'sn2']
            print(f"mtor['sn1']={mtor['sn1']}")
            # mtor['sn1']=[tensor([[0.1372, 0.1081, 0.0880, 0.1089]])]
        """
        super().__init__()
        self.function_on_grad_output = function_on_grad_output
        if instance is None:
            instance = type(net)
        for name, m in net.named_modules():
            if isinstance(m, instance):
                self.monitored_layers.append(name)
                self.name_records_index[name] = []
                # Feature-detect the full backward hook instead of comparing
                # version objects, so the legacy fallback stays reachable on
                # releases that do not provide it.
                if hasattr(m, "register_full_backward_hook"):
                    handle = m.register_full_backward_hook(self.create_hook(name))
                else:
                    handle = m.register_backward_hook(self.create_hook(name))
                self.hooks.append(handle)

    def create_hook(self, name):
        """
        Build the backward hook that records the output gradient of the module called ``name``.

        :param name: the monitored module's name, used as key of ``self.name_records_index``
        :type name: str
        :return: a function with the backward-hook signature
            ``hook(module, grad_input, grad_output)``
        :rtype: Callable
        """

        def hook(m, grad_input, grad_output):
            if self.is_enable():
                # Evaluate the user function first: if it raises, neither the
                # record list nor the per-layer index is touched, so they stay
                # consistent with each other.
                value = self.function_on_grad_output(_unpack_len1_tuple(grad_output))
                self.name_records_index[name].append(len(self.records))
                self.records.append(value)

        return hook





class GPUMonitor(threading.Thread):
    def __init__(
        self,
        log_dir: Optional[str] = None,
        gpu_ids: tuple = (0,),
        interval: float = 600.0,
        start_now=True,
    ):
        r"""
        The GPU monitor, which starts a new thread to record the utilization and
        memory used of ``gpu_ids`` every ``interval`` seconds. The data is obtained
        by running ``nvidia-smi``.

        .. warning::

            Do not forget to call this module's ``stop()`` after the main thread
            finishes its job, otherwise the main thread will never stop!

        :param log_dir: the directory for saving logs with tensorboard. If it is None,
            this module will print logs
        :type log_dir: Optional[str]

        :param gpu_ids: the id of GPUs to be monitored, e.g., ``(0, 1, 2, 3)``.
            The default value is ``(0, )``
        :type gpu_ids: tuple

        :param interval: the recording interval (in seconds)
        :type interval: float

        :param start_now: if true, the monitor will start to record now. Otherwise,
            it will start after the user call ``start()`` manually
        :type start_now: bool

        ----

        * **Example**

        .. code-block:: python

            import time

            gm = GPUMonitor(interval=1)
            time.sleep(2)  # make the main thread sleep
            gm.stop()

            # The outputs are:

            # 2022-04-28 10:52:25
            # utilization.gpu [%], memory.used [MiB]
            # 0 %, 376 MiB
        """
        super().__init__()
        self.gpu_ids = gpu_ids
        self.interval = interval
        self.stopped = False
        # Set by ``stop()`` so that the wait between two samples ends at once.
        self._stop_requested = threading.Event()
        self.cmds = "nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv"
        self.cmds += " -i "
        self.cmds += ",".join(str(gpu_id) for gpu_id in self.gpu_ids)
        self.step = 0

        if log_dir is None:
            self.writer = None
        else:
            self.writer = SummaryWriter(os.path.join(log_dir, "gpu_monitor"))

        if start_now:
            self.start()

    def stop(self):
        """
        Ask the monitoring thread to finish. A pending wait between two samples is
        interrupted, so the thread does not linger for up to ``interval`` seconds.
        """
        self.stopped = True
        self._stop_requested.set()

    def run(self):
        """
        Thread body: sample the GPUs until ``stop()`` is called.

        Each round runs ``self.cmds``. With a writer, the two integers found in
        every data row are logged as ``utilization_<gpu_id>`` and
        ``memory_used_<gpu_id>`` at ``self.step``; without a writer, a timestamp
        and the raw command output are printed. The writer, if any, is closed
        when the loop ends.
        """
        try:
            while not self.stopped:
                with os.popen(self.cmds) as fp:
                    outputs = fp.read()
                if self.writer is not None:
                    # skip the first row (header) and the last row ("\n")
                    rows = outputs.split("\n")[1:-1]
                    for gpu_id, row in zip(self.gpu_ids, rows):
                        numbers = re.findall(r"\d+", row)
                        if len(numbers) < 2:
                            # A row without both values (e.g. an error message)
                            # must not kill the monitoring thread.
                            continue
                        utilization = int(numbers[0])
                        memory_used = int(numbers[1])
                        self.writer.add_scalar(
                            f"utilization_{gpu_id}", utilization, self.step
                        )
                        self.writer.add_scalar(
                            f"memory_used_{gpu_id}", memory_used, self.step
                        )
                else:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
                    print(outputs)
                    # Sample of what gets printed:
                    #   2022-04-20 18:14:26
                    #   utilization.gpu [%], memory.used [MiB]
                    #   4 %, 1816 MiB
                    #   0 %, 1840 MiB
                    #   0 %, 1840 MiB
                    #   0 %, 1720 MiB
                # Sleeps for ``interval`` seconds but returns as soon as
                # ``stop()`` is called.
                self._stop_requested.wait(self.interval)
                self.step += 1
        finally:
            if self.writer is not None:
                # Release the log file once monitoring is over.
                self.writer.close()