# Source code for pygod.detector.anomalydae

# -*- coding: utf-8 -*-
"""AnomalyDAE: Dual autoencoder for anomaly detection
on attributed networks"""
import warnings

# Author: Xueying Ding <xding2@andrew.cmu.edu>,
#         Kay Liu <zliu234@uic.edu>
# License: BSD 2 clause

import torch
import torch.nn.functional as F

from . import DeepDetector
from ..nn import AnomalyDAEBase


class AnomalyDAE(DeepDetector):
    """
    Dual Autoencoder for Anomaly Detection on Attributed Networks

    AnomalyDAE is an anomaly detector that consists of a structure
    autoencoder and an attribute autoencoder to learn both node
    embedding and attribute embedding jointly in latent space. The
    structural autoencoder uses Graph Attention layers. The
    reconstruction mean square error of the decoders are defined as
    structure anomaly score and attribute anomaly score, respectively,
    with two additional penalties on the reconstructed adj matrix and
    node attributes (force entries to be nonzero).

    See :cite:`fan2020anomalydae` for details.

    Parameters
    ----------
    emb_dim : int, optional
        Embedding dimension of model. Default: ``64``.
    hid_dim :  int, optional
        Hidden dimension of model. Default: ``64``.
    num_layers : int, optional
        Total number of layers of AnomalyDAE is fixed to be 4. Changing
        of this parameter will not affect the model. Default: ``4``.
    dropout : float, optional
        Dropout rate. Default: ``0.``.
    weight_decay : float, optional
        Weight decay (L2 penalty). Default: ``0.``.
    act : callable activation function or None, optional
        Activation function if not None.
        Default: ``torch.nn.functional.relu``.
    backbone : torch.nn.Module
        The backbone of AnomalyDAE is fixed. Changing of this
        parameter will not affect the model. Default: ``None``.
    alpha : float, optional
        Weight between reconstruction of node feature and structure.
        The loss function receives ``1 - alpha`` as its weight.
        Default: ``0.5``.
    theta : float, optional
        The additional penalty for nonzero structure (adjacency)
        entries. It is converted to ``theta / (1 + theta)`` and passed
        to the loss function as the structure positive weight.
        Default: ``1.``.
    eta : float, optional
        The additional penalty for nonzero attribute entries. It is
        converted to ``eta / (1 + eta)`` and passed to the loss
        function as the attribute positive weight. Default: ``1.``.
    contamination : float, optional
        The amount of contamination of the dataset in (0., 0.5], i.e.,
        the proportion of outliers in the dataset. Used when fitting to
        define the threshold on the decision function. Default: ``0.1``.
    lr : float, optional
        Learning rate. Default: ``0.004``.
    epoch : int, optional
        Maximum number of training epoch. Default: ``5``.
    gpu : int
        GPU Index, -1 for using CPU. Default: ``-1``.
    batch_size : int, optional
        Minibatch size, 0 for full batch training. Default: ``0``.
    num_neigh : int, optional
        Number of neighbors in sampling, -1 for all neighbors.
        Default: ``-1``.
    verbose : int, optional
        Verbosity mode. Range in [0, 3]. Larger value for printing out
        more log information. Default: ``0``.
    save_emb : bool, optional
        Whether to save the embedding. Default: ``False``.
    compile_model : bool, optional
        Whether to compile the model with ``torch_geometric.compile``.
        Default: ``False``.
    **kwargs
        Other parameters for the backbone model.

    Attributes
    ----------
    decision_score_ : torch.Tensor
        The outlier scores of the training data. Outliers tend to have
        higher scores. This value is available once the detector is
        fitted.
    threshold_ : float
        The threshold is based on ``contamination``. It is the
        :math:`N`*``contamination`` most abnormal samples in
        ``decision_score_``. The threshold is calculated for generating
        binary outlier labels.
    label_ : torch.Tensor
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers. It is generated by applying
        ``threshold_`` on ``decision_score_``.
    emb : torch.Tensor or tuple of torch.Tensor or None
        The learned node hidden embeddings of shape
        :math:`N \\times` ``hid_dim``. Only available when ``save_emb``
        is ``True``. When the detector has not been fitted, ``emb`` is
        ``None``. When the detector has multiple embeddings,
        ``emb`` is a tuple of torch.Tensor.
    """

    def __init__(self,
                 emb_dim=64,
                 hid_dim=64,
                 num_layers=4,
                 dropout=0.,
                 weight_decay=0.,
                 act=F.relu,
                 backbone=None,
                 alpha=0.5,
                 theta=1.,
                 eta=1.,
                 contamination=0.1,
                 lr=0.004,
                 epoch=5,
                 gpu=-1,
                 batch_size=0,
                 num_neigh=-1,
                 verbose=0,
                 save_emb=False,
                 compile_model=False,
                 **kwargs):

        # The architecture is fixed; tell the caller that these two
        # arguments are accepted only for interface compatibility.
        # stacklevel=2 attributes the warning to the caller's line.
        if backbone is not None or num_layers != 4:
            warnings.warn(
                "Backbone and num_layers are not used in AnomalyDAE",
                stacklevel=2)

        super().__init__(hid_dim=hid_dim,
                         num_layers=num_layers,
                         dropout=dropout,
                         weight_decay=weight_decay,
                         act=act,
                         backbone=backbone,
                         contamination=contamination,
                         lr=lr,
                         epoch=epoch,
                         gpu=gpu,
                         batch_size=batch_size,
                         num_neigh=num_neigh,
                         verbose=verbose,
                         save_emb=save_emb,
                         compile_model=compile_model,
                         **kwargs)

        # AnomalyDAE-specific hyperparameters (not handled by the base
        # detector constructor).
        self.emb_dim = emb_dim
        self.alpha = alpha
        self.theta = theta
        self.eta = eta

    def process_graph(self, data):
        """
        Preprocess the input graph for AnomalyDAE.

        Delegates to ``AnomalyDAEBase.process_graph``. The return value
        of that call is discarded, so the preprocessing is presumably
        applied to ``data`` in place (``forward_model`` later reads
        ``data.s``) -- confirm against ``AnomalyDAEBase``.

        Parameters
        ----------
        data : graph data object
            The input graph.
        """
        AnomalyDAEBase.process_graph(data)

    def init_model(self, **kwargs):
        """
        Build the ``AnomalyDAEBase`` model on ``self.device``.

        When ``save_emb`` is set, ``self.emb`` is first initialized to
        a zero tensor of shape ``(num_nodes, hid_dim)``.

        Parameters
        ----------
        **kwargs
            Extra keyword arguments forwarded to ``AnomalyDAEBase``.

        Returns
        -------
        model : AnomalyDAEBase
            The newly constructed model, moved to ``self.device``.
        """
        if self.save_emb:
            self.emb = torch.zeros(self.num_nodes,
                                   self.hid_dim)

        return AnomalyDAEBase(in_dim=self.in_dim,
                              num_nodes=self.num_nodes,
                              emb_dim=self.emb_dim,
                              hid_dim=self.hid_dim,
                              dropout=self.dropout,
                              act=self.act,
                              **kwargs).to(self.device)

    def forward_model(self, data):
        """
        Run the model on one batch and compute the loss and scores.

        Parameters
        ----------
        data : graph data object
            A (sampled) batch providing ``batch_size``, ``n_id``,
            ``x``, ``s`` and ``edge_index``.

        Returns
        -------
        loss : torch.Tensor
            Mean of ``score``, used for optimization.
        score : torch.Tensor
            The scores returned by the model's loss function for the
            first ``batch_size`` nodes, detached and moved to CPU.
        """
        batch_size = data.batch_size
        node_idx = data.n_id

        x = data.x.to(self.device)
        s = data.s.to(self.device)
        edge_index = data.edge_index.to(self.device)

        x_, s_ = self.model(x, edge_index, batch_size)

        # positive weight conversion: map each penalty p to p / (1 + p);
        # eta drives the attribute term and theta the structure term.
        weight = 1 - self.alpha
        pos_weight_a = self.eta / (1 + self.eta)
        pos_weight_s = self.theta / (1 + self.theta)

        # Only the first ``batch_size`` rows are scored; the columns of
        # ``s`` are selected with ``node_idx`` to pair with ``s_``.
        score = self.model.loss_func(x[:batch_size],
                                     x_[:batch_size],
                                     s[:batch_size, node_idx],
                                     s_[:batch_size],
                                     weight,
                                     pos_weight_a,
                                     pos_weight_s)

        loss = torch.mean(score)

        return loss, score.detach().cpu()