Source code for pygod.detector.guide

# -*- coding: utf-8 -*-
"""Higher-order Structure based Anomaly Detection on Attributed
    Networks (GUIDE)"""
# Author: Kay Liu <zliu234@uic.edu>
# License: BSD 2 clause

import os
import warnings

import torch
import torch.nn.functional as F

from . import DeepDetector
from ..nn import GUIDEBase


[docs]class GUIDE(DeepDetector):
    """
    Higher-order Structure based Anomaly Detection on Attributed
    Networks

    GUIDE is an anomaly detector consisting of an attribute graph
    convolutional autoencoder, and a structure graph attentive
    autoencoder (not the same as the graph attention networks). Instead
    of the adjacency matrix, node motif degree is used as input of
    structure autoencoder. The reconstruction mean square error of the
    autoencoders are defined as structure anomaly score and attribute
    anomaly score, respectively.

    Note: The calculation of node motif degree in preprocessing has
    high time complexity. It may take longer than you expect.

    See :cite:`yuan2021higher` for details.

    Parameters
    ----------
    hid_a : int, optional
        Hidden dimension for attribute. Default: ``64``.
    hid_s : int, optional
        Hidden dimension for structure. Default: ``4``.
    num_layers : int, optional
        Total number of layers in model. Default: ``4``.
    dropout : float, optional
        Dropout rate. Default: ``0.``.
    weight_decay : float, optional
        Weight decay (L2 penalty). Default: ``0.``.
    act : callable activation function or None, optional
        Activation function if not None.
        Default: ``torch.nn.functional.relu``.
    backbone : torch.nn.Module
        The backbone of GUIDE is fixed. Changing of this
        parameter will not affect the model. Default: ``None``.
    alpha : float, optional
        Weight between reconstruction of node feature and structure.
        Default: ``0.5``.
    contamination : float, optional
        The amount of contamination of the dataset in (0., 0.5], i.e.,
        the proportion of outliers in the dataset. Used when fitting to
        define the threshold on the decision function. Default: ``0.1``.
    lr : float, optional
        Learning rate. Default: ``0.004``.
    epoch : int, optional
        Maximum number of training epoch. Default: ``100``.
    gpu : int
        GPU Index, -1 for using CPU. Default: ``-1``.
    batch_size : int, optional
        Minibatch size, 0 for full batch training. Default: ``0``.
    num_neigh : int, optional
        Number of neighbors in sampling, -1 for all neighbors.
        Default: ``-1``.
    graphlet_size : int, optional
        The maximum size of graphlet. Default: ``4``.
    selected_motif : bool, optional
        Whether to use selected motif in the paper. Default: ``True``.
    cache_dir : str, optional
        The directory for the node motif degree caching. If ``None``,
        ~/.pygod will be used. Default: ``None``.
    verbose : int, optional
        Verbosity mode. Range in [0, 3]. Larger value for printing out
        more log information. Default: ``0``.
    save_emb : bool, optional
        Whether to save the embedding. Default: ``False``.
    compile_model : bool, optional
        Whether to compile the model with ``torch_geometric.compile``.
        Default: ``False``.
    **kwargs
        Other parameters for the backbone.

    Attributes
    ----------
    decision_score_ : torch.Tensor
        The outlier scores of the training data. Outliers tend to have
        higher scores. This value is available once the detector is
        fitted.
    threshold_ : float
        The threshold is based on ``contamination``. It is the
        :math:`N`*``contamination`` most abnormal samples in
        ``decision_score_``. The threshold is calculated for generating
        binary outlier labels.
    label_ : torch.Tensor
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers. It is generated by applying
        ``threshold_`` on ``decision_score_``.
    emb : torch.Tensor or tuple of torch.Tensor or None
        The learned node hidden embeddings of shape
        :math:`N \\times` ``hid_dim``. Only available when ``save_emb``
        is ``True``. When the detector has not been fitted, ``emb`` is
        ``None``. When the detector has multiple embeddings,
        ``emb`` is a tuple of torch.Tensor.
    """

    def __init__(self,
                 hid_a=64,
                 hid_s=4,
                 num_layers=4,
                 dropout=0.,
                 weight_decay=0.,
                 act=F.relu,
                 backbone=None,
                 alpha=0.5,
                 contamination=0.1,
                 lr=0.004,
                 epoch=100,
                 gpu=-1,
                 batch_size=0,
                 num_neigh=-1,
                 graphlet_size=4,
                 selected_motif=True,
                 cache_dir=None,
                 verbose=0,
                 save_emb=False,
                 compile_model=False,
                 **kwargs):

        if backbone is not None:
            warnings.warn("Backbone is not used in GUIDE")

        super(GUIDE, self).__init__(hid_dim=(hid_a, hid_s),
                                    num_layers=num_layers,
                                    dropout=dropout,
                                    weight_decay=weight_decay,
                                    act=act,
                                    backbone=backbone,
                                    contamination=contamination,
                                    lr=lr,
                                    epoch=epoch,
                                    gpu=gpu,
                                    batch_size=batch_size,
                                    num_neigh=num_neigh,
                                    verbose=verbose,
                                    save_emb=save_emb,
                                    compile_model=compile_model,
                                    **kwargs)

        self.dim_s = None
        self.alpha = alpha
        self.graphlet_size = graphlet_size
        if selected_motif:
            assert self.graphlet_size == 4, \
                "Graphlet size is fixed when using selected motif"
        self.selected_motif = selected_motif
        self.verbose = verbose
        self.cache_dir = cache_dir

    def process_graph(self, data):

        data.s = GUIDEBase.calc_gdd(data,
                                    self.cache_dir,
                                    graphlet_size=self.graphlet_size,
                                    selected_motif=self.selected_motif)
        self.dim_s = data.s.shape[1]

    def init_model(self, **kwargs):
        if self.save_emb:
            self.emb = (torch.zeros(self.num_nodes, self.hid_dim[0]),
                        torch.zeros(self.num_nodes, self.hid_dim[1]))

        return GUIDEBase(dim_a=self.in_dim,
                         dim_s=self.dim_s,
                         hid_a=self.hid_dim[0],
                         hid_s=self.hid_dim[1],
                         num_layers=self.num_layers,
                         dropout=self.dropout,
                         act=self.act,
                         **kwargs).to(self.device)

    def forward_model(self, data):

        batch_size = data.batch_size

        x = data.x.to(self.device)
        s = data.s.to(self.device)
        edge_index = data.edge_index.to(self.device)

        x_, s_ = self.model(x, s, edge_index)

        score = self.model.loss_func(x[:batch_size],
                                     x_[:batch_size],
                                     s[:batch_size],
                                     s_[:batch_size],
                                     self.alpha)

        loss = torch.mean(score)

        return loss, score.detach().cpu()