Source code for pygod.detector.gadnr

# -*- coding: utf-8 -*-
"""GAD-NR: Graph Anomaly Detection via Neighborhood Reconstruction (GADNR)
   The code is partially from the original implementation in 
   https://github.com/Graph-COM/GAD-NR"""
# Author: Yingtong Dou <ytongdou@gmail.com>
# License: BSD 2 clause

import time
import torch
import torch.nn.functional as F
from torch_geometric.loader import NeighborLoader
from torch_geometric.nn import GCN
from torch_geometric import compile

from . import DeepDetector
from ..nn import GADNRBase
from ..utils import logger



[docs]
class GADNR(DeepDetector):
    """
    Graph Anomaly Detection via Neighborhood Reconstruction

    GAD-NR is a new type of GAE based on neighborhood reconstruction
    for graph anomaly detection. GAD-NR aims to reconstruct the entire
    neighborhood (including local structure, self attributes, and
    neighbors attributes) around a node based on the corresponding node
    representation.
    
    See :cite:`roy2024gadnr` for details.
    
    Parameters
    ----------
    hid_dim :  int, optional
        Hidden dimension of model. Default: ``64``.
    num_layers : int, optional
        Total number of layers in the backbone encoder model. Default: ``1``.
    deg_dec_layers : int, optional
        The number of layers for the node degree decoder. Default: ``4``.
    fea_dec_layers : int, optional
        The number of layers for the node feature decoder. Default: ``3``.
    backbone : torch.nn.Module, optional
        The backbone of the deep detector implemented in PyG.
        Default: ``torch_geometric.nn.GCN``.
    sample_size : int, optional
        The number of samples for the neighborhood distribution.
        Default: ``2``.
    sample_time : int, optional
        The number of sampling rounds used to reduce the noise during
        node feature and neighborhood distribution reconstruction.
        Default: ``3``.
    neigh_loss : str, optional
        The neighbor reconstruction loss. ``KL`` represents the KL divergence
        loss, ``W2`` represents the W2 loss. Default: ``KL``.
    lambda_loss1 : float, optional
        The weight of the neighborhood reconstruction loss term.
        Default: ``1e-2``.
    lambda_loss2 : float, optional
        The weight of the node feature reconstruction loss term.
        Default: ``1e-1``.
    lambda_loss3 : float, optional
        The weight of the node degree reconstruction loss term.
        Default: ``8e-1``.
    real_loss : bool, optional
        Whether using the original loss proposed in the paper as the
        decision score, if not, using the proposed weighted decision score.
        Default: ``True``.
    lr : float, optional
        Learning rate. Default: ``0.01``.
    epoch : int, optional
        Maximum number of training epoch. Default: ``100``.    
    dropout : float, optional
        Dropout rate. Default: ``0.``.
    weight_decay : float, optional
        Weight decay (L2 penalty). Default: ``0.0003``.
    act : callable activation function or None, optional
        Activation function if not None.
        Default: ``torch.nn.functional.relu``.
    gpu : int
        GPU Index, -1 for using CPU. Default: ``-1``.
    batch_size : int, optional
        Minibatch size, 0 for full batch training. Default: ``0``.
    num_neigh : int, optional
        Number of neighbors in sampling, -1 for all neighbors.
        Default: ``-1``.
    contamination : float, optional
        The amount of contamination of the dataset in (0., 0.5], i.e.,
        the proportion of outliers in the dataset. Used when fitting to
        define the threshold on the decision function. Default: ``0.1``.
    verbose : int, optional
        Verbosity mode. Range in [0, 3]. Larger value for printing out
        more log information. Default: ``0``.
    save_emb : bool, optional
        Whether to save the embedding. Default: ``False``.
    compile_model : bool, optional
        Whether to compile the model with ``torch_geometric.compile``.
        Default: ``False``.
    **kwargs : optional
        Other parameters for the backbone.

    Attributes
    ----------
    decision_score_ : torch.Tensor
        The outlier scores of the training data. Outliers tend to have
        higher scores. This value is available once the detector is
        fitted.
    threshold_ : float
        The threshold is based on ``contamination``. It is the
        :math:`N \\times` ``contamination`` most abnormal samples in
        ``decision_score_``. The threshold is calculated for generating
        binary outlier labels.
    label_ : torch.Tensor
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers. It is generated by applying
        ``threshold_`` on ``decision_score_``.
    emb : torch.Tensor or tuple of torch.Tensor or None
        The learned node hidden embeddings of shape
        :math:`N \\times` ``hid_dim``. Only available when ``save_emb``
        is ``True``. When the detector has not been fitted, ``emb`` is
        ``None``. When the detector has multiple embeddings,
        ``emb`` is a tuple of torch.Tensor.
    """

    def __init__(self,
                 hid_dim=64,
                 num_layers=1,
                 deg_dec_layers=4,
                 fea_dec_layers=3,
                 backbone=GCN,
                 sample_size=2,
                 sample_time=3,
                 neigh_loss='KL',
                 lambda_loss1=1e-2,
                 lambda_loss2=1e-1,
                 lambda_loss3=8e-1,
                 real_loss=True,
                 lr=1e-2,
                 epoch=100,
                 dropout=0.,
                 weight_decay=3e-4,
                 act=F.relu,
                 gpu=-1,
                 batch_size=0,
                 num_neigh=-1,
                 contamination=0.1,
                 verbose=0,
                 save_emb=False,
                 compile_model=False,
                 **kwargs):
        """Initialise the base detector and record GAD-NR specific options.

        The meaning of every argument is given in the class docstring.
        """
        # Options handled by the parent detector are gathered in one
        # mapping; extra ``kwargs`` are forwarded to the parent untouched.
        parent_options = dict(hid_dim=hid_dim,
                              num_layers=num_layers,
                              dropout=dropout,
                              weight_decay=weight_decay,
                              act=act,
                              backbone=backbone,
                              contamination=contamination,
                              lr=lr,
                              epoch=epoch,
                              gpu=gpu,
                              batch_size=batch_size,
                              num_neigh=num_neigh,
                              verbose=verbose,
                              save_emb=save_emb,
                              compile_model=compile_model)
        super(GADNR, self).__init__(**parent_options, **kwargs)

        # Architecture of the encoder and of the two decoders.
        self.encoder_layers = num_layers
        self.deg_dec_layers = deg_dec_layers
        self.fea_dec_layers = fea_dec_layers

        # Neighborhood sampling configuration.
        self.sample_size = sample_size
        self.sample_time = sample_time

        # Loss configuration.
        self.neigh_loss = neigh_loss
        self.lambda_loss1 = lambda_loss1
        self.lambda_loss2 = lambda_loss2
        self.lambda_loss3 = lambda_loss3
        self.real_loss = real_loss

        # Graph bookkeeping, filled in later by ``process_graph``/``fit``.
        self.neighbor_num_list = None
        self.neighbor_dict = None
        self.id_mapping = None
        self.full_batch = None
        self.tot_nodes = 0

        self.verbose = verbose

    def process_graph(self, data):
        """Pre-process a graph (or sampled sub-graph) for GAD-NR.

        Delegates to ``GADNRBase.process_graph`` and caches the returned
        neighbor dictionary, neighbor-count list and id mapping on the
        detector.

        The full-batch / mini-batch branch follows ``self.full_batch``
        once ``fit`` has set it, which is the same flag ``forward_model``
        dispatches on. Comparing ``batch_size`` with the node count alone
        mis-routes a sampled batch whose node count happens to equal
        ``batch_size``. While ``self.full_batch`` is still ``None`` the
        size comparison is kept as a fallback.

        Parameters
        ----------
        data : torch_geometric.data.Data
            Input graph, or a sampled batch carrying ``input_id``.

        Returns
        -------
        data : torch_geometric.data.Data
            The graph returned by ``GADNRBase.process_graph``.
        """
        if self.full_batch is None:
            # The training mode has not been decided yet.
            use_full_batch = self.batch_size == data.x.shape[0]
        else:
            # A graph without ``input_id`` cannot be a sampled batch, so
            # it is processed as a whole graph even if the flag is stale.
            use_full_batch = bool(self.full_batch) or \
                getattr(data, 'input_id', None) is None

        if use_full_batch:
            data, neighbor_dict, neighbor_num_list, id_mapping = \
                GADNRBase.process_graph(data)
            self.tot_nodes = data.x.shape[0]
        else:
            data, neighbor_dict, neighbor_num_list, id_mapping = \
                GADNRBase.process_graph(data, data.input_id.tolist())

        self.neighbor_num_list = neighbor_num_list.to(self.device)
        self.neighbor_dict = neighbor_dict
        self.id_mapping = id_mapping

        return data

    def init_model(self, **kwargs):
        """Build the ``GADNRBase`` network and move it to ``self.device``.

        When ``save_emb`` is set, ``self.emb`` is first reset to a zero
        tensor of shape ``(num_nodes, hid_dim)``. ``kwargs`` is accepted
        but not used here.
        """
        if self.save_emb:
            self.emb = torch.zeros(self.num_nodes, self.hid_dim)

        model_options = dict(in_dim=self.in_dim,
                             hid_dim=self.hid_dim,
                             encoder_layers=self.encoder_layers,
                             deg_dec_layers=self.deg_dec_layers,
                             fea_dec_layers=self.fea_dec_layers,
                             sample_size=self.sample_size,
                             sample_time=self.sample_time,
                             neighbor_num_list=self.neighbor_num_list,
                             neigh_loss=self.neigh_loss,
                             lambda_loss1=self.lambda_loss1,
                             lambda_loss2=self.lambda_loss2,
                             lambda_loss3=self.lambda_loss3,
                             full_batch=self.full_batch,
                             backbone=self.backbone,
                             device=self.device)
        network = GADNRBase(**model_options)
        return network.to(self.device)

    def forward_model(self, data):
        """Run the model on ``data`` and evaluate its loss.

        Returns
        -------
        tuple
            ``(loss, loss_per_node, h_loss, degree_loss, feature_loss)``.
            ``loss`` is returned as produced by ``loss_func``; the other
            four are moved to CPU and detached.
        """
        if self.full_batch:
            # Full batch: the model only needs features and edges.
            outputs = self.model(data.x, data.edge_index)
        else:
            # Mini-batch: also pass the seed ids and the cached lookups.
            outputs = self.model(data.x,
                                 data.edge_index,
                                 data.input_id.tolist(),
                                 self.neighbor_dict,
                                 self.id_mapping)
        h0, degree_logits, feat_recon_list, neigh_recon_list = outputs

        loss, loss_per_node, h_loss, degree_loss, feature_loss = \
            self.model.loss_func(h0,
                                 degree_logits,
                                 feat_recon_list,
                                 neigh_recon_list,
                                 self.neighbor_num_list)

        per_node_terms = (loss_per_node, h_loss, degree_loss, feature_loss)
        return (loss, *[term.cpu().detach() for term in per_node_terms])
    
    def comp_decision_score(self,
                            loss_per_node,
                            h_loss,
                            degree_loss,
                            feature_loss,
                            h_loss_weight,
                            degree_loss_weight,
                            feature_loss_weight):
        """Compute the decision score from the original or weighted loss.

        Parameters
        ----------
        loss_per_node : torch.Tensor
            Per-node loss, returned unchanged when ``real_loss`` is
            ``True``.
        h_loss, degree_loss, feature_loss : torch.Tensor
            The three loss terms combined when ``real_loss`` is ``False``.
        h_loss_weight, degree_loss_weight, feature_loss_weight : float
            Weights applied to the range-scaled loss terms.

        Returns
        -------
        torch.Tensor
            ``loss_per_node`` itself, or the weighted sum of the three
            terms, each divided by its ``max - min`` range.
        """
        if self.real_loss:
            # the original decision score from the loss
            return loss_per_node

        def _scale_by_range(term):
            spread = torch.max(term) - torch.min(term)
            # A constant term has zero spread. Dividing by it yields
            # inf/NaN, and ``0 * inf`` is NaN, so even a term with weight
            # 0 would turn every score into NaN. Divide by 1 instead.
            spread = torch.where(spread == 0,
                                 torch.ones_like(spread),
                                 spread)
            return term / spread

        # the weighted decision score
        return h_loss_weight * _scale_by_range(h_loss) \
            + degree_loss_weight * _scale_by_range(degree_loss) \
            + feature_loss_weight * _scale_by_range(feature_loss)


[docs]
    def fit(self,
            data,
            label=None,
            h_loss_weight=1.0,
            degree_loss_weight=0.,
            feature_loss_weight=2.5,
            loss_step=20
            ):
        """
        Overwrite the base model fit function since GAD-NR uses
        multiple personalized loss functions.

        Parameters
        ----------
        data : torch_geometric.data.Data
            Input graph.
        label : torch.Tensor, optional
            The optional outlier ground truth labels used for testing.
            Default: ``None``.
        h_loss_weight : float, optional
            The weight of the neighborhood reconstruction loss term used in
            the weighted decision score. Default: ``1.0``.
        degree_loss_weight : float, optional
            The weight of the node degree reconstruction loss term used in
            the weighted decision score. Default: ``0.``.
        feature_loss_weight : float, optional
            The weight of the node feature reconstruction loss term used in
            the weighted decision score. Default: ``2.5``.
        loss_step : int, optional
            The epoch interval to update the loss terms. ``0`` or ``None``
            disables the update. Default: ``20``.

        Returns
        -------
        self : GADNR
            The fitted detector.
        """

        self.num_nodes, self.in_dim = data.x.shape
        if self.batch_size == 0:  # full batch training
            self.batch_size = data.x.shape[0]
            # Decide the mode before any graph processing so every helper
            # sees the same flag.
            self.full_batch = True
            data = self.process_graph(data)
        else:  # mini batch training
            loader = NeighborLoader(data,
                                    self.num_neigh,
                                    batch_size=self.batch_size)
            self.full_batch = False
        num_nodes = data.x.shape[0]

        self.model = self.init_model(**self.kwargs)
        if self.compile_model:
            self.model = compile(self.model)

        # The degree decoder gets its own parameter group with a fixed
        # learning rate; every other parameter uses ``self.lr``.
        degree_param_ids = {id(p) for p in
                            self.model.degree_decoder.parameters()}
        base_params = [p for p in self.model.parameters()
                       if id(p) not in degree_param_ids]
        optimizer = torch.optim.Adam(
            [{'params': base_params},
             {'params': self.model.degree_decoder.parameters(),
              'lr': 1e-2}],
            lr=self.lr,
            weight_decay=self.weight_decay)

        min_loss = float('inf')
        self.arg_min_loss_per_node = None

        self.model.train()
        self.decision_score_ = torch.zeros(num_nodes)
        for epoch in range(1, self.epoch + 1):
            start_time = time.time()
            epoch_loss = 0
            epoch_loss_per_node = torch.zeros(num_nodes)
            # Guard against ``loss_step=0`` (or ``None``), which would
            # otherwise raise on the modulo below.
            if loss_step and epoch % loss_step == 0:
                self.model.lambda_loss2 = self.model.lambda_loss2 + 0.5
                self.model.lambda_loss3 = self.model.lambda_loss3 / 2

            if self.full_batch:  # full batch training
                loss, loss_per_node, h_loss, degree_loss, feature_loss = \
                    self.forward_model(data)

                comp_loss = self.comp_decision_score(loss_per_node,
                                                     h_loss,
                                                     degree_loss,
                                                     feature_loss,
                                                     h_loss_weight,
                                                     degree_loss_weight,
                                                     feature_loss_weight)

                self.decision_score_ = comp_loss.squeeze(1)

                if self.save_emb:
                    # Detach so the stored embedding cannot keep the
                    # autograd graph alive (harmless if the model
                    # already detaches it).
                    self.emb = self.model.emb.cpu().detach()

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                epoch_loss = loss.item() * self.batch_size
                epoch_loss_per_node = loss_per_node.squeeze(1)
            else:  # mini batch training
                for sampled_data in loader:
                    batch_size = sampled_data.batch_size
                    node_idx = sampled_data.n_id
                    # Results are written back for the first
                    # ``batch_size`` entries of ``n_id`` only.
                    seed_idx = node_idx[:batch_size]
                    sampled_data = self.process_graph(sampled_data)

                    loss, loss_per_node, h_loss, degree_loss, \
                        feature_loss = self.forward_model(sampled_data)

                    comp_loss = self.comp_decision_score(
                        loss_per_node,
                        h_loss,
                        degree_loss,
                        feature_loss,
                        h_loss_weight,
                        degree_loss_weight,
                        feature_loss_weight)

                    self.decision_score_[seed_idx] = comp_loss.squeeze(1)

                    if self.save_emb:
                        # Detach before the in-place write so ``self.emb``
                        # does not accumulate one graph per batch.
                        self.emb[seed_idx] = \
                            self.model.emb[:batch_size].cpu().detach()

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    epoch_loss += loss.item() * batch_size
                    epoch_loss_per_node[seed_idx] = \
                        loss_per_node.squeeze(1)

            loss_value = epoch_loss / num_nodes

            if loss_value < min_loss:
                min_loss = loss_value
                self.arg_min_loss_per_node = epoch_loss_per_node

            logger(epoch=epoch,
                   loss=loss_value,
                   score=self.decision_score_,
                   target=label,
                   time=time.time() - start_time,
                   verbose=self.verbose,
                   train=True)

        self._process_decision_score()
        return self

    
    def decision_function(self,
                          data,
                          label=None,
                          h_loss_weight=1.0,
                          degree_loss_weight=0.,
                          feature_loss_weight=2.5):
        """
        Overwrite the decision function from the base model due to the unique
        loss function and decision score from the GADNR paper.
        The three loss term weights must be the same as the fit function if
        ``real_loss`` is ``False``.

        Parameters
        ----------
        data : torch_geometric.data.Data
            The graph to score.
        label : torch.Tensor, optional
            The optional outlier ground truth labels, passed to the
            logger only. Default: ``None``.
        h_loss_weight : float, optional
            Weight of the neighborhood reconstruction loss term in the
            weighted decision score. Default: ``1.0``.
        degree_loss_weight : float, optional
            Weight of the node degree reconstruction loss term in the
            weighted decision score. Default: ``0.``.
        feature_loss_weight : float, optional
            Weight of the node feature reconstruction loss term in the
            weighted decision score. Default: ``2.5``.

        Returns
        -------
        outlier_score : torch.Tensor
            One decision score per node of ``data``.

        Raises
        ------
        ValueError
            If the detector was fitted in full batch mode and ``data``
            has a different number of nodes than the training graph.
        """
        num_nodes = data.x.shape[0]

        if self.full_batch:  # full batch inference
            if self.batch_size != num_nodes:
                raise ValueError('The input data should have the same '
                                 'number of nodes as the training data '
                                 'under the full batch mode. To test on '
                                 'the data with different number of '
                                 'nodes, please use the mini-batch mode.')
            data = self.process_graph(data)
        else:  # mini batch inference
            loader = NeighborLoader(data,
                                    self.num_neigh,
                                    batch_size=self.batch_size)

        self.model.eval()
        outlier_score = torch.zeros(num_nodes)
        if self.save_emb:
            if isinstance(self.hid_dim, tuple):
                self.emb = (torch.zeros(num_nodes, self.hid_dim[0]),
                            torch.zeros(num_nodes, self.hid_dim[1]))
            else:
                self.emb = torch.zeros(num_nodes, self.hid_dim)

        start_time = time.time()
        # Accumulated as in ``fit`` so the logged loss covers every batch
        # and stays defined even if the loader yields nothing.
        test_loss = 0.
        # Scoring needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            # Branch on the same flag used for the setup above, so the
            # loader and processed graph always match the path taken.
            if self.full_batch:  # full batch inference
                loss, loss_per_node, h_loss, degree_loss, feature_loss = \
                    self.forward_model(data)
                comp_loss = self.comp_decision_score(loss_per_node,
                                                     h_loss,
                                                     degree_loss,
                                                     feature_loss,
                                                     h_loss_weight,
                                                     degree_loss_weight,
                                                     feature_loss_weight)
                outlier_score = comp_loss.squeeze(1)
                if self.save_emb:
                    self.emb = self.model.emb.cpu()
                test_loss = loss.item() * self.batch_size
            else:  # mini batch inference
                for sampled_data in loader:
                    batch_size = sampled_data.batch_size
                    node_idx = sampled_data.n_id
                    seed_idx = node_idx[:batch_size]
                    sampled_data = self.process_graph(sampled_data)
                    loss, loss_per_node, h_loss, degree_loss, \
                        feature_loss = self.forward_model(sampled_data)
                    comp_loss = self.comp_decision_score(
                        loss_per_node,
                        h_loss,
                        degree_loss,
                        feature_loss,
                        h_loss_weight,
                        degree_loss_weight,
                        feature_loss_weight)
                    outlier_score[seed_idx] = comp_loss.squeeze(1)
                    if self.save_emb:
                        # Fill the buffer allocated above, as ``fit``
                        # does; otherwise it would stay all zeros.
                        self.emb[seed_idx] = \
                            self.model.emb[:batch_size].cpu()
                    test_loss += loss.item() * batch_size

        logger(loss=test_loss / num_nodes,
               score=outlier_score,
               target=label,
               time=time.time() - start_time,
               verbose=self.verbose,
               train=False)
        return outlier_score