Source code for pygod.models.guide

# -*- coding: utf-8 -*-
"""Higher-order Structure based Anomaly Detection on Attributed
    Networks (GUIDE)"""
# Author: Kay Liu <zliu234@uic.edu>
# License: BSD 2 clause

import os
import torch
import hashlib
import numpy as np
import networkx as nx
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.utils import softmax
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops
from torch_geometric.loader import NeighborLoader
from networkx.generators.atlas import graph_atlas_g
from sklearn.utils.validation import check_is_fitted

from . import BaseDetector
from .basic_nn import GCN
from ..utils.utility import validate_device
from ..utils.metric import eval_roc_auc


[docs]class GUIDE(BaseDetector): """ GUIDE (Higher-order Structure based Anomaly Detection on Attributed Networks) GUIDE is an anomaly detector consisting of an attribute graph convolutional autoencoder, and a structure graph attentive autoencoder (not same as the graph attention networks). Instead of adjacency matrix, node motif degree (graphlet degree is used in this implementation by default) is used as input of structure autoencoder. The reconstruction mean square error of the autoencoders are defined as structure anomaly score and attribute anomaly score, respectively. Note: The calculation of node motif degree in preprocesing has high time complexity. It may take longer than you expect. See :cite:`yuan2021higher` for details. Parameters ---------- a_hid : int, optional Hidden dimension for attribute autoencoder. Default: ``32``. s_hid : int, optional Hidden dimension for structure autoencoder. Default: ``4``. num_layers : int, optional Total number of layers in autoencoders. Default: ``4``. dropout : float, optional Dropout rate. Default: ``0.``. weight_decay : float, optional Weight decay (L2 penalty). Default: ``0.``. act : callable activation function or None, optional Activation function if not None. Default: ``torch.nn.functional.relu``. alpha : float, optional Loss balance weight for attribute and structure. Default: ``0.5``. contamination : float, optional Valid in (0., 0.5). The proportion of outliers in the data set. Used when fitting to define the threshold on the decision function. Default: ``0.1``. lr : float, optional Learning rate. Default: ``0.004``. epoch : int, optional Maximum number of training epoch. Default: ``10``. gpu : int GPU Index, -1 for using CPU. Default: ``0``. batch_size : int, optional Minibatch size, 0 for full batch training. Default: ``0``. num_neigh : int, optional Number of neighbors in sampling, -1 for all neighbors. Default: ``-1``. graphlet_size : int The maximum graphlet size used to compute structure input. Default: ``4``. selected_motif : bool Use selected motifs which are defined in the original paper. Default: ``True``. cache_dir : str The directory for the node motif degree caching. Default: ``None``. verbose : bool Verbosity mode. Turn on to print out log information. Default: ``False``. Examples -------- >>> from pygod.models import GUIDE >>> model = GUIDE() >>> model.fit(data) # PyG graph data object >>> prediction = model.predict(data) """ def __init__(self, a_hid=32, s_hid=4, num_layers=4, dropout=0.3, weight_decay=0., act=F.relu, alpha=0.1, contamination=0.1, lr=0.001, epoch=10, gpu=0, batch_size=0, num_neigh=-1, graphlet_size=4, selected_motif=True, cache_dir=None, verbose=False): super(GUIDE, self).__init__(contamination=contamination) # model param self.a_hid = a_hid self.s_hid = s_hid self.num_layers = num_layers self.dropout = dropout self.weight_decay = weight_decay self.act = act self.alpha = alpha # training param self.lr = lr self.epoch = epoch self.device = validate_device(gpu) self.batch_size = batch_size self.num_neigh = num_neigh # other param self.graphlet_size = graphlet_size if selected_motif: assert self.graphlet_size == 4, \ "Graphlet size is fixed when using selected motif" self.selected_motif = selected_motif self.verbose = verbose self.model = None if cache_dir is None: cache_dir = os.path.join(os.path.expanduser('~'), '.pygod') self.cache_dir = cache_dir
[docs] def fit(self, G, y_true=None): """ Fit detector with input data. Parameters ---------- G : torch_geometric.data.Data The input data. y_true : numpy.ndarray, optional The optional outlier ground truth labels used to monitor the training progress. They are not used to optimize the unsupervised model. Default: ``None``. Returns ------- self : object Fitted estimator. """ G.node_idx = torch.arange(G.x.shape[0]) G.s = self._get_nmf(G, self.cache_dir) if self.batch_size == 0: self.batch_size = G.x.shape[0] loader = NeighborLoader(G, [self.num_neigh] * self.num_layers, batch_size=self.batch_size) self.model = GUIDE_Base(a_dim=G.x.shape[1], s_dim=G.s.shape[1], a_hid=self.a_hid, s_hid=self.s_hid, num_layers=self.num_layers, dropout=self.dropout, act=self.act).to(self.device) optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay) self.model.train() decision_scores = np.zeros(G.x.shape[0]) for epoch in range(self.epoch): epoch_loss = 0 for sampled_data in loader: batch_size = sampled_data.batch_size node_idx = sampled_data.node_idx x, s, edge_index = self.process_graph(G) x_, s_ = self.model(x, s, edge_index) score = self.loss_func(x[:batch_size], x_[:batch_size], s[:batch_size], s_[:batch_size]) decision_scores[node_idx[:batch_size]] = score.detach() \ .cpu().numpy() loss = torch.mean(score) epoch_loss += loss.item() * batch_size optimizer.zero_grad() loss.backward() optimizer.step() if self.verbose: print("Epoch {:04d}: Loss {:.4f}" .format(epoch, epoch_loss / G.x.shape[0]), end='') if y_true is not None: auc = eval_roc_auc(y_true, decision_scores) print(" | AUC {:.4f}".format(auc), end='') print() self.decision_scores_ = decision_scores self._process_decision_scores() return self
[docs] def decision_function(self, G): """ Predict raw anomaly score using the fitted detector. Outliers are assigned with larger anomaly scores. Parameters ---------- G : PyTorch Geometric Data instance (torch_geometric.data.Data) The input data. Returns ------- outlier_scores : numpy.ndarray The anomaly score of shape :math:`N`. """ check_is_fitted(self, ['model']) G.node_idx = torch.arange(G.x.shape[0]) G.s = self._get_nmf(G, self.cache_dir) loader = NeighborLoader(G, [self.num_neigh] * self.num_layers, batch_size=self.batch_size) self.model.eval() outlier_scores = np.zeros(G.x.shape[0]) for sampled_data in loader: batch_size = sampled_data.batch_size node_idx = sampled_data.node_idx x, s, edge_index = self.process_graph(sampled_data) x_, s_ = self.model(x, s, edge_index) score = self.loss_func(x[:batch_size], x_[:batch_size], s[:batch_size], s_[:batch_size]) outlier_scores[node_idx[:batch_size]] = score.detach()\ .cpu().numpy() return outlier_scores
def process_graph(self, G): """ Process the raw PyG data object into a tuple of sub data objects needed for the model. Parameters ---------- G : PyTorch Geometric Data instance (torch_geometric.data.Data) The input data. Returns ------- x : torch.Tensor Attribute (feature) of nodes. s : torch.Tensor Structure matrix (node motif degree/graphlet degree) edge_index : torch.Tensor Edge list of the graph. """ edge_index = G.edge_index.to(self.device) s = G.s.to(self.device) x = G.x.to(self.device) return x, s, edge_index def loss_func(self, x, x_, s, s_): # attribute reconstruction loss diff_attribute = torch.pow(x - x_, 2) attribute_errors = torch.sqrt(torch.sum(diff_attribute, 1)) # structure reconstruction loss diff_structure = torch.pow(s - s_, 2) structure_errors = torch.sqrt(torch.sum(diff_structure, 1)) score = self.alpha * attribute_errors + (1 - self.alpha) * \ structure_errors return score def _get_nmf(self, G, cache_dir): """ Calculation of Node Motif Degree / Graphlet Degree Distribution. Part of this function is adapted from https://github.com/benedekrozemberczki/OrbitalFeatures. Parameters ---------- G : PyTorch Geometric Data instance (torch_geometric.data.Data) The input data. cache_dir : str The directory for the node motif degree caching Returns ------- s : torch.Tensor Structure matrix (node motif degree/graphlet degree) """ hash_func = hashlib.sha1() hash_func.update(str(G).encode('utf-8')) file_name = 'nmd_' + str(hash_func.hexdigest()[:8]) + \ str(self.graphlet_size) + \ str(self.selected_motif)[0] + '.pt' file_path = os.path.join(cache_dir, file_name) if os.path.exists(file_path): s = torch.load(file_path) else: edge_index = G.edge_index g = nx.from_edgelist(edge_index.T.tolist()) # create edge subsets edge_subsets = dict() subsets = [[edge[0], edge[1]] for edge in g.edges()] edge_subsets[2] = subsets unique_subsets = dict() for i in range(3, self.graphlet_size + 1): for subset in subsets: for node in subset: for neb in g.neighbors(node): new_subset = subset + [neb] if len(set(new_subset)) == i: new_subset.sort() unique_subsets[tuple(new_subset)] = 1 subsets = [list(k) for k, v in unique_subsets.items()] edge_subsets[i] = subsets unique_subsets = dict() # enumerate graphs graphs = graph_atlas_g() interesting_graphs = {i: [] for i in range(2, self.graphlet_size + 1)} for graph in graphs: if 1 < graph.number_of_nodes() < self.graphlet_size + 1: if nx.is_connected(graph): interesting_graphs[graph.number_of_nodes()].append( graph) # enumerate categories main_index = 0 categories = dict() for size, graphs in interesting_graphs.items(): categories[size] = dict() for index, graph in enumerate(graphs): categories[size][index] = dict() degrees = list( set([graph.degree(node) for node in graph.nodes()])) for degree in degrees: categories[size][index][degree] = main_index main_index += 1 unique_motif_count = main_index # setup feature features = {node: {i: 0 for i in range(unique_motif_count)} for node in g.nodes()} for size, node_lists in edge_subsets.items(): graphs = interesting_graphs[size] for nodes in node_lists: sub_gr = g.subgraph(nodes) for index, graph in enumerate(graphs): if nx.is_isomorphic(sub_gr, graph): for node in sub_gr.nodes(): features[node][categories[size][index][ sub_gr.degree(node)]] += 1 break motifs = [[n] + [features[n][i] for i in range( unique_motif_count)] for n in g.nodes()] motifs = torch.Tensor(motifs) motifs = motifs[torch.sort(motifs[:, 0]).indices, 1:] if self.selected_motif: # use motif selected in the original paper only s = torch.zeros((G.x.shape[0], 6)) # m31 s[:, 0] = motifs[:, 3] # m32 s[:, 1] = motifs[:, 1] + motifs[:, 2] # m41 s[:, 2] = motifs[:, 14] # m42 s[:, 3] = motifs[:, 12] + motifs[:, 13] # m43 s[:, 4] = motifs[:, 11] # node degree s[:, 5] = motifs[:, 0] else: # use graphlet degree s = motifs if not os.path.exists(cache_dir): os.makedirs(cache_dir) torch.save(s, file_path) return s
class GUIDE_Base(nn.Module): def __init__(self, a_dim, s_dim, a_hid, s_hid, num_layers, dropout, act): super(GUIDE_Base, self).__init__() self.attr_ae = GCN(in_channels=a_dim, hidden_channels=a_hid, num_layers=num_layers, out_channels=a_dim, dropout=dropout, act=act) self.struct_ae = GNA(in_channels=s_dim, hidden_channels=s_hid, num_layers=num_layers, out_channels=s_dim, dropout=dropout, act=act) def forward(self, x, s, edge_index): x_ = self.attr_ae(x, edge_index) s_ = self.struct_ae(s, edge_index) return x_, s_ class GNA(torch.nn.Module): def __init__(self, in_channels, hidden_channels, num_layers, out_channels, dropout, act): super().__init__() self.layers = torch.nn.ModuleList() self.layers.append(GNAConv(in_channels, hidden_channels)) for layer in range(num_layers - 2): self.layers.append(GNAConv(hidden_channels, hidden_channels)) self.layers.append(GNAConv(hidden_channels, out_channels)) self.dropout = dropout self.act = act def forward(self, s, edge_index): for layer in self.layers: s = layer(s, edge_index) s = F.dropout(s, self.dropout, training=self.training) s = self.act(s) return s class GNAConv(MessagePassing): def __init__(self, in_channels, out_channels): super().__init__(aggr='add') self.w1 = torch.nn.Linear(in_channels, out_channels) self.w2 = torch.nn.Linear(in_channels, out_channels) self.a = nn.Parameter(torch.randn(out_channels, 1)) def forward(self, s, edge_index): edge_index, _ = add_self_loops(edge_index, num_nodes=s.size(0)) out = self.propagate(edge_index, s=self.w2(s)) return self.w1(s) + out def message(self, s_i, s_j, edge_index): alpha = (s_i - s_j) @ self.a alpha = softmax(alpha, edge_index[1], num_nodes=s_i.shape[0]) return alpha * s_j