Source code for pygod.generator.outlier_generator

# -*- coding: utf-8 -*-
"""
This file includes functions that generate different types of outliers
from an input dataset, for benchmarking.
"""
# Author: Yingtong Dou <ytongdou@gmail.com>, Kay Liu <zliu234@uic.edu>
# License: BSD 2 clause

import torch
from torch_geometric.data import Data

from ..utils.utility import check_parameter


def gen_structural_outlier(data, m, n, p=0, directed=False, seed=None):
    """Generating structural outliers according to paper
    :cite:`ding2019deep`. We randomly select ``m`` nodes from the network
    and then make those nodes fully connected, and then all the ``m``
    nodes in the clique are regarded as outliers. We iteratively repeat
    this process until a number of ``n`` cliques are generated and the
    total number of structural outliers is ``m * n``.

    The injected edges are appended to ``data.edge_index`` on the object
    that was passed in, i.e. ``data`` is modified in place and the same
    object is returned.

    Parameters
    ----------
    data : torch_geometric.data.Data
        The input data.
    m : int
        Number nodes in the outlier cliques.
    n : int
        Number of outlier cliques.
    p : float, optional
        Probability of edge drop in cliques. Must lie in ``[0, 1]``.
        Default: ``0``.
    directed : bool, optional
        Whether the edges added are directed. Default: ``False``.
    seed : int, optional
        The seed to control the randomness. Any integer, including ``0``,
        seeds the global torch generator. Default: ``None``.

    Returns
    -------
    data : torch_geometric.data.Data
        The structural outlier graph with injected edges.
    y_outlier : torch.Tensor
        The outlier label tensor where 1 represents outliers and 0
        represents normal nodes.

    Raises
    ------
    TypeError
        If ``data`` is not a ``torch_geometric.data.Data``.
    ValueError
        If ``m`` or ``n`` is not an int, if ``p`` is outside ``[0, 1]``,
        or if ``check_parameter`` rejects ``m``, ``n`` or ``m * n``.
    """
    if not isinstance(data, Data):
        raise TypeError("data should be torch_geometric.data.Data")

    if isinstance(m, int):
        check_parameter(m, low=0, high=data.num_nodes, param_name='m')
    else:
        raise ValueError("m should be int, got %s" % m)

    if isinstance(n, int):
        check_parameter(n, low=0, high=data.num_nodes, param_name='n')
    else:
        raise ValueError("n should be int, got %s" % n)

    check_parameter(m * n, low=0, high=data.num_nodes, param_name='m*n')

    # A drop probability outside [0, 1] would silently produce a
    # meaningless slice length below, so reject it up front.
    if not 0 <= p <= 1:
        raise ValueError("p should be a probability in [0, 1], got %s" % p)

    # ``is not None`` rather than truthiness so that seed=0 is honoured.
    if seed is not None:
        torch.manual_seed(seed)

    outlier_idx = torch.randperm(data.num_nodes)[:m * n]

    # Connect all m nodes in each clique. The reshape keeps an empty
    # combinations result two-dimensional, so the flip/transpose below
    # still work if a clique yields no node pairs.
    new_edges = torch.cat([
        torch.combinations(outlier_idx[m * i: m * (i + 1)]).reshape(-1, 2)
        for i in range(n)
    ])

    # Drop edges with probability p by keeping a random (1 - p) fraction.
    if p != 0:
        num_keep = int((1 - p) * len(new_edges))
        indices = torch.randperm(len(new_edges))[:num_keep]
        new_edges = new_edges[indices]

    # Size the labels from num_nodes, the same quantity the outlier
    # indices were sampled from.
    y_outlier = torch.zeros(data.num_nodes, dtype=torch.long)
    y_outlier[outlier_idx] = 1

    # For an undirected graph add the reverse direction of every edge.
    if not directed:
        new_edges = torch.cat([new_edges, new_edges.flip(1)], dim=0)

    # The new edges are built on the default device; move them next to the
    # existing edge_index before concatenating.
    new_edges = new_edges.T.to(data.edge_index.device)
    data.edge_index = torch.cat([data.edge_index, new_edges], dim=1)

    return data, y_outlier
def gen_contextual_outlier(data, n, k, seed=None):
    r"""Generating contextual outliers according to paper
    :cite:`ding2019deep`. We randomly select ``n`` nodes as the attribute
    perturbation candidates. For each selected node :math:`i`, we randomly
    pick another ``k`` nodes from the data and select the node :math:`j`
    whose attributes :math:`x_j` deviate the most from node :math:`i`'s
    attribute :math:`x_i` among ``k`` nodes by maximizing the Euclidean
    distance :math:`\| x_i - x_j \|`. Afterwards, we then substitute the
    attributes :math:`x_i` of node :math:`i` to :math:`x_j`.

    The ``k`` candidates are drawn only from nodes that were not selected
    as outliers, so a node is never compared against itself or against an
    outlier whose attributes have already been replaced.

    ``data.x`` is overwritten in place for the selected nodes, and the
    same ``data`` object is returned.

    Parameters
    ----------
    data : torch_geometric.data.Data
        The input data.
    n : int
        Number of nodes converting to outliers.
    k : int
        Number of candidate nodes for each outlier node.
    seed : int, optional
        The seed to control the randomness. Any integer, including ``0``,
        seeds the global torch generator. Default: ``None``.

    Returns
    -------
    data : torch_geometric.data.Data
        The contextual outlier graph with modified node attributes.
    y_outlier : torch.Tensor
        The outlier label tensor where 1 represents outliers and 0
        represents normal nodes.

    Raises
    ------
    TypeError
        If ``data`` is not a ``torch_geometric.data.Data``.
    ValueError
        If ``n`` or ``k`` is not an int, or if ``check_parameter``
        rejects either value.
    """
    if not isinstance(data, Data):
        raise TypeError("data should be torch_geometric.data.Data")

    if isinstance(n, int):
        check_parameter(n, low=0, high=data.num_nodes, param_name='n')
    else:
        raise ValueError("n should be int, got %s" % n)

    if isinstance(k, int):
        check_parameter(k, low=0, high=data.num_nodes - n, param_name='k')
    else:
        raise ValueError("k should be int, got %s" % k)

    # ``is not None`` rather than truthiness so that seed=0 is honoured.
    if seed is not None:
        torch.manual_seed(seed)

    # One permutation split in two: the first n nodes become outliers and
    # the rest form the pool that candidates are sampled from. This matches
    # the ``num_nodes - n`` upper bound used for k above.
    node_idx = torch.randperm(data.num_nodes)
    outlier_idx = node_idx[:n]
    candidate_pool = node_idx[n:]

    for idx in outlier_idx:
        # Sample k distinct candidates from the non-outlier pool.
        picks = torch.randperm(len(candidate_pool))[:k]
        candidate_idx = candidate_pool[picks]

        # Distance from this node (as a 1-row matrix) to every candidate.
        euclidean_dist = torch.cdist(data.x[idx].unsqueeze(0),
                                     data.x[candidate_idx])

        # Replace the node's attributes with those of the farthest
        # candidate.
        max_dist_idx = torch.argmax(euclidean_dist, dim=1)
        max_dist_node = candidate_idx[max_dist_idx]
        data.x[idx] = data.x[max_dist_node]

    y_outlier = torch.zeros(data.num_nodes, dtype=torch.long)
    y_outlier[outlier_idx] = 1

    return data, y_outlier
# TODO add gen_joint_structural_outliers from GAD-NR