Source code for pygod.models.one

# -*- coding: utf-8 -*-
"""Outlier Aware Network Embedding for Attributed Networks (ONE)
"""
# Author: Xiyang Hu <xiyanghu@cmu.edu>
# License: BSD 2 clause

import time
import gc
import numpy as np
import networkx as nx
from sklearn.decomposition import NMF

from sklearn.utils.validation import check_is_fitted
from torch_geometric.utils import to_scipy_sparse_matrix, to_networkx

from . import BaseDetector
from ..metrics import eval_roc_auc

gc.enable()


# todo: to optimize later
# @njit
def calculate_G(G_mat, alpha, outl1, H, A, gamma, outl3, U, W):
    # The update rule for G_mat[i,k]
    for i in range(G_mat.shape[0]):
        for k in range(G_mat.shape[1]):
            Gik_numer = alpha * np.log(
                np.reciprocal(outl1[i])) * np.dot(H[k, :], (
                    A[i, :] - (
                    np.matmul(G_mat[i], H) - np.multiply(
                G_mat[i, k], H[k, :])))) \
                        + gamma * np.log(
                np.reciprocal(outl3[i])) * np.dot(U[i],
                                                  W[k, :])

            Gik_denom = alpha * np.log(
                np.reciprocal(outl1[i])) * np.dot(H[k, :],
                                                  H[k, :]) + \
                        gamma * np.log(np.reciprocal(outl3[i]))

            G_mat[i, k] = Gik_numer / Gik_denom
            return G_mat


# todo: due to the original paper has very complex loss, this algorithm is not
# in PyTorch yet. Need NetworkX for it.
[docs]class ONE(BaseDetector): """ ONE (Outlier Aware Network Embedding for Attributed Networks) See :cite:`bandyopadhyay2019outlier` for details. Parameters ---------- K : int, optional Every vertex is a K dimensional vector, K < min(N, D). Default: ``36``. iter : int, optional Number of outer Iterations for optimization. Default: ``5``. contamination : float, optional Valid in (0., 0.5). The proportion of outliers in the data set. Used when fitting to define the threshold on the decision function. Default: ``0.1``. verbose : bool Verbosity mode. Turn on to print out log information. Default: ``False``. Examples -------- >>> from pygod.models import ONE >>> model = ONE() >>> model.fit(data) # PyG graph data object >>> prediction = model.predict(data) """ def __init__(self, K=36, iter=5, contamination=0.1, verbose=False): super(ONE, self).__init__(contamination=contamination) self.K = K self.iter = iter # other param self.verbose = verbose
[docs] def fit(self, G, y_true=None): """ Fit detector with input data. Parameters ---------- G : torch_geometric.data.Data The input data. y_true : numpy.ndarray, optional The optional outlier ground truth labels used to monitor the training progress. They are not used to optimize the unsupervised model. Default: ``None``. Returns ------- self : object Fitted estimator. """ A, C = self.process_graph(G) assert A.shape[0] == C.shape[0] K = self.K if self.verbose: print("Number of Dimensions : ", K) self.W = np.eye(K) if self.verbose: print('Dimension of C: {}, {}'.format(C.shape[0], C.shape[1])) gc.collect() opti_values = [] runtime = [] self.mu = 1 gc.collect() start_time = time.time() model = NMF(n_components=K, init='random', random_state=0) # ensure A is non-negative if A.min() < 0: A = A + A.min() * -1 self.G_mat = model.fit_transform(A) self.H = model.components_ model = NMF(n_components=K, init='random', random_state=0) # ensure C is non-negative if C.min() < 0: C = C + C.min() * -1 self.U = model.fit_transform(C) self.V = model.components_ outl1 = outl2 = outl3 = np.ones((A.shape[0])) G = to_networkx(G) bet = nx.betweenness_centrality(G) for i in range(len(outl1)): outl1[i] = float(1) / A.shape[0] + bet[i] outl2[i] = float(1) / A.shape[0] outl3[i] = float(1) / A.shape[0] + bet[i] outl1 = outl1 / sum(outl1) outl2 = outl2 / sum(outl2) outl3 = outl3 / sum(outl3) count_outer = self.iter temp1 = A - np.matmul(self.G_mat, self.H) temp1 = np.multiply(temp1, temp1) temp1 = np.multiply(np.log(np.reciprocal(outl1)), np.sum(temp1, axis=1)) temp1 = np.sum(temp1) temp2 = C - np.matmul(self.U, self.V) temp2 = np.multiply(temp2, temp2) temp2 = np.multiply(np.log(np.reciprocal(outl2)), np.sum(temp2, axis=1)) temp2 = np.sum(temp2) temp3 = self.G_mat.T - np.matmul(self.W, self.U.T) temp3 = np.multiply(temp3, temp3) temp3 = np.multiply(np.log(np.reciprocal(outl3)), np.sum(temp3, axis=0).T) temp3 = np.sum(temp3) self.alpha = 1 self.beta = temp1 / temp2 self.gamma = min(2 * self.beta, temp3) for opti_iter in range(count_outer): if self.verbose: print('Loop {} started: \n'.format(opti_iter)) print("The function values which we are interested are : ") self.calc_lossValues(A, C, self.G_mat, self.H, self.U, self.V, self.W, outl1, outl2, outl3, self.alpha, self.beta, self.gamma) # The update rule for G_mat[i,k] for i in range(self.G_mat.shape[0]): for k in range(self.G_mat.shape[1]): Gik_numer = self.alpha * np.log( np.reciprocal(outl1[i])) * np.dot(self.H[k, :], ( A[i, :] - ( np.matmul(self.G_mat[i], self.H) - np.multiply( self.G_mat[i, k], self.H[k, :])))) \ + self.gamma * np.log( np.reciprocal(outl3[i])) * np.dot(self.U[i], self.W[k, :]) Gik_denom = self.alpha * np.log( np.reciprocal(outl1[i])) * np.dot(self.H[k, :], self.H[k, :]) + \ self.gamma * np.log(np.reciprocal(outl3[i])) self.G_mat[i, k] = Gik_numer / Gik_denom # self.G_mat = calculate_G(self.G_mat, self.alpha, outl1, # self.H, A, self.gamma, outl3, self.U, self.W) self.calc_lossValues(A, C, self.G_mat, self.H, self.U, self.V, self.W, outl1, outl2, outl3, self.alpha, self.beta, self.gamma) if self.verbose: print('Done for G_mat') # The update rule for H[k,j] for k in range(self.H.shape[0]): for j in range(self.H.shape[1]): Hkj_numer = self.alpha * np.dot( np.multiply(np.log(np.reciprocal(outl1)), self.G_mat[:, k]), \ (A[:, j] - (np.matmul(self.G_mat, self.H[:, j]) - np.multiply( self.G_mat[:, k], self.H[k, j])))) Hkj_denom = self.alpha * ( np.dot(np.log(np.reciprocal(outl1)), np.multiply(self.G_mat[:, k], self.G_mat[:, k]))) self.H[k, j] = Hkj_numer / Hkj_denom self.calc_lossValues(A, C, self.G_mat, self.H, self.U, self.V, self.W, outl1, outl2, outl3, self.alpha, self.beta, self.gamma) if self.verbose: print('Done for H') # The up[update rule for U[i,k] for i in range(self.U.shape[0]): for k in range(self.U.shape[1]): Uik_numer_1 = self.beta * np.log( np.reciprocal(outl2[i])) * \ (np.dot(self.V[k, :], (C[i] - (np.matmul( self.U[i, :], self.V) - np.multiply( self.U[i, k], self.V[k, :]))))) Uik_numer_2 = self.gamma * np.log( np.reciprocal(outl3[i])) * np.dot( (self.G_mat[i, :] - (np.matmul(self.U[i, :], self.W) - np.multiply( self.U[i, k], self.W[:, k]))), self.W[:, k]) Uik_denom = self.beta * np.log( np.reciprocal(outl2[i])) * np.dot(self.V[k, :], self.V[k, :] ) + self.gamma * \ np.log(np.reciprocal(outl3[i])) * np.dot( self.W[:, k], self.W[:, k]) self.U[i, k] = (Uik_numer_1 + Uik_numer_2) / Uik_denom self.calc_lossValues(A, C, self.G_mat, self.H, self.U, self.V, self.W, outl1, outl2, outl3, self.alpha, self.beta, self.gamma) if self.verbose: print('Done for U') # The update rule for V[k,d] for k in range(self.V.shape[0]): for d in range(self.V.shape[1]): Vkd_numer = self.beta * np.dot( np.multiply(np.log(np.reciprocal(outl2)), self.U[:, k]), (C[:, d] - (np.matmul( self.U, self.V[:, d]) - np.multiply( self.U[:, k], self.V[k, d])))) Vkd_denom = self.beta * ( np.dot(np.log(np.reciprocal(outl2)), np.multiply(self.U[:, k], self.U[:, k]))) self.V[k][d] = Vkd_numer / Vkd_denom self.calc_lossValues(A, C, self.G_mat, self.H, self.U, self.V, self.W, outl1, outl2, outl3, self.alpha, self.beta, self.gamma) if self.verbose: print('Done for V') # The Update rule for W[p,q] logoi = np.log(np.reciprocal(outl3)) sqrt_logoi = np.sqrt(logoi) sqrt_logoi = np.tile(sqrt_logoi, (K, 1)) assert (sqrt_logoi.shape == self.G_mat.T.shape) term1 = np.multiply(sqrt_logoi, self.G_mat.T) term2 = np.multiply(sqrt_logoi, self.U.T) svd_matrix = np.matmul(term1, term2.T) svd_u, svd_sigma, svd_vt = np.linalg.svd(svd_matrix) self.W = np.matmul(svd_u, svd_vt) self.calc_lossValues(A, C, self.G_mat, self.H, self.U, self.V, self.W, outl1, outl2, outl3, self.alpha, self.beta, self.gamma) if self.verbose: print('Done for W') # The update rule for outl outl1, outl2, outl3 = self.cal_outlierScore(A, C) self.calc_lossValues(A, C, self.G_mat, self.H, self.U, self.V, self.W, outl1, outl2, outl3, self.alpha, self.beta, self.gamma) if self.verbose: print('Done for outlier score') print('Loop {} ended: \n'.format(opti_iter)) if self.verbose: if y_true is not None: auc = eval_roc_auc(y_true, outl2) print(" | AUC {:.4f}".format(auc), end='') print() # Use outl2 as the outlier score. # In the paper: # "O2 is more important to determine outliers." self.decision_scores_ = outl2 self._process_decision_scores() return self
[docs] def decision_function(self, G): """ Predict raw anomaly score using the fitted detector. Outliers are assigned with larger anomaly scores. Parameters ---------- G : PyTorch Geometric Data instance (torch_geometric.data.Data) The input data. Returns ------- outl2 : numpy.ndarray The anomaly score of shape :math:`N`. """ check_is_fitted(self, ['W', 'G_mat', 'H', 'U', 'V']) A, C = self.process_graph(G) C = C - C.min() + 1 # NMF requires the matrix to be non-negative. _, outl2, _ = self.cal_outlierScore(A, C) return outl2
def process_graph(self, G): """ Process the raw PyG data object into a tuple of sub data objects needed for the model. Parameters ---------- G : PyTorch Geometric Data instance (torch_geometric.data.Data) The input graph. Returns ------- A : numpy.array The adjacency matrix. C : numpy.array The node attribute matrix. """ # todo: need some assert or try/catch to make sure certain attributes # are presented. A = to_scipy_sparse_matrix(G['edge_index']).toarray().astype( 'float64') C = G['x'].numpy().astype('float64') # return data objects needed for the network return A, C def calc_lossValues(self, A, C, G_mat, H, U, V, W, outl1, outl2, outl3, alpha, beta, gamma): """ Calculate the loss. This function is called inside the fit() function. Parameters ---------- Multiple variables inside the fit() function. Returns ------- None """ temp1 = A - np.matmul(G_mat, H) temp1 = np.multiply(temp1, temp1) temp1 = np.multiply(np.log(np.reciprocal(outl1)), np.sum(temp1, axis=1)) temp1 = np.sum(temp1) temp2 = C - np.matmul(U, V) temp2 = np.multiply(temp2, temp2) temp2 = np.multiply(np.log(np.reciprocal(outl2)), np.sum(temp2, axis=1)) temp2 = np.sum(temp2) temp3 = G_mat.T - np.matmul(W, U.T) temp3 = np.multiply(temp3, temp3) temp3 = np.multiply(np.log(np.reciprocal(outl3)), np.sum(temp3, axis=0).T) temp3 = np.sum(temp3) if self.verbose: print('\t Component values: {},{} and {}'.format(temp1, temp2, temp3)) func_value = alpha * temp1 + beta * temp2 + gamma * temp3 if self.verbose: print('\t Total Function value {}'.format(func_value)) def cal_outlierScore(self, A, C): """ Calculate the outlier scores. Parameters ---------- A : numpy.array The adjacency matrix. C : numpy.array The node attribute matrix. Returns ------- outlier_scores : Tuple(numpy.array, numpy.array, numpy.array) Three sets of outlier scores from three different layers. """ GH = np.matmul(self.G_mat, self.H) UV = np.matmul(self.U, self.V) WUTrans = np.matmul(self.W, self.U.T) outl1_numer = self.alpha * (np.multiply((A - GH), (A - GH))).sum( axis=1) outl1_denom = self.alpha * pow(np.linalg.norm((A - GH), 'fro'), 2) outl1_numer *= self.mu outl1 = outl1_numer / outl1_denom outl2_numer = self.beta * (np.multiply((C - UV), (C - UV))).sum(axis=1) outl2_denom = self.beta * pow(np.linalg.norm((C - UV), 'fro'), 2) outl2_numer *= self.mu outl2 = outl2_numer / outl2_denom outl3_numer = self.gamma * ( np.multiply((self.G_mat.T - WUTrans), (self.G_mat.T - WUTrans))).sum( axis=0).T outl3_denom = self.gamma * pow( np.linalg.norm((self.G_mat.T - WUTrans), 'fro'), 2) outl3_numer *= self.mu outl3 = outl3_numer / outl3_denom return outl1, outl2, outl3