Source code for spalor.models.robust_pca

from spalor.algorithms.rpca_algorithms import *
from scipy.sparse.linalg import svds
[docs]class RPCA():
    '''
    Robust Principal Component Analysis.  

    Simultaniously performs PCA while identifying and correcting outliers.

    See the `user guide <http://www.spalor.org/user_guide/rpca>` for a detailed description

    Parameters
    ----------
    n_components : int
        Number of principle components to solve for, that is, the rank of the matrix to be completed. If set to a number between 0 ad 1, the parameter will be taken to be the ratio of the smallest singular value to the largest.
    
    solver : {'lmafit', 'svt', 'alt_min', 'alt_proj'}, default='lmafit'
        solver to use  see ../algorithms/mc_algorithms            

    lambda : float, must be larger than 0, default 0.5
        Regularization parameter.  Only used if solver='svt' or 'apgd'.

        Increasing the parameter reduces overfiting, but may lead to estimaiton bias towards zero, particularly with solver='svt'
    
    tol : float, default=1e-6
        Stopping  criteria for matrix completion solver.

    Attributes
    -----------
    d1 : int
        Number of rows in matrix (typically, the number of samples in the dataset)

    d2 : int
        Number of columns in the matrix (typically, the number of features in the dataset)

    U : ndarray of size (d1, n_components)
        left singular vectors

    S : ndarray of size (n_components,)
        singular values

    V : ndarray of size (d2, n_components)
        right singular vectors. Often, these are the prinicipal component axes, or the basis

    T : ndarray of size (d1, n_components)
        Score matrix, U*S.  Often used for classification from PCA.

    outliers : ndarray of size (d1,d2)


    components : ndarray of size (d2, n_components)
        Principal axes in feature space, representing the directions of maximum variance in the data.


    Example:
    ```
    A = np.random.randn(50, 2).dot(np.random.randn(2,30))
    S = np.random.rand(*A.shape)<0.1

    rpca=RPCA(n_components=2, sparsity=0.1)
    rpca.fit(A+S)

    print("Denoised matrix error: \n", np.linalg.norm(rpca.to_matrix()-A)/np.linalg.norm(A))
    print("Outliersm error: \n", np.linalg.norm(rpca.outliers_-S)/np.linalg.norm(S))
    ```

    '''
    def __init__(self, n_components=10, sparsity=0.05):
        self.n_components = n_components
        self.sparsity = sparsity

[docs]    def fit(self,M):
        '''
        Parameters
        ----------
        M : ndarray 
            observed data matrix with an unknown but sparse set of outliers
        '''

        self.M=M;
        d1,d2=M.shape
        self.d1=d1
        self.d2=d2
        if self.sparsity<1:
            self.sparsity=round(d1*d2*self.sparsity)
        (L,outliers) = altProjNiave(self.M, self.n_components, self.sparsity, fTol=1e-10, maxIter=10000)
        #(L,S) = altProj(self.M,r=self.r)
        u,s,v=svds(L, self.n_components)
        self.L=L
        self.U=u.reshape((-1, self.n_components))
        self.V=v
        self.S=s
        self.outliers_=outliers

[docs]    def fit_transform(self,M):
        '''
        Parameters
        ----------
        M : ndarray of size (d1,d2)
            observed data matrix with an unknown but sparse set of outliers

        Returns
        ---------
        T : ndarray of size (d1, r)

        '''

        self.fit(M)

        return self.U.dot(np.diag(self.S))

[docs]    def transform(X):
        
        '''
        V is already solved for, so we just need to solve:

        min U, outliers   ||U*V+outliers -X ||_F^2  s.t. outliers is spart
        '''
        

        pass


    def inverse_transform(self,X):
        return X.dot(self.V)



[docs]    def get_covariance(self):
        """
        Calculates an estimate of covariance matrix.  

        Entry (i,j) will be a the correlation between feature i and feature j.  A value close to 1 is a strong postive correlatio, a value close to -1 is a strong negative correlation, and a value close to 0 is no correlation.

        Returns
        -------
        cov : array, shape=(d2, d2)
            Estimated covariance of data.
        """

        return self.V.dot(self.S**2).dot(self.V.transpose())/(self.d1-1)

[docs]    def to_matrix(self):
        '''
        Calculates the completed matrix.

        Returns
        -----------
        L : ndarray of size (d1,d2)
            Low rank matrix, denoised

        S : sparse matrix of size (d1,d2)
            Sparse outliers

        '''

        return self.L


if __name__=="__main__":
    A = np.random.randn(50, 2).dot(np.random.randn(2,30))
    S = np.random.rand(*A.shape)<0.1

    rpca=RPCA(n_components=2, sparsity=0.1)
    rpca.fit(A+S)

    print("Denoised matrix error: \n", np.linalg.norm(rpca.to_matrix()-A)/np.linalg.norm(A))
    print("Outliersm error: \n", np.linalg.norm(rpca.outliers_-S)/np.linalg.norm(S))


    # #%%
    # rpca=RPCA(n_components=3, sparsity=0.1)
    # U=rpca.fit_transform(A+S)
    # print("Denoised matrix: \n", rpca.inverse_transform(U))
    # #%%
    # B=np.array([[-1, -2, -3, 3],
    #             [0.5, 1, 1.5, 0]])
    #
    # outliers=rpca.predict_outliers(B)
    # print("Outliers: \n", outliers)
SpaLoR

Documentation

Source code for spalor.models.robust_pca