Source code for slycat.cca

# Copyright (c) 2013, 2018 National Technology and Engineering Solutions of Sandia, LLC . Under the terms of Contract
# DE-NA0003525 with National Technology and Engineering Solutions of Sandia, LLC, the U.S. Government
# retains certain rights in this software.

import numbers
import numpy
import scipy.linalg
import scipy.stats
import slycat.email

[docs]def cca(X, Y, scale_inputs=True, force_positive=None, significant_digits=None): """Compute Canonical Correlation Analysis (CCA). Parameters ---------- X : numpy.ndarray :math:`M \\times I` matrix containing :math:`M` observations and :math:`I` input features. Y : numpy.ndarray :math:`M \\times O` matrix containing :math:`M` observations and :math:`O` output features. scale_inputs : bool, optional Scale input and output features to unit variance. force_positive : integer, optional If specified, flip signs in the `x`, `y`, `x_loadings`, and `y_loadings` output values so that the values in row :math:`n` of `y_loadings` are all positive. significant_digits: integer, optional Optionally specify the number of significant digits used to compute the `X` and `Y` ranks. Returns ------- x : numpy.ndarray :math:`M \\times C` matrix containing input metavariable values for :math:`M` observations and :math:`C` CCA components. y : numpy.ndarray :math:`M \\times C` matrix containing output metavariable values for :math:`M` observations and :math:`C` CCA components. x_loadings : numpy.ndarray :math:`I \\times C` matrix containing weights for :math:`I` input variables and :math:`C` CCA components. y_loadings : numpy.ndarray :math:`O \\times C` matrix containing weights for :math:`O` output variables and :math:`C` CCA components. r2 : numpy.ndarray length-:math:`C` vector containing :math:`r^2` values for :math:`C` CCA components. wilks : numpy.ndarray length-:math:`C` vector containing the likelihood-ratio for :math:`C` CCA components. """ # Validate our inputs ... if not isinstance(X, numpy.ndarray) or not isinstance(Y, numpy.ndarray): slycat.email.send_error("cca.py", "X and Y must be numpy.ndarray instances.") raise TypeError("X and Y must be numpy.ndarray instances.") if X.ndim != 2 or Y.ndim != 2: slycat.email.send_error("cca.py", "X and Y must have two dimensions.") raise ValueError("X and Y must have two dimensions.") if X.shape[0] != Y.shape[0]: slycat.email.send_error("cca.py", "X and Y must contain the same number of rows.") raise ValueError("X and Y must contain the same number of rows.") if X.shape[0] < X.shape[1] or X.shape[0] < Y.shape[1]: slycat.email.send_error("cca.py", "Number of rows must be >= the number of column in X, and >= the number of columns in Y.") raise ValueError("Number of rows must be >= the number of columns in X, and >= the number of columns in Y.") if X.shape[1] < 1 or Y.shape[1] < 1: slycat.email.send_error("cca.py", "X and Y must each contain at least one column.") raise ValueError("X and Y must each contain at least one column.") for column in numpy.column_stack((X, Y)).T: if column.min() == column.max(): slycat.email.send_error("cca.py", "Columns in X and Y cannot be constant.") raise ValueError("Columns in X and Y cannot be constant.") if not isinstance(scale_inputs, bool): slycat.email.send_error("cca.py", "scale_inputs must be a boolean.") raise TypeError("scale_inputs must be a boolean.") if not isinstance(force_positive, (type(None), numbers.Integral)): slycat.email.send_error("cca.py", "force_positive must be an integer or None.") raise TypeError("force_positive must be an integer or None.") if force_positive is not None and (force_positive < 0 or force_positive >= Y.shape[1]): slycat.email.send_error("cca.py", "force_positive must be in the range [0, number of Y columns).") raise ValueError("force_positive must be in the range [0, number of Y columns).") if not isinstance(significant_digits, (type(None), numbers.Integral)): slycat.email.send_error("cca.py", "significant_digits must be an integer or None.") raise TypeError("significant_digits must be an integer or None.") eps = numpy.finfo("double").eps if significant_digits is None or significant_digits > numpy.abs(numpy.log10(eps)): significant_digits = numpy.abs(numpy.log10(eps)) n = X.shape[0] p1 = X.shape[1] p2 = Y.shape[1] X -= X.mean(axis=0) Y -= Y.mean(axis=0) if scale_inputs: X /= X.std(axis=0) Y /= Y.std(axis=0) Q1, R1, P1 = scipy.linalg.qr(X, mode="economic", pivoting=True) Q2, R2, P2 = scipy.linalg.qr(Y, mode="economic", pivoting=True) Xrank = numpy.sum(numpy.abs(numpy.diag(R1)) > 10**(numpy.log10(numpy.abs(R1[0,0])) - significant_digits) * max(n, p1)) Yrank = numpy.sum(numpy.abs(numpy.diag(R2)) > 10**(numpy.log10(numpy.abs(R2[0,0])) - significant_digits) * max(n, p2)) # We validate this here, to avoid computing the rank twice. if X.shape[0] < Xrank or X.shape[0] < Yrank: slycat.email.send_error("cca.py", "Number of rows must be >= rank(X), and >= rank(Y).") raise ValueError("Number of rows must be >= rank(X), and >= rank(Y).") L, D, M = scipy.linalg.svd(numpy.dot(Q1.T, Q2), full_matrices=False) d = min(Xrank, Yrank) L = L[:,:d] M = M[:d,:] A = numpy.linalg.solve(R1, L) B = numpy.linalg.solve(R2, M.T) A *= numpy.sqrt(n - 1) B *= numpy.sqrt(n - 1) A[P1] = numpy.copy(A) B[P2] = numpy.copy(B) x = numpy.dot(X, A) y = numpy.dot(Y, B) x_loadings = numpy.array([[scipy.stats.pearsonr(i, j)[0] for j in X.T] for i in x.T]).T y_loadings = numpy.array([[scipy.stats.pearsonr(i, j)[0] for j in Y.T] for i in y.T]).T if force_positive is not None: for j in range(y_loadings.shape[1]): if y_loadings[force_positive, j] < 0: x_loadings[:,j] = -x_loadings[:,j] y_loadings[:,j] = -y_loadings[:,j] x[:,j] = -x[:,j] y[:,j] = -y[:,j] r = numpy.minimum(numpy.maximum(D[:d], 0), 1) nondegenerate = r < 1 wilks = numpy.exp(numpy.cumsum(numpy.log(1-(r[nondegenerate] ** 2))[::-1])[::-1]) return x, y, x_loadings, y_loadings, r, wilks