Source code for slycat.cca

# Copyright (c) 2013, 2018 National Technology and Engineering Solutions of Sandia, LLC . Under the terms of Contract
# DE-NA0003525 with National Technology and Engineering Solutions of Sandia, LLC, the U.S. Government
# retains certain rights in this software.

import numbers
import numpy
import scipy.linalg
import scipy.stats
import slycat.email

[docs]def cca(X, Y, scale_inputs=True, force_positive=None, significant_digits=None):
  """Compute Canonical Correlation Analysis (CCA).

  Parameters
  ----------
  X : numpy.ndarray
    :math:`M \\times I` matrix containing :math:`M` observations and :math:`I` input features.
  Y : numpy.ndarray
    :math:`M \\times O` matrix containing :math:`M` observations and :math:`O` output features.
  scale_inputs : bool, optional
    Scale input and output features to unit variance.
  force_positive : integer, optional
    If specified, flip signs in the `x`, `y`, `x_loadings`, and `y_loadings` output values so
    that the values in row :math:`n` of `y_loadings` are all positive.
  significant_digits: integer, optional
    Optionally specify the number of significant digits used to compute the `X` and `Y` ranks.

  Returns
  -------
  x : numpy.ndarray
    :math:`M \\times C` matrix containing input metavariable values for :math:`M` observations and :math:`C` CCA components.
  y : numpy.ndarray
    :math:`M \\times C` matrix containing output metavariable values for :math:`M` observations and :math:`C` CCA components.
  x_loadings : numpy.ndarray
    :math:`I \\times C` matrix containing weights for :math:`I` input variables and :math:`C` CCA components.
  y_loadings : numpy.ndarray
    :math:`O \\times C` matrix containing weights for :math:`O` output variables and :math:`C` CCA components.
  r2 : numpy.ndarray
    length-:math:`C` vector containing :math:`r^2` values for :math:`C` CCA components.
  wilks : numpy.ndarray
    length-:math:`C` vector containing the likelihood-ratio for :math:`C` CCA components.
  """

  # Validate our inputs ...
  if not isinstance(X, numpy.ndarray) or not isinstance(Y, numpy.ndarray):
    slycat.email.send_error("cca.py", "X and Y must be numpy.ndarray instances.")
    raise TypeError("X and Y must be numpy.ndarray instances.")
  if X.ndim != 2 or Y.ndim != 2:
    slycat.email.send_error("cca.py", "X and Y must have two dimensions.")
    raise ValueError("X and Y must have two dimensions.")
  if X.shape[0] != Y.shape[0]:
    slycat.email.send_error("cca.py", "X and Y must contain the same number of rows.")
    raise ValueError("X and Y must contain the same number of rows.")
  if X.shape[0] < X.shape[1] or X.shape[0] < Y.shape[1]:
    slycat.email.send_error("cca.py", "Number of rows must be >= the number of column in X, and >= the number of columns in Y.")
    raise ValueError("Number of rows must be >= the number of columns in X, and >= the number of columns in Y.")
  if X.shape[1] < 1 or Y.shape[1] < 1:
    slycat.email.send_error("cca.py", "X and Y must each contain at least one column.")
    raise ValueError("X and Y must each contain at least one column.")
  for column in numpy.column_stack((X, Y)).T:
    if column.min() == column.max():
      slycat.email.send_error("cca.py", "Columns in X and Y cannot be constant.")
      raise ValueError("Columns in X and Y cannot be constant.")
  if not isinstance(scale_inputs, bool):
    slycat.email.send_error("cca.py", "scale_inputs must be a boolean.")
    raise TypeError("scale_inputs must be a boolean.")
  if not isinstance(force_positive, (type(None), numbers.Integral)):
    slycat.email.send_error("cca.py", "force_positive must be an integer or None.")
    raise TypeError("force_positive must be an integer or None.")
  if force_positive is not None and (force_positive < 0 or force_positive >= Y.shape[1]):
    slycat.email.send_error("cca.py", "force_positive must be in the range [0, number of Y columns).")
    raise ValueError("force_positive must be in the range [0, number of Y columns).")
  if not isinstance(significant_digits, (type(None), numbers.Integral)):
    slycat.email.send_error("cca.py", "significant_digits must be an integer or None.")
    raise TypeError("significant_digits must be an integer or None.")

  eps = numpy.finfo("double").eps
  if significant_digits is None or significant_digits > numpy.abs(numpy.log10(eps)):
    significant_digits = numpy.abs(numpy.log10(eps))

  n = X.shape[0]
  p1 = X.shape[1]
  p2 = Y.shape[1]

  X -= X.mean(axis=0)
  Y -= Y.mean(axis=0)

  if scale_inputs:
    X /= X.std(axis=0)
    Y /= Y.std(axis=0)

  Q1, R1, P1 = scipy.linalg.qr(X, mode="economic", pivoting=True)
  Q2, R2, P2 = scipy.linalg.qr(Y, mode="economic", pivoting=True)

  Xrank = numpy.sum(numpy.abs(numpy.diag(R1)) > 10**(numpy.log10(numpy.abs(R1[0,0])) - significant_digits) * max(n, p1))
  Yrank = numpy.sum(numpy.abs(numpy.diag(R2)) > 10**(numpy.log10(numpy.abs(R2[0,0])) - significant_digits) * max(n, p2))

  # We validate this here, to avoid computing the rank twice.
  if X.shape[0] < Xrank or X.shape[0] < Yrank:
    slycat.email.send_error("cca.py", "Number of rows must be >= rank(X), and >= rank(Y).")
    raise ValueError("Number of rows must be >= rank(X), and >= rank(Y).")

  L, D, M = scipy.linalg.svd(numpy.dot(Q1.T, Q2), full_matrices=False)

  d = min(Xrank, Yrank)
  L = L[:,:d]
  M = M[:d,:]

  A = numpy.linalg.solve(R1, L)
  B = numpy.linalg.solve(R2, M.T)

  A *= numpy.sqrt(n - 1)
  B *= numpy.sqrt(n - 1)

  A[P1] = numpy.copy(A)
  B[P2] = numpy.copy(B)

  x = numpy.dot(X, A)
  y = numpy.dot(Y, B)

  x_loadings = numpy.array([[scipy.stats.pearsonr(i, j)[0] for j in X.T] for i in x.T]).T
  y_loadings = numpy.array([[scipy.stats.pearsonr(i, j)[0] for j in Y.T] for i in y.T]).T

  if force_positive is not None:
    for j in range(y_loadings.shape[1]):
      if y_loadings[force_positive, j] < 0:
        x_loadings[:,j] = -x_loadings[:,j]
        y_loadings[:,j] = -y_loadings[:,j]
        x[:,j] = -x[:,j]
        y[:,j] = -y[:,j]

  r = numpy.minimum(numpy.maximum(D[:d], 0), 1)

  nondegenerate = r < 1
  wilks = numpy.exp(numpy.cumsum(numpy.log(1-(r[nondegenerate] ** 2))[::-1])[::-1])

  return x, y, x_loadings, y_loadings, r, wilks