Source code for approxposterior.gmmUtils

# -*- coding: utf-8 -*-
"""
:py:mod:`gmmUtils.py` - Gaussian Mixture Model Utilities
--------------------------------------------------------

Gaussian mixture model utility functions for fitting approximations to posterior
probability distributions.

"""

# Tell module what it's allowed to import
__all__ = ["fitGMM"]

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.mixture import GaussianMixture


[docs]def fitGMM(samples, maxComp=3, covType="full", useBic=True, gmmKwargs=None):
    """
    Fit a Gaussian Mixture Model to the posterior samples to derive an
    approximation of the posterior density.  Fit for the number of components
    by either minimizing the Bayesian Information Criterior (BIC) or via
    cross-validation.

    Parameters
    ----------
    samples : numpy array
        sampler.flatchain MCMC chain array of dimensions (nwalkers x nsteps, ndim)
    maxComp : int (optional)
        Maximum number of mixture model components to fit for.  Defaults to 3.
    covType : str (optional)
        GMM covariance type.  Defaults to "full".  See the documentation here:
        http://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html
        for more info
    useBic : bool (optional)
        Minimize the BIC to pick the number of GMM components or use 5-fold
        cross validation?  Defaults to True (aka, use the BIC)
    gmmKwargs : dict (optional)
        keyword arguments for sklearn.mixture.GaussianMixture. Defaults to
        None

    Returns
    -------
    GMM : sklearn.mixture.GaussianMixture
        fitted Gaussian mixture model
    """

    if gmmKwargs is None:
        gmmKwargs = dict()

    # Select optimal number of components via minimizing BIC
    if useBic:

        bic = None
        lowestBic = np.inf
        bestGMM = None
        gmm = GaussianMixture()

        for nComponents in range(1,maxComp+1):
            gmmKwargs["n_components"] = nComponents
            gmmKwargs["covariance_type"] = covType

            gmm.set_params(**gmmKwargs)
            gmm.fit(samples)
            bic = gmm.bic(samples)

            if bic < lowestBic:
                lowestBic = bic
                bestN = nComponents
                bestCovType = covType

        # Refit GMM with the lowest bic
        gmmKwargs["n_components"] = bestN
        gmmKwargs["covariance_type"] = bestCovType
        GMM = GaussianMixture(**gmmKwargs)
        GMM.fit(samples)

    # Select optimal number of components via 5 fold cross-validation
    else:
        hyperparams = {"n_components" : np.arange(maxComp+1)}
        gmm = GridSearchCV(GaussianMixture(covariance_type=covType),
                           hyperparams, cv=5)
        gmm.fit(samples)
        GMM = gmm.best_estimator_
        GMM.fit(samples)

    return GMM
# end function