Source code for braviz.interaction.r_functions

##############################################################################
#    Braviz, Brain Data interactive visualization                            #
#    Copyright (C) 2014  Diego Angulo                                        #
#                                                                            #
#    This program is free software: you can redistribute it and/or modify    #
#    it under the terms of the GNU Lesser General Public License as          #
#    published by  the Free Software Foundation, either version 3 of the     #
#    License, or (at your option) any later version.                         #
#                                                                            #
#    This program is distributed in the hope that it will be useful,         #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of          #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the           #
#    GNU Lesser General Public License for more details.                     #
#                                                                            #
#    You should have received a copy of the GNU Lesser General Public License#
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.   #
##############################################################################

from __future__ import print_function
from collections import defaultdict

__author__ = 'Diego'

import logging
from itertools import izip

import pandas as pd
from rpy2.robjects import pandas2ri
pandas2ri.activate()
import rpy2.rinterface
from rpy2 import robjects
from rpy2.robjects.packages import importr
import numpy as np

import braviz.readAndFilter.tabular_data as braviz_tab_data


# arm
# car
# randomForest

[docs]def import_or_error(lib_name):
    """
    Tries to import an R package, if it is not found prints a message asking the user to install it
    and raises an exception

    Args:
        lib_name (str) : Name of an R package
    """
    try:
        lib = importr(lib_name,robject_translations={"format_perc": "format_perc_2"})
    except rpy2.rinterface.RRuntimeError:
        print("please install %s from R" % lib_name)
        log = logging.getLogger(__name__)
        log.error("Couldn't load R package %s", lib_name)
        raise
    return lib


[docs]def calculate_ginni_index(outcome, data_frame, sample=None):
    """
    Calculates Ginni impurity indices respect to an outcome variable

    `<http://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity>`_
    Uses the RandomForest package

    Args:
        outcome (str) : Name of the outcome variable
        data_frame (pandas.DataFrame) : Data Frame with variable indices as index, this object will be modified
        sample (list) : Restrict the analysis to subjects in this sample

    Returns:
        The input *data_frame* with an additional column called ``Ginni`` that contains the requested indices.
        The value will be ``numpy.inf`` for the outcome variable, and 0 for variables where the index couldn't be
        calculated.
    """
    # construct data frame
    is_nominal = braviz_tab_data.is_variable_name_nominal(outcome)
    outcome_idx = braviz_tab_data.get_var_idx(outcome)

    values_data_frame = braviz_tab_data.get_data_frame_by_index(
        data_frame.index, col_name_index=True)
    if sample is not None:
        values_data_frame = values_data_frame.loc[sample]
    # remove columns with many NaNs
    # df2=values_data_frame.dropna(1,thresh=40)
    n_rows, n_cols = values_data_frame.shape
    values_data_frame.dropna(1, thresh=n_rows//10, inplace=True)
    # df3=df2.dropna(0,thresh=200)
    values_data_frame.dropna(0, thresh=n_cols//10, inplace=True)
    # fill nas with other values
    # shuffle
    permutation = np.random.permutation(range(len(values_data_frame)))
    values_data_frame = values_data_frame.iloc[permutation]
    values_data_frame.fillna(method="pad", inplace=True)
    rev_perm = list(reversed(permutation))
    values_data_frame = values_data_frame.iloc[rev_perm]
    values_data_frame.fillna(method="pad", inplace=True)
    # just in case there are still nas
    values_data_frame.dropna(0, inplace=True)
    values_data_frame.dropna(1, inplace=True)
    # create R data frame
    original_column_names = values_data_frame.columns

    # More R friendly column names
    values_data_frame.columns = [
        "c%s" % i for i in xrange(len(values_data_frame.columns))]

    r_df = values_data_frame
    # if oucome is nominal transform into factor
    outcome_variable_index = original_column_names.get_loc(outcome_idx)
    r_environment = robjects.globalenv
    r_environment["r_df"] = r_df
    if is_nominal:
        robjects.r('r_df["c%d"] <- factor( r_df[["c%d"]])' %
                   (outcome_variable_index, outcome_variable_index))

    r_df = r_environment["r_df"]

    # for i,n in enumerate(original_column_names):
    #    print "%d \t %s"%(i,n)
    # print r_df
    #import randomForest
    randomForest = import_or_error("randomForest")

    # use correct variable in formula
    form = robjects.Formula("c%d~." % outcome_variable_index)
    # robjects.r("table(complete.cases(r_df))")

    fit = randomForest.randomForest(
        form, data=r_df, replace=True, importance=True)
    # fit=robjects.r("randomForest.randomForest(c6~.,data=r_df,replace=True,importance=True)")
    imp = fit.rx2("importance")
    # if outcome is factor it is different
    # print imp
    interesting_column_index = 2
    if is_nominal:
        interesting_column_index = len(
            set(values_data_frame["c%d" % outcome_variable_index])) + 2

    # extract column vector
    imp_v = np.zeros(imp.nrow)
    for i in xrange(imp.nrow):
        imp_v[i] = imp.rx(i + 1, interesting_column_index)[0]

    # Remove target variable from original_names
    potential_regressors = list(original_column_names)
    potential_regressors.pop(outcome_variable_index)
    imp_data_frame = pd.DataFrame(imp_v, index=potential_regressors)
    # print imp_data_frame
    data_frame["Ginni"] = imp_data_frame
    data_frame.replace(np.nan, 0, inplace=True)
    data_frame.loc[outcome_idx,"Ginni"] = np.inf
    # print self.data_frame
    return data_frame


[docs]def calculate_anova(outcome, regressors_data_frame, interactions_dict, sample):
    """
    Calculates an anova regression

    Uses the *car* package with type 3 sum of squares

    Args:
        outcome (str) : Name of outcome variable
        regressors_data_frame (pandas.DataFrame) : A DataFrame with three columns regressor name,
            degrees of freedom, and interaction. The last column should have
            zeros for single variable regressors and 1 for interaction terms.
            see :meth:`braviz.interaction.qt_models.AnovaRegressorsModel.get_data_frame`
        interactions_dict (dict) : Dictionary mapping indices of interaction terms (in the previous DataFrame)
            to the indices of its factors.
        sample (list) : List of subject ids considered during the calculation

    Returns:
        ``(output_df, residuals, intercept, fitted)`` where *output_df* is a DataFrame containing 5 columns:
        Factor name, sum of squares, degrees of freedom, F statistic and P value, it includes an *(intercept)* term.
        *Residuals*, *intercept* and *fitted* are parameters of the regression.
        See :class:`braviz.interaction.qt_models.AnovaResultsModel`

    """
    # is outcome nominal?
    is_nominal = braviz_tab_data.is_variable_name_nominal(outcome)

    # is outcome binary?
    is_binary = False
    if is_nominal:
        # res=conn.execute("SELECT count(*) FROM nom_meta NATURAL JOIN variables WHERE var_name = ?",(outcome,))
        # if res.fetchone()[0]==2:
        #     is_binary=True
        # else:
        #     is_binary=False
        log = logging.getLogger(__name__)
        log.error(
            "Logistic and multinomial ANOVA not yet implemented, choose a real outcome")
        raise Exception(
            "Logistic and multinomial ANOVA not yet implemented, choose a real outcome")

    # are regressors nominal?
    factors_nominal = dict()
    var_names = regressors_data_frame[
        regressors_data_frame["Interaction"] == 0]["variable"].tolist()
    for var in var_names:
        factors_nominal[var] = braviz_tab_data.is_variable_name_nominal(var)
    # print factors_nominal

    # construct pandas data frame
    var_names.append(outcome)
    pandas_df = braviz_tab_data.get_data_frame_by_name(var_names)
    pandas_df = pandas_df.loc[sample]
    # print pandas_df

    # construct r data frame

    new_names = ["r%d" % i for i in xrange(len(var_names))]
    new_names[-1] = "o"

    pandas_df.columns = new_names
    r_df = pandas_df
    # print r_df

    # reformat r_df
    r_environment = robjects.globalenv
    r_environment["r_df"] = r_df
    for i, var in enumerate(var_names):
        if factors_nominal.get(var, False):
            robjects.r('r_df["r%d"] <- factor( r_df[["r%d"]])' % (i, i))

    r_df = r_environment["r_df"]
    # print r_df
    # construct formula
    regressors = new_names[:-1]
    interactions = []
    # print interactions_dict
    for products in interactions_dict.itervalues():
        factors = []
        for f in products:
            f_name = regressors_data_frame["variable"][f]
            f_index = var_names.index(f_name)
            f_new_name = new_names[f_index]
            factors.append(f_new_name)
        interactions.append("*".join(factors))

    regressors.extend(interactions)
    formula_str = "o~" + "+".join(regressors)
    # print formula_str
    form = robjects.Formula(formula_str)

    # construct constrasts list
    stats = import_or_error("stats")
    contrasts_dict = {}
    for i, var in enumerate(var_names[:-1]):
        if factors_nominal[var]:
            k = "r%d" % i
            contrasts_dict[k] = stats.contr_sum

    # run anova
    car = import_or_error("car")

    if len(contrasts_dict) > 0:
        contrasts_list = robjects.ListVector(contrasts_dict)
        model = stats.lm(form, data=r_df, contrasts=contrasts_list)
    else:
        model = stats.lm(form, data=r_df)
    intercept = model[0][0]
    residuals = np.array(model[1])
    anova = car.Anova(model, type=3)
    fitted = np.array(model[4])
    row_names = list(anova.rownames)
    column_names = list(anova.names)

    output_dict = dict((k, list(anova.rx(True, i + 1)))
                       for i, k in enumerate(column_names))

    # decode column_names
    def decode_colum_names(name):
        tokens = name.split(":")
        fs = []
        for t in tokens:
            try:
                i = new_names.index(t)
            except ValueError:
                f = t
            else:
                f = var_names[i]
            fs.append(f)
        return ":".join(fs)

    decoded_names = map(decode_colum_names, row_names)

    output_df = pd.DataFrame(output_dict, index=decoded_names)

    # reorder to match r order
    output_df["Factor"] = output_df.index
    output_df = output_df[["Factor", "Sum Sq", "Df", "F value", "Pr(>F)"]]

    return output_df, residuals, intercept, fitted


[docs]def calculate_normalized_linear_regression(outcome, regressors_data_frame, interactions_dict, sample):
    """
    Calculates a linear regression after normalizing variables

    It uses the arm package `standardize <http://www.inside-r.org/packages/cran/arm/docs/standardize>`_

    Args:
        outcome (str) : Name of outcome variable
        regressors_data_frame (pandas.DataFrame) : A DataFrame with three columns regressor name,
            degrees of freedom, and interaction. The last column should have
            zeros for single variable regressors and 1 for interaction terms.
            see :meth:`braviz.interaction.qt_models.AnovaRegressorsModel.get_data_frame`
        interactions_dict (dict) : Dictionary mapping indices of interaction terms (in the previous DataFrame)
            to the indices of its factors.
        sample (list) : List of subject ids considered during the calculation

    Returns:
        A dictionary with the following fields

            - ``coefficients_df`` : A data frame with 7 columns: Slope, T statistc value, P value, standard error,
              95% confidence interval, r_name (variable alias used inside r)
              and components (Name of variables that make up the term)
            - ``residuals`` : vector with regression residuals
            - ``fitted`` : vector of fitted values
            - ``adj_r2`` : Adjusted r square
            - ``f_pval`` : Regression fit p value
            - ``f_stats_val`` : Value of the regression F statistic
            - ``f_stat_df`` : Degrees of freedom from F stistics (nominator,denominatior)
            - ``data_points`` : Subject ids used in the calculation (after dropping nans)
            - ``standardized_model`` : DataFrame of standardized data
            - ``data`` : DataFrame used in the calculation (after dropping nans)
            - ``mean_sigma`` : mean and standard deviation from outcome
            - ``var_types`` : Dictionary with the type of each variable, options are "r" for real, "b" for binary,
              and "n" for nominal variables with more levels
            - ``dummy_levels`` : The text labels for each level of dummy variables (except the base level)

        see `Fitting & Interpreting Linear Models in R <http://blog.yhathq.com/posts/r-lm-summary.html>`_ for more
        details on the interpretation of the output
    """
    if not braviz_tab_data.is_variable_name_real(outcome):
        raise NotImplementedError(
            "Logistic regression not yet implemented, please select a rational outcome")
    log=logging.getLogger(__name__)
    regressor_names = regressors_data_frame[
        regressors_data_frame["Interaction"] == 0]["variable"].tolist()
    all_variables = [outcome] + regressor_names
    data_frame = braviz_tab_data.get_data_frame_by_name(all_variables)
    data_frame = data_frame.loc[sample].copy()
    data_frame.dropna(inplace=True)
    # we have to classify variables in real, binary, and other nominals
    var_type = dict()
    for var in regressor_names:
        if braviz_tab_data.is_variable_name_real(var):
            var_type[var] = 'r'
        else:
            # is it binary?
            levels = len(np.unique(data_frame[var]))
            if levels == 2:
                var_type[var] = 'b'
            else:
                var_type[var] = 'n'
    # for binary and real variables we need the mean, for real variables we also need the std_dev to de-standardize
    # for binary and nominal variables we need the labels dict
    mean_sigma = dict()
    labels_dicts = dict()
    for var, t in var_type.iteritems():
        if t == "r":
            m = np.mean(data_frame[var])
            std = np.std(data_frame[var])
            mean_sigma[var] = (m, std)
        elif t == "b":
            m = np.mean(data_frame[var])
            # we are going to use center in arm.standardize:
            # "center" (rescale so that the mean of the data is 0 and the difference between the two categories is 1),
            # in the real case we divide by 2$\sigma$, so the final sd is 0.5
            std = np.max(data_frame[var]) - np.min(data_frame[var])
            mean_sigma[var] = (m, std)
            labels = braviz_tab_data.get_labels_dict_by_name(var)
            labels_dicts[var] = labels
        elif t == "n":
            labels = braviz_tab_data.get_labels_dict_by_name(var)
            labels_dicts[var] = labels
        else:
            assert False
    mean_sigma[outcome] = (
        np.mean(data_frame[outcome]), np.std(data_frame[outcome]))
    # variable names can be strange (unicode) ... we are going to change them
    # to more abstract names like in anova
    standard_var_names = ["var_%d_R" % i for i in xrange(len(all_variables))]
    # now create a data frame with the new names
    data_frame_std = data_frame.copy()
    assert np.all(data_frame_std.columns == all_variables)
    data_frame_std.columns = standard_var_names

    # now we are ready to go into the R world
    r_data_frame = data_frame_std
    # create factor variables
    r_environment = robjects.globalenv
    r_environment["r_df"] = r_data_frame
    for var, s_name in izip(all_variables, standard_var_names):
        t = var_type.get(var, "r")
        if t != "r":
            robjects.r('r_df["%s"] <- factor( r_df[["%s"]])' %
                       (s_name, s_name))
    r_data_frame = r_environment["r_df"]
    # construct the formula
    coefficients = standard_var_names
    for products in interactions_dict.itervalues():
        factors = []
        for f in products:
            f_name = regressors_data_frame["variable"][f]
            f_index = all_variables.index(f_name)
            std_name = standard_var_names[f_index]
            factors.append(std_name)
        coefficients.append("*".join(factors))
    formula_str = "var_0_R ~ " + "+".join(coefficients[1:])
    # calculate the linear model
    robjects.r("r_lm <- lm(%s,data=r_df)" % formula_str)
    # standardize
    robjects.r("library('arm')")
    robjects.r("s_lm_r <- standardize(r_lm,standardize.y=T)")
    standardized_model = r_environment["s_lm_r"]

    robjects.r("sum_r <- summary(s_lm_r)")
    fit_summary = r_environment["sum_r"]
    robjects.r(
        "f_pval_r <- pf(sum_r$fstatistic[1],sum_r$fstatistic[2],sum_r$fstatistic[3],lower.tail=FALSE)")
    robjects.r("lm_ci_r <- confint(s_lm_r)")
    conf_intervals = r_environment["lm_ci_r"]
    ses_r = fit_summary.rx2("coefficients").rx(True, 2)
    cof_t_r = fit_summary.rx2("coefficients").rx(True, 3)
    cof_p_r = fit_summary.rx2("coefficients").rx(True, 4)
    conf_95_std = dict((k, (l, h)) for k, l, h in izip(conf_intervals.rx(
        True, 1).names, conf_intervals.rx(True, 1), conf_intervals.rx(True, 2)))

    # now we have to extract the results
    log.info(standardized_model)
    r_coeffs = standardized_model.rx2("coefficients")
    coef_dict_std = dict(izip(r_coeffs.names, r_coeffs))
    std_errors_std = dict(izip(ses_r.names, ses_r))
    t_stats_std = dict(izip(cof_t_r.names, cof_t_r))
    coefs_p_std = dict(izip(cof_p_r.names, cof_p_r))
    residuals = list(standardized_model.rx2("residuals"))
    std_model = standardized_model.rx2("model")

    fitted = list(standardized_model.rx2("fitted.values"))
    intercept = "(Intercept)"
    std_names2orig_names_labels = dict()
    orig_names_labels2orig_vars = dict()
    dummy_vars_levels = defaultdict(dict)
    for std_name in coef_dict_std.iterkeys():
        if std_name == intercept:
            # handle special case
            std_names2orig_names_labels[std_name] = std_name
            continue
        if ":" in std_name:
            # interactions
            factors = std_name.split(":")
        else:
            factors = [std_name]
        orig_factors_levels = []
        orig_factors = []
        for f in factors:
            dummy_level = None
            if f[1] == ".":
                if f[0] == "c":
                    # sentinel value
                    #dummy_level = -1
                    pass
                f = f[2:]
            if f[-1] != "R":
                r_pos = f.rfind("R")
                dummy_level = f[r_pos + 1:]
                f = f[:r_pos + 1]
            index = standard_var_names.index(f)
            orig_name = all_variables[index]
            if dummy_level is not None:
                # sentinel from above
                if dummy_level == -1:

                    ls = labels_dicts[orig_name].items()
                    ls.sort(key=lambda x: x[0])
                    lst = tuple(l[1] for l in ls)
                    label = "%s-%s" % lst

                else:
                    label = labels_dicts[orig_name][int(dummy_level)]
                    dummy_vars_levels[orig_name][label] = int(dummy_level)

                orig_name_label = orig_name + "_%s" % label
            else:
                orig_name_label = orig_name[:]
            orig_factors_levels.append(orig_name_label)
            orig_factors.append(orig_name)
        if len(orig_factors_levels) == 1:
            orig_name_flat = orig_factors_levels[0]
            # coef_dicts[orig_factors[0]]=val
        else:
            orig_name_flat = "*".join(sorted(orig_factors_levels))
            # coef_dicts[frozenset(orig_factors)]=val
        std_names2orig_names_labels[std_name] = orig_name_flat
        orig_names_labels2orig_vars[orig_name_flat] = tuple(orig_factors)

    coef_dicts = dict(
        (std_names2orig_names_labels[k], v) for k, v in coef_dict_std.iteritems())
    std_errors = dict(
        (std_names2orig_names_labels[k], v) for k, v in std_errors_std.iteritems())
    coef_t = dict(
        (std_names2orig_names_labels[k], v) for k, v in t_stats_std.iteritems())
    coef_p = dict(
        (std_names2orig_names_labels[k], v) for k, v in coefs_p_std.iteritems())
    conf_95 = dict(
        (std_names2orig_names_labels[k], v) for k, v in conf_95_std.iteritems())
    r_names = dict((v, k) for k, v in std_names2orig_names_labels.iteritems())
    # combine all this into a data frame
    coefs_df = pd.DataFrame(pd.Series(coef_dicts, name="Slope"))
    coefs_df["T Value"] = pd.Series(coef_t)
    coefs_df["P Value"] = pd.Series(coef_p)
    coefs_df["Std_error"] = pd.Series(std_errors)
    coefs_df["CI_95"] = pd.Series(conf_95)
    coefs_df["r_name"] = pd.Series(r_names)
    coefs_df["components"] = pd.Series(orig_names_labels2orig_vars)
    coefs_df.index.name = "Coefficient"

    # print coefs_df

    # translate columns to standard names
    std_names2orig_names = dict()
    for var, r_var in izip(all_variables, standard_var_names):
        t = var_type.get(var, "r")
        if t == "r":
            std_names2orig_names["z." + r_var] = var
        elif t == "b":
            std_names2orig_names["c." + r_var] = var
        elif t == "n":
            std_names2orig_names[r_var] = var
        else:
            pass
            assert False

    std_model = pandas2ri.ri2py(std_model)
    std_model.columns = [std_names2orig_names[c] for c in std_model.columns]
    std_model.index = data_frame.index

    adjusted_r_squared = fit_summary.rx2("adj.r.squared")[0]
    fit_p_val = r_environment["f_pval_r"][0]
    f_statistic = fit_summary.rx2("fstatistic").rx2("value")[0]
    f_statistic_nom_df = fit_summary.rx2("fstatistic").rx2("numdf")[0]
    f_statistic_denom_df = fit_summary.rx2("fstatistic").rx2("dendf")[0]

    out_dict = {
        "coefficients_df": coefs_df,
        "residuals": residuals,
        "fitted": fitted,
        "adj_r2": adjusted_r_squared,
        "f_pval": fit_p_val,
        "f_stats_val": f_statistic,
        "f_stat_df": (int(f_statistic_nom_df), int(f_statistic_denom_df)),
        "data_points": list(data_frame.index),
        "standardized_model": std_model,
        "data": data_frame,
        "mean_sigma": mean_sigma,
        "var_types": var_type,
        "dummy_levels": dummy_vars_levels,
    }
    log = logging.getLogger(__name__)
    log.info(dummy_vars_levels)
    return out_dict