Source code for braviz.interaction.r_functions

##############################################################################
#    Braviz, Brain Data interactive visualization                            #
#    Copyright (C) 2014  Diego Angulo                                        #
#                                                                            #
#    This program is free software: you can redistribute it and/or modify    #
#    it under the terms of the GNU Lesser General Public License as          #
#    published by  the Free Software Foundation, either version 3 of the     #
#    License, or (at your option) any later version.                         #
#                                                                            #
#    This program is distributed in the hope that it will be useful,         #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of          #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the           #
#    GNU Lesser General Public License for more details.                     #
#                                                                            #
#    You should have received a copy of the GNU Lesser General Public License#
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.   #
##############################################################################

from __future__ import print_function
from collections import defaultdict

__author__ = 'Diego'

import logging
from itertools import izip

import pandas as pd
from rpy2.robjects import pandas2ri
pandas2ri.activate()
import rpy2.rinterface
from rpy2 import robjects
from rpy2.robjects.packages import importr
import numpy as np

import braviz.readAndFilter.tabular_data as braviz_tab_data


# arm
# car
# randomForest

[docs]def import_or_error(lib_name): """ Tries to import an R package, if it is not found prints a message asking the user to install it and raises an exception Args: lib_name (str) : Name of an R package """ try: lib = importr(lib_name,robject_translations={"format_perc": "format_perc_2"}) except rpy2.rinterface.RRuntimeError: print("please install %s from R" % lib_name) log = logging.getLogger(__name__) log.error("Couldn't load R package %s", lib_name) raise return lib
[docs]def calculate_ginni_index(outcome, data_frame, sample=None): """ Calculates Ginni impurity indices respect to an outcome variable `<http://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity>`_ Uses the RandomForest package Args: outcome (str) : Name of the outcome variable data_frame (pandas.DataFrame) : Data Frame with variable indices as index, this object will be modified sample (list) : Restrict the analysis to subjects in this sample Returns: The input *data_frame* with an additional column called ``Ginni`` that contains the requested indices. The value will be ``numpy.inf`` for the outcome variable, and 0 for variables where the index couldn't be calculated. """ # construct data frame is_nominal = braviz_tab_data.is_variable_name_nominal(outcome) outcome_idx = braviz_tab_data.get_var_idx(outcome) values_data_frame = braviz_tab_data.get_data_frame_by_index( data_frame.index, col_name_index=True) if sample is not None: values_data_frame = values_data_frame.loc[sample] # remove columns with many NaNs # df2=values_data_frame.dropna(1,thresh=40) n_rows, n_cols = values_data_frame.shape values_data_frame.dropna(1, thresh=n_rows//10, inplace=True) # df3=df2.dropna(0,thresh=200) values_data_frame.dropna(0, thresh=n_cols//10, inplace=True) # fill nas with other values # shuffle permutation = np.random.permutation(range(len(values_data_frame))) values_data_frame = values_data_frame.iloc[permutation] values_data_frame.fillna(method="pad", inplace=True) rev_perm = list(reversed(permutation)) values_data_frame = values_data_frame.iloc[rev_perm] values_data_frame.fillna(method="pad", inplace=True) # just in case there are still nas values_data_frame.dropna(0, inplace=True) values_data_frame.dropna(1, inplace=True) # create R data frame original_column_names = values_data_frame.columns # More R friendly column names values_data_frame.columns = [ "c%s" % i for i in xrange(len(values_data_frame.columns))] r_df = values_data_frame # if oucome is nominal transform into factor outcome_variable_index = original_column_names.get_loc(outcome_idx) r_environment = robjects.globalenv r_environment["r_df"] = r_df if is_nominal: robjects.r('r_df["c%d"] <- factor( r_df[["c%d"]])' % (outcome_variable_index, outcome_variable_index)) r_df = r_environment["r_df"] # for i,n in enumerate(original_column_names): # print "%d \t %s"%(i,n) # print r_df #import randomForest randomForest = import_or_error("randomForest") # use correct variable in formula form = robjects.Formula("c%d~." % outcome_variable_index) # robjects.r("table(complete.cases(r_df))") fit = randomForest.randomForest( form, data=r_df, replace=True, importance=True) # fit=robjects.r("randomForest.randomForest(c6~.,data=r_df,replace=True,importance=True)") imp = fit.rx2("importance") # if outcome is factor it is different # print imp interesting_column_index = 2 if is_nominal: interesting_column_index = len( set(values_data_frame["c%d" % outcome_variable_index])) + 2 # extract column vector imp_v = np.zeros(imp.nrow) for i in xrange(imp.nrow): imp_v[i] = imp.rx(i + 1, interesting_column_index)[0] # Remove target variable from original_names potential_regressors = list(original_column_names) potential_regressors.pop(outcome_variable_index) imp_data_frame = pd.DataFrame(imp_v, index=potential_regressors) # print imp_data_frame data_frame["Ginni"] = imp_data_frame data_frame.replace(np.nan, 0, inplace=True) data_frame.loc[outcome_idx,"Ginni"] = np.inf # print self.data_frame return data_frame
[docs]def calculate_anova(outcome, regressors_data_frame, interactions_dict, sample): """ Calculates an anova regression Uses the *car* package with type 3 sum of squares Args: outcome (str) : Name of outcome variable regressors_data_frame (pandas.DataFrame) : A DataFrame with three columns regressor name, degrees of freedom, and interaction. The last column should have zeros for single variable regressors and 1 for interaction terms. see :meth:`braviz.interaction.qt_models.AnovaRegressorsModel.get_data_frame` interactions_dict (dict) : Dictionary mapping indices of interaction terms (in the previous DataFrame) to the indices of its factors. sample (list) : List of subject ids considered during the calculation Returns: ``(output_df, residuals, intercept, fitted)`` where *output_df* is a DataFrame containing 5 columns: Factor name, sum of squares, degrees of freedom, F statistic and P value, it includes an *(intercept)* term. *Residuals*, *intercept* and *fitted* are parameters of the regression. See :class:`braviz.interaction.qt_models.AnovaResultsModel` """ # is outcome nominal? is_nominal = braviz_tab_data.is_variable_name_nominal(outcome) # is outcome binary? is_binary = False if is_nominal: # res=conn.execute("SELECT count(*) FROM nom_meta NATURAL JOIN variables WHERE var_name = ?",(outcome,)) # if res.fetchone()[0]==2: # is_binary=True # else: # is_binary=False log = logging.getLogger(__name__) log.error( "Logistic and multinomial ANOVA not yet implemented, choose a real outcome") raise Exception( "Logistic and multinomial ANOVA not yet implemented, choose a real outcome") # are regressors nominal? factors_nominal = dict() var_names = regressors_data_frame[ regressors_data_frame["Interaction"] == 0]["variable"].tolist() for var in var_names: factors_nominal[var] = braviz_tab_data.is_variable_name_nominal(var) # print factors_nominal # construct pandas data frame var_names.append(outcome) pandas_df = braviz_tab_data.get_data_frame_by_name(var_names) pandas_df = pandas_df.loc[sample] # print pandas_df # construct r data frame new_names = ["r%d" % i for i in xrange(len(var_names))] new_names[-1] = "o" pandas_df.columns = new_names r_df = pandas_df # print r_df # reformat r_df r_environment = robjects.globalenv r_environment["r_df"] = r_df for i, var in enumerate(var_names): if factors_nominal.get(var, False): robjects.r('r_df["r%d"] <- factor( r_df[["r%d"]])' % (i, i)) r_df = r_environment["r_df"] # print r_df # construct formula regressors = new_names[:-1] interactions = [] # print interactions_dict for products in interactions_dict.itervalues(): factors = [] for f in products: f_name = regressors_data_frame["variable"][f] f_index = var_names.index(f_name) f_new_name = new_names[f_index] factors.append(f_new_name) interactions.append("*".join(factors)) regressors.extend(interactions) formula_str = "o~" + "+".join(regressors) # print formula_str form = robjects.Formula(formula_str) # construct constrasts list stats = import_or_error("stats") contrasts_dict = {} for i, var in enumerate(var_names[:-1]): if factors_nominal[var]: k = "r%d" % i contrasts_dict[k] = stats.contr_sum # run anova car = import_or_error("car") if len(contrasts_dict) > 0: contrasts_list = robjects.ListVector(contrasts_dict) model = stats.lm(form, data=r_df, contrasts=contrasts_list) else: model = stats.lm(form, data=r_df) intercept = model[0][0] residuals = np.array(model[1]) anova = car.Anova(model, type=3) fitted = np.array(model[4]) row_names = list(anova.rownames) column_names = list(anova.names) output_dict = dict((k, list(anova.rx(True, i + 1))) for i, k in enumerate(column_names)) # decode column_names def decode_colum_names(name): tokens = name.split(":") fs = [] for t in tokens: try: i = new_names.index(t) except ValueError: f = t else: f = var_names[i] fs.append(f) return ":".join(fs) decoded_names = map(decode_colum_names, row_names) output_df = pd.DataFrame(output_dict, index=decoded_names) # reorder to match r order output_df["Factor"] = output_df.index output_df = output_df[["Factor", "Sum Sq", "Df", "F value", "Pr(>F)"]] return output_df, residuals, intercept, fitted
[docs]def calculate_normalized_linear_regression(outcome, regressors_data_frame, interactions_dict, sample): """ Calculates a linear regression after normalizing variables It uses the arm package `standardize <http://www.inside-r.org/packages/cran/arm/docs/standardize>`_ Args: outcome (str) : Name of outcome variable regressors_data_frame (pandas.DataFrame) : A DataFrame with three columns regressor name, degrees of freedom, and interaction. The last column should have zeros for single variable regressors and 1 for interaction terms. see :meth:`braviz.interaction.qt_models.AnovaRegressorsModel.get_data_frame` interactions_dict (dict) : Dictionary mapping indices of interaction terms (in the previous DataFrame) to the indices of its factors. sample (list) : List of subject ids considered during the calculation Returns: A dictionary with the following fields - ``coefficients_df`` : A data frame with 7 columns: Slope, T statistc value, P value, standard error, 95% confidence interval, r_name (variable alias used inside r) and components (Name of variables that make up the term) - ``residuals`` : vector with regression residuals - ``fitted`` : vector of fitted values - ``adj_r2`` : Adjusted r square - ``f_pval`` : Regression fit p value - ``f_stats_val`` : Value of the regression F statistic - ``f_stat_df`` : Degrees of freedom from F stistics (nominator,denominatior) - ``data_points`` : Subject ids used in the calculation (after dropping nans) - ``standardized_model`` : DataFrame of standardized data - ``data`` : DataFrame used in the calculation (after dropping nans) - ``mean_sigma`` : mean and standard deviation from outcome - ``var_types`` : Dictionary with the type of each variable, options are "r" for real, "b" for binary, and "n" for nominal variables with more levels - ``dummy_levels`` : The text labels for each level of dummy variables (except the base level) see `Fitting & Interpreting Linear Models in R <http://blog.yhathq.com/posts/r-lm-summary.html>`_ for more details on the interpretation of the output """ if not braviz_tab_data.is_variable_name_real(outcome): raise NotImplementedError( "Logistic regression not yet implemented, please select a rational outcome") log=logging.getLogger(__name__) regressor_names = regressors_data_frame[ regressors_data_frame["Interaction"] == 0]["variable"].tolist() all_variables = [outcome] + regressor_names data_frame = braviz_tab_data.get_data_frame_by_name(all_variables) data_frame = data_frame.loc[sample].copy() data_frame.dropna(inplace=True) # we have to classify variables in real, binary, and other nominals var_type = dict() for var in regressor_names: if braviz_tab_data.is_variable_name_real(var): var_type[var] = 'r' else: # is it binary? levels = len(np.unique(data_frame[var])) if levels == 2: var_type[var] = 'b' else: var_type[var] = 'n' # for binary and real variables we need the mean, for real variables we also need the std_dev to de-standardize # for binary and nominal variables we need the labels dict mean_sigma = dict() labels_dicts = dict() for var, t in var_type.iteritems(): if t == "r": m = np.mean(data_frame[var]) std = np.std(data_frame[var]) mean_sigma[var] = (m, std) elif t == "b": m = np.mean(data_frame[var]) # we are going to use center in arm.standardize: # "center" (rescale so that the mean of the data is 0 and the difference between the two categories is 1), # in the real case we divide by 2$\sigma$, so the final sd is 0.5 std = np.max(data_frame[var]) - np.min(data_frame[var]) mean_sigma[var] = (m, std) labels = braviz_tab_data.get_labels_dict_by_name(var) labels_dicts[var] = labels elif t == "n": labels = braviz_tab_data.get_labels_dict_by_name(var) labels_dicts[var] = labels else: assert False mean_sigma[outcome] = ( np.mean(data_frame[outcome]), np.std(data_frame[outcome])) # variable names can be strange (unicode) ... we are going to change them # to more abstract names like in anova standard_var_names = ["var_%d_R" % i for i in xrange(len(all_variables))] # now create a data frame with the new names data_frame_std = data_frame.copy() assert np.all(data_frame_std.columns == all_variables) data_frame_std.columns = standard_var_names # now we are ready to go into the R world r_data_frame = data_frame_std # create factor variables r_environment = robjects.globalenv r_environment["r_df"] = r_data_frame for var, s_name in izip(all_variables, standard_var_names): t = var_type.get(var, "r") if t != "r": robjects.r('r_df["%s"] <- factor( r_df[["%s"]])' % (s_name, s_name)) r_data_frame = r_environment["r_df"] # construct the formula coefficients = standard_var_names for products in interactions_dict.itervalues(): factors = [] for f in products: f_name = regressors_data_frame["variable"][f] f_index = all_variables.index(f_name) std_name = standard_var_names[f_index] factors.append(std_name) coefficients.append("*".join(factors)) formula_str = "var_0_R ~ " + "+".join(coefficients[1:]) # calculate the linear model robjects.r("r_lm <- lm(%s,data=r_df)" % formula_str) # standardize robjects.r("library('arm')") robjects.r("s_lm_r <- standardize(r_lm,standardize.y=T)") standardized_model = r_environment["s_lm_r"] robjects.r("sum_r <- summary(s_lm_r)") fit_summary = r_environment["sum_r"] robjects.r( "f_pval_r <- pf(sum_r$fstatistic[1],sum_r$fstatistic[2],sum_r$fstatistic[3],lower.tail=FALSE)") robjects.r("lm_ci_r <- confint(s_lm_r)") conf_intervals = r_environment["lm_ci_r"] ses_r = fit_summary.rx2("coefficients").rx(True, 2) cof_t_r = fit_summary.rx2("coefficients").rx(True, 3) cof_p_r = fit_summary.rx2("coefficients").rx(True, 4) conf_95_std = dict((k, (l, h)) for k, l, h in izip(conf_intervals.rx( True, 1).names, conf_intervals.rx(True, 1), conf_intervals.rx(True, 2))) # now we have to extract the results log.info(standardized_model) r_coeffs = standardized_model.rx2("coefficients") coef_dict_std = dict(izip(r_coeffs.names, r_coeffs)) std_errors_std = dict(izip(ses_r.names, ses_r)) t_stats_std = dict(izip(cof_t_r.names, cof_t_r)) coefs_p_std = dict(izip(cof_p_r.names, cof_p_r)) residuals = list(standardized_model.rx2("residuals")) std_model = standardized_model.rx2("model") fitted = list(standardized_model.rx2("fitted.values")) intercept = "(Intercept)" std_names2orig_names_labels = dict() orig_names_labels2orig_vars = dict() dummy_vars_levels = defaultdict(dict) for std_name in coef_dict_std.iterkeys(): if std_name == intercept: # handle special case std_names2orig_names_labels[std_name] = std_name continue if ":" in std_name: # interactions factors = std_name.split(":") else: factors = [std_name] orig_factors_levels = [] orig_factors = [] for f in factors: dummy_level = None if f[1] == ".": if f[0] == "c": # sentinel value #dummy_level = -1 pass f = f[2:] if f[-1] != "R": r_pos = f.rfind("R") dummy_level = f[r_pos + 1:] f = f[:r_pos + 1] index = standard_var_names.index(f) orig_name = all_variables[index] if dummy_level is not None: # sentinel from above if dummy_level == -1: ls = labels_dicts[orig_name].items() ls.sort(key=lambda x: x[0]) lst = tuple(l[1] for l in ls) label = "%s-%s" % lst else: label = labels_dicts[orig_name][int(dummy_level)] dummy_vars_levels[orig_name][label] = int(dummy_level) orig_name_label = orig_name + "_%s" % label else: orig_name_label = orig_name[:] orig_factors_levels.append(orig_name_label) orig_factors.append(orig_name) if len(orig_factors_levels) == 1: orig_name_flat = orig_factors_levels[0] # coef_dicts[orig_factors[0]]=val else: orig_name_flat = "*".join(sorted(orig_factors_levels)) # coef_dicts[frozenset(orig_factors)]=val std_names2orig_names_labels[std_name] = orig_name_flat orig_names_labels2orig_vars[orig_name_flat] = tuple(orig_factors) coef_dicts = dict( (std_names2orig_names_labels[k], v) for k, v in coef_dict_std.iteritems()) std_errors = dict( (std_names2orig_names_labels[k], v) for k, v in std_errors_std.iteritems()) coef_t = dict( (std_names2orig_names_labels[k], v) for k, v in t_stats_std.iteritems()) coef_p = dict( (std_names2orig_names_labels[k], v) for k, v in coefs_p_std.iteritems()) conf_95 = dict( (std_names2orig_names_labels[k], v) for k, v in conf_95_std.iteritems()) r_names = dict((v, k) for k, v in std_names2orig_names_labels.iteritems()) # combine all this into a data frame coefs_df = pd.DataFrame(pd.Series(coef_dicts, name="Slope")) coefs_df["T Value"] = pd.Series(coef_t) coefs_df["P Value"] = pd.Series(coef_p) coefs_df["Std_error"] = pd.Series(std_errors) coefs_df["CI_95"] = pd.Series(conf_95) coefs_df["r_name"] = pd.Series(r_names) coefs_df["components"] = pd.Series(orig_names_labels2orig_vars) coefs_df.index.name = "Coefficient" # print coefs_df # translate columns to standard names std_names2orig_names = dict() for var, r_var in izip(all_variables, standard_var_names): t = var_type.get(var, "r") if t == "r": std_names2orig_names["z." + r_var] = var elif t == "b": std_names2orig_names["c." + r_var] = var elif t == "n": std_names2orig_names[r_var] = var else: pass assert False std_model = pandas2ri.ri2py(std_model) std_model.columns = [std_names2orig_names[c] for c in std_model.columns] std_model.index = data_frame.index adjusted_r_squared = fit_summary.rx2("adj.r.squared")[0] fit_p_val = r_environment["f_pval_r"][0] f_statistic = fit_summary.rx2("fstatistic").rx2("value")[0] f_statistic_nom_df = fit_summary.rx2("fstatistic").rx2("numdf")[0] f_statistic_denom_df = fit_summary.rx2("fstatistic").rx2("dendf")[0] out_dict = { "coefficients_df": coefs_df, "residuals": residuals, "fitted": fitted, "adj_r2": adjusted_r_squared, "f_pval": fit_p_val, "f_stats_val": f_statistic, "f_stat_df": (int(f_statistic_nom_df), int(f_statistic_denom_df)), "data_points": list(data_frame.index), "standardized_model": std_model, "data": data_frame, "mean_sigma": mean_sigma, "var_types": var_type, "dummy_levels": dummy_vars_levels, } log = logging.getLogger(__name__) log.info(dummy_vars_levels) return out_dict