API Reference

A collection of Bayesian statistical models and associated utility functions.

BEST(y, group, n_draws=1000)

Implementation of John Kruschke's BEST test.

Estimates parameters related to outcomes of two groups. See: https://jkkweb.sitehost.iu.edu/articles/Kruschke2013JEPG.pdf for more details.

Parameters:

    y (ndarray/Series): The metric outcome variable. Required.
    group (Series): The grouping variable that indexes into y. Required.
    n_draws: Number of random samples to draw from the posterior. Default: 1000.

Returns:

    pymc.Model and arviz.InferenceData objects.

Source code in bayes_toolbox/glm.py
def BEST(y, group, n_draws=1000):
    """Implementation of John Kruschke's BEST test.

    Estimates parameters related to outcomes of two groups. See: 
    https://jkkweb.sitehost.iu.edu/articles/Kruschke2013JEPG.pdf for more details.

    Args:
        y (ndarray/pandas.Series): The metric outcome variable.
        group (pandas.Series): The grouping variable that indexes into y.
        n_draws: Number of random samples to draw from the posterior.

    Returns: 
        pymc.Model and arviz.InferenceData objects.
    """

    # Convert grouping variable to categorical dtype if it is not already
    if not pd.api.types.is_categorical_dtype(group):
        group = group.astype("category")
    group_idx = group.cat.codes.values

    # Extract group levels and make sure there are only two
    level = group.cat.categories
    assert len(level) == 2, f"Expected two groups but got {len(level)}."

    # Calculate pooled empirical mean and SD of data to scale hyperparameters
    mu_y = y.mean()
    sigma_y = y.std()

    with pm.Model() as model:
        # Define priors. Arbitrarily set hyperparameters to the pooled
        # empirical mean of data and twice pooled empirical SD, which
        # applies very diffuse and unbiased info to these quantities.
        group_mean = pm.Normal(
            "group_mean", mu=mu_y, sigma=sigma_y * 2, shape=len(level)
        )
        group_std = pm.Uniform(
            "group_std", lower=sigma_y / 10, upper=sigma_y * 10, shape=len(level)
        )

        # See Kruschke Ch 16.2.1 for in-depth rationale for the prior on nu. The
        # addition of 1 shifts the distribution so that the range of possible values
        # of nu is 1 to infinity (with a mean of 30).
        nu_minus_one = pm.Exponential("nu_minus_one", 1 / 29)
        nu = pm.Deterministic("nu", nu_minus_one + 1)
        nu_log10 = pm.Deterministic("nu_log10", np.log10(nu))

        # Define likelihood
        likelihood = pm.StudentT(
            "likelihood",
            nu=nu,
            mu=group_mean[group_idx],
            sigma=group_std[group_idx],
            observed=y,
        )

        # Contrasts of interest
        diff_of_means = pm.Deterministic(
            "difference of means", group_mean[0] - group_mean[1]
        )
        diff_of_stds = pm.Deterministic(
            "difference of stds", group_std[0] - group_std[1]
        )
        effect_size = pm.Deterministic(
            "effect size",
            diff_of_means / np.sqrt((group_std[0] ** 2 + group_std[1] ** 2) / 2),
        )

        # Sample from posterior
        idata = pm.sample(draws=n_draws)

    return model, idata
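
Example: a minimal usage sketch with synthetic data. The import path follows the source location above (bayes_toolbox/glm.py); the group labels, seed, and values are illustrative, not part of the library.

import numpy as np
import pandas as pd
import arviz as az
from bayes_toolbox.glm import BEST

# Two synthetic groups with different means and SDs
rng = np.random.default_rng(42)
y = pd.Series(np.concatenate([rng.normal(0.0, 1.0, 60), rng.normal(0.8, 1.5, 60)]))
group = pd.Series(["control"] * 60 + ["treatment"] * 60)

model, idata = BEST(y, group, n_draws=1000)
az.summary(idata, var_names=["difference of means", "effect size"])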

BEST_paired(y1, y2=None, n_draws=1000)

BEST procedure on single or paired sample(s).

Parameters:

    y1 (ndarray/Series): Either a single sample or difference scores. Required.
    y2 (ndarray/Series): Optional paired sample (i.e., y2 elements are in the same order as y1). Default: None.
    n_draws: Number of random samples to draw from the posterior. Default: 1000.

Returns:

    pymc.Model and arviz.InferenceData objects.

Source code in bayes_toolbox/glm.py
def BEST_paired(y1, y2=None, n_draws=1000):
    """BEST procedure on single or paired sample(s).

    Args:
        y1 (ndarray/Series): Either single sample or difference scores.
        y2 (ndarray/Series): (Optional) If provided, represents the paired sample 
            (i.e., y2 elements are in same order as y1).

    Returns:
        pymc.Model and arviz.InferenceData objects.
    """

    # Check to see if y2 was entered. If so, then this means the
    # goal is to compare difference scores on a within subjects variable
    # (e.g., block). Otherwise, we are comparing location parameter to zero.
    if y2 is None:
        y = y1
    else:
        assert len(y1) == len(y2), "There must be equal numbers of observations."
        # Convert pre and post to difference scores.
        y = y1 - y2

    # Calculate pooled empirical mean and SD of data to scale hyperparameters
    mu_y = y.mean()
    sigma_y = y.std()

    with pm.Model() as model:
        # Define priors
        mu = pm.Normal("mu", mu=mu_y, sigma=sigma_y * 10)
        sigma = pm.Uniform("sigma", sigma_y / 10, sigma_y * 10)
        nu_minus1 = pm.Exponential("nu_minus_one", 1 / 29)
        nu = pm.Deterministic("nu", nu_minus1 + 1)

        # Define likelihood
        likelihood = pm.StudentT("likelihood", nu=nu, mu=mu, sigma=sigma, observed=y)

        # Standardized effect size
        effect_size = pm.Deterministic("effect_size", mu / sigma)

        # Sample from posterior
        idata = pm.sample(draws=n_draws)

    return model, idata
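
Example: a minimal sketch with synthetic paired scores (names and values are illustrative).

import numpy as np
from bayes_toolbox.glm import BEST_paired

rng = np.random.default_rng(0)
pre = rng.normal(100.0, 15.0, 40)
post = pre + rng.normal(5.0, 10.0, 40)  # same subjects, same order as pre

# With two samples given, the model is fit to the difference scores pre - post
model, idata = BEST_paired(pre, post)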

bayesian_logreg_cat_predictors(X, y, n_draws=1000)

Performs a Bayesian logistic regression using categorical predictors.

Parameters:

    X: Predictor matrix. Required.
    y: The outcome variable. Required.
    n_draws: Number of random samples to draw from the posterior. Default: 1000.

Returns:

    pymc.Model and arviz.InferenceData objects.

Source code in bayes_toolbox/glm.py
def bayesian_logreg_cat_predictors(X, y, n_draws=1000):
    """Performs a Bayesian logistic regression using categorical predictors.

    Args:
        X: Predictor matrix.
        y: The outcome variable.

    Returns:
        pymc.Model and arviz.InferenceData objects.
    """

    # Standardize the predictor variable(s)
    zX, mu_X, sigma_X = standardize(X)

    with pm.Model(coords={"predictors": X.columns.values}) as model:
        # Set priors
        zbeta0 = pm.Normal("zbeta0", mu=0, sigma=2)
        zbetaj = pm.Normal("zbetaj", mu=0, sigma=2, dims="predictors")

        p = pm.invlogit(zbeta0 + pm.math.dot(zX, zbetaj))

        # Define likelihood function
        likelihood = pm.Bernoulli("likelihood", p, observed=y)

        # Transform parameters to original scale
        beta0 = pm.Deterministic(
            "beta0", (zbeta0 - pm.math.sum(zbetaj * mu_X / sigma_X))
        )
        betaj = pm.Deterministic("betaj", zbetaj / sigma_X, dims="predictors")

        # Sample from the posterior
        idata = pm.sample(draws=n_draws)

        return model, idata
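
Example: a minimal sketch with synthetic binary predictors and outcomes (names and values are illustrative).

import numpy as np
import pandas as pd
from bayes_toolbox.glm import bayesian_logreg_cat_predictors

rng = np.random.default_rng(1)
X = pd.DataFrame({"x1": rng.integers(0, 2, 200), "x2": rng.integers(0, 2, 200)})
logit_p = -0.5 + 1.2 * X["x1"] - 0.8 * X["x2"]
y = rng.binomial(1, 1.0 / (1.0 + np.exp(-logit_p)))

model, idata = bayesian_logreg_cat_predictors(X, y)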

bayesian_logreg_subj_intercepts(subj, X, y, n_draws=1000)

Performs a Bayesian logistic regression using categorical predictors and fitting separate intercepts for each subject.

Parameters:

    subj: Subject IDs. Required.
    X: Predictor matrix. Required.
    y: The outcome variable. Required.
    n_draws: Number of random samples to draw from the posterior. Default: 1000.

Returns:

    pymc.Model and arviz.InferenceData objects.

Source code in bayes_toolbox/glm.py
def bayesian_logreg_subj_intercepts(subj, X, y, n_draws=1000):
    """Performs a Bayesian logistic regression using categorical predictors and fitting
    separate intercepts for each subject.

    Args:
        subj: Subject IDs.
        X: Predictor matrix.
        y: The outcome variable.

    Returns:
        pymc.Model and arviz.InferenceData objects.
    """
    # Factorize subj IDs and treatment variable
    subj_idx, subj_levels, n_subj = parse_categorical(subj)
    treatment_idx, treatment_levels, n_treatment = parse_categorical(X)

    with pm.Model(coords={"subj": subj_levels, "treatment": treatment_levels}) as model:
        # Set priors
        a = pm.Normal("a", 0.0, 1.5, dims="subj")
        b = pm.Normal("b", 0.0, 0.5, dims="treatment")

        p = pm.Deterministic("p", pm.math.invlogit(a[subj_idx] + b[treatment_idx]))

        # Define likelihood function (in this case, observations are 0 or 1)
        likelihood = pm.Binomial("likelihood", 1, p, observed=y)

        # Draw samples from the posterior
        idata = pm.sample(draws=n_draws)

        return model, idata

bayesian_mixed_model_anova(between_subj_var, within_subj_var, subj_id, y, n_samples=1000)

Performs Bayesian analogue of mixed model (split-plot) ANOVA.

Models instance of outcome resulting from both between- and within-subjects factors. Outcome is measured several times from each observational unit (i.e., repeated measures).

Parameters:

    between_subj_var: The between-subjects variable. Required.
    within_subj_var: The within-subjects variable. Required.
    subj_id: The subject ID variable. Required.
    y: The outcome variable. Required.
    n_samples: Number of random samples to draw from the posterior. Default: 1000.

Returns:

    pymc.Model and arviz.InferenceData objects.

Source code in bayes_toolbox/glm.py
def bayesian_mixed_model_anova(
    between_subj_var, within_subj_var, subj_id, y, n_samples=1000
):
    """Performs Bayesian analogue of mixed model (split-plot) ANOVA.

    Models instance of outcome resulting from both between- and within-subjects
    factors. Outcome is measured several times from each observational unit (i.e.,
    repeated measures).

    Args:
        between_subj_var: The between-subjects variable.
        within_subj_var: The within-subjects variable.
        subj_id: The subj ID variable.
        y: The outcome variable.

    Returns:
        pymc.Model and arviz.InferenceData objects.
    """
    # Statistical model: Split-plot design after Kruschke Ch. 20
    # Between-subjects factor (i.e., group)
    x_between, levels_x_between, num_levels_x_between = parse_categorical(
        between_subj_var
    )

    # Within-subjects factor (i.e., target set)
    x_within, levels_x_within, num_levels_x_within = parse_categorical(within_subj_var)

    # Individual subjects
    x_subj, levels_x_subj, num_levels_x_subj = parse_categorical(subj_id)

    # Dependent variable
    mu_y = y.mean()
    sigma_y = y.std()

    a_shape, a_rate = gamma_shape_rate_from_mode_sd(sigma_y / 2, 2 * sigma_y)

    with pm.Model(
        coords={
            "between_subj": levels_x_between,
            "within_subj": levels_x_within,
            "subj": levels_x_subj,
        }
    ) as model:
        # Baseline value
        a0 = pm.Normal("a0", mu=mu_y, sigma=sigma_y * 5)

        # Deflection from baseline for between subjects factor
        sigma_B = pm.Gamma("sigma_B", a_shape, a_rate)
        aB = pm.Normal("aB", mu=0.0, sigma=sigma_B, dims="between_subj")

        # Deflection from baseline for within subjects factor
        sigma_W = pm.Gamma("sigma_W", a_shape, a_rate)
        aW = pm.Normal("aW", mu=0.0, sigma=sigma_W, dims="within_subj")

        # Deflection from baseline for combination of between and within subjects factors
        sigma_BxW = pm.Gamma("sigma_BxW", a_shape, a_rate)
        aBxW = pm.Normal(
            "aBxW", mu=0.0, sigma=sigma_BxW, dims=("between_subj", "within_subj")
        )

        # Deflection from baseline for individual subjects
        sigma_S = pm.Gamma("sigma_S", a_shape, a_rate)
        aS = pm.Normal("aS", mu=0.0, sigma=sigma_S, dims="subj")

        mu = a0 + aB[x_between] + aW[x_within] + aBxW[x_between, x_within] + aS[x_subj]
        sigma = pm.Uniform("sigma", lower=sigma_y / 100, upper=sigma_y * 10)

        # Define likelihood
        likelihood = pm.Normal("likelihood", mu=mu, sigma=sigma, observed=y)

        # Sample from the posterior
        idata = pm.sample(draws=n_samples, tune=2000, target_accept=0.95)

    return model, idata
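
Example: a minimal sketch of a split-plot design with synthetic data (20 subjects in two groups, each measured in three conditions; names and values are illustrative).

import numpy as np
import pandas as pd
from bayes_toolbox.glm import bayesian_mixed_model_anova

rng = np.random.default_rng(2)
n_subj, conditions = 20, ["c1", "c2", "c3"]
subj_id = pd.Series(np.repeat(np.arange(n_subj), len(conditions)))
between = pd.Series(
    np.repeat(np.where(np.arange(n_subj) < 10, "ctrl", "treat"), len(conditions))
)
within = pd.Series(np.tile(conditions, n_subj))
y = pd.Series(rng.normal(0.0, 1.0, n_subj * len(conditions)) + (between == "treat") * 0.6)

model, idata = bayesian_mixed_model_anova(between, within, subj_id, y)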

bayesian_oneway_rm_anova(x1, x_s, y, tune=2000, n_draws=1000)

Bayesian analogue of RM ANOVA.

Models instance of outcome resulting from two categorical predictors, x1 and x_s (subject identifier). This model assumes each subject contributes only one value to each cell. Therefore, there is no modeling of an interaction effect (see Kruschke Ch. 20.5).

Parameters:

    x1: First categorical predictor variable. Required.
    x_s: Second categorical predictor variable - subject. Required.
    y: The outcome variable. Required.
    tune: Number of tuning (warm-up) steps for the sampler. Default: 2000.
    n_draws: Number of random samples to draw from the posterior. Default: 1000.

Returns:

    pymc.Model and arviz.InferenceData objects.

Source code in bayes_toolbox/glm.py
def bayesian_oneway_rm_anova(x1, x_s, y, tune=2000, n_draws=1000):
    """Bayesian analogue of RM ANOVA.

    Models instance of outcome resulting from two categorical predictors,
    x1 and x_s (subject identifier). This model assumes each subject
    contributes only one value to each cell. Therefore, there is no
    modeling of an interaction effect (see Kruschke Ch. 20.5).

    Args:
        x1: First categorical predictor variable.
        x_s: Second categorical predictor variable - subject.
        y: The outcome variable.

    Returns:
        pymc.Model and arviz.InferenceData objects.
    """
    mu_y = y.mean()
    sigma_y = y.std()

    a_shape, a_rate = gamma_shape_rate_from_mode_sd(sigma_y / 2, 2 * sigma_y)
    x1_vals, levels1, n_levels1 = parse_categorical(x1)
    x_s_vals, levels_s, n_levels_s = parse_categorical(x_s)

    with pm.Model(coords={"factor1": levels1, "factor_subj": levels_s}) as model:
        # To understand the reparameterization, see:
        # http://twiecki.github.io/blog/2017/02/08/bayesian-hierchical-non-centered/
        a0_tilde = pm.Normal("a0_tilde", mu=0, sigma=1)
        a0 = pm.Deterministic("a0", mu_y + sigma_y * 5 * a0_tilde)

        sigma_a1 = pm.Gamma("sigma_a1", a_shape, a_rate)
        # sigma_a1 = 5
        a1_tilde = pm.Normal("a1_tilde", mu=0, sigma=1, dims="factor1")
        a1 = pm.Deterministic("a1", 0.0 + sigma_a1 * a1_tilde)

        sigma_a_s = pm.Gamma("sigma_a_s", a_shape, a_rate)
        a_s_tilde = pm.Normal("a_s_tilde", mu=0, sigma=1, dims="factor_subj")
        a_s = pm.Deterministic("a_s", 0.0 + sigma_a_s * a_s_tilde)

        mu = a0 + a1[x1_vals] + a_s[x_s_vals]
        sigma = pm.Uniform("sigma", sigma_y / 100, sigma_y * 10)

        # Define the likelihood function
        likelihood = pm.Normal("likelihood", mu, sigma=sigma, observed=y)

        # Sample from the posterior
        idata = pm.sample(draws=n_draws, tune=tune, target_accept=0.95)

        return model, idata
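
Example: a minimal sketch with synthetic repeated-measures data (each subject contributes one value per condition; names and values are illustrative).

import numpy as np
import pandas as pd
from bayes_toolbox.glm import bayesian_oneway_rm_anova

rng = np.random.default_rng(3)
n_subj, conditions = 15, ["c1", "c2", "c3"]
x_s = pd.Series(np.repeat(np.arange(n_subj), len(conditions)))
x1 = pd.Series(np.tile(conditions, n_subj))
y = pd.Series(rng.normal(0.0, 1.0, n_subj * len(conditions)) + (x1 == "c3") * 0.5)

model, idata = bayesian_oneway_rm_anova(x1, x_s, y)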

bayesian_two_factor_anova(x1, x2, y, n_draws=1000)

Bayesian analogue of two-factor ANOVA.

Models instance of outcome resulting from two categorical predictors.

Parameters:

    x1: First categorical predictor variable. Required.
    x2: Second categorical predictor variable. Required.
    y: The outcome variable. Required.
    n_draws: Number of random samples to draw from the posterior. Default: 1000.

Returns:

    pymc.Model and arviz.InferenceData objects.

Source code in bayes_toolbox/glm.py
def bayesian_two_factor_anova(x1, x2, y, n_draws=1000):
    """Bayesian analogue of two-factor ANOVA.

    Models instance of outcome resulting from two categorical predictors.

    Args:
        x1: First categorical predictor variable.
        x2: Second categorical predictor variable.
        y: The outcome variable.

    Returns:
        pymc.Model and arviz.InferenceData objects.
    """
    mu_y = y.mean()
    sigma_y = y.std()

    a_shape, a_rate = gamma_shape_rate_from_mode_sd(sigma_y / 2, 2 * sigma_y)
    x1_vals, levels1, n_levels1 = parse_categorical(x1)
    x2_vals, levels2, n_levels2 = parse_categorical(x2)

    with pm.Model(coords={"factor1": levels1, "factor2": levels2}) as model:
        # To understand the reparameterization, see:
        # http://twiecki.github.io/blog/2017/02/08/bayesian-hierchical-non-centered/
        a0_tilde = pm.Normal("a0_tilde", mu=0, sigma=1)
        a0 = pm.Deterministic("a0", mu_y + sigma_y * 5 * a0_tilde)

        sigma_a1 = pm.Gamma("sigma_a1", a_shape, a_rate)
        a1_tilde = pm.Normal("a1_tilde", mu=0, sigma=1, dims="factor1")
        a1 = pm.Deterministic("a1", 0.0 + sigma_a1 * a1_tilde)

        sigma_a2 = pm.Gamma("sigma_a2", a_shape, a_rate)
        a2_tilde = pm.Normal("a2_tilde", mu=0, sigma=1, dims="factor2")
        a2 = pm.Deterministic("a2", 0.0 + sigma_a2 * a2_tilde)

        sigma_a1a2 = pm.Gamma("sigma_a1a2", a_shape, a_rate)
        a1a2_tilde = pm.Normal("a1a2_tilde", mu=0, sigma=1, dims=("factor1", "factor2"))
        a1a2 = pm.Deterministic("a1a2", 0.0 + sigma_a1a2 * a1a2_tilde)

        mu = a0 + a1[x1_vals] + a2[x2_vals] + a1a2[x1_vals, x2_vals]
        sigma = pm.Uniform("sigma", sigma_y / 100, sigma_y * 10)

        # Define the likelihood function
        likelihood = pm.Normal("likelihood", mu, sigma=sigma, observed=y)

        # Sample from the posterior
        idata = pm.sample(draws=n_draws, target_accept=0.95)

        return model, idata
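
Example: a minimal sketch with two crossed synthetic factors (names and values are illustrative).

import numpy as np
import pandas as pd
from bayes_toolbox.glm import bayesian_two_factor_anova

rng = np.random.default_rng(4)
n_per_cell = 15
x1 = pd.Series(np.repeat(["low", "high"], 3 * n_per_cell))
x2 = pd.Series(np.tile(np.repeat(["A", "B", "C"], n_per_cell), 2))
y = pd.Series(rng.normal(0.0, 1.0, len(x1)) + (x1 == "high") * 0.5)

model, idata = bayesian_two_factor_anova(x1, x2, y)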

calc_marginal_means(m_SxBxW)

Calculate the marginalized means using the masked array.

Intended for use with bayesian_mixed_model_anova.

Source code in bayes_toolbox/glm.py
def calc_marginal_means(m_SxBxW):
    """Calculate the marginalized means using the masked array.

    Intended for use with bayesian_mixed_model_anova.
    """

    # Mean for subject S across levels of W, within the level of B
    m_S = m_SxBxW.mean(dim="within_subj")

    # Mean for treatment combination BxW, across subjects S
    m_BxW = m_SxBxW.mean(dim="subj")

    # Mean for level B, across W and S
    m_B = m_BxW.mean(dim=["within_subj"])

    # Mean for level W, across B and S
    m_W = m_BxW.mean(dim=["between_subj"])

    return m_S, m_BxW, m_B, m_W

convert_to_sum_to_zero(idata, between_subj_var, within_subj_var, subj_id)

Returns coefficients that obey sum-to-zero constraint.

Parameters:

    idata: InferenceData object. Required.
    between_subj_var: Between-subjects predictor variable. Required.
    within_subj_var: Within-subjects predictor variable. Required.
    subj_id: Subject ID variable. Required.

Returns:

    Posterior variables.

Source code in bayes_toolbox/glm.py
def convert_to_sum_to_zero(idata, between_subj_var, within_subj_var, subj_id):
    """Returns coefficients that obey sum-to-zero constraint.

    Args:
        idata: InferenceData object.
        between_subj_var: Between-subjects predictor variable.
        within_subj_var: Within-subjects predictor variable.
        subj_id: Subject ID variable.

    Returns:
        Posterior variables.
    """

    posterior = az.extract(idata.posterior)
    a0, aB, aW, aBxW, aS, sigma = unpack_posterior_vars(posterior)
    m_SxBxW = create_masked_array(
        a0, aB, aW, aBxW, aS, posterior, between_subj_var, within_subj_var, subj_id
    )
    m_S, m_BxW, m_B, m_W = calc_marginal_means(m_SxBxW)

    # Equation 20.3
    m = m_BxW.mean(dim=["between_subj", "within_subj"])
    posterior = posterior.assign(b0=m)

    # Equation 20.4
    posterior = posterior.assign(bB=m_B - m)

    # Equation 20.5
    posterior = posterior.assign(bW=m_W - m)

    # Equation 20.6
    posterior = posterior.assign(bBxW=m_BxW - m_B - m_W + m)

    # Equation 20.7 (removing between_subj dimension)
    posterior = posterior.assign(
        bS=(["subj", "sample"], (m_S - m_B).mean(dim="between_subj").data)
    )

    assert np.allclose(posterior.bB.sum(dim=["between_subj"]), 0, atol=1e-5)
    assert np.allclose(posterior.bW.sum(dim=["within_subj"]), 0, atol=1e-5)
    assert np.allclose(
        posterior.bBxW.sum(dim=["between_subj", "within_subj"]), 0, atol=1e-5
    )
    assert np.allclose(posterior.bS.sum(dim=["subj"]), 0, atol=1e-5)

    return posterior

create_masked_array(a0, aB, aW, aBxW, aS, posterior, between_subj_var, within_subj_var, subj_id)

Creates a masked array with all cell values from the posterior.

Intended for use with bayesian_mixed_model_anova.

Source code in bayes_toolbox/glm.py
def create_masked_array(
    a0, aB, aW, aBxW, aS, posterior, between_subj_var, within_subj_var, subj_id
):
    """Creates a masked array with all cell values from the posterior.

    Intended for use with bayesian_mixed_model_anova.
    """
    # Between-subjects factor
    x_between, levels_x_between, num_levels_x_between = parse_categorical(
        between_subj_var
    )

    # Within-subjects factor
    x_within, levels_x_within, num_levels_x_within = parse_categorical(within_subj_var)

    # Individual subjects
    x_subj, levels_x_subj, num_levels_x_subj = parse_categorical(subj_id)

    # Initialize the array with zeros
    posterior = posterior.assign(
        m_SxBxW=(
            ["subj", "between_subj", "within_subj", "sample"],
            np.zeros(
                (
                    num_levels_x_subj,
                    num_levels_x_between,
                    num_levels_x_within,
                    len(posterior["sample"]),
                )
            ),
        )
    )

    # Fill the array
    for k, i, j in zip(x_subj, x_between, x_within):
        posterior.m_SxBxW[k, i, j, :] = (
            a0 + aB[i, :] + aW[j, :] + aBxW[i, j, :] + aS[k, :]
        )

    # Convert to masked array that masks value '0'.
    posterior = posterior.assign(
        m_SxBxW=(
            ["subj", "between_subj", "within_subj", "sample"],
            (ma.masked_equal(posterior.m_SxBxW, 0)),
        )
    )

    m_SxBxW = posterior.m_SxBxW

    return m_SxBxW

gamma_shape_rate_from_mode_sd(mode, sd)

Calculate Gamma shape and rate parameters from mode and sd.

Parameters:

    mode: Mode of the distribution. Required.
    sd: Standard deviation of the distribution. Required.

Source code in bayes_toolbox/glm.py
def gamma_shape_rate_from_mode_sd(mode, sd):
    """Calculate Gamma shape and rate parameters from mode and sd.

    Args:
        mode: Mode of distribution.
        sd: Standard deviation of distribution.

    """
    rate = (mode + np.sqrt(mode**2 + 4 * sd**2)) / (2 * sd**2)
    shape = 1 + mode * rate

    return shape, rate
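
Worked example (values are illustrative): for a Gamma(shape, rate) distribution the mode is (shape - 1) / rate and the SD is sqrt(shape) / rate, so the returned parameters can be verified directly.

import numpy as np
from bayes_toolbox.glm import gamma_shape_rate_from_mode_sd

shape, rate = gamma_shape_rate_from_mode_sd(mode=2.0, sd=4.0)
print((shape - 1) / rate)     # 2.0 (the requested mode)
print(np.sqrt(shape) / rate)  # 4.0 (the requested SD)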

hierarchical_bayesian_ancova(x, x_met, y, mu_x_met, mu_y, sigma_x_met, sigma_y, n_draws=1000)

Models outcome resulting from categorical and metric predictors.

The Bayesian analogue of an ANCOVA test.

Parameters:

    x: The categorical predictor. Required.
    x_met: The metric predictor. Required.
    y: The outcome variable. Required.
    mu_x_met: The mean of x_met. Required.
    mu_y: The mean of y. Required.
    sigma_x_met: The SD of x_met. Required.
    sigma_y: The SD of y. Required.
    n_draws: Number of random samples to draw from the posterior. Default: 1000.

Returns:

    pymc.Model and arviz.InferenceData objects.

Source code in bayes_toolbox/glm.py
def hierarchical_bayesian_ancova(
    x, x_met, y, mu_x_met, mu_y, sigma_x_met, sigma_y, n_draws=1000
):
    """Models outcome resulting from categorical and metric predictors.

    The Bayesian analogue of an ANCOVA test.

    Args:
        x: The categorical predictor.
        x_met: The metric predictor.
        y: The outcome variable.
        mu_x_met: The mean of x_met.
        mu_y: The mean of y.
        sigma_x_met: The SD of x_met.
        sigma_y: The SD of y.

    Returns:
        pymc.Model and arviz.InferenceData objects.
    """
    x_vals, levels, n_levels = parse_categorical(x)

    a_shape, a_rate = gamma_shape_rate_from_mode_sd(sigma_y / 2, 2 * sigma_y)

    with pm.Model(coords={"groups": levels}) as model:
        # 'a' indicates coefficients not yet obeying the sum-to-zero constraint
        sigma_a = pm.Gamma("sigma_a", alpha=a_shape, beta=a_rate)
        a0 = pm.Normal("a0", mu=mu_y, sigma=sigma_y * 5)
        a = pm.Normal("a", 0.0, sigma=sigma_a, dims="groups")
        a_met = pm.Normal("a_met", mu=0, sigma=2 * sigma_y / sigma_x_met)
        # Note that in Warmhoven notebook he uses SD of residuals to set
        # lower bound on sigma_y
        sigma_y = pm.Uniform("sigma_y", sigma_y / 100, sigma_y * 10)
        mu = a0 + a[x_vals] + a_met * (x_met - mu_x_met)

        # Define the likelihood function
        likelihood = pm.Normal("likelihood", mu=mu, sigma=sigma_y, observed=y)

        # Convert a0, a to sum-to-zero b0, b
        b0 = pm.Deterministic("b0", a0 + at.mean(a) + a_met * (-mu_x_met))
        b = pm.Deterministic("b", a - at.mean(a))

        # Sample from the posterior
        idata = pm.sample(draws=n_draws)

        return model, idata

hierarchical_bayesian_anova(x, y, n_draws=1000, acceptance_rate=0.9)

Models metric outcome resulting from single categorical predictor.

Parameters:

    x: The categorical (nominal) predictor variable. Required.
    y: The outcome variable. Required.
    n_draws: Number of random samples to draw from the posterior. Default: 1000.
    acceptance_rate: Target acceptance rate for the sampler. Default: 0.9.

Returns:

    pymc.Model and arviz.InferenceData objects.

Source code in bayes_toolbox/glm.py
def hierarchical_bayesian_anova(x, y, n_draws=1000, acceptance_rate=0.9):
    """Models metric outcome resulting from single categorical predictor.

    Args:
        x: The categorical (nominal) predictor variable.
        y: The outcome variable.

    Returns:
        pymc.Model and arviz.InferenceData objects.
    """

    mu_y = y.mean()
    sigma_y = y.std()

    x_vals, x_levels, n_levels = parse_categorical(x)
    a_shape, a_rate = gamma_shape_rate_from_mode_sd(sigma_y / 2, 2 * sigma_y)

    with pm.Model(coords={"groups": x_levels}) as model:
        # 'a' indicates coefficients not yet obeying the sum-to-zero constraint
        sigma_a = pm.Gamma("sigma_a", alpha=a_shape, beta=a_rate)
        a0 = pm.Normal("a0", mu=mu_y, sigma=sigma_y * 5)
        a = pm.Normal("a", 0.0, sigma=sigma_a, dims="groups")

        sigma_y = pm.Uniform("sigma_y", sigma_y / 100, sigma_y * 10)

        # Define the likelihood function
        likelihood = pm.Normal("likelihood", a0 + a[x_vals], sigma=sigma_y, observed=y)

        # Convert a0, a to sum-to-zero b0, b
        m = pm.Deterministic("m", a0 + a)
        b0 = pm.Deterministic("b0", at.mean(m))
        b = pm.Deterministic("b", m - b0)

        idata = pm.sample(draws=n_draws, target_accept=acceptance_rate)

        return model, idata
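
Example: a minimal sketch with one synthetic three-level predictor (names and values are illustrative).

import numpy as np
import pandas as pd
from bayes_toolbox.glm import hierarchical_bayesian_anova

rng = np.random.default_rng(5)
x = pd.Series(np.repeat(["a", "b", "c"], 30))
y = pd.Series(
    np.concatenate(
        [rng.normal(0.0, 1.0, 30), rng.normal(1.0, 1.0, 30), rng.normal(2.0, 1.0, 30)]
    )
)

model, idata = hierarchical_bayesian_anova(x, y)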

hierarchical_regression(x, y, subj, n_draws=1000, acceptance_rate=0.9)

A multi-level model for estimating group and individual level parameters.

Parameters:

    x (ndarray/Series): Predictor variable. Required.
    y (ndarray/Series): Outcome variable. Required.
    subj (Series): Subject ID variable. Required.
    n_draws: Number of random samples to draw from the posterior. Default: 1000.
    acceptance_rate: Target acceptance rate for the sampler. Default: 0.9.

Returns:

    pymc.Model and arviz.InferenceData objects.

Source code in bayes_toolbox/glm.py
def hierarchical_regression(x, y, subj, n_draws=1000, acceptance_rate=0.9):
    """A multi-level model for estimating group and individual level parameters.

    Args:
        x (ndarray/pandas.Series): Predictor variable.
        y (ndarray/pandas.Series): Outcome variable.
        subj (pandas.Series): Subj id variable.

    Returns:
        pymc.Model and arviz.InferenceData objects.
    """
    zx, mu_x, sigma_x = standardize(x)
    zy, mu_y, sigma_y = standardize(y)

    # Convert subject variable to categorical dtype if it is not already
    subj_idx, subj_levels, n_subj = parse_categorical(subj)

    # Taking advantage of the label-based indexing provided by xarray. See:
    # https://www.pymc.io/projects/docs/en/stable/learn/core_notebooks/pymc_overview.html
    with pm.Model(coords={"subj": subj_levels}) as model:
        # Hyperpriors
        zbeta0 = pm.Normal("zbeta0", mu=0, tau=1 / 10**2)
        zbeta1 = pm.Normal("zbeta1", mu=0, tau=1 / 10**2)
        zsigma0 = pm.Uniform("zsigma0", 10**-3, 10**3)
        zsigma1 = pm.Uniform("zsigma1", 10**-3, 10**3)

        # Priors for individual subject parameters. See:
        # http://twiecki.github.io/blog/2017/02/08/bayesian-hierchical-non-centered/
        # for rationale of following reparameterization.
        zbeta0_s_offset = pm.Normal("zbeta0_s_offset", mu=0, sigma=1, dims="subj")
        zbeta0_s = pm.Deterministic(
            "zbeta0_s", zbeta0 + zbeta0_s_offset * zsigma0, dims="subj"
        )

        zbeta1_s_offset = pm.Normal("zbeta1_s_offset", mu=0, sigma=1, dims="subj")
        zbeta1_s = pm.Deterministic(
            "zbeta1_s", zbeta1 + zbeta1_s_offset * zsigma1, dims="subj"
        )

        mu = zbeta0_s[subj_idx] + zbeta1_s[subj_idx] * zx

        zsigma = pm.Uniform("zsigma", 10**-3, 10**3)
        nu_minus_one = pm.Exponential("nu_minus_one", 1 / 29)
        nu = pm.Deterministic("nu", nu_minus_one + 1)

        # Define likelihood function
        likelihood = pm.StudentT("likelihood", nu, mu=mu, sigma=zsigma, observed=zy)

        # Sample from posterior
        idata = pm.sample(draws=n_draws, target_accept=acceptance_rate)

        return model, idata
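
Example: a minimal sketch with synthetic multi-subject data in which each subject has its own slope (names and values are illustrative).

import numpy as np
import pandas as pd
from bayes_toolbox.glm import hierarchical_regression

rng = np.random.default_rng(6)
n_subj, n_obs = 10, 20
subj = pd.Series(np.repeat(np.arange(n_subj), n_obs))
x = rng.normal(0.0, 1.0, n_subj * n_obs)
subj_slopes = rng.normal(0.5, 0.2, n_subj)
y = 1.0 + subj_slopes[subj.values] * x + rng.normal(0.0, 0.5, n_subj * n_obs)

model, idata = hierarchical_regression(x, y, subj)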

is_standardized(X, eps=0.0001)

Checks to see if variable is standardized (i.e., N(0, 1)).

Parameters:

    X: Random variable. Required.
    eps: Small tolerance value. Default: 0.0001.

Returns:

    bool

Source code in bayes_toolbox/glm.py
def is_standardized(X, eps=0.0001):
    """Checks to see if variable is standardized (i.e., N(0, 1)).

    Args:
        X: Random variable.
        eps: Some small value to represent tolerance level.

    Returns:
        Bool
    """

    if isinstance(X, pd.DataFrame):
        mu_X = X.mean().values
        sigma_X = X.std().values
        return np.equal(
            (mu_X**2 < eps).sum() + ((sigma_X - 1) ** 2 < eps).sum(),
            len(mu_X) + len(sigma_X),
        )
    else:
        return (X.mean() ** 2 < eps) & ((X.std() - 1) ** 2 < eps)
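
Example (values are illustrative); pairs naturally with standardize from the same module.

import numpy as np
from bayes_toolbox.glm import is_standardized, standardize

x = np.random.default_rng(7).normal(10.0, 3.0, 500)
print(is_standardized(x))  # False

zx, mu_x, sigma_x = standardize(x)
print(is_standardized(zx))  # True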

multiple_linear_regression(X, y, n_draws=1000)

Perform a Bayesian multiple linear regression.

Parameters:

    X (DataFrame): Predictor variables, one per column. Required.
    y (ndarray/Series): The outcome variable. Required.
    n_draws: Number of random samples to draw from the posterior. Default: 1000.

Returns:

    pymc.Model and arviz.InferenceData objects.

Source code in bayes_toolbox/glm.py
def multiple_linear_regression(X, y, n_draws=1000):
    """Perform a Bayesian multiple linear regression.

    Args:
        X (pandas.DataFrame): Predictor variables are in different columns.
        y (ndarray/Series): The outcome variable.

    Returns:
        pymc.Model and arviz.InferenceData objects.
    """

    # Standardize both predictor and outcome variables if they are not already.
    if not (is_standardized(X) and is_standardized(y)):
        X, _, _ = standardize(X)
        y, _, _ = standardize(y)

    # For explanation of how predictor variables are handled, see:
    # https://www.pymc.io/projects/docs/en/stable/learn/core_notebooks/pymc_overview.html
    with pm.Model(coords={"predictors": X.columns.values}) as model:
        # Define priors
        zbeta0 = pm.Normal("zbeta0", mu=0, sigma=2)
        zbeta = pm.Normal("zbeta", mu=0, sigma=2, dims="predictors")

        nu_minus_one = pm.Exponential("nu_minus_one", 1 / 29)
        nu = pm.Deterministic("nu", nu_minus_one + 1)
        nu_log10 = pm.Deterministic("nu_log10", np.log10(nu))

        mu = zbeta0 + pm.math.dot(X, zbeta)
        zsigma = pm.Uniform("zsigma", 10**-5, 10)

        # Define likelihood
        likelihood = pm.StudentT(
            "likelihood", nu=nu, mu=mu, lam=1 / zsigma**2, observed=y
        )

        # Sample the posterior
        idata = pm.sample(draws=n_draws)

        return model, idata
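
Example: a minimal sketch with two synthetic predictors (names and values are illustrative). Standardization is handled internally when needed.

import numpy as np
import pandas as pd
from bayes_toolbox.glm import multiple_linear_regression

rng = np.random.default_rng(8)
X = pd.DataFrame({"x1": rng.normal(0.0, 1.0, 150), "x2": rng.normal(0.0, 1.0, 150)})
y = 2.0 * X["x1"] - 1.0 * X["x2"] + rng.normal(0.0, 1.0, 150)

model, idata = multiple_linear_regression(X, y)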

oneway_rm_anova_convert_to_sum_to_zero(idata, x1, x_s)

Returns coefficients that obey sum-to-zero constraint.

Parameters:

    idata: InferenceData object. Required.
    x1: First categorical predictor variable. Required.
    x_s: Second categorical predictor variable - subject. Required.

Returns:

    Posterior variables.

Source code in bayes_toolbox/glm.py
def oneway_rm_anova_convert_to_sum_to_zero(idata, x1, x_s):
    """Returns coefficients that obey sum-to-zero constraint.

    Args:
        idata: InferenceData object.
        x1: First categorical predictor variable.
        x_s: Second categorical predictor variable - subject.

    Returns:
        Posterior variables.
    """
    # Extract posterior probabilities and stack the chains
    post = az.extract(idata.posterior)

    _, _, n_levels_x1 = parse_categorical(x1)
    _, _, n_levels_x_s = parse_categorical(x_s)

    # Add variables
    post = post.assign(
        m=(
            ["factor1", "factor_subj", "sample"],
            np.zeros((n_levels_x1, n_levels_x_s, len(post["sample"]))),
        )
    )
    post = post.assign(b0=(["sample"], np.zeros(len(post["sample"]))))
    post = post.assign(
        b1=(["factor1", "sample"], np.zeros((n_levels_x1, len(post["sample"]))))
    )
    post = post.assign(
        b_s=(["factor_subj", "sample"], np.zeros((n_levels_x_s, len(post["sample"]))))
    )

    # Transforming the trace data to sum-to-zero values. First, calculate
    # predicted mean values based on different levels of predictors.
    for j1, j2 in np.ndindex(n_levels_x1, n_levels_x_s):
        post.m[j1, j2] = post["a0"] + post["a1"][j1, :] + post["a_s"][j2, :]

    post["b0"] = post.m.mean(dim=["factor1", "factor_subj"])
    post["b1"] = post.m.mean(dim="factor_subj") - post.b0
    post["b_s"] = post.m.mean(dim="factor1") - post.b0

    assert np.allclose(post.b1.sum(dim=["factor1"]), 0, atol=1e-5)
    assert np.allclose(post.b_s.sum(dim=["factor_subj"]), 0, atol=1e-5)

    return post

parse_categorical(x)

A function for extracting information from a grouping variable.

If the input arg is not already a category-type variable, converts it to one. Then, extracts the codes, unique levels, and number of levels from the variable.

Parameters:

    x (categorical): The categorical-type variable to parse. Required.

Returns:

    The codes, unique levels, and number of levels from the input variable.

Source code in bayes_toolbox/glm.py
def parse_categorical(x):
    """A function for extracting information from a grouping variable.

    If the input arg is not already a category-type variable, converts
    it to one. Then, extracts the codes, unique levels, and number of
    levels from the variable.

    Args:
        x (categorical): The categorical type variable to parse.

    Returns:
        The codes, unique levels, and number of levels from the input variable.
    """

    # Convert x to type "category" if it is not already.
    if not pd.api.types.is_categorical_dtype(x):
        x = x.astype("category")
    categorical_values = x.cat.codes.values
    levels = x.cat.categories
    n_levels = len(levels)

    return categorical_values, levels, n_levels
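
Example (values are illustrative):

import pandas as pd
from bayes_toolbox.glm import parse_categorical

x = pd.Series(["a", "b", "a", "c", "b"])
codes, levels, n_levels = parse_categorical(x)
print(codes)         # [0 1 0 2 1]
print(list(levels))  # ['a', 'b', 'c']
print(n_levels)      # 3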

robust_bayesian_anova(x, y, mu_y, sigma_y, n_draws=1000, acceptance_rate=0.9)

Bayesian analogue of ANOVA using a t-distributed likelihood function.

Parameters:

    x: The categorical predictor variable. Required.
    y: The outcome variable. Required.
    mu_y: The mean of y. Required.
    sigma_y: The SD of y. Required.
    n_draws: Number of random samples to draw from the posterior. Default: 1000.
    acceptance_rate: Target acceptance rate for the sampler. Default: 0.9.

Returns:

    pymc.Model and arviz.InferenceData objects.

Source code in bayes_toolbox/glm.py
def robust_bayesian_anova(x, y, mu_y, sigma_y, n_draws=1000, acceptance_rate=0.9):
    """Bayesian analogue of ANOVA using a t-distributed likelihood function.

    Args:
        x: The categorical predictor variable.
        y: The outcome variable.
        mu_y: The mean of y.
        sigma_y: The SD of y.

    Returns:
        pymc.Model and arviz.InferenceData objects.
    """
    x_vals, levels, n_levels = parse_categorical(x)

    a_shape, a_rate = gamma_shape_rate_from_mode_sd(sigma_y / 2, 2 * sigma_y)

    with pm.Model(coords={"groups": levels}) as model:
        # 'a' indicates coefficients not yet obeying the sum-to-zero constraint
        sigma_a = pm.Gamma("sigma_a", alpha=a_shape, beta=a_rate)
        a0 = pm.Normal("a0", mu=mu_y, sigma=sigma_y * 10)
        a = pm.Normal("a", 0.0, sigma=sigma_a, dims="groups")

        # Hyperparameters
        sigma_y_sd = pm.Gamma("sigma_y_sd", alpha=a_shape, beta=a_rate)
        sigma_y_mode = pm.Gamma("sigma_y_mode", alpha=a_shape, beta=a_rate)
        sigma_y_rate = (
            sigma_y_mode + np.sqrt(sigma_y_mode**2 + 4 * sigma_y_sd**2)
        ) / (2 * sigma_y_sd**2)
        sigma_y_shape = 1 + sigma_y_mode * sigma_y_rate

        sigma_y = pm.Gamma(
            "sigma", alpha=sigma_y_shape, beta=sigma_y_rate, dims="groups"
        )
        nu_minus1 = pm.Exponential("nu_minus1", 1 / 29)
        nu = pm.Deterministic("nu", nu_minus1 + 1)

        # Define the likelihood function (t-distributed, for robustness to outliers)
        likelihood = pm.StudentT(
            "likelihood", nu=nu, mu=a0 + a[x_vals], sigma=sigma_y[x_vals], observed=y
        )

        # Convert a0, a to sum-to-zero b0, b
        m = pm.Deterministic("m", a0 + a)
        b0 = pm.Deterministic("b0", at.mean(m))
        b = pm.Deterministic("b", m - b0)

        # Sample from the posterior. Initialization argument is necessary
        # for sampling to converge.
        idata = pm.sample(
            draws=n_draws, init="advi+adapt_diag", target_accept=acceptance_rate
        )

        return model, idata

robust_linear_regression(x, y, n_draws=1000)

Perform a robust linear regression with one predictor.

The observations are modeled with a t-distribution which can much more readily account for "outlier" observations.

Parameters:

    x (ndarray): The standardized predictor (independent) variable. Required.
    y (ndarray): The standardized outcome (dependent) variable. Required.
    n_draws: Number of random samples to draw from the posterior. Default: 1000.

Returns:

    pymc.Model and arviz.InferenceData objects.

Source code in bayes_toolbox/glm.py
def robust_linear_regression(x, y, n_draws=1000):
    """Perform a robust linear regression with one predictor.

    The observations are modeled with a t-distribution which can much more
    readily account for "outlier" observations.

    Args:
        x (ndarray): The standardized predictor (independent) variable.
        y (ndarray): The standardized outcome (dependent) variable.
        n_draws: Number of random samples to draw from the posterior.

    Returns:
        pymc.Model and arviz.InferenceData objects.
    """

    # Standardize both predictor and outcome variables if they are not already.
    if not (is_standardized(x) and is_standardized(y)):
        x, _, _ = standardize(x)
        y, _, _ = standardize(y)

    with pm.Model() as model:
        # Define priors
        zbeta0 = pm.Normal("zbeta0", mu=0, sigma=2)
        zbeta1 = pm.Normal("zbeta1", mu=0, sigma=2)

        sigma = pm.Uniform("sigma", 10**-3, 10**3)
        nu_minus_one = pm.Exponential("nu_minus_one", 1 / 29)
        nu = pm.Deterministic("nu", nu_minus_one + 1)
        nu_log10 = pm.Deterministic("nu_log10", np.log10(nu))

        mu = zbeta0 + zbeta1 * x

        # Define likelihood
        likelihood = pm.StudentT("likelihood", nu=nu, mu=mu, sigma=sigma, observed=y)

        # Sample from posterior
        idata = pm.sample(draws=n_draws)

    return model, idata

standardize(X)

Standardize the input variable.

Parameters:

    X (ndarray): The variable(s) to standardize. Required.

Returns:

    The standardized variable(s), along with the original mean(s) and SD(s).

Source code in bayes_toolbox/glm.py
def standardize(X):
    """Standardize the input variable.

    Args:
        X (ndarray): The variable(s) to standardize.

    Returns:
        The standardized variable(s), along with the original mean(s) and SD(s).
    """

    # Check if X is a dataframe, in which case it will be handled differently.
    if isinstance(X, pd.DataFrame):
        mu_X = X.mean().values
        sigma_X = X.std().values
    else:
        mu_X = X.mean(axis=0)
        sigma_X = X.std(axis=0)

    # Standardize X
    X_s = (X - mu_X) / sigma_X

    return X_s, mu_X, sigma_X

two_factor_anova_convert_to_sum_to_zero(idata, x1, x2)

Returns coefficients that obey sum-to-zero constraint.

Parameters:

    idata: arviz.InferenceData object. Required.
    x1: First categorical predictor variable. Required.
    x2: Second categorical predictor variable. Required.

Returns:

    Posterior variables.

Source code in bayes_toolbox/glm.py
def two_factor_anova_convert_to_sum_to_zero(idata, x1, x2):
    """Returns coefficients that obey sum-to-zero constraint.

    Args:
        idata: arviz.InferenceData object.
        x1: First categorical predictor variable.
        x2: Second categorical predictor variable.

    Returns:
        Posterior variables.
    """
    # Extract posterior probabilities and stack your chains
    post = az.extract(idata.posterior)

    _, _, n_levels_x1 = parse_categorical(x1)
    _, _, n_levels_x2 = parse_categorical(x2)

    # Add variables
    post = post.assign(
        m=(
            ["factor1", "factor2", "sample"],
            np.zeros((n_levels_x1, n_levels_x2, len(post["sample"]))),
        )
    )
    post = post.assign(
        b1b2=(
            ["factor1", "factor2", "sample"],
            np.zeros((n_levels_x1, n_levels_x2, len(post["sample"]))),
        )
    )
    post = post.assign(b0=(["sample"], np.zeros(len(post["sample"]))))
    post = post.assign(
        b1=(["factor1", "sample"], np.zeros((n_levels_x1, len(post["sample"]))))
    )
    post = post.assign(
        b2=(["factor2", "sample"], np.zeros((n_levels_x2, len(post["sample"]))))
    )

    # Transforming the trace data to sum-to-zero values. First, calculate
    # predicted mean values based on different levels of predictors.
    for j1, j2 in np.ndindex(n_levels_x1, n_levels_x2):
        post.m[j1, j2] = (
            post["a0"] + post["a1"][j1, :] + post["a2"][j2, :] + post["a1a2"][j1, j2, :]
        )

    post["b0"] = post.m.mean(dim=["factor1", "factor2"])
    post["b1"] = post.m.mean(dim="factor2") - post.b0
    post["b2"] = post.m.mean(dim="factor1") - post.b0

    for j1, j2 in np.ndindex(n_levels_x1, n_levels_x2):
        post.b1b2[j1, j2] = post.m[j1, j2] - (post.b0 + post.b1[j1] + post.b2[j2])

    assert np.allclose(post.b1.sum(dim=["factor1"]), 0, atol=1e-5)
    assert np.allclose(post.b2.sum(dim=["factor2"]), 0, atol=1e-5)
    assert np.allclose(post.b1b2.sum(dim=["factor1", "factor2"]), 0, atol=1e-5)

    return post

unpack_posterior_vars(posterior)

Unpacks posterior variables from xarray structure.

Intended for use with bayesian_mixed_model_anova.

Parameters:

    posterior: Posterior variables from an InferenceData object. Required.

Returns:

    Posterior variables.

Source code in bayes_toolbox/glm.py
def unpack_posterior_vars(posterior):
    """Unpacks posterior variables from xarray structure.

    Intended for use with bayesian_mixed_model_anova.

    Args:
        posterior: Posterior variables from InferenceData object.

    Returns:
        Posterior variables.
    """
    a0 = posterior["a0"]
    aB = posterior["aB"]
    aW = posterior["aW"]
    aBxW = posterior["aBxW"]
    aS = posterior["aS"]
    sigma = posterior["sigma"]

    return a0, aB, aW, aBxW, aS, sigma

unstandardize_linreg_parameters(zbeta0, zbeta1, sigma, x, y)

Convert parameters back to raw scale of data.

Function takes in parameter values from PyMC InferenceData and returns them in original scale of raw data.

Parameters:

    zbeta0: Intercept for standardized data. Required.
    zbeta1: Slope for standardized data. Required.
    sigma: SD of standardized data. Required.
    x: The predictor (independent) variable. Required.
    y: The outcome (dependent) variable. Required.

Returns:

    ndarrays

Source code in bayes_toolbox/glm.py
def unstandardize_linreg_parameters(zbeta0, zbeta1, sigma, x, y):
    """Convert parameters back to raw scale of data.

    Function takes in parameter values from PyMC InferenceData and returns
    them in original scale of raw data.

    Args:
        zbeta0: Intercept for standardized data.
        zbeta1: Slope for standardized data.
        sigma: SD of standardized data.
        x: The predictor (independent) variable.
        y: The outcome (dependent) variable.

    Returns:
        ndarrays
    """
    _, mu_x, sigma_x = standardize(x)
    _, mu_y, sigma_y = standardize(y)

    beta0 = zbeta0 * sigma_y + mu_y - zbeta1 * mu_x * sigma_y / sigma_x
    beta1 = zbeta1 * sigma_y / sigma_x
    sigma = sigma * sigma_y

    return beta0, beta1, sigma
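
Example: a minimal end-to-end sketch pairing this function with robust_linear_regression (data, seed, and recovered values are illustrative).

import numpy as np
import arviz as az
from bayes_toolbox.glm import robust_linear_regression, unstandardize_linreg_parameters

rng = np.random.default_rng(9)
x = rng.normal(10.0, 2.0, 100)
y = 3.0 + 0.5 * x + rng.normal(0.0, 1.0, 100)

model, idata = robust_linear_regression(x, y)
post = az.extract(idata.posterior)
beta0, beta1, sigma = unstandardize_linreg_parameters(
    post["zbeta0"], post["zbeta1"], post["sigma"], x, y
)
print(float(beta0.mean()), float(beta1.mean()))  # should land near 3.0 and 0.5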

unstandardize_multiple_linreg_parameters(zbeta0, zbeta, zsigma, X, y)

Rescale standardized coefficients to magnitudes on raw scale.

To be used with multiple_linear_regression. If the posterior samples come from multiple chains, they should be combined and the zbetas should have dimensionality of (predictors, draws).

Parameters:

    zbeta0: Standardized intercept. Required.
    zbeta: Standardized multiple regression coefficients for the predictor variables. Required.
    zsigma: Standardized standard deviation. Required.
    X: Predictor matrix. Required.
    y: Outcome variable. Required.

Returns:

    Coefficients and scale parameter rescaled to the original data scale.

Source code in bayes_toolbox/glm.py
def unstandardize_multiple_linreg_parameters(zbeta0, zbeta, zsigma, X, y):
    """Rescale standardized coefficients to magnitudes on raw scale.

    To be used with multiple_linear_regression. If the posterior samples come from
    multiple chains, they should be combined and the zbetas should have dimensionality
    of (predictors, draws).

    Args:
        zbeta0: Standardized intercept.
        zbeta: Standardized multiple regression coefficients for predictor
                variables.
        zsigma: Standardized standard deviation.
        X: Predictor matrix.
        y: Outcome variable.

    Returns:
        Coefficients and scale parameter rescaled to the original data scale.
    """

    _, mu_X, sigma_X = standardize(X)
    _, mu_y, sigma_y = standardize(y)

    # beta0 will turn out to be 1d bc we are broadcasting, and Numpy's sum
    # reduces the axis over which summation occurs.
    beta0 = zbeta0 * sigma_y + mu_y - np.sum(zbeta.T * mu_X / sigma_X, axis=1) * sigma_y
    beta = (zbeta.T / sigma_X) * sigma_y
    sigma = zsigma * sigma_y

    return beta0, beta, sigma

A collection of Bayesian statistical models and associated utility functions.

meta_binary_outcome(z_t_obs, n_t_obs, z_c_obs, n_c_obs, study, n_draws=1000)

Fits multi-level meta-analysis model of binary outcomes.

See meta-analysis-two-proportions.ipynb in examples for usage.

Parameters:

    z_t_obs: Number of occurrences in the treatment group. Required.
    n_t_obs: Number of opportunities/participants in the treatment group. Required.
    z_c_obs: Number of occurrences in the control group. Required.
    n_c_obs: Number of opportunities/participants in the control group. Required.
    study: List of studies included in the analysis. Required.
    n_draws: Number of random samples to draw from the posterior. Default: 1000.

Returns:

    pymc.Model and arviz.InferenceData objects.

Source code in bayes_toolbox/meta.py
def meta_binary_outcome(z_t_obs, n_t_obs, z_c_obs, n_c_obs, study, n_draws=1000):
    """Fits multi-level meta-analysis model of binary outcomes.

    See meta-analysis-two-proportions.ipynb in examples for usage.

    Args:
        z_t_obs: Number of occurrences in the treatment group.
        n_t_obs: Number of opportunities/participants in the treatment group.
        z_c_obs: Number of occurrences in the control group.
        n_c_obs: Number of opportunities/participants in the control group.
        study: List of studies included in the analysis.

    Returns:
        pymc.Model and arviz.InferenceData objects.
    """

    with pm.Model(coords={"study": study}) as model:
        # Hyper-priors
        mu_rho = pm.Normal("mu_rho", mu=0, sigma=10)
        sigma_rho = pm.Gamma("sigma_rho", alpha=1.64, beta=0.64)  # mode=1, sd=2

        omega_theta_c = pm.Beta("omega_theta_c", alpha=1.01, beta=1.01)
        kappa_minus_two_theta_c = pm.Gamma(
            "kappa_minus_two_theta_c", alpha=2.618, beta=0.162
        )  # mode=10, sd=10
        kappa_theta_c = pm.Deterministic("kappa_theta_c", kappa_minus_two_theta_c + 2)

        # Priors
        rho = pm.Normal("rho", mu=mu_rho, sigma=sigma_rho, dims="study")
        theta_c = pm.Beta(
            "theta_c",
            alpha=omega_theta_c * (kappa_theta_c - 2) + 1,
            beta=(1 - omega_theta_c) * (kappa_theta_c - 2) + 1,
            dims="study",
        )
        theta_t = pm.Deterministic(
            "theta_t", pm.invlogit(rho + pm.logit(theta_c))
        )  # ilogit is logistic

        # Likelihood
        z_t = pm.Binomial("z_t", n_t_obs, theta_t, observed=z_t_obs)
        z_c = pm.Binomial("z_c", n_c_obs, theta_c, observed=z_c_obs)

        # Sample from the posterior
        idata = pm.sample(draws=n_draws, target_accept=0.90)

        return model, idata
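
Example: a minimal sketch; the studies and counts below are illustrative, not real data.

from bayes_toolbox.meta import meta_binary_outcome

study = ["Study A", "Study B", "Study C"]
z_t_obs, n_t_obs = [12, 8, 20], [50, 40, 90]
z_c_obs, n_c_obs = [6, 5, 11], [48, 42, 85]

model, idata = meta_binary_outcome(z_t_obs, n_t_obs, z_c_obs, n_c_obs, study)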

meta_normal_outcome_beta_version(eff_size, se_eff_size, study, n_draws=1000)

Fits multi-level meta-analysis model of normally-distributed outcomes.

See meta-analyses.ipynb in examples for usage.

Parameters:

    eff_size: Reported standardized effect size. Required.
    se_eff_size: Reported standard error. Required.
    study: List of studies included in the analysis. Required.
    n_draws: Number of random samples to draw from the posterior. Default: 1000.

Returns:

    pymc.Model and arviz.InferenceData objects.

Source code in bayes_toolbox/meta.py
def meta_normal_outcome_beta_version(eff_size, se_eff_size, study, n_draws=1000):
    """Fits multi-level meta-analysis model of normally-distribute outcomes.

    See meta-analyses.ipynb in examples for usage.

    Args:
        eff_size: Reported standardized effect size
        se_eff_size: Reported standard error
        study: List of studies included in analysis

    Returns:
        pymc.Model and arviz.InferenceData objects.
    """

    with pm.Model(coords={"study": study}) as model:
        # Hyper-priors (assumes observations are standardized effect sizes)
        mu_theta = pm.Normal("mu_theta", mu=0, sigma=1)  # Diffuse prior
        tau_theta = pm.Exponential("tau_theta", lam=1)  # Expected value = 1 / lam

        # Priors
        theta = pm.Normal("theta", mu=mu_theta, sigma=tau_theta, dims="study")

        # Likelihood: y is empirical effect size of each study
        y = pm.Normal("y", mu=theta, tau=1 / (se_eff_size**2), observed=eff_size)

        # Sample from the posterior
        idata = pm.sample(draws=n_draws, target_accept=0.90)

    return model, idata
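
Example: a minimal sketch; the effect sizes and standard errors below are illustrative, not real data.

import numpy as np
from bayes_toolbox.meta import meta_normal_outcome_beta_version

study = ["S1", "S2", "S3", "S4"]
eff_size = np.array([0.30, 0.15, 0.45, 0.10])
se_eff_size = np.array([0.10, 0.12, 0.15, 0.08])

model, idata = meta_normal_outcome_beta_version(eff_size, se_eff_size, study)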