A/B Testing¶
Practical examples for comparing variants in experiments: proportion tests, continuous metrics, equivalence testing, and per-segment analysis.
Conversion Rate Comparison¶
The most common A/B test — did variant B improve the conversion rate?
import polars as pl
import polars_statistics as ps

# Experiment results: 1000 users per variant
control_conversions = 120
control_n = 1000
treatment_conversions = 145
treatment_n = 1000

# Two-sample proportion test: treatment is group 1, control is group 2.
# All inputs are scalars, so pl.select() evaluates the expression without
# an input DataFrame and yields a one-row result.
result = pl.select(
    ps.prop_test_two(
        successes1=treatment_conversions, n1=treatment_n,
        successes2=control_conversions, n2=control_n,
    ).alias("prop_test")
)

# The result column holds a struct; row 0 is accessed like a dict of fields.
pt = result["prop_test"][0]
print(f"Treatment rate: {treatment_conversions/treatment_n:.1%}")
print(f"Control rate: {control_conversions/control_n:.1%}")
# 'estimate' is the difference in proportions (treatment - control);
# 'statistic' is the chi-squared value printed below.
print(f"Difference: {pt['estimate']:.4f}")
print(f"Chi²: {pt['statistic']:.4f}")
print(f"p-value: {pt['p_value']:.6f}")
Expected output:

Plot code
import matplotlib.pyplot as plt

# Bar chart of the two conversion rates with error bars.
labels = ["Control", "Treatment"]
pct = [12.0, 14.5]
bar_colors = ["#4C72B0", "#55A868"]
err = [[2.0, 2.2], [2.0, 2.2]]

figure, axes = plt.subplots(figsize=(5, 4))
axes.bar(labels, pct, color=bar_colors, width=0.5, yerr=err, capsize=8)
axes.set_ylabel("Conversion Rate (%)")
axes.set_title("A/B Test: Conversion Rates")
plt.tight_layout()
plt.savefig("abt_conversion_rates.png", dpi=150)
With Continuity Correction¶
For small sample sizes, apply Yates' correction:
# Re-run the two-sample proportion test with the continuity correction
# enabled — appropriate here, where each group has only 50 observations.
result = pl.select(
    ps.prop_test_two(
        successes1=15, n1=50,
        successes2=8, n2=50,
        correction=True,  # apply Yates' continuity correction
    ).alias("prop_test")
)
pt = result["prop_test"][0]
print(f"Corrected Chi²: {pt['statistic']:.4f}")
print(f"p-value: {pt['p_value']:.6f}")
Expected output:
Continuous Metric Comparison¶
Compare revenue per user across variants:
# Revenue per user for 30 users in each variant; zeros are non-purchasers.
df = pl.DataFrame({
    "control": [12.5, 0, 8.3, 0, 15.2, 0, 22.1, 0, 9.8, 0,
                18.4, 0, 7.6, 0, 25.3, 0, 11.0, 0, 14.7, 0,
                0, 16.8, 0, 19.5, 0, 6.2, 0, 21.9, 0, 13.1],
    "treatment": [15.8, 0, 11.2, 0, 18.5, 3.2, 28.4, 0, 13.1, 0,
                  22.7, 0, 10.9, 5.1, 30.2, 0, 14.3, 0, 17.6, 0,
                  2.8, 20.1, 0, 24.8, 0, 9.5, 0, 26.3, 3.7, 16.4],
})

# Revenue data is typically skewed (many zeros) — check normality first
normality = df.select(
    ps.shapiro_wilk("control").alias("ctrl_norm"),
    ps.shapiro_wilk("treatment").alias("treat_norm"),
)
ctrl_p = normality["ctrl_norm"][0]["p_value"]
treat_p = normality["treat_norm"][0]["p_value"]
print(f"Control normality p={ctrl_p:.4f}, Treatment normality p={treat_p:.4f}")

# For skewed data, use non-parametric or robust tests.
# Run all three side by side:
#   ttest_ind       — classic two-sample t-test
#   mann_whitney_u  — rank-based, no normality assumption
#   yuen_test       — trimmed-means test (20% trim), robust to outliers
tests = df.select(
    ps.ttest_ind("treatment", "control").alias("ttest"),
    ps.mann_whitney_u("treatment", "control").alias("mwu"),
    ps.yuen_test("treatment", "control", trim=0.2).alias("yuen"),
)
for name in ["ttest", "mwu", "yuen"]:
    r = tests[name][0]
    print(f"{name:8s}: statistic={r['statistic']:.4f}, p={r['p_value']:.6f}")
Expected output:
Control normality p=0.0001, Treatment normality p=0.0009
ttest : statistic=0.9932, p=0.324853
mwu : statistic=517.0000, p=0.305012
yuen : statistic=0.7505, p=0.458257

Plot code
import matplotlib.pyplot as plt
import numpy as np

# Overlaid revenue histograms for both variants, with dashed mean lines.
variants = (
    ("control", "Control", "#4C72B0"),
    ("treatment", "Treatment", "#55A868"),
)
edges = np.linspace(0, 32, 17)

figure, axes = plt.subplots(figsize=(7, 4))
for column, label, shade in variants:
    axes.hist(df[column].to_list(), bins=edges, alpha=0.6,
              label=label, color=shade)
for column, _, shade in variants:
    axes.axvline(df[column].mean(), color=shade, ls="--", lw=1.5)
axes.set_xlabel("Revenue per User ($)")
axes.set_ylabel("Frequency")
axes.legend()
plt.tight_layout()
plt.savefig("abt_revenue_distributions.png", dpi=150)
Equivalence Testing (TOST)¶
Standard tests ask "is there a difference?" — TOST asks "are these practically equivalent?"
This is critical for non-inferiority trials, platform migrations, and guardrail metrics.
Proportion Equivalence¶
After redesigning the checkout flow, verify the new conversion rate is equivalent to the old one within 2 percentage points:
# TOST on two proportions: the designs are "equivalent" if the difference
# in conversion rates lies within ±delta.
result = pl.select(
    ps.tost_prop_two(
        successes1=482, n1=1000,  # new design
        successes2=475, n2=1000,  # old design
        delta=0.02,  # equivalence margin: ±2%
    ).alias("tost")
)
tost = result["tost"][0]
# 'tost_p_value' is the larger of the two one-sided p-values;
# 'equivalent' is the boolean verdict at the chosen margin.
print(f"Difference: {tost['estimate']:.4f}")
print(f"TOST p: {tost['tost_p_value']:.6f}")
print(f"Equivalent: {tost['equivalent']}")
# equivalent=True → new design conversion rate is within ±2% of old
Expected output:
Mean Equivalence¶
Test whether two variants produce equivalent average session duration:
# Average session-duration samples for two variants, 20 users each.
df_duration = pl.DataFrame({
    "variant_a": [5.2, 4.8, 6.1, 5.5, 4.9, 5.8, 5.3, 6.0, 5.1, 4.7,
                  5.6, 5.0, 5.9, 5.4, 4.6, 5.7, 5.2, 6.2, 5.3, 4.8],
    "variant_b": [5.0, 5.1, 5.8, 5.3, 5.2, 5.5, 5.1, 5.7, 4.9, 5.0,
                  5.4, 5.2, 5.6, 5.1, 4.8, 5.3, 5.0, 5.9, 5.2, 5.1],
})

# Standard two-sample TOST
# delta=0.5 is the equivalence margin, expressed in the data's own units.
tost_result = df_duration.select(
    ps.tost_t_test_two_sample("variant_a", "variant_b", delta=0.5).alias("tost")
)
tost = tost_result["tost"][0]
print(f"Mean difference: {tost['estimate']:.4f}")
print(f"CI: [{tost['ci_lower']:.4f}, {tost['ci_upper']:.4f}]")
print(f"TOST p-value: {tost['tost_p_value']:.6f}")
print(f"Equivalent: {tost['equivalent']}")
Expected output:

Plot code
import matplotlib.pyplot as plt

# Schematic of the TOST result: equivalence margins at ±0.5 (dashed red),
# the shaded equivalence zone, and the observed mean difference with its CI.
margin = 0.5
figure, axes = plt.subplots(figsize=(8, 2.5))
for bound in (-margin, margin):
    axes.axvline(bound, color="#C44E52", ls="--", lw=2)
axes.axvspan(-margin, margin, alpha=0.08, color="#55A868")
ci_lo, ci_hi = -0.1213, 0.3113
axes.plot([ci_lo, ci_hi], [0.5, 0.5], color="#4C72B0", lw=3)
axes.plot(0.095, 0.5, "o", color="#4C72B0", ms=10)
axes.set_xlabel("Mean Difference")
axes.set_title("TOST Equivalence Test")
axes.set_yticks([])
plt.tight_layout()
plt.savefig("abt_tost_diagram.png", dpi=150)
Using Cohen's d Bounds¶
When you don't have a natural equivalence margin, use effect size:
# bounds_type="cohen_d" makes delta a standardized effect-size bound
# instead of a raw mean difference; d=0.3 is a conventional "small" effect.
tost_cohen = df_duration.select(
    ps.tost_t_test_two_sample(
        "variant_a", "variant_b",
        bounds_type="cohen_d", delta=0.3,  # small effect size threshold
    ).alias("tost")
)
tost = tost_cohen["tost"][0]
print(f"Equivalent (d < 0.3): {tost['equivalent']}")
Expected output:
Comparing Traditional Test vs TOST¶
Run both side-by-side to illustrate the difference:
# Run the significance test and the equivalence test on the same data to
# contrast what each one can — and cannot — conclude.
both = df_duration.select(
    ps.ttest_ind("variant_a", "variant_b").alias("traditional"),
    ps.tost_t_test_two_sample("variant_a", "variant_b", delta=0.5).alias("equivalence"),
)
trad = both["traditional"][0]
equiv = both["equivalence"][0]
print(f"Traditional t-test: p={trad['p_value']:.4f}")
print(f" → {'Significant' if trad['p_value'] < 0.05 else 'Not significant'} difference")
print(f"TOST equivalence: p={equiv['tost_p_value']:.4f}")
print(f" → {'Equivalent' if equiv['equivalent'] else 'Not equivalent'} within ±0.5")
# A non-significant t-test does NOT prove equivalence — that's what TOST is for.
Expected output:
Traditional t-test: p=0.4624
→ Not significant difference
TOST equivalence: p=0.0017
→ Equivalent within ±0.5
Per-Segment A/B Analysis¶
Run the same test across multiple user segments using group_by:
# Per-segment dataset: 20 observations per segment.
# Rows 0-19 are mobile, 20-39 desktop, 40-59 tablet.
df_segments = pl.DataFrame({
    "segment": (["mobile"] * 20) + (["desktop"] * 20) + (["tablet"] * 20),
    "control": [3.2, 4.1, 2.8, 3.5, 4.0, 3.1, 3.8, 2.9, 4.2, 3.6,
                3.3, 3.9, 2.7, 4.3, 3.4, 3.0, 4.1, 3.7, 2.6, 3.5,
                5.1, 6.2, 5.8, 6.5, 5.3, 6.0, 5.7, 6.3, 5.5, 6.1,
                5.9, 5.4, 6.4, 5.2, 6.6, 5.6, 6.0, 5.8, 6.2, 5.3,
                4.0, 4.5, 3.8, 4.7, 4.2, 3.9, 4.6, 4.1, 4.8, 4.3,
                4.4, 3.7, 4.9, 4.0, 4.6, 4.2, 4.3, 3.8, 4.5, 4.1],
    "treatment": [3.8, 4.5, 3.2, 4.0, 4.6, 3.7, 4.3, 3.4, 4.8, 4.1,
                  3.9, 4.4, 3.1, 4.9, 3.8, 3.5, 4.7, 4.2, 3.0, 4.0,
                  5.3, 6.4, 6.0, 6.7, 5.5, 6.2, 5.9, 6.5, 5.7, 6.3,
                  6.1, 5.6, 6.6, 5.4, 6.8, 5.8, 6.2, 6.0, 6.4, 5.5,
                  4.2, 4.7, 4.0, 4.9, 4.4, 4.1, 4.8, 4.3, 5.0, 4.5,
                  4.6, 3.9, 5.1, 4.2, 4.8, 4.4, 4.5, 4.0, 4.7, 4.3],
})

# group_by/agg runs each test once per segment; the struct results are then
# unpacked into flat columns for display.
segment_results = (
    df_segments.group_by("segment")
    .agg(
        ps.ttest_ind("treatment", "control").alias("ttest"),
        ps.tost_t_test_two_sample("treatment", "control", delta=0.5).alias("tost"),
    )
    .sort("segment")  # stable row order for display
    .with_columns(
        pl.col("ttest").struct.field("statistic").alias("t_stat"),
        pl.col("ttest").struct.field("p_value").alias("p_value"),
        pl.col("tost").struct.field("equivalent").alias("equivalent"),
    )
    # NOTE(review): this select drops the "tost" struct column, so it is no
    # longer available on segment_results after this point.
    .select("segment", "t_stat", "p_value", "equivalent")
)
print(segment_results)
# ┌─────────┬──────────┬──────────┬────────────┐
# │ segment ┆ t_stat   ┆ p_value  ┆ equivalent │
# ╞═════════╪══════════╪══════════╪════════════╡
# │ desktop ┆ 1.4051   ┆ 0.168117 ┆ true       │
# │ mobile  ┆ 2.9529   ┆ 0.005388 ┆ false      │
# │ tablet  ┆ 1.8092   ┆ 0.078344 ┆ true       │
# └─────────┴──────────┴──────────┴────────────┘

Plot code
import matplotlib.pyplot as plt
import numpy as np

# Forest plot of the per-segment mean difference with its TOST CI.
#
# BUG FIX: the original read pl.col("tost") from segment_results, but the
# pipeline above ends with .select("segment", "t_stat", "p_value",
# "equivalent"), which drops the "tost" struct column — that lookup raises
# ColumnNotFoundError. Recompute the per-segment TOST results directly from
# df_segments (same parameters as above) before extracting estimate and CI.
tost_data = (
    df_segments.group_by("segment")
    .agg(ps.tost_t_test_two_sample("treatment", "control", delta=0.5).alias("tost"))
    .sort("segment")  # same ordering as the results table above
    .with_columns(
        pl.col("tost").struct.field("estimate").alias("est"),
        pl.col("tost").struct.field("ci_lower").alias("ci_lo"),
        pl.col("tost").struct.field("ci_upper").alias("ci_hi"),
    )
)
segments = tost_data["segment"].to_list()

fig, ax = plt.subplots(figsize=(7, 3.5))
for i, row in enumerate(tost_data.iter_rows(named=True)):
    # One point-and-whisker per segment; errorbar wants asymmetric xerr as
    # [[lower offsets], [upper offsets]] relative to the estimate.
    ax.errorbar(row["est"], i,
                xerr=[[row["est"] - row["ci_lo"]], [row["ci_hi"] - row["est"]]],
                fmt="o", ms=8, capsize=6, lw=2,
                # Highlight segments whose CI excludes zero (a clear lift).
                color="#4C72B0" if row["ci_lo"] > 0 else "#999")
ax.axvline(0, color="#C44E52", ls="--", lw=1.5, alpha=0.7)
ax.set_yticks(range(len(segments)))
ax.set_yticklabels(segments)
ax.set_xlabel("Mean Difference (Treatment − Control)")
ax.invert_yaxis()  # first segment at the top, forest-plot convention
plt.tight_layout()
plt.savefig("abt_segment_forest.png", dpi=150)
Categorical Outcomes¶
Chi-Square Test of Independence¶
Test whether conversion depends on the variant:
# Contingency table (row-major flattened):
#              Converted   Not Converted
# Control:        120           880
# Treatment:      145           855
counts = pl.DataFrame({
    "counts": [120, 880, 145, 855]
})

# chisq_test interprets the flat counts column as an n_rows x n_cols table
# (row-major, matching the layout above) and tests independence.
result = counts.select(
    ps.chisq_test("counts", n_rows=2, n_cols=2).alias("chisq")
)
chi = result["chisq"][0]
print(f"Chi²: {chi['statistic']:.4f}")
print(f"p-value: {chi['p_value']:.6f}")
print(f"df: {chi['df']}")

# Effect size
effect = counts.select(
    ps.cramers_v("counts", n_rows=2, n_cols=2).alias("v")
)
v = effect["v"][0]
print(f"Cramér's V: {v['estimate']:.4f}")
# V < 0.1 = negligible, 0.1-0.3 = small, 0.3-0.5 = medium, > 0.5 = large
Expected output:
Fisher's Exact Test¶
For small samples where chi-square approximation is unreliable:
# Small pilot study
#              Success   Failure
# Treatment:      8         2
# Control:        3         7
# a, b, c, d are the 2x2 cells in reading order of the table above:
# a/b = treatment success/failure, c/d = control success/failure.
result = pl.select(
    ps.fisher_exact(a=8, b=2, c=3, d=7).alias("fisher")
)
f = result["fisher"][0]
# 'statistic' carries the odds ratio, per the label below.
print(f"Odds ratio: {f['statistic']:.4f}")
print(f"p-value: {f['p_value']:.6f}")
Expected output:
Next Steps¶
- More equivalence tests — paired, correlation, proportion, non-parametric, and bootstrap variants: Equivalence Testing (TOST)
- Categorical analysis — goodness-of-fit, McNemar, Cohen's kappa, and more: Categorical Data Analysis