Group-wise Analysis¶
The core strength of polars-statistics: run statistical tests and regressions per group using native Polars group_by and over — no loops, no apply, automatic parallelization.
Setup¶
import polars as pl
import polars_statistics as ps
# Sales data across 3 regions
df = pl.DataFrame({
"region": (["North"] * 30) + (["South"] * 30) + (["West"] * 30),
"sales": [120, 135, 142, 128, 155, 138, 145, 132, 150, 140,
125, 148, 133, 156, 141, 130, 152, 137, 144, 127,
158, 136, 149, 131, 153, 139, 146, 134, 151, 143,
200, 215, 225, 208, 235, 218, 228, 212, 232, 220,
205, 230, 213, 238, 222, 210, 234, 217, 226, 207,
240, 216, 231, 211, 236, 219, 227, 214, 233, 223,
160, 175, 168, 182, 170, 190, 165, 185, 172, 178,
163, 188, 171, 193, 176, 167, 191, 174, 183, 162,
195, 173, 186, 164, 192, 177, 184, 169, 189, 180],
"price": [25, 28, 30, 26, 32, 29, 31, 27, 33, 28,
24, 31, 27, 34, 29, 26, 33, 28, 30, 25,
35, 27, 32, 26, 34, 29, 31, 27, 33, 30,
40, 43, 45, 41, 47, 44, 46, 42, 48, 43,
39, 46, 42, 49, 44, 41, 47, 43, 45, 40,
50, 44, 47, 42, 48, 44, 46, 43, 48, 45,
30, 33, 32, 35, 31, 37, 30, 36, 33, 34,
29, 36, 32, 38, 34, 31, 37, 33, 35, 29,
39, 33, 36, 30, 38, 34, 35, 32, 37, 34],
"advertising": [5, 7, 8, 6, 10, 7, 9, 6, 10, 8,
5, 9, 7, 11, 8, 6, 10, 7, 9, 5,
12, 7, 10, 6, 11, 8, 9, 7, 10, 9,
8, 10, 12, 9, 14, 11, 13, 9, 15, 11,
7, 13, 10, 15, 12, 9, 14, 10, 12, 8,
16, 11, 14, 9, 15, 11, 13, 10, 14, 12,
6, 8, 7, 10, 7, 12, 6, 11, 8, 9,
5, 11, 7, 13, 9, 6, 12, 8, 10, 5,
14, 8, 11, 6, 13, 9, 10, 7, 12, 9],
})
group_by: One Result per Group¶
Regression per Group¶
Fit a separate OLS model for each region:
models = (
df.group_by("region")
.agg(
ps.ols("sales", "price", "advertising").alias("model")
)
.sort("region")
)
# Extract model metrics into a flat comparison table
comparison = models.with_columns(
pl.col("model").struct.field("r_squared").alias("r_squared"),
pl.col("model").struct.field("adj_r_squared").alias("adj_r_squared"),
pl.col("model").struct.field("rmse").alias("rmse"),
pl.col("model").struct.field("intercept").alias("intercept"),
pl.col("model").struct.field("coefficients").list.get(0).alias("coef_price"),
pl.col("model").struct.field("coefficients").list.get(1).alias("coef_advertising"),
).drop("model")
print(comparison)
# ┌────────┬───────────┬───────────────┬──────────┬────────────┬────────────┬──────────────────┐
# │ region ┆ r_squared ┆ adj_r_squared ┆ rmse ┆ intercept ┆ coef_price ┆ coef_advertising │
# ╞════════╪═══════════╪═══════════════╪══════════╪════════════╪════════════╪══════════════════╡
# │ North ┆ 0.9656 ┆ 0.9631 ┆ 1.9168 ┆ 78.4254 ┆ 1.2598 ┆ 3.1266 │
# │ South ┆ 0.9641 ┆ 0.9615 ┆ 2.1180 ┆ 128.2123 ┆ 1.3818 ┆ 2.7610 │
# │ West ┆ 0.9757 ┆ 0.9739 ┆ 1.6704 ┆ 101.5713 ┆ 1.6651 ┆ 2.1864 │
# └────────┴───────────┴───────────────┴──────────┴────────────┴────────────┴──────────────────┘

Plot code
import matplotlib.pyplot as plt
regions = comparison["region"].to_list()
r2 = comparison["r_squared"].to_list()
fig, ax = plt.subplots(figsize=(6, 3.5))
ax.barh(regions, r2, color=["#4C72B0", "#DD8452", "#55A868"], height=0.5)
ax.set_xlabel("R²")
ax.set_title("Model Fit by Region")
plt.tight_layout()
plt.savefig("grp_r2_bars.png", dpi=150)
Tidy Coefficients per Group¶
Get a full coefficient table with p-values, broken down by region:
coef_by_region = (
df.group_by("region")
.agg(
ps.ols_summary("sales", "price", "advertising").alias("coefficients")
)
.explode("coefficients")
.unnest("coefficients")
.sort("region", "term")
)
print(coef_by_region)
# ┌────────┬───────────┬────────────┬───────────┬───────────┬───────────┐
# │ region ┆ term ┆ estimate ┆ std_error ┆ statistic ┆ p_value │
# ╞════════╪═══════════╪════════════╪═══════════╪═══════════╪═══════════╡
# │ North ┆ intercept ┆ 78.4254 ┆ 10.2659 ┆ 7.6394 ┆ 0.000000 │
# │ North ┆ x1 ┆ 1.2598 ┆ 0.6009 ┆ 2.0966 ┆ 0.045537 │
# │ North ┆ x2 ┆ 3.1266 ┆ 0.9448 ┆ 3.3094 ┆ 0.002657 │
# │ South ┆ intercept ┆ 128.2123 ┆ 28.5727 ┆ 4.4872 ┆ 0.000121 │
# │ South ┆ x1 ┆ 1.3818 ┆ 0.9237 ┆ 1.4960 ┆ 0.146252 │
# │ South ┆ x2 ┆ 2.7610 ┆ 1.0931 ┆ 2.5259 ┆ 0.017714 │
# │ West ┆ intercept ┆ 101.5713 ┆ 23.4301 ┆ 4.3351 ┆ 0.000181 │
# │ West ┆ x1 ┆ 1.6651 ┆ 0.9793 ┆ 1.7004 ┆ 0.100557 │
# │ West ┆ x2 ┆ 2.1864 ┆ 1.0846 ┆ 2.0158 ┆ 0.053870 │
# └────────┴───────────┴────────────┴───────────┴───────────┴───────────┘
Statistical Tests per Group¶
Run hypothesis tests within each group:
df_paired = pl.DataFrame({
"group": (["A"] * 20) + (["B"] * 20),
"before": [85, 90, 78, 92, 88, 76, 95, 83, 89, 91,
80, 87, 93, 84, 86, 79, 94, 82, 88, 90,
70, 75, 68, 77, 73, 66, 80, 72, 74, 76,
65, 72, 78, 69, 71, 64, 79, 67, 73, 75],
"after": [89, 94, 83, 96, 92, 81, 99, 88, 93, 95,
85, 91, 97, 89, 90, 84, 98, 87, 92, 94,
72, 78, 70, 80, 76, 69, 83, 75, 77, 79,
68, 75, 81, 72, 74, 67, 82, 70, 76, 78],
})
test_results = (
df_paired.group_by("group")
.agg(
ps.ttest_paired("before", "after").alias("paired_t"),
ps.wilcoxon_signed_rank("before", "after").alias("wilcoxon"),
ps.shapiro_wilk("before").alias("normality"),
)
.sort("group")
)
# Flatten into a readable table
for row in test_results.iter_rows(named=True):
group = row["group"]
t = row["paired_t"]
w = row["wilcoxon"]
n = row["normality"]
print(f"Group {group}:")
print(f" Paired t-test: t={t['statistic']:.3f}, p={t['p_value']:.6f}")
print(f" Wilcoxon: W={w['statistic']:.3f}, p={w['p_value']:.6f}")
print(f" Normality: W={n['statistic']:.4f}, p={n['p_value']:.4f}")
Expected output:
Group A:
Paired t-test: t=-39.753, p=0.000000
Wilcoxon: W=-0.000, p=0.000051
Normality: W=0.9677, p=0.7065
Group B:
Paired t-test: t=-42.136, p=0.000000
Wilcoxon: W=-0.000, p=0.000019
Normality: W=0.9743, p=0.8423

Plot code
import matplotlib.pyplot as plt
import numpy as np
fig, axes = plt.subplots(1, 2, figsize=(10, 4.5), sharey=True)
for ax, group, color in zip(axes, ["A", "B"], ["#4C72B0", "#DD8452"]):
subset = df_paired.filter(pl.col("group") == group)
before = subset["before"].to_numpy()
after = subset["after"].to_numpy()
for b, a in zip(before, after):
ax.plot([0, 1], [b, a], color=color, alpha=0.35, lw=1)
ax.scatter(np.zeros(len(before)), before, s=30, color=color,
edgecolor="white", label="Before")
ax.scatter(np.ones(len(after)), after, s=30, color=color,
edgecolor="white", label="After")
ax.set_xticks([0, 1])
ax.set_xticklabels(["Before", "After"])
ax.set_title(f"Group {group}")
ax.legend(fontsize=9)
axes[0].set_ylabel("Score")
plt.tight_layout()
plt.savefig("grp_paired_before_after.png", dpi=150)
Correlation per Group¶
corr_by_region = (
df.group_by("region")
.agg(
ps.pearson("sales", "price").alias("sales_price"),
ps.pearson("sales", "advertising").alias("sales_ad"),
)
.sort("region")
.with_columns(
pl.col("sales_price").struct.field("estimate").alias("r_sales_price"),
pl.col("sales_ad").struct.field("estimate").alias("r_sales_ad"),
)
.select("region", "r_sales_price", "r_sales_ad")
)
print(corr_by_region)
Expected output:
┌────────┬───────────────┬────────────┐
│ region ┆ r_sales_price ┆ r_sales_ad │
╞════════╪═══════════════╪════════════╡
│ North ┆ 0.9756 ┆ 0.9798 │
│ South ┆ 0.9776 ┆ 0.9804 │
│ West ┆ 0.9859 ┆ 0.9865 │
└────────┴───────────────┴────────────┘
over: Per-Row Results¶
Predictions per Group¶
Use .over() to get per-row predictions from group-specific models:
predictions = (
df.with_columns(
ps.ols_predict("sales", "price", "advertising",
interval="prediction", level=0.95)
.over("region")
.alias("pred")
)
.unnest("pred")
)
print(predictions.select("region", "sales", "ols_prediction", "ols_lower", "ols_upper").head(5))
# Each row has a prediction from its region-specific model
Expected output:
┌────────┬───────┬────────────────┬───────────┬───────────┐
│ region ┆ sales ┆ ols_prediction ┆ ols_lower ┆ ols_upper │
╞════════╪═══════╪════════════════╪═══════════╪═══════════╡
│ North ┆ 120.0 ┆ 125.5527 ┆ 121.3610 ┆ 129.7443 │
│ North ┆ 135.0 ┆ 135.5852 ┆ 131.5486 ┆ 139.6218 │
│ North ┆ 142.0 ┆ 141.2314 ┆ 137.1224 ┆ 145.3404 │
│ North ┆ 128.0 ┆ 129.9391 ┆ 125.8613 ┆ 134.0169 │
│ North ┆ 155.0 ┆ 150.0042 ┆ 145.9213 ┆ 154.0871 │
└────────┴───────┴────────────────┴───────────┴───────────┘

Plot code
import matplotlib.pyplot as plt
import numpy as np
fig, axes = plt.subplots(1, 3, figsize=(12, 4), sharey=True)
for ax, region in zip(axes, ["North", "South", "West"]):
subset = predictions.filter(pl.col("region") == region)
ax.scatter(subset["price"], subset["sales"], s=30, alpha=0.8)
m, b = np.polyfit(subset["price"].to_numpy(), subset["sales"].to_numpy(), 1)
x = np.linspace(subset["price"].min(), subset["price"].max(), 50)
ax.plot(x, m * x + b, lw=2)
ax.set_title(region)
ax.set_xlabel("Price")
axes[0].set_ylabel("Sales")
fig.suptitle("Sales ~ Price by Region")
plt.tight_layout()
plt.savefig("grp_regression_lines.png", dpi=150)
Residuals¶
Calculate residuals from group-specific fits:
with_residuals = (
df.with_columns(
ps.ols_predict("sales", "price", "advertising")
.over("region")
.alias("pred")
)
.unnest("pred")
.with_columns(
(pl.col("sales") - pl.col("ols_prediction")).alias("residual")
)
)
# Residual summary by region
residual_stats = (
with_residuals.group_by("region")
.agg(
pl.col("residual").mean().alias("mean_residual"),
pl.col("residual").std().alias("std_residual"),
pl.col("residual").abs().max().alias("max_abs_residual"),
)
.sort("region")
)
print(residual_stats)
Expected output:
┌────────┬───────────────┬──────────────┬──────────────────┐
│ region ┆ mean_residual ┆ std_residual ┆ max_abs_residual │
╞════════╪═══════════════╪══════════════╪══════════════════╡
│ North ┆ ≈0.0 ┆ 1.8495 ┆ 5.5527 │
│ South ┆ ≈0.0 ┆ 2.0436 ┆ 5.5724 │
│ West ┆ ≈0.0 ┆ 1.6117 ┆ 4.6419 │
└────────┴───────────────┴──────────────┴──────────────────┘
Broadcast Test Results to Rows¶
Use .over() to attach group-level test results to every row:
annotated = (
df.with_columns(
ps.shapiro_wilk("sales").over("region").alias("normality_test")
)
.with_columns(
pl.col("normality_test").struct.field("p_value").alias("normality_p"),
)
.with_columns(
pl.when(pl.col("normality_p") > 0.05)
.then(pl.lit("normal"))
.otherwise(pl.lit("non-normal"))
.alias("distribution")
)
)
print(annotated.select("region", "sales", "normality_p", "distribution").head(5))
Expected output:
┌────────┬───────┬─────────────┬──────────────┐
│ region ┆ sales ┆ normality_p ┆ distribution │
╞════════╪═══════╪═════════════╪══════════════╡
│ North ┆ 120.0 ┆ 0.8831 ┆ normal │
│ North ┆ 135.0 ┆ 0.8831 ┆ normal │
│ North ┆ 142.0 ┆ 0.8831 ┆ normal │
│ North ┆ 128.0 ┆ 0.8831 ┆ normal │
│ North ┆ 155.0 ┆ 0.8831 ┆ normal │
└────────┴───────┴─────────────┴──────────────┘
Combining group_by and over¶
A common pattern: fit models with group_by, then use .over() for predictions:
# Step 1: Compare model quality across groups
model_quality = (
df.group_by("region")
.agg(
ps.ols("sales", "price", "advertising").alias("ols"),
ps.ridge("sales", "price", "advertising", lambda_=1.0).alias("ridge"),
)
.with_columns(
pl.col("ols").struct.field("r_squared").alias("ols_r2"),
pl.col("ridge").struct.field("r_squared").alias("ridge_r2"),
)
.select("region", "ols_r2", "ridge_r2")
.sort("region")
)
print(model_quality)
Expected output:
┌────────┬──────────┬──────────┐
│ region ┆ ols_r2 ┆ ridge_r2 │
╞════════╪══════════╪══════════╡
│ North ┆ 0.9656 ┆ 0.9654 │
│ South ┆ 0.9641 ┆ 0.9640 │
│ West ┆ 0.9757 ┆ 0.9757 │
└────────┴──────────┴──────────┘
# Step 2: Generate per-row predictions from the better model
final = (
df.with_columns(
ps.ols_predict("sales", "price", "advertising", interval="prediction")
.over("region")
.alias("pred")
)
.unnest("pred")
)
Multiple Formulas per Group¶
Compare model specifications using formula syntax:
specs = (
df.group_by("region")
.agg(
ps.ols_formula("sales ~ price + advertising").alias("additive"),
ps.ols_formula("sales ~ price * advertising").alias("interaction"),
)
.with_columns(
pl.col("additive").struct.field("r_squared").alias("r2_additive"),
pl.col("additive").struct.field("aic").alias("aic_additive"),
pl.col("interaction").struct.field("r_squared").alias("r2_interaction"),
pl.col("interaction").struct.field("aic").alias("aic_interaction"),
)
.select("region", "r2_additive", "aic_additive", "r2_interaction", "aic_interaction")
.sort("region")
)
print(specs)
# Lower AIC = better fit. Compare additive vs interaction model per region.
Expected output:
┌────────┬─────────────┬──────────────┬────────────────┬─────────────────┐
│ region ┆ r2_additive ┆ aic_additive ┆ r2_interaction ┆ aic_interaction │
╞════════╪═════════════╪══════════════╪════════════════╪═════════════════╡
│ North ┆ 0.9656 ┆ 130.18 ┆ 0.0 ┆ 409.99 │
│ South ┆ 0.9641 ┆ 136.16 ┆ 0.0 ┆ 422.10 │
│ West ┆ 0.9757 ┆ 121.92 ┆ 0.0 ┆ 399.32 │
└────────┴─────────────┴──────────────┴────────────────┴─────────────────┘

Plot code
import matplotlib.pyplot as plt
import numpy as np
regions = specs["region"].to_list()
aic_add = specs["aic_additive"].to_list()
aic_int = specs["aic_interaction"].to_list()
x = np.arange(len(regions))
fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(x - 0.15, aic_add, 0.3, label="Additive", color="#4C72B0")
ax.bar(x + 0.15, aic_int, 0.3, label="Interaction", color="#DD8452")
ax.set_xticks(x)
ax.set_xticklabels(regions)
ax.set_ylabel("AIC (lower = better)")
ax.legend()
plt.tight_layout()
plt.savefig("grp_aic_comparison.png", dpi=150)