Statsmodels & Patsy Cheat Sheet

A complete reference guide to statistical modeling with Statsmodels, Python's comprehensive statistics library, and Patsy, its formula-syntax companion, covering regression analysis, GLMs, time series, and hypothesis testing.


Core Statsmodels Functions

| Action | Code Example | Description |
| --- | --- | --- |
| Import Libraries | import statsmodels.api as sm; import statsmodels.formula.api as smf | Imports the core API (sm) and the formula API (smf) that integrates with Patsy. |
| Define a Linear Model (Patsy) | "y ~ x1 + x2" | A Patsy formula specifying that y is modeled as a linear combination of an intercept, x1, and x2. |
| Define with Interactions (Patsy) | "y ~ x1 * x2" | Expands to y ~ 1 + x1 + x2 + x1:x2, including both main effects and the interaction term. |
| Define with Transformations (Patsy) | "y ~ np.log(a) + I(b**2)" | Applies functions to variables inside the formula. I() protects the enclosed expression so it is evaluated as ordinary Python arithmetic rather than formula syntax. |
| Define with Categorical Data (Patsy) | "y ~ C(x)" | Treats the variable x as categorical and creates the appropriate dummy variables. |
| Create and Fit an OLS Model | model = smf.ols("cons ~ price + temp", data=df); result = model.fit() | Creates and fits an Ordinary Least Squares (OLS) regression model from the formula and a pandas DataFrame. |
| View a Full Summary | print(result.summary()) | Prints a detailed summary of the regression results, including R-squared, coefficients, p-values, and various statistical tests. |
| Get Model Parameters | result.params | Returns a pandas Series containing the fitted coefficients for the model terms. |
| Get Model Residuals | result.resid | Returns a pandas Series of the residuals (observed minus fitted values). |
| Predict New Values | y_pred = result.predict(new_data_df) | Uses the fitted model to predict response values for a new set of explanatory variables. |
| Plot Residuals Analysis | import statsmodels.graphics.api as smg; smg.qqplot(result.resid, ax=ax) | Creates diagnostic plots to check model assumptions and the distribution of the residuals. |
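
The entries above chain together into a short workflow. A minimal end-to-end sketch, using a small synthetic DataFrame in place of real data (column names cons, price, and temp follow the example row above):

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Synthetic data standing in for a real dataset (illustrative only)
rng = np.random.default_rng(0)
df = pd.DataFrame({"price": rng.uniform(1, 5, 100), "temp": rng.uniform(10, 35, 100)})
df["cons"] = 0.5 - 0.2 * df["price"] + 0.01 * df["temp"] + rng.normal(0, 0.05, 100)

# Fit, inspect, and predict
result = smf.ols("cons ~ price + temp", data=df).fit()
print(result.summary())       # full regression report
print(result.params)          # fitted coefficients (pandas Series)

new_df = pd.DataFrame({"price": [2.0, 3.5], "temp": [20.0, 30.0]})
y_pred = result.predict(new_df)   # predictions for new explanatory values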

Patsy Formula Syntax Examples

Basic Linear Models

import patsy as pats
import statsmodels.formula.api as smf

# Simple linear regression
model = smf.ols("y ~ x", data=df)
result = model.fit()

# Multiple linear regression
model = smf.ols("sales ~ advertising + price + season", data=df)
result = model.fit()

# Including intercept explicitly (same as default)
model = smf.ols("y ~ 1 + x1 + x2", data=df)
result = model.fit()

# Remove intercept
model = smf.ols("y ~ -1 + x1 + x2", data=df)
result = model.fit()
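
To see exactly which columns Patsy builds from a formula (and which smf.ols uses internally), you can construct the design matrices directly with the patsy import above. A minimal sketch, assuming df contains columns y, x1, and x2:

# Build the response vector and design matrix for a formula
y_mat, X_mat = pats.dmatrices("y ~ x1 + x2", data=df, return_type="dataframe")
print(X_mat.columns.tolist())   # ['Intercept', 'x1', 'x2']
print(X_mat.head())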

Polynomial and Interaction Terms

# Quadratic terms
model = smf.ols("y ~ x + I(x**2)", data=df)

# Cubic terms
model = smf.ols("y ~ x + I(x**2) + I(x**3)", data=df)

# Interaction terms
model = smf.ols("y ~ x1 * x2", data=df)
# Equivalent: y ~ 1 + x1 + x2 + x1:x2

# Higher-order interactions
model = smf.ols("y ~ x1 * x2 * x3", data=df)

Transforms and Functions

import numpy as np

# Logarithmic transforms
model = smf.ols("y ~ np.log(x1) + np.log(x2)", data=df)

# Exponential terms
model = smf.ols("y ~ np.exp(x)", data=df)

# Regression splines (cubic B-spline basis by default)
from patsy import dmatrix
m = dmatrix("bs(x, knots=[25,40,60])", {"x": data["x"]})
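
Spline bases can also be used directly inside a statsmodels formula. A minimal sketch, assuming df has columns y and x whose range covers the knot locations:

# Cubic B-spline basis for x, with interior knots at 25, 40, and 60
model = smf.ols("y ~ bs(x, knots=(25, 40, 60), degree=3)", data=df)
result = model.fit()
print(result.params.index.tolist())   # one coefficient per spline basis column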

Categorical Variables and Factors

# Treatment coding (default for dummy variables)
model = smf.ols("y ~ C(category_var)", data=df)

# Sum-to-zero coding
model = smf.ols("y ~ C(category_var, Sum)", data=df)

# Multiple categories
model = smf.ols("y ~ C(brand_type) * C(region)", data=df)

# Explicit category ordering (the first listed level becomes the reference)
model = smf.ols("y ~ C(teaching_method, levels=['basic', 'intermediate', 'advanced'])", data=df)

Advanced Patsy Features

# Custom functions
def log_positive(x):
    y = np.log(np.maximum(x, 1e-8))  # Avoid log(0)
    return y

# Use in formulas
model = smf.ols("y ~ log_positive(x)", data=df)

# Mathematical operators
model = smf.ols("y ~ I(x1/x2)", data=df)  # Division
model = smf.ols("y ~ I(x**0.5)", data=df)  # Square root
model = smf.ols("y ~ I(1/x)", data=df)  # Inverse

Regression Models

Linear Models

# OLS with robust standard errors
model = smf.ols("y ~ x", data=df)
result = model.fit()
# White's heteroskedasticity-consistent standard errors
robust_result = model.fit(cov_type='HC1')

# Weighted least squares
model = smf.wls("y ~ x", weights=1/np.abs(errors), data=df)
result = model.fit()

# Generalized least squares for correlated errors
# (array interface: endog Y, exog X, and error covariance sigma built separately)
model = sm.GLS(Y, X, sigma)
result = model.fit()
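
fit() accepts other covariance estimators besides HC1. A sketch of cluster-robust and Newey-West (HAC) standard errors, assuming df contains a grouping column named firm_id (a hypothetical name):

model = smf.ols("y ~ x", data=df)

# Standard errors clustered by group
clustered = model.fit(cov_type='cluster', cov_kwds={'groups': df['firm_id']})
print(clustered.bse)

# Newey-West (HAC) standard errors for serially correlated errors
hac = model.fit(cov_type='HAC', cov_kwds={'maxlags': 4})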

Generalized Linear Models (GLMs)

# Logistic regression for binary outcomes
model = smf.logit("outcome ~ x1 + x2", data=df)
result = model.fit()

# Logistic with interaction
model = smf.logit("success ~ x1 * x2", data=df)
result = model.fit()

# Probit regression
model = smf.probit("outcome ~ predictors", data=df)
result = model.fit()

# Poisson regression for counts
model = smf.poisson("count ~ predictors", data=df)
result = model.fit()

# Negative binomial
import statsmodels.discrete.discrete_model as dm
nb2_model = dm.NegativeBinomial(endog=Y, exog=X)
nb2_results = nb2_model.fit()
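
Logit coefficients are on the log-odds scale, so exponentiating them gives odds ratios, while predict() returns probabilities. A short sketch continuing the logit example above, assuming df contains outcome, x1, and x2:

import numpy as np

result = smf.logit("outcome ~ x1 + x2", data=df).fit()
odds_ratios = np.exp(result.params)     # multiplicative effect on the odds
odds_ci = np.exp(result.conf_int())     # 95% CI on the odds-ratio scale
probs = result.predict(df)              # fitted probabilities in [0, 1]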

Time Series Models

# ARMA models (fit as ARIMA with d = 0; here an ARMA(1, 1))
from statsmodels.tsa.arima.model import ARIMA
model = ARIMA(endog, order=(1, 0, 1))
results = model.fit()

# VAR (Vector Autoregression)
from statsmodels.tsa.api import VAR
model = VAR(endog)
results = model.fit()

# Seasonal Decomposition
from statsmodels.tsa.seasonal import seasonal_decompose
decomposition = seasonal_decompose(series, period=12)
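
Once fitted, the ARIMA results object can produce out-of-sample forecasts, with or without confidence intervals. A short sketch, assuming results is the fitted ARIMA model from above:

# Point forecasts for the next 12 periods
forecast = results.forecast(steps=12)

# Forecasts with confidence intervals
pred = results.get_forecast(steps=12)
print(pred.predicted_mean)
print(pred.conf_int())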

Model Diagnostics and Validation

Model Summary and Statistics

# Model summary
print(result.summary())
print(result.summary2())  # Enhanced summary

# Model statistics
print("AIC:", result.aic)
print("BIC:", result.bic)
print("R-squared:", result.rsquared)
print("F-statistic:", result.fvalue)

# Parameter statistics
print("Parameter estimates:", result.params)
print("Standard errors:", result.bse)
print("t-statistics:", result.tvalues)
print("p-values:", result.pvalues)

Diagnostic Tests

# Test for normality of residuals
from scipy import stats
jb_stat = stats.jarque_bera(result.resid)
dw_stat = sm.stats.durbin_watson(result.resid)

# Breusch-Pagan test for heteroskedasticity
from statsmodels.stats.diagnostic import het_breuschpagan, acorr_breusch_godfrey
bp_test = het_breuschpagan(result.resid, result.model.exog)

# Breusch-Godfrey test for autocorrelation (takes the results object, not the model)
bg_test = acorr_breusch_godfrey(result, nlags=4)

# Shapiro-Wilk test for normality
shapiro_test = stats.shapiro(result.resid)
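
het_breuschpagan returns a 4-tuple: the LM statistic, its p-value, the F statistic, and its p-value. A short sketch that labels the values from the call above:

lm_stat, lm_pvalue, f_stat, f_pvalue = bp_test
print(f"Breusch-Pagan LM p-value: {lm_pvalue:.4f}")
print(f"Breusch-Pagan F p-value:  {f_pvalue:.4f}")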

Diagnostic Plots

import matplotlib.pyplot as plt
import statsmodels.graphics.regressionplots as regstats

# Residuals vs fitted values
fig = plt.figure()
plt.scatter(result.fittedvalues, result.resid)
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs Fitted')

# QQ plot of residuals against a normal distribution
import statsmodels.api as sm
fig = sm.qqplot(result.resid, line='s')

# Influential points (Cook's distance)
influence = result.get_influence()
cook_d = influence.cooks_distance[0]

# Partial regression plots
fig = regstats.plot_partregress("y", "x1", ["x2"], data=df)

# Component-plus-residual (CCPR) plots
fig = regstats.plot_ccpr(result, exog_idx=1)
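
statsmodels also provides a combined influence plot (studentized residuals against leverage, with points sized by Cook's distance). A short sketch, assuming result is a fitted OLS result:

import statsmodels.api as sm

fig = sm.graphics.influence_plot(result, criterion="cooks")
plt.show()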

Practical Examples

Complete Regression Workflow

import pandas as pd
import statsmodels.formula.api as smf
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF

# Example 1: Simple linear regression
tips_df = pd.read_csv('tips.csv')

# Fit linear model
model1 = smf.ols("tip ~ total_bill", data=tips_df).fit()
print(model1.summary())

# Residual analysis
import matplotlib.pyplot as plt
plt.scatter(model1.fittedvalues, model1.resid)
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.show()

# Example 2: Multiple regression with diagnostics
model2 = smf.ols("tip ~ total_bill + sex + smoker + day + time + C(size)", 
                 data=tips_df).fit()

# Check VIF for multicollinearity
X = model2.model.exog
vif_data = pd.DataFrame()
vif_data["feature"] = model2.model.exog_names
vif_data["VIF"] = [VIF(X, i) for i in range(X.shape[1])]

# Check for influential points
influence = model2.get_influence()
cooks_d = influence.cooks_distance[0]
influential = cooks_d > (4/X.shape[0])
print(f"Influential points: {np.sum(influential)}")

Time Series Analysis

# Load the Mauna Loa CO2 dataset shipped with statsmodels
from statsmodels.datasets import co2
data = co2.load_pandas().data

# Handle missing values
data = data.interpolate(method='linear')

# Plot time series
import matplotlib.pyplot as plt
data.plot(figsize=(12, 6))
plt.title('CO2 Concentration Over Time')
plt.show()

# Seasonal decomposition (weekly observations, so annual seasonality has period 52)
from statsmodels.tsa.seasonal import seasonal_decompose
decomp = seasonal_decompose(data, period=52, model='additive')

# Create autocorrelation plot
from statsmodels.graphics.tsaplots import plot_acf
fig, ax = plt.subplots(figsize=(10, 6))
plot_acf(data.dropna(), ax=ax)
plt.show()

# Fit ARIMA model
from statsmodels.tsa.arima.model import ARIMA
arima_model = ARIMA(data, order=(1, 1, 1))
results = arima_model.fit()
print(results.summary())

Best Practices

Model Selection and Validation

from statsmodels.stats.anova import anova_lm
from statsmodels.stats.diagnostic import het_breuschpagan, linear_rainbow
import statsmodels.api as sm
import scipy.stats as stats

def model_diagnostics(model_result, df):
    """
    Perform comprehensive model diagnostics
    """
    residuals = model_result.resid
    fitted = model_result.fittedvalues

    # Normality tests
    normality_p = stats.shapiro(residuals)[1]

    # Independence (Durbin-Watson)
    dw = sm.stats.durbin_watson(residuals)

    # Homoskedasticity (Breusch-Pagan)
    bp_stats = het_breuschpagan(residuals, model_result.model.exog)
    bp_p = bp_stats[1]  # LM test p-value

    # Linearity (Rainbow test for OLS models); returns (statistic, p-value)
    try:
        rb_stat, rb_p = linear_rainbow(model_result, frac=0.5)
    except Exception:
        rb_p = None

    print(f"Normality test p-value: {normality_p:.4f}")
    print(f"Durbin-Watson statistic: {dw:.3f}")
    print(f"Homoskedasticity test p-value: {bp_p:.4f}")
    if rb_p is not None:
        print(f"Rainbow test p-value: {rb_p:.4f}")

    return {
        'normality': normality_p > 0.05,
        'independence': 1 < dw < 3,
        'homoskedastic': bp_p > 0.05,
        'linearity': rb_p > 0.05 if rb_p is not None else True
    }
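
A hypothetical usage of the helper above, assuming result is a fitted OLS result and df is the DataFrame it was fit on:

checks = model_diagnostics(result, df)
if not all(checks.values()):
    failed = [name for name, ok in checks.items() if not ok]
    print("Assumptions that look violated:", failed)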

Model Comparison

# Compare nested models with an F-test (ANOVA table)
model1 = smf.ols("y ~ x1", data=df).fit()
model2 = smf.ols("y ~ x1 + x2", data=df).fit()

from statsmodels.stats.anova import anova_lm
anova_results = anova_lm(model1, model2)
print("F-test for model comparison:")
print(anova_results)

# AIC and BIC comparison
print(f"Model 1 AIC: {model1.aic:.2f}, BIC: {model1.bic:.2f}")
print(f"Model 2 AIC: {model2.aic:.2f}, BIC: {model2.bic:.2f}")

# Information criteria difference
if model2.aic < model1.aic:
    print("Model 2 better according to AIC")
else:
    print("Model 1 better according to AIC")

Statsmodels provides comprehensive statistical modeling capabilities with elegant R-style formula syntax through Patsy. This cheatsheet covers the essential techniques for regression analysis, model diagnostics, and advanced statistical modeling.

Updated: January 15, 2025
Author: Danial Pahlavan
Category: Statistics & Data Analysis