function(formula,
x.name = "x",
warn.incr.poly.text = "'formula' not an increasing polynomial: 'eq.label' set to NA!",
warn.transf.rhs.txt = "rhs includes transformations requiring an argument for 'eq.x.rhs': 'eq.label' set to NA!.",
warn.transf.lhs.txt = "lhs includes transformations requiring an argument for 'eq.with.lhs': 'eq.label' set to NA!.",
warn.as.is.txt = "Power (^) terms in model formula of a polynomial need to be protected by 'I()': 'eq.label' set to NA!.",
warn.poly.raw.txt = "'poly()' in model formula has to be passed 'raw = TRUE': 'eq.label' set to NA!",
stop.pow.poly.text = "Both 'poly()' and power (^) terms in model formula.",
check.transf.rhs = TRUE,
check.transf.lhs = TRUE) {
transf.ok <- TRUE # flag use below
# parsing
lhs <- as.character(formula)[2]
rhs <- as.character(formula)[3]
formula.terms <- terms(formula)
rhs.terms <- attr(formula.terms, "term.labels")
# replaced as it fails when + or * appear within a function call such as I()
# rhs.terms <- unlist(strsplit(x = rhs, split = c("+", "*"), fixed = TRUE))
num.terms <- length(rhs.terms)
x.terms <- grepl(x.name, rhs.terms)
poly.in.terms <- grepl("poly *\\(", as.character(formula)[3L])
power.terms <- grepl("\\^ *", rhs.terms)
if (!any(power.terms) && grepl("\\^ *", rhs)) {
warning("Terms in 'formula' were dropped! Forgotten 'I()'?")
# even if accepted downstream this indicates an error
return(FALSE)
}
raw.terms <- grepl("raw *=", rhs.terms)
as.is.terms <- grepl("I *\\(", rhs.terms)
# checks
if (num.terms > 1L && poly.in.terms && sum(power.terms) != 0L) {
stop(stop.pow.poly.text)
}
if (num.terms > 1L && !all(which(power.terms) %in% which(as.is.terms))) {
if (length(warn.as.is.txt)) {
warning(warn.as.is.txt)
}
return(FALSE)
}
if (check.transf.rhs &&
(num.terms == 1L && any(as.is.terms) ||
any(grepl("log|exp|sqrt|cos|sin|tan", rhs)))) {
if (length(warn.transf.rhs.txt)) {
warning(warn.transf.rhs.txt)
}
transf.ok <- FALSE
}
if (check.transf.lhs &&
any(grepl("log|exp|sqrt|cos|sin|tan|[-^/*+]|[^I[:space:]]\\(.*[xy].*\\)", lhs))) {
if (length(warn.transf.lhs.txt)) {
warning(warn.transf.lhs.txt)
}
transf.ok <- FALSE
}
if (!transf.ok) {
# flag ensures that both lhs and rhs are always checked and warnings issued
return(FALSE)
}
if (length(warn.poly.raw.txt) &&
poly.in.terms && !sum(raw.terms)) {
warning(warn.poly.raw.txt)
return(FALSE)
}
if (sum(x.terms) <= 1L || poly.in.terms && num.terms == 1L) {
polynomial <- TRUE
increasing <- TRUE
} else if (sum(power.terms) < 1L ||
sum(power.terms) == 1L && num.terms == 1L ||
sum(x.terms) == num.terms ||
sum(power.terms) == num.terms - 2L &&
sum(x.terms) == num.terms - 1L) {
polynomial <- TRUE
if (sum(x.terms) >= 1L || any(power.terms)) {
powers <- numeric(length(x.terms)) # filled with zeros
if (sum(power.terms)) {
hi.powers.chr <- character(length(x.terms))
hi.powers.chr[power.terms] <-
gsub(".*\\^([0-9]+).*", "\\1", rhs.terms[power.terms])
powers[power.terms] <- as.numeric(hi.powers.chr[power.terms])
}
if (!any(powers == 1, na.rm = TRUE)) {
power1.terms <- x.terms & !power.terms
powers[power1.terms] <- 1
}
increasing <- min(powers) == 1 &&
(length(powers) == 1 || (length(powers) >= 2 && all(diff(powers) == 1L)))
} else {
increasing <- FALSE
}
} else {
polynomial <- FALSE
}
if (!polynomial || !increasing) {
if (length(warn.incr.poly.text)) {
warning(warn.incr.poly.text)
}
FALSE
} else {
TRUE
}
}
<bytecode: 0x0000023f6ae95930>
<environment: namespace:ggpmisc>
1 The problem
Statistics stat_poly_eq() and stat_quant_eq() from package ‘ggpmisc’ would generate erroneous equation.label values if the model formula definition passed by users as argument to parameter formula did not describe a complete polynomial with terms given in ascending order. Forcing the intercept to be equal to zero, e.g., y ~ x - 1 or y ~ x + 0, is supported. On-the-fly transformations of variables within formula are supported only if the user passes additional arguments to eq.with.lhs and/or eq.x.rhs and the expressions used for the data transformations are not parsed.
Although the expectated arguments for parameter formula was documented, in practice any valid model formula accepted by lm(), rlm() or other model fit functions were silently accepted as input, resulting in a wrong equation label added to the plot. A check for valid argument for formula was lacking.
The statistics expect what to me seemed like the “normal” way of writing a polynomial, agreeing with the expectation of function polynom::polynomial() that is used internally in ‘ggpmisc’. Some users had different expectations as in some countries and disciplines a decreasing order of powers is common. So, in spite of the explanation and examples in the documentation, an issue was raised reporting a bug. Of course, the lack of a test to validate the argument was an important bug, but not as the user who raised the issue thought, one affecting input that was deemed correct by design. To address this, I added in 2024 the previously missing test.
In 2024, I thought the new test implemented in check_polynomial_formula() was rather thorough. It has been in use in the two statistics since then. However, I noticed today when updating this page that a few formulas resulted in false positive. These formulas were rather unusual, such as containing terms like I(x + 2), or included calls to functions to apply transformations to x or y, such as log(x).
Why did I not implement a such a test earlier? It seemed to me at first that implementing code for this test would be an extremely difficult task. However, one of the special features of the R language is that language objects like model formulas can be manipulated and converted like almost any other object in R. R also includes functions for manipulation model formulas, used by the model fitting functions to build the model matrix. Making the story short, it was fairly easy to write an ad-hoc first version of a function to validate model formulas as representing a polynomial. It took a couple of hours of design, coding, testing, revision, and trying to imagine all the different ways in which polynomials can be described in model functions, as well as imagining model functions that are similar to polynomials but not true polynomials. The first version, and the second partial rewrite were not perfect, they produced both false positive and false negative results occasionally. After adding a bunch of unit tests and further changes to the code, this function, check_polynomial_formula(), added in 2024 worked fairly well. It has been in in use it in three statistics since then.
The task implemented in check_polynomial_formula() is not just to detect model formulas describing a polynomial from others. The aim is to distinguish model formulas describing polynomials that will be converted into valid equation labels from those for which the conversion would result in wrong output. These cases include polynomials with missing terms and polynomials with the power terms ordered differently than expected. When the test fails, statistics stat_poly_eq() and stat_quant_eq() return NA for the equation issuing a warning with a diagnosys of the problem. Forcing the intercept term to zero is handled correctly, and - 1 and + 0 accepted in formulas. Other labels, such as that for \(R^2\), \(F\), and \(P\) are valid as long as they can be extracted from the fitted model object. As the parameter estimates are also returned as numeric values, when the equation is not generated automatically, users can assemble the equation label manually within the call to aes().
In addition of polynomials being required to have terms in ordered of increasing powers and and no missing terms, other checks are needed. For example, terms in a model formula containing a bare unprotected ^ operator are part of the model formula specification, not a power applied to the explanatory variable. When fitting a model, redundant terms in the formula are dropped, such that a formula like y ~ x + x^2 silently becomes y ~ x, thus such formulas now are also flagged.
Only powers of the explanatory variable enclosed in an as.is call (I()) describe an operation appled to the explanatory variable. Polynomials, in addition to being described explicitly by as.is powers of the explanatory variables, can be defined using function poly(). For the estimated numeric values of the polynomial coefficients to be directly interpretable, raw = TRUE must be included in the call to poly(). This is also checked. Finally, added as a check in 2026, when transformations are included in the formula, the user is required to provide
I noticed today when updating this page that some formulas resulted in false positives and some in false negatives. These formulas are rather unusual, e.g., containing terms like I(x + 2), or including calls to functions to apply transformations to x or y other than enclosed in I(), e.g., cos(x). Anyway, I implemented an improved approach, relying for the parsing of the terms in the formula’s rhs on R’s terms() function. More edge cases are now correctly diagnosed. Below I added additional examples.
Does this function give false positives and false negatives? Likely fewer than the earlier one, and hopefully I haven’t missed any frequently used cases. Time and further testing will tell. Please, do raise an issue if you discover a test failure.
Currently, the two statistics, stat_poly_eq()and stat_quant_eq() in ‘ggpmisc’ (> 0.7.0) issue a warning when passed model formulas that fail this test and set eq.label to NA in the returned data frame, which is the data seen by the downstream geometry.
In the future, the construction algorithm used for assembling the fitted-model-equation label could be replaced with one supporting additional model formulas, and the test made less strict. On the other hand, there could be still some edge cases incorrectly handled by the improved check_poly_formula().
The function, check_poly_formula() has an on-line version of its help page, with some code examples.
The definition of function check_poly_formula() in version 0.7.0.9000 of package ‘ggpmisc’ is listed below.
Am I still missing something? Can the code be really this simple? (well, no longer so simple…)
Redundant terms in a model formula are dropped silently. Although the fitted model equation label is created correctly, such formulas can be expected to be entered only by mistake.
Code
# special case, power in one formula term
x <- 1:20
y <- x + rnorm(length(x))
my.df <- data.frame(x, y)
# all these model formulas are equivalnt to the first one
coef(lm(y ~ x, data = my.df))(Intercept) x
0.08488439 0.99965433
Code
coef(lm(y ~ x + x^2, data = my.df))(Intercept) x
0.08488439 0.99965433
Code
coef(lm(y ~ x + x^3, data = my.df))(Intercept) x
0.08488439 0.99965433
Code
coef(lm(y ~ x^3 + x, data = my.df))(Intercept) x
0.08488439 0.99965433
Code
coef(lm(y ~ x^3, data = my.df))(Intercept) x
0.08488439 0.99965433
Code
# this is a data transformation
coef(lm(y ~ I(x^2), data = my.df))(Intercept) I(x^2)
4.20077399 0.04446328
Some examples of the checks.
Code
# supported correct formulas
check_poly_formula(y ~ 1)[1] TRUE
Code
check_poly_formula(y ~ x)[1] TRUE
Code
check_poly_formula(y ~ x + 1)[1] TRUE
Code
check_poly_formula(y ~ x + 0)[1] TRUE
Code
check_poly_formula(y ~ x - 1)[1] TRUE
Code
check_poly_formula(y ~ x + I(x^2))[1] TRUE
Code
check_poly_formula(y ~ 1 + x + I(x^2))[1] TRUE
Code
check_poly_formula(y ~ x + I(x^2) + I(x^3))[1] TRUE
Code
check_poly_formula(y ~ poly(x, 1, raw = TRUE))[1] TRUE
Code
check_poly_formula(y ~ poly(x, 2, raw = TRUE))[1] TRUE
Code
check_poly_formula(y ~ poly(x, 3, raw = TRUE))[1] TRUE
Code
# formula with y in rhs
check_poly_formula(x ~ y + I(y^2), x.name = "y")[1] TRUE
Code
# unlikely to be correct but unlikely to be passed by accident
check_poly_formula(y ~ poly(x, 2, raw = FALSE))[1] TRUE
Code
# single term with a power is ignored by fitting functions
# these are the same as y ~ x but likely to be mistakes
check_poly_formula(y ~ x^2)Warning in check_poly_formula(y ~ x^2): Terms in 'formula' were dropped!
Forgotten 'I()'?
[1] FALSE
Code
check_poly_formula(y ~ x + x^2)Warning in check_poly_formula(y ~ x + x^2): Terms in 'formula' were dropped!
Forgotten 'I()'?
[1] FALSE
Code
check_poly_formula(y ~ x^2 + x)Warning in check_poly_formula(y ~ x^2 + x): Terms in 'formula' were dropped!
Forgotten 'I()'?
[1] FALSE
Code
# a function call is a transformation
check_poly_formula(y ~ I(x^3))Warning in check_poly_formula(y ~ I(x^3)): rhs includes transformations
requiring an argument for 'eq.x.rhs': 'eq.label' set to NA!.
[1] FALSE
Code
check_poly_formula(y ~ log(x))Warning in check_poly_formula(y ~ log(x)): rhs includes transformations
requiring an argument for 'eq.x.rhs': 'eq.label' set to NA!.
[1] FALSE
Code
check_poly_formula(log(y) ~ x)Warning in check_poly_formula(log(y) ~ x): lhs includes transformations
requiring an argument for 'eq.with.lhs': 'eq.label' set to NA!.
[1] FALSE
Code
check_poly_formula(log(y) ~ log(x))Warning in check_poly_formula(log(y) ~ log(x)): rhs includes transformations
requiring an argument for 'eq.x.rhs': 'eq.label' set to NA!.
Warning in check_poly_formula(log(y) ~ log(x)): lhs includes transformations
requiring an argument for 'eq.with.lhs': 'eq.label' set to NA!.
[1] FALSE
Code
check_poly_formula(y ~ I(x + 2))Warning in check_poly_formula(y ~ I(x + 2)): rhs includes transformations
requiring an argument for 'eq.x.rhs': 'eq.label' set to NA!.
[1] FALSE
Code
# check disabled
check_poly_formula(y ~ log(x), check.transf.rhs = FALSE)[1] TRUE
Code
check_poly_formula(log(y) ~ x, check.transf.lhs = FALSE)[1] TRUE
Code
# missing terms
check_poly_formula(y ~ x + I(x^3))Warning in check_poly_formula(y ~ x + I(x^3)): 'formula' not an increasing
polynomial: 'eq.label' set to NA!
[1] FALSE
Code
check_poly_formula(y ~ I(x) + I(x^3))Warning in check_poly_formula(y ~ I(x) + I(x^3)): 'formula' not an increasing
polynomial: 'eq.label' set to NA!
[1] FALSE
Code
check_poly_formula(y ~ I(x^2) + I(x^3))Warning in check_poly_formula(y ~ I(x^2) + I(x^3)): 'formula' not an increasing
polynomial: 'eq.label' set to NA!
[1] FALSE
Code
# out of order terms
check_poly_formula(y ~ I(x^2) + x)Warning in check_poly_formula(y ~ I(x^2) + x): 'formula' not an increasing
polynomial: 'eq.label' set to NA!
[1] FALSE
Code
check_poly_formula(y ~ x + I(x^3) + I(x^2))Warning in check_poly_formula(y ~ x + I(x^3) + I(x^2)): 'formula' not an
increasing polynomial: 'eq.label' set to NA!
[1] FALSE
Code
# missing raw = TRUE
check_poly_formula(y ~ poly(x, 2))Warning in check_poly_formula(y ~ poly(x, 2)): 'poly()' in model formula has to
be passed 'raw = TRUE': 'eq.label' set to NA!
[1] FALSE
The checks working within stat_poly_eq().
Code
p <-
ggplot(my.df, aes(x, y)) +
geom_point()
# correct and as expected
my.mf <- y ~ x
p +
stat_poly_line(formula = my.mf) +
stat_poly_eq(use_label("eq", "R2", "P"), formula = my.mf)Code
# mising term
my.mf <- y ~ x + I(x^3)
p +
stat_poly_line(formula = my.mf) +
stat_poly_eq(use_label("eq", "R2", "P"), formula = my.mf)Warning in check_poly_formula(formula, orientation, check.transf.lhs =
!is.character(eq.with.lhs), : 'formula' not an increasing polynomial:
'eq.label' set to NA!
Code
# correct but surprising, likely a mistake
my.mf <- y ~ x^3
p +
stat_poly_line(formula = my.mf) +
stat_poly_eq(use_label("eq", "R2", "P"), formula = my.mf)Warning in check_poly_formula(formula, orientation, check.transf.lhs =
!is.character(eq.with.lhs), : Terms in 'formula' were dropped! Forgotten 'I()'?
Code
# requires argument
my.mf <- y ~ I(x^3)
p +
stat_poly_line(formula = my.mf) +
stat_poly_eq(use_label("eq", "R2", "P"), formula = my.mf)Warning in check_poly_formula(formula, orientation, check.transf.lhs =
!is.character(eq.with.lhs), : rhs includes transformations requiring an
argument for 'eq.x.rhs': 'eq.label' set to NA!.
Code
p +
stat_poly_line(formula = my.mf) +
stat_poly_eq(use_label("eq", "R2", "P"), formula = my.mf, eq.x.rhs = "~italic(x)^3")