# comprB.q
# -- One state to many other comparisons using demi-Bonferroni bounds
# Written by Duanli Yan and Russell Almond
# With advise and kibitzing from John Tukey and Charlie Lewis
# For additional information, contact Russell Almond at
# ralmond@ets.org or almond@acm.org.
# Copyright 1999 Educational Testing Service
# This software may be used for any purpose deemed scientifically
# appropriate.
# Verbatim copies of this software may be distributed without restriction.
# Modified copies of this software may be distributed subject to the
# following additional restrictions:
# (1) The modified software contains a notice indicating the nature of
# the modification and the name of the person(s) performing the
# modification.
# (2) The software contains instruction as to how unmodified version
# of the software may be obtained.
# PURPOSE
# Provided below are two plotting routines which compare a statistic
# of a given (target) individual against other individuals in the
# population, for example comparing one state to the other 49.
# The comparision uses "Demi-Bonferonni" bounds in which the
# confidence intervals for the differences are widdened so that the
# chance of any of the comparistons with the target individual being
# labelled "significant" due to sampling variability is less than the
# target level.
# The procedure requires means and standard errors for the statistic
# for each individual. It also requires that the means be labelled
# with the names of the individuals. The intervals it caluclates are
# based on normal theory.
# The plots produced show the target individual as a vertical bar
# (pale) giving the mean and standard error of the statistic for the
# target individual. Other individuals are plotted with supplimental
# error bars which are based on the standard error of the difference
# between that individual and target less the width of the pale. Thus
# checking for overlap between the supplimental error bars and the
# pale provides a visual significance test. Inviduals are sorted
# based on the value of their statistic and the width of their
# supplimental bar.
# This plot and its interprentation is described more fully in
# Almond, RG, Lewis, C, Tukey, TW and Yan, D-L (in press) "Displays
# for Comparing a Given State to Many Other." _The American Statistician_.
# The plot is designed primarily for comparing one state to the other
# 49 (assuming United States). It should work fairly well for
# populations containing between 10 and 100 individuals.
# Two versions of the function are provided.
# comprB --- Comparision plot using demi-Bonferonni bounds
# comprBgroup --- Same as above only assume population is divided into
# groups based on a factor variable.
# Additional the function ordbars() is an internal utility used in
# sorting the bars into their plotting order.
# Required Parameters:
# means, #Statistic value for all units (assumes
# these are labelled).
# ses, #Std errors for all units
# target, #Name of target unit
# Required for comprBgroup only
# group --- factor variable giving groups.
# Optional Parmeters
# targetmean = means[target],#mean for target unit
# targetse = ses[target],#Std error for target unit
# xmin=floor(min(means)),#X axis minimum
# xmax=ceiling(max(means)),#Y axis minimum
# labeloffset = (xmax-xmin)/20 #Distance between bar and label
# alpha=.95, #Level for confidence intervals
# unitname="state", #Name for units
# unitsname = paste(unitname,"s"),#Plural version of
# #above (for irregular plurals)
# xlab=deparse(means), #Label for x axis
# showaxis=T, #Should axes be plotted
# atics = seq(xmin,xmax,10), #Axis tics
# alabels = as.character(atics),
# showlegend=T, #Should legend be plotted
# xleg = c(xmin+1,xmin+(xmax-xmin)/3), #X location of legend
# yleg = c(Nt-Nt/6,Nt), #Y location of legend
# showhighlow=T, #Should # units high/low be plotted
# xhighlow = c(xmin+1,xmin+(xmax-xmin)/10), #X location of high/low
# legend
# yhighlow = c(Nt/4.5,Nt/3), #Y location of high/low
# ttrate = (xmax-xmin)/10 #rate for top tics.
# palecol = Color for pale.
# This optional parameter is used only in the comprBgroup
# sub = " " #group breakdown label, e.g., "by Region"
## An example script showing the usage of the function is given at the
## bottom of this file.
comprB <- function(means, ses, target,
targetmean = means[target],
targetse = ses[target],
xmin,
xmax,
labeloffset = (xmax-xmin)/50,
alpha=.95,
unitname="state",
unitsname = paste(unitname,"s",sep=""),
xlab=deparse(means),
showaxis=T,
atics = seq(xmin,xmax,10),
alabels = as.character(atics),
showlegend=T,
xleg = c(xmin+1,xmin+(xmax-xmin)/2.75),
yleg = c(Ntx-Ntx/6,Ntx-2),
showhighlow=T,
xhighlow = c(xmin+1,xmin+(xmax-xmin)/10),
yhighlow = c(Ntx/4.5,Ntx/3),
ttrate = (xmax-xmin)/10,
palecol=4
)
{
Nt <- length(means)
Ntx <- Nt+2 #Top of plotting field
# find out the inverse normal value,
# for width of pale
tcrit <- qnorm(1-(1-alpha)/2, 0, 1)
# Bonf adjusted width for calculating
#suplimentary bar size.
Bcrit <- -qnorm((1-alpha)/2/Nt, 0, 1)
# lines at high and low points of pale
targetlow <- targetmean - tcrit * targetse
targethigh <- targetmean + tcrit * targetse
# Width of bars for other targets.
width <- ( Bcrit*sqrt(targetse*targetse + ses*ses) - tcrit*targetse)
barlow <- means - width
barhigh <- means + width
bars <- cbind(low=barlow, high=barhigh, mean=means)
newbars <- ordbars(bars, target, targetmean, targethigh, targetlow, 1:Nt)
# no. of units below/above/overlap the target band
nlow <- length(barhigh[barhigh < targetlow])
nhigh <- length(barlow[barlow > targethigh])
nover <- length(means[names(means) != target]) -nlow -nhigh
#Calculate the index of the target.
indextarg <- names(newbars[,"mean"]) == target
#vector of all indexes with labels to the left of the target
indexleft <- newbars[,"mean"]<=targetmean
#vector of all indexes with labels to the right of the target
indexright <- newbars[,"mean"]>=targetmean
# set up default range if none supplied.
if (missing(xmin)) {
xmin <- floor(min(newbars[,"low"])*.90)
}
if (missing(xmax)) {
xmax <- ceiling(max(newbars[,"high"])*1.1)
}
# Now plot the figure
if (showaxis == T) {
frame()
par(usr=c(xmin,xmax,-2,Ntx),xaxp=c(xmin,xmax,8),xaxs="d",yaxs="d",
mar=c(4.5,2.1,5.25,2.1))
box()
par(new=T)
}
#First a dummy plot to set up scaling before playing with CEX.
plot(c(newbars[,"low"],newbars[,"high"]),
c(newbars[,"index"],newbars[,"index"]),
axes=F,yaxt="n",type="n",xlab=xlab,ylab="")
#Next draw pale (So we plot over this one)
# shading
#abline(v=seq(targetlow,targethigh,0.5),lty=2)
polygon(c(targetlow,targetlow,targethigh,targethigh),
c(-2,Ntx,Ntx,-2), col=palecol, err=-1)
abline(v=targetmean,lwd=2)
abline(v=c(targetlow,targethigh))
#Next draw bar endpoints
par(new=T)
plot(c(newbars[!indextarg,"low"],newbars[!indextarg,"high"]),
c(newbars[!indextarg,"index"],newbars[!indextarg,"index"]),
axes=F,yaxt="n",pch="I",cex=0.4,xlab="",ylab="")
points (newbars[,"mean"],newbars[,"index"],cex=0.7)
segments(newbars[!indextarg,"low"],newbars[!indextarg,"index"],
newbars[!indextarg,"high"],newbars[!indextarg,"index"])
#Labels on left
text(newbars[indexleft,"low"] - labeloffset,
newbars[indexleft,"index"],
names(newbars[indexleft,"mean"]),
cex=0.5,adj=1)
#Labels on right
text(newbars[indexright,"high"] + labeloffset,
newbars[indexright,"index"],
names(newbars[indexright,"mean"]),
cex=0.5,adj=0)
# Group labels
#text(248,N1,sub1,cex=0.8)
if (showlegend==T) { # legend
legend(xleg,yleg,
legend=c(paste("For",unitname,"bars that do NOT overlap with the",
target,"strip,"),
"the distance between the closer end of the bar and",
paste("the strip is a simultaneous >",(1-(1-alpha)/2)*100,
"% lower confidence"),
"bound on the difference between the means of the",
paste("two",unitsname, "(There is an analogy for the further end)")),
cex=0.6)
}
if(showhighlow==T)
{
legend(xhighlow,yhighlow,legend=c(
paste(nhigh, ifelse(nhigh==1,unitname,unitsname),
"higher(confidently)"),
paste(nover,ifelse(nover==1,unitname,unitsname),
"uncertain"),
paste(nlow,ifelse(nlow==1,unitname,unitsname),
"lower(confidently)")),
bty="n", cex=0.6)
}
# top scale & label
tcklow <- targetlow-xmin
tckhigh <- xmax-targethigh
toplab <- as.character(c(seq(trunc(tcklow/ttrate)*ttrate,0,-ttrate),
seq(0,trunc(tckhigh/ttrate)*ttrate,ttrate)))
toptck <- c(seq(xmin+tcklow-trunc(tcklow/ttrate)*ttrate,targetlow,ttrate),
seq(targethigh,targethigh+tckhigh,ttrate))
if (showaxis) {
axis(1,at=atics,label=alabels)
axis(3, at=toptck,label=toplab,tck=.02,cex=0.8)
axis(3, at=c(targetlow,targethigh),tck=.02,label=c("0","0"),cex=0.8)
#axis(3, at=c(targetlow+1,targethigh-1),tck=0,label=c("<-","->"),cex=0.6)
#axis(3, at=targetmean,label=target,cex=0.7, adj=.5,tck=0)
text(targetmean,Ntx*1.02, target, cex=0.7, adj=0.5)
}
# main title on top & bottom
mtext(side=3,line=3.5,cex=1.2,outer=F,
paste("Comparison of means of other",unitsname ,"with",state))
mtext(side=3,line=2.75,cex=0.75,outer=F,
paste("(This picture is not for other inter-",unitname," comparisons)",sep=""))
mtext(side=3,line=1.8,cex=0.9,outer=F,
"Distance from the strip")
}
#Now the macro for Grouped Comparisons.
# Additional Parameters
# group --- factor variable giving groups.
# sub = " " #group breakdown label, e.g., "by Region"
comprBgroup <- function(means, ses, target, groups,
targetmean = means[target],
targetse = ses[target],
xmin,
xmax,
labeloffset = (xmax-xmin)/50,
alpha=.95,
unitname="state",
unitsname = paste(unitname,"s",sep=""),
xlab=deparse(means),
showaxis=T,
atics = seq(xmin,xmax,10),
alabels = as.character(atics),
showlegend=T,
xleg = c(xmin+1,xmin+(xmax-xmin)/2.75),
yleg = c(Ntx-Ntx/6,Ntx-2),
showhighlow=T,
xhighlow = c(xmin+1,xmin+(xmax-xmin)/10),
yhighlow = c(Ntx/4.5,Ntx/3),
ttrate = (xmax-xmin)/10,
sub = paste("by",deparse(groups)),
palecol=4
)
{
Nt <- length(means)
Ntx <- Nt+2*length(levels(groups)) #Top of plotting field
# find out the inverse normal value,
# for width of pale
tcrit <- qnorm(1-(1-alpha)/2, 0, 1)
# Bonf adjusted width for calculating
#suplimentary bar size.
Bcrit <- -qnorm((1-alpha)/2/Nt, 0, 1)
# lines at high and low points of pale
targetlow <- targetmean - tcrit * targetse
targethigh <- targetmean + tcrit * targetse
# Width of bars for other targets.
width <- ( Bcrit*sqrt(targetse*targetse + ses*ses) - tcrit*targetse)
barlow <- means - width
barhigh <- means + width
bars <- cbind(low=barlow, high=barhigh, mean=means)
# We need to do the sorting within each group and then paste the
# results together.
Ng <- table(groups)
N0 <- 1
for (g in 1:length(Ng)) {
Ng[g] <- Ng[g] + N0 -1
nb <- ordbars(bars[groups==names(Ng)[g],],target,targetmean,targethigh,targetlow,
N0:(Ng[g]))
N0 <- Ng[g] + 3
if (g == 1 ) {newbars <- nb}
else {
newbars <- rbind(newbars,nb)
}
}
# no. of units below/above/overlap the target band
nlow <- length(barhigh[barhigh < targetlow])
nhigh <- length(barlow[barlow > targethigh])
nover <- length(means[names(means) != target]) -nlow -nhigh
#Calculate the index of the target.
indextarg <- names(newbars[,"mean"]) == target
#vector of all indexes with labels to the left of the target
indexleft <- newbars[,"mean"]<=targetmean
#vector of all indexes with labels to the right of the target
indexright <- newbars[,"mean"]>=targetmean
# set up default range if none supplied.
if (missing(xmin)) {
xmin <- floor(min(newbars[,"low"])*.90)
}
if (missing(xmax)) {
xmax <- ceiling(max(newbars[,"high"])*1.1)
}
# Now plot the figure
if (showaxis == T) {
frame()
par(usr=c(xmin,xmax,-2,Ntx),xaxp=c(xmin,xmax,8),xaxs="d",yaxs="d",
mar=c(4.5,2.1,5.25,2.1))
box()
par(new=T)
}
#First a dummy plot to set up scaling before playing with CEX.
plot(c(newbars[,"low"],newbars[,"high"]),
c(newbars[,"index"],newbars[,"index"]),
axes=F,yaxt="n",type="n",xlab=xlab,ylab="")
#Next draw pale (So we plot over this one)
# shading
#abline(v=seq(targetlow,targethigh,0.5),lty=2)
polygon(c(targetlow,targetlow,targethigh,targethigh),
c(-2,Ntx,Ntx,-2), col=palecol, err=-1)
abline(v=targetmean,lwd=2)
abline(v=c(targetlow,targethigh))
#Next draw bar endpoints
par(new=T)
plot(c(newbars[!indextarg,"low"],newbars[!indextarg,"high"]),
c(newbars[!indextarg,"index"],newbars[!indextarg,"index"]),
axes=F,yaxt="n",pch="I",cex=0.4,xlab="",ylab="")
points (newbars[,"mean"],newbars[,"index"],cex=0.7)
segments(newbars[!indextarg,"low"],newbars[!indextarg,"index"],
newbars[!indextarg,"high"],newbars[!indextarg,"index"])
#Labels on left
text(newbars[indexleft,"low"] - labeloffset,
newbars[indexleft,"index"],
names(newbars[indexleft,"mean"]),
cex=0.5,adj=1)
#Labels on right
text(newbars[indexright,"high"] + labeloffset,
newbars[indexright,"index"],
names(newbars[indexright,"mean"]),
cex=0.5,adj=0)
# Group Labels
text(xmin+(xmax-xmin)*.975,Ng+.75,names(Ng),cex=0.8,adj=1)
if (showlegend==T) { # legend
legend(xleg,yleg,
legend=c(paste("For",unitname,"bars that do NOT overlap with the",
target,"strip,"),
"the distance between the closer end of the bar and",
paste("the strip is a simultaneous >",(1-(1-alpha)/2)*100,
"% lower confidence"),
"bound on the difference between the means of the",
paste("two",unitsname, "(There is an analogy for the further end)")),
cex=0.6)
}
if(showhighlow==T)
{
legend(xhighlow,yhighlow,legend=c(
paste(nhigh, ifelse(nhigh==1,unitname,unitsname),
"higher(confidently)"),
paste(nover,ifelse(nover==1,unitname,unitsname),
"uncertain"),
paste(nlow,ifelse(nlow==1,unitname,unitsname),
"lower(confidently)")),
bty="n", cex=0.6)
}
# top scale & label
tcklow <- targetlow-xmin
tckhigh <- xmax-targethigh
toplab <- as.character(c(seq(trunc(tcklow/ttrate)*ttrate,0,-ttrate),
seq(0,trunc(tckhigh/ttrate)*ttrate,ttrate)))
toptck <- c(seq(xmin+tcklow-trunc(tcklow/ttrate)*ttrate,targetlow,ttrate),
seq(targethigh,targethigh+tckhigh,ttrate))
if (showaxis) {
axis(1,at=atics,label=alabels)
axis(3, at=toptck,label=toplab,cex=0.8,tck=0.02)
axis(3, at=c(targetlow,targethigh),tck=0.02,label=c("0","0"),cex=0.8)
#axis(3, at=c(targetlow+1,targethigh-1),tck=0,label=c("<-","->"),cex=0.6)
#axis(3, at=targetmean,label=target,cex=0.7,adj=.5,tck=0)
text(targetmean,Ntx*1.02, target, cex=0.7, adj=0.5)
}
# main title on top & bottom
mtext(side=3,line=3.5,cex=1.2,outer=F,
paste("Comparison of means of other",unitsname ,"with",state,sub))
mtext(side=3,line=2.75,cex=0.75,outer=F,
paste("(This picture is not for other inter-",unitname," comparisons)",sep=""))
mtext(side=3,line=1.8,cex=0.9,outer=F,
"Distance from the strip")
}
#
# This function contains the producure used for sorting the bars for the
# comprB plots.
#
#
############# sort by distance between closer end of bars to strip:
############# right-end of bars for states w/ below statemean,
############# and left-end of bars for states w/ above statemean.
#
ordbars <- function (bars, target, targetmean, targetlow, targethigh, rows) {
# bars[,1] = low
# bars[,2] = high
# bars[,3] = mean
# Sort by closer end of bars
# (right for units w/ below targetmean,
# left for units w/ above targetmean:
# ordbars[,1:4]=order,barlow,barhigh,means
# Sort by bar width for bars with same mean:
# x units lower/over small-> big, x units higher big -> small
tbar <- bars[names(bars[,"mean"]) == target,]
bars <- bars[names(bars[,"mean"]) != target,]
temlow <- bars[bars[,"mean"] <= targetmean, ]
temhigh <- bars[bars[,"mean"] > targetmean, ]
if(length(temlow) > 0) {
#First sort by length to break ties
lowwidths <- temlow[,"high"]-temlow[,"low"]
temlow <- temlow[order(lowwidths),]
# Now sort by difference from near end of pale.
diflow <- temlow[,"high"] - targetlow
temlow <- temlow[order(diflow),]
}
if(length(temhigh) > 0) {
#First sort by length to break ties
highwidths <- temhigh[,"high"]-temhigh[,"low"]
temhigh <- temhigh[rev(order(highwidths)),]
# Now sort by difference from near end of pale.
difhigh <- temhigh[,"low"] - targethigh
temhigh <- temhigh[order(difhigh),]
}
if (length(tbar) >0) {
# Need to make sure target stays between high and low.
newbars <- rbind(temlow, tbar, temhigh)
dimnames(newbars)[[1]] <- c(dimnames(temlow)[[1]],target,
dimnames(temhigh)[[1]])
} else {
newbars <- rbind(temlow,temhigh)
}
newbars <- cbind(index=rows,newbars)
return(newbars)
}
# Example script:
#m <- state.x77[, "HS Grad"] # Use HS Graduation Rates from state.x77 data
# # These data don't have standard errors which are required for the
# # plots. Therefore, we will assume some kind of binomial
# # proportions where the sample raters are related to the populations.
#n <- state.x77[, "Population"]
#ses <- sqrt((m * (100 - m))/n)
#names(m) <- state.abb # Use Two letter abbreviations for names.
#names(ses) <- state.abb # Use Two letter abbreviations for names.
# # State by State Plot with NJ as target state.
#comprB(m, ses, "NJ", xlab = "% HS Graduates in 1970", xmin = 30, xmax = 75)
# # Group by region.
#comprBgroup(m, ses, "NJ", state.region, xlab = "1970 Graduation rate", sub = "by Region")