The power of not hard-coding variables in plotting code–‘How I learned to stop worrying and love assign()’

I recently ran into a problem where I was writing some plot code to run reports from a database that contained many different observations within a field across many clients. For example let’s say you wanted to run plots of unit sales data from all the Vendors that sold goods to each of your clients. One client’s vendor list may include: StoreX and StoreY, while another client’s list may only include StoreX, StoreZ, and StoreA. This became an issue because I needed the plot code to call certain variables by name AFTER a call to dcast, and the dcast call created variable names that were customized according to the observations in the field of interest by client. That is to say, sometimes you’d get DF$StoreX and DF$StoreY and in other cases, DF$StoreX, DF$StoreZ and DF$StoreA.

So you see, the crux of my problem was that I did not know what these variables were going to be named before the plot code was to call them, a side effect of using dcast to reshape my dataframe

After some searching around for an answer, I stumbled across the assign() function in base R, and after some experimenting I came up with a solution to my problem!

While my example may still seem pretty clunky and it uses a dummy data set with very few Vendors, you can imagine a situation where the Vendor list is long and includes a large number of possible groupings by client.

structure(list(yearmon = structure(c(2012.08333333333, 2012.25, 
2012.33333333333, 2012.5, 2012.58333333333, 2012.66666666667, 
2012.83333333333, 2012.16666666667, 2012.41666666667, 2012.75, 
2011.58333333333, 2011.66666666667, 2011.41666666667, 2011.5, 
2011.91666666667, 2012, 2011.75, 2011.83333333333, 2011.41666666667, 
2011.5, 2011.66666666667, 2012.41666666667, 2012.5, 2012.58333333333, 
2012.66666666667, 2011.58333333333, 2011.75, 2011.83333333333, 
2011.91666666667, 2012, 2012.08333333333, 2012.16666666667, 2012.25, 
2012.33333333333, 2012.75, 2012.83333333333), class = "yearmon"), 
    Vendor = c("Vendor1", "Vendor1", "Vendor1", "Vendor1", "Vendor1", 
    "Vendor1", "Vendor1", "Vendor1", "Vendor1", "Vendor1", "Vendor1", 
    "Vendor1", "Vendor1", "Vendor1", "Vendor1", "Vendor1", "Vendor1", 
    "Vendor1", "Vendor2", "Vendor2", "Vendor2", "Vendor2", "Vendor2", 
    "Vendor2", "Vendor2", "Vendor2", "Vendor2", "Vendor2", "Vendor2", 
    "Vendor2", "Vendor2", "Vendor2", "Vendor2", "Vendor2", "Vendor2", 
    "Vendor2"), units = c(14912, 15060, 15406, 14995, 15391, 
    20069, 15789, 15445, 14921, 15586, 14522, 15679, 15157, 14226, 
    16667, 15750, 14798, 15464, 33161, 31238, 39159, 32259, 33710, 
    36358, 42586, 34580, 34411, 34494, 39398, 31233, 33453, 34703, 
    33238, 34309, 34150, 33644)), .Names = c("yearmon", "Vendor", 
"units"), row.names = c(NA, 36L), class = "data.frame")

names <- unique(df1$Vendor)[order(unique(df1$Vendor))]
library(reshape2)
df1 <- dcast(df1, yearmon~Vendor, value.var='units')
df1[is.na(df1)] <- 0
df1$total <- rowSums(df1[unique(names)])

nameindex <- grep(paste(names, collapse="|"), names(df1))  ##  create index only for columns rep DTE pharmacies
ornamevec <- gsub('-| ', '', unique(names)) ## to clean up any troublesome characters that may have been coded into the Vendor names
names(df1)[nameindex] <- ornamevec ## change var names to remove strange characters

namevec <- NA
for(i in 1:length(unique(names))){
  j <- nameindex[i]
  nam <- paste('pct', ornamevec[i], sep='')
  xx1 <- assign(nam, round((df1[j]/df1$total)*100, 2)) ## the magic happens here
  names(xx1) <- nam
  namevec[i] <- nam ## create vector of newly created columns
  df1 <- cbind(df1, xx1)
}
newnameindex <- grep(paste(namevec, collapse="|"), names(df1))


par(mar=c(5.1, 4.1, 4.1, 9.2), xpd=TRUE)
ymax <-  max( df1[c(ornamevec)])+(0.1* max( df1[c(ornamevec)]))
ymin <-  min( df1[c(ornamevec)])+(0.1* min( df1[c(ornamevec)]))
library(RColorBrewer)
cpal <- brewer.pal(length(ornamevec), 'Set2')
plot( df1$yearmon, df1$yearmon,yaxt='n', xaxt='n', ylab="", xlab="")
u <- par("usr")
rect(u[1], u[3], u[2], u[4], col = "gray88", border = "black")
par(new=T)
for (i in 1:length(ornamevec)){
  j <- nameindex[i] ##  calls variable to be plotted
  plot( df1$yearmon, df1[,j], type="o", col=cpal[i], lwd=2, lty=i,yaxt='n', cex.axis=.7, cex.sub=.8, cex.lab=.8, xlab= 'Month', ylab='Units (1,000s)', main=paste('Units by Vendor by Month'), ylim=c(ymin,ymax), sub=paste('' ), cex.sub=.8)
  if (i < length(ornamevec))
    par(new=T)
}
axis(side=2, at= pretty(c(ymin, (ymax-(.1*(ymax-ymin))))), labels=format( pretty(c(ymin, (ymax-(.1*(ymax-ymin)))))/1000, big.mark=','), cex.axis=.75)
legend('right', inset=c(-0.18,0), legend=ornamevec, lwd=rep(2, length(names)), pchrep(1, length(names)), col=cpal, lty= 1:length(ornamevec) ,title="LEGEND", bty='n' , cex=.8)

The resulting plot looks like this.
test

Creating line plots with custom SD bands

I recently was tasked with putting together a dashboard of sorts that provided some visualizations that made it easy for users to see when values were ‘outside of what might be expected.’   My first stab at a deliverable included plots that looked backward at a range of values to set a band of ‘likely’ values, and identified points that fell outside of that range.   As I had some flexibility, I chose to use standard deviations to define my ranges, and I looked back over the previous rolling 6 months worth of values to set my bands.  The code is written in such a way that I can change both the look back and the factor that’s applied to the SD (to create the bands).

The code required I make use of the helpful rollapply() function in the zoo package.

library(zoo)
data <- structure(list(yearmon = structure(c(2011.41666666667, 2011.5,
                                             2011.58333333333, 2011.66666666667, 2011.75, 2011.83333333333,
                                             2011.91666666667, 2012, 2012.08333333333, 2012.16666666667, 2012.25,
                                             2012.33333333333, 2012.41666666667, 2012.5, 2012.58333333333,
                                             2012.66666666667, 2012.75, 2012.83333333333), class = "yearmon"),
                       pct = c(1.565, 1.365, 1.335, 1.315, 1.415, 1.12, 1.265, 0.8,
                               1.065, 0.555, 0.81, 0.835, 0.9, 0.88, 0.72, 0.7, 0.39, 0.525
                               )), .Names = c("yearmon", "pct"), class = "data.frame", row.names = c(NA,
                                                                                                     18L))

lookback <- 6
sdlim <- 3



c <- zoo(data$pct)
rollsd <- c( rep(NA,lookback-1), rollapply(c, 6, sd, align = 'right'))
x1 <- as.data.frame(cbind(data$pct, rollsd*sdlim))
x1 <- cbind(yearmon = data$yearmon, x1)
x1$upest <- c(rep(NA,lookback) ,x1$V1[(lookback):(length(x1$V1)-1)] + x1$V2[lookback:(length(x1$V1)-1)])
x1$dwnest <- c(rep(NA,lookback) ,x1$V1[(lookback):(length(x1$V1)-1)] - x1$V2[lookback:(length(x1$V1)-1)])
x1$test <- ifelse(x1$V1 > x1$upest, "*up", NA)
x1$test <- ifelse(x1$V1 < x1$dwnest, "*dwn", x1$test)
x1$testup <- ifelse(x1$V1 > x1$upest, "*up", NA)
x1$testdwn <- ifelse(x1$V1 < x1$dwnest, "*dwn", NA)
x1$testup <- ifelse(is.na(x1$testup), "", x1$testup)
x1$testdwn <- ifelse(is.na(x1$testdwn), "", x1$testdwn)
ymax <- max(x1$V1)+(0.1*(max(x1$V1)))
ymin <- (min(x1$V1)) - (0.1*(max(x1$V1)))

par(mar = c(6.1, 4.1 , 4.1, 2.1))

plot( x1$yearmon, x1$yearmon, type="n", lwd=2,yaxt='n', xaxt='n', ylab="", xlab="")
u <- par("usr")
rect(u[1], u[3], u[2], u[4], col = "gray88", border = "black")
par(new=T)
plot( x1$yearmon, x1$V1, type="o", yaxt='n', cex.sub=.8, cex.lab=.95, xlab= 'Month', ylab='My Y value', main='This is my plot', ylim=c(ymin,ymax))
abline(h = pretty(ymin:ymax), col='white') ##  draw h lines
abline(v = (x1$yearmon), col='white') ##  draw v lines
par(new=T)
plot( x1$yearmon, x1$V1, type="o", yaxt='n', cex.sub=.8, cex.lab=.95, xlab= 'Month', ylab='My Y value', main='This is my plot', ylim=c(ymin,ymax))
xtail <- tail(x1, n=12)
new <- data.frame(yearmon=xtail$yearmon) ##  dummy dataset for prediction line
lines(new$yearmon, predict(lm(xtail$V1~xtail$yearmon), new), col='blue', lty=3, lwd=2)  ##  draws line onlty throught the last 12 mos
tests <- summary(lm(xtail$V1~xtail$yearmon))
pval <- round(tests[[4]][2,4],3)

title(sub = paste('*Highlighted when outside', sdlim, 'sd using', lookback, 'mo. rolling sd.\n', avg12mo, "OLS p-value= ", pval), line = 4.5, cex.sub=.75)
axis(side=2, at=pretty(c(ymin, ymax)), labels=format(pretty(c(ymin, ymax)), big.mark=','), cex.axis=.75)
par(new=T)
plot( x1$yearmon, x1$upest, type="l", lty=2 ,ylim=c(ymin,ymax), xaxt='n', yaxt='n', xlab="", ylab="",cex.sub=.8,  col='red')
par(new=T)
plot( x1$yearmon, x1$dwnest, type="l", lty=2  ,ylim=c(ymin,ymax), xaxt='n', yaxt='n', xlab="", ylab="",cex.sub=.8, col='red')
text(x1$yearmon, x1$V1, x1$testup, pos=3, offset=.5,cex=.8)
text(x1$yearmon, x1$V1, x1$testdwn, pos=1, offset=.5,cex=.8)

testplot

I’m sure there is some cleaning up I could do to make the code more compact (nested ifelse statements come to mind), but I am trying to make sure the code is very readable at this stage.