January | 2013 | failuretoconverge

I recently ran into a problem where I was writing some plot code to run reports from a database that contained many different observations within a field across many clients. For example let’s say you wanted to run plots of unit sales data from all the Vendors that sold goods to each of your clients. One client’s vendor list may include: StoreX and StoreY, while another client’s list may only include StoreX, StoreZ, and StoreA. This became an issue because I needed the plot code to call certain variables by name AFTER a call to dcast, and the dcast call created variable names that were customized according to the observations in the field of interest by client. That is to say, sometimes you’d get DF$StoreX and DF$StoreY and in other cases, DF$StoreX, DF$StoreZ and DF$StoreA.

So you see, the crux of my problem was that I did not know what these variables were going to be named before the plot code was to call them, a side effect of using dcast to reshape my dataframe

After some searching around for an answer, I stumbled across the assign() function in base R, and after some experimenting I came up with a solution to my problem!

While my example may still seem pretty clunky and it uses a dummy data set with very few Vendors, you can imagine a situation where the Vendor list is long and includes a large number of possible groupings by client.

structure(list(yearmon = structure(c(2012.08333333333, 2012.25, 
2012.33333333333, 2012.5, 2012.58333333333, 2012.66666666667, 
2012.83333333333, 2012.16666666667, 2012.41666666667, 2012.75, 
2011.58333333333, 2011.66666666667, 2011.41666666667, 2011.5, 
2011.91666666667, 2012, 2011.75, 2011.83333333333, 2011.41666666667, 
2011.5, 2011.66666666667, 2012.41666666667, 2012.5, 2012.58333333333, 
2012.66666666667, 2011.58333333333, 2011.75, 2011.83333333333, 
2011.91666666667, 2012, 2012.08333333333, 2012.16666666667, 2012.25, 
2012.33333333333, 2012.75, 2012.83333333333), class = "yearmon"), 
    Vendor = c("Vendor1", "Vendor1", "Vendor1", "Vendor1", "Vendor1", 
    "Vendor1", "Vendor1", "Vendor1", "Vendor1", "Vendor1", "Vendor1", 
    "Vendor1", "Vendor1", "Vendor1", "Vendor1", "Vendor1", "Vendor1", 
    "Vendor1", "Vendor2", "Vendor2", "Vendor2", "Vendor2", "Vendor2", 
    "Vendor2", "Vendor2", "Vendor2", "Vendor2", "Vendor2", "Vendor2", 
    "Vendor2", "Vendor2", "Vendor2", "Vendor2", "Vendor2", "Vendor2", 
    "Vendor2"), units = c(14912, 15060, 15406, 14995, 15391, 
    20069, 15789, 15445, 14921, 15586, 14522, 15679, 15157, 14226, 
    16667, 15750, 14798, 15464, 33161, 31238, 39159, 32259, 33710, 
    36358, 42586, 34580, 34411, 34494, 39398, 31233, 33453, 34703, 
    33238, 34309, 34150, 33644)), .Names = c("yearmon", "Vendor", 
"units"), row.names = c(NA, 36L), class = "data.frame")

names <- unique(df1$Vendor)[order(unique(df1$Vendor))]
library(reshape2)
df1 <- dcast(df1, yearmon~Vendor, value.var='units')
df1[is.na(df1)] <- 0
df1$total <- rowSums(df1[unique(names)])

nameindex <- grep(paste(names, collapse="|"), names(df1))  ##  create index only for columns rep DTE pharmacies
ornamevec <- gsub('-| ', '', unique(names)) ## to clean up any troublesome characters that may have been coded into the Vendor names
names(df1)[nameindex] <- ornamevec ## change var names to remove strange characters

namevec <- NA
for(i in 1:length(unique(names))){
  j <- nameindex[i]
  nam <- paste('pct', ornamevec[i], sep='')
  xx1 <- assign(nam, round((df1[j]/df1$total)*100, 2)) ## the magic happens here
  names(xx1) <- nam
  namevec[i] <- nam ## create vector of newly created columns
  df1 <- cbind(df1, xx1)
}
newnameindex <- grep(paste(namevec, collapse="|"), names(df1))


par(mar=c(5.1, 4.1, 4.1, 9.2), xpd=TRUE)
ymax <-  max( df1[c(ornamevec)])+(0.1* max( df1[c(ornamevec)]))
ymin <-  min( df1[c(ornamevec)])+(0.1* min( df1[c(ornamevec)]))
library(RColorBrewer)
cpal <- brewer.pal(length(ornamevec), 'Set2')
plot( df1$yearmon, df1$yearmon,yaxt='n', xaxt='n', ylab="", xlab="")
u <- par("usr")
rect(u[1], u[3], u[2], u[4], col = "gray88", border = "black")
par(new=T)
for (i in 1:length(ornamevec)){
  j <- nameindex[i] ##  calls variable to be plotted
  plot( df1$yearmon, df1[,j], type="o", col=cpal[i], lwd=2, lty=i,yaxt='n', cex.axis=.7, cex.sub=.8, cex.lab=.8, xlab= 'Month', ylab='Units (1,000s)', main=paste('Units by Vendor by Month'), ylim=c(ymin,ymax), sub=paste('' ), cex.sub=.8)
  if (i < length(ornamevec))
    par(new=T)
}
axis(side=2, at= pretty(c(ymin, (ymax-(.1*(ymax-ymin))))), labels=format( pretty(c(ymin, (ymax-(.1*(ymax-ymin)))))/1000, big.mark=','), cex.axis=.75)
legend('right', inset=c(-0.18,0), legend=ornamevec, lwd=rep(2, length(names)), pchrep(1, length(names)), col=cpal, lty= 1:length(ornamevec) ,title="LEGEND", bty='n' , cex=.8)

The resulting plot looks like this.

library(zoo) data <- structure(list(yearmon = structure(c(2011.41666666667, 2011.5, 2011.58333333333, 2011.66666666667, 2011.75, 2011.83333333333, 2011.91666666667, 2012, 2012.08333333333, 2012.16666666667, 2012.25, 2012.33333333333, 2012.41666666667, 2012.5, 2012.58333333333, 2012.66666666667, 2012.75, 2012.83333333333), class = "yearmon"), pct = c(1.565, 1.365, 1.335, 1.315, 1.415, 1.12, 1.265, 0.8, 1.065, 0.555, 0.81, 0.835, 0.9, 0.88, 0.72, 0.7, 0.39, 0.525 )), .Names = c("yearmon", "pct"), class = "data.frame", row.names = c(NA, 18L)) lookback <- 6 sdlim <- 3 c <- zoo(data$pct) rollsd <- c( rep(NA,lookback-1), rollapply(c, 6, sd, align = 'right')) x1 <- as.data.frame(cbind(data$pct, rollsd*sdlim)) x1 <- cbind(yearmon = data$yearmon, x1) x1$upest <- c(rep(NA,lookback) ,x1$V1[(lookback):(length(x1$V1)-1)] + x1$V2[lookback:(length(x1$V1)-1)]) x1$dwnest <- c(rep(NA,lookback) ,x1$V1[(lookback):(length(x1$V1)-1)] - x1$V2[lookback:(length(x1$V1)-1)]) x1$test <- ifelse(x1$V1 > x1$upest, "*up", NA) x1$test <- ifelse(x1$V1 < x1$dwnest, "*dwn", x1$test) x1$testup <- ifelse(x1$V1 > x1$upest, "*up", NA) x1$testdwn <- ifelse(x1$V1 < x1$dwnest, "*dwn", NA) x1$testup <- ifelse(is.na(x1$testup), "", x1$testup) x1$testdwn <- ifelse(is.na(x1$testdwn), "", x1$testdwn) ymax <- max(x1$V1)+(0.1*(max(x1$V1))) ymin <- (min(x1$V1)) - (0.1*(max(x1$V1))) par(mar = c(6.1, 4.1 , 4.1, 2.1)) plot( x1$yearmon, x1$yearmon, type="n", lwd=2,yaxt='n', xaxt='n', ylab="", xlab="") u <- par("usr") rect(u[1], u[3], u[2], u[4], col = "gray88", border = "black") par(new=T) plot( x1$yearmon, x1$V1, type="o", yaxt='n', cex.sub=.8, cex.lab=.95, xlab= 'Month', ylab='My Y value', main='This is my plot', ylim=c(ymin,ymax)) abline(h = pretty(ymin:ymax), col='white') ## draw h lines abline(v = (x1$yearmon), col='white') ## draw v lines par(new=T) plot( x1$yearmon, x1$V1, type="o", yaxt='n', cex.sub=.8, cex.lab=.95, xlab= 'Month', ylab='My Y value', main='This is my plot', ylim=c(ymin,ymax)) xtail <- tail(x1, n=12) new <- data.frame(yearmon=xtail$yearmon) ## dummy dataset for prediction line lines(new$yearmon, predict(lm(xtail$V1~xtail$yearmon), new), col='blue', lty=3, lwd=2) ## draws line onlty throught the last 12 mos tests <- summary(lm(xtail$V1~xtail$yearmon)) pval <- round(tests[[4]][2,4],3) title(sub = paste('*Highlighted when outside', sdlim, 'sd using', lookback, 'mo. rolling sd.\n', avg12mo, "OLS p-value= ", pval), line = 4.5, cex.sub=.75) axis(side=2, at=pretty(c(ymin, ymax)), labels=format(pretty(c(ymin, ymax)), big.mark=','), cex.axis=.75) par(new=T) plot( x1$yearmon, x1$upest, type="l", lty=2 ,ylim=c(ymin,ymax), xaxt='n', yaxt='n', xlab="", ylab="",cex.sub=.8, col='red') par(new=T) plot( x1$yearmon, x1$dwnest, type="l", lty=2 ,ylim=c(ymin,ymax), xaxt='n', yaxt='n', xlab="", ylab="",cex.sub=.8, col='red') text(x1$yearmon, x1$V1, x1$testup, pos=3, offset=.5,cex=.8) text(x1$yearmon, x1$V1, x1$testdwn, pos=1, offset=.5,cex=.8)

failuretoconverge

father, husband, son, brother, data hacker, seeking optimization

Monthly Archives: January 2013

The power of not hard-coding variables in plotting code–‘How I learned to stop worrying and love assign()’

Creating line plots with custom SD bands