### Functions to visualise logistic regression ### D G Rossiter http://www.itc.nl/personal/rossiter ### 19-April-2006 (Patriot's Day) ### bug fix in logit.roc.plot 19-July-2008 ### ### source() and enjoy ### GPL; YMMV ### contents ### 1. logit.plot ### 2. logit.plot.quad ### 3. logit.roc ### 4. logit.roc.area ### 5. logit.roc.plot ### 6. logit.plot.ss ### function logit.plot ## ## visualise the success of a logistic model ## plot logistic curve, mean change, T/F, AIC, null deviance ## arguments ## model a fitted glm ## title (optional) logit.plot <- function(model, title="Success of logistic model") { # sort the fitted values sf <- sort(fitted(model), index=T) plot(sf$x, ylim=c(0,1), type="l", col="blue", lwd=3, xlab="sorted sample number", ylab="probability") text(0, min(fitted(model))-.03, "fitted probability",col="blue",pos=4) title(title) abline(h=mean(fitted(model)), lty=2) text(0, mean(fitted(model))+.02, "mean probability", pos=4) # name of the response field field.name <- attr(attr(terms(formula(model)), "factors"), "dimnames")[[1]][1] # extract the T/F vector # depends on whether glm was called # with a data argument eval(parse(text=paste("tmp <- ", ifelse(class(model$data) == "data.frame", "model$data$", ""), field.name, sep=""))) abline(v=length(tmp)/2,lty=2) text(length(tmp)/2,.03,"midpoint",pos=4) # show T/F points(1:length(tmp), tmp[sf$ix], pch="|",cex=1,col=ifelse(tmp[sf$ix], "green4", "red")) text(0,.03,"FALSE Samples",col="red",pos=4) text(0,.97,"TRUE Samples",col="green4",pos=4) # print model and fit text(length(tmp),0.30,paste( "Model:", formula(model)[2], formula(model)[1], formula(model)[3],sep=" "),pos=2,font=4) text(length(tmp),0.25,paste( "AIC:", round(model$aic,0), sep=" "),pos=2,font=4) text(length(tmp),0.20,paste( "Null deviance:", round(model$null.deviance,0), sep=" "),pos=2,font=4) } ### function logit.plot.quad ## ## plot logistic curve, threshold, T/F +/-, sensitivity, specificity ## arguments ## model a fitted glm ## threshold cutoff for sensitivity/specificity, default 0.5 ## title (optional) logit.plot.quad <- function(model, threshold=0.5, title="Model success") { sf<-sort(fitted(model), index=T) # leave extra space at bottom par(mar=c(6,4,4,2)+.1); par(xaxs="i", yaxs="r") plot(sf$x, ylim=c(0,1), type="l", col="blue", lwd=3, xlab="", ylab="probability of change") abline(h=c(0,1), lty=1) # show threshold and crossover point abline(h=threshold,lty=2); text(0,threshold+.02, paste("threshold =", threshold), pos=4) crossover <- sum(fitted(model) < threshold) abline(v=crossover,lty=2) text(crossover,.05,"crossover",pos=4) text(crossover, threshold-.03, "fitted probability of change",col="blue",pos=4) # name of the response field field.name <- attr(attr(terms(formula(model)), "factors"), "dimnames")[[1]][1] # extract the T/F from it eval(parse(text=paste("tmp <- ", ifelse(class(model$data) == "data.frame", "model$data$", ""), field.name, sep=""))) # show T/F as vertical bars at the index # colours differ with T/F predictions points(1:length(tmp),tmp[sf$ix], pch="|",cex=1, col=ifelse((tmp[sf$ix] == (sf$x>threshold)),"green4","red")) # compute proportions tn <- sum((!tmp[sf$ix]) & (sf$x < threshold)) fn <- sum((!tmp[sf$ix]) & (sf$x >= threshold)) tp <- sum(tmp[sf$ix] & (sf$x >= threshold)) fp <- sum(tmp[sf$ix] & (sf$x < threshold)) right <- length(sf$x)*.65 text(0,.1,paste("True negatives:",tn), col="green4",pos=4) text(right,.1,paste("False positives:", fn), col="red",pos=4) text(right,.9,paste("True positives:", tp), col="green4",pos=4) text(0,.9,paste("False negatives:", fp), col="red",pos=4) title(main=title) title(sub=paste("Sensitivity:", round(tp/(tp+fp),4), "; Specificity:", round(tn/(tn+fn),4)), line=4) } ### function logit.roc ## ## compute an empirical ROC curve from a fitted logistic model ## and a data frame with the modelled logical field ## ## results are used as an argument to logit.plot.ss, logit.roc.plot ## ## arguments ## model a fitted glm ## steps how many thresholds; default 20 ## returns ## logit.roc data frame with three fields: ## pts vector of points along the curve ## sens, spec sensitivity, specificity logit.roc <- function(model, steps=20) { # get the response field # from the model object field.name <- attr(attr(terms(formula(model)), "factors"), "dimnames")[[1]][1] # and extract the T/F from it eval(parse(text=paste("tmp <- ", ifelse(class(model$data) == "data.frame", "model$data$", ""), field.name, sep=""))) # initialize object r <- data.frame(pts = seq(0, 1-(1/steps), by=1/steps), sens = 0, spec=0); # compute sens/spec at thresholds for (i in 0:steps) { thresh <- i/steps; r$sens[i] <- sum((fitted(model) >= thresh) & tmp)/sum(tmp); r$spec[i] <- sum((fitted(model) < thresh) & !tmp)/sum(!tmp) } return(r) } ### function logit.roc.area ## ## compute area under a ROC curve computed by logit.roc() ## typical usage: logit.roc.area(roc(model, dataset)) ## argument ## r an ROC curve returned by logit.roc() logit.roc.area <- function(r) { area <- 0; for (i in 1:(length(r$pts)-1)) area <- area + ((1 - r$sens[i+1]) - (1 - r$sens[i])) * ((r$spec[i+1] + r$spec[i])/2); return(area) } ### function logit.roc.plot ## ## plot an ROC curve computed by logit.roc() ## typical usage: logit.roc.plot(roc(model, dataset)) ## argument ## r an ROC curve returned by logit.roc() logit.roc.plot <- function(r, title="ROC curve") { old.par <- par(no.readonly = TRUE); on.exit(par(old.par)) par(xaxs="i", yaxs="i") plot(1 - r$spec, r$sens, xlim=c(0, 1), ylim=c(0,1), type="l", xlab="(1 - specificity): false positive rate", ylab="sensitivity: true positive rate", col="blue", lwd=2); points(1 - r$spec, r$sens, pch=20, cex=2, col="blue"); abline(0, 1, lty=2); segments(1-r$spec, 1-r$spec, 1-r$spec, r$sens, lty=2) text(0, 0.9, paste("Area under ROC:",round(logit.roc.area(r),4)), pos=4) title(main = title) } ### function logit.plot.ss ## ## plot sensitivity, specificity vs. threshold for a logistic model ## typical usage: logit.plot.ss(logit.roc(model, dataset)) ## argument ## r ROC curve computed by logit.roc() logit.plot.ss <- function(r) { plot(r$pts, r$spec, type="n", xlab="Threshold", ylab="value", ylim=c(0,1)); title(main = "Sensitivity and specificity vs. threshold"); abline(h=seq(0, 1, by=0.1, lty="dashed", lwd=0.5)); lines(r$pts, r$spec, col="blue", lwd=1.5); lines(r$pts, r$sens, col="green4", lwd=1.5); text(0.05, 0.05, pos=4, col="blue", "Specificity"); text(0.05, 0.95, pos=4, col="green4", "Sensitivity"); }