Category Archives: R

Dissertation thesis online

On the 28th of March 2017 my dissertation thesis was published on the document server of the library of the Freie Universität Berlin and I can call myself Dr. rer. nat. Arnd Weber finally.

thesis

Metadata are available here and a link to the final electronic version of my doctoral thesis is available here. Since I conducted most of the statistical analyses in R they are reproducible. To rerun the analyses and reproduce the results you can follow a description in the Appendix of the thesis and download the data and scripts from the aqualogy.de webserver.

Article online!

Yesterday, a new publication for which I programmed the GIS analysis and co-authored, has been put online:

Heuner, M., Weber, A., Schröder, U., Kleinschmit, B., Schröder, B. (2016). Facilitating political decisions using species distribution models to assess restoration measures in heavily modified estuaries. Marine Pollution Bulletin 110(1), 250–260. DOI: 10.1016/j.marpolbul.2016.06.056.

With the supplementary material of data, R- and Python-scripts you can reproduce the analysis.

DSL Geschwindigkeiten

Da ich in den vergangenen Wochen einen neuen DSL-Vertrag abgeschlossen habe, bei dem vermehrt Probleme bei der Erreichbarkeit meines privaten Webservers auftraten, habe ich beschlossen Verbindungsgeschwindigkeiten und Erreichbarkeit zu überwachen. Basierend auf diesem Blogpost und dem zugrundeliegenden Python-Tool zur Messung von Verbindungsgeschwindigkeiten von Janis Jansons habe ich eine dauerhafte Geschwindigkeitsüberwachung, die alle 10 Minute ausgeführt wird, eingerichtet.
Da die produzierte *.csv-Datei allerdings reichlich unübersichtlich ist, habe ich ein kleines Skript in R geschrieben, um die Daten, die in die *.csv-Datei geschrieben werden, zu visualisieren. Zunächst muss also R installiert werden. Unter Ubuntu erfolgt das mit folgendem Befehl:
sudo apt-get install r-base
Danach speichert man das folgende Skript lokal unter ~/scripts/speedtest_plot.R ab.

### set locale
Sys.setlocale("LC_TIME", "en_US.utf8")

### set working directory
setwd("/home/arnd/")

### import data
data.df <- read.table("speedtest.csv", header=FALSE, sep=",", dec=".", 
                      stringsAsFactors=FALSE, fill=TRUE, na.strings=c("[]", ""))

### preprocess the data
# name the columns
colnames(data.df) <- c("time", "download", "upload", "unit", "server")
# strptime
data.df$time <- strptime(data.df$time, format="%a, %d %b %Y %H:%M:%S %z", tz="")
# down- & upload
data.df$download[which(data.df$download == -1)] <- NA
data.df$upload[which(data.df$upload == -1)] <- NA
# manipulate server names => remove brackets
data.df$server <- gsub("[", "", data.df$server, fixed=TRUE)
data.df$server <- gsub("]", "", data.df$server, fixed=TRUE)
data.df$server <- gsub("'", "", data.df$server, fixed=TRUE)
#unique(data.df$server)
# create and factorize date
data.df$date <- strptime(data.df$time, format="%Y-%m-%d")
data.df$fdate <- as.factor(as.character(data.df$date))

### plot
pdf("speedtest_plot.pdf")
par(font.lab=2)
plot(data.df$time, data.df$download, type="l", col="blue", lwd=2, xlab="Zeit", 
     ylab="Mbit", main="DSL-Geschwindigkeiten", ylim=c(0,100), xaxt="n")
r <- as.POSIXct(round(range(data.df$time), "days"))
axis.POSIXct(1, at=seq(r[1], r[2], by="day"), format="%d.%m.%y", labels=TRUE)
lines(data.df$time, data.df$upload, col="red", lwd=2)
points(data.df$time[which(is.na(data.df$download) == TRUE)], 
       rep(100, length(which(is.na(data.df$download) == TRUE))), pch=19, 
       cex=0.1, col="blue")
text(data.df$time[ceiling(length(data.df$time)/2)], 97.4, 
     "Zeitpunkte ohne Messung", cex=0.7)
points(data.df$time[which(is.na(data.df$upload) == TRUE)], 
       rep(95, length(which(is.na(data.df$upload) == TRUE))), pch=19, cex=0.1, 
       col="red")
text(data.df$time[1], 55, "DOWNLOAD", col="blue", cex=1.5, pos=4)
text(data.df$time[1], 5, "UPLOAD", col="red", cex=1.5, pos=4)
abline(h=50, lty=3, lwd=0.5)
mtext(paste0("Aktualisiert: ", strftime(Sys.time(), "%d.%m.%Y %H:%M:%S")), 
      cex=0.7)
dev.off()

png("speedtest_plot.png")
par(font.lab=2)
plot(data.df$time, data.df$download, type="l", col="blue", lwd=2, xlab="Zeit", 
     ylab="Mbit", main="DSL-Geschwindigkeiten", ylim=c(0,100), xaxt="n")
r <- as.POSIXct(round(range(data.df$time), "days"))
axis.POSIXct(1, at=seq(r[1], r[2], by="day"), format="%d.%m.%y", labels=TRUE)
lines(data.df$time, data.df$upload, col="red", lwd=2)
points(data.df$time[which(is.na(data.df$download) == TRUE)], 
       rep(100, length(which(is.na(data.df$download) == TRUE))), pch=19, 
       cex=0.1, col="blue")
text(data.df$time[ceiling(length(data.df$time)/2)], 97.4, 
     "Zeitpunkte ohne Messung", cex=0.7)
points(data.df$time[which(is.na(data.df$upload) == TRUE)], 
       rep(95, length(which(is.na(data.df$upload) == TRUE))), pch=19, cex=0.1, 
       col="red")
text(data.df$time[1], 55, "DOWNLOAD", col="blue", cex=1.5, pos=4)
text(data.df$time[1], 5, "UPLOAD", col="red", cex=1.5, pos=4)
abline(h=50, lty=3, lwd=0.5)
mtext(paste0("Aktualisiert: ", strftime(Sys.time(), "%d.%m.%Y %H:%M:%S")), 
      cex=0.7)
dev.off()

###
# subset to print the data only for the last month
# select the last month:
# 60 sec * 60 min * 24 hours * 30 days
# 2592000 = one month
data.df <- subset(data.df, time > max(data.df$time) - 2592000)

png("speedtest_plot_last_month.png", height=720, width=720)
par(font.lab=2)
plot(data.df$time, data.df$download, type="l", col="blue", lwd=2, xlab="Zeit", 
     ylab="Mbit", main="DSL-Geschwindigkeiten", ylim=c(0,100), xaxt="n")
r <- as.POSIXct(round(range(data.df$time), "days"))
axis.POSIXct(1, at=seq(r[1], r[2], by="day"), format="%d.%m.%y", labels=TRUE)
lines(data.df$time, data.df$upload, col="red", lwd=2)
points(data.df$time[which(is.na(data.df$download) == TRUE)], 
       rep(100, length(which(is.na(data.df$download) == TRUE))), pch=19, 
       cex=0.1, col="blue")
text(data.df$time[ceiling(length(data.df$time)/2)], 97.4, 
     "Zeitpunkte ohne Messung", cex=0.8)
points(data.df$time[which(is.na(data.df$upload) == TRUE)], 
       rep(95, length(which(is.na(data.df$upload) == TRUE))), pch=19, cex=0.1, 
       col="red")
text(data.df$time[1], 55, "DOWNLOAD", col="blue", cex=1.5, pos=4)
text(data.df$time[1], 5, "UPLOAD", col="red", cex=1.5, pos=4)
abline(h=50, lty=3, lwd=0.5)
mtext(paste0("Aktualisiert: ", strftime(Sys.time(), "%d.%m.%Y %H:%M:%S")), 
      cex=0.8)
dev.off()

###
# subset to print the data only for the last week
# select the last week:
# 60 sec * 60 min * 24 hours * 7 days
# 640800 = one week
data.df <- subset(data.df, time > max(data.df$time) - 604800)

png("speedtest_plot_last_week.png", height=720, width=720)
par(font.lab=2)
plot(data.df$time, data.df$download, type="l", col="blue", lwd=2, xlab="Zeit", 
     ylab="Mbit", main="DSL-Geschwindigkeiten", ylim=c(0,100), xaxt="n")
r <- as.POSIXct(round(range(data.df$time), "days"))
axis.POSIXct(1, at=seq(r[1], r[2], by="day"), format="%d.%m.%y", labels=TRUE)
lines(data.df$time, data.df$upload, col="red", lwd=2)
points(data.df$time[which(is.na(data.df$download) == TRUE)], 
       rep(100, length(which(is.na(data.df$download) == TRUE))), pch=19, 
       cex=0.1, col="blue")
text(data.df$time[ceiling(length(data.df$time)/2)], 97.4, 
     "Zeitpunkte ohne Messung", cex=0.8)
points(data.df$time[which(is.na(data.df$upload) == TRUE)], 
       rep(95, length(which(is.na(data.df$upload) == TRUE))), pch=19, cex=0.1, 
       col="red")
text(data.df$time[1], 55, "DOWNLOAD", col="blue", cex=1.5, pos=4)
text(data.df$time[1], 5, "UPLOAD", col="red", cex=1.5, pos=4)
abline(h=50, lty=3, lwd=0.5)
mtext(paste0("Aktualisiert: ", strftime(Sys.time(), "%d.%m.%Y %H:%M:%S")), 
      cex=0.8)
dev.off()
q("no")

Und führt es anschliessend ebenso zeitgesteuert, wie den Speedtest, als Cronjob aus. Dazu öffnet man mit crontab -e die Crontab und fügt folgende Zeile ein:
1 * * * * Rscript ~/scripts/speedtest_plot.R 2>&1 /dev/null
Dieser Befehl, der stündlich ausgeführt wird, produziert drei unterschiedliche Plots, von denen einer – nämlich der mit den Ergebnissen der Geschwindigkeitsmessungen der letzten Woche – hier dargestellt ist. Hinter dem verlinkten Bild findeet sich eine interaktive R Shiny-Animation:

Geschwindigkeitsdaten der letzten Woche

Parallelize automatic model selection using glmulti and Rmpi

glmulti is a Java-based package for automated model selection and model-averaging. It provides a wrapper to select optimized glm’s and other model functions with specified response and explanatory variables based on an information criterion (AIC, AICc or BIC). With an increasing number of potential explanatory variables the number of candidate models increases exponentially making an exhaustive screening of the candidates computationally intensive and time-consuming. The package-related article by Calcagno & de Mazancourt (2010) in the Journal of Statistical Software gives a general overview about model selection and the way how this can be achieved using the package glmulti.

To speed up this process the function glmulti already provides two variables (chunk and chunks) to split the model selection process into subprocesses. The results of the subprocesses can be stored as objects of the S4 class glmulti to join the results and select the overall “best” models using the function consenus. The parallelization of the computations can be achived through a variety of methods, but here I want to present a possibility using the R-package Rmpi, an interface to MPI (Message-Passing Interface). The Acadia Centre for Mathematical Modelling and Computation provides a very nice tutorial about the use of Rmpi, describing three different methods including example scripts. I applied the Task Pull method to a working example from the original article by Calcagno & de Mazancourt (2010) based on the birth weight dataset provided by the MASS package. In the following script you will find the outcome of a combination of the task pull approach from the Rmpi tutorial and the real data example, given in chapter 3.3 of the research article by Calcagno & de Mazancourt (2010). I hope you are able to re-use it for your personal needs and speed up your computations, too.

These scripts were developed as part of my employment at the Federal Institute of Hydrology (BfG) in Koblenz, Germany.

THIS SCRIPT IS NOT FULLY FUNCTIONAL YET! LINE 91 POSSIBLY NEEDS MODIFCATION…

#################################################
# Parallelized-GLM-Selection-With-Rmpi.R
#
# author: arnd.weber(at)bafg.de
#
# purpose:
#		- glm model selection using package 
#         "glmulti"
#		- parallelized using package "Rmpi"
#		- parallelization based on code from 
#         http://math.acadiau.ca/ACMMaC/Rmpi/index.html
#
#####
# Install on Ubuntu
###
# sudo apt-get install r-base r-base-dev r-cran-mass r-cran-rmpi r-cran-rjava mpi-default-dev
# sudo R
# install.packages("glmulti")
#################################################

# setwd
setwd("/home/arnd") 

# load packages
require("rJava")
require("glmulti")
require("MASS")

#####
# Load R MPI and spawn the slaves
###
if(!is.loaded("mpi_initialize")){
  require("Rmpi")
}

# Spawn 2 slaves 
n_slaves <- 2
mpi.spawn.Rslaves(nslaves=n_slaves)

#####
# Rmpi helper functions / functions the slaves will run
###
# In case R exits unexpectedly, have it automatically clean up resources taken up by Rmpi (slaves, memory, etc...)
.Last <- function(){
    if(is.loaded("mpi_initialize")){
        if(mpi.comm.size(1) > 0){
            print("Please use mpi.close.Rslaves() to close slaves.")
            mpi.close.Rslaves()
        }

        print("Please use mpi.quit() to quit R")
        .Call("mpi_finalize")
    }
}

###
# Function the slaves will call to perform a validation on the
# Assumes: thedata 
glmulti.slave <- function(){

	#####
    # Note the use of the tag for sent messages: 
    #     1=ready_for_task, 2=done_task, 3=exiting 
    # Note the use of the tag for received messages: 
    #     1=task, 2=done_tasks 
	#####

	# Set controlling variables at the first call of the function 
    junk <- 0 
    done <- 0 

	# Start processing
    while(done != 1){

        # Signal being ready to receive a new task 
        mpi.send.Robj(junk, 0, 1) 

        # Receive a task 
        task <- mpi.recv.Robj(mpi.any.source(), mpi.any.tag()) 
        task_info <- mpi.get.sourcetag() 
        tag <- task_info[2] 

        if (tag == 1) {

			chunkNumber <- as.numeric(task$chunkNumber)
            
            # print
            print(paste(sep="", "I am chunk number ", chunkNumber, " of ", thedata$n_slaves, " chunks and I am supposed to process a model selection using glmulti with the formula ", paste(thedata$y[2], thedata$y[1], thedata$y[3], collapse=" ")))
            res <- paste(sep="", "I am chunk number ", chunkNumber, " of ", thedata$n_slaves, " chunks and I am supposed to process a model selection using glmulti with the formula ", paste(thedata$y[2], thedata$y[1], thedata$y[3], collapse=" "))
	        # Run glmulti
	        #res <- glmulti(y=thedata$y, data=thedata$data, family="binomial", chunk=chunkNumber, chunks=thedata$n_slaves,  plotty=FALSE, report=TRUE , marginality=TRUE, crit="aic", fitfunction="glm")
            
            # Send a results message back to the master
            results <- list(result=res, chunkNumber=chunkNumber)
            mpi.send.Robj(results, 0, 2)

		} else if (tag == 2) {
            done <- 1
		}
	}

	mpi.send.Robj(junk, 0, 3)

}

#####
# assemble the list "thedata" to be handed over to the slaves
###
# assemble "bwt"-dataset from package MASS
bwt <- with(birthwt, {
race <- factor(race, labels = c("white", "black", "other"))
ptd <- factor(ptl > 0)
ftv <- factor(ftv)
levels(ftv)[-(1:2)] <- "2+"
data.frame(low = factor(low), age, lwt, race, smoke = (smoke > 0),
ptd, ht = (ht > 0), ui = (ui > 0), ftv)
})

thedata <- list(as.formula("low ~ .*."), bwt, n_slaves)
names(thedata) <- c("y", "data", "n_slaves")
#res1 <- glmulti(y=thedata$y, data=thedata$data, chunk=1, chunks=2, method="h", level=2, crit="aic", family="binomial", plotty=FALSE, fitfunction="glm", report=FALSE)
#res2 <- glmulti(y=thedata$y, data=thedata$data, chunk=2, chunks=2, method="h", level=2, crit="aic", family="binomial", plotty=FALSE, fitfunction="glm", report=FALSE)

#####
# Send "thedata" and functions to the slaves 
###
# Tell the slaves to load required packages
mpi.bcast.cmd(require("rJava"))
mpi.bcast.cmd(require("glmulti"))

# Send the required data and functions to the slaves
mpi.bcast.Robj2slave(thedata)
mpi.bcast.Robj2slave(glmulti.slave)

# Tell the slaves to execute their function glmulti.slave
mpi.bcast.cmd(glmulti.slave())

#####
# Create task / results lists and controlling variables
###
tasks <- vector(mode="list", length=n_slaves) 
for(i in 1:n_slaves){
	tasks[[i]] <- list(chunkNumber=i)
}

results <- vector(mode="list", length=n_slaves)

junk <- 0
closed_slaves <- 0
i <- 1

#####
# Communicate with the slaves to perform the computation
###
while(closed_slaves < n_slaves){ 

	# Receive a message from a slave 
	message <- mpi.recv.Robj(mpi.any.source(), mpi.any.tag()) 
	message_info <- mpi.get.sourcetag() 
	slave_id <- message_info[1] 
	tag <- message_info[2] 

	if(tag == 1){ 
	# slave is ready for a task. Give it the next task, or tell it tasks are done if there are none.
		if(length(tasks) > 0){ 
			# Send a task, and then remove it from the task list 
			mpi.send.Robj(tasks[[1]], slave_id, 1)
			tasks[[1]] <- NULL 
		} else { 
			# No more tasks to process
			mpi.send.Robj(junk, slave_id, 2) 
		} 

	} else if(tag == 2){ 
	# The message contains results to be stored in RESULTS.
		chunkNumber <- message$chunkNumber
		results[[chunkNumber]] <- message$result

	} else if(tag == 3){ 
	# A slave has closed down. 
		closed_slaves <- closed_slaves + 1 

	} 

} 

#####
# Operate on the results
###
print(results)
#best.glm <- consensus(results, confsetsize=1)
#print(summary(best.glm))

#####
# Tell all slaves to close down, and exit the program
###
mpi.close.Rslaves()

#####
# Quit R MPI and R
###
mpi.quit("no")