Below are a few R functions I use to evaluate and tune our Squid caches, which we use in accelerator mode. The functions work by taking a real stream of requests from either an application or squid log. From there the arrival rate can be determined at every second. Or a filter can be applied and the cache hit ratio or arrival rate re-determined. Or better yet, the series can be filtered a number of times with different filter values to determine the best value for a given filter.
Note that I've only implemented a time to live (ttl) filter. Other filters can be implemented and passed to the
Some examples first:
Import a two column log file, where the first column contains time stamps, and the second contains page urls or other symbols.
Plot the log without filtering the stream first.
Apply a 60 minute ttl filter to the log.
Create a list of filtered streams in batch to determine the best ttl value for the empirical stream. Note the default filter is
The functions:
apply.filter function.Some examples first:
Import a two column log file, where the first column contains time stamps, and the second contains page urls or other symbols.
stats <- read.table( 'log.import' )Plot the log without filtering the stream first.
rate <- arrivalrate( stats$V1 )
plot( rate )Apply a 60 minute ttl filter to the log.
result <- filter.ttl( stats$V1, stats$V2, ttl=(60*60) )
rate <- arrivalrate( stats$V1[ result ] )
plot( rate )
Create a list of filtered streams in batch to determine the best ttl value for the empirical stream. Note the default filter is
filter.ttl. And the more values in the fpara sequence, the smoother the curve.results <- filter.apply( stats$V1, stats$V2, fpara=c( 0, 60, 60*60, 60*60*12 ) )
rates <- arrivalrate( stats$V1, results )
plot.rates.summary( rates, pdf=T )
The functions:
# working towards manipulation by index so they can be applied to sibling datasets
# every TRUE is a cache miss, FALSE is a cache hit
filter.ttl.i <- function( tseries, ttl=60 ) {
if( ttl == 0 )
return( rep( TRUE, length( tseries ) ) )
index <- rep( FALSE, length( tseries ) )
curr <- tseries[1] - ttl - 1
for( i in 1:length( tseries ) ) {
if( ttl < tseries[i] - curr ) {
curr <- tseries[i]
index[i] <- TRUE
}
}
return( index )
}
filter.ttl <- function( tseries, events, ttl=60 ) {
unique <- split( tseries, events )
filtered <- sapply( unique, filter.ttl.i, ttl=ttl )
result <- unsplit( filtered, events )
return( result )
}
# apply the specified filter to the tseries and events for each value in the sequence
filter.apply <- function( tseries, events, filter=filter.ttl, fpara=seq(0, 60*60, by=30 ), quiet=FALSE ) {
filtered <- list()
for( i in 1:length( fpara ) ) {
if( !quiet ) print( fpara[i] )
filtered[[i]] <- filter( tseries, events, fpara[i] )
}
names( filtered ) <- fpara
if( length( fpara ) == 1 )
return( filtered[[1]] )
else
return( filtered )
}
# takes a series of time stamps and sums the duplicates. altenately it will
# take a list of indexes that denote which time stamps to ignore.
arrivalrate <- function( tseries, index=TRUE, quiet=FALSE, inverse=FALSE ) {
# making the assumption we want all result to match the same time frame as
# the original series (typically the first index is TRUE)
tsrange <- range( tseries )
if( !is.list( index ) )
return( arrivalrate.i( tseries, tsrange ) )
arrivalrate <- list()
for( i in 1:length( index ) ) {
if( !quiet ) print( i )
if( inverse )
idx <- !index[[i]]
else
idx <- index[[i]]
ts <- tseries[ idx ]
if( length( ts ) == 0 )
arrivalrate[[i]] <- 0
else
arrivalrate[[i]] <- arrivalrate.i( ts, tsrange )
}
names( arrivalrate ) <- names( index )
return( arrivalrate )
}
arrivalrate.i <- function( tseries, tsrange ) {
arrivalrate <- table( tseries )
times <- names( arrivalrate )
frequency <- unsplit( arrivalrate, times )
times <- as.numeric( times )
# need to offset if max/min don't jive with current tseries
times.i <- times - times[1] + 1 + times[1] - tsrange[1]
arrivalrate <- rep( 0, tsrange[2] - tsrange[1] + 1 )
arrivalrate[ times.i ] <- frequency
return( arrivalrate )
}
plot.rates.summary <- function( rates, func=mean, skip=1:1, pdf=F, main="mean" ) {
rates.xlim <- as.vector( names( rates ) )
# skip the first elements, the cache is not initialized
rates.func <- unlist( lapply( rates, function(x) func(x[-skip]) ), names( rates ) )
if( pdf ) pdf( file="cache-rate.pdf", height=11, width=16 )
plot( rates.xlim, rates.func, xlab="cache ttl (seconds)", ylab="req/sec", main=main, pch=1:length(rates.xlim) )
lines( rates.xlim, rates.func )
grid()
abline( h=max( rates.func ) )
abline( h=min( rates.func ) )
xcenter <- max( as.numeric( rates.xlim ) ) / 2
text( xcenter, max( rates.func ), max( rates.func ), pos=3 )
text( xcenter, min( rates.func ), min( rates.func ), pos=3 )
legend( x=(xcenter + xcenter / 2), y=(max( rates.func ) - .25), legend=rates.xlim, pch=1:length(rates.xlim) )
if( pdf ) dev.off()
}
Leave a comment