Cache Simulation with R

| | Comments (0) | TrackBacks (0)
Below are a few R functions I use to evaluate and tune our Squid caches, which we use in accelerator mode. The functions work by taking a real stream of requests from either an application or squid log. From there the arrival rate can be determined at every second. Or a filter can be applied and the cache hit ratio or arrival rate re-determined. Or better yet, the series can be filtered a number of times with different filter values to determine the best value for a given filter.
Note that I've only implemented a time to live (ttl) filter. Other filters can be implemented and passed to the apply.filter function.

Some examples first:

Import a two column log file, where the first column contains time stamps, and the second contains page urls or other symbols.

stats <- read.table( 'log.import' )

Plot the log without filtering the stream first.

rate <- arrivalrate( stats$V1 )
plot( rate )


Apply a 60 minute ttl filter to the log.

result <- filter.ttl( stats$V1, stats$V2, ttl=(60*60) )
rate <- arrivalrate( stats$V1[ result ] )
plot( rate )

Create a list of filtered streams in batch to determine the best ttl value for the empirical stream. Note the default filter is filter.ttl. And the more values in the fpara sequence, the smoother the curve.

results <- filter.apply( stats$V1, stats$V2, fpara=c( 0, 60, 60*60, 60*60*12 ) )
rates <- arrivalrate( stats$V1, results )
plot.rates.summary( rates, pdf=T )

ratevttl_small.png

The functions:
# working towards manipulation by index so they can be applied to sibling datasets
# every TRUE is a cache miss, FALSE is a cache hit
filter.ttl.i <- function( tseries, ttl=60 ) {
  
  if( ttl == 0 ) 
    return( rep( TRUE, length( tseries ) ) )

  index <- rep( FALSE, length( tseries ) )
  curr <- tseries[1] - ttl - 1

  for( i in 1:length( tseries ) ) {
    if( ttl < tseries[i] - curr ) {
      curr <- tseries[i]
      index[i] <- TRUE
    }
  }
  
  return( index )
}

filter.ttl <- function( tseries, events, ttl=60 ) {

  unique <- split( tseries, events )
  filtered <- sapply( unique, filter.ttl.i, ttl=ttl )
  result <- unsplit( filtered, events )
  
  return( result )
}

# apply the specified filter to the tseries and events for each value in the sequence
filter.apply <- function( tseries, events, filter=filter.ttl, fpara=seq(0, 60*60, by=30 ), quiet=FALSE ) {
  filtered <- list()

  for( i in 1:length( fpara ) ) {
    if( !quiet ) print( fpara[i] )
    
    filtered[[i]] <- filter( tseries, events, fpara[i] )
  }

  names( filtered ) <- fpara
  
  if( length( fpara ) == 1 )
    return( filtered[[1]] )
  else
    return( filtered )
}

# takes a series of time stamps and sums the duplicates. altenately it will
# take a list of indexes that denote which time stamps to ignore.
arrivalrate <- function( tseries, index=TRUE, quiet=FALSE, inverse=FALSE ) {

  
  # making the assumption we want all result to match the same time frame as
  # the original series (typically the first index is TRUE)
  tsrange <- range( tseries )

  if( !is.list( index ) ) 
    return( arrivalrate.i( tseries, tsrange ) )

  arrivalrate <- list()

  for( i in 1:length( index ) ) {
    if( !quiet ) print( i )

    if( inverse )
      idx <- !index[[i]]
    else
      idx <- index[[i]]
      
    ts <- tseries[ idx ]

    if( length( ts ) == 0 )
      arrivalrate[[i]] <- 0
    else
      arrivalrate[[i]] <- arrivalrate.i( ts, tsrange )      
  }
  
  names( arrivalrate ) <- names( index )
  
  return( arrivalrate )
}

arrivalrate.i <- function( tseries, tsrange ) {
    
    arrivalrate <- table( tseries )
    times <- names( arrivalrate )
    frequency <- unsplit( arrivalrate, times )
    times <- as.numeric( times )
    # need to offset if max/min don't jive with current tseries
    times.i <- times - times[1] + 1 + times[1] - tsrange[1]

    arrivalrate <- rep( 0, tsrange[2] - tsrange[1] + 1 )
    arrivalrate[ times.i ] <- frequency
    
    return( arrivalrate )
}

plot.rates.summary <- function( rates, func=mean, skip=1:1, pdf=F, main="mean" ) {
  rates.xlim <- as.vector( names( rates ) )
  # skip the first elements, the cache is not initialized
  rates.func <- unlist( lapply( rates, function(x) func(x[-skip]) ), names( rates ) )

  if( pdf ) pdf( file="cache-rate.pdf", height=11, width=16 )

  plot( rates.xlim, rates.func, xlab="cache ttl (seconds)", ylab="req/sec", main=main, pch=1:length(rates.xlim) )

  lines( rates.xlim, rates.func )

  grid()
  abline( h=max( rates.func ) )
  abline( h=min( rates.func ) )
  xcenter <- max( as.numeric( rates.xlim ) ) / 2
  text( xcenter, max( rates.func ), max( rates.func ), pos=3 )
  text( xcenter, min( rates.func ), min( rates.func ), pos=3 )
  legend( x=(xcenter + xcenter / 2), y=(max( rates.func ) - .25), legend=rates.xlim, pch=1:length(rates.xlim) )

  if( pdf ) dev.off()
} 

0 TrackBacks

Listed below are links to blogs that reference this entry: Cache Simulation with R.

TrackBack URL for this entry: http://www.manamplified.org/cgi-bin/mt-tb.cgi/268

Leave a comment