#!/usr/bin/perl # Jonathan Ledlie # findAccessPatterns.pl # This is one of the scripts we wrote for the FAST 2003 paper. # It's included to give you an idea of how to use nfsparse.pm, # which helps parse the traces. ##################################################### use POSIX qw(ceil floor); use Getopt::Std; use strict; use nfsparse; my $recordMode = 'none'; ##################################################### # globals my $blocksize = 8192; my @fileSizes = (); my @readBucket = (); my @writeBucket = (); my @rwBucket = (); my %fh2filename = (); my $entireModeByteCount = 0; my $sequentialModeByteCount = 0; my $randomModeByteCount = 0; ##################################################### # set in getopt # Number of lines parse between purges my $purgeFrequency = 1000; # If a file has not been accesses in this number of seconds, # we assume the end of the access. my $expirationTime = 30; # sort window sizes #my @sortDistances = (0, 100, 250, 500, 1000); my $sortDistance = 0.25; #my $watchReads = 1; #my $watchWrites = 1; my $sortAccesses = 0; my $removeDuplicates = 0; my $sortMethod = ''; my $debug = 0; my $findFilenames = 1; my $swapCount = 0; my $accessCount = 0; my $maxLineCount = 0; my $logfile = "/tmp/access.log.$$"; open LOG, (">$logfile") or die ("Cannot open $logfile file"); #my $log = new FileHandle (">tmpfile"); ##################################################### my $usage = "Usage: gzip -cd [nfs1.gz] [nfs2.gz] | perl findAccessPatterns.pl [options]\nOptions:\n -d print lots of debug output to log file\n -e [seconds] if no activity to a file after this many seconds, assume access is finished\n -m [n|r|d] n = no sorting,\n r = sort but do not remove duplicates\n d = sort and remove duplicates\n -w use a window of this size\n -l [max lines] only parse this many lines\n -h print this help message\n\nDefaults: -e $expirationTime -l $logfile -w $sortDistance\n\n"; # -n match file names to file handles (consumes more memory)\n ##################################################### # parseInput # # Associate input lines with access patterns. # Each call waiting for a response is stored in uid2call. # When the response comes, if we have seen access to this file before, # we call augmentAccess. If it is new, we call pluckAccess # # pluckAccess and augmentAccess each update the fd2access hash # which stores all of the call/response info we want, plus some other stuff # about each access. # # sub parseInput { my $lineCount = 0; my %uid2call = (); my %fd2access = (); my %uid2filename = (); my $currentTime = 0; while (my $rawLine = <>) { $lineCount++; if ($maxLineCount > 0 && $lineCount > $maxLineCount) { print "Finished $lineCount lines\n"; &purgeAccesses (\%fd2access, $currentTime); return; } if ($lineCount % $purgeFrequency == 0) { &purgeAccesses (\%fd2access, $currentTime); &purgeUid2Call (\%uid2call, $currentTime); } my %line = (); &nfsDumpParseLineQuick ($rawLine, \%line); ################################ # associate filenames with filehandles # currently, unused but might be interesting in the future # if ($findFilenames) { if ($line{'opname'} eq 'lookup') { &nfsDumpParseLine ($rawLine, \%line); next if (! defined ($line{'fh'})); my $uid = $line{'xid'}.'-'.$line{'client_id'}; if ($line{'proto'} =~ /^C/) { # LOOKUP CALL if (!defined $line{'name'}) { if (!defined $line{'fn'}) { next; } else { $line{'name'} = $line{'fn'}; } } $uid2filename{$uid} = $line{'name'}; } elsif ($line{'proto'} =~ /^R/) { # LOOKUP RESPONSE my $filename = $uid2filename{$uid}; if (defined $filename) { $line{'fh'} = &nfsDumpCompressFH ($recordMode, $line{'fh'}); my $currentFilename = $fh2filename{$line{'fh'}}; $fh2filename{$line{'fh'}} = $filename; delete $uid2filename{$uid}; } else { if ($debug) { print LOG "no matching filename in uid2filename\n"; } } } else { print LOG "Bad proto\n"; } # all done with 'lookup' next; } } if ( ($line{'opname'} ne 'read') && ($line{'opname'} ne 'write')) { next; } &nfsDumpParseLine ($rawLine, \%line); $line{'time'} =~ /(\d+)\./; $currentTime = $1; if ($debug) { $line{'rawline'} = $rawLine; } my $uid = $line{'xid'}.'-'.$line{'client_id'}; ########## # CALL if ($line{'proto'} eq 'C3' || $line{'proto'} eq 'C2') { # extract a compressed version of my $fh = &nfsDumpCompressFH ($recordMode, $line{'fh'}); $line{'fh'} = $fh; my %call = (); &rememberCall (\%line, \%call); $uid2call{$uid} = \%call; next; } ########## # RESPONSE if ($line{'proto'} ne 'R3' && $line{'proto'} ne 'R2') { next; } if ($line{'status'} ne 'OK') { if (defined ($uid2call{$uid})) { delete $uid2call{$uid}; } next; } my $call = $uid2call{$uid}; ########## # Sometimes we see a response to a call we didn't see. # In this case, just drop it on the floor. if (!defined $call) { # print "undefined file handle\n"; next; } my $fd = $call->{'fh'}; &glueCallAndResponse ($call, \%line); my $access = $fd2access {$fd}; if (!defined ($access)) { ########## # New access to file my %access = (); &pluckAccess ($call, \%access); $fd2access{$fd} = \%access; } else { ########## # Ongoing access to file &augmentAccess ($call, $access); } } &purgeAccesses (\%fd2access, $currentTime); } ##################################################### # pluckAccess # # Assign a subset of the elements from the currently # parsed line to a new file access. # We have not seen accesses to this file (from this user) # recently # # Take from the current response line and the Call line (xact) # and store some info into Access for safekeeping # sub pluckAccess { my ($line, $access) = @_; $access->{'accessCount'} = 1; # getting time from the Call $access->{'ctime'} = $line->{'time'}; $access->{'atime'} = $line->{'time'}; $access->{'name'} = 'null'; if ($findFilenames && defined $fh2filename{$line->{'fh'}}) { $access->{'name'} = $fh2filename{$line->{'fh'}}; } my $mtime = $line->{'time'}; $mtime =~ s/^.*?(\d\d\d\d)\.(\d+)$/$1$2/; my %accesses = (); $accesses{$mtime} = $line; $access->{'accesses'} = \%accesses; } ##################################################### # augmentAccess # # sub augmentAccess { my ($line, $access) = @_; $access->{'accessCount'}++; $access->{'atime'} = $line->{'time'}; my $mtime = $line->{'time'}; $mtime =~ s/^.*?(\d\d\d\d)\.(\d+)$/$1$2/; my $accesses = $access->{'accesses'}; $accesses->{$mtime} = $line; } ##################################################### # rememberCall # # record some Call info so it is available once the response comes in # sub rememberCall { my ($line, $call) = @_; # keep offset in hex for now if (!defined ($line->{'off'})) { if (defined ($line->{'offset'})) { $line->{'off'} = $line->{'offset'}; } elsif (defined ($line->{'begoff'})) { $line->{'off'} = $line->{'begoff'}; } else { &printHash ($line); die ("line off not defined\n"); } } if (!defined ($line->{'count'})) { if (defined ($line->{'tcount'})) { $line->{'count'} = $line->{'tcount'}; } else { &printHash ($line); die ("count not defined\n"); } } $call->{'off'} = hex($line->{'off'}); $call->{'count'} = hex($line->{'count'}); $call->{'fh'} = $line->{'fh'}; $call->{'time'} = $line->{'time'}; $call->{'client_id'} = $line->{'client_id'}; $call->{'opname'} = $line->{'opname'}; } ##################################################### # glueCallAndResponse # # Take what we want from the call and response # and put it into the call hash for safe keeping # sub glueCallAndResponse { my ($call, $response) = @_; if (!defined ($response->{'count'})) { if (defined ($response->{'tcount'})) { $response->{'count'} = $response->{'tcount'}; } elsif (defined ($call->{'count'})) { ; } else { &printHash ($response); die ("count not defined\n"); } } if (defined ($response->{'count'})) { $call->{'count'} = hex ($response->{'count'}); } if (!defined ($call->{'count'})) { die ("Logic check for count failed"); } $call->{'size'} = hex ($response->{'size'}); if (defined ($response->{'eof'}) && $response->{'eof'} == 1) { $call->{'eof'} = $response->{'eof'}; } if ( ($call->{'opname'} eq 'write') && ($call->{'off'} + $call->{'count'} >= $call->{'size'})) { $call->{'eof'} = 1; } } sub printAccessPattern { my ($accesses, $times, $sortDistance, $name) = @_; if ($#{$times} == 0) { my $output = $sortMethod.' s'.$sortDistance.' '; my $bytesRead = 0; my $bytesWritten = 0; my $readCount = 0; my $writeCount = 0; if ($accesses->{$times->[0]}->{'opname'} eq 'read') { $bytesRead = $accesses->{$times->[0]}->{'count'}; $readCount = 1; } else { $bytesWritten = $accesses->{$times->[0]}->{'count'}; $writeCount = 1; } my $pattern = 'sequential'; if ( ($accesses->{$times->[0]}->{'off'} == 0) && defined ($accesses->{$times->[$#{$times}]}->{'eof'})) { $pattern = 'entire'; } $output .= $accesses->{$times->[0]}->{'opname'} ." $pattern size:".$accesses->{$times->[0]}->{'size'}." rb:$bytesRead wb:$bytesWritten rc:$readCount wc:$writeCount m:0.000 v:0.000 sv:0.000 $name"; if ($debug) { print LOG "$output\n"; } print "$output\n"; return; } ########################## # finished with one access case my $opname = ''; my $readCount = 0; my $writeCount = 0; my $bytesRead = 0; my $bytesWritten = 0; if ($accesses->{$times->[0]}->{'opname'} eq 'read') { $readCount = 1; $bytesRead = $accesses->{$times->[0]}->{'count'}; } else { $writeCount = 1; $bytesWritten = $accesses->{$times->[0]}->{'count'}; } my $count = 0; my $metric = 0.0; $opname = $accesses->{$times->[0]}->{'opname'}; my @metrics = (); for (my $i = 1; $i <= $#{$times}; $i++) { if ( (($opname eq 'read') && ($accesses->{$times->[$i]}->{'opname'} eq 'write')) || (($opname eq 'write') && ($accesses->{$times->[$i]}->{'opname'} eq 'read'))) { $opname = 'rw'; } ################ # bytes read/written if ($accesses->{$times->[$i]}->{'opname'} eq 'read') { $readCount++; $bytesRead += $accesses->{$times->[$i-1]}->{'count'}; } if ($accesses->{$times->[$i]}->{'opname'} eq 'write') { $writeCount++; $bytesWritten += $accesses->{$times->[$i-1]}->{'count'}; } my $ratio = 0.0; ################ # rounded metric my $diff = ceil ($accesses->{$times->[$i]}->{'off'}/$blocksize) - ceil ($accesses->{$times->[$i-1]}->{'off'}/$blocksize); if ($accesses->{$times->[$i-1]}->{'count'} != 0) { $ratio = $diff / (ceil($accesses->{$times->[$i-1]}->{'count'}/$blocksize)); } $metrics[$#metrics+1] = 1 - $ratio; $count++; } foreach my $aMetric (@metrics) { $metric += $aMetric; } $metric = $metric / $count; my $variance = 0; foreach my $aMetric (@metrics) { my $diff = $aMetric - $metric; $variance += ($diff * $diff); if ($variance < 0) { die ("variance is negative metric $metric sum $metric\n"); } } if ($count > 1) { $variance = $variance / ($count -1); } my $sqrtVariance = sqrt ($variance); if ($metric != 0) { $metric = -1.0 * $metric; } $metric = sprintf ("%6.3f", $metric); $metric =~ s/\s//g; $variance = sprintf ("%6.3f", $variance); $variance =~ s/\s//g; $sqrtVariance = sprintf ("%6.3f", $sqrtVariance); $sqrtVariance =~ s/\s//g; my $pattern = 'random'; if ($metric eq '0.000') { if ($debug) { print LOG "metric is zero\n"; } if ( ($accesses->{$times->[0]}->{'off'} == 0) && defined ($accesses->{$times->[$#{$times}]}->{'eof'})) { $pattern = 'entire'; } else { $pattern = 'sequential'; } } else { if ($debug) { print LOG "metric is not zero\n"; } } # we stick a + on the end to show that there was more than one access my $output = $sortMethod.' s'.$sortDistance.' '.$opname.' '.$pattern.' size:'.$accesses->{$times->[$#{$times}]}->{'size'}." rb:$bytesRead wb:$bytesWritten rc:$readCount wc:$writeCount m:$metric v:$variance sv:$sqrtVariance $name +"; if ($debug) { print LOG "$output\n"; } print "$output\n"; } sub printSequentialityPattern { my ($accesses, $times, $sortDistance, $name) = @_; if ($#{$times} == 0) { my $output = $sortMethod.' s'.$sortDistance.' '; my $bytesRead = 0; my $bytesWritten = 0; my $readCount = 0; my $writeCount = 0; if ($accesses->{$times->[0]}->{'opname'} eq 'read') { $bytesRead = $accesses->{$times->[0]}->{'count'}; $readCount = 1; } else { $bytesWritten = $accesses->{$times->[0]}->{'count'}; $writeCount = 1; } $output .= $accesses->{$times->[0]}->{'opname'} ." size:".$accesses->{$times->[0]}->{'size'}." rb:$bytesRead wb:$bytesWritten rc:$readCount wc:$writeCount m1:-1.000 m2:-1.000 $name"; if ($debug) { print LOG "$output\n"; } print "$output\n"; return; } ########################## # finished with one access case my $opname = ''; my $readCount = 0; my $writeCount = 0; my $bytesRead = 0; my $bytesWritten = 0; if ($accesses->{$times->[0]}->{'opname'} eq 'read') { $readCount = 1; $bytesRead = $accesses->{$times->[0]}->{'count'}; } else { $writeCount = 1; $bytesWritten = $accesses->{$times->[0]}->{'count'}; } my $count = 0; my $sequentialCountM1 = 0; my $sequentialCountM2 = 0; $opname = $accesses->{$times->[0]}->{'opname'}; for (my $i = 1; $i <= $#{$times}; $i++) { if ( (($opname eq 'read') && ($accesses->{$times->[$i]}->{'opname'} eq 'write')) || (($opname eq 'write') && ($accesses->{$times->[$i]}->{'opname'} eq 'read'))) { $opname = 'rw'; } ################ # bytes read/written if ($accesses->{$times->[$i]}->{'opname'} eq 'read') { $readCount++; $bytesRead += $accesses->{$times->[$i-1]}->{'count'}; } if ($accesses->{$times->[$i]}->{'opname'} eq 'write') { $writeCount++; $bytesWritten += $accesses->{$times->[$i-1]}->{'count'}; } my $ratio = 0.0; { # DJE my $prev_time = $accesses->{$times->[$i-1]}->{'time'}; my $curr_time = $accesses->{$times->[$i]}->{'time'}; my $diff_time = $curr_time - $prev_time; if ($diff_time < 0) { printf ("DJE time-skip %g %s\n", $diff_time, $diff_time < 0 ? "XXX" : ''); } } my $prevOffset = ceil ($accesses->{$times->[$i-1]}->{'off'}/$blocksize); my $curOffset = ceil ($accesses->{$times->[$i]}->{'off'}/$blocksize); my $blocksAccessed = ceil ($accesses->{$times->[$i-1]}->{'count'} / $blocksize); my $offsetDiff = $curOffset - $prevOffset; if ($offsetDiff != $blocksAccessed) { ; } else { $sequentialCountM1++; } if ($offsetDiff >= $blocksAccessed && $offsetDiff < $blocksAccessed + 10) { $sequentialCountM2++; } else { ; } $count++; } my $metricM1 = $sequentialCountM1 / $count; my $metricM2 = $sequentialCountM2 / $count; $metricM1 = sprintf ("%6.3f", $metricM1); $metricM1 =~ s/\s//g; $metricM2 = sprintf ("%6.3f", $metricM2); $metricM2 =~ s/\s//g; if ($debug) { print LOG "done this access: seqCountM1 $sequentialCountM1 seqCountM2 $sequentialCountM2 totalCount: $count "; # might want to include size as well if ( ($metricM1 < 0.5) && ($metricM2 > .45)) { print LOG "Mswitch"; } print LOG "\n"; } # we stick a + on the end to show that there was more than one access my $output = $sortMethod.' s'.$sortDistance.' '.$opname.' size:'.$accesses->{$times->[$#{$times}]}->{'size'}." rb:$bytesRead wb:$bytesWritten rc:$readCount wc:$writeCount m1:$metricM1 m2:$metricM2 $name +"; if ($debug) { print LOG "$output\n"; } print "$output\n"; } sub separateAccesses { my ($accesses, $times, $sortDistance, $name) = @_; my $cardinality = $#{$times}; my @firstAccess = (); for (my $i = 0; $i <= $cardinality; $i++) { $firstAccess[$#firstAccess+1] = $times->[$i]; # if the accesses are more than 4 seconds apart, we assume they # are actually different accesses and should be evaluated separately. # we do the same if the accesses are more than one second apart and # the access is to the same location in the file, # or to the beginning of the file. # ($i < $cardinality) && ($times->[$i+1] > $times->[$i] + 4000000) || # (($i < $cardinality) && ($times->[$i+1] > $times->[$i] + 3000000) && # (($accesses->{$times->[$i+1]}->{'off'} == 0) || # ($accesses->{$times->[$i+1]}->{'off'} == $accesses->{$times->[$i]}->{'off'})))) { if (defined($accesses->{$times->[$i]}->{'eof'})) { &printSequentialityPattern ($accesses, \@firstAccess, $sortDistance, $name); # &printAccessPattern ($accesses, \@firstAccess, $sortDistance, $name); @firstAccess = (); } } if ($#firstAccess >= 0) { &printSequentialityPattern ($accesses, \@firstAccess, $sortDistance, $name); # &printAccessPattern ($accesses, \@firstAccess, $sortDistance, $name); } } sub sortTimes { my ($times, $accesses, $sortDistance) = @_; if (!$sortAccesses) { return; } my $distanceInMilliseconds = $sortDistance * 1000; for (my $i = 0; $i <= $#{$times}; $i++) { $accessCount++; # find Tprime = Tp, where Tp is # entries that are near enough to my $minIndex = $i; my $minValue = $accesses->{$times->[$i]}->{'off'}; for (my $j = $i + 1; ($j <= $#{$times}) && ($times->[$j]-$times->[$i] < $distanceInMilliseconds); $j++) { if ($accesses->{$times->[$j]}->{'off'} < $minValue) { $minIndex = $j; $minValue = $accesses->{$times->[$j]}->{'off'}; } } if ($minIndex != $i) { if ($debug) { print LOG "swapping $i ".$times->[$i]." with $minIndex ".$times->[$minIndex]."\n"; } if ($accesses->{$times->[$i]}->{'opname'} ne $accesses->{$times->[$minIndex]}->{'opname'}) { if ($debug) { print LOG "swap cancelled because not both reads or writes\n"; } } else { my $tmp = $times->[$i]; $times->[$i] = $times->[$minIndex]; $times->[$minIndex] = $tmp; $swapCount++; } } } } ##################################################### # purgeAccesses # # Remove any stale accesses and print their access pattern # # uid2call is actually fd2access sub purgeAccesses { my ($uid2call, $currentTime) = @_; # if ($debug) { # print LOG "Start of purgeAccesses\n\n"; # } foreach my $fd (keys %$uid2call) { next if ($uid2call->{$fd}->{'atime'} >= $currentTime - $expirationTime); my $accesses = $uid2call->{$fd}->{'accesses'}; my @times = keys %$accesses; # if ($uid2call->{$fd}->{'accessCount'} >= 1) { # foreach my $sortDistance (@sortDistances) { # first, let's get @times into a good order # we do this each time to restore 'times' to its # original value @times = sort {$a <=> $b} @times; if ($debug) { if ($#times >= 1) { print LOG "-------------------------------\n"; &printHash ($uid2call->{$fd}); print LOG "Before:\n"; &printAccessList ($accesses, \@times); } print LOG "end of accesses\n"; } if ($sortAccesses) { &sortTimes (\@times, $accesses, $sortDistance); if ($debug) { if ($#times >= 1) { print LOG "After:\n"; &printAccessList ($accesses, \@times); } } } &separateAccesses ($accesses, \@times, $sortDistance, $uid2call->{$fd}->{'name'}); # } if ($debug) { print LOG "\n"; } # print "\n\n"; # } else { # make the common case fast # print "single access\n"; # } foreach my $mtime (keys %$accesses) { delete $accesses->{$mtime}; } delete $uid2call->{$fd}; } } ##################################################### sub purgeUid2Call { my ($uid2call, $currentTime) = @_; foreach my $uid (keys %$uid2call) { my $seconds = $uid2call->{$uid}->{'time'}; # print LOG "currentTime: $currentTime seconds: $seconds "; $seconds =~ s/\..*//; # print LOG "seconds2: $seconds\n"; if ($seconds + 8 < $currentTime) { # print LOG "ding\n"; # &printHash ($uid2call->{$uid}); delete $uid2call->{$uid}; } } } ##################################################### sub printAccessList { my ($accesses, $times) = @_; # foreach my $mtime (@$times) { for (my $i = 0; $i < @$times; $i++) { my $mtime = $times->[$i]; my $xact = $accesses->{$mtime}; print LOG "$mtime: ".$xact->{'opname'}. " off ".$xact->{'off'}; print LOG " count ".$xact->{'count'}; print LOG " size ".$xact->{'size'}; print LOG " time ".$xact->{'time'}; if (defined ($xact->{'eof'})) { print LOG " eof"; } my $diff = 0; if ($i > 0) { $diff = $xact->{'off'} - $accesses->{$times->[$i-1]}->{'off'}; } print LOG " diff $diff\n"; } print LOG "\n"; } ##################################################### # fileSizeToIndex # # Take a file size in kilobytes and return the index # of the bucket into which it falls # e.g. .4k would fall into bucket 0. # 1.1k rounds up to bucket 1. sub fileSizeToIndex { my ($size) = @_; if ($size < $fileSizes[0]) { return 0; } for (my $i = 1; $i <= $#fileSizes; $i++) { if ($size > $fileSizes[$i-1] && $size <= $fileSizes[$i]) { return $i; } } print LOG "max file size: $size\n"; return $#fileSizes; } ##################################################### # printHash # # debugging fn # sub printHash { my ($hash) = @_; foreach my $key (keys %$hash) { print LOG "$key ".$hash->{$key}.":: "; print LOG "$key ".$hash->{$key}.":: "; } print LOG "\n"; } ##################################################### # main # sub main { my %opts = (); getopts('e:dhm:l:w:', \%opts); # &printHash (\%opts); foreach my $opt (keys %opts) { if ($opt eq 'd') { $debug = 1; } elsif ($opt eq 'e') { $expirationTime = $opts{$opt}; # } elsif ($opt eq 'n') { # $findFilenames = 1; } elsif ($opt eq 'm') { if ($opts{'m'} eq 'n') { $sortAccesses = 0; $removeDuplicates = 0; $sortDistance = 0; } elsif ($opts{'m'} eq 'r') { $sortAccesses = 1; $removeDuplicates = 0; } elsif ($opts{'m'} eq 'd') { $sortAccesses = 1; $removeDuplicates = 1; } else { die ("Bad option supplied for -m\n$usage\n"); } $sortMethod = $opts{'m'}; } elsif ($opt eq 'w') { $sortDistance = $opts{'w'}; } elsif ($opt eq 'l') { $maxLineCount = $opts{'l'}; } else { die ($usage); } } if (!defined $opts{'m'}) { die ("Sort method -m required.\n$usage\n"); } &parseInput (); if ($sortAccesses) { my $swapPct = $swapCount / $accessCount; $swapPct = sprintf ("%1.6f", $swapPct); my $swapActivity = "swapCount: $swapCount accessCount: $accessCount pct: $swapPct"; print "$swapActivity\n"; print LOG "$swapActivity\n"; } print LOG "Finished\n"; close (LOG); print "Finished\n"; } &main(); exit (0);