#!/usr/local/bin/perl
#
# mss_mirror_prep
#
# Version: 1.2
#
# Take a list of NCAR MSS file information, preordered to meet some criteria,
# such as tape mount efficiency, and produce one or more lftp scripts
# to be used to mirror that list to local storage.
#
# Copyright 2007, University Corporation for Atmospheric Research
# All Rights Reserved
#

use 5.6.0;
use strict;
use warnings;
use English;
use Errno;
use Cwd;
use Getopt::Long;
use Pod::Usage;

################################################################################
#
# Command line option processing
#
################################################################################

my $account = '';
my $createDirs = 1;
my $dirList = '';
my $host = 'mssftp.ucar.edu';
my $filesPerScript = 0;
my $lftpPath = 'lftp';          # NOTE(review): parsed but not referenced below
my $lftpScriptPrefix = 'lftp.script';
my $overwrite = 'never';
my $parallelTransfers = 2;
my $port = 21;
my $project = '';
my $rpwd = '';                  # NOTE(review): parsed but not referenced below
my $scripts = 0;
my $socketBuffer = 1048576;
my $stripDirs = 0;
my $timeout = 1800;
my $target = '';
my $usage = 0;
my $warningIsError = 0;

my $status = GetOptions(
    'account=s'            => \$account,
    'create-dirs!'         => \$createDirs,
    'dir-list=s'           => \$dirList,
    'files-per-script=i'   => \$filesPerScript,
    'help'                 => sub { $usage = 1 },
    'host=s'               => \$host,
    'lftp-path=s'          => \$lftpPath,
    'lftp-script-prefix=s' => \$lftpScriptPrefix,
    'man'                  => sub { $usage = 2 },
    'overwrite=s'          => \$overwrite,
    'parallel-transfers=i' => \$parallelTransfers,
    'port=i'               => \$port,
    'project=s'            => \$project,
    'rpwd=s'               => \$rpwd,
    'scripts=i'            => \$scripts,
    'socket-buffer=i'      => \$socketBuffer,
    'strip-dirs=i'         => \$stripDirs,
    'timeout=i'            => \$timeout,
    'target=s'             => \$target,
    # Negatable, so that the documented --nowarning-is-error is accepted.
    'warning-is-error!'    => \$warningIsError,
);

pod2usage(-exitval => 0, -verbose => $usage) if ($usage);
exit(1) if (!$status);

################################################################################
#
# Global variables
#
################################################################################

my $checkSizes = 0;     # true when --overwrite sizemismatch is in effect
my $clobber;            # 'yes' or 'no', written to the lftp script prolog
my $warningGiven = 0;   # becomes the exit status of the program
my %dirs;               # local directories needed by the files to fetch
my @files;              # list of [MSS name, local name] pairs to fetch

################################################################################
#
# Subroutines
#
################################################################################

#
# Error(msg)
#
# Print "ERROR: msg" on STDERR and exit with status 1.  Does not return.
#
sub Error {
    my ($msg) = @_;

    print STDERR "ERROR: $msg\n";
    exit(1);
}

#
# Warn(msg [, suppressNote])
#
# Print "WARNING: msg" on STDERR and remember that a warning was given.
# If a second argument is passed (any defined value), a note is appended
# saying that further warnings of this type are suppressed; the caller is
# responsible for actually suppressing them.  When --warning-is-error is
# in effect the message is reported through Error() instead.
#
sub Warn {
    my ($msg, $supMsg) = @_;

    Error($msg) if ($warningIsError);

    my $note = defined($supMsg)
        ? ', further warnings of this type suppressed'
        : '';
    print STDERR "WARNING: $msg$note\n";
    $warningGiven = 1;
}

#
# CreateScript(name, n, files)
#
# Write the lftp script file "name".  Up to "n" entries are removed from
# the front of the array referenced by "files"; each entry is a
# [MSS name, local name] pair and becomes one queued "get" command.
# Exits through Error() if the file cannot be created or written.
#
sub CreateScript {
    my ($name, $n, $files) = @_;

    open(my $s, '>', $name) or Error("$name: $!");

    # prolog
    print $s <<"EOF";
set net:socket-buffer $socketBuffer
set cmd:parallel $parallelTransfers
set cmd:queue-parallel $parallelTransfers
set ftp:ssl-force yes
set net:connection-limit $parallelTransfers
set net:timeout $timeout
set ssl:verify-certificate no
set xfer:clobber $clobber
open -u $account -p $port $host
EOF

    if ($project ne '') {
        print $s "site proj $project\n";
    }

    # Queue everything while the queue is stopped, then start it in the
    # epilog.
    print $s "queue stop\n";

    while ((scalar @$files > 0) && ($n > 0)) {
        my $pair = shift(@$files);
        print $s "get $pair->[0] -o $pair->[1]\n";
        $n--;
    }

    # epilog
    print $s <<"EOF";
queue start
wait all
EOF

    # Buffered write errors only show up at close time.
    close($s) or Error("$name: $!");
}

#
# MakeTree(dir)
#
# Create directory "dir" and any missing parent directories.  Exits
# through Error() if a path component exists but is not a directory, or
# if a directory cannot be created.
#
sub MakeTree {
    my ($d) = @_;

    return if (-d $d);

    if (-e _) {
        Error("$d: will be a parent directory but is not a directory");
    }

    my $i = rindex($d, '/');
    return if ($i == -1);

    MakeTree(substr($d, 0, $i));

    if (!mkdir($d)) {
        my $err = "$!";
        # Tolerate the directory having appeared in the meantime.
        Error("$d: $err") unless (-d $d);
    }
}

#
# ProcessFileList(fileName)
#
# Read one file list.  Each usable line adds a [MSS name, local name]
# pair to @files and, when directories are being created or listed, the
# local parent directory to %dirs.  Malformed lines are reported with
# Warn() and skipped.  With --overwrite sizemismatch, files whose local
# size already matches the size in the list are skipped.
#
sub ProcessFileList {
    my ($fn) = @_;

    my $ln = 1;
    my $quietEmpty = 0;
    my $quietSizeMissing = 0;

    #
    # The split will produce a null field from before the leading slash,
    # thus we need to account for it in the split limit argument.
    #
    my $strip = $stripDirs + 2;

    open(my $fh, '<', $fn) or Error("$fn: $!");

    while (defined(my $line = <$fh>)) {
        # split(' ') skips leading white space, as documented for the
        # file list format.
        my @f = split(' ', $line);
        my $fields = scalar @f;

        if ($fields < 1) {
            if (!$quietEmpty) {
                Warn("$fn:$ln: empty or otherwise malformed", 1);
                $quietEmpty = 1;
            }
            next;
        }

        my $mssName = $f[0];
        my $nameLen = length($mssName);

        #
        # A valid absolute MSS file name has the form
        #
        #     /<first>/<rest>
        #
        # where <first> must be at least two characters long and <rest>
        # must be at least one character long (hence the minimum length
        # of 5).  The total length must be no larger than 128 characters.
        #
        if (($nameLen < 5) || ($nameLen > 128)) {
            Warn("$fn:$ln: invalid MSS file name length");
            next;
        }

        if (substr($mssName, 0, 1) ne '/') {
            Warn("$fn:$ln: absolute MSS file name required");
            next;
        }

        if (substr($mssName, -1) eq '/') {
            Warn("$fn:$ln: invalid name contains a trailing slash");
            next;
        }

        my $localName;
        if ($strip <= 2) {
            $localName = $target . $mssName;
        } else {
            my @n = split(/\//, $mssName, $strip);
            if ($strip != scalar(@n)) {
                Error("$fn:$ln: too many directories stripped");
            }
            $localName = $target . '/' . pop(@n);
        }

        if ($checkSizes) {
            if ($fields < 2) {
                if (!$quietSizeMissing) {
                    Warn("$fn:$ln: size field missing", 1);
                    $quietSizeMissing = 1;
                }
            } else {
                my $size = $f[1];
                if ($size !~ /^\d+$/) {
                    Warn("$fn:$ln: size field invalid");
                } elsif (my @st = stat($localName)) {
                    # Already present with the expected size: skip it.
                    next if ($size == $st[7]);
                } elsif (!$!{ENOENT}) {
                    Warn("stat $localName: $!");
                    next;
                }
            }
        }

        push(@files, [$mssName, $localName]);

        if ($createDirs || $dirList) {
            my $i = rindex($localName, '/');
            if ($i < 0) {
                Error("$localName: malformed local target directory name");
            }
            my $d = substr($localName, 0, $i);
            $dirs{$d} = $d;
        }
    } continue {
        $ln++;
    }

    close($fh);
}

################################################################################
#
# Start of main processing
#
################################################################################

if (scalar(@ARGV) < 1) {
    Error("require file list argument(s)");
}

if ($account eq '') {
    $account = defined($ENV{'LOGNAME'}) ? $ENV{'LOGNAME'} : '';
    if ($account eq '') {
        Error("no value provided or defaulted for --account");
    }
}

if ($filesPerScript < 0) {
    Error("--files-per-script must be positive");
}

if ($overwrite eq 'never') {
    $clobber = 'no';
} elsif ($overwrite eq 'always') {
    $clobber = 'yes';
} elsif ($overwrite eq 'sizemismatch') {
    $clobber = 'yes';
    $checkSizes = 1;
} else {
    Error("--overwrite must be one of 'always', 'never' or 'sizemismatch'");
}

if ($parallelTransfers < 0) {
    Error("--parallel-transfers must be positive");
}

if ($port < 0) {
    Error("--port must be positive");
}

if ($scripts < 0) {
    Error("--scripts must be positive");
}

if ($stripDirs < 0) {
    Error("--strip-dirs must be positive");
}

if ($target eq '') {
    $target = getcwd();
} elsif (!-d $target) {
    Error("target_directory '$target' is not a directory");
}

if ($timeout < 0) {
    Error("--timeout must be positive");
}

#
# Read the file list(s) and process them.
#
foreach my $fn (@ARGV) {
    ProcessFileList($fn);
}

my $nFiles = scalar @files;

if ($nFiles == 0) {
    Error("No MSS file names were found");
}

my $dl;
if ($dirList) {
    open($dl, '>', $dirList) or Error("$dirList: $!");
}

if ($createDirs || $dirList) {
    foreach my $d (sort keys %dirs) {
        print $dl "$d\n" if ($dirList);
        MakeTree($d) if ($createDirs);
    }
}

if ($dirList) {
    close($dl) or Error("$dirList: $!");
}

#
# Create the lftp scripts
#
# Both quantities are integer ceilings: a fractional files-per-script
# count would overfill the early scripts, and a rounded script count
# could come out one too many.
#
if ($scripts > 0) {
    $filesPerScript = int(($nFiles + $scripts - 1) / $scripts);
} elsif ($filesPerScript == 0) {
    $filesPerScript = $nFiles;
    $scripts = 1;
} else {
    $scripts = int(($nFiles + $filesPerScript - 1) / $filesPerScript);
}

my $digits = length(sprintf("%d", $scripts));

for (my $i = 0; (scalar @files > 0) && ($i < $scripts); $i++) {
    my $n;

    if ($scripts == 1) {
        $n = $lftpScriptPrefix;
    } else {
        $n = sprintf("%s.%0${digits}d", $lftpScriptPrefix, $i);
    }
    CreateScript($n, $filesPerScript, \@files);
}

exit($warningGiven);

__END__

=head1 NAME

mss_mirror_prep - Prepare for mirroring NCAR MSS directory trees using B<lftp>.

=head1 SYNOPSIS

mss_mirror_prep [options] file_list [...]

=head1 OPTIONS

=over 4

=item B<--account> I<name>

Specifies the UCAS account name to use when connecting to the MSS FTP Service.

Default: (Environment variable C<$LOGNAME>).

=item B<--[no]create-dirs>

If B<--create-dirs> is specified, B<mss_mirror_prep> will precreate the
required local directory tree (under F<target_directory>) as determined by
the contents of F<file_list>.  This is necessary because B<lftp> does not
currently create any required local parent directories for files being
fetched by the I<get> command.

If B<--nocreate-dirs> is specified, B<mss_mirror_prep> will not precreate
the required local directory tree.  Note that if any required directories
are missing, B<lftp> will generate error messages.

Default: B<--create-dirs>

=item B<--dir-list> F<file>

Specify the name of a file in which to place the list of the required local
directory tree.  If this option is provided, it will be produced regardless
of the state of the B<--[no]create-dirs> options.

Default: (none)

=item B<--files-per-script> I<n>

Specify the number of files to fetch per lftp script.
NOTE: See also the description of the B<--scripts> option.

Default: I<0> (unlimited)

=item B<--help>

Display help information.

=item B<--host> F<host>

Specify the host name of the MSS FTP Service.

Default: F<mssftp.ucar.edu>

=item B<--lftp-path> F<path>

Specify the path name of the lftp command.

Default: F<lftp>

=item B<--lftp-script-prefix> F<prefix>

Specify the prefix for the generated lftp script file(s).  If more than one
script file is generated, the file names will be suffixed with a period
followed by an incrementing sequence number.

Default: F<lftp.script>

=item B<--man>

Display the complete man page.

=item B<--overwrite> I<mode>

Controls the overwrite behavior of B<mss_mirror_prep> and B<lftp>.
The values for I<mode> are one of I<always>, I<never> or I<sizemismatch>.

If I<always> is specified, lftp will be allowed to overwrite local files
(lftp I<xfer:clobber>).

If I<never> is specified, lftp will not overwrite local files
(lftp I<xfer:clobber>).

If I<sizemismatch> is specified, B<mss_mirror_prep> will compare the size of
any file already present under F<target_directory> to see if it matches the
size given for the corresponding MSS file in the F<file_list> argument(s).
If the sizes match, B<mss_mirror_prep> will skip fetching that file again.
This is useful if you need to restart the mirroring process.

Default: I<never>

=item B<--parallel-transfers> I<n>

Specifies the number of parallel transfers to run per job.
(lftp I<cmd:parallel>).

Default: I<2>

=item B<--port> I<port>

Specifies an alternate TCP port number to use when connecting to the
MSS FTP Service.

Default: 21 (The standard FTP control port number.)

=item B<--project> I<project>

Specifies the NCAR project number to use when reading files from the MSS.

Default: (The MSS FTP Service uses the same default project number as is
chosen by the DCS servers.)

=item B<--rpwd> I<password>

Specifies the read password of B<all> of the MSS files listed in
F<file_list>.  Note that it is not possible to mirror a list of files with
differing read passwords.

Default: (none)

=item B<--scripts> I<n>

Specifies the number of lftp script files to produce.
Specifying this option will cause B<mss_mirror_prep> to try to evenly
distribute the number of files between I<n> scripts, overriding any value
specified for the B<--files-per-script> option.  This is useful to run
several B<lftp> jobs in parallel to help overlap MSS retrieval and network
latencies.

Default: I<1>

=item B<--socket-buffer> I<bytes>

Specifies the socket buffer size to use.  This is passed to the
C<setsockopt> system call to set the C<SO_SNDBUF> and C<SO_RCVBUF>
parameters.  To get the best performance, it may be necessary to increase
this parameter, and it may also be necessary for your system administrator
to set kernel tuning parameters.

Default: I<1048576>

=item B<--strip-dirs> I<n>

Specifies the number of leading directory components to strip from the
MSS file name before concatenation with F<target_directory>.

Default: I<0>

=item B<--target> F<target_directory>

Specify the path of the local target directory.

Default: (The current working directory.)

=item B<--timeout> I<seconds>

Specifies the number of seconds before causing a timeout due to lack of
response from the MSS FTP Service.  It is highly recommended not to set a
number lower than the default value.

Default: I<1800>

=item B<--[no]warning-is-error>

If B<--warning-is-error> is specified, B<mss_mirror_prep> will treat warning
messages as if they were an error.  This will cause an immediate exit with
a non-zero status code.

If B<--nowarning-is-error> is specified, B<mss_mirror_prep> will print
warning messages and attempt to continue.

Default: B<--nowarning-is-error>

=back

=head1 ARGUMENTS

=over 4

=item F<file_list>

The name(s) of the input file list(s).  A file list contains one entry for
each MSS file per line, with one or more fields separated by one or more
(non-newline) white space characters.  Leading white space will be skipped.
The first field must be an absolute MSS file name.  If
B<--overwrite sizemismatch> was specified, the second field is the decimal
MSS file size.  Any additional trailing fields will be ignored.


=back

=head1 DESCRIPTION

The B<Perl> B<mss_mirror_prep> script is used in conjunction with the
B<lftp> program to help mirror a directory tree resident on the NCAR MSS
Service to local storage.  You, or your system administrator, are
responsible for obtaining and installing the B<lftp> program (the URL is
given below in the L</SEE ALSO> section).

The B<mss_mirror_prep> script reads one or more input files containing the
names (and optionally the expected sizes) of MSS files you wish to copy to
your local system.  It generates one (or more) B<lftp> scripts to fetch the
files from the MSS.

If you have a small number of files to move, you can generate your own
file list (see the L</ARGUMENTS> section above for the format) and use a
single B<lftp> script.

If you are moving a large amount of data, please open a request for
MSS Group consultation with CISL Customer Support
(http://www.cisl.ucar.edu/support/index.jsp) describing the data you need
to move and your required timeframe.  The MSS Group will work with you to
determine an efficient order in which to fetch files from the MSS.

B<NOTE:> The MSS FTP Service limits the number of simultaneous connections
you can make.  This number is configurable on a per-user basis, if
required.  By default, B<mss_mirror_prep> configures B<lftp> to run 2
transfers in parallel.  This should be sufficient to keep a MSS tape with
multiple files of interest mounted and reduce the latency to retrieve them.

=head1 EXAMPLES

To have B<mss_mirror_prep> generate a single B<lftp> script to fetch a set
of files (given in F<file.list>) with UCAS user name I<user>, under your
current directory, run:

    mss_mirror_prep --account user file.list
    lftp -f lftp.script

The B<lftp> program will prompt you for your UCAS password at the beginning
of the session.

To gain some more parallelism, you can have B<mss_mirror_prep> generate
additional B<lftp> script files, which you can feed to separate B<lftp>
processes.
For example, we will specify 2 scripts, a different local target directory
(F</mirror>), have B<mss_mirror_prep> strip off the first level of the MSS
path names and to skip files we have successfully already fetched from the
MSS:

    mss_mirror_prep --account user --strip-dirs 1 --target /mirror \
        --scripts 2 --overwrite sizemismatch file.list

Then you run two B<lftp> programs in different terminal windows with the
resulting script files (F<lftp.script.0> and F<lftp.script.1>).

=head1 SEE ALSO

=over 4

=item http://www.cisl.ucar.edu/mss/quick.html

=item http://www.cisl.ucar.edu/mss/ftp/

=item http://lftp.yar.ru/

=item L<lftp(1)>

=back

=head1 COPYRIGHT

Copyright 2007, University Corporation for Atmospheric Research.
All Rights Reserved.