#!/usr/bin/perl -w
#==============================================================================
# Script:   desift.pl
# Author:   James Shimada
# Purpose:  desift is a data-driven, template-based text formatter.
#==============================================================================

use strict;
use Getopt::Std;

# variables for command-line options
my $template = 'T';
my $tmplfile = 't';
my $delim    = 'd';
my $skiprows = 's';
my $tagbegin = 'B';
my $tagend   = 'E';

# set the default values for options
my %opts = (
	$tmplfile => '',
	$delim    => '\t',
	$skiprows => '^$',
	$tagbegin => '%',
	$tagend   => '',
	$template => '%0'
);

my $err = "";
my %flds;

#------------------------------------------------------------------------------
# Function: showUsage
# Purpose:  Prints usage info if --help option was supplied.
# Input:    None.
# Output:   Prints to stdout
#------------------------------------------------------------------------------
sub showUsage() {
print <<EOF;
Usage: perl desift.pl [options] [input-file] ...

  options:
  -d REGEX      Field delimiter in input-file(s). Default is "\\t" (tab).
  -t FILE       Format data using template contained in FILE.
  -T TEMPLATE   Format using TEMPLATE. By default, no filter applied ("%0").
  -s REGEX      Skip rows in input-file(s) matching REGEX.
  -B TAGSTART   Part of template tag before field index. Default is "%".
  -E TAGEND     Part of template tag after field index. Default is null.

EOF
}

#------------------------------------------------------------------------------
# Function: getTemplate
# Purpose:  Retrieves a string containing the template from file or argument
# Input:    uses script-level %opts hash
# Output:   returns template text as string
#           Sets script-level $err variable if error occured.
#------------------------------------------------------------------------------
sub getTemplate {
	my $tmpl;
	# Get the template text
	if( $opts{$tmplfile} ) {
		open( TMPL, "<$opts{$tmplfile}" ) 
		or $err = "Cannot open '$opts{$tmplfile}': $!\n";
		return if( $err );
		while( $_ = <TMPL> ) {
			$tmpl .= $_;
		}
		close( TMPL );
	} elsif( $opts{$template} ) {
		# automatically add a new line to the supplied template
		$tmpl = "$opts{$template}\n";
	}

	# remember what fields are specified in the template
	my $tagre = $opts{$tagbegin} . '(\d+)' . $opts{$tagend};
	while ( $tmpl =~ /$tagre/g ) {
		$flds{$1}++ if( $1 );
	}
	return $tmpl;
}

#------------------------------------------------------------------------------
# Function: batchit
# Purpose:  Central text-formatting function.
# Input:    stdin, script-level %opts hash
# Returns:  prints formatted text to stdout
#           Sets script-level $err variable if error occured.
#------------------------------------------------------------------------------
sub batchit {
	my $tmpl;
	my $tmpl2;
	my $fld;
	my @cols;
	my $tag;
	
	$tmpl = getTemplate();
	return if( $err );
	# open the data file to do work on it
	while ( <> ) {
		# trim it, chomp it, and skip blank lines
		chomp;
		# skip this line if necessary
		next if (/$opts{$skiprows}/);
		# get a fresh template for this line
		$tmpl2 = $tmpl;
		# expand the "whole line" tag
		$tag = $opts{$tagbegin} . 0 . $opts{$tagend};
		$tmpl2 =~ s/$tag/$_/g;
		# split the line (default variable) on delimiter
		@cols = split( $opts{$delim} );
		# for all fields specified in the template, expand the 
		# field with the corresponding element in the data
		for $fld ( keys %flds ) {
			$tag = $opts{$tagbegin} . $fld . $opts{$tagend};
			# if the field specified in the template does not exist
			# in the data, silently expand the field to null
			if( defined( $cols[$fld-1] )) {
				$tmpl2 =~ s/$tag/$cols[$fld-1]/g;
			} else {
				$tmpl2 =~ s/$tag//g;
			}
		}		
		# and out we go
		print $tmpl2;
	}
}

#------------------------------------------------------------------------------
# Function: isValidOpt
# Purpose:  Validate the value for an option.
# Input:    $optkey, $optval
# Returns:  true if valid
#           Sets script-level $err variable if error occured.
#------------------------------------------------------------------------------
sub isValidOpt {
	my ( $optkey, $optval ) = @_;
	#print "Is $optval a valid value for $optkey?\n";
	# validate template file
	if( $optkey eq $tmplfile ) {
		if( -f $optval ) {
			return 1;
		} else {
			$err .= "Invalid template '$optval'. No such file.\n";
			return 0;
		}
	}
	return 1;
}

#------------------------------------------------------------------------------
# Function: getArgs
# Purpose:  Retrieve command-line options
# Input:    command-line options
# Returns:  populates the script-level %opts hash with the options
#           Sets script-level $err variable if error occured.
#------------------------------------------------------------------------------
sub getArgs {
	my( %args, $arg );
	getopt( join( "", keys %opts ), \%args );
	# make sure each arg submitted is recognized in a generic fashion
	for $arg ( keys %args ) {
		# print "$arg --> $args{$arg} \n";
		if( !inArray( $arg, keys %opts )) {
			$err .= "Unrecognized option '$arg'\n";
		} else {
			if( isValidOpt( $arg, $args{$arg} )) {
				$opts{$arg} = $args{$arg};
			}
		}
	}
	# make sure the tag parts are regex-safe
	$opts{$tagbegin} = quotemeta( $opts{$tagbegin} );
	$opts{$tagend} = quotemeta( $opts{$tagend} );
	
	# now check any input files for errors
	for $arg ( @ARGV ) {
		if( !-f $arg ) {
			$err .= "Invalid input file '$arg'. No such file.\n";
		}
	}
}

#------------------------------------------------------------------------------
# Function: inArray
# Purpose:  Test if a value is an element of an array
# Input:    $val, @ary
# Returns:  true if value is found in array
#------------------------------------------------------------------------------
sub inArray {
	my $val = shift;
	for my $elem ( @_ ) {
		if( $val eq $elem ) {
			return 1;
		}
	}
	return 0;
}

#------------------------------------------------------------------------------
# Function: printErr
# Purpose:  Print error information and exit
# Input:    script-level $err variable.
# Returns:  prints to sterr
#------------------------------------------------------------------------------
sub printErr {
	print STDERR $err;
	exit(1);
}

#------------------------------------------------------------------------------
#------------------------------------------------------------------------------

if( $#ARGV > -1 && $ARGV[0] eq '--help' ) {
	showUsage();
	exit(0);
}

getArgs();
batchit() unless( $err );
printErr() if( $err );

exit(0);

=head1 README

I<desift> is a data-driven, temlate-oriented text filter. It occupies a 
niche somewhere between spartan shell utilities such as 'cut' and 
full-featured template engines.

=head1 SYNOPSIS

Command-line usage:

 perl desift.pl [options] [input-file ...]

 options:
 -d REGEX      Field delimiter in input-file(s). Default is "\t" (tab).
 -t FILE       Format data using template contained in FILE.
 -T TEMPLATE   Format using TEMPLATE. By default, no filter applied ("%0").
 -s REGEX      Skip rows in input-file(s) matching REGEX. 
 -B TAGSTART   Part of template tag before field index. Default is "%".
 -E TAGEND     Part of template tag after field index. Default is null.

If no C<input-file> then desift users standard input.

=head1 DESCRIPTION

Like cut, printf, and other multi-purpose shell scripting
favorites, desift provides generic text formatting and
filtering functions accessible from the shell. It uses
standard input and output channels to maximize how it can be
leveraged. Desift is particularly effective when combined in
a pipeline with programs that query a data source or perform
non-interactive text editing. It's lightweight and easy to
use for impromptu tasks. While you can create separate template
files to use for repeated or rigorous formatting tasks, you
can also simply specify a template as a command-line option,
in the same vein as sed allows you to provide a simple
search and replace function as an argument to perform edits
(as opposed to an external sed script file).

=head1 SCRIPT CATEGORIES

Unix/System_administration

CPAN/Administrative

CPAN

=head1 AUTHOR

James Shimada

jshimada ( a t ) netjs ( d o t ) com

www.netjs.com

=cut
