#!/usr/bin/perl -s

# tsv - stdin filter to extract/manipulate columns of tab-separated text
# Steve Kinzler, steve@kinzler.com, Jan 06
# https://kinzler.com/me/home.html#unix

require 5.000;

die <<EOF if $h;
usage: $0 [ -b | -s | -t ] [ arg ... ]
	-b	split on each blankspace, join with a single blankspace
	-s	split on whitespace sequences, join with a single tab
	-t	split on tab sequences, join with a single tab
Arguments are output column specifications in these forms:
	#	this row's value for column #, numbered from 1
	nf	this row's value for the last column
	x:y	this row's values for column range x thru y
	x::y	this row's values for column range x thru y except named cols
	@	this row's values for all columns
	@@	this row's values for all columns except named cols
or Perl expressions containing these shorthands:
	c#	this row's value for column #, numbered from 1
	C#	this row's value for column #, numbered from 0
	i#	previous input row's value for column #, numbered from 1
	I#	previous input row's value for column #, numbered from 0
	o#	previous output row's value for column #, numbered from 1
	O#	previous output row's value for column #, numbered from 0
	nc	column number of last column in this row, numbered from 1
	NC	column number of last column in this row, numbered from 0
By default, input lines are split on each tab and joined with a single tab.
# must be in the form of an integer, positive or negative.
x and y must be in the form #, c#, C#, or nc.
Named columns are those with an argument of the form #, c# or C#.
Arguments starting like "c#=" are taken as input column value
  preadjustments, and do not specify a column of output.
EOF

$int = '[-+]?\d+';

foreach (@ARGV) {
	$_ = '1:nc'  if $_ eq '@';
	$_ = '1::nc' if $_ eq '@@';
	$_ = 'nc:nc' if $_ eq 'nf' || $_ eq 'NF';
	push(@cols, '@_[' . &colindex($1) . '..' . &colindex($2) . ']'),  next
		if /^([cC]?$int|nc|NC):([cC]?$int|nc|NC)$/;
	push(@cols, '@_[grep(! $except{$_}, '
			  . &colindex($1) . '..' . &colindex($2) . ')]'), next
		if /^([cC]?$int|nc|NC)::([cC]?$int|nc|NC)$/;
	s/^$int(=|$)/c$&/;
	push(@adjs, &colcode($_)), next if /^c$int=/i;
	$except{ /^c($int)$/ ? $1 - 1 :
		 /^C($int)$/ ? $1     : undef } = 1;
	push(@cols, &colcode($_));
}

print STDERR 'Adjustments: ', join('; ', @adjs), "\n",
	     'Columns: ',     join('; ', @cols), "\n" if $d;

($spatt, $jchar) = ('\t',  "\t");
($spatt, $jchar) = (' ',   ' ' ) if $b;
($spatt, $jchar) = ('\t+', "\t") if $t;
($spatt, $jchar) = ('\s+', "\t") if $s;
while (<STDIN>) {
	chomp;
	@_  = split(/$spatt/, $_ . 'EOL'); $_[$#_] =~ s/EOL$//;
	@__ = @_;
	map { eval $_ } @adjs;
	print join($jchar, @o = map { eval $_ } @cols), "\n";
	@i  = @__;
}

###############################################################################

sub colindex {
	local $_ = shift;
	return "($1-1)" if /^[a-z]?($int)$/x;
	return "($1)"	if /^[A-Z]($int) $/x;
	return '$#_'	if /^(nc|NC)     $/x;
	$_;
}

sub colcode {
	local $_ = join('', @_);
	s/\bc($int)\b/\$_[$1-1]/gx;
	s/\bC($int)\b/\$_[$1]/gx;
	s/\bi($int)\b/\$i[$1-1]/gx;
	s/\bI($int)\b/\$i[$1]/gx;
	s/\bo($int)\b/\$o[$1-1]/gx;
	s/\bO($int)\b/\$o[$1]/gx;
	s/\bnc     \b/(\$#_+1)/gx;
	s/\bNC     \b/\$#_/gx;
	$_;
}
