#!/usr/bin/perl -s

# tsvdesc - describe columns of tab-separated text
# Steve Kinzler, steve@kinzler.com, Jan 05/Mar 08
# https://kinzler.com/me/home.html#unix

# Help text for the command-line switches.  $h is set by perl's -s switch
# parsing (see the #! line) when the script is invoked with -h; in that
# case the text is emitted through die and the script stops.
$usage = <<"EOT";
usage: $0 [ -t ] [ -d ] [ -n ] [ file ... ]
	-t	interpret the first row as column titles
	-d	don't count duplicates in the columns
	-n	leave <TAB>, <NL> and <CR> in strings as literal
EOT
if ($h) {
	die $usage;
}

# Main pass: read every input line (files named on the command line, or
# stdin), split it on tabs and accumulate per-column statistics.  All of
# the hashes below are keyed by 1-based column number:
#	%quot	values that were wrapped in double quotes
#	%spac	values that carried leading and/or trailing whitespace
#	%dups	values already seen earlier in the same column
#	%maxl	longest value length (after unquoting and trimming)
#	%minl	shortest value length (after unquoting and trimming)
# $ncol is the widest row seen, $nrows the number of data rows.

# The usage text documents -d for suppressing the duplicate count, but
# only $u (i.e. -u) used to be tested, so -d silently did nothing.
# Honor both so -d works as documented and -u keeps working.
$nodups = $d || $u;
$nrows	= 0;

while (<>) {
	s/[\r\n]*$//;

	# A LIMIT of -1 keeps trailing empty fields; without it split drops
	# them and empty last columns never reach the statistics (so their
	# minlen would be overstated).
	my @fields = split(/\t/, $_, -1);

	# With -t the very first input line supplies the column titles
	# and is not counted as data.
	if ($. == 1 && $t) {
		@lbl = @fields;
		next;
	}
	$nrows++;

	# Unless -n was given, turn the escape tokens back into the
	# control characters they stand for.
	unless ($n) {
		for (@fields) {
			s/<TAB>/\t/g;
			s/<NL>/\n/g;
			s/<CR>/\r/g;
		}
	}

	my $c = 0;
	foreach (@fields) {
		$c++;
		$quot{$c}++ if s/^"(.*)"$/$1/;

		# Trim both ends unconditionally.  Chaining the two
		# substitutions with || short-circuits and leaves trailing
		# whitespace in place whenever leading whitespace was found,
		# which inflates the measured length.
		my $lead  = s/^\s+//;
		my $trail = s/\s+$//;
		$spac{$c}++ if $lead || $trail;

		# %data remembers every "column,value" pair seen so far; it is
		# only maintained when duplicates are being counted.
		$dups{$c}++ if ! $nodups && $data{"$c,$_"}++;

		my $len = length($_);
		$maxl{$c} = &max($maxl{$c}, $len);
		$minl{$c} = &min($minl{$c}, $len);
	}
	$ncol = &max($ncol, $c);
}

# Report: total data rows, then one line per column.  A column without a
# (true) title from the -t row is labelled Column_N.  "+ 0" turns counters
# that were never set into a printed 0.
print $nrows + 0, " lines\n";
for $c (1 .. $ncol) {
	print shift @lbl || "Column_$c", "\tVARCHAR2(", $maxl{$c} + 0, ")\t",
	      "minlen ", $minl{$c} + 0, "\t",
	      $nodups ? () : ($dups{$c} + 0, " dups\t"),
	      $quot{$c} + 0, " quoted\t", $spac{$c} + 0, " spaced\n";
}

# max(a, b): return a when it is numerically greater than b, otherwise b.
# An undefined a compares as 0, so the first call for a column (where the
# stored value is still undef) yields b for any non-negative b.
sub max {
	my ($held, $cand) = @_;
	return $held if $held > $cand;
	return $cand;
}
# min(a, b): return a when it is defined and numerically less than b,
# otherwise b.  An undefined a means "no value yet", so b is returned
# rather than letting undef compare as 0.
sub min {
	my ($held, $cand) = @_;
	return $cand unless defined $held;
	return $held < $cand ? $held : $cand;
}
