#!/usr/bin/perl -w

######################################################################
# 
# Program: fetch_unanswered.pl
#
#       Retrieve articles from to which no reply has yet been posted.  
#       Assumes that arguments passed to program are newsgroup names.   
#       Articles are all printed to the standard output.
#
#       options:
#               -j Turn off threading of articles by subject.
#               -m <limit>              Look back at most <limit> headers/nov.
#               -n <limit>              Fetch at most <limit> NOV records
#                                       with one request to server.
#               -s <news server name>   override default news server 
#               -x <pattern>            exclude articles with header
#                                       matching pattern.
#
# Please send any comments to: RonaldWS@software-path.com
#
# A version with a reply feature exists.  The reply feature is not included
# here since it requires about 200 lines of additional unrelated code and 
# belongs in a separate script. CPAN script submission currently requires 
# that "It must be a single file ...". Contact the author if interested in 
# the reply feature.
#
# Written by: Ronald Schmidt, The Software Path
#
######################################################################

use strict;
use News::NNTPClient;
use Getopt::Std;

my $VERSION = '0.25';

use vars qw($opt_m $opt_j $opt_n $opt_s $opt_x $VERSION);

# server will be set to (in order of decreasing priority)
#       -s command line parameter
#       NNTPSERVER environment variable
#       /etc/nntpserver
#       default set here
my $server;
my $default_server = '"set default_server or use -s parameter"';

my $xover_batch_size = 500;
my $exclude_regex;

######################################################################
# "Nice to have" enhancements:
#       support for newnews
#       time estimation
#       FAQ filtering option/kill file.
######################################################################

my $news_client;
my %unanswered;
my %record_dup_subj;

######################################################################
# Print a status message to STDERR.  If caller does not provide
# line termination then terminate line with time stamp and LF("\n").
######################################################################
sub post_console_message {
        print STDERR @_;
        print STDERR " (", scalar(localtime()), ")\n" unless (
                $_[$#_] =~ /\n/         # Last parm has LF.
        );
}

######################################################################
# Here we remove messages with subjects that look like replies and
# begin to track groups of messages with the same subject.
# User may request no filter by subject.
######################################################################
sub FilterSubject {
        my $msg_id = shift;
        my $subj = lc(shift);
        my $has_ref = shift;

        $subj =~ s/^\s*//;
        $subj =~ s/\s*$//;

        # if subject filtering remove msgs with subject that looks like reply
        delete $unanswered{$msg_id} if (
                ($subj =~ s/^re(\:?)\s+//) && (! $has_ref)
        );

        # List of message id's by subject.  Advanced technique - sorry!
        push @{$record_dup_subj{$subj}}, $msg_id;
}

######################################################################
# Look through duplicate subject hash for cases where multiple messages
# had the same subject and remove their message id's from the unanswered
# list.
######################################################################
sub RemoveDuplicateSubject {
        foreach my $msg_id_lh (values %record_dup_subj) {
                if (scalar(@$msg_id_lh) > 1) {
                        foreach my $dup_msg_id (@$msg_id_lh) {
                                delete $unanswered{$dup_msg_id};
                        }
                }
        }
}

######################################################################
# Use NNTP XOVER request to fetch header information needed to
# determine which articles have not yet received a response.
# This is one of the more efficient approaches.
######################################################################
sub SetUnansweredXover {
        my ($news_client, $first_num, $last_num, $batch_size) = @_;

        my ($batch_first, $batch_last);
        my $overview_fmt;
        my ($i, %overview_fields, $id_field, $ref_field, $subject_field);
        my @all_ref;

        $overview_fmt = $news_client->list('overview.fmt');
        die $news_client->message() unless ($news_client->ok());
        
        %overview_fields = map((uc($_), $i++), 
                grep(s/\s*$//, @$overview_fmt));
        $id_field = $overview_fields{'MESSAGE-ID:'};
        $ref_field = $overview_fields{'REFERENCES:'};
        $subject_field = $overview_fields{'SUBJECT:'};

        for (   $batch_first = $first_num,
                $batch_last = $first_num + $batch_size -1;
                $batch_first < $last_num;
                $batch_first = $batch_last + 1,
                $batch_last = $batch_first + $batch_size -1
        ) {
                $batch_last = $last_num if ($batch_last > $last_num);
                foreach my $xover_line 
                        ($news_client->xover("${batch_first}-${batch_last}")) {
                        my ($msg_num, $msg_id, $ref, $subject) =
                                (split /\t/, $xover_line)
                                 [0, $id_field +1, 
                                        $ref_field +1, $subject_field +1];
                        my $has_ref = (defined($ref) && $ref);
                        if ($has_ref) {
                                foreach my $ref_id (split(' ', $ref)) {
                                        delete $unanswered{$ref_id};
                                }
                        }
                        else {
                                $unanswered{$msg_id} = $msg_num;
                        }
                        FilterSubject($msg_id, $subject, $has_ref) 
                                unless ($opt_j);
                }
                post_console_message 'Processed requests for ', 
                        $batch_last - $first_num +1,
                        " NOV records of ", $last_num - $first_num +1, '.';
        }
}

######################################################################
# Fetch each article header, one at a time, to determine which
# articles have not yet received any response.
# This is a very inefficient approach but does not require any
# NNTP extension services.
######################################################################
sub SetUnansweredHead {
        my ($news_client, $first_num, $last_num) = @_;

        my ($article_num, $err_count);
        my $i = 0;
        
        for (   $article_num = $first_num; 
                $article_num <= $last_num;
                $article_num++) {
                my $head;
                my ($msg_id, $ref_id);

                post_console_message("counting heads: $i") if ((++$i % 100)==0)
;       
                $head = $news_client->head($article_num);
                unless ($news_client->ok()) {
                        $err_count++ if (
                                $news_client->message() !~ 
                                        /bad article number/i
                        );
                        next;
                }

                ($msg_id) = grep(/Message\-ID\:/i, @$head);
                ($msg_id) = ($msg_id =~ /Message\-ID\: (\<.*?\>)/i);
                ($ref_id) = grep(/References\:/i, @$head);
                if (defined $ref_id) {
                        ($ref_id) = ($ref_id =~ /References\: (\<.*?\>)/i);
                        delete $unanswered{$ref_id};
                }
                else {
                        $unanswered{$msg_id} = $article_num;
                }
                unless ($opt_j) {
                        my ($subject) = grep(/Subject\:/i, @$head);
                        ($subject) = ($subject =~ /Subject: (.*)/i);
                        FilterSubject($msg_id, $subject, defined($ref_id));
                }
        }
        post_console_message("counting heads: $i") unless (($i % 100)==0);
        post_console_message("*Warning* errors: $err_count.") if ($err_count);
}

######################################################################
# Here we expend too much effort to be platform independent.
# We really should `cat ...`
######################################################################
sub read_etc_nntpserver {
        my $rc;

        open(FH, '</etc/nntpserver') || return undef;
        $rc = scalar(<FH>);
        close(FH);
        $rc =~ s/\s*$//;
        return $rc || undef;
}

######################################################################
# Decide whether to exclude messages.
######################################################################
sub excluded {
        my $msg = shift;

        if ($exclude_regex) {
                my $header = '';
                for (    my $i = 0;
                         $i < scalar(@$msg) && $msg->[$i] !~ /^\s*$/;
                         $i++) {
                         $header .= $msg->[$i]; 
                }
                return ($header =~ $exclude_regex);
        }
        else {
                return 0;
        }
}

######################################################################
# Fetch unanswered articles for one news group.
######################################################################
sub fetch_group_unanswered {
        my $group = shift;

        # counts for articles that will not be printed
        my ($exclude_count, $unavail_count) = (0, 0);
        # get news article number range
        my ($first_num, $last_num) = $news_client->group($group);
        die $news_client->message() unless ($news_client->ok());
        $first_num = $last_num - $opt_m +1 if ($opt_m && ($opt_m =~ /^\d+$/));

        # Test scaffolding.  Under Linux this forces overview analysis to fail.
#       $news_client->quit();
#       $news_client = new News::NNTPClient($server);
#       $news_client->debug(0);

        post_console_message('Finding unanswered articles.');

        %unanswered = ();
        %record_dup_subj = ();
        
        ######################################################################
        # The actual work of deciding which articles for the group are
        # unanswered is done here.
        ######################################################################
        eval {
                SetUnansweredXover(
                        $news_client, $first_num, $last_num, $xover_batch_size
                );
        };
        if ($@) {
                post_console_message 'Xover failed; trying one message at a ',
                        'time.  This may take a while.', "\n";

        # more test scaffolding
#              $news_client->mode_reader(); 
#              $news_client->group($group);

                SetUnansweredHead($news_client, $first_num, $last_num);
        }

        unless ($opt_j) {
                RemoveDuplicateSubject();
                %record_dup_subj = ();  # free what may be substantial memory
        }

        ######################################################################
        # End of "find unanswered" code block.
        ######################################################################
        post_console_message('Done finding unanswered articles.');
        post_console_message('Fetching ', scalar(keys %unanswered),
                ' unanswered articles.');

        # Fetch each unanswered article from the news server
        # and print it to the standard output.
        foreach my $article_id (
                sort {$unanswered{$b} <=> $unanswered{$a}} keys(%unanswered)
        ) {
                my $msg = $news_client->article($article_id);
                if (! $msg) {
                        $unavail_count++;
                }
                elsif (excluded($msg)) {
                        $exclude_count++;
                }
                else {
                        print @$msg;
                }
        }
        post_console_message(
                "Excluded $exclude_count messages based on pattern."
        ) if ($exclude_count);
        post_console_message("$unavail_count messages were unavailable.")
                if ($unavail_count);
        post_console_message('A total of ',
                scalar(keys %unanswered) - $exclude_count - $unavail_count,
                ' available matching messages printed.')
                if ($exclude_count || $unavail_count);
}

######################################################################
# Start of program.
######################################################################

# process command line options
getopts("jm:n:rs:x:");
unless (@ARGV) {
        print <<EOT;
Usage: fetch_unanswered.pl [options] newsgroup [newsgroup ...]
        see perldoc fetch_unanswered.pl for options
        (use fetch_unaswered-${VERSION}.pl where appropriate)
EOT
        exit(0);
}
$server = $opt_s if (defined($opt_s));

$xover_batch_size = $opt_n if (defined($opt_n));

$server = $ENV{'NNTPSERVER'} if (
        (! defined($server))            &&
        $ENV{'NNTPSERVER'}
);
$server = read_etc_nntpserver() if (
        (! defined($server))            &&
        (-r '/etc/nntpserver')
);
$server = $default_server unless(defined $server);

if ($opt_x) {
        $exclude_regex = eval{qr/$opt_x/im;};
        if ($@) {
                print "Failed regex interpretation of exclude pattern: $@\n";
                exit(-1);
        }
}

# connect to news server
$news_client = new News::NNTPClient($server);
unless ($news_client->ok()) {
        $news_client->quit();
        die $news_client->message();
}

$news_client->debug(0);
$news_client->mode_reader();

foreach my $news_group (@ARGV) {
        eval{fetch_group_unanswered($news_group);};
        print STDERR $@ if($@);
}

post_console_message('Done.');

$news_client->quit();


=head1 NAME

fetch_unanswered.pl - Retrieve news articles that do not have a reply. 

=head1 README

Retrieve usenet news articles to which no reply has yet been posted.

=head1 DESCRIPTION

Retrieve articles from newsgroups to which no reply has yet been posted.  
Newsgroup names are passed as command line arguments to the program.
Articles are all printed to the standard output and status messages are 
printed to STDERR.

Usage: fetch_unanswered.pl [options] newsgroup [ngroup ...]

=head1 COMMAND LINE OPTIONS

=over 4

=item -j

Turn off threading of articles by subject.  Turning this off also
saves (some) time and memory.  Article threading eliminates
articles starting with 'Re:' and groups of articles with the
same subject.

=item -m E<lt>Max headers to look back.E<gt>

Look back at most -m headers/nov records.

=item -n E<lt>NOV record batch sizeE<gt>

Limit number of NOV records we read from server with one
request.  A small number will result in more frequent
feedback to the user.   

=item  -s E<lt>news server nameE<gt>

Override default news server.

        Default is: (in order of decreasing priority)
        value of NNTPSERVER environment variable
        value from /etc/nntpserver file
        value set at start of fetch_unanswered.pl source code.

=item -x E<lt>patternE<gt>

Exclude messages with header matching pattern provided.  Pattern
is interpreted as a Perl regex with multiline and case insensitive
options turned on.  Exclusion is done on unanswered messages after
they have been identified.

 E.G. to remove comp.lang.perl.misc FAQ postings try
 -x'from: perlfaq server'

=back

=head1 PREREQUISITES

This script requires C<Getopt::Std> and C<News::NNTPClient>.

=pod OSNAMES

any

=pod SCRIPT CATEGORIES

News

=cut


