#!/usr/local/bin/perl
#
#  file: extract-edgar
#  auth: Brad Burdick
#  desc: extract the EDGAR data files from the tape feed
#
#  usage:  extract-edgar [-C] [-c corrdir] [-d datadir] [-e errdir]
#            [-v] [-w workdir] [input_file ...]
#
##########################################################################
#  Copyright (c) 1994, 1995 Internet Multicasting Service
#
#  The SEC EDGAR Level 1 Dissemination processing software ("software")
#  was developed by the Internet Multicasting Service and may 
#  be used for academic, research, government, and internal business
#  purposes without charge.  You may not resell this code or include it
#  in a product that you are selling without prior permission of the
#  Internet Multicasting Service.
#
#  This software is provided ``as is'', without express or implied
#  warranty, and with no support nor obligation to assist in its
#  use, correction, modification or enhancement.  We assume no liability
#  with respect to the infringement of copyrights, trade secrets, or any
#  patents, and are not responsible for consequential damages.  Proper
#  use of the software is entirely the responsibility of the user.
##########################################################################

# Bootstrap idiom from perlrun: perl never executes the exec (the statement
# is guarded by "if 0"), but a shell that ends up reading this file runs the
# eval line and hands the script over to perl.
eval 'exec /usr/bin/perl -s $0 ${1+"$@"}'
  if 0;

# program name used in diagnostics: $0 with any leading directories removed
$prog = $0;
$prog =~ s#.*/##;

# local libraries are searched after the directories already in @INC
push(@INC, '/usr/local/ims/lib');

# command line option parsing (&Getopts)
require 'getopts.pl';
# EDGAR general utility routines
require 'edgar-util.pl';

# parse the command line: -C and -v are flags; -c, -d, -e and -w each take
# a value.  Results are read back below from $opt_C, $opt_c, $opt_d, ...
&Getopts('Cc:d:e:vw:');

# -C: extract correction submissions only
$corrections_only = defined($opt_C);

# -d: top-level directory that receives every submission
$datadir = "/in/edgar";
$datadir = "$opt_d" if defined($opt_d);
&makepath($datadir, 0775);	# presumably creates missing directories; see edgar-util.pl

# -v: report progress on stdout
$verbose = defined($opt_v);

# -w: subdirectory of $datadir for normal submissions
$workdir = "$datadir/" . (defined($opt_w) ? $opt_w : "work");
&makepath($workdir, 0775);

# -c: subdirectory of $datadir for correction submissions
$corrdir = "$datadir/" . (defined($opt_c) ? $opt_c : "corrections");
&makepath($corrdir, 0775);

# -e: subdirectory of $datadir for exception (error) submissions
$errdir = "$datadir/" . (defined($opt_e) ? $opt_e : "exceptions");
&makepath($errdir, 0775);

# parser state:
#   @submission - lines of the submission collected so far
#   $correction - true while the current submission is a correction
#   $in_sub     - true while inside <SUBMISSION> ... </SUBMISSION>
@submission = ();
$in_sub = $correction = 0;

# no input files named: read standard input instead
push(@ARGV, "<&STDIN") unless @ARGV;

#
# Main loop: scan every input for <SUBMISSION> ... </SUBMISSION> blocks and
# write each block to a file of its own:
#
#   normal submission                 -> $workdir/<accession>.nc
#   correction submission             -> $corrdir/<accession>.corrNN
#   chosen output file already exists -> $errdir/<accession>.dupNN
#   accession number missing          -> $errdir/unknown.errNN
#
# The final names of the .corr/.dup/.err files come from &get_next_file
# (edgar-util.pl), which is handed the first candidate name (suffix 01).
#
# Expected header layout after the <SUBMISSION> line:
#   <CORRECTION>            (optional)
#   time stamp  nn:nn       (optional, only looked for after <CORRECTION>)
#   <ACCESSION-NUMBER>...   (required)
#
foreach $file (@ARGV) {
	# two-argument open is deliberate: it is what lets the "<&STDIN"
	# pseudo file name dup standard input
	open(IN, "$file") || die "$prog: error getting input: $!\n";

	# reset the submission and clear flags
	@submission = ();
	$in_sub = $correction = 0;

	# will block on input from STDIN if empty
LINE:
	while ($line = <IN>) {
		# chomp, not chop: a last line without a newline must keep its
		# final character (e.g. the '>' of a closing </SUBMISSION>)
		chomp($line);

		#
		# assumes SUBMISSIONs are not nested
		# ignores junk outside of <SUBMISSION> ... </SUBMISSION> nest
		#
		if (! $in_sub) {
			next LINE unless ($line =~ /<SUBMISSION>/);  # start of submission

			$in_sub = 1;
			push(@submission, '<SUBMISSION>');     # save <SUBMISSION> tag
			chomp($line = <IN>);

			#
			# is this a correction?
			#
			if ($line =~ /^<CORRECTION>/) {
				$correction = 1;
				push(@submission, $line);   # <CORRECTION> tag (optional)
				chomp($line = <IN>);

				if ($line =~ /\d+:\d+/) {  # time stamp (optional)
					push(@submission, $line);
					chomp($line = <IN>);
				}
			}

			#
			# we'll use the accession number as a file name for now.
			#
			$have_accno = 0;
			if ($line =~ /^<ACCESSION-NUMBER>(\S+)/) {
				$have_accno = 1;
				$accno = $1;
				push(@submission, $line);

				if ($correction) {
					$outfile = &get_next_file("$corrdir/$accno.corr01");
					print "Processing $outfile ...\n" if $verbose;
				} elsif ($corrections_only) {
					# -C: skip just this submission.  With $in_sub
					# cleared, the rest of it is ignored as junk and
					# later corrections in the same input are still seen.
					$in_sub = 0;
					@submission = ();
					next LINE;
				} else {
					$outfile = "$workdir/$accno.nc";
				}

				if (-e $outfile) {
					warn "$prog: $outfile already exists!!\n";
					$outfile = &get_next_file("$errdir/$accno.dup01");
					print "Processing $outfile ...\n" if $verbose;
				}
			} else {                        # error - accession # MUST be next
				# no usable accession number: do not reuse the one left
				# over from the previous submission
				$outfile = &get_next_file("$errdir/unknown.err01");
				print "Processing $outfile ...\n" if $verbose;
			}

			open(OUT, ">$outfile") || die "$prog: $outfile: $!\n";

			# The header line has been stored (or input is exhausted).
			# Otherwise fall through, so the unexpected line is kept in
			# the exception file and a </SUBMISSION> here is honoured.
			next LINE if ($have_accno || !defined($line));
		}

		if ($line =~ /^<\/SUBMISSION>/) {       # end of submission
			$in_sub = $correction = 0;
			push(@submission, '</SUBMISSION>');

			# save the submission to $outfile; buffered write errors
			# only surface at close, so check both
			(print OUT join("\n", @submission), "\n")
				|| die "$prog: $outfile: $!\n";
			close(OUT) || die "$prog: $outfile: $!\n";

			# reset the submission and clear flags
			@submission = ();
		} else {
			push(@submission, $line);         # save the line
		}
	}

	# input ended inside a submission: nothing has been written to $outfile
	if ($in_sub) {
		warn "$prog: $file: unterminated submission, $outfile left empty\n";
		close(OUT);
	}

	close(IN);
}

exit 0;

