#!/usr/local/bin/perl
#
#  file: split-edgar
#  auth: Brad Burdick
#  desc: split SEC EDGAR data file into SGML header information file and
#        document text file.
#
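#  usage:  split-edgar [-d datadir] [-e errdir] [-v] [-w workdir]
#            [input_file(s)]
#
#          datadir defaults to /in/edgar.  errdir (default "exceptions")
#          and workdir (default "work") are taken relative to datadir.
#          Input is read from stdin when no input_file is given.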
#
##########################################################################
#  Copyright (c) 1994, 1995 Internet Multicasting Service
#
#  The SEC EDGAR Level 1 Dissemination processing software ("software")
#  was developed by the Internet Multicasting Service and may 
#  be used for academic, research, government, and internal business
#  purposes without charge.  You may not resell this code or include it
#  in a product that you are selling without prior permission of the
#  Internet Multicasting Service.
#
#  This software is provided ``as is'', without express or implied
#  warranty, and with no support nor obligation to assist in its
#  use, correction, modification or enhancement.  We assume no liability
#  with respect to the infringement of copyrights, trade secrets, or any
#  patents, and are not responsible for consequential damages.  Proper
#  use of the software is entirely the responsibility of the user.
##########################################################################

# Fallback for systems that do not honour the #! line: a Bourne shell
# handed this file runs the eval and re-executes the script under perl,
# while perl itself never evaluates it because of the "if 0".
# The switch must be -S (search PATH for $0), not -s: -s turns on perl's
# own rudimentary switch parsing, which would strip -d/-e/-v/-w from the
# command line before Getopts ever sees them.
eval 'exec /usr/bin/perl -S $0 ${1+"$@"}'
  if 0;

# name used to prefix diagnostics: $0 with any leading directory
# components stripped
$prog = $0;
$prog =~ s#.*/##;

# where we find our local libraries
push(@INC, '/usr/local/ims/lib');

# for processing command line options
require 'getopts.pl';

# Edgar general utility routines
require 'edgar-util.pl';

# date stamp (YYYYMMDD, local time) for the IMS header.  localtime reports
# the year as an offset from 1900 and the month as 0..11, hence the
# adjustments below.
@date = localtime(time);
$datestamp = sprintf("%04d%02d%02d", 1900 + $date[5], 1 + $date[4], $date[3]);

# process command line options, if any.  Getopts (getopts.pl) sets
# $opt_<letter> for every switch it recognizes and returns false when it
# meets an unknown switch or a switch that is missing its argument; stop
# in that case rather than silently carrying on with default directories.
&Getopts('d:e:vw:') ||
	die "usage: $prog [-d datadir] [-e errdir] [-v] [-w workdir] [input_file(s)]\n";

# verbose output?
$verbose = defined($opt_v);

# top-level directory under which all submissions are placed
# (makepath comes from edgar-util.pl; presumably it creates the directory
# and any missing parents with the given mode -- confirm there)
$datadir = defined($opt_d) ? $opt_d : "/in/edgar";
&makepath($datadir, 0775);

# where to place normal submissions; -w names a directory below $datadir
$workdir = defined($opt_w) ? "$datadir/$opt_w" : "$datadir/work";
&makepath($workdir, 0775);

# where to place exception submissions (errors and duplicates);
# -e names a directory below $datadir
$errdir = defined($opt_e) ? "$datadir/$opt_e" : "$datadir/exceptions";
&makepath($errdir, 0775);

# parsing state shared by the main loop below

$accno    = '';         # base file name (accession # for now)
@header   = ();         # header lines collected so far
@document = ();         # document lines collected so far
$in_hdr   = 0;          # true while processing header text
$in_doc   = 0;          # true while processing document text

# no input file named on the command line: read standard input instead
unless (@ARGV) {
	@ARGV = ("<&STDIN");
}

#
# flush_header - terminate the buffered header with </IMS-HEADER>, write it
# to the currently open OUT file, then empty the buffer and close OUT so
# that a failed write is reported instead of being lost.
#
sub flush_header {
	push(@header, "</IMS-HEADER>");
	print OUT join("\n", @header), "\n";
	@header = ();
	close(OUT) || die "$prog: $outfile: $!\n";
}

#
# flush_document - write the buffered document text to the currently open
# OUT file, then empty the buffer and close OUT.
#
sub flush_document {
	print OUT join("\n", @document), "\n";
	@document = ();
	close(OUT) || die "$prog: $outfile: $!\n";
}

#
# Main loop: split every <SUBMISSION> ... </SUBMISSION> found in the input
# into a header file ($accno.hdr.sgml) and a document text file
# ($accno.txt) under $workdir.  Files that already exist, and submissions
# without a usable accession number, are written under $errdir instead,
# using names obtained from get_next_file (edgar-util.pl).
#
foreach $file (@ARGV) {
	# two-argument open on purpose: the default input is the "<&STDIN" dup
	open(IN, $file) || die "$prog: $file: $!\n";

	# will block if input is STDIN and empty...
	while (defined($line = <IN>)) {
		# chomp, not chop: a final line without a newline must keep its
		# last character
		chomp($line);
		#
		# assumes SUBMISSIONs are not nested
		# ignores junk outside of <SUBMISSION> ... </SUBMISSION> nest
		#
		if (! $in_hdr && ! $in_doc) {
			next unless $line =~ /<SUBMISSION>/;        # start of header

			$in_hdr = 1;

			# never carry the previous submission's accession number over
			$accno = '';
			$no_accno = 0;

			# the accession number MUST be on the next line; treat EOF
			# here as an empty line
			$line = <IN>;
			$line = '' unless defined($line);
			chomp($line);

			#
			# we'll use the accession number as a file name for now, so
			# take only the value itself (no trailing blanks or CR) and
			# refuse one that contains a path separator.
			#
			if ($line =~ /<ACCESSION-NUMBER>(\S+)/ && index($1, '/') < 0) {
				$accno = $1;

				$outfile = "$workdir/$accno.hdr.sgml";

				if (-e $outfile) {
					warn "$prog: $outfile already exists!!\n";
					$outfile = &get_next_file("$errdir/$accno.hdr.dup01");
					print "Processing $outfile ...\n" if $verbose;
				}
			} else {                        # error - accession # MUST be next
				$no_accno = 1;
				$outfile = &get_next_file("$errdir/$accno.hdr.err01");
				print "Processing $outfile ...\n" if $verbose;
			}

			$ims_hdr = "<IMS-HEADER>$accno.hdr.sgml : $datestamp";
			push(@header, $ims_hdr);
			push(@header, $line);

			open(OUT, ">$outfile") || die "$prog: $outfile: $!";

		} elsif ($in_hdr && $line =~ /<DOCUMENT>/) {    # end of header
			$in_hdr = 0;
			$in_doc = 1;

			# save the header to $outfile
			&flush_header();

			if ($no_accno) {
				# no accession number: keep the text with the exceptions
				# instead of creating a nameless file in $workdir
				$outfile = &get_next_file("$errdir/$accno.txt.err01");
				print "Processing $outfile ...\n" if $verbose;
			} else {
				$outfile = "$workdir/$accno.txt";
				if (-e $outfile) {
					warn "$prog: $outfile already exists!!\n";
					$outfile = &get_next_file("$errdir/$accno.txt.dup01");
					print "Processing $outfile ...\n" if $verbose;
				}
			}
			open(OUT, ">$outfile") || die "$prog: $outfile: $!";

			# now process the document text
			push(@document, $line);

		} elsif ($line =~ m#</SUBMISSION>#) {   # end of document(s)
			if ($in_hdr) {
				# submission ended without any <DOCUMENT>: the header is
				# all there is, so save it rather than letting it leak
				# into the next submission's header
				&flush_header();
			} else {
				# save the document(s) to $outfile
				&flush_document();
			}
			$in_hdr = 0;
			$in_doc = 0;

		} elsif ($in_hdr) {
			push(@header, $line);           # save the header line
		} else {
			push(@document, $line);         # save the document line
		}
	}

	close(IN);
}

# input ended inside a submission: save what was collected instead of
# silently dropping it
if ($in_hdr) {
	warn "$prog: $outfile: input ended inside submission header\n";
	&flush_header();
} elsif ($in_doc) {
	warn "$prog: $outfile: input ended before </SUBMISSION>\n";
	&flush_document();
}

exit 0;

