#!/usr/local/bin/perl
#
#  file: process-edgar
#  auth: Brad Burdick
#  desc: post-process SEC EDGAR SGML header file
#
#  usage:  process-edgar [-a] [-d datadir] [-e errdir] [-w workdir]
#            [input_file ...]
#
##########################################################################
#  Copyright (c) 1994, 1995 Internet Multicasting Service
#
#  The SEC EDGAR Level 1 Dissemination processing software ("software")
#  was developed by the Internet Multicasting Service and may 
#  be used for academic, research, government, and internal business
#  purposes without charge.  You may not resell this code or include it
#  in a product that you are selling without prior permission of the
#  Internet Multicasting Service.
#
#  This software is provided ``as is'', without express or implied
#  warranty, and with no support nor obligation to assist in its
#  use, correction, modification or enhancement.  We assume no liability
#  with respect to the infringement of copyrights, trade secrets, or any
#  patents, and are not responsible for consequential damages.  Proper
#  use of the software is entirely the responsibility of the user.
##########################################################################

eval 'exec /usr/bin/perl -s $0 ${1+"$@"}'
  if 0;

# who am i?  (basename of the script, used to prefix diagnostics)
($prog = $0) =~ s#.*/##;

# allow local libraries
push(@INC, '/usr/local/ims/lib');

# for processing command line options
require 'getopts.pl';

# Edgar SGML description info
require 'edgar-desc.pl';

# Edgar general utility routines
require 'edgar-util.pl';

# process command line options, if any
#   -a          ASCII processing (currently the only processing type)
#   -d datadir  base directory
#   -e errdir   exceptions directory, relative to the base directory
#   -w workdir  work directory, relative to the base directory
# 'e:' has to be listed here: $opt_e is only set for options named in the
# specification, so without it the documented -e switch could never take
# effect.
&Getopts('ad:e:w:');

# what type of processing?
$do_ascii = defined($opt_a);

# type of processing specified?
if (! $do_ascii) {
	die "$prog: no processing type specified: Exiting ...\n";
}

# base directory
$datadir = defined($opt_d) ? "$opt_d" : "/in/edgar";

# where to place normal submissions
$workdir = defined($opt_w) ? "$datadir/$opt_w" : "$datadir/work";
&makepath($workdir, 0775);

# where to place exception submissions (errors)
$errdir = defined($opt_e) ? "$datadir/$opt_e" : "$datadir/exceptions";
&makepath($errdir, 0775);

# date stamp for header (YYYYMMDD, local time)
@date = localtime;
$datestamp = sprintf("%04d%02d%02d", $date[5]+1900, $date[4]+1, $date[3]);

# accession number (used as base file name for now)
$accno = '';

# header text
@header = ();

# are we processing a header?
$in_hdr = 0;

# take data from stdin if no file provided; "<&STDIN" is the sentinel the
# header loader is handed in place of a file name
if ($#ARGV < 0) {
	push(@ARGV, "<&STDIN");
}

#
#  main processing loop
#
foreach $file (@ARGV) {
	&load_hdr($file, *header);

	# get accession number.  Reset it first: otherwise a header that lacks
	# one would silently reuse the number left over from the previous input
	# file and re-wrap that document a second time.
	$accno = '';
	for (@header) {
		# capture the value directly; trailing whitespace after the number
		# no longer leaves the whole tagged line in $accno
		next unless (/<ACCESSION-NUMBER>(\S+)/);
		$accno = $1;
		last;
	}
	if ($accno eq '') {
		die "$prog: no accession number found in $file\n";
	}

	if ($do_ascii) {
		# The document is rewritten in place: it is read from, and the
		# wrapped result is written back to, the same file in the work
		# directory.
		my $docfile = "$workdir/$accno.txt";
		my @document = ();
		local(@newheader) = ();    # package array: passed by typeglob below
		local($/) = undef;         # slurp mode for the document read

		$outfile = $docfile;

		&process_ascii(*edgar_desc, *header, *newheader);

		# slurp in the whole document file, then release the handle before
		# the same path is reopened for writing
		open(DOC, '<', $docfile) || die "$prog: unable to open $docfile: $!\n";
		@document = <DOC>;
		close(DOC);

		# open data file
		open(TEXT, '>', $outfile) || die "$prog: unable to open $outfile: $!\n";

		print TEXT "<IMS-DOCUMENT>", "$accno.txt : $datestamp\n";
		print TEXT join("\n", @newheader), "\n";
		print TEXT @document;
		print TEXT "</IMS-DOCUMENT>\n";

		# buffered write errors only surface at close time
		close(TEXT) || die "$prog: error writing $outfile: $!\n";

		# re-sign the modified document.  The list form of system() bypasses
		# the shell, so characters in the (data-derived) file name cannot be
		# interpreted as shell syntax.
		if (system('/usr/local/ims/bin/sign-doc', $outfile) != 0) {
			warn "$prog: sign-doc failed for $outfile (status $?)\n";
		}
	}

	$in_hdr = 0;
	@header = ();
}

exit 0;


#
# load SGML header
#
sub load_hdr {
	# Read a whole SGML header file into the caller's array, one element
	# per line with the newlines removed (trailing empty lines are dropped
	# by split).
	#
	#   $file    path of the header file; the sentinel "<&STDIN" (what the
	#            main program supplies when no file is named) or "-" reads
	#            standard input instead
	#   *header  typeglob naming the array to fill
	#
	# Dies if the input cannot be opened.  Returns nothing.
	local($file) = shift;
	local(*header) = shift;
	local($/) = undef;        # slurp mode: one read returns the whole file
	my $text;

	if ($file eq '<&STDIN' || $file eq '-') {
		# constant mode string, so nothing from the caller is interpreted
		open(IN, '<&STDIN') || die "$prog: unable to open $file: $!\n";
	} else {
		# three-argument open: the name is taken literally, so leading or
		# trailing whitespace and characters such as '>' or '|' in it are
		# not treated as open modes
		open(IN, '<', $file) || die "$prog: unable to open $file: $!\n";
	}

	# slurp in the whole header file
	$text = <IN>;
	close(IN);

	@header = defined($text) ? split(/\n/, $text) : ();
	return;
}


#
#  create a more human-readable header file
#
#  format of description info is:
#    tag text|replacement text|end nest text
#
sub process_ascii {
	# Build a more readable rendering of an SGML header.
	#
	#   *desc       typeglob of the description array; each entry has the
	#               form "tag text|replacement text|end nest text"
	#   *header     typeglob of the input header lines (left unmodified)
	#   *newheader  typeglob of the array the rendered lines are appended to
	#
	# For each header line the first matching description entry decides:
	#   - a line equal to "<end nest text>" closes a nest: the indent level
	#     drops by one (never below zero) and the line itself is discarded;
	#   - a line starting with "<tag text>" is emitted as label + value.
	#     The label is the replacement text or, when that is empty, the tag
	#     with dashes turned into spaces.  A tag that opens a nest gets a
	#     leading blank line and raises the indent; any other tag is
	#     indented with tabs and followed by a tab.
	# Lines matching no entry are copied through unchanged.
	#
	# <ITEMS>, <ACT> and <ASSIGNED-SIC> values are translated through the
	# global hashes %item_desc, %sec_codes and %sic_codes, which are not
	# defined in this file (presumably supplied by edgar-desc.pl -- confirm).
	local(*desc) = shift;
	local(*header) = shift;
	local(*newheader) = shift;
	my $indent = 0;

	foreach my $line (@header) {
		my $found = 0;

		foreach my $entry (@desc) {
			my ($tag, $rep, $endnest) = split(/\|/, $entry);

			# closing tag of a nest; entries without end-nest text can
			# never close anything
			if (defined($endnest) && $endnest ne '' && $line eq "<$endnest>") {
				$indent-- if ($indent > 0);
				$found = 1;
				last;
			}

			next unless ($line =~ /^<$tag>/);

			# label: explicit replacement text, else the tag itself with
			# dashes turned into spaces
			my $label;
			if ($rep) {
				$label = $rep;
			} else {
				($label = $tag) =~ tr/-/ /;
			}

			if ($endnest) {    # true if this tag starts a nest
				$label = join("", "\n", "\t" x $indent, $label);
				$indent++;
			} else {
				$label = join("", "\t" x $indent, "$label\t");
			}

			# work on a copy: $line aliases the caller's array element and
			# the header must not be rewritten as a side effect
			my $value = $line;

			if ($value =~ /^<ITEMS>/) {
				# $1, not \1: inside an evaluated replacement \1 is a
				# reference to the constant 1, so the lookup never hit
				$value =~ s/<ITEMS>(.*)/$item_desc{$1}/;
			} elsif ($value =~ /^<ACT>/) {
				$value =~ s/<ACT>(.*)/$sec_codes{$1}/;
			} elsif ($value =~ /^<ASSIGNED-SIC>/) {
				$value =~ s/<ASSIGNED-SIC>(.*)/$1/;
				$value = "$sic_codes{$value} [$value]";
			} else {
				# strip only the leading tag; a greedy <.*> would also eat
				# any '>' that appears inside the value
				$value =~ s/^<[^>]*>//;
			}

			push(@newheader, join("", $label, $value));
			$found = 1;
			last;
		}

		push(@newheader, $line) unless ($found);
	}
}

