#!/usr/local/bin/perl
#
#  file: correct-edgar
#  auth: Brad Burdick
#  desc: apply corrections to SEC EDGAR data file(s)
#
#  usage:  correct-edgar [-a] [-d datadir] [-v] [-w workdir] [input_file ...]
#
##########################################################################
#  Copyright (c) 1994, 1995 Internet Multicasting Service
#
#  The SEC EDGAR Level 1 Dissemination processing software ("software")
#  was developed by the Internet Multicasting Service and may 
#  be used for academic, research, government, and internal business
#  purposes without charge.  You may not resell this code or include it
#  in a product that you are selling without prior permission of the
#  Internet Multicasting Service.
#
#  This software is provided ``as is'', without express or implied
#  warranty, and with no support nor obligation to assist in its
#  use, correction, modification or enhancement.  We assume no liability
#  with respect to the infringement of copyrights, trade secrets, or any
#  patents, and are not responsible for consequential damages.  Proper
#  use of the software is entirely the responsibility of the user.
##########################################################################

eval 'exec /usr/bin/perl -s $0 ${1+"$@"}'
  if 0;

# who am i?
($prog = $0) =~ s#.*/##;

# where we find our local libraries
push(@INC, '/usr/local/ims/lib');

# for processing command line options
require 'getopts.pl';

# Edgar general utility routines
require 'edgar-util.pl';

# Edgar SGML description info
require 'edgar-desc.pl';

# date stamp for IMS header - century is hard-coded
@date = localtime;
$datestamp = sprintf("%04d%02d%02d", $date[5]+1900, $date[4]+1, $date[3]);

# process command line options, if any
&Getopts('ad:vw:');

# what type of processing?
$do_ascii = defined($opt_a);

# type of processing specified?
if (! $do_ascii) {
	die "$prog: no processing type specified:  Exiting ...\n";
}

# verbose output?
$verbose = defined($opt_v);

# where to place submissions
$datadir = defined($opt_d) ? "$opt_d" : "/ftp/edgar";
&makepath($datadir, 0775);

# current working area
$workdir = defined($opt_w) ? "$opt_w" : "/in/edgar/work";
&makepath($workdir, 0775);

# base file name (accession # for now)
$accno = '';

# subdirectory name (cik for now)
$cik = '';

# keep track of the number of corrections made
$corrections = 0;

# output file name
$hdrfile = '';

# document text
@document = ();

# header text
@header = ();

# are we processing a header?
$in_hdr = 0;

FILE: foreach $file (@ARGV) {

	# ignore filings for years we don't carry
	# file name format is 0000000000-99-000000, where -99- values are the 
	# filing year.
	next unless ($file =~ /^\d+-9[45]-\d+/);

	open(IN, "$file") || (warn "$prog: error getting input: $!", next FILE);

LINE: while ($line = <IN>) {
		chop($line);
		#
		# assumes SUBMISSIONs are not nested
		# ignores junk outside of <SUBMISSION> ... </SUBMISSION> nest
		#
		if (! $in_hdr) {
			if ($line =~ '<SUBMISSION>') {           # start of header
				$in_hdr = 1;

				chop($line = <IN>);

				#
				# is this a correction?
				#
				if ($line =~ '<CORRECTION>') {
					chop($line = <IN>);
					if ($line =~ /<TIMESTAMP>\d+:\d+/) {  # time stamp (optional)
						chop($line = <IN>);
					}
				}

				#
				# we'll use the accession number as a file name for now.
				#
				if ($line =~ '<ACCESSION-NUMBER>') {
					($accno = $line) =~ s/<ACCESSION-NUMBER>(\S+)/\1/;
				} else {                        # error - accession # MUST be next
					warn "$prog: $file: invalid format\n";
					next FILE;
				}

				$ims_hdr = "<IMS-HEADER>$accno.hdr.sgml : $datestamp";
				push(@header, $ims_hdr);
				push(@header, $line);
			}
		} elsif ($line =~ '<DOCUMENT>') {  # end of header
			local($dir) = 'data';

			$hdrfile = "$datadir/$dir/$cik/$accno.hdr.sgml";

			$in_hdr = 0;

			# make sure header exists
			if (! -e "$hdrfile") {
				# check for corrections in current feed
				$hdrfile = "$workdir/$accno.hdr.sgml";
				if (! -e "$hdrfile") {
					warn "$prog: $accno.hdr.sgml not found\n";
					@header = ();
					next FILE;
				}
			}

			push(@header, "</IMS-HEADER>");

			open(OUT, ">$hdrfile") ||
			  (warn "$prog: error opening $hdrfile: $!\n", next FILE);

			# save the header to $hdrfile
			print OUT join("\n", @header), "\n";

			# now process the document text
			if ($do_ascii) {
				local($txtfile);
				local(@newheader) = ();
				local(@newdoc) = ();
				local($/) = undef;

				($txtfile = $hdrfile) =~ s/hdr.sgml/txt/o;
				open(DOC, "$txtfile") || 
				  (warn "$prog: unable to open $txtfile: $!\n", next FILE);

				&process_ascii(*edgar_desc, *header, *newheader);

				print "Modifying $txtfile ...\n" if $verbose;

				# slurp in the whole document file
				@document = split("\n", <DOC>);

				# remove existing header
				&remove_header(*document, *newdoc);

				# open data file
				open(TEXT, ">$txtfile") ||
				  (warn "$prog: unable to open $txtfile: $!\n", next FILE);

				print TEXT "<IMS-DOCUMENT>", "$accno.txt : $datestamp\n";
				print TEXT join("\n", @newheader), "\n";
				print TEXT join("\n", @newdoc), "\n";
				print TEXT "</IMS-DOCUMENT>\n";

				# MUST do this before signing document to insure that all data
				# is flushed to the file
				close(TEXT);

				# resign the modified document
				system("/usr/local/ims/bin/sign-doc $txtfile");
			}

			$corrections++;

			$in_hdr = 0;

			# reset the array(s)
			@header = ();
			@document = ();

		} elsif ($in_hdr) {

			# TODO: fully automate this stuff...

			#
			# is this a deletion?
			#
			if ($line =~ /<DELETION>/) {
				&delete_submission($accno);
				$in_hdr = 0;
				# reset the array
				@header = ();
				@document = ();

				next FILE;
			} elsif ($line =~ /<DOCUMENT-|<TRANSFER-/) {
				warn "$prog: $file needs to be hand modified\n";
				$in_hdr = 0;
				# reset the array
				@header = ();
				@document = ();

				next FILE;
#			} elsif ($line =~ /<TRANSFER-ADD>|<TRANSFER-MODIFY>/) {
#				next LINE;  # eat the tag
#			} elsif ($line =~ /<TRANSFER-DELETE>/) {
#				local($tag) = pop(@header);
#				local($endtag);
#
#				($endtag = $tag) =~ s#<(.*)>#</\1>#;
#
#				while ($line = <IN>) {
#					chop($line);
#					next unless ($line eq $endtag);
#					next LINE;
#				}
			}

			#
			# grab the CIK
			#
			if ($line =~ /^<CIK>/) {
				($cik = $line) =~ s/<CIK>0*(.*)/\1/;
			}

			push(@header, $line);           # save the header line
		}
	}
}

# pet peeve... :)
print "$corrections correction", ($corrections == 1) ? " was " : "s were ",
  "applied\n";

exit 0;

#
#  create a more human-readable header file
#
#  format of description info is:
#    tag text|replacement text|end nest text
#
sub process_ascii {
	local(*desc) = shift;
	local(*header) = shift;
	local(*newheader) = shift;
	local($found) = 0;
	local($indent) = 0;
	local($line);
	local($endnest, $rep, $tag);

	# TODO: find a way to speed this up
	foreach $line (@header) {
DESC: for (@desc) {
			($tag, $rep, $endnest) = split(/\|/);
			if ($line eq "<$endnest>") {
				$indent-- if ($indent > 0);
				$found = 1;
				last;
			} elsif ($line =~ /^<$tag>/) {
				if ($rep) {
					$tag = $rep;
				} else {
					$tag =~ s/-/ /og;
				}

				if ($endnest) {    # true if this $tag starts a nest
					$tag = join("", "\n", "\t" x $indent, "$tag");
					$indent++;
				} else {
					$tag = join("", "\t" x $indent, "$tag\t");
				}

				if ($line =~ /^<ITEMS>/) {
					$line =~ s/<ITEMS>(.*)/$item_desc{\1}/e;
				} elsif ($line =~ /^<ACT>/) {
					$line =~ s/<ACT>(.*)/$sec_codes{\1}/e;
				} else {
					# strip out the tag info
					$line =~ s/<.*>(.*)/\1/;
				}

				push(@newheader, join("", $tag, $line));

				$found = 1;

				last DESC;
			}
		}

		if (! $found) {
			push(@newheader, $line);
		}
		$found = 0;
	}
}


#
# remove existing ASCII header
#
sub remove_header {
	local(*doc) = shift;
	local(*newdoc) = shift;
	local($in_doc) = 0;

	# special case for corrections in current feed - no header info
	if ($doc[0] =~ /^<DOCUMENT>/) {
		@newdoc = @doc;
		return;
	}

	foreach $line (@doc) {
		next unless ($in_doc || ($line =~ /<\/IMS-HEADER>/o));

		if ($in_doc == 0) {
			$in_doc = 1;
			next;
		}

		last if ($line =~ /<\/IMS-DOCUMENT>/o);

		push(@newdoc, $line);
	}
}

#
# delete a submission
#
sub delete_submission {
	local($accno) = shift;
	local($savdir) = '/ftp/edgar/data/.deleted';
	# TODO: must be a better way...
	local(@index_list) = ('/ftp/edgar/full-index',
	                      '/ftp/edgar/full-index/1995/QTR2',
	                      '/ftp/edgar/full-index/1995/QTR1',
	                      '/ftp/edgar/full-index/1994/QTR4',
	                      '/ftp/edgar/full-index/1994/QTR3',
	                      '/ftp/edgar/full-index/1994/QTR2',
	                      '/ftp/edgar/full-index/1994/QTR1',
	                     );

	# make path if it doesn't exist
	&makepath($savdir, 0750);

	foreach $i (@index_list) {
		open(IN, "$i/master.idx") || die "$0: error opening index file:$!\n";
		while (<IN>) {
			chop;
			local($cik, $comp, $form, $date, $path) = split(/\|/);
			next unless ($path =~ /$accno/);
			push(@deletions, $path);

			# Note that we cannot short-circuit this search and quit after the
			# first match since each submission may have multiple CIKs (files)
			# associated with it....bummer.
		}
	}

	foreach $path (@deletions) {
		local($path_sgml);

		($path_sgml = $path) =~ s/txt/hdr.sgml/;

		# make sure the file wasn't already deleted
		if (-e "/ftp/$path") {
			local($txt) = (split(/\//, $path))[3];

			# if save file doesn't already exist, create it
			if (! -e "$savdir/$txt") {
				local($sgml);

				($sgml = $txt) =~ s/txt/hdr.sgml/;

				# save backup of file(s) to be deleted
				link("/ftp/$path", "$savdir/$txt");
				link("/ftp/$path_sgml", "$savdir/$sgml");

				print "$path DELETED\n" if $verbose;
			}
			# remove original file(s)
			unlink("/ftp/$path");
			unlink("/ftp/$path_sgml");
		}
	}
}