#!/usr/local/bin/perl
#
#  file: create-index
#  auth: Brad Burdick
#  desc: create SEC EDGAR SGML index file
#
#  usage:  create-index [-D YYMMDD] [-M mod_time] [-m] [-d datadir]
#           [input_file]
#
##########################################################################
#  Copyright (c) 1994, 1995 Internet Multicasting Service
#
#  The SEC EDGAR Level 1 Dissemination processing software ("software")
#  was developed by the Internet Multicasting Service and may 
#  be used for academic, research, government, and internal business
#  purposes without charge.  You may not resell this code or include it
#  in a product that you are selling without prior permission of the
#  Internet Multicasting Service.
#
#  This software is provided ``as is'', without express or implied
#  warranty, and with no support nor obligation to assist in its
#  use, correction, modification or enhancement.  We assume no liability
#  with respect to the infringement of copyrights, trade secrets, or any
#  patents, and are not responsible for consequential damages.  Proper
#  use of the software is entirely the responsibility of the user.
##########################################################################

# Shell/perl bootstrap idiom: if this file is ever interpreted by a shell
# instead of perl, the shell executes the eval line and re-runs the script
# under /usr/bin/perl with the original arguments.  Perl reads both lines
# as one statement guarded by "if 0", so perl itself never runs the exec.
eval 'exec /usr/bin/perl -s $0 ${1+"$@"}'
  if 0;

# who am i?
# - $prog is $0 with everything up to the last "/" removed; it prefixes
#   the diagnostics printed by this script
($prog = $0) =~ s#.*/##;

# where we find our local libraries
push(@INC, "/usr/local/ims/lib");

# for processing command line options
require 'getopts.pl';

# header values for index files
require 'index-hdr.pl';

# Edgar date manipulation routines
require 'edgar-date.pl';

# miscellaneous support routines
require 'edgar-util.pl';

# process command line options, if any
# - -m is a plain flag; -D, -M and -d each take a value
&Getopts('D:M:d:m');

# master index when -m was given, daily index otherwise
$do_master = defined($opt_m);

# date stamp to use in file name (left empty until one is known)
$date = "";
$date = "$opt_D" if defined($opt_D);

# age limit, in days, of the files to search on
# - .25 of a day (the last 6 hours) unless -M overrides it
$mod_time = ".25";
$mod_time = "$opt_M" if defined($opt_M);

# where to place index files; the directory is created when missing
$datadir = "/ftp/edgar";
$datadir = "$opt_d" if defined($opt_d);
&makepath($datadir, 0755);

# subdirectories of $datadir holding the daily and the full index file(s)
($daily, $full) = ("daily-index", "full-index");

# message of the day, printed into the master index only
$motd = join('',
	"\n\n",
	"ATTENTION:  Second quarter index data has been archived to the\n",
	"            following subdirectory:  edgar/full-index/1995/QTR2/.\n\n",
	"            First quarter index data has been archived to the\n",
	"            following subdirectory:  edgar/full-index/1995/QTR1/.\n");

# number of days back from today that the master index reaches
# - the constant subtracted from the day of the year is the day count
#   from the beginning of the year to the end of the last quarter and
#   has to be edited by hand for each quarter:
#     --  90 is for 1st qtr (Mar 31)
#     -- 181 is for 2nd qtr (Jun 30)
#     -- 275 is for 3rd qtr (Sep 30)
@today = localtime(time);
$start_day = $today[7] - 185;

# arguments handed to find to create the file list for the index
$find_args = "data -depth -type f -name '*.sgml' -print";

# archive command lines
($compress, $sit, $zip) = (
	"/bin/compress -c",
	"/usr/local/bin/sit -u -C edgar -o",
	"/usr/local/bin/zip -b /tmp -jlq -9");

# don't allow access to files until we're done
# - the previous umask is kept so it can be restored afterwards
$oldumask = umask(077);

# daily or master index info
# - one "cik|name|form|date|path" record per element
@index = ();

# data file's date
# - only used when no -D stamp was given on the command line
# - NOTE(review): &edgar_date is assumed to return a localtime()-style list
#   (year in [5], 0-based month in [4], day of month in [3]) -- confirm
#   against edgar-date.pl
# - the year is reduced modulo 100 so the stamp keeps its six digit YYMMDD
#   form once a localtime()-style year field reaches 100 (year 2000 on);
#   for two digit years the result is unchanged
@today = &edgar_date;
$date = sprintf("%02d%02d%02d", $today[5] % 100, $today[4]+1, $today[3])
	unless $date;

# select output file, header template and input stream
# - master index: FIND is a pipe from find listing the *.sgml files
#   below $datadir
# - daily index:  FIND is the existing master index file
# - @master_hdr and @daily_hdr are not defined in this file
#   (presumably they come from index-hdr.pl)
if ($do_master) {
	$outfile = "$datadir/$full/master.idx";
	@header = @master_hdr;

	# "cd" is used because "chdir" is not a builtin of every /bin/sh, and
	# "&&" keeps find from listing the wrong directory when the cd fails
	open(FIND, "(cd $datadir && /bin/find $find_args) |") ||
	  die "$prog: error getting file list: $!\n";
} else {
	$outfile = "$datadir/$daily/master.$date.idx";
	@header = @daily_hdr;

	open(FIND, "$datadir/$full/master.idx") ||
	  die "$prog: error getting file list: $!\n";
}

# process index file header
# - fills the place holders of @header in place
&process_hdr(*header);

#
#  main processing loop
#  - daily index:  every line of FIND is a record of the master index;
#    records whose data file was modified within the last $mod_time days
#    are copied to @index unchanged
#  - master index: every line of FIND is the path of a header file; files
#    modified within the last $start_day days are scanned and one record
#    is added to @index per <CONFORMED-NAME>/<CIK> pair found in them
#
FILE: while ($path = <FIND>) {
	chop($path);

	if (! $do_master) {
		# skip anything that does not start with a digit, e.g. the column
		# titles and the separator line of the master index
		next FILE unless ($path =~ /^[0-9]/);

		local($cik,$cname,$form,$date,$file) = split(/\|/, $path);

		# in case we left some files in the install directories
		next FILE if ($file =~ m#/private/#o);

		# assume daily files modified within last $mod_time days (approximately)
		# NOTE(review): the "/ftp" prefix is fixed and does not follow -d,
		# and a missing file has an undefined age which compares as 0 and
		# therefore passes this test -- confirm both are intended
		next FILE unless -M "/ftp/$file" <= $mod_time;

		push(@index, $path);
	} else {
		local($current_cik, $name, $type, $filedate);

		# master files are searched from beginning of QTR
		next FILE unless -M "$datadir/$path" <= $start_day;

		# $path should have data?/cik/file.hdr.sgml format at this point
		($base, $cik, $file) = split("/", $path);

		# open file to get submission information
		# - written as a block: in "open(...) || warn ..., next FILE" the
		#   "next" is evaluated as an argument of warn, so the loop moved
		#   on before the warning could be printed
		unless (open(IN, "$datadir/$path")) {
			warn "$prog: unable to open $datadir/$path: $!\n";
			next FILE;
		}

		LINE: while ($line = <IN>) {
			chop($line);
			if ($line =~ /<CONFORMED-NAME>/) {
				($name = $line) =~ s/<CONFORMED-NAME>(.*)/$1/;
			} elsif ($line =~ /<CIK>/) {
				# leading zeros are dropped from the CIK
				($current_cik = $line) =~ s/<CIK>0*(.*)/$1/;
				# handle all zero (0) CIKs
				if (! $current_cik) {
					$current_cik = "0" x 10;
				}
			} elsif ($line =~ /<TYPE>/) {
				($type = $line) =~ s/<TYPE>(.*)/$1/;
			} elsif ($line =~ /<FILING-DATE>/) {
				($filedate = $line) =~ s/<FILING-DATE>(.*)/$1/;
			} else {
				next LINE;
			}

			# a record needs all four values
			next LINE unless ($name && $type && $filedate && $current_cik);

			# index file points to *.txt file
			$path =~ s/hdr\.sgml/txt/o;

			# save index entry
			push(@index, sprintf("%.10s|%.60s|%.10s|%.8s|edgar/%.55s", $current_cik,
			  $name, $type, $filedate, $path));

			# start over for the next filer of the same submission; form
			# type and filing date are kept
			# - undef takes a single operand: "undef $name, $current_cik"
			#   cleared $name only, so the CIK of one filer was paired
			#   with the name of the next one
			undef $name;
			undef $current_cik;
		}
		close(IN);
	}
}

# order the records and drop duplicates
# - master records are collected in find order and are sorted first
# - daily records keep the order in which they were read
@sortedindex = @index;
@sortedindex = sort(@sortedindex) if ($do_master);
&dedup(*sortedindex);

# write the index file: header, optional message of the day, column
# titles, separator line and the records
open(INDEX, ">$outfile") || die "$prog: unable to open $outfile: $!\n";

print INDEX join("\n", @header), "\n";

# print message of the day if available
print INDEX $motd if ($do_master && $motd);

print INDEX sprintf("\n%-10.10s|%-20.20s|%-10.10s|%-10.10s|%-s\n", 'CIK',
               'Company Name', 'Form Type', 'Date Filed', 'File Name');
print INDEX "-" x 80, "\n";
print INDEX join("\n", @sortedindex), "\n";

# buffered output is flushed here, so a write error (e.g. full disk) only
# shows up as a failed close; stop before the file is made readable or
# archived
close(INDEX) || die "$prog: error writing $outfile: $!\n";

# file is ready
# - it was created under umask 077; open it up for reading now
chmod(0644, $outfile) ||
  warn "$prog: unable to change mode of $outfile: $!\n";

# restore umask
umask($oldumask);

# pack index files
# - only the master index is archived (compress, sit and zip)
# - every command is run by the shell inside the full index directory;
#   "cd ... &&" is used because "chdir" is not a builtin of every /bin/sh
#   and the archiver must not run when the directory change fails
# - a failing archiver is reported but does not stop the remaining ones
if ($do_master) {
	foreach $pack ("$compress master.idx > master.Z",
	               "$sit master.sit master.idx",
	               "$zip master.zip master.idx") {
		system("cd $datadir/$full && $pack") == 0 ||
		  warn "$prog: archive command failed (status $?): $pack\n";
	}
}

exit 0;


#
# process index header
#   expects fixed format from header - see lib/index-hdr.pl
#
sub process_hdr {
	# Fill the "%s" place holders of an index header in place.
	#
	# argument: typeglob of the header array, e.g. &process_hdr(*header)
	# reads:    $date (YYMMDD), @Months (indexed by month number),
	#           $do_master, $full, $daily
	# effect:   element 0 loses its place holder, element 1 receives the
	#           date the data was received and element 6 the name of the
	#           index file relative to the data directory
	local(*header) = shift;

	# $date only carries a two digit year; 69-99 is taken as 19xx and
	# 00-68 as 20xx instead of always adding 1900
	local($yy) = int(substr($date, 0, 2));
	local($century) = ($yy < 69) ? 2000 : 1900;

	local($recv) = sprintf("%s %02d, %04d", $Months[int(substr($date, 2, 2))],
		               substr($date, 4, 2), $century + $yy);

	$header[0] =~ s|%s||;
	# last data recv'd date
	$header[1] =~ s/%s/$recv/;
	if ($do_master) {
		$header[6] =~ s|%s|$full/master.idx|;
	} else {
		$header[6] =~ s|%s|$daily/master.$date.idx|;
	}
}

#
# delete duplicate record info
# - assumes they are already sorted
#
sub dedup {
	# Remove repeated records from a list in place.
	#
	# argument: typeglob of the array to clean, e.g. &dedup(*sortedindex)
	# - a record is "cik|name|form|date|path"; it is dropped when name,
	#   form, date and the fourth "/" separated part of path all equal
	#   those of the record directly before it (the cik is not compared)
	# - only neighbours are compared, so equal records must be adjacent
	local(*list) = shift;
	local($record, $key, $last_key);
	local(@field, @part);
	local(@unique) = ();

	foreach $record (@list) {
		@field = split(/\|/, $record);
		@part = split(/\//, $field[4]);

		$key = join("#", $field[1], $field[2], $field[3], $part[3]);
		push(@unique, $record) if ($key ne $last_key);
		$last_key = $key;
	}

	@list = @unique;
}

