#!/usr/local/bin/perl
#
#  file: ex3480
#  auth: Brad Burdick
#  desc: Extract SEC EDGAR, U.S. PTO APS/Full-Text, and U.S. PTO Trademark
#        image and text data files from an IBM 3480 1/2" cartridge tape.
#
#  usage: ex3480 [-E|-P|[-T image|text]] [-d data_dir] [-f data_file]
#           [-t tape_device] [-v]
#
#    E = Edgar data
#    P = Patent data
#    T = Trademark data
#
##########################################################################
#  Copyright (c) 1994, 1995 Internet Multicasting Service
#
#  The SEC EDGAR Level 1 Dissemination processing software ("software")
#  was developed by the Internet Multicasting Service and may 
#  be used for academic, research, government, and internal business
#  purposes without charge.  You may not resell this code or include it
#  in a product that you are selling without prior permission of the
#  Internet Multicasting Service.
#
#  This software is provided ``as is'', without express or implied
#  warranty, and with no support nor obligation to assist in its
#  use, correction, modification or enhancement.  We assume no liability
#  with respect to the infringement of copyrights, trade secrets, or any
#  patents, and are not responsible for consequential damages.  Proper
#  use of the software is entirely the responsibility of the user.
##########################################################################
#

# Never evaluated by perl itself (guarded by "if 0").  If this file is
# handed to a shell instead of perl, the shell runs the exec and restarts
# the script under perl.
eval 'exec /usr/bin/perl -s $0 ${1+"$@"}'
  if 0;

# who am i?  (script name with any leading directory stripped)
($prog = $0) =~ s#.*/##;

# where we find our local libraries
push(@INC, '/usr/local/ims/lib');

# for processing command line options
require 'getopts.pl';

# for date manipulation routines
require 'edgar-date.pl';

# process command line options, if any; -T, -d, -f and -t take an argument.
# Getopts returns false when it meets an option it does not know, so stop
# with a usage summary instead of carrying on with a half-parsed command line.
&Getopts('EPT:d:f:t:v')
	|| die "usage: $prog " .
	       '[-E|-P|[-T image|text]] [-d data_dir] [-f data_file]' .
	       ' [-t tape_device] [-v]' . "\n";

# current date info
@today = &edgar_date;

# type of processing to perform
$do_edgar = defined($opt_E);
$do_patent = defined($opt_P);
$do_trademark = defined($opt_T);

# count the selected processing types so that every combination is checked,
# not just the ones that involve -E
$nmodes = ($do_edgar ? 1 : 0) + ($do_patent ? 1 : 0) + ($do_trademark ? 1 : 0);

# type of processing specified?
if ($nmodes == 0) {
	die "$prog: no processing type specified\n";
}

# don't allow more than one type of processing...
if ($nmodes > 1) {
	die "$prog: only one type of processing allowed\n";
}

# verbose output?
$verbose = defined($opt_v);

# where to place data file: an explicit -d wins, otherwise a per-type
# default directory is used
$datadir = defined($opt_d) ? $opt_d          :
             $do_edgar     ? "/in/edgar"     :
             $do_patent    ? "/in/patent"    :
             $do_trademark ? "/in/trademark" :
             ".";

# default to the current date as output data file name
# NOTE(review): edgar_date is defined in edgar-date.pl and is not visible
# here.  If $today[5] is a localtime-style "years since 1900" value, this
# yields a 7-character name from the year 2000 on -- confirm and reduce the
# year modulo 100 if so.
$file = defined($opt_f) ? $opt_f :
          sprintf("%02d%02d%02d", $today[5], $today[4]+1, $today[3]);

# patent data files may come on multiple tapes: append a sequence number
# starting at 01 and increment the name until an unused one is found
if ($do_patent) {
	$file .= '01';

	while (-e "$datadir/$file") {
		$file++;
	}
}

# tape device to use - default is the first no-rewind tape device
$tape = defined($opt_t) ? $opt_t : "/dev/rmt/0n";

# tape utilities
$dd = "/bin/dd";
$mt = "/bin/mt";

# data file from tape
$data = "$datadir/$file";

# make sure this data file name does not already exist; report the problem
# on STDERR, as a complete line, and keep the exit status of 1
if (-e $data) {
	print STDERR "$prog: $data already exists and will not be overwritten!\n";
	exit 1;
}

# which program am i?  run the extraction routine for the selected type
if ($do_edgar) {
	&extract_edgar;
} elsif ($do_patent) {
	&extract_patent;
} elsif ($do_trademark) {
	&extract_trademark($opt_T);
}

# make the extracted file group-writable.  chmod takes the mode first and
# the file list after it.  extract_trademark replaces $data with a name
# taken from the tape header and writes its output to "$data.raw", so that
# is the file to change in trademark mode.
$output = $do_trademark ? "$data.raw" : $data;
chmod(0664, $output) || warn "$prog: cannot chmod $output: $!\n";

exit 0;


#
# extract SEC EDGAR data from 3480 tape
#
sub extract_edgar {
	# Copies two files from the tape in $tape: the data file into $data and
	# the audit file into "$datadir/audit/$file.audit".  Takes no arguments;
	# reads the globals $datadir, $file, $data, $tape, $mt, $dd, $prog and
	# $verbose.  Dies if the audit directory cannot be created or if any
	# tape positioning or copy step reports failure, since every later step
	# depends on the tape being where the previous one left it.

	# audit file from tape
	local($audit) = "$datadir/audit/$file.audit";
	# EDGAR data records have a length of 8196 bytes
	#local($ddflags) = "ibs=8196";
	# new format effective 01-03-95
	local($ddflags) = "ibs=32760";

	# make sure an audit directory exists
	if (! -d "$datadir/audit") {
		print "$prog: creating audit directory ...\n" if $verbose;
		mkdir("$datadir/audit", 0775)
			|| die "$prog: cannot create $datadir/audit: $!\n";
	}

	# be sure we're at the start of tape
	system("$mt -f $tape rewind") == 0
		|| die "$prog: cannot rewind $tape (status $?)\n";

	# skip EBCDIC header
	system("$mt -f $tape fsf 1") == 0
		|| die "$prog: cannot skip header on $tape (status $?)\n";

	# grab the data file (dd's stderr output is discarded)
	system("$dd if=$tape of=$data $ddflags 2>/dev/null") == 0
		|| die "$prog: cannot copy data from $tape to $data (status $?)\n";

	# skip EBCDIC end of data mark and next EBCDIC header
	system("$mt -f $tape fsf 2") == 0
		|| die "$prog: cannot skip to audit file on $tape (status $?)\n";

	# grab the audit file
	system("$dd if=$tape of=$audit $ddflags 2>/dev/null") == 0
		|| die "$prog: cannot copy audit from $tape to $audit (status $?)\n";

	# rewind tape and take it offline; started in the background so the
	# script does not wait for it, hence no status to check
	system("$mt -f $tape rewoff &");
}

#
# extract Patent data from 3480 tape
#
sub extract_patent {
	# Copies the patent data file from the tape in $tape into $data.  Takes
	# no arguments; reads the globals $data, $tape, $mt, $dd and $prog.
	# Dies if a tape positioning or copy step reports failure, so that a
	# bad extraction is not mistaken for a good one.

	# Patent data is read in 2000-byte blocks holding 80-byte fixed length
	# records; conv=unblock turns each record into a newline-terminated line.
	local($ddflags) = "ibs=2000 cbs=80 conv=unblock";

	# be sure we're at the start of tape
	system("$mt -f $tape rewind") == 0
		|| die "$prog: cannot rewind $tape (status $?)\n";

	# skip ASCII header
	system("$mt -f $tape fsf 1") == 0
		|| die "$prog: cannot skip header on $tape (status $?)\n";

	# grab the data file (dd's stderr output is discarded)
	system("$dd if=$tape of=$data $ddflags 2>/dev/null") == 0
		|| die "$prog: cannot copy data from $tape to $data (status $?)\n";

	# rewind tape and take it offline; started in the background so the
	# script does not wait for it, hence no status to check
	system("$mt -f $tape rewoff &");
}

#
# extract Trademark full-text data
#
sub extract_trademark {
	# Copies a trademark data file from the tape in $tape into
	# "$datadir/Text/NAME.raw" or "$datadir/Image/NAME.raw", where NAME is
	# the six characters at offset 4 of the first tape header line.
	#
	# Argument: the data type, either 'text' or 'image'.  Any other value
	# prints an error and exits with status 1 before the tape is touched.
	#
	# Side effect: the global $data is set to the output path without the
	# ".raw" suffix.  Also reads $datadir, $tape, $mt, $dd and $prog.

	# image or text data?
	local($type) = shift;
	local($subdir, $ddflags, $name, @hdr);

	# the two types differ only in target directory and dd options
	if ($type eq 'text') {
		# Trademark text data records have a length of 327 bytes and are
		# blocked 48 records per block (15696)
		$subdir  = "Text";
		$ddflags = "ibs=15696 cbs=327 conv=unblock";
	} elsif ($type eq 'image') {
		$subdir  = "Image";
		$ddflags = "ibs=20000";
	} else {
		print STDERR "$prog: unknown trademark data type ($type)\n";
		exit 1;
	}

	# be sure we're at the start of tape
	system("$mt -f $tape rewind") == 0
		|| die "$prog: cannot rewind $tape (status $?)\n";

	# read EBCDIC header to get data file name
	@hdr = `$dd if=$tape ibs=80 conv=ascii 2>/dev/null`;
	if ($? != 0 || $#hdr < 0 || length($hdr[0]) < 10) {
		die "$prog: cannot read tape header from $tape\n";
	}

	# the name ends up in a shell command line and a file path, so accept
	# only characters that are safe in both
	$name = substr($hdr[0], 4, 6);
	if ($name !~ /^[\w.\-]+$/) {
		die "$prog: unusable data file name ($name) in tape header\n";
	}
	$data = "$datadir/$subdir/$name";

	# the name is only known now, so apply the no-overwrite rule here
	if (-e "$data.raw") {
		print STDERR
		    "$prog: $data.raw already exists and will not be overwritten!\n";
		exit 1;
	}

	# grab the data file (dd's stderr output is discarded)
	system("$dd if=$tape of=$data.raw $ddflags 2>/dev/null") == 0
		|| die "$prog: cannot copy data from $tape to $data.raw (status $?)\n";

	# rewind tape and take it offline; started in the background so the
	# script does not wait for it, hence no status to check
	system("$mt -f $tape rewoff &");
}

