#!/usr/bin/perl

# fix-ead.pl - modify an existing ead file so it validates, specifically:
#                * give the file a new xml-stylesheet processing instruction
#                * change ISO 639-2 to ISO639-2
#                * remove legalstatus, langmaterial, and inline attributes
#                * remove admininfo element and its children

# Be forewarned. The resulting EAD files are valid but not necessarily correct.

# Eric Lease Morgan <eric_morgan@infomotions.com>

# July 12, 2004 - add barely working loop to process many files
# July 9, 2004  - first cut


# let's get started

# include the necessary modules
use File::Basename;
use File::Find;
use strict;
use XML::SAX::ParserFactory;

# get the input: the directory to read from and the directory to write to
my $in  = shift @ARGV;
my $out = shift @ARGV;

# check for input
if (! $in or ! $out) {

	# print help text and quit
	print "Usage: $0 [full path to input directory] [full path to output directory]\n";
	exit;
	
}

# create a SAX handler object
my $handler = MyHandler->new();

# create a parser pointing to the handler; assume the PurePerl parser
my $parser = XML::SAX::ParserFactory->parser(Handler => $handler);

# keep the user informed, sort of
print "Processing... Open another shell and watch the files grow in $out.\n";

# File::Find changes the working directory while it walks the tree, so a
# relative path would be resolved against the wrong directory once the walk
# has started; make both paths absolute up front
require File::Spec;
$in  = File::Spec->rel2abs($in);
$out = File::Spec->rel2abs($out);

# process_files builds each output name by gluing the leaf name straight onto
# $out, so make sure $out ends with a path separator
$out .= '/' unless ($out =~ m{[/\\]\z});

# process every file in the defined directory
find (\&process_files, $in);

# done
exit;


# process_files - File::Find callback, invoked once for every entry found
#
# Entries whose full name ($File::Find::name) does not end in ".xml" are
# skipped. For every other entry STDOUT is re-opened onto the file named by
# $out followed by the entry's leaf name, the XML declaration and the
# xml-stylesheet processing instruction are written, and the entry is parsed
# with $parser; the parser's handler prints the rest of the document to STDOUT.
#
# Reads the file-level variables $out and $parser. Takes no arguments and
# returns nothing. A file that cannot be opened for writing is reported on
# STDERR and skipped.
sub process_files {

	# get the name of the found file
	my $file = $File::Find::name;

	# make sure it has the correct extension; bad assumption.
	# This is a subroutine, not a loop body, so leave it with return; "next"
	# would jump out into whatever loop File::Find happens to be running
	return if ($file !~ m/\.xml$/);

	# get the leaf name; no suffix patterns are given to fileparse, so
	# $suffix comes back empty and $leaf is the complete file name
	my ($leaf, $path, $suffix) = fileparse($file);
	my $leafname = "$leaf$suffix";

	# $out is used as a plain prefix, so it must already end with a path separator
	my $target = "$out$leafname";

	# open the output and make the first fix; three-argument open keeps odd
	# characters in the name from being read as an open mode, and a failure
	# is reported instead of silently leaving STDOUT closed
	unless (open (STDOUT, '>', $target)) {

		warn "Can't open $target for writing: $!\n";
		return;

	}
	print "<?xml version='1.0'?>\n<?xml-stylesheet type='text/xsl' href='ead2html.xsl'?>\n";

	# do the work
	$parser->parse_uri("$file");

	# close; buffered write errors (a full disk, for example) surface here
	close (STDOUT) or warn "Can't close $target: $!\n";

	return;

}


#################################
# a rudimentary SAX event handler

package MyHandler;

# MyHandler - SAX event handler that prints the document it is fed back out
# on the currently selected output handle, applying these fixes on the way:
#
#   * admininfo elements, and everything inside them, are dropped
#   * legalstatus, langmaterial, and inline attributes are dropped
#   * every langencoding attribute is given the value ISO639-2
#   * markup characters in text and attribute values are escaped again
#
# Only start_element, end_element, and characters are implemented, so nothing
# is printed for any other kind of event.

# number of admininfo elements currently open; anything true means "discard".
# Deliberately declared without an initial value: the main script calls exit
# before execution ever reaches this line, so an assignment here would never
# run. An undefined value is treated as zero throughout.
my $skip_depth;


# new - create a handler object; it carries no data of its own
sub new {

	my $type = shift;
	return bless {}, $type;
	
}


# _escape_text - return a copy of the given string that is safe to print as
# element content; an undefined value yields the empty string
sub _escape_text {

	my ($text) = @_;
	return '' unless (defined $text);

	# the ampersand must go first or it would re-escape the other entities
	$text =~ s/&/&amp;/g;
	$text =~ s/</&lt;/g;
	$text =~ s/>/&gt;/g;
	return $text;

}


# _escape_attribute - like _escape_text, but for a value that will be printed
# between single quotes
sub _escape_attribute {

	my ($value) = @_;
	my $escaped = _escape_text($value);
	$escaped =~ s/'/&apos;/g;
	return $escaped;

}


# start_element - print a start tag, minus the attributes we do not want
#
# $element is the SAX element hash; its Name and Attributes entries are used.
# Nothing is printed for admininfo or for any element inside one.
sub start_element {

	my ($self, $element) = @_;
	my $name = $element->{Name};

	# check for the element we want to remove (admininfo); count how deep we
	# are so that a nested admininfo does not end the discarding early
	if ($name eq 'admininfo') {

		$skip_depth++;
		return;

	}
	return if ($skip_depth);
	
	# print the element
	print "<" . $name;
	
	# get any attributes; tolerate an element that carries no attribute hash
	my $attributes = $element->{Attributes} || {};

	# process each attribute, in sorted order so the output is repeatable
	for my $key (sort keys %{$attributes}) {

		my $attribute = $attributes->{$key}->{Name};

		# discard the ones we don't want; brain-dead
		next if ($attribute eq 'legalstatus');
		next if ($attribute eq 'langmaterial');
		next if ($attribute eq 'inline');

		# fix the broken langencoding attribute; brain-dead, again
		if ($attribute eq 'langencoding') { print ' ' . $attribute . "='ISO639-2'" }

		# print anything else, escaping the value so the tag stays well-formed
		else { print ' ' . $attribute . "='" . _escape_attribute($attributes->{$key}->{Value}) . "'" }

	}
	
	# close the element
	print ">";
	return;
			
}


# end_element - print an end tag unless we are inside (or leaving) admininfo
sub end_element {

	my ($self, $element) = @_;
	my $name = $element->{Name};

	# leaving an admininfo; discarding stops once the outermost one is closed
	if ($name eq 'admininfo') {

		$skip_depth-- if ($skip_depth);
		return;

	}
	return if ($skip_depth);

	print "</" . $name . ">";
	return;
	
}


# characters - print a run of text unless we are inside admininfo
#
# The text arrives in $characters->{Data}; it is escaped before printing
# because a literal & or < in the output would not be well-formed.
sub characters {

	my ($self, $characters) = @_;

	return if ($skip_depth);
	print _escape_text($characters->{Data});
	return;
	
}


# return true or die
1;

