#!/usr/bin/perl
#
# Utility for archiving old data
#
# TODO:
#  - Exception handling
#  - Remove expired archives
#  - Config

#
# Runs several times a day to index backup directories with EDI data.
# 
# Every run, we index whatever is not indexed in backup
# We need to make sure the first indexing does not index what should have been archived.
# When messages go past the "when" point, they get moved into a monthly archive.
# Possibly, this could be used for generic OnDemand integration also.
#
# Operates in two stages:
# --mode index
# --mode archive
#
# Indexing just indexes the non-indexed files in the directory.
# Archiving reads the index and moves files out on a per month basis.

# --archive switch enables archiving mode, where it takes the currently indexed files
# and moves it into a indexed archive.

use strict;
use warnings;
use Time::HiRes qw(gettimeofday tv_interval);
use Data::Dumper;

$| = 1;

my $inputs = {
    edb_pamt03 => {
        #input   => '/opt/ess/misc/amtrix/pamt03/data/ftp/in/asedb/backup',
        input    => '/home/atomt/development/edifact/tine',
        output   => '/prod/data/archive/PAMT03/edb/edifact/in',
        when     => 1,
        format   => 'edifact'
    }
};

my $parsers = {
  'edifact' => 'EDIFACT',
};

my $mode;
my $verbose;

# Cmd parsing
while ( scalar @ARGV > 0 ) {
    my $option = shift;
    
    if ( $option eq '--mode' ) {
        $mode = shift;
        if ( $mode ne 'index' && $mode ne 'archive' ) {
            die "Unknown mode specified\n";
        }
    }
    else {
        die "Unknown option \"$option\" on command line\n";
    }
}

# Eww. For stupid date calculations
my ($now_year, $now_month) = (gmtime(time))[5,4];
$now_year += 1900;
$now_month++;
my $now = int("$now_year$now_month");

# Some precompiled regexes for speed
my $reg_dotfile = qr/^\./;

my $start = [gettimeofday];
my $files = 0;
my $documents = 0;

while ( my ($queue_id, $queue) = each %{$inputs} ) {
    my $input_dir  = $queue->{'input'};
    my $format     = $queue->{'format'};
    my $curindex_f = "$input_dir/current.idx";
    my $parser     = $parsers->{$format};
 
    print "Indexing $queue_id ($input_dir)\n";

    # Some sanity to all this mess
    if ( !$parser ) {
        die "Unknown format \"$format\" defined in setup\n";
    }

    # Load currently indexed into memory
    #my $curindex = read_index($curindex_f);
    
    # Index!!
    opendir my $root_dh, $input_dir
      or die "failed to open $input_dir for reading: [$!]\n";
    
    open my $idxfd, '>>', $curindex_f
      or die "failed to open $curindex_f for writing: [$!]\n";
    
    FILE:
    while ( my $entry = readdir($root_dh) ) {
        my $fp = "$input_dir/$entry";
        next if $entry =~ $reg_dotfile;
        next if ! -f $fp;

        # Is already indexed
        #next if exists $curindex->{$entry};
        
        # Index
        my $mtime = (stat($fp))[9];
        
        if ( $format ) {
            open my $infd, '<', $fp
              or die "could not open $fp for processing: [$!]\n";

            # New message object            
            my $message = $parser->new($infd);

            # Get delimeter for loading the file outside of the parser
            my $lineSeperator = $message->getLineSeperator();

            # Load file into memory for fast cached processing
            my @content;
            {
                local $/;
                $/ = $lineSeperator;
                @content = <$infd>;
            }
            close $infd;

            # Pass content to parser
            $message->addContent(\@content);
            
            print Dumper($message);
        }
        else {
            print {$idxfd} $entry, ';', $mtime, "\n";
        }
        
        $files++;

        #print $files, "\r";
        
        # XXX 
        #if ( $files >= 1000 ) {
            last;
        #}
    }
    
    close $idxfd;
}
my $runtime = tv_interval($start);
my $rate = $files / $runtime;

print "Processed $files files in $runtime seconds ($rate/second)\n";

sub read_file {
    my ($f, $linesep) = @_;
    my @content;
    
    if ( !defined $linesep ) {
        $linesep = "\n";
    }
    
    open my $fh, '<', $f or die "could not read $f: [$!]\n";
    {
        local $/;
        $/ = $linesep;
        @content = <$fh>;
    }
    close $fh;
    return \@content;
}


sub read_index {
    my ($file) = @_;
    my $ret = {};
    
    if ( ! -r $file ) {
        return $ret;
    }
    
    my $in = read_file($file);
    
    for my $line ( @{$in} ) {
        my ($filename, $mtime) = (split(';', $line, 3))[0,1];
        $ret->{$filename} = $mtime;
    }
    
    return $ret;
}


package EDIFACT;

use strict;
use warnings;
use Data::Dumper;

my $segment_defs;

my $IDX_SEGMENT_REPEATS;
my $IDX_SEGMENT_GROUPTYPE;
my $IDX_SEGMENT_IS_MANDATORY;
my $IDX_SEGMENT_MANDATORY_COMPONENTS;
my $IDX_SEGMENT_COMPONENTS;

my $IDX_COMPONENT_REF;
my $IDX_COMPONENT_IS_MANDATORY;
my $IDX_COMPONENT_IS_COMPOSITE;
my $IDX_COMPONENT_COMPONENTS;

BEGIN {
	$IDX_SEGMENT_REPEATS              = 0;
	$IDX_SEGMENT_GROUPTYPE            = 1;
    $IDX_SEGMENT_IS_MANDATORY         = 2;
    $IDX_SEGMENT_MANDATORY_COMPONENTS = 3;
    $IDX_SEGMENT_COMPONENTS           = 4;
    
    $IDX_COMPONENT_REF                = 0;
    $IDX_COMPONENT_IS_MANDATORY       = 1;
    $IDX_COMPONENT_IS_COMPOSITE       = 2;
    $IDX_COMPONENT_COMPONENTS         = 3;
	
    $segment_defs = {
        # Interchange trailer
        'UNZ' => [
            0,
            # Group type
            # 0 = root, 1 = segment is start of group (parent), 2 = segment is in group (children)
            0,
            1,
            2,
            [
                # Interchange control count
                [
                    undef,
                    1,
                    0, # Is composite element
                    ['0036', 1, qr/^\d{1,6}$/]
                ],
                # Interchange control reference
                [
                    undef,
                    1,
                    0,
                    ['0020', 1, qr/^\w{1,14}$/]
                ]
            ]
        ],
                
        # Interchange header
        'UNB' => [
            # Repeats
            0,
            # Group type
            # 0 = root, 1 = segment is start of group, 2 = segment is in group
            0,
            # Mandatory
            1,
            # Number of mandatory elements
            5,
            # Components
            [
                # Syntax Identifier
                [
                  'S001',                       # Ref
                  1,                            # Mandatory
                  1,                            # Is composite element
                  ['0001', 1, qr/^\w{1,4}$/],   # Identifier
                  ['0002', 1, qr/^\d{1}$/]      # Version Number
                ],
                # Interchange Sender
                [
                  'S002',
                  1,
                  1,                            # Is composite element
                  ['0004', 1, qr/^\w{1,35}$/],  # Sender
                  ['0007', 0, qr/^\w{1,4}$/],   # Partner Identification code qualifier
                  ['0008', 0, qr/^\w{1,14}$/]   # Address for reverse routing
                ],
                # Interchange recipient
                [
                  'S003',
                  1,
                  1,                            # Is composite element
                  ['0010', 1, qr/^\w{1,35}$/],  # Recipient
                  ['0007', 0, qr/^\w{1,4}$/],   # Partner Identification code qualifier
                  ['0014', 0, qr/^\w{1,14}$/]   # Routing address
                ],
                # Date/Time of preperation
                [
                  'S004',
                  1,
                  1,                            # Is composite element
                  ['0017', 1, qr/^\d{2}[01][0-9][0-3][0-9]$/],     # Date (YYMMDD)
                  ['0019', 1, qr/^\d{4}$/]      # Time (HHMM)
                ],
                # Interchange Control Reference
                [
                  undef,
                  1,
                  0,                            # Is composite element
                  ['0020', 1, qr/^\w{1,14}$/]     # ^
                ],
                # Recipients Reference Password
                [
                  'S005',
                  0,
                  1,                            # Is composite element
                  ['0022', 1, qr/^\w{14}$/],      # ^
                  ['0025', 0, qr/^\w{2}$/]        # Recipient's reference password qualifier
                ],
                # Application Reference
                [
                  undef,
                  0,
                  0,                            # Is composite element
                  ['0026', 0, qr/^\w{1,14}$/]     # ^
                ],
                # Processing Priority Code
                [
                  undef,
                  0,
                  0,                            # Is composite element
                  ['0029', 0, qr/^[a-zA-Z]$/]     # ^
                ],
                # Acknowledgment Request
                [
                  undef,
                  0,
                  0,                            # Is composite element
                  ['0031', 0, qr/^[01]$/]         # ^
                ],
                # Communications Agreement ID
                [
                  undef,
                  0,
                  0,                            # Is composite element
                  ['0032', 0, qr/^\w{1,35}$/]     # ^
                ],
                # Test Indicator
                [
                  undef,
                  0,
                  1,                            # Is composite element
                  ['0035', 0, qr/^[01]$/]         # ^
                ]
            ]
        ],
        # XXX
        #'UNH' => [],
        #'BGM' => [],
        #'DTM' => [],
        #'NAD' => [],
        #'LIN' => [],
        #'PIA' => [],
        #'IMD' => [],
        #'QTY' => [],
        #'UNS' => [],
        #'CNT' => [],
        #'UNT' => [],
        #'UNZ' => [],
        #'RFF' => []
    };
}

sub new {
    my ($class, $fd) = @_;
    
    my $s = {
        una           => [':', '+', '.', '?', ' ', "'"],
        lineSeperator => "'",
        content       => undef,
        envelope      => {},
        documents     => [],
        segment_defs  => $segment_defs
    };

    # Read the start of file, to determine delimeters
    my $unabuf = '';
    read $fd, $unabuf, 9;
    
    if ( substr($unabuf, 0, 3) eq 'UNA' ) {
        my @una = split('', substr($unabuf, 3));
        $s->{'una'} = \@una;
    }
    else {
        # No UNA, reset filepos before returning
        seek $fd, 0, 0;
    } 
    
    bless $s, $class;
    return $s;
}

sub getLineSeperator {
    my ($s) = @_;
    return $s->{'lineSeperator'};
}

sub scrapeBlanks {
    my ($s, $ref) = @_;
    
    for my $item ( @{$ref} ) {
        if ( $item eq '' ) {
            $item = undef;
        }
    }
}

sub addContent {
    my ($s, $ref) = @_;
    my $una          = $s->{'una'};
    my $reg_subcomp  = $s->{'reg_subcomp'};
    my $segment_defs = $s->{'segment_defs'};
    my @output;

    # Split into arrays and arrays of arrays for subcomponents
    for my $line ( @{$ref} ) {
        chop($line);
        my @components = split('\\' . @{$una}[1], $line);
        my $segment = shift @components;
        
        # Check if segment is known
        if ( !exists $segment_defs->{$segment} ) {
            next;
        }
        my $validation_segment = $segment_defs->{$segment};
		my $components_def     = @{$validation_segment}[$IDX_SEGMENT_COMPONENTS];
        
        my $compos = 0;
        for my $component ( @components ) {
        	# Get the component definition from the segment definition by using compos
        	my $component_def = @{$components_def}[$compos];
        	
        	#print $segment, ": ", $compos, ": ", $component, "\n";
        	#print Dumper($component_def);
        	
            # And subcomponents if necesary
            if ( @{$component_def}[$IDX_COMPONENT_IS_COMPOSITE] ) {
            	my @subcomponents = split('\\' . @$una[0], $component);
                
				# Validate subcomponents
               	my $subcompos = 0;
				for my $subcomponent ( @subcomponents ) {
					my $subcomp_def = @{$component_def}[$IDX_COMPONENT_COMPONENTS + $subcompos];
               	   
					if ( !$subcomp_def ) {
						die "subcomponent not found in spec\n";
               	    }
					
					if ( $subcomponent eq '' ) {
						$subcomponent = undef;
					}
					
					my $mandatory = @{$subcomp_def}[1];
					my $regex     = @{$subcomp_def}[2];
               	   
               	   if ( !(!$mandatory && !defined $subcomponent) && $subcomponent !~ $regex ) {
               	   	   die "segment $segment, component $compos, subcomponent $subcompos failed validation.\n";
               	   }
               	   
                   	$subcompos++;
				}
	           	
				# Check wether we have all the mandatory subcomponents.
            	$component = \@subcomponents;
			}
			else {
				if ( $component eq '' ) {
					$component = undef;
				}
				
		        # Validate component
		        my $regex     = @{@{$component_def}[$IDX_COMPONENT_COMPONENTS]}[2];
		        my $mandatory = @{@{$component_def}[$IDX_COMPONENT_COMPONENTS]}[1];
				
		        # Skip validation if the component contents is empty and the field is not required.
		       	if ( !(!$mandatory && !defined $component) && $component !~ $regex ) {
					die "$segment failed validation at component $compos. content: \"$component\"\n";
				}
			}
			
			$compos++;
        }
        
        
        # Check if we have enough elements
        my $required_elements = @{$validation_segment}[$IDX_SEGMENT_MANDATORY_COMPONENTS];
        if ( @components < $required_elements ) {
            print "Missing mandatory elements in $segment (required $required_elements, got ", scalar(@components), ")\n";
        }
        
        # For every component in the initially parsed output, check for format validity etc.
        #print Dumper(\@components);
        #print Dumper($validation_segment);
         
        push @output, [$segment, \@components];
    }
   # print "EOF\n";
    $s->{'content'} = \@output;

    # De-enveloping, read envelope/interchange header/trailer
    #my @envelope_head  = @{shift @{$ref}};
    #my @envelope_trail = @{pop @{$ref}};
    
    #if ( $envelope_head[0] ne 'UNB' ) {
    #    die "Missing Interchange Header (UNB)\n";
    #}
    
    #if ( $envelope_trail[0] ne 'UNZ' ) {
    #    die "Missing Interchange Trailer (UNZ) - incomplete file?\n";
    #}
    
    #if ( $envelope_trail[1] > 1 ) {
    #    die "Only one interchange per message supported. Message specifies $envelope_trail[1].\n";
    #}

    #$s->{'sender'}    = $envelope_head[2][0];
    #$s->{'recipient'} = $envelope_head[3][0];
    
    #for my $line ( @{$ref} ) {
        
    #}
}

