#!/usr/bin/perl
##---------------------------------------------------------------------------##
##  File:
##      @(#) BuildDatabase.pl
##  Author:
##      Arian Smit <asmit@systemsbiology.org>
##      Robert Hubley <rhubley@systemsbiology.org>
##  Description:
##      A utility for creating a WU-Blast/RepeatModeler XDF database
##      from a set of fasta files.
##
#******************************************************************************
#* Copyright (C) Institute for Systems Biology 2004 Developed by
#* Arian Smit and Robert Hubley.
#*
#* This work is licensed under the Open Source License v2.1.  To view a copy
#* of this license, visit http://www.opensource.org/licenses/osl-2.1.php or
#* see the license.txt file contained in this distribution.
#*
###############################################################################
#  ChangeLog:
#
#    $Log: BuildDatabase,v $
#    Revision 1.6  2017/03/31 17:25:39  rhubley
#      - Lots-o-improvements.  Starting preparation for release.
#
#    Revision 1.5  2012/08/22 18:46:24  rhubley
#     -- a refresh checkin and adding a new utility
#
#    Revision 1.4  2010/06/10 18:15:44  rhubley
#     - Fixed a bug that Arian found with RepeatScout models refined by the
#       RMBLAST engine.  The problem had to do with a rev/compl operation
#       in BuildRSConsensi
#
#     - Readying RepeatModeler for release with RMBLAST
#
#    Revision 1.3  2010/06/04 22:55:18  rhubley
#      - Some bug fixes.
#      - Changes to support rmblast package
#
#    Revision 1.2  2010/04/21 17:46:20  rhubley
#      -- working toward a RMBlast version of RepeatModeler
#
#    Revision 1.1  2010/04/20 23:33:28  rhubley
#      - Replacement for BuildXDFDatabase
#
#    Revision 1.12  2008/05/01 23:03:00  rhubley
#    Cleanup before a distribution
#
#
###############################################################################
#
# To Do:
#
#

=head1 NAME

BuildDatabase - Format FASTA files for use with RepeatModeler

=head1 SYNOPSIS

  BuildDatabase [-options] -name "mydb" <seqfile(s) in fasta format>
 or 
  BuildDatabase [-options] -name "mydb" 
                              -dir <dir containing fasta files *.fa, *.fasta,
                                     *.fast, *.FA, *.FASTA, *.FAST, *.dna,
                                     and  *.DNA > 
 or
  BuildDatabase [-options] -name "mydb" 
                              -batch <file containing a list of fasta files> 

=head1 DESCRIPTION

  This is basically a wrapper around AB-Blast's and NCBI Blast's
  DB formating programs.  It assists in aggregating files for processing 
  into a single database.  Source files can be specified by:

      - Placing the names of the FASTA files on the command
        line.
      - Providing the name of a directory containing FASTA files 
        with the file suffixes *.fa or *.fasta.
      - Providing the name of a manifest file which contains the 
        names of FASTA files ( fully qualified ) one per line.

  NOTE: Sequence identifiers are not preserved in this database. Each
        sequence is assigned a new GI ( starting from 1 ).  The 
        translation back to the original sequence is preserved in the
        *.translation file.

The options are:

=over 4

=item -h(elp)

Detailed help

=item -name <database name>

The name of the database to create.

=item -engine <engine name>

The name of the search engine we are using.  I.e abblast/wublast or
ncbi (rmblast version).

=item -dir <directory>

The name of a directory containing fasta files to be processed.  The
files are recognized by their suffix.  Only *.fa and *.fasta files
are processed.

=item -batch <file>

The name of a file which contains the names of fasta files to process.
The files names are listed one per line and should be fully qualified.

=back

=head1 SEE ALSO

=over 4

RepeatModeler, ABBlast, NCBIBlast

=back

=head1 COPYRIGHT

Copyright 2004-2017 Institute for Systems Biology

=head1 AUTHOR

Robert Hubley <rhubley@systemsbiology.org>

=cut

#
# Module Dependence
#
use strict;
use FindBin;
use lib $FindBin::RealBin;
use Getopt::Long;
use Cwd;

# RepeatModeler Libraries
use RepModelConfig;
use lib $RepModelConfig::REPEATMASKER_DIR;

#
# Class Globals & Constants
#
my $CLASS = "BuildDatabase";
my $DEBUG = 0;
$DEBUG = 1 if ( $RepModelConfig::DEBUGALL == 1 );

#
# Option processing
#  e.g.
#   -t: Single letter binary option
#   -t=s: String parameters
#   -t=i: Number paramters
#
my @opts = qw( help dir=s engine=s batch=s name=s );

#
# Get the supplied command line options, and set flags
#
my %options = ();
unless ( &GetOptions( \%options, @opts ) ) {
  exec "pod2text $0";
  exit( 1 );
}

# Print the internal POD documentation if something is missing
if ( ( $#ARGV == -1 && !$options{'batch'} && !$options{'dir'} )
     || $options{'help'} )
{
  print "No query sequence file indicated\n\n";

  # This is a nifty trick so we don't have to have
  # a duplicate "USAGE()" subroutine.  Instead we
  # just recycle our POD docs.  See PERL POD for more
  # details.
  exec "pod2text $0";
  die;
}

my $engine = $RepModelConfig::DEFAULT_SEARCH_ENGINE;
if ( $options{'engine'} )
{
  if ( $options{'engine'} =~ /wublast/i || $options{'engine'} =~ /abblast/i )
  {
    die "RepeatModeler doesn't appear to be configured to use the " 
      . "$options{'engine'} search engine.  Please re-run configure before "
      . "using this or other RepeatModeler programs.\n"
         if ( ! -x $RepModelConfig::XDFORMAT_PRGM );
    $engine = "abblast";
  }elsif ( $options{'engine'} =~ /ncbi/i )
  {
    die "RepeatModeler doesn't appear to be configured to use the " 
      . "$options{'engine'} search engine.  Please re-run configure before "
      . "using this or other RepeatModeler programs.\n"
         if ( ! -x $RepModelConfig::NCBIBLASTDB_PRGM );
    $engine = "ncbi";
  }else {
    print "I don't understand -engine $options{'engine'}\n";
    exec "pod2text $0";
    die;
  }
}

die "Missing database name!\n" if ( !defined $options{'name'} );

my @fileList = ();

# Directory specified
if ( $options{'dir'} ) {
  if ( -d $options{'dir'} ) {

    # Process the directory
    opendir DIR, $options{'dir'}
        or die "Cannot open dir $options{'dir'}!\n";
    my $file;
    while ( defined( $file = readdir( DIR ) ) ) {
      if (    $file =~ /\.fa$/i
           || $file =~ /\.fasta$/i
           || $file =~ /\.fast$/i
           || $file =~ /\.dna/i )
      {
        push @fileList, "$options{'dir'}/$file";
      }
    }
    closedir( DIR );
  }
  else {
    die "Directory $options{'dir'} doesn't exist!\n";
  }
}

# Batch specified
if ( $options{'batch'} ) {
  if ( -s $options{'batch'} ) {
    open BATCH, "<$options{'batch'}"
        or die "Cannot open batch file $options{'batch'}!\n";
    while ( <BATCH> ) {
      s/[\n\r\s]+//g;
      if ( -s $_ ) {
        push @fileList, $_;
      }
      else {
        die "File $_ from batch $options{'batch'} does not exist!\n";
      }
    }
    close BATCH;
  }
  else {
    die "Batch file $options{'batch'} doesn't exist!\n";
  }
}

# Push remaining command line files
foreach my $file ( @ARGV ) {
  if ( -f $file ) {
    push @fileList, $file;
  }
  else {
    die "Command line fasta file $file does not exist!\n";
  }
}

#
# Sanitize sequence names
#
my $IDX;
my $index = 1;
open $IDX, ">$options{'name'}.translation"
    || die $CLASS . ": Cannot open file $options{'name'}.translation\n";

my $results = "";
my $dbSeqs  = 0;
my $dbSize  = 0;
print "Building database $options{'name'}:\n";
for ( my $i = 0 ; $i <= $#fileList ; $i++ ) {
  my $file = $fileList[ $i ];
  print "  Adding $file to database\n";

  #
  # Sanitize sequence names
  #
  open IN, "<$file" || die $CLASS . ": Cannot open file $file\n";
  open SEQ, ">tmpSeqData.fa"
      || die $CLASS . ": Cannot open file tmpSeqData.fa\n";
  while ( <IN> ) {
    if ( /^\>\s*(\S+)\s+(.*)/ ) {
      print SEQ ">gi|$index\n";
      print $IDX "$1\t$index\n";
      $index++;
    }
    else {
      print SEQ $_;
    }
  }
  close IN;
  close SEQ;
  $file = "tmpSeqData.fa";

  if ( $engine eq "ncbi" )
  {
    if ( $i == 0 ) 
    {
      print "Running: $RepModelConfig::NCBIBLASTDB_PRGM -out $options{'name'} -parse_seqids -dbtype nucl -in $file 2>&1n" if ( $DEBUG );
      $results =
          `$RepModelConfig::NCBIBLASTDB_PRGM -out $options{'name'} -parse_seqids -dbtype nucl -in $file 2>&1`;
    }
    else {
      $results =
          `$RepModelConfig::NCBIBLASTDB_PRGM -out $options{'name'} -parse_seqids -dbtype nucl -in "$options{'name'} $file" 2>&1`;
    }
  }else 
  {
    if ( $i == 0 ) 
    {
      $results =
          `$RepModelConfig::XDFORMAT_PRGM -n -o $options{'name'} $file 2>&1`;
    }
    else {
      $results =
          `$RepModelConfig::XDFORMAT_PRGM -n -a $options{'name'} $file 2>&1`;
    }
    # TODO: Check for errors from xdformat
    while ( $results =~ /sequences\s*\(letters\)\s*(?:written|appended):\s*([\d,]+)\s+\(([\d,]+)\).*$/mg
        )
    {
      my $size = $2;
      my $seqs = $1;
      $size =~ s/,//g;
      $dbSize += $size;
      $seqs =~ s/,//g;
      $dbSeqs += $seqs;
      last;
    }
  }
}

if ( $engine eq "abblast" )
{
  $results = `$RepModelConfig::XDFORMAT_PRGM -n -X $options{'name'} 2>&1`;
}else 
{
  $results =
          `$RepModelConfig::NCBIDBCMD_PRGM -db $options{'name'} -info 2>&1`;
  while ( $results =~ /\s+([\d\,]+)\s+sequences;\s+([\d\,]+)\s+total bases.*$/mg )
  {
    my $size = $2;
    my $seqs = $1;
    $size =~ s/,//g;
    $dbSize += $size;
    $seqs =~ s/,//g;
    $dbSeqs += $seqs;
    last;
  }
}
  print "Number of sequences (bp) added to database: $dbSeqs ( $dbSize bp )\n";


close $IDX;

unlink( "tmpSeqData.fa" ) if ( -s "tmpSeqData.fa" );

1;
