#!/usr/bin/perl
# Xiaobin Li on Mar-19-2019
# School of Life Sciences & Biotechnology, Shanghai Jiao Tong University

use warnings;


# Help text shown when no list file is supplied; it is also appended to some
# error messages further down. The heredoc is non-interpolating, so the text is
# emitted exactly as written (it starts with one blank line and ends with one).
my $usage1 = <<'END_OF_USAGE';

Welcome to use MLST_Kp local version!
MLST_Kp is a perl-based tool to perform the multilocus sequence typing (MLST) of the strain of Klebsiella pneumoniae. 
-------------------------------------------------------------------------------------------------
Usage:	perl run_MLST_Kp_local.pl list_file

The list file should contain the names of DNA files(FASTA format)in the directory 'MLST_Kp_linux/input' line by line.
List File Format Example
=====
HS11286_chr.fasta
RJA166_chr.fas
RJF293_chr.fna
HS11286_contigs.fasta
KP5-1_chr.fas

For more details, please see the README.txt file.

If you have any question, please feel free to contact the authors:
hyou@sjtu.edu.cn
xiaobinli@sjtu.edu.cn

END_OF_USAGE

# At least one command-line argument (the list file) is required.
die $usage1 unless @ARGV;


# Run-time settings for this invocation.
my $filein = shift;          # list file named on the command line
my @strain;                  # trimmed list entries, in file order
my $i=0;                     # index of the next free slot in @strain
my $evalue = 0.0001;         # E-value cutoff handed to blastn
my $log = "MLST.log";        # log file, created in the current directory

# Convert the list file to Unix line endings in place. This stays best-effort
# (the exit status is deliberately not checked, e.g. dos2unix may not be
# installed); every entry is whitespace-trimmed again when it is read.
# The file name is single-quoted for the shell so that spaces or shell
# metacharacters in it cannot change the command that is executed.
(my $filein_quoted = $filein) =~ s/'/'\\''/g;
system("dos2unix '$filein_quoted' >/dev/null 2>&1");

# Three-argument open keeps the file names from being parsed for mode
# characters. The bareword handles LOG and FILE are kept because the rest of
# the script reads and writes through them.
open(LOG, '>', $log) or die "ERROR: can't create the log file $log: $!\n";
open(FILE, '<', $filein) or die "ERROR: can't open the $filein!\n$usage1";

use File::Copy qw(copy);
use File::Path qw(remove_tree);

############################################
### Main loop: for every entry of the list file, stage the FASTA file in
### ./tmp/<acc>, BLAST it against the seven MLST locus databases and run the
### parsing and summary scripts.
###
### <acc> is the list entry with everything from its last ".f" removed.
### Problems with one entry are reported on STDOUT and in the log, and the
### loop then moves on to the next entry.
############################################

# Loci typed by the scheme; each one has a BLAST database ./data/<locus>.
my @mlst_loci   = qw(gapA infB mdh pgi phoE rpoB tonB);
my $num_threads = 30;

# Quote one argument for /bin/sh so that spaces and metacharacters in file
# names are passed to the program literally.
my $sh_quote = sub {
	my ($arg) = @_;
	$arg =~ s/'/'\\''/g;
	return "'$arg'";
};

# Send the same message to STDOUT and to the log file.
my $report = sub {
	my ($message) = @_;
	print $message;
	print LOG $message;
};

STRAIN:
while (my $line = <FILE>) {
	chomp $line;
	$strain[$i] = trim($line);
	my $seqinput = $strain[$i]; #query files
	$i++;

	# Blank lines (e.g. a trailing empty line in the list) are not entries.
	next STRAIN if $seqinput eq '';

	# Accession = entry name up to (not including) its last ".f".
	my ($acc) = $seqinput =~ /(.+)\.f/;
	if (!defined $acc) {
		$report->("For $seqinput: \n ERROR: $seqinput is not a standard FASTA file!\n");
		print $usage1;
		next STRAIN;
	}

	# The working directory ./tmp/<acc> is deleted and recreated below, so an
	# entry must never be able to climb out of ./tmp via a ".." component.
	if ($acc =~ m{(?:\A|/)\.\.(?:/|\z)}) {
		$report->("For $seqinput: \n ERROR: $seqinput must not contain '..' path components!\n");
		next STRAIN;
	}

	# Prefer the file exactly as named in the list; otherwise fall back to
	# the other supported extensions (.fna, then .fasta, then .fas).
	my @search_order = ("./input/$acc.fna", "./input/$acc.fasta", "./input/$acc.fas");
	if ($seqinput eq "$acc.fna" || $seqinput eq "$acc.fasta" || $seqinput eq "$acc.fas") {
		unshift @search_order, "./input/$seqinput";
	}
	my ($source_fna) = grep { -e $_ } @search_order;
	if (!defined $source_fna) {
		$report->("For $acc: \n ERROR: Can not find the FASTA file of $acc in ./input directory!\n");
		next STRAIN;
	}

	# Start from an empty, world-writable working directory.
	my $tmp_path = "./tmp/$acc";
	remove_tree($tmp_path) if -e $tmp_path;
	if (!mkdir $tmp_path) {
		$report->("For $acc: \n ERROR: Can not create the working directory $tmp_path: $!\n");
		next STRAIN;
	}
	chmod 0777, $tmp_path;
	print LOG "Work on $acc\n";

	# The staged copy always carries the .fna extension.
	my $candidate_fna = "$tmp_path/$acc.fna";
	if (!copy($source_fna, $candidate_fna)) {
		$report->("For $acc: \n ERROR: Can not copy $source_fna to $candidate_fna: $!\n");
		next STRAIN;
	}

	if (check_seq_type($candidate_fna) ne 'dna') {
		$report->("For $seqinput: \n ERROR: $seqinput is not a standard FASTA file!\n");
		print $usage1;
		next STRAIN;
	}

	print "For $acc: Searching for the allele number of gapA, infB, mdh, pgi, phoE, rpoB, tonB of the strain of Klebsiella pneumoniae...\n";

	# One blastn run per locus; each report goes to ./tmp/<acc>/<locus>.out.
	# A shell is still needed for the output redirection, hence the quoting.
	for my $locus (@mlst_loci) {
		my $cmd = join ' ',
			'./tool/blastn',
			'-query', $sh_quote->($candidate_fna),
			'-db', "./data/$locus",
			'-evalue', $evalue,
			'-num_threads', $num_threads,
			'-max_hsps 1 -num_descriptions 1 -num_alignments 1',
			'-out', $sh_quote->("$tmp_path/$locus.out"),
			'>/dev/null 2>&1';
		if (system($cmd) != 0) {
			print LOG "WARNING: blastn against $locus failed for $acc (wait status $?)\n";
		}
	}

	# Post-processing scripts get the accession as their only argument. The
	# list form of system() bypasses the shell, so no quoting is required.
	for my $script ('./script/parse_BLASTn.pl', './script/MLST_summary.pl') {
		if (system('perl', $script, $acc) != 0) {
			print LOG "WARNING: $script failed for $acc (wait status $?)\n";
		}
	}
}

close FILE;
close LOG or warn "WARNING: can't close the log file: $!\n";

sub trim {
############################################
###  Return a copy of the given string with leading and trailing
###  whitespace (spaces, tabs, CR, LF, ...) removed.
###  Whitespace inside the string is left untouched.
############################################
	my ($text) = @_;
	$text =~ s/\A\s+|\s+\z//g;
	return $text;
}

sub check_seq_type{
############################################
### Classify the sequences of a FASTA file.
###
### Argument: path of the file to inspect.
### Returns:  'dna'     if every inspected record has alphabet 'dna',
###           'protein' if every inspected record has alphabet 'protein',
###           'unknown' otherwise - including a missing file and a file
###                     that contains no sequence record at all.
### Only the first $max_check_times records are inspected.
############################################
	use Bio::SeqIO;
	use Bio::Seq;

	my $file = shift;

	# Nothing to inspect: report 'unknown' instead of handing a bad path
	# to the parser.
	return 'unknown' unless defined $file && -e $file;

	my $seqin = Bio::SeqIO->new(-file => "<$file",
								-format => 'Fasta');
	my $max_check_times = 5;
	my $n = 0;
	my %type_count = ('protein'=>0,'dna'=>0);
	while(my $seq = $seqin->next_seq()){
		$n++;
		# Tally the alphabet reported by BioPerl for this record.
		my $type=$seq->alphabet;
		$type_count{$type}++;
		last if $n >= $max_check_times;
	}

	# With zero records both counters equal $n (0 == 0), so an empty file
	# would otherwise be classified as 'dna'.
	return 'unknown' if $n == 0;

	if($type_count{'dna'}==$n){
		return 'dna';
	}elsif($type_count{'protein'}==$n){
		return 'protein';
	}else{
		return 'unknown';
	}
}
# Explicit end of the script; exit without an argument uses status 0.
exit;
