Commands used to build SpPtBB. (Last updated 11/29/2018) #Note the three cycles of fgap_wrapper.sh followed by dedupe.sh. #Each time gaps are filled there is a new possibility to remove a sequence #from one of the haplotypes, and doing so reduces erroneous assembly later on #of the type where both haplotype sequences are incorporated into the genome, #rather than eliminating one of the two. #commands were run on two machines, bassoon and trombone, in several different #directories. export TTOPDIR=/home/mathog export BTOPDIR=/home/mathog export BOTHER=/scratch1/mathog/trombone_mathog export BOTHER2=/scratch1/mathog/oboe_mathog/Lv/P_RNA_scaffolder ##get and install platanus and its trim programs cd $BTOPDIR/src wget -O platanus-1.2.4 http://platanus.bio.titech.ac.jp/?ddownload=145 cd $BTOPDIR/bin wget -O platanus_pe_trim http://platanus.bio.titech.ac.jp/?ddownload=153 wget -O platanus_mp_trim http://platanus.bio.titech.ac.jp/?ddownload=154 ##get and install OPERA-LG #download from sourceforge: # https://sourceforge.net/projects/operasf/files/OPERA-LG%20version%202.0.6/ #install in $BTOPDIR/src #patch one perl script and create a PacBIO specific version. Some of this #may be in newer OPERA-LG versions: cat >/tmp/olg_patches.txt <<'EOD' --- OPERA-long-read.pl 2018-01-24 11:12:15.952938478 -0800 +++ OPERA-pacbio-read.pl 2018-02-08 15:24:04.893806637 -0800 @@ -5,7 +5,19 @@ my $min_contig_size = 300; my $min_opera_contig_size = 500; my $long_read_cluster_threshold = 2; -for($i = 0; $i < 6; $i++){$cluster_threshold_tab[$i] = $long_read_cluster_threshold;} +# sd and mean for fake end pairs derived from long reads. +my @stds = (100, 333, 667, 1333, 2000, 2667, 3333, 4667); +my @means = (300, 1000, 2000, 4000, 6000, 8000, 10000, 14000); +my %inter = (); +my $num_it = @stds; #used elsewhere for the number of loop iterations. +for($i = 0; $i < $num_it; $i++){ + $inter{"i${i}"} = [$stds[$i],$means[$i]]; +} +# For some reason original script had i0 with -200 for stds, so change that one +$inter{"i0"} = [-200,$means[0]]; + + +for($i = 0; $i < $num_it; $i++){$cluster_threshold_tab[$i] = $long_read_cluster_threshold;} my $short_read_cluster_threshold = 3; #my @cluster_threshold_tab = (2,2,2,2,1,1); #my @cluster_threshold_tab = (2,2,2,2,2,1); @@ -27,6 +39,9 @@ #my $mapper_extention my $graphmapDir = "/mnt/software/stow/graphmap-0.3.0-1d16f07/bin/"; +#for emitting more log information +my $command; + #Used in case of grapmap mapping my @contig_id_to_name = (); @@ -39,7 +54,7 @@ my $short_read_tooldir = ""; my $samtools_dir = ""; my $short_read_maptool = "bwa"; -my $kmer_size = 100; +my $kmer_size = 61; my $flag_help; @@ -59,6 +74,7 @@ --opera: Folder which contains opera binary (default PATH) --illumina-read1: fasta file of illumina read1 --illumina-read2: fasta file of illumina read2 + --upto-config: stop after config is generated, before OPERA-LG runs --help : prints this message "; @@ -82,6 +98,7 @@ "short-read-tooldir=s" => \$short_read_tooldir, "illumina-read1=s" => \$illum_read1, "illumina-read2=s" => \$illum_read2, + "upto-config+" => \$upto_config, "help" => \$flag_help ) or die("Error in command line arguments.\n"); @@ -154,7 +171,7 @@ chdir( $outputDir ); my $str_full_path = "or please enter the full path"; -if ( ! -e $contigFile ) {die "\nError: $contigFile - contig file does not exist $str_full_path\n"}; +if ( ! -e $contigFile ) {die "\nError: --num-of-processors $nproc $contigFile - contig file does not exist $str_full_path\n"}; if ( ! -e $readsFile ) {die "\nError: $readsFile - long read file does not exist $str_full_path\n"}; if ( ! -e $illum_read1 ) {die "\nError: $illum_read1 - illumina read 1 file does not exist $str_full_path\n"}; if ( ! -e $illum_read2 ) {die "\nError: $illum_read2 - illumina read 2 file does not exist $str_full_path\n"}; @@ -172,7 +189,7 @@ $str_path_dir = ""; $str_path_dir .= "--tool-dir $short_read_tooldir" if($short_read_tooldir ne ""); $str_path_dir .= "--samtools-dir $samtools_dir" if($samtools_dir ne ""); - run_exe("perl $operaDir/preprocess_reads.pl --map-tool $short_read_maptool $str_path_dir --contig $contigFile --illumina-read1 $illum_read1 --illumina-read2 $illum_read2 --out ${file_pref}.bam"); + run_exe("perl $operaDir/preprocess_reads.pl --num-of-processors $nproc --map-tool $short_read_maptool $str_path_dir --contig $contigFile --illumina-read1 $illum_read1 --illumina-read2 $illum_read2 --out ${file_pref}.bam"); } if(! -e "$file_pref.map.sort"){ @@ -272,7 +289,7 @@ } #Filter the long read edges -for (my $i = 0; $i <= 5; $i++){ +for (my $i = 0; $i < $num_it; $i++){ $edge_file = $all_edge_file."_i$i"; $updated_edge_file = $all_edge_file."_no_repeat_i$i"; push(@allEdgeFiles, $updated_edge_file); @@ -296,22 +313,21 @@ &CreateConfigFile( $contigFile, "", @allEdgeFiles ); # run opera -&run_exe( "${operaDir}OPERA-LG config > log" ); +if (defined($upto_config)) { + $time = localtime; + print STDERR "[$time] config file created, exiting without running OPERA-LG\n"; + exit 0; +} +&run_exe( "$operaDir/OPERA-LG config > log" ); #Link to the result file &run_exe("ln -s results/scaffoldSeq.fasta scaffoldSeq.fasta"); sub extract_edge{ my ($all_edge_file) = @_; + $time = localtime; + print STDERR "[$time] in sub extract_edge\n"; - my %inter = ( - "i0", [-200, 300], - "i1", [300, 1000], - "i2", [1000, 2000], - "i3", [2000, 5000], - "i4", [5000, 15000], - "i5", [15000, 40000] - ); my %out_edge = (); foreach $it (keys %inter){ @@ -327,7 +343,9 @@ @line = split(/\t/, $_); $dist = $line[4]; $support = $line[6]; - foreach $it (keys %inter){ +# foreach $it (keys %inter){ + for($i = 0; $i < $num_it; $i++){ + $it = "i${i}"; if($support >= 0 && $inter{$it}->[0] < $dist && $dist < $inter{$it}->[1]){ $OUT = $out_edge{$it}; print $OUT join("\t", @line)."\n"; @@ -499,6 +517,8 @@ } sub checkMapping{ + $time = localtime; + print STDERR "[$time] in sub createOutputFolder\n"; my ($mapFile, $all_edge_file) = @_; my ($rn, $cn, $score, $unused, $ro, $rs, $re, $rl, $co, $cs, $ce, $cl); my $currentScore = 0;my $previousScore = 0; @@ -636,6 +656,8 @@ sub CreateConfigFile{ my( $contigFile, $suffix, @edgeFiles) = @_; + $time = localtime; + print STDERR "[$time] in sub CreateConfigFile\n"; open( CONF, ">config".$suffix ) or die $!; @@ -666,8 +688,6 @@ } $i = 0; - @means = (300, 1000, 2000, 5000, 15000, 40000); - @stds = (30, 100, 200, 500, 1500, 4000); foreach $edgeFileName ( @edgeFiles ){ print CONF "[LIB]\n"; #print CONF "cluster_threshold=$cluster_threshold\n"; @@ -687,6 +707,6 @@ sub run_exe{ my ($exe) = @_; $run = 1; - print STDERR $exe."\n";; + print STDERR "running command: $exe\n"; print STDERR `$exe` if($run); } --- preprocess_reads.pl.dist 2018-01-25 14:09:22.035548937 -0800 +++ preprocess_reads.pl 2018-01-26 16:58:02.056919925 -0800 @@ -15,12 +15,14 @@ --out Name of the output file --map-tool Mapping tool can be either bwa (default) or bowtie --tool-dir Directory that contains binaries to the chosen mapping tool (default PATH) - --samtools-dir Directory that contains samtools binaries (default PATH) + --samtools-dir Directory that contains samtools binaries (default PATH) + --num-of-processors number of processors used for mapping stages --help Help "; my $path = ""; my $samtoolsDir = ""; +my $nproc=1; GetOptions( @@ -31,12 +33,12 @@ "map-tool=s" => \$mapTool, "tool-dir=s" => \$path, "samtools-dir=s" => \$samtoolsDir, + "num-of-processors=i" => \$nproc, "help" => \$flag_help ) or die("Error in command line arguments.\n"); - if ( $flag_help ){ print $help_message; exit 0; @@ -133,6 +135,8 @@ sub createOutputFolder { + $time = localtime; + print STDERR "[$time] in sub createOutputFolder\n"; @dir = split( "/", $outputFile ); $folder = ""; for( $i = 0; $i < @dir - 1; $i++ ) @@ -148,6 +152,8 @@ sub checkReadFormat { + $time = localtime; + print STDERR "[$time] in sub checkReadFormat\n"; if( $readFile1 =~ /\.gz$/ ){ # open gzip file open( READ, "gunzip -c $readFile1 |" ) or die $!; @@ -182,6 +188,7 @@ { # build the index $time = localtime; + print STDERR "[$time] in sub buildIndexUsingBowtie\n"; print "[$time]\t"; print "Building bowtie index...\n"; @contigName = split( "/", $contigFile ); @@ -190,11 +197,13 @@ if( $type eq "cs" ) { $command = "${path}bowtie-build -C $contigFile $folder$contigName[ -1 ]"; + print "constructed command: $command\nExecuting command\n"; `$command` or die "ERROR! Running bowtie-build error.\n"; } else { $command = "${path}bowtie-build $contigFile $folder$contigName[ -1 ]"; + print "constructed command: $command\nExecuting command\n"; `$command` or die "ERROR! Running bowtie-build error.\n"; } } @@ -207,6 +216,7 @@ sub mapWithBowtie { $time = localtime; + print STDERR "[$time] in sub mapWithBowtie\n"; print "[$time]\t"; print "Mapping reads using bowtie...\n"; if( $type eq "cs" ) @@ -217,11 +227,13 @@ { $command = "${path}bowtie -v 3 -a -m 1 -S -t $fasta -p 15 $folder$contigName[ -1 ] - 2>${folder}bowtie.err | sort -n > $folder$outputFile"; } + print "constructed command: $command\n"; } sub buildIndexUsingBwa { $time = localtime; + print STDERR "[$time] in sub buildIndexUsingBwa\n"; print "[$time]\t"; print "Building bwa index...\n"; @contigName = split( "/", $contigFile ); @@ -230,12 +242,13 @@ if( $type eq "cs" ) { $command = "${path}bwa index -c -p $folder$contigName[ -1 ] $contigFile"; + print "constructed command: $command\nExecuting command\n"; `$command`; # or die "ERROR! Running bwa index error.\n"; } else { $command = "${path}bwa index -p $folder$contigName[ -1 ] $contigFile"; - print $command."\n"; + print "constructed command: $command\nExecuting command\n"; `$command`; # or die "ERROR! Running bwa index error.\n"; } } @@ -247,7 +260,9 @@ sub readAndMap { - if( $readFile1 =~ /\.gz$/ ){ + $time = localtime; + print STDERR "[$time] in sub readAndMap\n"; + if( $readFile1 =~ /\.groot 62551 5389 0 14:06 ? 00:0z$/ ){ # open gzip file open( F, "gunzip -c $readFile1 |" ) or die $!; open( S, "gunzip -c $readFile2 |" ) or die $!; @@ -257,6 +272,7 @@ open( S, "$readFile2" ) or die $!; } + print "Executing command with piped input\n"; open( OUTPUT, "|-", $command ) or die $!; $num = 1; while( $first = ) @@ -316,20 +332,24 @@ { # align with bwa $time = localtime; + print STDERR "[$time] in sub findSAWithBwa\n"; print "[$time]\t"; print "Finding the SA coordinates of the reads using BWA aln...\n"; - $command = "${path}bwa aln -t 30 $folder$contigName[ -1 ] - > ${folder}read.sai"; + $command = "${path}bwa aln -t $nproc $folder$contigName[ -1 ] - > ${folder}read.sai"; + print "constructed command: $command\n"; } sub generateAlignmentUsingBwa { $time = localtime; + print STDERR "[$time] in sub generateAlignmentUsingBwa\n"; print "[$time]\t"; print "Generate alignments of reads using bwa sampe...\n"; # $command = "${path}bwa samse -n 1 $folder$contigName[ -1 ] ${folder}read.sai - | awk \'{ if( \$3 != \"*\" ) print \$0 }\' > $folder$outputFile"; # $command = "${path}bwa samse -n 1 $folder$contigName[ -1 ] ${folder}read.sai - | grep '\\(^@\\|XT:A:U\\)' | samtools view -S -h -b -F 0x4 - | samtools sort -no - ${folder}temporarySam | samtools view -h -b - > $folder$outputFile"; - $command = "${path}bwa samse -n 1 $folder$contigName[ -1 ] ${folder}read.sai - | grep '\\(^@\\|XT:A:U\\)' | ${samtoolsDir}samtools view -S -h -b -F 0x4 - | ${samtoolsDir}samtools sort -\@ 20 -no - ${folder}temporarySam > $folder$outputFile"; - print $command."\n"; + ## $command = "${path}bwa samse -n 1 $folder$contigName[ -1 ] ${folder}read.sai - | grep '\\(^@\\|XT:A:U\\)' | ${samtoolsDir}samtools view -S -h -b -F 0x4 - | ${samtoolsDir}samtools sort -\@ 20 -no - ${folder}temporarySam > $folder$outputFile"; + $command = "${path}bwa samse -n 1 $folder$contigName[ -1 ] ${folder}read.sai - | grep '\\(^@\\|XT:A:U\\)' | ${samtoolsDir}samtools view -S -h -b -F 0x4 - | ${samtoolsDir}samtools sort -\@ 20 -no - > $folder$outputFile"; + print "constructed command: $command\n"; } sub clearBowtie EOD cd $BTOPDIR/src/OPERA-LG_v2.0.6/bin patch -p0 ppe_trim.log 1>&2 & platanus_pe_trim pe_400_R1.fastq pe_400_R2.fastq 2>ppe_trim.log 1>&2 & platanus_mp_trim SRR446979_1.fastq SRR446979_2.fastq >pmp_79.log 2>&1 & platanus_mp_trim SRR446980_1.fastq SRR446980_2.fastq >pmp_80.log 2>&1 & platanus_mp_trim SRR446981_1.fastq SRR446981_2.fastq >pmp_81.log 2>&1 & ln -s SRR446979_1.fastq.int_trimmed SRR446979_1.trimmed.fastq ln -s SRR446979_2.fastq.int_trimmed SRR446979_2.trimmed.fastq ln -s SRR446980_1.fastq.int_trimmed SRR446980_1.trimmed.fastq ln -s SRR446980_2.fastq.int_trimmed SRR446980_2.trimmed.fastq ln -s SRR446981_1.fastq.int_trimmed SRR446981_1.trimmed.fastq ln -s SRR446981_2.fastq.int_trimmed SRR446981_2.trimmed.fastq ln -s pe_400_R1.fastq.trimmed pe_400_R1.trimmed.fq ln -s pe_400_R2.fastq.trimmed pe_400_R2.trimmed.fq #platanus assemble, on Trombone 02/09/2018 mkdir -p $TTOPDIR/platanus/run1 cd $TTOPDIR/platanus/run1 platanus assemble -o SpPtB -f \ ../../SPUR_datasets/pe_400_R[12].trimmed.fq \ -t 40 -m 256 -u 1.0 2>assemble.log 1>&1 #makes SpPtB_contig.fa #platanus scaffold, on Trombone ending 02/11/2018 # add mate pairs and Sanger pairs cd $TTOPDIR/platanus/run1 nice platanus scaffold -o SpPtB -c SpPtB_contig.fa -b SpPtB_contigBubble.fa \ -IP1 ../../SPUR_datasets/pe_400_R1.fastq.trimmed ../../SPUR_datasets/pe_400_R2.fastq.trimmed \ -IP2 ../../SPUR_datasets/sp_sanger_T8_single_for.fastq ../../SPUR_datasets/sp_sanger_T8_single_rev.fastq \ -IP3 ../../SPUR_datasets/sp_sanger_T8_1999_for.fastq ../../SPUR_datasets/sp_sanger_T8_1999_rev.fastq \ -IP4 ../../SPUR_datasets/sp_sanger_T8_2000_for.fastq ../../SPUR_datasets/sp_sanger_T8_2000_rev.fastq \ -IP5 ../../SPUR_datasets/sp_sanger_T8_3000_for.fastq ../../SPUR_datasets/sp_sanger_T8_3000_rev.fastq \ -IP6 ../../SPUR_datasets/sp_sanger_T8_3500_for.fastq ../../SPUR_datasets/sp_sanger_T8_3500_rev.fastq \ -IP7 ../../SPUR_datasets/sp_sanger_T8_4500_for.fastq ../../SPUR_datasets/sp_sanger_T8_4500_rev.fastq \ -IP8 ../../SPUR_datasets/sp_sanger_T8_5000_for.fastq ../../SPUR_datasets/sp_sanger_T8_5000_rev.fastq \ -IP9 ../../SPUR_datasets/sp_sanger_T8_6000_for.fastq ../../SPUR_datasets/sp_sanger_T8_6000_rev.fastq \ -OP10 ../../SPUR_datasets/SRR446979.1.trimmed.fq ../../SPUR_datasets/SRR446979.2.trimmed.fq \ -OP11 ../../SPUR_datasets/SRR446980.1.trimmed.fq ../../SPUR_datasets/SRR446980.2.trimmed.fq \ -OP12 ../../SPUR_datasets/SRR446981.1.trimmed.fq ../../SPUR_datasets/SRR446981.2.trimmed.fq \ -t 40 -u 1.0 >scaffoldB.log 2>&1 #makes SpPtB_scaffold.fa #platanus gap_close, on Trombone ending 02/13/2018 cd $TTOPDIR/platanus/run1 nice platanus gap_close -o SpPtB -c SpPtB_scaffold.fa \ -f ../../SPUR_datasets/sp_sanger_T8_single_for.fastq \ -IP1 ../../SPUR_datasets/pe_400_R1.fastq.trimmed ../../SPUR_datasets/pe_400_R2.fastq.trimmed \ -IP2 ../../SPUR_datasets/sp_sanger_T8_1999_for.fastq ../../SPUR_datasets/sp_sanger_T8_1999_rev.fastq \ -IP3 ../../SPUR_datasets/sp_sanger_T8_2000_for.fastq ../../SPUR_datasets/sp_sanger_T8_2000_rev.fastq \ -IP4 ../../SPUR_datasets/sp_sanger_T8_3000_for.fastq ../../SPUR_datasets/sp_sanger_T8_3000_rev.fastq \ -IP5 ../../SPUR_datasets/sp_sanger_T8_3500_for.fastq ../../SPUR_datasets/sp_sanger_T8_3500_rev.fastq \ -IP6 ../../SPUR_datasets/sp_sanger_T8_4500_for.fastq ../../SPUR_datasets/sp_sanger_T8_4500_rev.fastq \ -IP7 ../../SPUR_datasets/sp_sanger_T8_5000_for.fastq ../../SPUR_datasets/sp_sanger_T8_5000_rev.fastq \ -IP8 ../../SPUR_datasets/sp_sanger_T8_6000_for.fastq ../../SPUR_datasets/sp_sanger_T8_6000_rev.fastq \ -OP9 ../../SPUR_datasets/SRR446979.1.trimmed.fq ../../SPUR_datasets/SRR446979.2.trimmed.fq \ -OP10 ../../SPUR_datasets/SRR446980.1.trimmed.fq ../../SPUR_datasets/SRR446980.2.trimmed.fq \ -OP11 ../../SPUR_datasets/SRR446981.1.trimmed.fq ../../SPUR_datasets/SRR446981.2.trimmed.fq \ -t 40 >gap_fillB.log 2>&1 & #SpPtB_gapClosed.fa #on bassoon 02/15/2018 #make a script fgap_wrapper.sh which runs fgap in parallel and only # on sequences which contain N's (so have a gap to close). It also # retrieves megareads efficiently using the fasta_to_binary and # binary_to_fasta programs. This is a slightly updated # version of the program which was used (different error checking # results should be the same.) cat >$BTOPDIR/bin/fgap_wrapper.sh <<'EOD' #!/bin/bash # 2018/02/26 David Mathog, mathog@caltech.edu. # # This is a wrapper for fgap which runs only the subset of # scaffolds which: # 1. contain N's # 2. had corrected PacBIO reads map to flanking regions for those N's. # It is MUCH faster than letting fgap run on a big genome, which for the 1Gb # one I tried ran for 6 days, most of it with no output, and which was # finally terminated without having emitted a result. # # Method: # Run this script. Providing the parameters: # scaffold_file (fasta) # corrected long reads (fasta) # number of threads # longest sequence allowed (buffer size to allocate) # # fgap is run with -z 1 -g 1. # -z 1 lets it remove just the N's without entering anything else. # -g 1 lets it trim off a bit on the ends flanking the Ns. For some # reason that is needed on sequences like this: # .......AAAAAAAAAAANNNNNNNAGAAAAA.... # Most likely because the blastn extensions span the short NNN and so the # mapped corrected pacbio positions APPEAR to overlap, like: # 100 230 and 200 300 but this is an alignment issue resulting from # doing each side separately, not a real overlap. # ### edit this section if paths change export LONGP=$BTOPDIR/src/MATLAB/MATLAB_Compiler_Runtime/v717 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:\ $LONGP/runtime/glnxa64:\ $LONGP/bin/glnxa64:\ $LONGP/sys/os/glnxa64:\ $LONGP/sys/java/jre/glnxa64/jre/lib/amd64/native_threads:\ $LONGP/sys/java/jre/glnxa64/jre/lib/amd64/server:\ $LONGP/sys/java/jre/glnxa64/jre/lib/amd64 PFGAP=$BTOPDIR/src/gapFinisher/FGAP/fgap ABATCH=16 # map lines per batch in each thread MINSCORE=50 # also minimum overlap ### test_exec() { THEFILE=`which $1` if [ ! -x $THEFILE ] then echo "fgap_wrapper.sh: fatal error: required binary $THEFILE is not found or executable" exit fi } test_read() { THEFILE=$1 if [ ! -r $THEFILE ] then echo "fgap_wrapper.sh: fatal error: required input $THEFILE is not found or readable" exit fi } do_one_set(){ CHUNK=$1 START=$2 LAST=$3 MAPF=../map_scf_read.txt SCFF=../scf_db CPBF=../cpb_db mkdir chunk_$CHUNK cd chunk_$CHUNK NOW=`date` echo "$NOW thread $CHUNK range $START $LAST" >>all_out.log while [ $START -le $LAST ] do FINAL=$(( $START + $ABATCH - 1 )) if [ $FINAL -ge $LAST ] then FINAL=$LAST fi NOW=`date` echo "$NOW thread $CHUNK processing batch $START $FINAL" >>all_out.log extract -in $MAPF -sr $START -er $FINAL -mt -dl ' ' -fmt '[1]' -wl $MAXMAPLEN \ | binary_to_fasta -in $SCFF -sf 1 -of 2 >scf.fasta 2>/dev/null extract -in $MAPF -sr $START -er $FINAL -mt -dl ' ' -fmt '[2,]' -dv '\n' -wl $MAXMAPLEN \ | sort -n | uniq \ | binary_to_fasta -in $CPBF -sf 1 -of 2 >reads.fasta 2>/dev/null # # do NOT use -z 1 with -g 1, some sequences will take HOURS to process!!!! # $PFGAP -d scf.fasta -a reads.fasta -R 2500 -I 3500 -t 1 -s 50 -z 1 >>all_out.log 2>&1 >all_out.log if [ -e output_fgap.stats ] then cat output_fgap.stats >> all_out.stats rm output_fgap.stats else echo "FAILURE: no output_fgap.stats was created" >> all_out.stats fi if [ -e output_fgap.final.fasta ] then mv output_fgap.final.fasta scf.fasta # # negative ones that can really be replaced are generally small so set -R low. # Otherwise -g 1 may sometimes remove and replace large swaths of bases # which are not Ns. # $PFGAP -d scf.fasta -a reads.fasta -R 200 -I 500 -t 1 -s 50 -p 0 -g 1 >>all_out.log 2>&1 >all_out.log if [ -e output_fgap.stats ] then cat output_fgap.stats >> all_out.stats rm output_fgap.stats else echo "FAILURE: no output_fgap.stats was created" >> all_out.stats fi if [ -e output_fgap.final.fasta ] then cat output_fgap.final.fasta >>all_out.fasta rm -f output_fgap.final.fasta else # do not log failures here because negative gaps are pretty rare so many will "fail" cat scf.fasta >>all_out.fasta # results from first round are still good, copy them fi rm scf.fasta else echo "$NOW BATCH FAILURE -z 1: Some issue with this set or no gaps could be filled." >> all_out.log fi # # If no gaps are filled output_fgap.final.fasta is not written. # If it is not removed afterwards that scaffold will be entered twice into the final list. # Logic above handles these cases. # START=$(( $FINAL + 1 )) done rm scf.fasta NOW=`date` echo "$NOW thread $CHUNK range completed" >>all_out.log } NOW=`date` echo "$NOW processing begins" SCAFFOLDS=$1 CORRECTED=$2 THREADS=$3 MAXSEQ=$4 test_exec binary_to_fasta test_exec extract test_exec seq test_exec fasta2fgap test_exec fastafrag test_exec blastn test_exec makeblastdb test_exec many_blast_plus_1cpu.sh test_read $SCAFFOLDS test_read $CORRECTED #the matlab runtime will sink a hook into X11 if it can find it. If the starting #session then exits so will the runtime. So hide X11 from the matlab runtime. if [ $DISPLAY ] then unset DISPLAY fi WORKDIR=tmp_fgap mkdir -p $WORKDIR NOW=`date` if [ -e $WORKDIR/datasets.fasta ] then echo "$NOW using EXISTING converted corrected reads datasets.fasta" else # # corrected reads. Differs very slightly from FGAP in that it # retains the original header after a space. blastn will drop that # extra text, so final blastn.out will be the same. # echo "$NOW converting corrected reads to datasets.fasta" fastafrag -length 1000000 -outlen 60 -prefix 'dataset_1_%d ' -upper \ <$CORRECTED >$WORKDIR/datasets.fasta fi NOW=`date` if [ -e $WORKDIR/draft_regions.fasta ] then echo "$NOW using EXISTING scaffolds to draft_regions.fasta" else echo "$NOW converting scaffolds to draft_regions.fasta" fasta2fgap -in $SCAFFOLDS -f 300 -m $MINSCORE -d 1 > $WORKDIR/draft_regions.fasta fi cd $WORKDIR NOW=`date` if [ -e datasets.00.nsq ] then echo "$NOW using EXISTING blast database made from datasets.fasta" else echo "$NOW making blast database out of datasets.fasta" makeblastdb -dbtype nucl -in datasets.fasta -title datasets -out datasets fi NOW=`date` if [ -e blastn.out ] then echo "$NOW using EXISTING blastn.out" else echo "$NOW running blastn for draft_regions vs. datasets to make blastn.out" BLASTPARAMS="-db datasets -task blastn -min_raw_gapped_score $MINSCORE -evalue 1e-07 -perc_identity 70 -word_size 15 -gapopen 1 -gapextend 1 -penalty -3 -reward 1 -max_target_seqs 200 -outfmt \"6 qseqid sseqid score bitscore evalue pident nident mismatch sstrand qstart qseq qend sstart sseq send qlen length qcovhsp\"" # PROG=$1 # INFILE=$2 # OUTFILE=$3 # NCPUS=$4 # BLASTPARAMS=$5 many_blast_plus_1cpu.sh blastn draft_regions.fasta blastn.out $THREADS "$BLASTPARAMS" fi cd .. #back in top level # # next command makes a file with structure: # scaffold read1 .... readN # For an assembly with very large scaffolds and lots of repeats # the default of 50M might not be enough. That should be enough to # hold over 6M reads if they take <=8 characters each. # NOW=`date` if [ -e map_scf_read.txt ] then echo "$NOW using EXISTING making map_scf_read.txt" else echo "$NOW making map_scf_read.txt from blastn.out" extract -in $WORKDIR/blastn.out -mt -dl ' \t_' -fmt '[fw6:jr:2] [7]' \ | sort -k 1,1n -k2,2n \ | uniq \ | extract -merge 6 -wl 50000000 >map_scf_read.txt test_read map_scf_read.txt fi NOW=`date` if [ -e cpb_db.hea ] then echo "$NOW using EXISTING cpb_db fasta_to_binary db from $CORRECTED" else echo "$NOW making fasta_to_binary db from $CORRECTED" fasta_to_binary -in $CORRECTED -out cpb_db -ls -ms 500000 test_read cpb_db.hea #one file from binary database of corrected PacBIO reads fi NOW=`date` if [ -e scf_db.hea ] then echo "$NOW using EXISTING scf_db fasta_to_binary db from $SCAFFOLDS" else echo "$NOW making fasta_to_binary db from $SCAFFOLDS" # # Replace ">header" with ">123 header". # This VASTLY simplifies some of the later steps because there is always # an easy to parse index right at the start of every header and we don't # need to poke around within the header itself. Also in some cases there # may not even be an index in the header which corresponds the the entry's # position in the fata file. # cat $SCAFFOLDS \ | fastafrag -prefix '%d ' \ | fasta_to_binary -in - -out scf_db -ls -ms 500000 test_read scf_db.hea #one file from binary database of scaffolds with gaps fi MAXMAPLEN=`cat map_scf_read.txt | wc -L ` MAXMAPLEN=$(( $MAXMAPLEN + 100)) #a few extra bytes are needed ORIGSCFN=`binary_to_fasta -in scf_db -sf 0 -of 2 2>/dev/null | grep '>' | wc -l` LINES=`cat map_scf_read.txt | wc -l` #lines in the map BLOCK=$(( 1 + $LINES / $THREADS )) #total lines from map processed per thread NOW=`date` echo "$NOW lines $LINES block $BLOCK threads $THREADS origscfN $ORIGSCFN" NOW=`date` if [ -e pre_mod_scaffolds.fasta ] then echo "$NOW using EXISTING pre_mod_scaffolds.fasta mod.log mod.stats" else echo "$NOW generating pre_mod_scaffolds.fasta mod.log mod.stats" I=1 MAXTHREADS=0 TSTART=1 while [ $I -le $THREADS ] do TEND=$(($TSTART + $BLOCK -1)) if [ $TSTART -gt $LINES ] then break else MAXTHREADS=$I fi if [ $TEND -gt $LINES ] then TEND=$LINES fi do_one_set $I $TSTART $TEND & TSTART=$(( $TEND + 1 )) I=$(( $I + 1)) done NOW=`date` echo "$NOW started threads: $MAXTHREADS" wait #assemble the pieces which it made truncate --size 0 pre_mod_scaffolds.fasta truncate --size 0 mod.log truncate --size 0 mod.stats I=1 while [ $I -le $MAXTHREADS ] do NOW=`date` echo "$NOW merging results from thread $I" SUBDIR=chunk_$I cat $SUBDIR/all_out.fasta >> pre_mod_scaffolds.fasta cat $SUBDIR/all_out.log >> mod.log cat $SUBDIR/all_out.stats >> mod.stats rm -rf $SUBDIR I=$(( $I + 1)) done fi NOW=`date` if [ -e scaffold_indices.txt ] then echo "$NOW using EXISTING scaffold_indices.txt" else seq 1 $ORIGSCFN > scaffold_indices.txt fi # # some of the scaffolds put into fgap will NOT have been patched at all. # If they were not they will not be in any of the all_out.fasta files # and so not in pre_mod_scaffolds.fasta. The next section grabs the # scaffold number from the header of that file, and it depends on # all scaffolds having the same format headers! # NOW=`date` if [ -e list_modified.txt ] then echo "$NOW using EXISTING list_modified.txt" else echo "$NOW creating list_modified.txt" extract -in pre_mod_scaffolds.fasta -if '^>' -ifonly -mt -dl '> ' -fmt '[fw10:jr:1]X' -wl $MAXMAPLEN >list_modified.txt extract -in scaffold_indices.txt -fmt '[fw10:jr:1,]' >>list_modified.txt fi NUMMODIFIED=`cat list_modified.txt | grep 'X' | wc -l` # # for speed extract all of the entries in one fell swoop which fgap didn't process. Collapse # the fasta files into single lines and sort on the first field. Then process back into fasta # form. # NOW=`date` if [ -e modified_scaffolds.txt ] && [ -e unmodified_scaffolds.txt ] then echo "$NOW using EXISTING modified_scaffolds.txt" else echo "$NOW creating modified_scaffolds.txt" sort -k 1,1n list_modified.txt | extract -merge 10 >runs_both grep -v X runs_both >unmodified_scaffolds.txt grep X runs_both | tr -d 'X' > modified_scaffolds.txt rm runs_both fi NOW=`date` if [ -e tmp_db.hea ] then echo "$NOW using EXISTING tmp_db" else echo "$NOW making tmp_db" # # Enter two sets of sequences first modified, then unmodified. # ( cat pre_mod_scaffolds.fasta ; \ binary_to_fasta -in scf_db -sel unmodified_scaffolds.txt -sf 1 -of 2 ) \ | fasta_to_binary -out tmp_db -ls -ms 500000 fi # # make the final sequence files. Remove the numeric index # from the headers. # cat modified_scaffolds.txt unmodified_scaffolds.txt \ | extract -in -,scaffold_indices.txt -indl ' ' \ | sort -k 1,1n \ | extract -mt -fmt '[2]' \ | binary_to_fasta -in tmp_db -out - -sf 1 -of 2 \ | extract -out all_scf.fasta -if '>' -mt -dl '> ' -fmt '>[2,]' -wl $MAXSEQ cat pre_mod_scaffolds.fasta \ | extract -out mod_scaffolds.fasta -if '>' -mt -dl '> ' -fmt '>[2,]' -wl $MAXSEQ NOW=`date` echo "$NOW cleaning up temporary files" # save these when debugging: change true to false if true then rm cpb_db.* rm scf_db.* rm tmp_db.* rm list_modified.txt fi NOW=`date` echo "$NOW Scaffolds which were processed: $MODSEL" echo "$NOW Scaffolds which were modified: $NUMMODIFIED" echo "$NOW List of processed scaffolds are in modified_scaffolds.txt" echo "$NOW Merged scaffold sequences, modified and unmodified, are in all_scf.fasta" echo "$NOW done" EOD #gap fill with FGAP, on bassoon 03/02/2018 #copy SpPtB_gapClosed.fa on Trombone to $BOTHER #Note: for some obscure reason this would not run without problems on Trombone, even # when the matlab runtime binary was installed there. Since the failure was # within a matlab based routine for which no source code was available the only # workaround was to run it on a machine where it happened to work. cd $BOTHER fgap_wrapper.sh SpPtB_gapClosed.fa \ /home/mathog/wgs_project/Spur_MEGAREADS3C/mr3C.fasta 40 50000000 \ >fgap_wrapperB.log 2>&1 mv all_scf.fasta SpPtB_gapClosedFGAP.fa #makes SpPtB_gapClosedFGAP.fa #deduplicate the gap filled sequences, on bassoon 03/03/2018 # tried many different combinations of parameters, this was best. # Because of the highly polymorphic sequences dedupe.sh does not work # very well. Even if the geometry is right the gaps/mismatches often # exceed the limits and the two haplotypes of a sequence will both persist. # Also it fails miserably when the two haplotype pieces overlap like this: # ------------ # ------------ # /home/mathog/src/bbmap/dedupe.sh \ in=SpPtB_gapClosedFGAP.fa \ out=SpPtB_gapClosedFGAPdedupe.fa \ outd=SpPtB_gapClosedFGAP_dups_removed.fa \ findoverlap=t minidentity=97 maxedits=12 \ >SpPtB_gapClosedFGAP_dedupe.log 2>&1 #makes SpPtB_gapClosedFGAPdedupe.fa #on Trombone ending 03/05/2018 mkdir $TTOPDIR/OPERA-LG cd $TTOPDIR/OPERA-LG #copy SpPtB_gapClosedFGAPdedupe.fa from Basson perl /home/mathog/src/OPERA-LG_v2.0.6/bin/OPERA-pacbio-read.pl \ --upto-config \ --num-of-processors 40 \ --kmer 31 \ --opera /home/mathog/src/OPERA-LG_v2.0.6/bin \ --contig-file /home/mathog/OPERA-LG/RUN3/SpPtB_gapClosedFGAPdedupe.fa \ --illumina-read1 /home/mathog/SPUR_datasets/pe_400_R1.trimmed.fq \ --illumina-read2 /home/mathog/SPUR_datasets/pe_400_R2.trimmed.fq \ --long-read-file /home/mathog/SPUR_datasets/pacbio_all_ccs.fasta \ --output-prefix olg_r3 \ --output-directory RUN3 \ >run3.log 2>&1 cd RUN3 # # OPERA-LG will need many bam and other files, run a script to make them. cat >make_all_bams.sh <<'EOD' #!/bin/bash # make bams for all pairs doit() { THIS=$1 seq1=$2 seq2=$3 NOW=`date` echo "$NOW starting $THIS $seq1 $seq2" nice perl /home/mathog/src/OPERA-LG_v2.0.6/bin/preprocess_reads.pl \ --num-of-processors 40 --map-tool bwa \ --contig /home/mathog/OPERA-LG/RUN1/SpPtB_gapClosed_dedupe.fa \ --illumina-read1 /home/mathog/SPUR_datasets/$seq1 \ --illumina-read2 /home/mathog/SPUR_datasets/$seq2 \ --out op_dd_$THIS.bam >make_${THIS}_bam.log 2>&1 & } type1() { doit $1 $1.1.trimmed.fq $1.2.trimmed.fq } type2() { doit $1 $1.1.fasta $1.2.fasta } type3() { doit $1 ${1}_for.fastq ${1}_rev.fastq } type1 SRR446979 type1 SRR446980 type1 SRR446981 type1 SRR446981 type2 mr3C_mates type3 sp_sanger_T8_1999 type3 sp_sanger_T8_2000 type3 sp_sanger_T8_3000 type3 sp_sanger_T8_3500 type3 sp_sanger_T8_4500 type3 sp_sanger_T8_5000 type3 sp_sanger_T8_6000 EOD chmod 755 make_all_bams.sh #make the many bam files. ends 03/04/2018 ./make_all_bams >make_all_bams.log 2>&1 # OPERA-LG needs a config file to reference all of these many # linkage files (BAM and otherwise). cat >run3_config <olg_r3.log 2>&1 #makes $TTOPDIR/OPERA-LG/RUN3/try1_results/scaffoldSeq.fasta #Trombone, finished 03/05/2018 cd $TTOPDIR/OPERA-LG/RUN3/try1_results mkdir FGAP cd FGAP fgap_wrapper.sh ../scaffoldSeq.fasta /home/mathog/SPUR_datasets/mr3C.fasta 30 50000000 \ >fgap_wrapperA.log 2>&1 mv all_scf.fasta SpPtB_gapClosedFGAPdedupeOLGFGAP.fa #makes SpPtB_gapClosedFGAPdedupeOLGFGAP.fa #Trombone, finished 03/06/2018 cd $TTOPDIR/OPERA-LG/RUN3/try1_results/FGAP /home/mathog/src/bbmap/dedupe.sh \ in=SpPtB_gapClosedFGAPdedupeOLGFGAP.fa \ out=SpPtB_gapClosedFGAPdedupeOLGFGAPdedupe.fa \ outd=SpPtB_gapClosedFGAPdedupeOLGFGAP_dups_removed.fa \ findoverlap=t minidentity=97 maxedits=12 \ >SpPtB_gapClosedFGAPdedupeOLGFGAP_dedupe.log 2>&1 #makes SpPtB_gapClosedFGAPdedupeOLGFGAPdedupe.fa #on bassoon ending 03/09/2018 # This reused a directory created for Lv work. Should have made a new one with a better path! cd $BOTHER2 #copy SpPtB_gapClosedFGAPdedupeOLGFGAPdedupe.fa from Trombone to $BOTHER2/SpPtB.fasta # # s_6_[12]_all_qseq.fq are the same RNA-seq as the reads from SRR532151 at the NCBI, but # they were extracted from the data from the sequencer (which was submitted to the NCBI), # not from the SRA. # nice ~/src/hisat2-2.1.0/hisat2-build SpPtB.fasta Sp_hisat nice ~/src/hisat2-2.1.0/hisat2 -x Sp_hisat -1 s_6_1_all_qseq.fq -2 s_6_2_all_qseq.fq \ -k 3 -p 40 --pen-noncansplice 1000000 -S Sp_input.sam > hisat_pass2.log 2>&1 & mkdir Sp_results nice /home/mathog/src/P_RNA_scaffolder/P_RNA_scaffolder.sh \ -d /home/mathog/src/P_RNA_scaffolder \ -i Sp_input.sam \ -j SpPtB.fasta \ -F s_6_1_all_qseq.fq -R s_6_2_all_qseq.fq \ -o $BOTHER2/Sp_results \ -s yes -t 40 >p_rna_Sp.log 2>&1 & #makes $BOTHER2/Sp_results/P_RNA_scaffold.fasta #Bassoon ending on 03/10/2018 # The copy of mr3C.fasta referenced here was the source for the one on Trombone in an earlier step. cd $BOTHER2/Sp_results mkdir FGAP cd FGAP fgap_wrapper.sh ../P_RNA_scaffold.fasta \ $BTOPDIR/wgs_project/Spur_MEGAREADS3C/mr3C.fasta 40 50000000 \ >fgap_wrapper.log 2>&1 mv all_scf.fasta save_all_scf.fasta ln -s all_scf.fasta save_all_scf.fasta ln -s all_scf.fasta P_RNA_scaffoldFGAP.fasta # nice /home/mathog/src/bbmap/dedupe.sh in=all_scf.fasta \ out=SpPtBB.fa outd=SpPtBB_dups_removed.fa \ findoverlap=t minidentity=97 maxedits=12 >SpPtBB_dedupe.log 2>&1 #makes SpPtBB.fa, final assembly