#Get input files
pacbio = config["pacbio"]
pacbio_path = "/".join(pacbio.split("/")[:-1])+"/"
pacbio_filename = pacbio.removeprefix(pacbio_path)
hicF = config["hicF"]
hicR = config["hicR"]
HAPS = ["hap1", "hap2"]

#Decide which oatk analysis to run
def oatk_isPlant():
    if config["oatk_isPlant"] == True:
        return("results/organelles/oatk.pltd.ctg.fasta")
    else:
        return("results/organelles/oatk.mito.ctg.fasta")

print()
print(f"Input PacBio reads: {pacbio}")
print(f"Input HiC reads: {hicF}, {hicR}")
print()

#Define rules that do not need to be submitted to the cluster
localrules: print_versions

#Summary rule to run full pipeline (pre-assembly, assembly, scaffolding)
rule all:
    input:
        expand("results/pre_assembly/genomescope/genomescope_ploidy{ploidy}.out", ploidy=config["GS_ploidy_range"]),
        expand("results/pre_assembly/smudgeplot/ploidy{ploidy}_smudgeplot.png", ploidy=config["smudge_ploidy"]),
        expand("results/assembly/busco_assembly.hic.{hap}.p_ctg_{lineage}", lineage=config["busco_lineage"], hap=HAPS),
        expand("results/scaffolding/busco_{hap}_scaffolds_final_{lineage}", lineage=config["busco_lineage"], hap=HAPS),
        expand("results/decontamination/{hap}/{hap}_scaffolds_final.decon.fasta", hap=HAPS),
        oatk_isPlant(),
        "software_versions.txt"

#Summary rule to run all pre-assembly steps
rule pre_assembly:
    input:
        expand("results/pre_assembly/genomescope/genomescope_ploidy{ploidy}.out", ploidy=config["GS_ploidy_range"]),
        expand("results/pre_assembly/smudgeplot/ploidy{ploidy}_smudgeplot.png", ploidy=config["smudge_ploidy"]),
        "software_versions.txt"

#General rule for running only primary assembly and busco
rule assembly:
    input:
        expand("results/assembly/busco_assembly.hic.{hap}.p_ctg_{lineage}", lineage=config["busco_lineage"], hap=HAPS),
        "software_versions.txt"
        
#General rule for generated the scaffolded haplotype assemblies (withouth pre-assembly) and busco
rule scaffolding:
    input:
        expand("results/assembly/busco_assembly.hic.{hap}.p_ctg_{lineage}", lineage=config["busco_lineage"], hap=HAPS),
        expand("results/scaffolding/busco_{hap}_scaffolds_final_{lineage}", lineage=config["busco_lineage"], hap=HAPS),
        "software_versions.txt"

#General rule for generationg a decontaminated assembly, but without running busco
rule decontamination:
    input:
        expand("results/decontamination/{hap}/{hap}_scaffolds_final.decon.fasta", hap=HAPS),
        "software_versions.txt"
        
rule organelles:
    input:
        oatk_isPlant()

#Filter PacBio HiFi reads
rule hifiadapterfilt:
    output:
        pacbioFilt = "results/pacbio/pacbio.filt.fastq.gz"
    input:
        pacbio
    resources:
        ntasks = config["adapterfilt_threads"]
    params:
        threads = config["adapterfilt_threads"],
        installdir = config["adapterfilt_install_dir"],
    shell:
        r"""
        export PATH={params.installdir}/DB:{params.installdir}:$PATH
        mkdir -p pbfilt_temp
        cd pbfilt_temp
        ln -sf {input} ./
        mkdir -p tmp
        export TMPDIR=./tmp
        pbadapterfilt.sh \
            -t {params.threads} \
            -o filtered
        mv filtered/*filt.fastq.gz ../{output.pacbioFilt}
        rm -r -f ./tmp
        cd ..
        rm -r -f pbfilt_temp
        """

#Count k-mers for genomescope and smudgeplot
rule count_kmers:
    output:
        histo = "results/pre_assembly/reads.histo"
    input:
        pacbio
    resources:
        ntasks = config["KMC_t"]
    params:
        k = config["KMC_k"],
        t = config["KMC_t"],
        m = config["KMC_m"],
        ci = config["KMC_ci"],
        cx = config["KMC_cx"],
        cs = config["KMC_cs"],
        kmc_path = config["KMC_path"]
    shell:
        r"""
        mkdir -p results/pre_assembly/tmp
        ls {input} > FILES
        export PATH={params.kmc_path}/bin:$PATH
        kmc \
            -k{params.k} \
            -t{params.t} \
            -m{params.m} \
            -ci{params.ci} \
            -cs{params.cs} \
            @FILES \
            results/pre_assembly/reads \
            results/pre_assembly/tmp
        kmc_tools transform \
            results/pre_assembly/reads \
            histogram \
            results/pre_assembly/reads.histo \
            -cx{params.cx}
        rm -r -f results/pre_assembly/tmp FILES
        """

#Run Genomescope
rule genomescope:
    output:
        expand("results/pre_assembly/genomescope/genomescope_ploidy{ploidy}.out", ploidy=config["GS_ploidy_range"])
    input:
        rules.count_kmers.output.histo
    params:
        ploidy_range = config["GS_ploidy_range"],
        k = config["KMC_k"]
    shell:
        r"""
        for ploidy in {params.ploidy_range} 
            do genomescope2 \
                -i {input} \
                -o results/pre_assembly/genomescope/output_ploidy$ploidy \
                -k {params.k} \
                -p $ploidy \
                1> results/pre_assembly/genomescope/genomescope_ploidy${{ploidy}}.out \
                2> results/pre_assembly/genomescope/genomescope_ploidy${{ploidy}}.err
        done
        """

#Run Smudgeplot
rule smudgeplot:
    output:
        expand("results/pre_assembly/smudgeplot/ploidy{ploidy}_smudgeplot.png", ploidy=config["smudge_ploidy"]),
    input:
        rules.count_kmers.output.histo
    params:
        ploidy = config["smudge_ploidy"],
        kmc_path = config["KMC_path"],
        k = config["KMC_k"],
    shell:
        r"""
        L=$(smudgeplot.py cutoff {input} L)
        U=$(smudgeplot.py cutoff {input} U)
        export PATH={params.kmc_path}/bin:$PATH
        kmc_tools transform \
            results/pre_assembly/reads \
            -ci$L \
            -cx$U \
            reduce \
            results/pre_assembly/smudgeplot/kmcdb_L"$L"_U"$U"
        smudge_pairs \
            results/pre_assembly/smudgeplot/kmcdb_L"$L"_U"$U" \
            results/pre_assembly/smudgeplot/kmcdb_L"$L"_U"$U"_coverages.tsv \
            results/pre_assembly/smudgeplot/kmcdb_L"$L"_U"$U"_pairs.tsv \
            > results/pre_assembly/smudgeplot/kmcdb_L"$L"_U"$U"_familysizes.tsv
        smudgeplot.py plot \
            results/pre_assembly/smudgeplot/kmcdb_L"$L"_U"$U"_coverages.tsv \
            -o results/pre_assembly/smudgeplot/ploidy{params.ploidy} \
            -k {params.k}
        """

#Perform genome assembly with hifiasm using pacbio reads and hic reads
rule assembly_hifiams:
    output:
        hap1 = "results/assembly/assembly.hic.hap1.p_ctg.fa",
        hap2 = "results/assembly/assembly.hic.hap2.p_ctg.fa"
    input:
        pacbio = rules.hifiadapterfilt.output.pacbioFilt,
        hicF = hicF,
        hicR = hicR
    resources:
        time = config["hifiasm_cluster_time"],
        mem_per_cpu = config["hifiasm_mem_per_cpu"],
        ntasks = config["hifiasm_threads"]
    params:
        threads = config["hifiasm_threads"],
        hF = hicF.split("/")[-1],
        hR = hicR.split("/")[-1]
    shell:
        r"""
        export CWD=$PWD
        mkdir -p assembly_tmp 
        cd assembly_tmp
        ln -sf ../{input.pacbio} ./
        ln -sf {input.hicF} ./
        ln -sf {input.hicR} ./
        hifiasm \
            -o assembly \
            -t{params.threads} \
            --h1 {params.hF} \
            --h2 {params.hR} \
            ./pacbio.filt.fastq.gz
        mv assembly.* $CWD/results/assembly
        awk '/^S/{{print ">"$2"\n"$3}}' $CWD/results/assembly/assembly.hic.hap1.p_ctg.gfa \
            | fold > $CWD/{output.hap1}
        awk '/^S/{{print ">"$2"\n"$3}}' $CWD/results/assembly/assembly.hic.hap2.p_ctg.gfa \
            | fold > $CWD/{output.hap2}
        cd $CWD && rm -r assembly_tmp
        """

#Run busco on assembly
rule busco:
    output:
        directory(expand("results/{{dir}}/busco_{{assembly}}_{lineage}", lineage=config["busco_lineage"]))
    input:
        assembly = "results/{dir}/{assembly}.fa",
        busco = expand("{busco_db_dir}/{lineage}_odb10", busco_db_dir=config["busco_db_dir"], lineage=config["busco_lineage"])
    resources:
        mem_per_cpu = config["busco_mem"],
        ntasks = config["busco_threads"]
    params:
        lineage = config["busco_lineage"],
        threads = config["busco_threads"]
    shell:
        r"""
        busco \
            -i {input.assembly} \
            -l {input.busco} \
            -c {params.threads} \
            -m genome \
            --offline \
            --out_path results/{wildcards.dir} \
            -o busco_{wildcards.assembly}_{params.lineage} \
            --download_path /tmp
        """

#Create Meryl Database
rule create_meryl_db:
    output:
        directory("results/scaffolding/meryl/assembly.hic.{hap}.p_ctg.meryl")
    input:
        "results/assembly/assembly.hic.{hap}.p_ctg.fa"
    resources:
        ntasks = config["meryl_threads"]
    params:
        k = config["meryl_k"],
        t = config["meryl_threads"],
        m = config["meryl_memory"],
    shell:
        r"""
        meryl \
            k={params.k} \
            threads={params.t} \
            memory={params.m} \
            count \
            output {output} \
            {input}
        """
        
#Run meryl for filtering Hi-C reads before scaffolding
rule meryl_filter:
    output:
        hicFwoH1 = "results/scaffolding/meryl/hicF_hap2.fastq.gz",
        hicRwoH1 = "results/scaffolding/meryl/hicR_hap2.fastq.gz",
        hicFwoH2 = "results/scaffolding/meryl/hicF_hap1.fastq.gz",
        hicRwoH2 = "results/scaffolding/meryl/hicR_hap1.fastq.gz"
    input:
        hicF = hicF,
        hicR = hicR,
        meryl_hap1 = "results/scaffolding/meryl/assembly.hic.hap1.p_ctg.meryl",
        meryl_hap2 = "results/scaffolding/meryl/assembly.hic.hap2.p_ctg.meryl"
    resources:
        ntasks = config["meryl_threads"]
    shell:
        r"""
        meryl difference \
            {input.meryl_hap1} \
            {input.meryl_hap2} \
            output results/scaffolding/meryl/only_hap1.meryl
        meryl difference \
            {input.meryl_hap2} \
            {input.meryl_hap1} \
            output results/scaffolding/meryl/only_hap2.meryl
        meryl-lookup \
            -exclude \
            -sequence {input.hicF} {input.hicR} \
            -mers results/scaffolding/meryl/only_hap1.meryl \
            -output {output.hicFwoH1} {output.hicRwoH1}
        meryl-lookup \
            -exclude \
            -sequence {input.hicF} {input.hicR} \
            -mers results/scaffolding/meryl/only_hap2.meryl \
            -output {output.hicFwoH2} {output.hicRwoH2}
        """

#Prepare filtered Hi-C reads for scaffolding
rule map_hic:
    output:
        mapping = "results/scaffolding/{hap}_mapping.bam"
    input: 
        assembly = "results/assembly/assembly.hic.{hap}.p_ctg.fa",
        hicF = "results/scaffolding/meryl/hicF_{hap}.fastq.gz",
        hicR = "results/scaffolding/meryl/hicR_{hap}.fastq.gz"
    resources:
        ntasks = config["yahs_threads"]
    params:
        t = config["yahs_threads"]
    shell:
        """
        bwa index {input.assembly}
        samtools faidx {input.assembly}
        bwa mem \
            -t {params.t} \
            -R '@RG\\tSM:{wildcards.hap}\\tID:{wildcards.hap}' \
            -5SPM \
            {input.assembly} {input.hicF} {input.hicR} \
        | samtools view -buS - \
            > results/scaffolding/{wildcards.hap}_mapping.bam
        """

rule prepare_hic:
    output:
        markdup = "results/scaffolding/{hap}_hic_markdup.sort_n.bam"
    input:
        mapping = rules.map_hic.output.mapping
    resources:
        ntasks = config["yahs_threads"],
    params:
        t = config["yahs_threads"],
    shell:
        r"""
        samtools sort \
            -@{params.t} \
            -n \
            -T results/scaffolding/{wildcards.hap}_tmp_n \
            -O bam \
            -o results/scaffolding/{wildcards.hap}_mapping.sorted.bam \
            {input.mapping}
        samtools fixmate -mr \
            results/scaffolding/{wildcards.hap}_mapping.sorted.bam \
            results/scaffolding/{wildcards.hap}_mapping.fixmate.bam
        samtools sort \
            -@{params.t} \
            -T results/scaffolding/{wildcards.hap}_hic_tmp \
            -O bam \
            -o results/scaffolding/{wildcards.hap}_mapping.fixmate.sorted.bam \
            results/scaffolding/{wildcards.hap}_mapping.fixmate.bam
        samtools markdup -rs \
            -@{params.t}\
            results/scaffolding/{wildcards.hap}_mapping.fixmate.sorted.bam \
            results/scaffolding/{wildcards.hap}_mapping.markdup.bam \
            2> results/scaffolding/{wildcards.hap}_hic_markdup.stats
        samtools sort \
            -@{params.t} \
            -n \
            -T results/scaffolding/{wildcards.hap}_temp_n \
            -O bam \
            -o {output.markdup} \
            results/scaffolding/{wildcards.hap}_mapping.markdup.bam
        if [ -f {output.markdup} ]; then
            rm results/scaffolding/{wildcards.hap}_mapping.bam \
                results/scaffolding/{wildcards.hap}_mapping.sorted.bam \
                results/scaffolding/{wildcards.hap}_mapping.fixmate.bam \
                results/scaffolding/{wildcards.hap}_mapping.fixmate.sorted.bam \
                results/scaffolding/{wildcards.hap}_mapping.markdup.bam
        fi
        """

#Run scaffolding with yahs
rule scaffold_hap:
    output:
        "results/scaffolding/{hap}_scaffolds_final.fa"
    input:
        assembly = "results/assembly/assembly.hic.{hap}.p_ctg.fa",
        bamfile = "results/scaffolding/{hap}_hic_markdup.sort_n.bam"
    resources:
        ntasks = config["yahs_threads"],
        time = config["yahs_cluster_time"]
    shell:
        r"""
        yahs {input.assembly} {input.bamfile} \
            -o results/scaffolding/{wildcards.hap} \
            1> results/scaffolding/yahs_{wildcards.hap}.out \
            2> results/scaffolding/yahs_{wildcards.hap}.err
        gfastats {output} \
            > results/scaffolding/{wildcards.hap}_scaffolds_final.stats
        """
 
#Remove adaptors and vector contamination
rule fcs_adaptor:
    output:
        "results/decontamination/{hap}/{hap}_scaffolds_final.clean.fa"
    input:
        "results/scaffolding/{hap}_scaffolds_final.fa"
    resources:
        time = config["fcs_adaptor_time"],
        mem_per_cpu = config["fcs_adaptor_mem"],
    params:
        fcs_path = config["fcs_path"],
        fcs_image = config["fcs_adaptor_image"]
    shell:
        r"""
        mkdir -p results/decontamination/{wildcards.hap}/adaptor_output
        bash {params.fcs_path}/run_fcsadaptor.sh \
            --fasta-input {input} \
            --output-dir results/decontamination/{wildcards.hap}/adaptor_output \
            --euk \
            --container-engine singularity \
            --image {params.fcs_path}/fcs-adaptor.sif
        sed 's/lcl|/clean/g' \
            results/decontamination/{wildcards.hap}/adaptor_output/cleaned_sequences/{wildcards.hap}_scaffolds_final.fa > {output}
        """
 
 #Run decontamination pipeline
rule fcs_gx:
    output:
        "results/decontamination/{hap}/{hap}_scaffolds_final.decon.fasta"
    input:
        "results/decontamination/{hap}/{hap}_scaffolds_final.clean.fa"
    resources:
        time = config["fcs_gx_time"],
        mem_per_cpu = config["fcs_gx_mem"],
        partition = config["fcs_gx_partition"],
        ntasks = config["fcs_threads"]
    params:
        fcs_path = config["fcs_path"],
        fcs_threads = config["fcs_threads"],
        fcs_image = config["fcs_gx_image"],
        fcs_db = config["fcs_gx_db"],
        taxid = config["taxid"]
    shell:
        r"""
        mkdir -p results/decontamination/{wildcards.hap}/gx_output
        echo "GX_NUM_CORES={params.fcs_threads}" > results/decontamination/{wildcards.hap}/env.txt
        python3 {params.fcs_path}/fcs.py \
            --image {params.fcs_image} \
            screen genome \
            --fasta {input} \
            --out-dir results/decontamination/{wildcards.hap}/gx_output \
            --gx-db {params.fcs_db} \
            --tax-id {params.taxid}
        cat {input} \
        | python3 {params.fcs_path}/fcs.py \
            --image={params.fcs_image} \
            clean genome \
            --action-report results/decontamination/{wildcards.hap}/gx_output/*.fcs_gx_report.txt \
            --output {output} \
            --contam-fasta-out results/decontamination/{wildcards.hap}/contam.fasta
        """
        
rule run_oatk_plant:
    output:
        "results/organelles/oatk.pltd.ctg.fasta"
    input:
        pacbio = rules.hifiadapterfilt.output.pacbioFilt
    resources:
        tasks = config["oatk_threads"],
        mem_per_cpu = config["oatk_mem"]
    params:
        oatk_install = config["oatk_dir"],
        oatk_db = config["oatk_db"],
        lineage = config["oatk_lineage"],
        k = config["oatk_k"],
        c = config["oatk_c"],
        threads = config["oatk_threads"]
    shell:
        """
        {params.oatk_install}/oatk \
            -k {params.k} \
            -c {params.c} \
            -t {params.threads} \
            --nhmmscan nhmmscan \
            -m {params.oatk_db}/{params.lineage}_mito.fam \
            -p {params.oatk_db}/{params.lineage}_pltd.fam \
            -o results/organelles/oatk \
            {input.pacbio}
        """

rule run_oatk_nonplant:
    output:
        "results/organelles/oatk.mito.ctg.fasta"
    input:
        pacbio = rules.hifiadapterfilt.output.pacbioFilt
    resources:
        ntasks = config["oatk_threads"],
        mem_per_cpu = config["oatk_mem"]
    params:
        oatk_install = config["oatk_dir"],
        oatk_db = config["oatk_db"],
        lineage = config["oatk_lineage"],
        k = config["oatk_k"],
        c = config["oatk_c"],
        threads = config["oatk_threads"],
    shell:
        """
        {params.oatk_install}/oatk \
            -k {params.k} \
            -c {params.c} \
            -t {params.threads} \
            --nhmmscan nhmmscan \
            -m {params.oatk_db}/{params.lineage}_mito.fam \
            -o results/organelles/oatk \
            {input.pacbio}
        """     
        
#Print software versions to file
rule print_versions:
    output:
        "software_versions.txt"
    params:
        kmc_path = config["KMC_path"],
        hifiadapterfilt = config["adapterfilt_install_dir"],
        fcs_path = config["fcs_path"]
    shell:
        r"""
        printf "Conda software versions:\n" > {output}
        conda list -e | grep -v "^#" >> {output}
        printf "\nOther software versions:\n" >> {output}
        printf "KMC version:\t" >> {output}
        {params.kmc_path}/bin/kmc --version | head -n 1 >> {output}
        printf "\nHiFi Adapterfilt version:\t" >> {output}
        bash {params.hifiadapterfilt}/hifiadapterfilt.sh --version >> {output}
        printf "\nNCBI FCS versions:\n" >> {output}
        printf "fcsadaptor:\t" >> {output}
        cat {params.fcs_path}/run_fcsadaptor.sh | grep "^DEFAULT_VERSION" >> {output}
        printf "fcs.py:\t" >> {output}
        cat {params.fcs_path}/fcs.py | grep "^DEFAULT_VERSION" >> {output}
        """
