/*
 * 
 *  Copyright (c) 2008, 2011, 2012 David Hernandez, Patrice Francois, Jacques Schrenzel
 * 
 *  This file is part of EDENA.
 *
 *  EDENA is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  EDENA is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with EDENA.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "overlapGraph.h"
#include "Pairing.h"
#include "readsLayout.h"
#include "globalFunc.h"
#include "DevShell.h"
#include "stat.h"
#include "BeamSearchTree.h"
#include "BackwardsWalker.h"

#include <iostream>
#include <iomanip>
#include <fstream>
#include <string>
#include <vector>
#include <sstream>
#include <cstdlib>
#include <ctime>
#include <limits>

using namespace std;

// Global developer-diagnostics flag. main() resets it to false and the
// undocumented -devInfo command-line option switches it on.
bool DEV_INFO;

// Global output stream. In this file it is only referenced by commented-out
// code at the top of main() that would open a "DEVLOG" file.
ofstream outGlob;
// Global counter, reset to 0 at the start of main().
// NOTE(review): presumably updated from other translation units - confirm.
unsigned long global_count;

int main(int argc, char **argv) {
    
    //  outGlob.open("DEVLOG");
    //  outGlob << "node\tindex\tsize\tprob\n";

    global_count = 0;
    string prefix = "out";
    string ovlFile = "";
    vector<string> readsFiles;
    vector<string> pairedReadsFiles;
    vector<string> rdPairedReadsFiles;

    int trim = 4; //min coverage for contigs ends
    unsigned int overlapCutoff = 0;
    double minNodeCov = 0.0;
    unsigned int minOverlapMatch = 0;
    unsigned int truncateReads = 0;
    unsigned int deadEndLimit = 0; //by default = 2*readLength-1

    double minCoverage = 0.0;
    double targetSize = 0.0;
    unsigned int minContigSize = 0;

    unsigned int peHorizon = 0; //To improve
    double maxLogNBranches = 25.0;

    bool discardNonUsable = true;
    bool contextualCleaning = true;
    bool getNonRedundantDataset = false;
    bool cleanBubbles = true;
    bool writeOSG = false;
    bool checkGraph = false;
    bool cleanGraph = true;

   // float CCreliableCutoff = 1e-1;
   // float CCsupectCutoff = 1e-3;
    float CCreliableCutoff = 1e-2;
    float CCsupectCutoff = 1e-4;
    float CCGincoherantCutoff = 1e-6;
    float nsd = 2.0; //used to determine min-max allowed distances for PE libraries
    // d= Estimatedmean +- nsd* estimatedSD

    int minNPair = 5;
    double minRatio = .95;
    unsigned int maxRedundancy=0;

    int nThreads = 2;
    bool dev = false;
    DEV_INFO = false; //global

    //arguments parser
    istringstream iss;
    string tmp;
    edenaVersion(cout);

    if (argc == 1) {
        edenaUsage();
    }

    for (int argn = 1; argn < argc; argn++) {
        if (argv[argn][0] != '-')
            edenaUsage();

        tmp = argv[argn] + 1;

        if (tmp == "h" || tmp == "help") {
            edenaUsage();
        }
        else if (tmp == "r" || tmp == "singleEnd") //one or more read file
        {
            if (ovlFile != "") {
                cerr << "You must provide either FASTA/FASTQ reads file(s) or an Edena .ovl file, not both\n\n";
                exit(0);
            }
            argn++;

            while (argn < argc && argv[argn][0] != '-') {
                readsFiles.push_back(argv[argn]);
                argn++;
            }
            argn--;
        }            //        else if (tmp == "-orientedTranscriptome" || tmp == "-onlyDirect")
            //        {
            //            onlyDirectOv=true;
            //        }
        else if (tmp == "paired" || tmp == "DRpairs") {
            //direct-reverse paired read files
            if (ovlFile != "") {
                cerr << "You must provide either FASTA/FASTQ read file(s) or an Edena .ovl file, not both\n\n";
                exit(0);
            }
            argn++;

            while (argn < argc && argv[argn][0] != '-') {
                pairedReadsFiles.push_back(argv[argn]);
                argn++;
            }
            argn--;
            if (pairedReadsFiles.size() % 2 != 0) {
                cerr << "Paired reads files: you must provide an even number of files\n\n";
                exit(0);
            }
        }
        else if (tmp == "matePairs" || tmp == "RDpairs") {
            //reverse-direct paired read files
            if (ovlFile != "") {
                cerr << "You must provide either FASTA/FASTQ read file(s) or an Edena .ovl file, not both\n\n";
                exit(0);
            }
            argn++;

            while (argn < argc && argv[argn][0] != '-') {
                rdPairedReadsFiles.push_back(argv[argn]);
                argn++;
            }
            argn--;
            if (rdPairedReadsFiles.size() % 2 != 0) {
                cerr << "Mate pairs files: you must provide an even number of files\n\n";
                exit(0);
            }
        }
        else if (tmp == "e" || tmp == "edenaFile") {
            if (readsFiles.size() != 0 || pairedReadsFiles.size() != 0) {
                 cerr << "You must provide either FASTA/FASTQ read file(s) or an Edena .ovl file, not both\n\n";
                exit(0);
            }
            argn++;
            ovlFile = argv[argn];
        }
        else if (tmp == "m" || tmp == "overlapCutoff") //for assembling mode
        {
            argn++;
            iss.clear();
            iss.str(argv[argn]);
            iss >> overlapCutoff;
        }
        else if (tmp == "M" || tmp == "minOverlap") //for overlapping mode
        {
            argn++;
            iss.clear();
            iss.str(argv[argn]);
            iss >> minOverlapMatch;
        } else if (tmp == "minNodeCov") {
            argn++;
            iss.clear();
            iss.str(argv[argn]);
            iss >> minNodeCov;
        }
        else if (tmp == "minCoverage") //minimum coverage required for the contigs
        {
            argn++;
            iss.clear();
            iss.str(argv[argn]);
            iss >> minCoverage;
        } else if (tmp == "targetSize") //estimate of the target size
        {
            argn++;
            iss.clear();
            iss.str(argv[argn]);
            iss >> targetSize;
        } else if (tmp == "c" || tmp == "minContigSize") //min contig size
        {
            argn++;
            iss.clear();
            iss.str(argv[argn]);
            iss >> minContigSize;
        }
        else if (tmp == "p" || tmp == "prefix") // prefix
        {
            argn++;
            prefix = argv[argn];
        }
        else if (tmp == "trim") //low covered contig ends trimming
        {
            argn++;
            iss.clear();
            iss.str(argv[argn]);
            iss >> trim;
        }
        else if (tmp == "d" || tmp == "deadEnds") //for dead-ends removal
        {
            argn++;
            iss.clear();
            iss.str(argv[argn]);
            iss >> deadEndLimit;
        }
        else if (tmp == "discardNonUsable") {
            argn++;
            tmp = argv[argn];
            if (tmp == "no")
                discardNonUsable = false;
            else if (tmp == "yes")
                discardNonUsable = true;
            else {
                cerr << "\n--discardNonUsable: possible values are \"yes\" and \"no\"\n";
                exit(0);
            }
        }
        else if (tmp == "cc" || tmp == "contextualCleaning") {
            argn++;
            tmp = argv[argn];
            if (tmp == "no")
                contextualCleaning = false;
            else if (tmp == "yes")
                contextualCleaning = true;
            else {
                cerr << "\n--contextualCleaning: possible values are \"yes\" and \"no\"\n";
                exit(0);
            }
        } else if (tmp == "clearBubbles") {
            argn++;
            tmp = argv[argn];
            if (tmp == "no")
                cleanBubbles = false;
            else if (tmp == "yes")
                cleanBubbles = true;
            else {
                cerr << "\n--cleanBubbles: possible values are \"yes\" and \"no\"\n";
                exit(0);
            }
        }
            //not used anymore
        else if (tmp == "searchLimit") //maxLogNBranches for class PathFinder
        {
            argn++;
            iss.clear();
            iss.str(argv[argn]);
            iss >> maxLogNBranches;
        } else if (tmp == "peHorizon") {
            argn++;
            iss.clear();
            iss.str(argv[argn]);
            iss >> peHorizon;
        } else if (tmp == "t" || tmp == "truncate") {
            argn++;
            iss.clear();
            iss.str(argv[argn]);
            iss >> truncateReads;
        }
        else if (tmp == "nThreads") {
            argn++;
            iss.clear();
            iss.str(argv[argn]);
            iss >> nThreads;
            if (nThreads < 1)
                nThreads == 1;
        } else if (tmp == "minNPairs") {
            argn++;
            iss.clear();
            iss.str(argv[argn]);
            iss >> minNPair;
        } else if (tmp == "minRatio") {
            argn++;
            iss.clear();
            iss.str(argv[argn]);
            iss >> minRatio;
        }
        else if (tmp == "maxRed")
        {
            argn++;
            iss.clear();
            iss.str(argv[argn]);
            iss >> maxRedundancy;
        }
        else if (tmp == "devInfo") //undocumented
        {
            DEV_INFO = true; //global flag
        }
        else if (tmp == "dev" || tmp=="shell") //undocumented
        {
            dev = true;
        }
        else if (tmp == "v") {
            //version is displayed by default
            exit(0);
        }
        else if (tmp == "getNRdataset") {
            getNonRedundantDataset = true;
        }
        else if (tmp == "wOSG") //write overlaps string graph
        {
            writeOSG = true;
        }
        else if (tmp == "checkGraph") {
            checkGraph = true;
        }
        else if (tmp == "cleanGraph")
        {
            argn++;
            tmp = argv[argn];
            if (tmp == "no")
                cleanGraph = false;
            else if (tmp == "yes")
                cleanGraph = true;
            else {
                cerr << "\n--cleanGraph: possible values are \"yes\" and \"no\"\n";
                exit(0);
            }
        }
        else {
            cout << "-" << tmp << " : unknown argument\n";
            exit(0);
        }
    }

    Pairing *P = new Pairing();
    ReadsLayout *L = new ReadsLayout();
    ReadsStorage *R = new ReadsStorage();
    P->setRpointer(R);
    OverlapsGraph G;
    G.init(R, P, L, maxLogNBranches);

    cout << setprecision(1) << fixed;

    //Overlapping mode
    if (readsFiles.size() != 0 || pairedReadsFiles.size() != 0 || rdPairedReadsFiles.size() != 0) {
        int maxReadLength = numeric_limits<short int>::max();


        //fast estimation of the total number of reads
        //for the Layout initiation
        unsigned int nReadsEstimation = 0; //upper bound
        unsigned int rl, nL = 0;
        unsigned int minRl = numeric_limits<unsigned int>::max();


        cout << "Rapid file(s) examination... " << flush;

        //to prevent copy/paste errors in the command line
        for (size_t i = 0; i < pairedReadsFiles.size(); i += 2) {
            if (pairedReadsFiles[i] == pairedReadsFiles[i + 1]) {
                cerr << "\nError\nPaired reads must be specified using two different files\n";
                cerr << "(you specified the same file twice)\n";
                exit(0);
            }
        }
        for (size_t i = 0; i < rdPairedReadsFiles.size(); i += 2) {
            if (rdPairedReadsFiles[i] == rdPairedReadsFiles[i + 1]) {
                cerr << "\nError\nMate pairs must be specified using two different files\n";
                cerr << "(you specified the same file twice)\n";
                exit(0);
            }
        }

        //nreads estimation
        for (size_t i = 0; i < pairedReadsFiles.size(); i++) {
            nReadsEstimation += estimateNReads(pairedReadsFiles[i], rl);


            if (rl != minRl) {
                nL++;
                if (rl < minRl) {
                    minRl = rl;
                }
            }
        }

        for (size_t i = 0; i < rdPairedReadsFiles.size(); i++) {
            nReadsEstimation += estimateNReads(rdPairedReadsFiles[i], rl);

            if (rl != minRl) {
                nL++;
                if (rl < minRl) {
                    minRl = rl;
                }
            }
        }

        for (size_t i = 0; i < readsFiles.size(); i++) {
            nReadsEstimation += estimateNReads(readsFiles[i], rl);
            if (rl != minRl) {
                nL++;
                if (rl < minRl) {
                    minRl = rl;
                }
            }
        }

        cout << "done" << endl;

        if (minRl > maxReadLength) {
            cout << "   The maximum supported read length is " << maxReadLength << "bp\n";
            cout << "   All reads have been truncated accordingly" << endl;
            truncateReads = maxReadLength;
            minRl = maxReadLength;
        }

        if (truncateReads == 0 && nL > 1) {
            cout << "   The shorter reads are " << minRl << " bases in length" << endl;
            cout << "   Other reads will be truncated accordingly" << endl;
            truncateReads = minRl;
        }

        int readLength = 0;

        if (truncateReads > minRl) {
            cout << "Adjusting reads truncation to " << minRl << endl;
            truncateReads = minRl;
        }

        cout << "Reads layout initialization... ";
        L->init(R, nReadsEstimation);
        R->init(nReadsEstimation, minRl);
        cout << "done" << endl;

        //TO DO: change according to dataset
        int mateOrientation = 1; // DR

        for (size_t i = 0; i < readsFiles.size(); i++) {
            if (
                    R->loadReadsFiles(readsFiles[i], "", 0, truncateReads, P, L)
                    != 0)
                return 1;
        }

        for (size_t i = 0; i < pairedReadsFiles.size(); i += 2) {
            if (
                    R->loadReadsFiles(pairedReadsFiles[i], pairedReadsFiles[i + 1], mateOrientation, truncateReads, P, L)
                    != 0)
                return 1;
        }

        mateOrientation = 2; //RD
        for (size_t i = 0; i < rdPairedReadsFiles.size(); i += 2) {
            if (
                    R->loadReadsFiles(rdPairedReadsFiles[i], rdPairedReadsFiles[i + 1], mateOrientation, truncateReads, P, L)
                    != 0)
                return 1;
        }

        R->adjustAllocation();

        if (getNonRedundantDataset) {
            ofstream outNR((prefix + "_nonRedundant.fasta").c_str());
            //T->writeNRFastaFile(outNR);
            outNR.close();
            //            cout << "writeDataset" << endl;
            //            L->writeFastaReadSet(P);
            exit(0);
        }

        cout << "done\n";
        cout << "   Number of reads: " << R->getEffectiveNReads() << endl;
        cout << "   Number of distinct sequences: " << R->getN_nrReads() << endl;
        cout << "   Average reads redundancy: " << (float) R->getEffectiveNReads() / R->getN_nrReads() << endl;

        if (pairedReadsFiles.size() >= 2 || rdPairedReadsFiles.size() >= 2) {
            cout << "Building pairing index... " << flush;
            P->buildIndex(R->getEffectiveNReads());
            cout << "done" << endl;
        }

        cout << "Initializing prefix tables..." << flush;
        R->adjustAllocation();
        R->initPrefixTables();
        cout << "done" << endl;
        //compute exact overlaps


        if (minOverlapMatch == 0)
            minOverlapMatch = R->getReadsLength() / 2;
        R->setMinOvSize(minOverlapMatch);

       
    
        G.computeOverlaps(nThreads);
        G.removeTransitiveEdges(); 
        G.countEdges();

        if (checkGraph)
            G.checkConsistency();


        //        for (unsigned int i=1; i<G.getNNodes(); i++)
        //            G.nodesTab[i].allocateEdgeValue();
        //    devShell(&G, R, P, L);


        if (!G.saveData((prefix + ".ovl").c_str())) {
            cout << "Problem writing the .ovl file\n";
        }

        return 0;
    }
        //Assembling mode
        //Load .ovl file

    else if (ovlFile == "") {
        cout << "You must provide either FASTA/FASTQ read file(s) or an Edena .ovl file.\n\n";
        exit(0);
    }
    else //ASSEMBLY MODE
    {
        if (G.loadData(ovlFile.c_str()) == false) {
            exit(0);
        }
        
        G.countEdges();
        Node::setEdgeSortedFlag(true);

        cout << "   reads length:             " << R->getReadsLength() << '\n';
        cout << "   number of reads:          " << R->getEffectiveNReads() << '\n';
        cout << "   number of nodes:          " << G.getNNodes() << '\n';
        cout << "   number of edges:          " << G.getNEdges() << '\n';
        cout << "   minimum overlap size:     " << G.getMinOverlap() << '\n';
 
        if (overlapCutoff != 0) {
            if (overlapCutoff > R->getReadsLength()-1)
            {
                overlapCutoff = R->getReadsLength()-1;
                cout << "overlap cutoff cannot exceed the reads length.\n";
                cout << "it has been adjusted" << endl;
            }
            cout << "Discarding overlaps shorter than " << overlapCutoff << "... " << flush;
            G.overlapSizeCutoff(overlapCutoff);
            cout << "done" << endl;
        }
        
        //Write log file
        ofstream outLog((prefix + "_log.txt").c_str());
        outLog << setprecision(1) << fixed;
        edenaVersion(outLog);

        //get time
        time_t rawtime;
        struct tm * timeinfo;
        time(&rawtime);
        timeinfo = localtime(&rawtime);

        outLog << asctime(timeinfo);
        outLog << endl;

        outLog << "    ovl file:            " << ovlFile << '\n';
        outLog << "    overlap cutoff:      " << overlapCutoff << '\n';
        outLog << "    contextual cleaning: ";
        if (contextualCleaning == true)
            outLog << "yes\n";
        else
            outLog << "no\n";
        outLog << "    min contig size:     ";
        if (minContigSize == 0)
            outLog << "auto (1.5 * readLength)\n";
        else
            outLog << minContigSize << "\n";
        outLog << "    min contig coverage: ";
        if (minCoverage == 0)
            outLog << "auto\n";
        else
            outLog << minCoverage << endl;

        outLog << "    Trim contig ends:    " << trim << '\n';
        outLog << flush;

        unsigned int nNodes = 0, nReads = 0;
        unsigned int sum = 0;
        unsigned int nDeadEnd = 0;
        nReads = 0;

        if (cleanGraph) {
            
            if (checkGraph)
                G.checkConsistency();

            G.condense(true, false);

            if (discardNonUsable) {
                cout << "Discarding non-usable reads... " << flush;
                nNodes = G.discardNonUsableNode(nReads);
                cout << "done\n";
                cout << "   " << nNodes << " nodes corresponding to " << nReads << " reads have been discarded ("
                        << (float) nReads / R->getEffectiveNReads()*100 << "%)\n";
            }

            cout << "Removing dead-end path... " << flush;
            sum = 0;
            nDeadEnd = 0;
            nReads = 0;

            do {
                nDeadEnd = G.identifyDeadEnd(deadEndLimit, nReads);
                sum += nDeadEnd;
            } while (nDeadEnd != 0);
            cout << "done\n";
            cout << "   " << sum << " dead-ends (l<=" << deadEndLimit << "nt) have been removed" << endl;
            cout << "   " << "corresponding to " << nReads << " reads ("
                    << (float) nReads / R->getEffectiveNReads()*100 << "%)";

            cout << endl;

            G.condense(true, true);

            if (checkGraph)
                G.checkConsistency();

            if (contextualCleaning) {
                //1) reliable cutoff
                //2) suspect cutoff (removed only if gas at least one reliable brother
                //3) G-incoherent cutoff (anyway removed)

                G.computeEdgesProb(CCreliableCutoff);
                sum = G.removeEdgesByValue(CCsupectCutoff, CCGincoherantCutoff);
                cout << "   " << sum << " edges have been cleaned out" << endl;
                G.condense(true, true);
                cout << "Removing dead-end path..." << flush;

                sum = 0;
                nReads = 0;

                do {
                    nDeadEnd = G.identifyDeadEnd(deadEndLimit, nReads);
                    sum += nDeadEnd;
                } while (nDeadEnd != 0);
                cout << "done\n";
                cout << "   " << sum << " dead-ends (l <= " << deadEndLimit << "nt) have been removed" << endl;
                cout << "   " << "corresponding to " << nReads << " reads ("
                        << (float) nReads / R->getEffectiveNReads()*100 << "%)\n";

                G.condense(true, true);
            }

            G.estimateCoverage(minCoverage, targetSize);

            if (cleanBubbles) {
                G.bubbles((double)minCoverage/2);
                G.condense(true, true);
            }
            if (minNodeCov > 0.0) {
                G.coverageCutoff(minNodeCov);
                G.condense(true, true);
            }
            
            if (discardNonUsable) {
                cout << "Discarding non-usable reads... " << flush;
                nNodes = G.discardNonUsableNode(nReads);
                cout << "done\n";
                cout << "   " << nNodes << " nodes corresponding to " << nReads << " reads have been discarded ("
                        << (float) nReads / R->getEffectiveNReads()*100 << "%)\n";
            }
  
           // cout << "Removing suspicious nodes\n";
            G.discardSuspiciousNode(minCoverage);
          
            
            G.condense(true, true);

            if (writeOSG) {
                //write overlaps string graph
                cout << "Saving the overlaps string graph..." << flush;
                G.saveData((prefix + ".osg").c_str());
                cout << "done" << endl;
            }
        }

        if (cleanGraph==false)
                G.estimateCoverage(minCoverage, targetSize);
        
        G.estimatePairedDistance(peHorizon, nsd, prefix);

        if (checkGraph)
            G.checkConsistency();

        for (unsigned int i = 1; i <= G.getNNodes(); i++)
            G.nodesTab[i].unsetVisited();

        if (!dev) {//OUTPUT CONTIGS
            if (minContigSize == 0)
                minContigSize = R->getReadsLength() + R->getReadsLength() / 2;
            G.assemble(prefix, minContigSize, minCoverage, trim, minNPair, minRatio, maxRedundancy);

        } else {//DEV SHELL
          
            //            for (unsigned int i = 1; i < G.getNNodes(); i++)
            //            {
            //                if (G.nodesTab[i].getNReads() > 10 &&
            //                        L->isUniDirectional(G.nodesTab[i].getLayout()))
            //                {
            //                    if (!done)
            //                    {
            //                        cout << "warning: unidirectional nodes have been detected\n";
            //                        done=true;
            //                    }
            //                    cout << "node " << i << " (" << L->getNReads(G.nodesTab[i].getLayout()) << " reads)" << endl;
            //                    // G.nodesTab[i].isolate();
            //                }
            //            }
            
            G.computeEdgesProb(CCreliableCutoff);
            DevShell DEV;
            DEV.init(&G,P,R,L);
            DEV.prompt();
            
       

        }
    }

    delete P;
    delete L;
    delete R;

    return 0;
}

