#include "HashSearch.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "threadpool.hpp" #include "weight.h" #include "aa.h" #include "n2a.h" #include "mergeUnit.h" using namespace std; using namespace boost; using namespace boost::threadpool; boost::mutex muMonitor; const ushort ONEBYTE = 15; const ushort TWOBYTE = 255; const ushort THRBYTE = 4095; const ushort FOUBYTE = 65535; CHashSearch::CHashSearch(int nThreadNum) { // for any letter which is not in the 20 aa m_uMask = 10; m_uSeg = 8; fill_n(m_aChar2Code, 256, (m_uMask<<4)); fill_n(m_aCode2Char, 256, m_uMask); fill_n(m_aCode2Ten, 256, m_uMask); // read group info in aa.h and build mapping array // defaultly use murphy10s for (int i = 0; i < 500; ++i) { if ('\0' == murphy10s[i][0]) { break; } char* p = murphy10s[i]; for (uint j = 0; j < strlen(p); ++j) { // use new format to fix alignment break in SEGed region // the first four bits: group id // then one bit: SEGed? 1 : 0 // then three bits: offset in a group uint unIdx = (i << 4) + j+1; m_aChar2Code[p[j]] = unIdx; m_aChar2Code[p[j]+32] = unIdx; // add lower-case character m_aCode2Char[unIdx] = p[j]; m_aCode2Ten[unIdx] = i; unIdx |= m_uSeg; // set SEGed bit m_aChar2Code[p[j]+128] = unIdx; // SEGed letter } } // build substitute matrix for compressed char fill_n((int*)m_aSubMatrix, 256*256, -5); for (uint i = 0; i < strlen(aAAAlph); ++i) { for (uint j = 0; j < strlen(aAAAlph); ++j) { if (i < 20 && j < 20) { m_aSubMatrix[m_aChar2Code[aAAAlph[i]]][m_aChar2Code[aAAAlph[j]]] = blosum62[i][j]; } } } m_aChar2Code['.'] = (10 << 4); m_aCode2Char[(10<<4)] = '.'; m_aCode2Ten[(10<<4)] = m_uMask; m_aCode2Char['-'] = '-'; m_pBlastSig = NULL; m_pComptor = NULL; int LONGQUERY = 4096; if (0 == nThreadNum) { m_nThreadNum = 1; } else { m_nThreadNum = nThreadNum; } m_vTrace.assign(m_nThreadNum, vector >(LONGQUERY, vector(LONGQUERY))); m_vETrace.assign(m_nThreadNum, vector >(LONGQUERY, vector(LONGQUERY))); m_vDTrace.assign(m_nThreadNum, vector >(LONGQUERY, vector(LONGQUERY))); m_vBlastPt.assign(m_nThreadNum, -1); m_unTotalSeeds = 0; m_unTotalQuery = 0; m_unTotalSubj = 0; m_bSeqType = false; // used for convert index from with-fram to non-frame m_nIdxScl = 1; m_nQueryType = 0; m_sOutBase = ""; m_sOutput = ""; m_sOutput.reserve(100000000); m_sM8 = ""; //m_sM8.reserve(50000000); m_llOutCum = 0; m_llM8Cum = 0; m_nSeqBase = 0; // for test on gap extension m_unGapExt = 0; // hssp m_vCriteria.assign(100, 0); for (int i = 1; i < 100; ++i) { float f = 290.15 * pow(i, -0.562); f = f * i / 100; m_vCriteria[i] = (int)ceil(f); } m_unXmlSp = 0; m_unXmlCnt = 1; m_lnSeqNum = 0; m_lnTotalAa = 0; m_nStdout = 0; m_sQFile = ""; m_sDFile = ""; m_sStartTime = ""; m_sLeft = ""; } struct CompDbObj { CompDbObj(VUCHAR& vSeqs, VUINT& vLens, uint& unMer) : m_vSeqs(vSeqs), m_vLens(vLens), m_unMer(unMer) {} bool operator() (const uint pos1, const uint pos2) const { // init paras int nIdx1 = pos1>>11; int nLen1 = m_vLens[nIdx1+1] - m_vLens[nIdx1]; int nOff1 = (pos1&0x7ff) + m_unMer; uchar* p1 = &m_vSeqs[m_vLens[nIdx1]] + nOff1; int nIdx2 = pos2>>11; int nLen2 = m_vLens[nIdx2+1] - m_vLens[nIdx2]; int nOff2 = (pos2&0x7ff) + m_unMer; uchar* p2 = &m_vSeqs[m_vLens[nIdx2]] + nOff2; // comp int nDiff = 0; if (nLen1-nOff1 >= 4 && nLen2-nOff2 >= 4) { nDiff = 4; } else { nDiff = (nLen1-nOff1) < (nLen2-nOff2) ? (nLen1-nOff1) : (nLen2-nOff2); } for (int i = 0; i < nDiff; ++i) { if ((*(p1+i)>>4) != (*(p2+i)>>4)) { return (*(p1+i)>>4) < (*(p2+i)>>4); } } return ((nLen1-nOff1) < (nLen2-nOff2)); } VUCHAR& m_vSeqs; VUINT& m_vLens; uint& m_unMer; }; int CHashSearch::BuildDHash(const char* szFile, string& sOutFile, int nSplitNum, bool bFullId) { /***************************************************************/ // revise the size of database according to rapsearch long int lnSeqNum = 0; long int lnAaNum = 0; GuessTotSeq(szFile, lnSeqNum, lnAaNum); ifstream is(szFile); is.seekg(0, ios::end); long int lnFileSize = is.tellg(); is.close(); long int lnBlockSize = (long int)((1<<30) * 0.618); uint unBlockSize = m_unDSize = lnBlockSize; if (0 != nSplitNum) { unBlockSize = m_unDSize = lnFileSize/nSplitNum + 1; } /***************************************************************/ // the para for seed variants in both fast and slow mode m_bFast = true; ifstream ifFile(szFile); if (!ifFile.good()) { ifFile.close(); cout << "can not open the file: " << szFile << endl; return -1; } ofstream of(sOutFile.c_str()); ofstream ofInfo((sOutFile+".info").c_str()); if (!of.good() || !ofInfo.good()) { cout << "can not write files..." << endl; return -1; } archive::binary_oarchive oa(of); archive::binary_oarchive oaInfo(ofInfo); // container for db para MINDEX vHash(m_unTotalIdx, VUINT()); // all k-mer of database VUINT vLens; VUCHAR vSeqs; VNAMES vNames; vector vFreq(strlen(murphy10r), 0); VUINT vWordCnts(m_unTotalIdx, 0); uint unMedian = 0; long int lnTotalAa = 0; // data pool for processing data file POOL vPool(m_unDSize, 0); int nBlock = 0; uint unLeft = 0; while (ifFile.good()) { ifFile.read(&vPool[unLeft], m_unDSize-unLeft); int nRead = ifFile.gcount(); ITER itStop = vPool.begin(); if (unLeft+nRead < m_unDSize) { // for last block of data, give it a '>' to split the last sequence vPool[unLeft+nRead] = '>'; advance(itStop, unLeft+nRead+1); } else { advance(itStop, unLeft+nRead); } // find a completed sequence ITER itSt = find(vPool.begin(), itStop, '>'); ITER itBeg = find(itSt, itStop, '\n'); ITER itEd = find(itBeg, itStop, '>'); while (itEd != itStop) { vLens.push_back(0); while (itEd != itStop && vSeqs.size() < unBlockSize && vNames.size() < 2096152) // 2^21=2097152, assume that the longest sequence is less than 1848000 { ++itBeg; // the lengths of some sequences are more than 2048 ITER itEnd = remove(itBeg, itEd, '\r'); itEnd = remove(itBeg, itEnd, '\n'); int nLen = distance(itBeg, itEnd); if (nLen > 2048) { // store longer seq into several fragments with overlaps of size 200 int nNum = (nLen-200) / 1848; if ((nLen-200) % 1848 != 0) { ++nNum; } for (int i = 0; i < nNum-1; ++i) { if (bFullId == false) { vNames.push_back(string(itSt+1, find(itSt+1, itBeg-1, ' '))); } else { vNames.push_back(string(itSt+1, itBeg-1)); } vSeqs.insert(vSeqs.end(), itBeg+i*1848, itBeg+2048+i*1848); vLens.push_back(vSeqs.size()); } if (bFullId == false) { vNames.push_back(string(itSt+1, find(itSt+1, itBeg-1, ' '))); } else { vNames.push_back(string(itSt+1, itBeg-1)); } vSeqs.insert(vSeqs.end(), itBeg+(nNum-1)*1848, itEnd); vLens.push_back(vSeqs.size()); } else { if (bFullId == false) { vNames.push_back(string(itSt+1, find(itSt+1, itBeg-1, ' '))); } else { vNames.push_back(string(itSt+1, itBeg-1)); } vSeqs.insert(vSeqs.end(), itBeg, itEnd); vLens.push_back(vSeqs.size()); } itSt = itEd; itBeg = find(itSt, itStop, '\n'); itEd = find(itBeg, itStop, '>'); } // char to code lnTotalAa += Encode(vSeqs, vFreq); for (uint i = 0; i < vLens.size()-1; ++i) { // -1 or no, I need to think about it for (uint j = vLens[i]; j < vLens[i+1]-m_unMer; ++j) { int nIdx = Tran2Ten(vSeqs, j); if (-1 != nIdx) { // the left 21 bits denotes the index, the right 11 bits denotes the starting position of the seed vHash[nIdx].push_back((i<<11)|(j-vLens[i])); } } } // output information //PrintInfo(vHash); //PrintHash(vHash); VCOMP vComp(m_unTotalIdx, VUSHORT()); for (uint i = 0; i < vHash.size(); ++i) { sort(vHash[i].begin(), vHash[i].end(), CompDbObj(vSeqs, vLens, m_unMer)); for (uint j = 0; j < vHash[i].size(); ++j) { uint pos1 = vHash[i][j]; int nIdx1 = pos1>>11; int nLen1 = vLens[nIdx1+1] - vLens[nIdx1]; int nOff1 = (pos1&0x7ff) + m_unMer; uchar* p1 = &vSeqs[vLens[nIdx1]] + nOff1; int m = nLen1 - nOff1; int n = 0; ushort nSuff = 0; //string sSuff; if (m >= 4) { nSuff |= (m_aCode2Ten[p1[n]]) << 12; nSuff |= (m_aCode2Ten[p1[++n]]) << 8; nSuff |= (m_aCode2Ten[p1[++n]]) << 4; nSuff |= (m_aCode2Ten[p1[++n]]); } else if (m == 3) { nSuff |= (m_aCode2Ten[p1[n]]) << 12; nSuff |= (m_aCode2Ten[p1[++n]]) << 8; nSuff |= (m_aCode2Ten[p1[++n]]) << 4; nSuff |= ONEBYTE; } else if (m == 2) { nSuff |= (m_aCode2Ten[p1[n]]) << 12; nSuff |= (m_aCode2Ten[p1[++n]]) << 8; nSuff |= TWOBYTE; } else if (m == 1) { nSuff |= (m_aCode2Ten[p1[n]]) << 12; nSuff |= THRBYTE; } else if (m == 0) { nSuff |= FOUBYTE; } vComp[i].push_back(nSuff); } } //serialize oa << vSeqs; oa << vLens; oa << vHash; oa << vNames; oa << vComp; uint unTotalWord = 0; for (uint i = 0; i < vHash.size(); ++i) { vWordCnts[i] += vHash[i].size(); unTotalWord += vHash[i].size(); vHash[i].clear(); } vLens.clear(); vSeqs.clear(); vNames.clear(); ++nBlock; } // move the data at the end of pool to the beginning unLeft = distance(itSt, vPool.end()); POOL::reverse_iterator itLast = find(vPool.rbegin(), vPool.rend(), '>'); copy(itSt, vPool.end(), vPool.begin()); } //serialize for (uint i = 0; i < vFreq.size(); ++i) { vFreq[i] /= lnTotalAa; } oaInfo << nBlock; oaInfo << lnSeqNum; //oaInfo << lnTotalAa; oaInfo << lnAaNum; oaInfo << vWordCnts; sort(vWordCnts.begin(), vWordCnts.end()); //nth_element(vWordCnts.begin(), vWordCnts.begin()+m_unTotalIdx/2, vWordCnts.end()); unMedian = vWordCnts[m_unTotalIdx/2]; oaInfo << unMedian; oaInfo << vFreq; ifFile.close(); of.close(); ofInfo.close(); return nBlock; } int CHashSearch::BuildQHash(istream& input, int nQueryType, map& mTransTable, map& mComple, Seg* seg, Seg* segsht, vector& vSeqs, vector& vLens, VNAMES& vNames) { char cIdSt = '>'; char cSeqEd = '>'; // init m_bSeqType & m_nIdxScl m_nQueryType = nQueryType; if (1 == m_nQueryType) { // nt m_bSeqType = true; m_nIdxScl = 6; printf("Queries are nucleotide sequences in fasta format\n"); } else if (2 == m_nQueryType) { // aa m_bSeqType = false; printf("Queries are protein sequences\n"); } else if (3 == m_nQueryType) { // fastq m_bSeqType = true; m_nIdxScl = 6; printf("Queries are nucleotide sequences in fastq format\n"); cIdSt = '@'; cSeqEd = '+'; } int nSeqNum = 0; POOL vPool(m_unQSize, 0); if (!m_sLeft.empty()) { copy(m_sLeft.begin(), m_sLeft.end(), vPool.begin()); } if (input.good()) { input.read(&vPool[0]+m_sLeft.size(), m_unQSize-m_sLeft.size()); int nRead = input.gcount(); if (nRead+m_sLeft.size() == 0) { return 0; } if (m_sLeft.size()+nRead < m_unQSize) { // for last block of data, give it a '>' to split the last sequence vPool[m_sLeft.size()+nRead] = cIdSt; } if (m_nQueryType == 0) { if (vPool[0] == '>') { m_nQueryType = GuessQueryType(vPool); if (1 == m_nQueryType) { // nt m_bSeqType = true; m_nIdxScl = 6; printf("Queries are nucleotide sequences in fasta format\n"); } else if (2 == m_nQueryType) { // aa m_bSeqType = false; printf("Queries are protein sequences\n"); } } else if (vPool[0] == '@') { // fastq m_bSeqType = true; m_nIdxScl = 6; printf("Queries are nucleotide sequences in fastq format\n"); cIdSt = '@'; cSeqEd = '+'; m_nQueryType = 3; } } ITER itStop = vPool.begin(); if (m_sLeft.size()+nRead < m_unQSize) { advance(itStop, m_sLeft.size()+nRead+1); } else { if (3 == m_nQueryType) { ITER itTemp = itStop; bool bFound = true; while (bFound) { bFound = true; for (int i = 0; i < 4; ++i) { itTemp = find(++itTemp, vPool.end(), '\n'); if (vPool.end() == itTemp) { bFound = false; break; } } if (vPool.end() != itTemp) { itStop = ++itTemp; } } m_sLeft.clear(); m_sLeft.assign(itStop, vPool.end()); ++itStop; } else { POOL::reverse_iterator rit = find(vPool.rbegin(), vPool.rend(), cIdSt); while ('\n' != *(++rit)) { rit = find(rit, vPool.rend(), '>'); } int n = distance(rit, vPool.rbegin()); uint unLeft = m_unQSize + n + 1; m_sLeft.clear(); m_sLeft.assign(vPool.begin()+(unLeft-1), vPool.end()); itStop = vPool.begin() + unLeft; } } vPool.resize(distance(vPool.begin(), itStop)); itStop = vPool.end(); vLens.push_back(0); ITER itSt = find(vPool.begin(), itStop, cIdSt); ITER itBeg = find(itSt, itStop, '\n'); ITER itEd = find(itBeg, itStop, cSeqEd); if (true == m_bSeqType) { // query is dna vector vTran; vTran.reserve(1024); while (itEd != itStop) { ++itBeg; vNames.push_back(/*'>'+*/string(itSt+1, find(itSt+1, itBeg-1, ' '))); vector vS(itBeg, itEd-1); for (ITER itUpper = vS.begin(); itUpper != vS.end(); ++itUpper) { if (*itUpper >= 'a' && *itUpper <= 'z') { *itUpper = *itUpper - 'a' + 'A'; } } vS.erase(remove(vS.begin(), vS.end(), '\r'), vS.end()); vS.erase(remove(vS.begin(), vS.end(), '\n'), vS.end()); int x = vS.size(); for (int nFrame = 0; nFrame < 6; ++nFrame) { vTran.clear(); if (3 == nFrame) { // backward reverse(vS.begin(), vS.end()); for (uint nn = 0; nn < vS.size(); ++nn) { map::iterator it = mComple.find(vS[nn]); if (it != mComple.end()) { vS[nn] = it->second; } else { vS[nn] = 'N'; } } } // translate char* pSt = &vS[0] + nFrame%3; char* pEd = pSt + ((x-nFrame%3)/3)*3; for (; pSt < pEd; pSt += 3) { map::iterator it = mTransTable.find(string(pSt, 3)); if (it != mTransTable.end()) { vTran.push_back(it->second); } else { vTran.push_back(UNKNOWN_AA); } } vTran.push_back('\0'); // mark the sequence char* pMasked = NULL; if (vTran.size()-1 >= 12) { pMasked = seg -> maskseq(&vTran[0]); } else { pMasked = segsht -> maskseq(&vTran[0]); } for (uint i = 0; i < strlen(pMasked); ++i) { if ('X' == pMasked[i] || 'x' == pMasked[i]) { //vTran[i] = pMasked[i]; vTran[i] += 128; } } delete [] pMasked; // a char '\0' was added at the end of vTran, so now ignore it vSeqs.insert(vSeqs.end(), vTran.begin(), vTran.end()-1); vLens.push_back(vSeqs.size()); } itSt = itEd; if (3 == m_nQueryType) { itSt = find(itSt, itStop, '\n'); ++itSt; itSt = find(itSt, itStop, '\n'); ++itSt; } itBeg = find(itSt, itStop, '\n'); itEd = find(itBeg, itStop, cSeqEd); } } else { while (itEd != itStop) { ++itBeg; vNames.push_back(string(itSt+1, find(itSt+1, itBeg-1, ' '))); for (ITER itUpper = itBeg; itUpper != itEd-1; ++itUpper) { if (*itUpper >= 'a' && *itUpper <= 'z') { *itUpper = *itUpper - 'a' + 'A'; } } int nIter = vSeqs.size(); vSeqs.insert(vSeqs.end(), itBeg, itEd-1); vSeqs.erase(remove(vSeqs.begin()+nIter, vSeqs.end(), '\r'), vSeqs.end()); vSeqs.erase(remove(vSeqs.begin()+nIter, vSeqs.end(), '\n'), vSeqs.end()); vLens.push_back(vSeqs.size()); itSt = itEd; itBeg = find(itSt, itStop, '\n'); itEd = find(itBeg, itStop, cSeqEd); } } Encode(vSeqs); nSeqNum += vNames.size(); } return nSeqNum; } void CHashSearch::Search(string& sDbPre, int nSeqNum, vector& vQSeqs, vector& vQLens, VNAMES& vQNames) { ifstream ifD(sDbPre.c_str()); ifstream ifDInfo((sDbPre+".info").c_str()); if (!ifD.good() || !ifDInfo.good()) { ifstream if1((sDbPre+".des").c_str()); ifstream if2((sDbPre+".des").c_str()); ifstream if3((sDbPre+".des").c_str()); ifstream if4((sDbPre+".des").c_str()); if (if1.good() && if2.good() && if3.good() && if4.good()) { cout << "This database file comes from RAPSearch1." << endl; cout << "Please re-index the database using presearch from RAPSearch2" << endl; } else { cout << "Can not open the database file" << endl; cout << "Please check the file name" << endl; } if1.close(); if2.close(); if3.close(); if4.close(); return; } // construct query package CQrPckg Query(vQSeqs, vQLens, vQNames); archive::binary_iarchive iaD(ifD); archive::binary_iarchive iaDInfo(ifDInfo); vector vFreq; VUINT vWordCnts(m_unTotalIdx, 0); int nDbBlockNum = 0; long int lnTotalAa; uint unMedian; long int lnSeqNum; iaDInfo >> nDbBlockNum; iaDInfo >> lnSeqNum; iaDInfo >> lnTotalAa; iaDInfo >> vWordCnts; iaDInfo >> unMedian; iaDInfo >> vFreq; m_lnSeqNum = lnSeqNum; m_lnTotalAa = lnTotalAa; // set BlastStat InitAlignPara(m_bSeqType, lnTotalAa, lnSeqNum, m_nThreadNum); pool tp(m_nThreadNum); cout << "start " << m_nThreadNum << " threads" << endl; m_vOutIdx.assign(nSeqNum, CIndex()); for (int j = 0; j < nDbBlockNum; ++j) { // temp file stream if (m_ofTemp.is_open()) { m_ofTemp << m_sOutput; m_sOutput.clear(); m_ofTemp.close(); m_llOutCum = 0; ofstream ofOut((m_sOutBase+".tmp"+lexical_cast(j-1)+".idx").c_str()); archive::binary_oarchive oaOut(ofOut); oaOut << m_vOutIdx; ofOut.close(); } for (int nn = 0; nn < nSeqNum; ++nn) { m_vOutIdx[nn].m_llBeg = 0; m_vOutIdx[nn].m_nSize = 0; } m_nSeqBase = 0; m_ofTemp.open((m_sOutBase+".tmp"+lexical_cast(j)).c_str()); // read db file and store info MINDEX vDHash(m_unTotalIdx, VUINT()); // all k-mer of database vector vDLens; vector vDSeqs; VNAMES vDNames; VCOMP vComp; iaD >> vDSeqs; iaD >> vDLens; iaD >> vDHash; iaD >> vDNames; iaD >> vComp; // construct query package CDbPckg Db(vDHash, vDSeqs, vDLens, vDNames, vComp, vFreq, vWordCnts, unMedian); m_unTotalSubj += vDLens.size() - 1; //for (int i = 0; i < nQBlockNum; ++i) { m_unTotalQuery += vQLens.size() - 1; // generate results for (uint k = 0; k < vQLens.size()-1; k+=m_nIdxScl) { tp.schedule(bind(&CHashSearch::Searching, this, k, Query, Db)); //Searching(k, Query, Db); } tp.wait(); m_nSeqBase += vQNames.size(); } } tp.wait(); if (m_ofTemp.is_open()) { m_ofTemp << m_sOutput; m_ofTemp.close(); m_llOutCum = 0; ofstream ofOut((m_sOutBase+".tmp"+lexical_cast(nDbBlockNum-1)+".idx").c_str()); archive::binary_oarchive oaOut(ofOut); oaOut << m_vOutIdx; ofOut.close(); m_vOutIdx.clear(); } // merge nDbBlockNum temp results MergeRes(nDbBlockNum, vQNames, sDbPre); ifD.close(); ifDInfo.close(); } void CHashSearch::Process(char* szDBFile, char* szQFile, char* szOFile, int nStdout, bool bEvalue, bool bLogE, double dThr, int nMaxOut, int nMaxM8, int nQueryType, bool bPrintEmpty, bool bGapExt, bool bAcc, bool bHssp, int nMinLen, bool bXml, uint unDSize, uint unQSize, uint unMer) { m_bEvalue = bEvalue; m_bLogE = bLogE; if (m_bEvalue == true) { m_pComptor = new CompEval(); } else { m_pComptor = new CompBits(); } m_dThr = dThr; if (m_bLogE == false) { //m_dThr = log(m_dThr); m_dThr = log(m_dThr) / log(10); //needs to be log_10, YY, Sep 2016 } if (nMaxOut == -1) { m_nMaxOut = LLONG_MAX; } else { m_nMaxOut = abs(nMaxOut); } if (nMaxM8 == -1) { m_nMaxM8 = LLONG_MAX; } else { m_nMaxM8 = abs(nMaxM8); } m_bPrintEmpty = bPrintEmpty; m_bGapExt = bGapExt; m_bAcc = bAcc; m_bHssp = bHssp; m_nMinLen = nMinLen; m_bXml = bXml; m_unMer = unMer; m_unDSize = unDSize; m_unQSize = unQSize; m_unTotalIdx = lexical_cast(pow(10.0, int(m_unMer))); m_bFast = true; if (true == m_bFast) { m_unMutSeedLen = 10; m_vMutation.push_back(lexical_cast(pow(10.0, int(m_unMer-4-1)))); m_vMutation.push_back(lexical_cast(pow(10.0, int(m_unMer-5-1)))); m_vMutation.push_back(lexical_cast(pow(10.0, int(m_unMer-3-1)))); if (m_unMer > 6) { m_vMutation.push_back(lexical_cast(pow(10.0, int(m_unMer-6-1)))); } } else { m_unMutSeedLen = 9; m_vMutation.push_back(lexical_cast(pow(10.0, int(m_unMer-3-1)))); m_vMutation.push_back(lexical_cast(pow(10.0, int(m_unMer-5-1)))); } m_sQFile = szQFile; m_sDFile = szDBFile; m_nStdout = nStdout; if (szOFile != NULL) { m_sOutBase.assign(szOFile); string sDel = m_sOutBase + ".aln"; ifstream iffTest; iffTest.open(sDel.c_str()); if (iffTest.good()) { iffTest.close(); remove(sDel.c_str()); } sDel = m_sOutBase + ".m8"; iffTest.open(sDel.c_str()); if (iffTest.good()) { iffTest.close(); remove(sDel.c_str()); } } else { m_sOutBase = ""; } time_t rawtime; struct tm* timeinfo; time(&rawtime); timeinfo = localtime(&rawtime); m_sStartTime.assign(asctime(timeinfo)); ifstream fIn; if (m_sQFile != "stdin") { fIn.open(m_sQFile.c_str()); if (!fIn.good()) { fIn.close(); cout << "can not open the file: " << m_sQFile << endl; exit(1); } } istream& input = (m_sQFile!="stdin") ? fIn : cin; map mTransTable; map mComple; Seg* seg = NULL; Seg* segsht = NULL; //if (true == m_bSeqType) { // if need, construct paras for translation from nt to aa for (int i = 0; i < TOTCODON; ++i) { const char* p = nt[i]; mTransTable[string(p, 3)] = aa[i]; } mComple['A'] = 'T'; mComple['T'] = 'A'; mComple['a'] = 't'; mComple['t'] = 'a'; mComple['C'] = 'G'; mComple['G'] = 'C'; mComple['c'] = 'g'; mComple['g'] = 'c'; mComple['U'] = 'A'; mComple['u'] = 'a'; seg = new Seg(LGERSEED); segsht = new Seg(DEFSEED); } vector vQSeqs; vector vQLens; VNAMES vQNames; int nSeqNum = 0; while ((nSeqNum=BuildQHash(input, nQueryType, mTransTable, mComple, seg, segsht, vQSeqs, vQLens, vQNames)) > 0) { string sDbOut(szDBFile); Search(sDbOut, nSeqNum, vQSeqs, vQLens, vQNames); vQSeqs.clear(); vQLens.clear(); vQNames.clear(); } if (m_sQFile != "stdin") { ((ifstream&)input).close(); } if (seg != NULL) { delete seg; } if (segsht != NULL) { delete segsht; } } void CHashSearch::Process(char* szDBFile, char* szDbHash, bool bFullId, int nSplitNum, uint unMer) { m_unMer = unMer; m_unTotalIdx = lexical_cast(pow(10.0, int(m_unMer))); string sDbOut(szDbHash); BuildDHash(szDBFile, sDbOut, nSplitNum, bFullId); } // rewrite this part void CHashSearch::Searching(int k, CQrPckg& Query, CDbPckg& Db) { //cout << k << endl; //cout << "my id:\t" << this_thread::get_id() << endl; //int nTreadID = m_mThreadID[this_thread::get_id()]; //BlastStat* pBlastSig = m_vpBlastSig[m_mThreadID[this_thread::get_id()]]; int nTreadID = -1; muMonitor.lock(); for (uint i = 0; i < m_vBlastPt.size(); ++i) { if (-1 == m_vBlastPt[i]) { nTreadID = i; m_vBlastPt[i] = 1; break; } } muMonitor.unlock(); int nFoundHit = 0; //using namespace boost::chrono; //thread_clock::time_point start = thread_clock::now(); MRESULT mRes; for (int nStep = 0; nStep < m_nIdxScl; ++nStep) { int nQrIdx = k + nStep; // the index of frame 0 int nQDnaIdx = nQrIdx / m_nIdxScl * m_nIdxScl; // original length of query uint unQLen = Query.m_vLens[nQrIdx+1] - Query.m_vLens[nQrIdx]; if (unQLen < m_unMer) { continue; } int nQOriLen = unQLen; if (true == m_bSeqType) { nQOriLen = 3 * (Query.m_vLens[nQDnaIdx+1]-Query.m_vLens[nQDnaIdx]); for (int n = 1; n < 3; ++n) { if (Query.m_vLens[nQDnaIdx+n+1]-Query.m_vLens[nQDnaIdx+n] == Query.m_vLens[nQDnaIdx+n]-Query.m_vLens[nQDnaIdx+n-1]) { ++nQOriLen; } else { break; } } } // set up BlastStat if (true == m_bSeqType) { // the index of the first seq considering the direction int n = nQrIdx / 3 * 3; m_vpBlastSig[nTreadID]->blastComputeLengthAdjustmentComp(Query.m_vLens[n+1]-Query.m_vLens[n]); } else { m_vpBlastSig[nTreadID]->blastComputeLengthAdjustmentComp(unQLen); } uchar* pQ = &Query.m_vSeqs[0] + Query.m_vLens[nQrIdx]; CAlnPckg QrAln(pQ, unQLen, 0); // build invalid index position vector vValid(unQLen, 0); for (uint xx = 0; xx < unQLen; ++xx) { vValid[xx] = m_aCode2Ten[pQ[xx]]; if (0 != (m_uSeg&pQ[xx])) { pQ[xx] &= ~m_uSeg; vValid[xx] = m_uMask; } } // for consistence with swift uint unPrvSdLen = 6; for (uint i = 0; i < unQLen - m_unMer; ++i) { uint unCnt = 0; //thread_clock::time_point st = thread_clock::now(); // pick seed length uint unQSeedBeg = QrAln.m_unSeedBeg = i; int nSeed = Tran2Ten(QrAln, vValid); if (-1 == nSeed) { continue; } uint unLocalSeed = 0; uint unIdx = 0; if (m_bAcc == false) { uint unIncr = 0; double dFold = 0.0; int nLeft = unQLen - unQSeedBeg - m_unMer; uint unRng = nLeft+1 >= 3 ? 3 : nLeft+1; //uint unFreq = Db.m_vHash[nSeed].size(); uint unFreq = Db.m_vWordCnts[nSeed]; if(unFreq <= Db.m_unMedian) { unLocalSeed = m_unMer; } else { double dExpFreq = unFreq; for(unIncr = 1; unIncr < unRng; unIncr ++) { if((unIdx = vValid[unQSeedBeg+m_unMer+unIncr-1]) != m_uMask) { dFold = Db.m_vFreq[unIdx]; } else { //dFold = 1.0 / strlen(murphy10r); break; } dExpFreq *= dFold; if(dExpFreq <= Db.m_unMedian) { break; } } unLocalSeed = m_unMer + unIncr; } // if there is a unacceptable char, give up this seed if (m_uMask == unIdx) { continue; } if (unLocalSeed < unPrvSdLen - 1) { unLocalSeed = unPrvSdLen - 1; } // no enough letters if (unQSeedBeg+unLocalSeed > unQLen) { continue; } } else { unLocalSeed = 10; // no enough letters if (unQSeedBeg+unLocalSeed > unQLen) { continue; } for (uint i = m_unMer; i < unLocalSeed; ++i) { if((unIdx = vValid[unQSeedBeg+i]) == m_uMask) { break; } } if (m_uMask == unIdx) { continue; } } vector vExtra(pQ+unQSeedBeg+m_unMer, pQ+unQSeedBeg+unLocalSeed); for (uint idx = 0; idx < vExtra.size(); ++idx) { vExtra[idx] = m_aCode2Ten[vExtra[idx]]; } if (!Db.m_vHash[nSeed].empty()) { int nCnt = ExtendSeq2Set(nSeed, unLocalSeed, vExtra, nQrIdx, QrAln, nQOriLen, vValid, Db.m_vHash[nSeed], Db, Query.m_vNames, Db.m_vNames, mRes, nTreadID); if (nCnt > 0) { unPrvSdLen = unLocalSeed; } else { unPrvSdLen = m_unMer; } } if (m_bAcc == true) { continue; } // mutation, pos: 4, 5, 3, (6) // check whether or not the length is enough if (unQLen < unQSeedBeg+m_unMutSeedLen) { continue; } // check non-aa char for (uint j = unQSeedBeg+unLocalSeed; j < unQSeedBeg+m_unMutSeedLen; ++j) { if((unIdx = vValid[j]) == m_uMask) { break; } } if (m_uMask == unIdx) { continue; } vExtra.assign(pQ+unQSeedBeg+m_unMer, pQ+unQSeedBeg+m_unMutSeedLen); for (uint idx = 0; idx < vExtra.size(); ++idx) { vExtra[idx] = m_aCode2Ten[vExtra[idx]]; } for (uint m = 0; m < m_vMutation.size(); ++m) { int nVal = (nSeed/m_vMutation[m]) % 10; int nMutIdx = nSeed - nVal*m_vMutation[m]; for (int n = 0; n < 10; ++n) { if (nMutIdx == nSeed) { nMutIdx += m_vMutation[m]; continue; } if (Db.m_vHash[nMutIdx].empty()) { nMutIdx += m_vMutation[m]; continue; } int nCnt = ExtendSeq2Set(nMutIdx, m_unMutSeedLen, vExtra, nQrIdx, QrAln, nQOriLen, vValid, Db.m_vHash[nMutIdx], Db, Query.m_vNames, Db.m_vNames, mRes, nTreadID); nFoundHit += nCnt; unCnt += nCnt; nMutIdx += m_vMutation[m]; } } /*********************************************************/ if (6 == m_unMer && m_unMutSeedLen > m_unMer) { // mutation pos 6 // think it as a mutation at pos 5, then do set intersection with current seed set //int nBase = (nQrIdx % 100000) * 10; int nNextNum = m_aCode2Ten[QrAln.m_pSeq[QrAln.m_unSeedBeg+m_unMer]]; for (int i = 0; i < 10; ++i) { if (i == nNextNum) { // if the mutation is equal to the original next char continue; } // change the 6th position vExtra[0] = i; int nCnt = ExtendSeq2Set(nSeed, m_unMutSeedLen, vExtra, nQrIdx, QrAln, nQOriLen, vValid, Db.m_vHash[nSeed], Db, Query.m_vNames, Db.m_vNames, mRes, nTreadID); nFoundHit += nCnt; unCnt += nCnt; } } //[>*******************************************************<] //thread_clock::time_point ed = thread_clock::now(); //cout << nStep << "\t" << i << "\t" << unCnt << "\tduration:\t" << duration_cast(ed-st).count() << " ms" << endl; } } PrintRes(mRes, nTreadID, Query, Db); muMonitor.lock(); m_vBlastPt[nTreadID] = -1; muMonitor.unlock(); } struct CompSeed { CompSeed(CDbPckg& Db, uint unMer, uchar* aCode2Ten) : m_Db(Db), m_unMer(unMer), m_aCode2Ten(aCode2Ten) {} bool operator() (const uint& unPos, const vector& vExtra) { uint unIdx = unPos >> 11; uint unDSeedBeg = unPos & 0x000007FF; uint unDLen = m_Db.m_vLens[unIdx+1] - m_Db.m_vLens[unIdx]; uchar* pD = &m_Db.m_vSeqs[0] + m_Db.m_vLens[unIdx]; int nDOff = unDLen - unDSeedBeg - m_unMer; bool bLess = false; int nLeast = vExtra.size(); int nDiff = nDOff>=nLeast ? nLeast : nDOff; if (0 == nDiff) { bLess = true; } else { uint i = m_unMer; for (; i < m_unMer+nDiff; ++i) { if (m_aCode2Ten[pD[unDSeedBeg+i]] != vExtra[i-m_unMer]) { bLess = (m_aCode2Ten[pD[unDSeedBeg+i]] < vExtra[i-m_unMer]); break; } } // if they are the same for nLeast letters if (i==m_unMer+nDiff && bLess==false && m_aCode2Ten[pD[unDSeedBeg+i-1]] == vExtra[i-1-m_unMer]) { if (nDiff < nLeast) { bLess = true; } else { bLess = false; } } } return bLess; } bool operator() (const vector& vExtra, uint& unPos) { uint unIdx = unPos >> 11; uint unDSeedBeg = unPos & 0x000007FF; uint unDLen = m_Db.m_vLens[unIdx+1] - m_Db.m_vLens[unIdx]; uchar* pD = &m_Db.m_vSeqs[0] + m_Db.m_vLens[unIdx]; int nDOff = unDLen - unDSeedBeg - m_unMer; bool bLess = false; int nLeast = vExtra.size(); int nDiff = nDOff>=nLeast ? nLeast : nDOff; if (0 == nDiff) { bLess = false; } else { uint i = m_unMer; for (; i < m_unMer + nDiff; ++i) { if (m_aCode2Ten[pD[unDSeedBeg+i]] != vExtra[i-m_unMer]) { bLess = (vExtra[i-m_unMer] < m_aCode2Ten[pD[unDSeedBeg+i]]); break; } } if (i==m_unMer+nDiff && bLess==false && m_aCode2Ten[pD[unDSeedBeg+i-1]]==vExtra[i-1-m_unMer]) { if (nDiff > nLeast) { bLess = true; } else { bLess = false; } } } return bLess; } CDbPckg& m_Db; uint m_unMer; uchar* m_aCode2Ten; }; struct CompShortLow { bool operator() (const ushort& s1, const ushort& s2) { int nLen1 = 4; if ((s1&ONEBYTE) == ONEBYTE) { --nLen1; } if ((s1&TWOBYTE) == TWOBYTE) { --nLen1; } if ((s1&THRBYTE) == THRBYTE) { --nLen1; } if ((s1&FOUBYTE) == FOUBYTE) { --nLen1; } int nLen2 = 4; if ((s2&ONEBYTE) == ONEBYTE) { --nLen2; } if ((s2&TWOBYTE) == TWOBYTE) { --nLen2; } if ((s2&THRBYTE) == THRBYTE) { --nLen2; } if ((s2&FOUBYTE) == FOUBYTE) { --nLen2; } int nLen = nLen1>((4-nLen)<<2)) == (s2>>((4-nLen)<<2))); if (true == b) { return nLen1>((4-nLen)<<2)) < (s2>>((4-nLen)<<2))); } } }; struct CompShortUp { bool operator() (const ushort& s1, const ushort& s2) { int nLen1 = 4; if ((s1&ONEBYTE) == ONEBYTE) { --nLen1; } if ((s1&TWOBYTE) == TWOBYTE) { --nLen1; } if ((s1&THRBYTE) == THRBYTE) { --nLen1; } if ((s1&FOUBYTE) == FOUBYTE) { --nLen1; } int nLen2 = 4; if ((s2&ONEBYTE) == ONEBYTE) { --nLen2; } if ((s2&TWOBYTE) == TWOBYTE) { --nLen2; } if ((s2&THRBYTE) == THRBYTE) { --nLen2; } if ((s2&FOUBYTE) == FOUBYTE) { --nLen2; } int nLen = nLen1>((4-nLen)<<2)) == (s2>>((4-nLen)<<2))); if (true == b) { return false; //return nLen1>((4-nLen)<<2)) < (s2>>((4-nLen)<<2))); } } }; int CHashSearch::ExtendSeq2Set(int nSeed, uint unLocalSeedLen, vector& vExtra, int nQSeqIdx, CAlnPckg& QrAln, int nQOriLen, vector& vValid, VUINT& vDSet, CDbPckg& Db, VNAMES& vQNames, VNAMES& vDNames, MRESULT& mRes, int nTreadID) { //using namespace boost::chrono; //thread_clock::time_point st = thread_clock::now(); // find a proper range for the comparisons int nSt = 0; int nEd = 0; if (unLocalSeedLen > m_unMer) { //VUINT::iterator itSd = lower_bound(vDSet.begin(), vDSet.end(), vExtra, CompSeed(Db, m_unMer, m_aCode2Ten)); //nSt = itSd - vDSet.begin(); ushort nExtra = 0; for (uint i = 0; i < vExtra.size(); ++i) { nExtra |= (vExtra[i]) << (12-(i<<2)); } for (int i = vExtra.size(); i < 4; ++i) { nExtra |= (ONEBYTE) << (12-(i<<2)); } VUSHORT::iterator itShort = lower_bound(Db.m_vComp[nSeed].begin(), Db.m_vComp[nSeed].end(), nExtra, CompShortLow()); nSt = itShort - Db.m_vComp[nSeed].begin(); // check if nSt is real hit, if not, break { if (vDSet.size() == nSt) { return 0; } ushort s1 = Db.m_vComp[nSeed][nSt]; ushort s2 = nExtra; int nLen1 = 4; if ((s1&ONEBYTE) == ONEBYTE) { --nLen1; } if ((s1&TWOBYTE) == TWOBYTE) { --nLen1; } if ((s1&THRBYTE) == THRBYTE) { --nLen1; } if ((s1&FOUBYTE) == FOUBYTE) { --nLen1; } int nLen2 = 4; if ((s2&ONEBYTE) == ONEBYTE) { --nLen2; } if ((s2&TWOBYTE) == TWOBYTE) { --nLen2; } if ((s2&THRBYTE) == THRBYTE) { --nLen2; } if ((s2&FOUBYTE) == FOUBYTE) { --nLen2; } int nLen = nLen1>((4-nLen)<<2)) == (s2>>((4-nLen)<<2))); if (true != b) { return 0; } } //itSd = upper_bound(vDSet.begin(), vDSet.end(), vExtra, CompSeed(Db, m_unMer, m_aCode2Ten)); //nEd = itSd - vDSet.begin(); itShort = upper_bound(Db.m_vComp[nSeed].begin(), Db.m_vComp[nSeed].end(), nExtra, CompShortUp()); nEd = itShort - Db.m_vComp[nSeed].begin(); } else { nSt = 0; nEd = vDSet.size(); } //st = thread_clock::now(); //sequence extension STAlnmnt stAlnmnt; for (int j = nSt; j < nEd; ++j) { if (vDNames[vDSet[j]>>11] == "7719.ENSCINP00000006706") { int zya = 0; } uint unDLen, unDSeedBeg; uchar* pD = GetSeq(Db.m_vSeqs, Db.m_vLens, Db.m_vNames, vDSet[j], unDLen, unDSeedBeg); // no enough letters if (unDLen < unDSeedBeg + unLocalSeedLen) { continue; } // have the same previous char, so don't extend them at this time if (4 != vExtra.size() // if this is a mutation case, do not allow to ignore it && m_bAcc==false && 0 != QrAln.m_unSeedBeg && 0 != unDSeedBeg && m_aCode2Ten[QrAln.m_pSeq[QrAln.m_unSeedBeg-1]] == m_aCode2Ten[pD[unDSeedBeg-1]] && m_uMask != vValid[QrAln.m_unSeedBeg-1] ) { continue; } CAlnPckg DbAln(pD, unDLen, unDSeedBeg); ResetResult(stAlnmnt); // for debug uint unLocalCopy = unLocalSeedLen; uchar* pQAlign = QrAln.m_pSeq + QrAln.m_unSeedBeg; uchar* pDAlign =DbAln.m_pSeq + DbAln.m_unSeedBeg; int ii = 0; for (; ii < unLocalCopy; ++ii) { stAlnmnt.nScore += m_aSubMatrix[pQAlign[ii]][pDAlign[ii]]; if (pQAlign[ii] == pDAlign[ii]) { ++stAlnmnt.nMatch; } } // forward maximal extension uint unTempLen = QrAln.m_unLen-QrAln.m_unSeedBeg < DbAln.m_unLen-DbAln.m_unSeedBeg ? QrAln.m_unLen-QrAln.m_unSeedBeg : DbAln.m_unLen-DbAln.m_unSeedBeg; while (ii < unTempLen && m_aCode2Ten[pQAlign[ii]] == m_aCode2Ten[pDAlign[ii]]) { ++unLocalCopy; stAlnmnt.nScore += m_aSubMatrix[pQAlign[ii]][pDAlign[ii]]; if (pQAlign[ii] == pDAlign[ii]) { ++stAlnmnt.nMatch; } ++ii; } // backward maximal extension uint unQSeed = QrAln.m_unSeedBeg; int nRange = QrAln.m_unSeedBeg < DbAln.m_unSeedBeg ? QrAln.m_unSeedBeg : DbAln.m_unSeedBeg; nRange = -nRange; ii = -1; while (ii >= nRange && m_aCode2Ten[pQAlign[ii]] == m_aCode2Ten[pDAlign[ii]]) { ++unLocalCopy; stAlnmnt.nScore += m_aSubMatrix[pQAlign[ii]][pDAlign[ii]]; if (pQAlign[ii] == pDAlign[ii]) { ++stAlnmnt.nMatch; } --QrAln.m_unSeedBeg; --DbAln.m_unSeedBeg; --ii; } if (stAlnmnt.nScore >= UngapExtSCut && stAlnmnt.nMatch >= MinMatch4Exp) { AlignSeqs(nSeed, QrAln, DbAln, unLocalCopy, stAlnmnt, nTreadID); int nDSeqIdx = vDSet[j] >> 11; CalRes(nQSeqIdx, QrAln.m_pSeq, nQOriLen, QrAln.m_unSeedBeg, nDSeqIdx, DbAln.m_pSeq, DbAln.m_unSeedBeg, Db, unLocalCopy, stAlnmnt, mRes, nTreadID); } if (unQSeed != QrAln.m_unSeedBeg) { QrAln.m_unSeedBeg = unQSeed; } } return (nEd-nSt); } bool CHashSearch::AlignSeqs(int nSeed, CAlnPckg& QrAln, CAlnPckg& DbAln, uint& unSeedLen, STAlnmnt& stAlnmnt, int nTreadID) { uchar* pQAlign = QrAln.m_pSeq + QrAln.m_unSeedBeg; uchar* pDAlign =DbAln.m_pSeq + DbAln.m_unSeedBeg; int ext_f = 0; int ext_b = 0; int match_f = 0; int match_b = 0; int nScore0 = stAlnmnt.nScore; int nQLeft = 0; int nDLeft = 0; nQLeft = QrAln.m_unLen-QrAln.m_unSeedBeg-unSeedLen; nDLeft = DbAln.m_unLen-DbAln.m_unSeedBeg-unSeedLen; //if (nQLeft > 0 && nDLeft > 0) { pQAlign = QrAln.m_pSeq + QrAln.m_unSeedBeg + unSeedLen; pDAlign = DbAln.m_pSeq + DbAln.m_unSeedBeg + unSeedLen; stAlnmnt.nScore += AlignFwd(pQAlign, pDAlign, nQLeft, nDLeft, &ext_f, &match_f, nScore0); stAlnmnt.nMatch += match_f; stAlnmnt.nQFwd += ext_f; stAlnmnt.nDFwd += ext_f; } nQLeft = QrAln.m_unSeedBeg-1; nDLeft = DbAln.m_unSeedBeg-1; //if (nQLeft > 0 && nDLeft > 0) { pQAlign = QrAln.m_pSeq; pDAlign = DbAln.m_pSeq; stAlnmnt.nScore += AlignBwd(pQAlign, pDAlign, nQLeft, nDLeft, &ext_b, &match_b, nScore0); stAlnmnt.nMatch += match_b; stAlnmnt.nQBwd += ext_b; stAlnmnt.nDBwd += ext_b; } int hsplen = unSeedLen + ext_f + ext_b; stAlnmnt.vMode.push_back('s'); stAlnmnt.vLen.push_back(hsplen); if (stAlnmnt.nScore < GapExtSCut) { //cout << "less than cut off" << endl; return true; } if (m_bAcc == false && true == m_bGapExt) { ++m_unGapExt; //cout << "1 hit ..." << endl; // vector for alignment path vector vMode; vector vLen; // forward gapped alignment int exta = 0; int extb = 0; int gap = 0; int nQOff = QrAln.m_unSeedBeg + unSeedLen + stAlnmnt.nQFwd; int nDOff = DbAln.m_unSeedBeg + unSeedLen + stAlnmnt.nDFwd; nQLeft = QrAln.m_unLen-nQOff; nDLeft = DbAln.m_unLen-nDOff; if (nQLeft > 2 && nDLeft > 2) { pQAlign = QrAln.m_pSeq + nQOff; pDAlign = DbAln.m_pSeq + nDOff; int n1 = AlignGapped(pQAlign, pDAlign, nQLeft, nDLeft, &exta, &extb, &match_f, &gap, vMode, vLen, nTreadID); if (n1 > 0) { stAlnmnt.nScore += n1; stAlnmnt.nMatch += match_f; stAlnmnt.nQFwd += exta; stAlnmnt.nDFwd += extb; stAlnmnt.vMode.insert(stAlnmnt.vMode.end(), vMode.rbegin(), vMode.rend()); stAlnmnt.vLen.insert(stAlnmnt.vLen.end(), vLen.rbegin(), vLen.rend()); } } // backward gapped alignment nQOff = QrAln.m_unSeedBeg - stAlnmnt.nQBwd; nDOff = DbAln.m_unSeedBeg - stAlnmnt.nDBwd; if (nQOff > 2 && nDOff > 2) { vector vQ; vQ.reserve(nQOff); for (int i = nQOff-1; i >= 0; --i) { vQ.push_back(QrAln.m_pSeq[i]); } pQAlign = &vQ[0]; vector vD; vD.reserve(nDOff); for (int i = nDOff-1; i >= 0; --i) { vD.push_back(DbAln.m_pSeq[i]); } pDAlign = &vD[0]; int n2 = AlignGapped(pQAlign, pDAlign, nQOff, nDOff, &exta, &extb, &match_b, &gap, vMode, vLen, nTreadID); if (n2 > 0) { stAlnmnt.nScore += n2; stAlnmnt.nMatch += match_b; stAlnmnt.nQBwd += exta; stAlnmnt.nDBwd += extb; stAlnmnt.vMode.insert(stAlnmnt.vMode.begin(), vMode.begin(), vMode.end()); stAlnmnt.vLen.insert(stAlnmnt.vLen.begin(), vLen.begin(), vLen.end()); } } } return true; } int CHashSearch::AlignFwd(uchar *queryseq, uchar *dataseq, uint len_queryseq, uint len_dataseq, int *extl, int *match, int score0) { int i, j, l, s, maxs, ma; i = j = l = 0; ma = 0; maxs = s = score0; *extl = 0; *match = 0; while(i < len_queryseq && j < len_dataseq && s >= MINSCORE && s >= maxs - UngapExtDrop) { // uncompleted s += m_aSubMatrix[queryseq[i]][dataseq[j]]; if(queryseq[i] == dataseq[j]) { ma ++; } l ++; if(s > maxs) { maxs = s; *extl = l; *match = ma; } i ++; j ++; } return maxs - score0; } int CHashSearch::AlignBwd(uchar *queryseq, uchar *dataseq, int pos1, int pos2, int *extl, int *match, int score0) { int i, j, l, s, maxs, ma; i = pos1; j = pos2; l = 0; ma = 0; maxs = s = score0; *match = *extl = 0; while(i >= 0 && j >= 0 && s >= MINSCORE && s >= maxs - UngapExtDrop) { // Skip stop codons // uncompleted s += m_aSubMatrix[queryseq[i]][dataseq[j]]; if(queryseq[i] == dataseq[j]) { ma ++; } l ++; if(s > maxs) { maxs = s; *extl = l; *match = ma; } i --; j --; } return maxs - score0; } int CHashSearch::AlignGapped(uchar *seq1, uchar *seq2, int M, int N, int *ext1, int *ext2, int *match_len, int *gap, vector& vMode, vector& vLen, int nTreadID) { int i, j; int t, s, e, c, d, wa; int *CC = new int[N + 1]; //note N + 1 int *DD = new int[N + 1]; int g = GapIni; int h = GapExt; int m = g + h; //gap-create + gap-extend int maxs, E1, E2, match; char trace_e, trace_d; maxs = E1 = E2 = match = 0; //forward-phase CC[0] = 0; DD[0] = -g; t = -g; int bb = 1; //band_begin int be = int((GapExtDrop - GapIni) / GapExt); int bb_pre, be_pre; //these two parameters will be adjusted during the alignment based on the dropoff score vector >& trace = m_vTrace[nTreadID]; vector >& etrace = m_vETrace[nTreadID]; vector >& dtrace = m_vDTrace[nTreadID]; // the aligning sequences may be longer than 4096 bool bModify = false; int nMemory = trace.size(); if (trace.size()-1 < M) { int nSz = M+1; bModify = true; trace.clear(); etrace.clear(); dtrace.clear(); trace.assign(nSz, vector(nSz)); etrace.assign(nSz, vector(nSz)); dtrace.assign(nSz, vector(nSz)); } trace[0][0] = '0'; for(j = 1; j <= N && j <= be; j ++) { CC[j] = t = t - h; //j - 1 ? or j; when j is used, check score is not the same as alignment score DD[j] = CC[j] - g; if(j == 1) { trace[0][j] = etrace[0][j] = 'E'; } else { trace[0][j] = etrace[0][j] = 'e'; } dtrace[0][j] = 'D'; } //global-alignment, with terminal penalty MaxGap = 100; for(i = 1; i <= M; i ++) { bb_pre = bb; be_pre = be; if(be <= bb) break; //band shrinks to zero s = CC[bb - 1]; if(i == 1) { trace[i][bb - 1] = dtrace[i][bb - 1] = 'D'; etrace[i][bb - 1] = 'E'; } else { trace[i][bb - 1] = dtrace[i][bb - 1] = 'd'; etrace[i][bb - 1] = 'e'; } if(DD[bb - 1] - h > CC[bb - 1] - m) { c = DD[bb - 1] - h; } else { c = CC[bb - 1] - m; } CC[bb - 1] = DD[bb - 1] = c; //update it with current row e = c - g; for(j = bb; j <= be && j <= N; j ++) { trace_e = 'e'; //insertion extension if ((c = c - m) >= (e = e - h)) { e = c; trace_e = 'E'; //new insertion }//insertion trace_d = 'd'; //deletion extension if ((c = CC[j] - m) >= (d = DD[j] - h)) { d = c; trace_d = 'D'; //new deletion }//deletion //here CC[j]==CC[i-1][j] DD[j]==DD[i-1][j] wa = m_aSubMatrix[seq1[i - 1]][seq2[j - 1]]; //sij[i - 1][j - 1]; //note i - 1, j - 1 c = s + wa; //s==CC[i-1][j-1], substitution trace[i][j] = 's'; //substitution if (e > c) { c = e; trace[i][j] = trace_e; } if (d > c) { c = d; trace[i][j] = trace_d; } etrace[i][j] = trace_e; dtrace[i][j] = trace_d; s = CC[j]; //important for next replace CC[j] = c; //CC[i][j] DD[j] = d; //DD[i][j] if(c > maxs) { E1 = i; E2 = j; maxs = c; } //local -C else if(c < maxs - GapExtDrop && j > E2) //score drops too much, stop filling this row, note j > E2 { be = j; break; } } //after band_e, only allows insertion if(be < be_pre) continue; for(j = be + 1; j <= N; j ++) { trace_e = 'e'; //insertion extension if ((c = c - m) > (e = e - h)) { e = c; trace_e = 'E'; //new insertion }//insertion c = e; trace[i][j] = trace_e; etrace[i][j] = trace_e; s = CC[j]; //important for next replace CC[j] = c; //CC[i][j] DD[j] = c - g; if(c > maxs) { E1 = i; E2 = j; maxs = c; } //local -C else if(c < maxs - GapExtDrop) //score drops too much, stop filling this row { be = j; break; } } //now infer new bb (starting from E2 going backward) for(j = E2; j >= bb; j --) { if(CC[j] < maxs - GapExtDrop) { bb = j; break; } } } *ext1 = E1; *ext2 = E2; delete[] CC; delete[] DD; //get alignment *match_len = 0; *gap = 0; if(maxs <= 0) return maxs; if(trace[E1][E2] != 's') { printf("E1 %d E2 %d, Not end with substitution %c\n", E1, E2, trace[E1][E2]); exit(1); } char mod = trace[E1][E2]; i = E1; j = E2; vMode.clear(); vLen.clear(); while(mod != '0' && (!(i == 0 && j == 0))) { if (vMode.empty() || toupper(mod) != toupper(vMode.back())) { vMode.push_back(mod); vLen.push_back(0); } ++vLen.back(); if(mod == 's') { if(seq1[i - 1] == seq2[j - 1]) *match_len += 1; i -= 1; j -= 1; mod = trace[i][j]; } else if(mod == 'D' || mod == 'd') { i -= 1; if (mod == 'D') mod = trace[i][j]; else mod = dtrace[i][j]; *gap += 1; } else { j -= 1; if (mod == 'E') mod = trace[i][j]; else mod = etrace[i][j]; *gap += 1; } if (i<0 || j<0) { cout << "This is a bug!" << endl; for (int m = 0; m < M; ++m) { cout << m_aCode2Char[seq1[m]]; } cout << endl; for (int n = 0; n < N; ++n) { cout << m_aCode2Char[seq2[n]]; } cout << endl; break; } } // reset the size of the buffer if (bModify == true) { trace.clear(); etrace.clear(); dtrace.clear(); trace.assign(nMemory, vector(nMemory)); etrace.assign(nMemory, vector(nMemory)); dtrace.assign(nMemory, vector(nMemory)); } return maxs; } void CHashSearch::CalRes(int nQIdx, uchar* pQ, int nQOriLen, uint unQSeedBeg, int nDIdx, uchar* pD, uint unDSeedBeg, CDbPckg& Db, uint unLocalSeedLen, STAlnmnt& stAlnmnt, MRESULT& mRes, int nTreadID) { double dEValue = 0.0; dEValue = m_vpBlastSig[nTreadID]->rawScore2ExpectLog(stAlnmnt.nScore); double dBits = m_vpBlastSig[nTreadID]->rawScore2Bit(stAlnmnt.nScore); int nTotGap = 0; int nGapOpen = 0; int nTotAlnLen = 0; for (uint i = 0; i < stAlnmnt.vMode.size(); ++i) { nTotAlnLen += stAlnmnt.vLen[i]; if ('s' != stAlnmnt.vMode[i]) { ++nGapOpen; nTotGap += stAlnmnt.vLen[i]; } } // evalue criteria if (m_bHssp == false && !(stAlnmnt.nScore>SUMHSP_MINRAWSCORE || (m_bEvalue==true && dEValue<=m_dThr) || (m_bEvalue==false && dBits>=m_dThr))) { return; } // hssp criteria else if (m_bHssp == true && (nTotAlnLen < m_nMinLen || stAlnmnt.nMatch < m_vCriteria[nTotAlnLen])) { return; } // compute frame //cout << nQIdx << endl; int nQSt = 0; int nQEd = 0; if (m_bSeqType == true) { if (nQIdx % m_nIdxScl < 3) { nQSt = 3 * (unQSeedBeg-stAlnmnt.nQBwd) + nQIdx%m_nIdxScl + 1; nQEd = 3 * (unQSeedBeg+unLocalSeedLen+stAlnmnt.nQFwd) + nQIdx%m_nIdxScl; } else { int nFrame = nQIdx % m_nIdxScl - 3; nQSt = nQOriLen - (unQSeedBeg-stAlnmnt.nQBwd)*3 - nFrame; nQEd = nQSt - (stAlnmnt.nQBwd+unLocalSeedLen+stAlnmnt.nQFwd)*3 + 1; } } else { nQSt = unQSeedBeg-stAlnmnt.nQBwd + 1; nQEd = unQSeedBeg+unLocalSeedLen+stAlnmnt.nQFwd; } // print aligned sequences uint nAllc = nTotAlnLen>unLocalSeedLen?nTotAlnLen:unLocalSeedLen; VUCHAR vQ; vQ.reserve(nAllc); VUCHAR vD; vD.reserve(nAllc); uchar* pQAligned = pQ + unQSeedBeg - stAlnmnt.nQBwd; uchar* pDAligned = pD + unDSeedBeg - stAlnmnt.nDBwd; if (0 == stAlnmnt.vMode.size()) { // only one hits vQ.insert(vQ.end(), pQAligned, pQAligned+unLocalSeedLen); vD.insert(vD.end(), pDAligned, pDAligned+unLocalSeedLen); } else if (1 == stAlnmnt.vMode.size()) { // only one hits vQ.insert(vQ.end(), pQAligned, pQAligned+stAlnmnt.vLen[0]); vD.insert(vD.end(), pDAligned, pDAligned+stAlnmnt.vLen[0]); } else { for (uint i = 0; i < stAlnmnt.vMode.size(); ++i) { char cMode = stAlnmnt.vMode[i]; if ('s' == cMode) { vQ.insert(vQ.end(), pQAligned, pQAligned+stAlnmnt.vLen[i]); pQAligned += stAlnmnt.vLen[i]; vD.insert(vD.end(), pDAligned, pDAligned+stAlnmnt.vLen[i]); pDAligned += stAlnmnt.vLen[i]; } else if ('D' == cMode || 'd' == cMode) { vQ.insert(vQ.end(), pQAligned, pQAligned+stAlnmnt.vLen[i]); pQAligned += stAlnmnt.vLen[i]; vD.insert(vD.end(), stAlnmnt.vLen[i], '-'); } else if ('E' == cMode || 'e' == cMode) { vQ.insert(vQ.end(), stAlnmnt.vLen[i], '-'); vD.insert(vD.end(), pDAligned, pDAligned+stAlnmnt.vLen[i]); pDAligned += stAlnmnt.vLen[i]; } } } string sQ; string sD; Decode(vQ, sQ); Decode(vD, sD); string sInfo; for (uint i = 0; i < vQ.size(); ++i) { if (vQ[i] == vD[i]) { sInfo += sQ[i]; } else if (m_aSubMatrix[vQ[i]][vD[i]] > 0) { sInfo += '+'; } else { sInfo += ' '; } } MRESULT::iterator it = mRes.lower_bound(pair(nQIdx/m_nIdxScl, nDIdx)); /****************************************************************/ // for sum evalue, comment this // note: here, the hits are stored according to it's real query index, not 1->6 frame query index // store all results if (mRes.end() != it && (*it).first.first==nQIdx/m_nIdxScl && (*it).first.second==nDIdx && (*it).second.nFrame==nQIdx%m_nIdxScl && (*it).second.nQSt==unQSeedBeg-stAlnmnt.nQBwd && (*it).second.nDSt==unDSeedBeg-stAlnmnt.nDBwd && (*it).second.nQEd==unQSeedBeg+unLocalSeedLen+stAlnmnt.nQFwd-1 && (*it).second.nDEd==unDSeedBeg+unLocalSeedLen+stAlnmnt.nDFwd-1) { CHitUnit& st = (*it).second; if (st.dEValue > dEValue) { st.nScore = stAlnmnt.nScore; st.dBits = dBits; st.dEValue = dEValue; st.dIdent = stAlnmnt.nMatch*100.0/nTotAlnLen; st.nAlnLen = nTotAlnLen; st.nMismatch = nTotAlnLen-stAlnmnt.nMatch-nTotGap; st.nGapOpen = nGapOpen; st.nQBeg = nQSt; st.nQEnd = nQEd; st.sQ = sQ; st.sInfo = sInfo; st.sD = sD; } } else /****************************************************************/ { MRESULT::iterator itTmp = mRes.insert(it, MRESULT::value_type(pair(nQIdx/m_nIdxScl, nDIdx), CHitUnit())); CHitUnit& st = (*itTmp).second; st.nQrLen = nQOriLen; st.nDbIdx = nDIdx; st.nDbLen = Db.m_vLens[nDIdx+1] - Db.m_vLens[nDIdx]; st.nScore = stAlnmnt.nScore; st.dBits = dBits; st.dEValue = dEValue; st.dIdent = stAlnmnt.nMatch*100.0/nTotAlnLen; st.nAlnLen = nTotAlnLen; st.nMismatch = nTotAlnLen-stAlnmnt.nMatch-nTotGap; st.nGapOpen = nGapOpen; st.nFrame = nQIdx%m_nIdxScl; st.nQSt = unQSeedBeg-stAlnmnt.nQBwd; st.nQEd = unQSeedBeg+unLocalSeedLen+stAlnmnt.nQFwd-1; st.nQBeg = nQSt; st.nQEnd = nQEd; st.nDSt = unDSeedBeg - stAlnmnt.nDBwd; st.nDEd = unDSeedBeg + unLocalSeedLen + stAlnmnt.nDFwd - 1; st.sQ = sQ; st.sInfo = sInfo; st.sD = sD; //mRes.insert(it, MRESULT::value_type(pair(nQIdx/m_nIdxScl, nDIdx), st)); } } struct SetCompObj { bool operator() (const uint& p1, const uint& p2) const { return ((p1>>11) < (p2>>11)) && ((p1&0x7ff)); } }mySetComp; void CHashSearch::PrintRes(MRESULT& mRes, int nTreadID, CQrPckg& Query, CDbPckg& Db) { if (mRes.empty()) { return; } MIT it = mRes.begin(); int nQrIdx = (*it).first.first; MRESULT::iterator itFind = mRes.end(); vector vTemp; vTemp.reserve(distance(it, itFind)); // for sum evalue, comment this int nDIdx = it->first.second; vTemp.push_back(it->second); int nSt = 0; MRESULT::iterator itTemp = it; ++itTemp; for (; itTemp != itFind; ++itTemp) { if (itTemp->first.second != nDIdx) { if( vTemp.size() - nSt > 1) { int nLen = Db.m_vLens[nDIdx+1]-Db.m_vLens[nDIdx]; SumEvalue(vTemp, nSt, vTemp.size(), nLen, nTreadID); } nDIdx = itTemp->first.second; nSt = vTemp.size(); } vTemp.push_back(itTemp->second); } // process the last one if( vTemp.size() - nSt > 1) { int nLen = Db.m_vLens[nDIdx+1]-Db.m_vLens[nDIdx]; SumEvalue(vTemp, nSt, vTemp.size(), nLen, nTreadID); } if (0 == vTemp.size()) { return; } uint nMax = max(m_nMaxOut, m_nMaxM8); nMax = min((uint)vTemp.size(), nMax); vector::iterator itPrint = vTemp.begin()+nMax; partial_sort(vTemp.begin(), itPrint, vTemp.end(), ComptorWrapper(m_pComptor)); int nBegStrAligned = 10; vector::iterator itSt = vTemp.begin(); for (; itSt != itPrint; ++itSt) { CHitUnit& st= *itSt; if ((m_bEvalue==true && st.dEValue>m_dThr) || (m_bEvalue==false && st.dBits6 frame query index st.sInfo.insert(0, nBegStrAligned+1, ' '); string sQNum = lexical_cast(st.nQBeg); st.sQ = string(nBegStrAligned-sQNum.size(), ' ') + sQNum + " " + st.sQ + " " + lexical_cast(st.nQEnd); ++st.nDSt; ++st.nDEd; int nFac = 0; while (0 <= (st.nDbIdx-nFac-1) && Db.m_vNames[st.nDbIdx] == Db.m_vNames[st.nDbIdx-nFac-1]) { ++nFac; } st.nDSt = 1848*nFac+st.nDSt; st.nDEd = 1848*nFac+st.nDEd; string sDNum = lexical_cast(st.nDSt); st.sD = string(nBegStrAligned-sDNum.size(), ' ') + sDNum + " " + st.sD + " " + lexical_cast(st.nDEd); st.sQName = Query.m_vNames[nQrIdx]; st.sDName = Db.m_vNames[st.nDbIdx]; } vTemp.resize(itSt-vTemp.begin()); if (vTemp.size() != 0) { // remove possible redundancy in overlapped region // there should be only one redundant hit for each overlapped region for (uint i = 1; i < vTemp.size(); ++i) { if (vTemp[i].nScore==vTemp[i-1].nScore && vTemp[i].sDName == vTemp[i-1].sDName && vTemp[i].sQName == vTemp[i-1].sQName && vTemp[i].nDSt == vTemp[i-1].nDSt && vTemp[i].nDEd == vTemp[i-1].nDEd && vTemp[i].nQBeg == vTemp[i-1].nQBeg && vTemp[i].nQEnd == vTemp[i-1].nQEnd) { vTemp[i].dEValue = 1000000; } } uint nf = 0; uint nl = vTemp.size(); while (nf < nl) { if (vTemp[nf].dEValue == 1000000) { --nl; swap(vTemp[nf], vTemp[nl]); } ++nf; } vTemp.resize(nl); stringstream sOutput; archive::binary_oarchive oa(sOutput); oa << vTemp; muMonitor.lock(); long long llBeg = m_llOutCum + m_sOutput.size(); m_sOutput += sOutput.str(); int nSize = m_llOutCum + m_sOutput.size() - llBeg; m_vOutIdx[m_nSeqBase+nQrIdx].m_llBeg = llBeg; m_vOutIdx[m_nSeqBase+nQrIdx].m_nSize = nSize; if (m_sOutput.size() > 100000000) { m_ofTemp << m_sOutput; m_llOutCum += m_sOutput.size(); m_sOutput.clear(); } muMonitor.unlock(); } mRes.clear(); } void CHashSearch::SumEvalue(vector& v, int nSt, int nEd, int nLen, int nTreadID) { typedef vector::iterator STIT; STIT itSt = v.begin() + nSt; STIT itEd = v.begin() + nEd; // sort by nFrame sort(itSt, itEd, CompFrame()); CHitUnit st; st.nFrame = 3; STIT itDir = lower_bound(itSt, itEd, st, CompFrame()); // if there are more than one hit in one direction int nDisPos = distance(itSt, itDir); int nDisNeg = distance(itDir, itEd); if (nDisPos > 1 || nDisNeg > 1) { vector vRes; STIT itStart = itSt; STIT itEnd = itDir; for (int i = 0; i < 2; ++i) { if (distance(itStart, itEnd) == 0) { itStart = itEnd; itEnd = itEd; continue; } else if (distance(itStart, itEnd) == 1) { if ((m_bEvalue==true && itStart->dEValue<=m_dThr) || (m_bEvalue==false && itStart->dBits>=m_dThr)) { vRes.push_back(*itStart); } itStart = itEnd; itEnd = itEd; continue; } // sort by score and start position of query sort(itStart, itEnd, CompQSt()); stable_sort(itStart, itEnd, ComptorWrapper(m_pComptor)); // check overlap and logevalue vector vNew; vNew.push_back(*itStart); for (STIT itTemp = itStart+1; itTemp!=itEnd; ++itTemp) { int nHalfLen = (itTemp->nQEd - itTemp->nQSt + 1) >> 1; int nOverlap = SUMHSP_OVERLAP < nHalfLen ? SUMHSP_OVERLAP : nHalfLen; if (itTemp->dEValue >= SUMHSP_MINEVALUE && itTemp->nScore <= SUMHSP_MINRAWSCORE) { continue; } bool bNonOvlp = true; for (STIT itIso = vNew.begin(); itIso != vNew.end(); ++itIso) { if ((itTemp->nQSt <= itIso->nQEd - nOverlap && itTemp->nQEd >= itIso->nQSt + nOverlap) || (itIso->nQSt <= itTemp->nQEd - nOverlap && itIso->nQEd >= itTemp->nQSt + nOverlap)) { bNonOvlp = false; break; } } if (true == bNonOvlp) { vNew.push_back(*itTemp); } } if (vNew.size() == 1) { if ((m_bEvalue==true && vNew[0].dEValue<=m_dThr) || (m_bEvalue==false && vNew[0].dBits>=m_dThr)) { vRes.push_back(vNew[0]); } //continue; } else { // calculate the sum of evalue double aRawScore[DEFAULT_SCORE_TOP]; int nNo = 0; for (; nNo < 5 && nNo < vNew.size(); ++nNo) { aRawScore[nNo] = vNew[nNo].nScore; } if (m_bEvalue == true) { double dTmp = m_vpBlastSig[nTreadID]->sumScore2Expect(nNo, aRawScore, nLen); double dSumEvalue = -10000.00; if (0 != dTmp) { dSumEvalue = log(dTmp) / LOG10; } // modify the logevalue if (dSumEvalue < m_dThr) { for (uint i = 0; i < vNew.size(); ++i) { vNew[i].dEValue = dSumEvalue; } vRes.insert(vRes.end(), vNew.begin(), vNew.end()); } } else { double dTmpScore = m_vpBlastSig[nTreadID]->sumScore(nNo, aRawScore, nLen); double dTmpBits = m_vpBlastSig[nTreadID]->rawScore2Bit(dTmpScore); // modify the logevalue if (dTmpBits >= m_dThr) { for (uint i = 0; i < vNew.size(); ++i) { vNew[i].dBits = dTmpBits; } vRes.insert(vRes.end(), vNew.begin(), vNew.end()); } } } itStart = itEnd; itEnd = itEd; } // replace if (!vRes.empty()) { v.erase(itSt, itEd); v.insert(v.begin()+nSt, vRes.begin(), vRes.end()); } } } void CHashSearch::GuessTotSeq(const char* szDBFile, long int& lnSeqNum, long int& lnAaNum) { lnSeqNum = 0; lnAaNum = 0; ifstream fIn(szDBFile); string s; while (fIn.good()) { getline(fIn, s); if (s[0] == '>') { lnSeqNum += 1; } else { lnAaNum += s.size(); } } fIn.close(); } void CHashSearch::MergeRes(int nDbBlockNum, VNAMES& vQNames, string& sDbPre) { ostream* poAln = NULL; ostream* poM8 = NULL; if (m_nStdout == 2) { poAln = &cout; } else if (!m_sOutBase.empty() && m_nMaxOut != 0) { poAln = new ofstream((m_sOutBase+".aln").c_str(),ios_base::out|ios_base::app); if (!poAln->good()) { ((ofstream*)poAln)->close(); delete poAln; cout << "can not open the file: " << m_sOutBase+".aln" << endl; exit(1); } } if (m_nStdout == 1) { poM8 = &cout; } else if (!m_sOutBase.empty() && m_nMaxM8 != 0) { poM8 = new ofstream((m_sOutBase+".m8").c_str(),ios_base::out|ios_base::app); if (!poM8->good()) { ((ofstream*)poM8)->close(); delete poM8; cout << "can not open the file: " << m_sOutBase+".m8" << endl; exit(1); } } if (poM8 && !m_sStartTime.empty()) { (*poM8) << "# RAPSearch\n# Job submitted: " << m_sStartTime << "# Query : " << m_sQFile << "\n" << "# Subject : " << m_sDFile << "\n"; if (m_bLogE == true) { (*poM8) << "# Fields: Query\tSubject\tidentity\taln-len\tmismatch\tgap-openings\tq.start\tq.end\ts.start\ts.end\tlog(e-value)\tbit-score\n"; } else { (*poM8) << "# Fields: Query\tSubject\tidentity\taln-len\tmismatch\tgap-openings\tq.start\tq.end\ts.start\ts.end\te-value\tbit-score\n"; } m_sStartTime = ""; } if (m_bXml) { m_ofXml.open((m_sOutBase+".xml").c_str(),ios_base::out|ios_base::app); PrintXmlBegin(sDbPre); } long long unMax = max(m_nMaxOut, m_nMaxM8); vector v; vector vMergeUnit; for (int i = 0; i < nDbBlockNum; ++i) { string sName = m_sOutBase+".tmp"+lexical_cast(i); CMergeUnit* p = new CMergeUnit(sName.c_str()); vMergeUnit.push_back(p); } int nLastIdx = 0; for (int i = 0; i < nDbBlockNum; ++i) { nLastIdx = max(nLastIdx, vMergeUnit[i]->GetLast()); } /**************************************************************/ for (int i = 0; i < nLastIdx; ++i) { for (int j = 0; j < nDbBlockNum; ++j) { vMergeUnit[j]->Update(i, v); } /***********************************************************/ if (0 == v.size()) { if (poAln && true == m_bPrintEmpty) { (*poAln) << vQNames[i] << "\tNO HIT" << "\n\n"; } continue; } /***********************************************************/ uint n = min(unMax, (long long)v.size()); partial_sort(v.begin(), v.begin()+n, v.end(), ComptorWrapper(m_pComptor)); v.resize(n); if (poAln) { PrintAln(v, *poAln); } if (poM8) { PrintM8(v, *poM8); } if (m_bXml) { PrintXml(v, i+1); } v.clear(); } for (int i = 0; i < nDbBlockNum; ++i) { delete vMergeUnit[i]; } if (poAln && m_nStdout != 2) { ((ofstream*)poAln)->close(); delete poAln; poAln = NULL; } if (poM8 && m_nStdout != 1) { ((ofstream*)poM8)->close(); delete poM8; poM8 = NULL; } if (m_bXml) { PrintXmlEnd(); m_ofXml.close(); } } //---------------------------------------------------------------------- int CHashSearch::GuessQueryType(POOL& vPool) { // read some reads vector seq; ITER itStop = vPool.end(); ITER itSt = find(vPool.begin(), itStop, '>'); ITER itEd = find(itSt+1, itStop, '>'); while (itEd!=itStop && seq.size()<10000) { ITER itBeg = find(itSt, itEd, '\n'); while (itBeg == itEd) { itEd = find(itEd+1, itStop, '>'); itBeg = find(itBeg, itEd, '\n'); } ++itBeg; int nIter = seq.size(); seq.insert(seq.end(), itBeg, itEd-1); seq.erase(remove(seq.begin()+nIter, seq.end(), '\n'), seq.end()); itSt = itEd; itEd = find(itSt+1, itStop, '>'); } char nuc[] = "ATCGatcgUu"; int i, j; int add = 0; for(i = 0; i < seq.size(); i ++) { for(j = 0; j < 10; j ++) { if(seq[i] == nuc[j]) break; } if(j < 10) add += 1; } if(add > seq.size() * 0.95) { return 1; } else { return 2; } } void CHashSearch::PrintAln(vector& v, ostream& of) { int nPrint = min((long long)v.size(), m_nMaxOut); for (int i = 0; i < nPrint; ++i) { CHitUnit& c = v[i]; of << c.sQName << " vs " << c.sDName << " bits=" << c.dBits; if (m_bLogE == true) { of << " log(E-value)=" << c.dEValue; } else { of << " E-value=" << pow(10,c.dEValue); } of << " identity=" << c.dIdent << "%" << " aln-len=" << c.nAlnLen << " mismatch=" << c.nMismatch << " gap-openings=" << c.nGapOpen << " nFrame=" << c.nFrame << "\n" << "Query:\t" << c.sQ << "\n" << " \t" << c.sInfo << "\n" << "Sbjct:\t" << c.sD << "\n" << "\n"; } } void CHashSearch::PrintM8(vector& v, ostream& of) { int nPrint = min((long long)v.size(), m_nMaxM8); for (int i = 0; i < nPrint; ++i) { CHitUnit& c = v[i]; of << c.sQName << "\t" << c.sDName << setprecision(1) << setiosflags(ios::fixed) << "\t" << c.dIdent << "\t" << c.nAlnLen << "\t" << c.nMismatch << "\t" << c.nGapOpen << "\t" << c.nQBeg << "\t" << c.nQEnd << "\t" << c.nDSt << "\t" << c.nDEd; if (m_bLogE == true) { of << setprecision(1) << setiosflags(ios::fixed) << "\t" << c.dEValue; } else { c.dEValue = pow(10, c.dEValue); if (c.dEValue < 0.01) { of << setprecision(2) << setiosflags(ios::scientific) << setiosflags(ios::fixed) << "\t" << c.dEValue; of << resetiosflags(ios::scientific); } else if (c.dEValue < 10.0) { of << setprecision(2) << setiosflags(ios::fixed) << "\t" << c.dEValue; } else { of << setprecision(0) << setiosflags(ios::fixed) << "\t" << c.dEValue; } } of << setprecision(1) << setiosflags(ios::fixed) << "\t" << c.dBits << "\n"; } } template void CHashSearch::PrintXmlLine(char* sTag, T s) { m_ofXml << string(m_unXmlSp, ' ') << "<" << sTag << ">" << s << "" << "\n"; } void CHashSearch::PrintXmlTag(char* sTag) { m_ofXml << string(m_unXmlSp, ' ') << "<" << sTag << ">" << "\n"; m_unXmlSp += 2; } void CHashSearch::PrintXmlTagR(char* sTag) { m_unXmlSp -= 2; m_ofXml << string(m_unXmlSp, ' ') << "" << "\n"; } void CHashSearch::PrintXmlBegin(string& sDbPre) { m_ofXml << "" << "\n"; PrintXmlTag("Output"); PrintXmlLine("Output_program", "RAPSearch"); PrintXmlLine("Output_version", "RAPSearch2"); PrintXmlLine("Output_reference", "Yongan Zhao, Haixu Tang and Yuzhen Ye. RAPSearch2: a fast and memory-efficient protein similarity search tool for next generation sequencing data. Bioinformatics 2012, 28 (1): 125-126"); PrintXmlLine("Output_db", sDbPre); PrintXmlTag("Output_param"); PrintXmlTag("Parameters"); PrintXmlLine("Parameters_matrix", "BLOSUM62"); if (m_bEvalue == true) { if (m_bLogE == true) { PrintXmlLine("Parameters_log-expect_evalue", lexical_cast(m_dThr)); } else { PrintXmlLine("Parameters_expect_evalue", lexical_cast(pow(10,m_dThr))); } } else { PrintXmlLine("Parameters_bits-expect", lexical_cast(m_dThr)); } PrintXmlLine("Parameters_gap-open", "11"); PrintXmlLine("Parameters_gap-extend", "1"); PrintXmlLine("Parameters_filter", "T"); PrintXmlTagR("Parameters"); PrintXmlTagR("Output_param"); PrintXmlTag("Output_iterations"); } void CHashSearch::PrintXml(vector& v, int nIdx) { PrintXmlTag("Iteration"); PrintXmlLine("Iteration_iter-num", lexical_cast(m_unXmlCnt++)); //PrintXmlLine("Iteration_query-ID", "lcl|"+lexical_cast(nIdx)); PrintXmlLine("Iteration_query-def", v[0].sQName); PrintXmlLine("Iteration_query-len", v[0].nQrLen); PrintXmlTag("Iteration_hits"); int nPrint = min((long long)v.size(), m_nMaxOut); for (int i = 0; i < nPrint; ++i) { CHitUnit& c = v[i]; PrintXmlTag("Hit"); PrintXmlLine("Hit_num", lexical_cast(i+1)); //PrintXmlLine("Hit_id", "gnl|"+lexical_cast(c.nDbIdx)); PrintXmlLine("Hit_def", c.sDName); //PrintXmlLine("Hit_accession", c.nDbIdx); PrintXmlLine("Hit_len", c.nDbLen); PrintXmlTag("Hit_hsps"); PrintXmlTag("Hsp"); PrintXmlLine("Hsp_num", 1); PrintXmlLine("Hsp_bit-score", c.dBits); PrintXmlLine("Hsp_score", c.nScore); if (m_bLogE == true) { PrintXmlLine("Hsp_log-evalue", c.dEValue); } else { PrintXmlLine("Hsp_evalue", pow(10,c.dEValue)); } PrintXmlLine("Hsp_query-from", c.nQBeg); PrintXmlLine("Hsp_query-to", c.nQEnd); PrintXmlLine("Hsp_hit-from", c.nDSt); PrintXmlLine("Hsp_hit-to", c.nDEd); PrintXmlLine("Hsp_query-frame", c.nFrame); int nPos = 0; int nIdt = 0; for (uint j = 0; j < c.sInfo.size(); ++j) { if (c.sInfo[j] != ' ') { nPos += 1; if (c.sInfo[j] != '+') { nIdt += 1; } } } PrintXmlLine("Hsp_identity", nIdt); PrintXmlLine("Hsp_positive", nPos); PrintXmlLine("Hsp_align-len", c.nAlnLen); size_t n1 = c.sQ.find_first_not_of(" 0123456789"); size_t n2 = c.sQ.find_last_not_of(" 0123456789"); PrintXmlLine("Hsp_qseq", c.sQ.substr(n1, n2-n1+1)); n1 = c.sD.find_first_not_of(" 0123456789"); n2 = c.sD.find_last_not_of(" 0123456789"); PrintXmlLine("Hsp_hseq", c.sD.substr(n1, n2-n1+1)); n1 = c.sInfo.find_first_not_of(" "); n2 = c.sInfo.find_last_not_of(" "); PrintXmlLine("Hsp_midline", c.sInfo.substr(n1, n2-n1+1)); PrintXmlTagR("Hsp"); PrintXmlTagR("Hit_hsps"); PrintXmlTagR("Hit"); } PrintXmlTagR("Iteration_hits"); PrintXmlTag("Iteration_stat"); PrintXmlTag("Statistics"); PrintXmlLine("Statistics_db-num", m_lnSeqNum); PrintXmlLine("Statistics_db-len", m_lnTotalAa); PrintXmlLine("Statistics_hsp-len", 0); PrintXmlLine("Statistics_eff-space", 0); PrintXmlLine("Statistics_kappa", 0.041); PrintXmlLine("Statistics_lambda", 0.267); PrintXmlLine("Statistics_entropy", 0.14); PrintXmlTagR("Statistics"); PrintXmlTagR("Iteration_stat"); PrintXmlTagR("Iteration"); } void CHashSearch::PrintXmlEnd() { PrintXmlTagR("Output_iterations"); PrintXmlTagR("Output"); }