[Seqan-dev] sorting sam file by SortReadID()
Hi,
I am trying to sort a sam/bam file by read-id and am using the following
construct:
sortAlignedReads(reader, SortReadId());
Unfortunately the compiler is complaining:
In file included from
/pasteur/solexa2/solexa_travail/PF2/programs/seqan/workspace/seqan/core/include/seqan/store.h:63:
/pasteur/solexa2/solexa_travail/PF2/programs/seqan/workspace/seqan/core/include/seqan/store/store_align.h:369:13:
error: member reference base type 'const char' is not a
structure or union
return a1.readId < a2.readId;
~~ ^
/usr/lib/gcc/x86_64-redhat-linux6E/4.4.4/../../../../include/c++/4.4.4/bits/stl_algo.h:2128:8:
note: in instantiation of member function 'seqan::_LessAlignedRead<char,
const seqan::Tag<seqan::SortReadId_> >::operator()' requested here
if (__comp(__val, *__first))
^
/usr/lib/gcc/x86_64-redhat-linux6E/4.4.4/../../../../include/c++/4.4.4/bits/stl_algo.h:3433:4:
note: in instantiation of function template specialization
'std::__insertion_sort<char *, seqan::_LessAlignedRead<char,
const seqan::Tag<seqan::SortReadId_> > >' requested here
std::__insertion_sort(__first, __last, __comp);
^
/usr/lib/gcc/x86_64-redhat-linux6E/4.4.4/../../../../include/c++/4.4.4/bits/stl_algo.h:5468:2:
note: in instantiation of function template specialization
'std::__inplace_stable_sort<char *, seqan::_LessAlignedRead<char,
const seqan::Tag<seqan::SortReadId_> > >' requested here
std::__inplace_stable_sort(__first, __last, __comp);
^
/pasteur/solexa2/solexa_travail/PF2/programs/seqan/workspace/seqan/core/include/seqan/store/store_align.h:433:2:
note: in instantiation of function template
specialization 'std::stable_sort<char *,
seqan::_LessAlignedRead<char, const seqan::Tag<seqan::SortReadId_> > >'
requested here
std::stable_sort(
^
/pasteur/solexa2/solexa_travail/PF2/programs/seqan/workspace/seqan/sandbox/jagla/apps/sambamstat/sambamstat.h:286:5:
note: in instantiation of function template
specialization 'seqan::sortAlignedReads<seqan::Stream<Bgzf>,
seqan::SortReadId_>' requested here
sortAlignedReads(reader, SortReadId());
^
/pasteur/solexa2/solexa_travail/PF2/programs/seqan/workspace/seqan/sandbox/jagla/apps/sambamstat/sambamstat.h:340:16:
note: in instantiation of function template
specialization 'doWork<seqan::Stream<Bgzf>,
seqan::String<seqan::SimpleType<unsigned char, seqan::Dna5_>,
seqan::Alloc<void> >,
seqan::Owner<seqan::Tag<seqan::Default_> >,
seqan::Tag<seqan::Bam_> >' requested here
return doWork(bamStream, seqs, options, Bam());
^
4 warnings and 15 errors generated.
make[2]: ***
[sandbox/jagla/apps/sambamstat/CMakeFiles/sambamstat.dir/sambamstat.cpp.o]
Error 1
make[1]: ***
[sandbox/jagla/apps/sambamstat/CMakeFiles/sambamstat.dir/all] Error 2
make: *** [all] Error 2
I also attach the the cpp and .h files..
Thanks for having a look at this ...
Bernd
// ==========================================================================
// sambamstat
// ==========================================================================
// Copyright (c) 2006-2012, Knut Reinert, FU Berlin
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of Knut Reinert or the FU Berlin nor the names of
// its contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
// DAMAGE.
//
// ==========================================================================
// Author: Your Name <your.email@example.net>
// ==========================================================================
#ifndef SANDBOX_JAGLA_APPS_SAMBAMSTAT_SAMBAMSTAT_H_
#define SANDBOX_JAGLA_APPS_SAMBAMSTAT_SAMBAMSTAT_H_
#include <seqan/basic.h>
#include <seqan/sequence.h>
#include <seqan/bam_io.h>
#include <seqan/misc/misc_cmdparser.h>
#include <seqan/file.h> // For printing SeqAn Strings.
#include <seqan/misc/misc_cmdparser.h>
using namespace seqan;
// ============================================================================
// Forwards
// ============================================================================
// ============================================================================
// Tags, Classes, Enums
// ============================================================================
enum Format
{
FORMAT_AUTO,
FORMAT_SAM,
FORMAT_BAM
};
enum SortOrder
{
SORTORDER_NA,
SORTORDER_ID,
SORTORDER_POS
};
struct Options {
bool showHelp;
bool showVersion;
unsigned int verbosity;
CharString inFileName;
CharString outFileName;
bool statistics;
Format inFormat;
SortOrder sortOrder;
Options() {
// Set defaults.
showHelp = false;
showVersion = false;
verbosity = 0;
statistics = false;
inFileName = "";
outFileName = "";
statistics = false;
inFormat = FORMAT_AUTO;
sortOrder = SORTORDER_NA;
}
};
// ============================================================================
// Metafunctions
// ============================================================================
// ============================================================================
// Functions
// ============================================================================
void setupCommandLineParser(CommandLineParser & parser,
Options const & options) {
addVersionLine(parser, "0.1");
addTitleLine(parser, "**********************");
addTitleLine(parser, "* sambamstat *");
addTitleLine(parser, "**********************");
addTitleLine(parser, "");
addTitleLine(parser, "(c) 2012 by Bernd Jagla <Bernd.Jagla@pasteur.fr>");
addUsageLine(parser, "[OPTIONS] -if inputFile -of outputFile");
addSection(parser, "Main Options");
addOption(parser,
CommandLineOption("if", "input-file", "Input file.",
OptionType::String | OptionType::Label,
options.inFileName));
addOption(parser, CommandLineOption("S", "input-sam", "Input file is SAM (default: auto).", OptionType::Bool, options.inFormat == FORMAT_SAM));
addOption(parser, CommandLineOption("B", "input-bam", "Input file is BAM (default: auto).", OptionType::Bool, options.inFormat == FORMAT_BAM));
addOption(parser, CommandLineOption("sP", "sortOrderPos", "assume sorted by position", OptionType::Bool, options.sortOrder == SORTORDER_POS));
addOption(parser, CommandLineOption("sI", "sortOrderID", "assume sorted by ID", OptionType::Bool, options.sortOrder == SORTORDER_ID));
addOption(parser,
CommandLineOption("of", "output-file", "Output file.",
OptionType::String | OptionType::Label,
options.outFileName));
addSection(parser, "optional parameters:");
addOption(parser,
CommandLineOption("s", "stats", "Perform stats", OptionType::Bool,
options.statistics));
addSection(parser, "General Options");
addOption(parser,
CommandLineOption("vv", "very-verbose", "Very verbose output.",
OptionType::Bool));
addOption(parser,
CommandLineOption("v", "verbose", "verbose output.",
OptionType::Bool));
requiredArguments(parser, 0);
}
int parseCommandLineAndCheck(Options & options, CommandLineParser & parser,
int argc, char const ** argv) {
bool stop = !parse(parser, argc, argv);
if (stop)
return 1;
if (isSetLong(parser, "help")) {
options.showHelp = true;
return 0;
}
if (isSetLong(parser, "version")) {
options.showVersion = true;
return 0;
}
getOptionValueLong(parser, "input-file", options.inFileName);
getOptionValueLong(parser, "output-file", options.outFileName);
if (isSetLong(parser, "verbose"))
options.verbosity = 1;
if (isSetLong(parser, "very-verbose"))
options.verbosity = 2;
if (isSetLong(parser, "stats"))
options.statistics = true;
return 0;
}
struct Stats
{
__uint64 numRecords;
__uint64 alignedRecords;
String<unsigned> editDistanceHisto;
String<unsigned> mismatchHisto;
String<unsigned> insertHisto;
String<unsigned> deletionHisto;
String<double> avrgQuality;
Stats() : numRecords(0), alignedRecords(0)
{}
};
/*
* Analyze read that are sorted by ID
*/
template <typename TStreamOrReader, typename TSeqString, typename TSpec, typename TFormat>
int analyze_idSorted(TStreamOrReader & reader, StringSet<TSeqString, TSpec> & seqs, Options const & options, TFormat const & tag)
{
StringSet<CharString> refNames;
NameStoreCache<StringSet<CharString> > refNamesCache(refNames);
BamIOContext<StringSet<CharString> > context(refNames, refNamesCache);
String<__uint64> qualSum;
// Read alignments.
BamAlignmentRecord record;
if (options.verbosity >= 2)
std::cerr << "Reading alignments" << std::endl;
Align<Dna5String> align;
__int64 reads = 0;
//
while (!atEnd(reader))
{
// Read alignment record.
if (readRecord(record, context, reader, tag) != 0)
{
std::cerr << "Could not read alignment!" << std::endl;
return 1;
}
if (options.verbosity >= 2)
write2(std::cerr, record, context, Sam());
}
return 0;
}
/*
* Analyze reads that are sorted by position
*/
template <typename TStreamOrReader, typename TSeqString, typename TSpec, typename TFormat>
int analyze_posSorted(TStreamOrReader & reader, StringSet<TSeqString, TSpec> & seqs, Options const & options, TFormat const & tag)
{
StringSet<CharString> refNames;
NameStoreCache<StringSet<CharString> > refNamesCache(refNames);
BamIOContext<StringSet<CharString> > context(refNames, refNamesCache);
String<__uint64> qualSum;
// Read alignments.
BamAlignmentRecord record;
if (options.verbosity >= 2)
std::cerr << "Reading alignments" << std::endl;
Align<Dna5String> align;
__int64 reads = 0;
//
while (!atEnd(reader))
{
// Read alignment record.
if (readRecord(record, context, reader, tag) != 0)
{
std::cerr << "Could not read alignment!" << std::endl;
return 1;
}
if (options.verbosity >= 2)
write2(std::cerr, record, context, Sam());
}
return 0;
}
/*
* Here we deal with the sorting and organizing different order of events...
*/
template <typename TStreamOrReader, typename TSeqString, typename TSpec, typename TFormat>
int doWork(TStreamOrReader & reader, StringSet<TSeqString, TSpec> & seqs, Options const & options, TFormat const & tag)
{
StringSet<CharString> refNames;
NameStoreCache<StringSet<CharString> > refNamesCache(refNames);
BamIOContext<StringSet<CharString> > context(refNames, refNamesCache);
String<__uint64> qualSum;
// Read header.
BamHeader header;
if (options.verbosity >= 2)
std::cerr << "Reading header" << std::endl;
if (readRecord(header, context, reader, tag) != 0)
{
std::cerr << "Could not read header!" << std::endl;
return 1;
}
Stats stats;
switch (options.sortOrder){
case SORTORDER_ID :
analyze_idSorted(reader, seqs, options, tag);
//TODO sort
sortAlignedReads(reader, SortBeginPos());
analyze_posSorted(reader, seqs, options, tag);
break;
case SORTORDER_POS :
analyze_posSorted(reader, seqs, options, tag);
//TODO sort
sortAlignedReads(reader, SortReadId());
analyze_idSorted(reader, seqs, options, tag);
break;
case SORTORDER_NA :
//TODO sort
sortAlignedReads(reader, SortReadId());
analyze_idSorted(reader, seqs, options, tag);
//TODO sort
sortAlignedReads(reader, SortBeginPos());
analyze_posSorted(reader, seqs, options, tag);
break;
}
return 0;
}
int mainWithOptions(Options & options) {
typedef Iterator<String<CharString> >::Type TIterator;
std::cout << "Non-option Arguments:" << std::endl;
std::cout << " input file: \"" << options.inFileName << "\""
<< std::endl;
std::cout << " output file: \"" << options.outFileName << "\""
<< std::endl;
std::cout << "Option Arguments:" << std::endl;
StringSet<CharString> seqIds;
StringSet<Dna5String> seqs;
String<char, MMap<> > seqMMapString;
if (options.inFormat == FORMAT_SAM)
{
if (options.verbosity >= 2)
std::cerr << "Opening SAM file " << options.inFileName << std::endl;
String<char, MMap<> > samMMapString;
if (!open(samMMapString, toCString(options.inFileName), OPEN_RDONLY))
{
std::cerr << "Could not open " << options.inFileName << std::endl;
return 1;
}
RecordReader<String<char, MMap<> >, SinglePass<Mapped> > samReader(samMMapString);
return doWork(samReader, seqs, options, Sam());
}
else // options.inFormat == FORMAT_BAM
{
if (options.verbosity >= 2)
std::cerr << "Opening BAM file " << options.inFileName << std::endl;
Stream<Bgzf> bamStream;
if (!open(bamStream, toCString(options.inFileName), "r"))
{
std::cerr << "Could not open " << options.inFileName << std::endl;
return 1;
}
return doWork(bamStream, seqs, options, Bam());
}
return 0;
}
#endif // #ifndef SANDBOX_JAGLA_APPS_SAMBAMSTAT_SAMBAMSTAT_H_
// ==========================================================================
// sambamstat
// ==========================================================================
// Copyright (c) 2006-2012, Knut Reinert, FU Berlin
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of Knut Reinert or the FU Berlin nor the names of
// its contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
// DAMAGE.
//
// ==========================================================================
// Author: Your Name <your.email@example.net>
// ==========================================================================
#include <seqan/basic.h>
#include <seqan/sequence.h>
#include <seqan/misc/misc_cmdparser.h>
#include "sambamstat.h"
using namespace seqan;
// Program entry point
int main(int argc, char const ** argv)
{
// Setup command line parser.
CommandLineParser parser;
Options options;
setupCommandLineParser(parser, options);
// Then, parse the command line and handle the cases where help display
// is requested or erroneous parameters were given.
int ret = parseCommandLineAndCheck(options, parser, argc, argv);
if (ret != 0)
{
std::cerr << "Invalid usage!" << std::endl;
return ret;
}
if (options.showHelp || options.showVersion)
return 0;
if (options.inFormat == FORMAT_AUTO)
{
std::ifstream guessIn(toCString(options.inFileName), std::ios::binary | std::ios::in);
if (!guessIn.good())
{
std::cerr << "Could not open " << options.inFileName << std::endl;
return 1;
}
CharString magic;
resize(magic, 2);
guessIn.read(&magic[0], 2);
if (magic != "\x1f\x8b") // Is not gzip-compressed.
options.inFormat = FORMAT_SAM;
else
options.inFormat = FORMAT_BAM;
}
// Finally, launch the program.
ret = mainWithOptions(options);
return ret;
}