Re: [Seqan-dev] Read trimming question
John,
the following computes a profile from an alignment graph. First, the
connected components are computed, then they are iterated in
lexicographical order. This gives you a colum-by-column iteration of
the matrix based alignment represented by the alignment graph.
Note that the alignment graph can contain ambiguities and there is
more than one matrix based alignment that represents the alignment
graph in most cases.
HTH,
Manuel
template <typename TSpec0, typename TSequence, typename TCargo,
typename TSetSpec, typename TSpec>
bool
computeProfiles(StringSet<String<ProfileType<ModifiedAlphabet<Dna5,
ModExpand<'-'> > > >, TSpec0> & profiles,
String<ProfileSupportInfo> & profileSupportInfos,
Graph<Alignment<StringSet<TSequence, TSetSpec>,
TCargo, TSpec> > /*const*/ & g,
Graph<Undirected<double> > /*const*/ & distances,
bool logging)
{
using namespace seqan;
typedef Graph<Alignment<StringSet<TSequence, TSetSpec>, TCargo,
TSpec> > TAlignmentGraph;
typedef std::map<unsigned, unsigned> TComponentLength;
typedef Dna5 TAlphabet;
// Allocate information for which sequence supports the profile
at which position.
resize(profileSupportInfos, length(stringSet(g)));
//
-----------------------------------------------------------------------
// Compute connected components and get topological sorting of
them.
//
-----------------------------------------------------------------------
String<unsigned> component;
String<unsigned> order;
TComponentLength componentLength;
if (!convertAlignment(g, component, order, componentLength))
return false;
unsigned numComponents = length(order);
// if (logging)
// for (unsigned i = 0; i < numComponents; ++i)
// std::cerr << "order[" << i << "] == " << order[i] <<
std::endl;
//
-----------------------------------------------------------------------
// Get connected components of distances / read alignment clusters.
//
-----------------------------------------------------------------------
// Each cluster corresponds to a contig.
// A cluster is a CC in the graph where each sequences is a
vertex and two vertices are connected if they have an
// overlap alignment.
String<unsigned> seqToCluster;
unsigned numClusters = connectedComponents(distances,
seqToCluster);
// if (logging)
// for (unsigned i = 0; i < length(seqToCluster); ++i)
// std::cerr << "SEQ TO CLUSTER\t" << i << " --> " <<
seqToCluster[i] << std::endl;
// std::cerr << distances << std::endl;
// std::cerr << "numVertices(distances) == " <<
numVertices(distances) << std::endl;
if (logging)
std::cerr << "# clusters: " << numClusters << std::endl
<< "# components: " << numComponents << std::endl;
resize(profiles, numClusters);
//
-----------------------------------------------------------------------
// Visit components in topological order and generate profile
sequences.
//
-----------------------------------------------------------------------
// Get mapping from component to vertices.
String<String<unsigned> > componentVertices;
resize(componentVertices, numComponents);
typedef typename Iterator<TAlignmentGraph, VertexIterator>::Type
TVertexIterator;
for (TVertexIterator itV(g); !atEnd(itV); goNext(itV))
{
// std::cerr << "VERTEX TO COMPONENT\t" << *itV << " --> " <<
getProperty(component, *itV) << std::endl;
appendValue(componentVertices[getProperty(component, *itV)],
*itV);
}
// For each cluster, the number of currently overlapping reads.
String<unsigned> activeReads;
resize(activeReads, numClusters, 0);
// Iterate vertices in topological order.
unsigned verticesVisited = 0;
for (typename Iterator<String<unsigned>, Rooted>::Type it =
begin(order, Rooted()); !atEnd(it); goNext(it))
{
unsigned c = *it; // Current component.
unsigned fLen = fragmentLength(g, front(componentVertices[c]));
unsigned cl = seqToCluster[sequenceId(g,
front(componentVertices[c]))]; // Current cluster/contig.
// Grow profile string for the current contig/cluster.
unsigned from = length(profiles[cl]);
resize(profiles[cl], from + fLen);
// if (logging)
// std::cerr << "seq id == " << sequenceId(g,
front(componentVertices[c])) << ", cl == " << cl << std::endl;
// Make fragments of vertices of current component vote for
their character.
unsigned numNewThisRound = 0;
unsigned numDoneThisRound = 0;
typedef typename Iterator<String<unsigned>, Rooted>::Type
TDescIt;
// std::cerr << "length(componentVertices[" << c << "]) == "
<< length(componentVertices[c]) << std::endl;
for (TDescIt itV = begin(componentVertices[c], Rooted()); !
atEnd(itV); goNext(itV))
{
verticesVisited += 1;
// std::cerr << "VISITING\t" << *itV << std::endl;
unsigned idx = idToPosition(stringSet(g), sequenceId(g,
*itV));
// if (logging)
// std::cerr << "\t id == " <<
idToPosition(stringSet(g), sequenceId(g, *itV)) << ", idx == " << idx
<< std::endl;
unsigned fBeg = fragmentBegin(g, *itV);
// Register sequence as supporting in profile cl starting
at position from in profile.
if (fBeg == 0u)
profileSupportInfos[idx] = ProfileSupportInfo(idx,
cl, from, from);
profileSupportInfos[idx].profileEnd = from + fLen;
numNewThisRound += (fBeg == 0);
unsigned fEnd = fBeg + fLen;
numDoneThisRound += (fEnd == length(stringSet(g)[idx]));
SEQAN_ASSERT_EQ(fLen, fragmentLength(g, *itV));
for (unsigned i = 0; i < fLen; ++i)
profiles[cl][from + i].count[ordValue(stringSet(g)
[idx][fBeg + i])] += 1;
}
// Some reads became active *in* this round.
activeReads[cl] += numNewThisRound;
// Now, make the active reads in the current component vote
for "not here"/'-'.
SEQAN_ASSERT_GEQ(activeReads[cl],
length(componentVertices[c]));
unsigned numGapVotes = activeReads[cl] -
length(componentVertices[c]);
for (unsigned i = from; i < length(profiles[cl]); ++i)
profiles[cl][i].count[ValueSize<TAlphabet>::VALUE] +=
numGapVotes;
// if (logging)
// std::cerr << "NEW THIS ROUND " << numNewThisRound <<
"\tDONE THIS ROUND " << numDoneThisRound << std::endl
// << "\t GAP VOTES " << numGapVotes << "\tNON-
GAP VOTES " << length(componentVertices[c]) << std::endl;
// Some reads become inactive *after* this round.
SEQAN_ASSERT_GEQ(activeReads[cl], numDoneThisRound);
activeReads[cl] -= numDoneThisRound;
}
SEQAN_ASSERT_EQ(numVertices(g), verticesVisited);
// if (logging)
// for (unsigned i = 0; i < numClusters; ++i)
// std::cerr << "len(profiles[" << i << "]) == " <<
length(profiles[i]) << std::endl;
return true;
}