John,the following computes a profile from an alignment graph. First, the connected components are computed, then they are iterated in lexicographical order. This gives you a colum-by-column iteration of the matrix based alignment represented by the alignment graph.
Note that the alignment graph can contain ambiguities and there is more than one matrix based alignment that represents the alignment graph in most cases.
HTH, Manueltemplate <typename TSpec0, typename TSequence, typename TCargo, typename TSetSpec, typename TSpec> bool computeProfiles(StringSet<String<ProfileType<ModifiedAlphabet<Dna5, ModExpand<'-'> > > >, TSpec0> & profiles,
String<ProfileSupportInfo> & profileSupportInfos,Graph<Alignment<StringSet<TSequence, TSetSpec>, TCargo, TSpec> > /*const*/ & g,
Graph<Undirected<double> > /*const*/ & distances, bool logging) { using namespace seqan;typedef Graph<Alignment<StringSet<TSequence, TSetSpec>, TCargo, TSpec> > TAlignmentGraph;
typedef std::map<unsigned, unsigned> TComponentLength; typedef Dna5 TAlphabet;// Allocate information for which sequence supports the profile at which position.
resize(profileSupportInfos, length(stringSet(g)));// ----------------------------------------------------------------------- // Compute connected components and get topological sorting of them. // -----------------------------------------------------------------------
String<unsigned> component; String<unsigned> order; TComponentLength componentLength; if (!convertAlignment(g, component, order, componentLength)) return false; unsigned numComponents = length(order); // if (logging) // for (unsigned i = 0; i < numComponents; ++i)// std::cerr << "order[" << i << "] == " << order[i] << std::endl;
// -----------------------------------------------------------------------
// Get connected components of distances / read alignment clusters.// -----------------------------------------------------------------------
// Each cluster corresponds to a contig.// A cluster is a CC in the graph where each sequences is a vertex and two vertices are connected if they have an
// overlap alignment. String<unsigned> seqToCluster;unsigned numClusters = connectedComponents(distances, seqToCluster);
// if (logging) // for (unsigned i = 0; i < length(seqToCluster); ++i)// std::cerr << "SEQ TO CLUSTER\t" << i << " --> " << seqToCluster[i] << std::endl;
// std::cerr << distances << std::endl;// std::cerr << "numVertices(distances) == " << numVertices(distances) << std::endl;
if (logging) std::cerr << "# clusters: " << numClusters << std::endl << "# components: " << numComponents << std::endl; resize(profiles, numClusters);// ----------------------------------------------------------------------- // Visit components in topological order and generate profile sequences. // -----------------------------------------------------------------------
// Get mapping from component to vertices. String<String<unsigned> > componentVertices; resize(componentVertices, numComponents);typedef typename Iterator<TAlignmentGraph, VertexIterator>::Type TVertexIterator;
for (TVertexIterator itV(g); !atEnd(itV); goNext(itV)) {// std::cerr << "VERTEX TO COMPONENT\t" << *itV << " --> " << getProperty(component, *itV) << std::endl; appendValue(componentVertices[getProperty(component, *itV)], *itV);
} // For each cluster, the number of currently overlapping reads. String<unsigned> activeReads; resize(activeReads, numClusters, 0); // Iterate vertices in topological order. unsigned verticesVisited = 0;for (typename Iterator<String<unsigned>, Rooted>::Type it = begin(order, Rooted()); !atEnd(it); goNext(it))
{ unsigned c = *it; // Current component. unsigned fLen = fragmentLength(g, front(componentVertices[c]));unsigned cl = seqToCluster[sequenceId(g, front(componentVertices[c]))]; // Current cluster/contig.
// Grow profile string for the current contig/cluster. unsigned from = length(profiles[cl]); resize(profiles[cl], from + fLen); // if (logging)// std::cerr << "seq id == " << sequenceId(g, front(componentVertices[c])) << ", cl == " << cl << std::endl;
// Make fragments of vertices of current component vote for their character.
unsigned numNewThisRound = 0; unsigned numDoneThisRound = 0;typedef typename Iterator<String<unsigned>, Rooted>::Type TDescIt; // std::cerr << "length(componentVertices[" << c << "]) == " << length(componentVertices[c]) << std::endl; for (TDescIt itV = begin(componentVertices[c], Rooted()); ! atEnd(itV); goNext(itV))
{ verticesVisited += 1; // std::cerr << "VISITING\t" << *itV << std::endl;unsigned idx = idToPosition(stringSet(g), sequenceId(g, *itV));
// if (logging)// std::cerr << "\t id == " << idToPosition(stringSet(g), sequenceId(g, *itV)) << ", idx == " << idx << std::endl;
unsigned fBeg = fragmentBegin(g, *itV);// Register sequence as supporting in profile cl starting at position from in profile.
if (fBeg == 0u)profileSupportInfos[idx] = ProfileSupportInfo(idx, cl, from, from);
profileSupportInfos[idx].profileEnd = from + fLen; numNewThisRound += (fBeg == 0); unsigned fEnd = fBeg + fLen; numDoneThisRound += (fEnd == length(stringSet(g)[idx])); SEQAN_ASSERT_EQ(fLen, fragmentLength(g, *itV)); for (unsigned i = 0; i < fLen; ++i)profiles[cl][from + i].count[ordValue(stringSet(g) [idx][fBeg + i])] += 1;
} // Some reads became active *in* this round. activeReads[cl] += numNewThisRound;// Now, make the active reads in the current component vote for "not here"/'-'. SEQAN_ASSERT_GEQ(activeReads[cl], length(componentVertices[c])); unsigned numGapVotes = activeReads[cl] - length(componentVertices[c]);
for (unsigned i = from; i < length(profiles[cl]); ++i)profiles[cl][i].count[ValueSize<TAlphabet>::VALUE] += numGapVotes;
// if (logging)// std::cerr << "NEW THIS ROUND " << numNewThisRound << "\tDONE THIS ROUND " << numDoneThisRound << std::endl // << "\t GAP VOTES " << numGapVotes << "\tNON- GAP VOTES " << length(componentVertices[c]) << std::endl;
// Some reads become inactive *after* this round. SEQAN_ASSERT_GEQ(activeReads[cl], numDoneThisRound); activeReads[cl] -= numDoneThisRound; } SEQAN_ASSERT_EQ(numVertices(g), verticesVisited); // if (logging) // for (unsigned i = 0; i < numClusters; ++i)// std::cerr << "len(profiles[" << i << "]) == " << length(profiles[i]) << std::endl;
return true; }