Spaces:
No application file
No application file
| # Copyright 2008-2015 by Peter Cock. All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Bio.SeqIO support for the "ace" file format. | |
| You are expected to use this module via the Bio.SeqIO functions. | |
| See also the Bio.Sequencing.Ace module which offers more than just accessing | |
| the contig consensus sequences in an ACE file as SeqRecord objects. | |
| """ | |
| from Bio.Seq import Seq | |
| from Bio.SeqRecord import SeqRecord | |
| from Bio.Sequencing import Ace | |
def AceIterator(source):
    """Iterate over the contigs of an ACE file, yielding SeqRecord objects.

    Parsing is delegated to the Bio.Sequencing.Ace module. Because the file
    is consumed in a single pass, any WA, CT, RT or WR footer tags are
    ignored.

    ACE files carry a base quality for each consensus position; these are
    taken to be PHRED style scores. Just as when reading a FASTQ or QUAL
    file with PHRED scores via Bio.SeqIO, they are stored in the SeqRecord's
    letter_annotations dictionary under the "phred_quality" key.

    >>> from Bio import SeqIO
    >>> with open("Ace/consed_sample.ace") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s %s... %i" % (record.id, record.seq[:10], len(record)))
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 agccccgggc... 1475
    90

    ACE files do not include a base quality for gaps in the consensus
    sequence, so Biopython represents those positions with a quality of
    zero. Zero is arguably misleading, since there may be very strong
    evidence supporting the gap. Earlier versions of Biopython used None
    instead, but that complicated usage and prevented writing the gapped
    sequence out in FASTQ format.

    >>> from Bio import SeqIO
    >>> with open("Ace/contig1.ace") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s ...%s..." % (record.id, record.seq[85:95]))
    ...         print(record.letter_annotations["phred_quality"][85:95])
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 ...AGAGG-ATGC...
    [57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
    90
    Contig2 ...GAATTACTAT...
    [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
    90

    """
    for contig in Ace.parse(source):
        # Build the consensus sequence. ACE writes gaps as "*"; they are
        # mapped to "-" for consistency with most other file formats.
        raw_consensus = contig.sequence
        if "*" not in raw_consensus:
            consensus = Seq(raw_consensus)
        else:
            # A consensus mixing both gap styles would become ambiguous
            # after the substitution, so it is not expected here.
            assert "-" not in raw_consensus
            consensus = Seq(raw_consensus.replace("*", "-"))

        # TODO? - Base segments (BS lines), which indicate the read phrap
        # chose as the consensus at a particular position.
        # Perhaps as SeqFeature objects?
        # TODO - Supporting reads (RD lines, plus perhaps QA and DS lines).
        # Perhaps as SeqFeature objects?
        record = SeqRecord(consensus, id=contig.name, name=contig.name)

        # Consensus base qualities (BQ lines). Gap positions have no entry
        # in the contig's quality list, so each gap gets a quality of zero
        # and only non-gap letters consume a value from that list.
        phred_scores = []
        used = 0
        for letter in consensus:
            if letter == "-":
                phred_scores.append(0)
                continue
            phred_scores.append(contig.quality[used])
            used += 1
        # Every quality value must have been matched to a non-gap letter.
        assert used == len(contig.quality)

        record.letter_annotations["phred_quality"] = phred_scores
        yield record
    # All contigs have been yielded.
if __name__ == "__main__":
    # Only when this file is executed directly (not imported): call
    # Biopython's run_doctest helper. Presumably this runs the docstring
    # examples in this module - confirm against Bio._utils.
    from Bio._utils import run_doctest

    run_doctest()