Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /SeqIO /PhdIO.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 2 years ago

raw

history blame contribute delete

6.06 kB

	# Copyright 2008-2016 by Peter Cock. All rights reserved.
	# Revisions copyright 2009 by Cymon J. Cox. All rights reserved.
	#
	# This file is part of the Biopython distribution and governed by your
	# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
	# Please see the LICENSE file that should have been included as part of this
	# package.
	"""Bio.SeqIO support for the "phd" file format.

	PHD files are output by PHRED and used by PHRAP and CONSED.

	You are expected to use this module via the Bio.SeqIO functions, under the
	format name "phd". See also the underlying Bio.Sequencing.Phd module.

	For example, using Bio.SeqIO we can read in one of the example PHRED files
	from the Biopython unit tests:

	>>> from Bio import SeqIO
	>>> for record in SeqIO.parse("Phd/phd1", "phd"):
	...     print(record.id)
	...     print("%s..." % record.seq[:10])
	...     print("%s..." % record.letter_annotations["phred_quality"][:10])
	34_222_(80-A03-19).b.ab1
	ctccgtcgga...
	[9, 9, 10, 19, 22, 37, 28, 28, 24, 22]...
	425_103_(81-A03-19).g.ab1
	cgggatccca...
	[14, 17, 22, 10, 10, 10, 15, 8, 8, 9]...
	425_7_(71-A03-19).b.ab1
	acataaatca...
	[10, 10, 10, 10, 8, 8, 6, 6, 6, 6]...

	Since PHRED files contain quality scores, you can save them as FASTQ or as
	QUAL files, for example using Bio.SeqIO.write(...), or simply with the format
	method of the SeqRecord object:

	>>> print(record[:50].format("fastq"))
	@425_7_(71-A03-19).b.ab1
	acataaatcaaattactnaccaacacacaaaccngtctcgcgtagtggag
	+
	++++))'''')(''')$!$''')''''(+.''$!$))))+)))'''''''
	<BLANKLINE>

	Or,

	>>> print(record[:50].format("qual"))
	>425_7_(71-A03-19).b.ab1
	10 10 10 10 8 8 6 6 6 6 8 7 6 6 6 8 3 0 3 6 6 6 8 6 6 6 6 7
	10 13 6 6 3 0 3 8 8 8 8 10 8 8 8 6 6 6 6 6 6 6
	<BLANKLINE>

	Note these examples only show the first 50 bases to keep the output short.
	"""
	from Bio.SeqIO import QualityIO
	from Bio.SeqRecord import SeqRecord
	from Bio.Sequencing import Phd

	from .Interfaces import SequenceWriter


	def PhdIterator(source):
	    """Iterate over a PHD file, yielding one SeqRecord per read.

	    Arguments:
	     - source - input stream opened in text mode, or a path to a file

	    Parsing is delegated to the Bio.Sequencing.Phd module; each parsed
	    record is then repackaged as a SeqRecord, with its comments used as the
	    annotations and its per-base values stored as letter annotations.
	    """
	    for parsed in Phd.parse(source):
	        # Convert the PHD record into a SeqRecord.
	        # The PHD "filename" may contain spaces, e.g.
	        # 'HWI-EAS94_4_1_1_602_99 1' in the unit test file phd_solexa.
	        # Spaces would break formats such as FASTQ if used in the record
	        # identifier, so only the first token becomes the id/name while
	        # the full text is kept as the description.
	        title = parsed.file_name
	        identifier = title.split(None, 1)[0]
	        record = SeqRecord(
	            parsed.seq, id=identifier, name=identifier, description=title
	        )

	        # The parsed comments dictionary is re-used directly as the
	        # annotations of the SeqRecord.
	        record.annotations = parsed.comments
	        record.annotations["molecule_type"] = "DNA"

	        # Field 1 of every site is the quality score.
	        qualities = []
	        for site in parsed.sites:
	            qualities.append(int(site[1]))
	        record.letter_annotations["phred_quality"] = qualities

	        try:
	            # Field 2 of every site, when present, is the peak location.
	            peaks = [int(site[2]) for site in parsed.sites]
	            record.letter_annotations["peak_location"] = peaks
	        except IndexError:
	            # Peak locations are not always there according to
	            # David Gordon (the Consed author), so their absence is fine.
	            pass

	        yield record


	class PhdWriter(SequenceWriter):
	    """Writer that serialises SeqRecord objects in the PHD file format."""

	    def __init__(self, handle):
	        """Set up the writer around the given output handle."""
	        super().__init__(handle)

	    def write_record(self, record):
	        """Write one SeqRecord to the handle as a PHD sequence block.

	        The output consists of a BEGIN_SEQUENCE header, a comment section
	        built from the record's annotations, and one line per base giving
	        the base, its rounded quality and (when available) its peak
	        location.

	        Raises ValueError if the number of quality scores or peak locations
	        differs from the sequence length, or if any quality value is None.
	        """
	        assert record.seq, "No sequence present in SeqRecord"
	        # The helper hands back the 'phred_quality' scores, or converted
	        # 'solexa_quality' scores if present, and otherwise raises a
	        # ValueError.
	        phred_qualities = QualityIO._get_phred_quality(record)
	        peak_locations = record.letter_annotations.get("peak_location")

	        seq_length = len(record.seq)
	        if seq_length != len(phred_qualities):
	            raise ValueError(
	                "Number of phd quality scores does not match length of sequence"
	            )
	        if peak_locations and seq_length != len(peak_locations):
	            raise ValueError(
	                "Number of peak location scores does not match length of sequence"
	            )
	        if None in phred_qualities:
	            raise ValueError("A quality value of None was found")

	        # Avoid repeating the identifier when the description already
	        # starts with it.
	        title = (
	            record.description
	            if record.description.startswith(f"{record.id} ")
	            else f"{record.id} {record.description}"
	        )

	        emit = self.handle.write
	        emit(f"BEGIN_SEQUENCE {self.clean(title)}\nBEGIN_COMMENT\n")

	        annotations = record.annotations
	        for keyword in Phd.CKEYWORDS:
	            key = keyword.lower()
	            if key == "trim":
	                # Trim is formatted from three values, the last with
	                # four decimal places.
	                trim = annotations.get("trim")
	                value = ("%s %s %.4f" % trim) if trim else None
	            elif key == "trace_peak_area_ratio":
	                ratio = annotations.get("trace_peak_area_ratio")
	                value = f"{ratio:.4f}" if ratio else None
	            else:
	                value = annotations.get(key)
	            # Skip missing/empty values, but still emit a literal zero.
	            if not (value or value == 0):
	                continue
	            emit(f"{key.upper()}: {value}\n")

	        emit("END_COMMENT\nBEGIN_DNA\n")
	        # The lengths were verified above, so the sequences can be walked
	        # in lockstep.
	        if peak_locations:
	            for base, quality, peak in zip(
	                record.seq, phred_qualities, peak_locations
	            ):
	                emit("%s %i %i\n" % (base, round(quality), peak))
	        else:
	            for base, quality in zip(record.seq, phred_qualities):
	                emit("%s %i\n" % (base, round(quality)))

	        emit("END_DNA\nEND_SEQUENCE\n")


	if __name__ == "__main__":
	    # Executing this module directly runs the doctests in its docstrings.
	    from Bio import _utils

	    _utils.run_doctest()