Spaces:
No application file
No application file
| #!/usr/bin/env python | |
| # Copyright 2004 Kristian Rother. | |
| # Revisions copyright 2004 Thomas Hamelryck. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Parse header of PDB files into a python dictionary. | |
| Emerged from the Columba database project www.columba-db.de, original author | |
| Kristian Rother. | |
| """ | |
| import re | |
| from Bio import File | |
| def _get_journal(inl): | |
| # JRNL AUTH L.CHEN,M.DOI,F.S.MATHEWS,A.Y.CHISTOSERDOV, 2BBK 7 | |
| journal = "" | |
| for line in inl: | |
| if re.search(r"\AJRNL", line): | |
| journal += line[19:72].lower() | |
| journal = re.sub(r"\s\s+", " ", journal) | |
| return journal | |
| def _get_references(inl): | |
| # REMARK 1 REFERENCE 1 1CSE 11 | |
| # REMARK 1 AUTH W.BODE,E.PAPAMOKOS,D.MUSIL 1CSE 12 | |
| references = [] | |
| actref = "" | |
| for line in inl: | |
| if re.search(r"\AREMARK 1", line): | |
| if re.search(r"\AREMARK 1 REFERENCE", line): | |
| if actref != "": | |
| actref = re.sub(r"\s\s+", " ", actref) | |
| if actref != " ": | |
| references.append(actref) | |
| actref = "" | |
| else: | |
| actref += line[19:72].lower() | |
| if actref != "": | |
| actref = re.sub(r"\s\s+", " ", actref) | |
| if actref != " ": | |
| references.append(actref) | |
| return references | |
| # bring dates to format: 1909-01-08 | |
| def _format_date(pdb_date): | |
| """Convert dates from DD-Mon-YY to YYYY-MM-DD format (PRIVATE).""" | |
| date = "" | |
| year = int(pdb_date[7:]) | |
| if year < 50: | |
| century = 2000 | |
| else: | |
| century = 1900 | |
| date = str(century + year) + "-" | |
| all_months = [ | |
| "xxx", | |
| "Jan", | |
| "Feb", | |
| "Mar", | |
| "Apr", | |
| "May", | |
| "Jun", | |
| "Jul", | |
| "Aug", | |
| "Sep", | |
| "Oct", | |
| "Nov", | |
| "Dec", | |
| ] | |
| month = str(all_months.index(pdb_date[3:6])) | |
| if len(month) == 1: | |
| month = "0" + month | |
| date = date + month + "-" + pdb_date[:2] | |
| return date | |
| def _chop_end_codes(line): | |
| """Chops lines ending with ' 1CSA 14' and the like (PRIVATE).""" | |
| return re.sub(r"\s\s\s\s+[\w]{4}.\s+\d*\Z", "", line) | |
| def _chop_end_misc(line): | |
| """Chops lines ending with ' 14-JUL-97 1CSA' and the like (PRIVATE).""" | |
| return re.sub(r"\s+\d\d-\w\w\w-\d\d\s+[1-9][0-9A-Z]{3}\s*\Z", "", line) | |
| def _nice_case(line): | |
| """Make A Lowercase String With Capitals (PRIVATE).""" | |
| line_lower = line.lower() | |
| s = "" | |
| i = 0 | |
| nextCap = 1 | |
| while i < len(line_lower): | |
| c = line_lower[i] | |
| if c >= "a" and c <= "z" and nextCap: | |
| c = c.upper() | |
| nextCap = 0 | |
| elif c in " .,;:\t-_": | |
| nextCap = 1 | |
| s += c | |
| i += 1 | |
| return s | |
def parse_pdb_header(infile):
    """Return the header lines of a pdb file as a dictionary.

    ``infile`` is handed to ``File.as_handle``. Lines are collected until
    the first record whose six-character record name is ATOM, HETATM or
    MODEL, and the collected lines are passed to ``_parse_pdb_header_list``.

    Dictionary keys are: name, head, idcode, deposition_date, release_date,
    structure_method, resolution, structure_reference, journal_reference,
    author, compound, source, has_missing_residues and missing_residues.
    Further keys (keywords, journal, astral) are present only when the
    corresponding records occur in the file.
    """
    # Record names are compared against the first six characters of each
    # line, so every name listed here must itself be six characters long
    # (ATOM is padded with two blanks, MODEL with one).
    coordinate_records = ("ATOM  ", "HETATM", "MODEL ")
    header = []
    with File.as_handle(infile) as handle:
        for line in handle:
            if line[:6] in coordinate_records:
                break
            header.append(line)
    return _parse_pdb_header_list(header)
| def _parse_remark_465(line): | |
| """Parse missing residue remarks. | |
| Returns a dictionary describing the missing residue. | |
| The specification for REMARK 465 at | |
| http://www.wwpdb.org/documentation/file-format-content/format33/remarks2.html#REMARK%20465 | |
| only gives templates, but does not say they have to be followed. | |
| So we assume that not all pdb-files with a REMARK 465 can be understood. | |
| Returns a dictionary with the following keys: | |
| "model", "res_name", "chain", "ssseq", "insertion" | |
| """ | |
| if line: | |
| # Note that line has been stripped. | |
| assert line[0] != " " and line[-1] not in "\n ", "line has to be stripped" | |
| pattern = re.compile( | |
| r""" | |
| (\d+\s[\sA-Z][\sA-Z][A-Z] | # Either model number + residue name | |
| [A-Z]{1,3}) # Or only residue name with 1 (RNA) to 3 letters | |
| \s ([A-Za-z0-9]) # A single character chain | |
| \s+(-?\d+[A-Za-z]?)$ # Residue number: A digit followed by an optional | |
| # insertion code (Hetero-flags make no sense in | |
| # context with missing res) | |
| """, | |
| re.VERBOSE, | |
| ) | |
| match = pattern.match(line) | |
| if match is None: | |
| return None | |
| residue = {} | |
| if " " in match.group(1): | |
| model, residue["res_name"] = match.group(1).split() | |
| residue["model"] = int(model) | |
| else: | |
| residue["model"] = None | |
| residue["res_name"] = match.group(1) | |
| residue["chain"] = match.group(2) | |
| try: | |
| residue["ssseq"] = int(match.group(3)) | |
| except ValueError: | |
| residue["insertion"] = match.group(3)[-1] | |
| residue["ssseq"] = int(match.group(3)[:-1]) | |
| else: | |
| residue["insertion"] = None | |
| return residue | |
def _parse_pdb_header_list(header):
    """Build the header dictionary from a list of PDB header lines (PRIVATE).

    ``header`` is a list of raw lines. The record name is taken from the
    first six characters of each line and the record text from index 10
    onwards (both stripped). The following records are interpreted:
    TITLE, HEADER, COMPND, SOURCE, KEYWDS, EXPDTA, REVDAT, JRNL, AUTHOR and
    REMARK (resolution, REMARK 465 missing residues, REMARK 99 ASTRAL).
    All other records are ignored.

    Returns a dictionary that always contains the keys name, head, idcode,
    deposition_date, release_date, structure_method, resolution,
    structure_reference, journal_reference, author, compound, source,
    has_missing_residues and missing_residues. The keys keywords, journal
    and astral are added only when matching records are present.
    """
    # database fields
    pdbh_dict = {
        "name": "",
        "head": "",
        "idcode": "",
        "deposition_date": "1909-01-08",
        "release_date": "1909-01-08",
        "structure_method": "unknown",
        "resolution": None,
        "structure_reference": "unknown",
        "journal_reference": "unknown",
        "author": "",
        "compound": {"1": {"misc": ""}},
        "source": {"1": {"misc": ""}},
        "has_missing_residues": False,
        "missing_residues": [],
    }

    pdbh_dict["structure_reference"] = _get_references(header)
    pdbh_dict["journal_reference"] = _get_journal(header)
    comp_molid = "1"
    # SOURCE tracks its own current mol_id. Sharing the COMPND one would index
    # pdbh_dict["source"] with an id that only exists under "compound" when
    # the SOURCE records carry no MOL_ID of their own.
    src_molid = "1"
    last_comp_key = "misc"
    last_src_key = "misc"
    # The remark number is right-justified in the record, so the amount of
    # blank space in front of it is not hard-coded.
    resolution_tag = re.compile(r"REMARK\s+2\s+RESOLUTION.")
    astral_tag = re.compile(r"REMARK\s+99\s+ASTRAL")

    for hh in header:
        h = re.sub(r"[\s\n\r]*\Z", "", hh)  # chop linebreaks off
        key = h[:6].strip()
        tail = h[10:].strip()

        # From here, all the keys from the header are being parsed
        if key == "TITLE":
            name = _chop_end_codes(tail).lower()
            pdbh_dict["name"] = " ".join([pdbh_dict["name"], name]).strip()
        elif key == "HEADER":
            rr = re.search(r"\d\d-\w\w\w-\d\d", tail)
            if rr is not None:
                pdbh_dict["deposition_date"] = _format_date(_nice_case(rr.group()))
            rr = re.search(r"\s+([1-9][0-9A-Z]{3})\s*\Z", tail)
            if rr is not None:
                pdbh_dict["idcode"] = rr.group(1)
            pdbh_dict["head"] = _chop_end_misc(tail).lower()
        elif key == "COMPND":
            tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower()
            # look for E.C. numbers in COMPND lines
            rec = re.search(r"\d+\.\d+\.\d+\.\d+", tt)
            if rec:
                pdbh_dict["compound"][comp_molid]["ec_number"] = rec.group()
                tt = re.sub(r"\((e\.c\.)*\d+\.\d+\.\d+\.\d+\)", "", tt)
            tok = tt.split(":")
            if len(tok) >= 2:
                ckey = tok[0]
                cval = re.sub(r"\A\s*", "", tok[1])
                if ckey == "mol_id":
                    # A new molecule starts; later tokens belong to it.
                    pdbh_dict["compound"][cval] = {"misc": ""}
                    comp_molid = cval
                    last_comp_key = "misc"
                else:
                    pdbh_dict["compound"][comp_molid][ckey] = cval
                    last_comp_key = ckey
            else:
                # Continuation text without "key:" extends the previous key.
                pdbh_dict["compound"][comp_molid][last_comp_key] += tok[0] + " "
        elif key == "SOURCE":
            tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower()
            tok = tt.split(":")
            if len(tok) >= 2:
                ckey = tok[0]
                cval = re.sub(r"\A\s*", "", tok[1])
                if ckey == "mol_id":
                    pdbh_dict["source"][cval] = {"misc": ""}
                    src_molid = cval
                    last_src_key = "misc"
                else:
                    pdbh_dict["source"][src_molid][ckey] = cval
                    last_src_key = ckey
            else:
                pdbh_dict["source"][src_molid][last_src_key] += tok[0] + " "
        elif key == "KEYWDS":
            kwd = _chop_end_codes(tail).lower()
            if "keywords" in pdbh_dict:
                pdbh_dict["keywords"] += " " + kwd
            else:
                pdbh_dict["keywords"] = kwd
        elif key == "EXPDTA":
            expd = _chop_end_codes(tail)
            # chop junk at end of lines for some structures
            expd = re.sub(r"\s\s\s\s\s\s\s.*\Z", "", expd)
            pdbh_dict["structure_method"] = expd.lower()
        elif key == "CAVEAT":
            # make Annotation entries out of these!!!
            pass
        elif key == "REVDAT":
            rr = re.search(r"\d\d-\w\w\w-\d\d", tail)
            if rr is not None:
                pdbh_dict["release_date"] = _format_date(_nice_case(rr.group()))
        elif key == "JRNL":
            if "journal" in pdbh_dict:
                pdbh_dict["journal"] += tail
            else:
                pdbh_dict["journal"] = tail
        elif key == "AUTHOR":
            # "author" is pre-seeded with "", so appending is always valid.
            pdbh_dict["author"] += _nice_case(_chop_end_codes(tail))
        elif key == "REMARK":
            if resolution_tag.search(hh):
                r = _chop_end_codes(resolution_tag.sub("", hh))
                r = re.sub(r"\s+ANGSTROM.*", "", r)
                try:
                    pdbh_dict["resolution"] = float(r)
                except ValueError:
                    # Non-numeric resolution text: report no resolution.
                    pdbh_dict["resolution"] = None
            elif hh.startswith("REMARK 465"):
                if tail:
                    pdbh_dict["has_missing_residues"] = True
                    missing_res_info = _parse_remark_465(tail)
                    if missing_res_info:
                        pdbh_dict["missing_residues"].append(missing_res_info)
            elif astral_tag.match(hh):
                if tail:
                    remark_99_keyval = tail.replace("ASTRAL ", "").split(": ")
                    if len(remark_99_keyval) == 2:
                        astral = pdbh_dict.setdefault("astral", {})
                        astral[remark_99_keyval[0]] = remark_99_keyval[1]

    # A positive resolution without an EXPDTA record is taken to mean X-ray.
    if pdbh_dict["structure_method"] == "unknown":
        res = pdbh_dict["resolution"]
        if res is not None and res > 0.0:
            pdbh_dict["structure_method"] = "x-ray diffraction"
    return pdbh_dict